author     Scott Long <scottl@FreeBSD.org>  2021-03-18 07:34:07 +0000
committer  Scott Long <scottl@FreeBSD.org>  2021-03-18 07:34:07 +0000
commit     4c6c8f51fdb7e2b3870ec5a6fa5dce51ad3b25a5 (patch)
tree       8fb21432ca90081af7732d80f1ceae6df7869597
parent     cb370b19715b696cf6db4f7b357cf2e7f2e3adb7 (diff)
download   src-4c6c8f51fdb7e2b3870ec5a6fa5dce51ad3b25a5.tar.gz
           src-4c6c8f51fdb7e2b3870ec5a6fa5dce51ad3b25a5.zip
base: remove if_wg(4) and associated utilities, manpage
After lengthy discussions, we've decided that the if_wg(4) driver and related work are not yet ready to live in the tree. This driver has larger security implications than many, and thus will be held to greater scrutiny than other drivers.

Requested by: secteam
Approved by: re
-rw-r--r--  sbin/ifconfig/Makefile | 1
-rw-r--r--  sbin/ifconfig/ifwg.c | 618
-rw-r--r--  share/man/man4/Makefile | 1
-rw-r--r--  share/man/man4/wg.4 | 255
-rw-r--r--  sys/dev/if_wg/include/crypto/blake2s.h | 56
-rw-r--r--  sys/dev/if_wg/include/crypto/curve25519.h | 74
-rw-r--r--  sys/dev/if_wg/include/crypto/zinc.h | 15
-rw-r--r--  sys/dev/if_wg/include/sys/if_wg_session.h | 89
-rw-r--r--  sys/dev/if_wg/include/sys/if_wg_session_vars.h | 319
-rw-r--r--  sys/dev/if_wg/include/sys/simd-x86_64.h | 74
-rw-r--r--  sys/dev/if_wg/include/sys/support.h | 342
-rw-r--r--  sys/dev/if_wg/include/sys/wg_cookie.h | 174
-rw-r--r--  sys/dev/if_wg/include/sys/wg_module.h | 121
-rw-r--r--  sys/dev/if_wg/include/sys/wg_noise.h | 286
-rw-r--r--  sys/dev/if_wg/include/zinc/blake2s.h | 50
-rw-r--r--  sys/dev/if_wg/include/zinc/chacha20.h | 68
-rw-r--r--  sys/dev/if_wg/include/zinc/chacha20poly1305.h | 48
-rw-r--r--  sys/dev/if_wg/include/zinc/curve25519.h | 28
-rw-r--r--  sys/dev/if_wg/include/zinc/poly1305.h | 29
-rw-r--r--  sys/dev/if_wg/module/blake2s.c | 256
-rw-r--r--  sys/dev/if_wg/module/blake2s.h | 58
-rw-r--r--  sys/dev/if_wg/module/chacha20-x86_64.S | 2834
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm-glue.c | 98
-rwxr-xr-x  sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm.pl | 1227
-rwxr-xr-x  sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm64.pl | 1163
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips-glue.c | 27
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips.S | 424
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64-glue.c | 132
-rwxr-xr-x  sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64.pl | 4106
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20.c | 238
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/chacha20poly1305.c | 196
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm-glue.c | 140
-rwxr-xr-x  sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm.pl | 1276
-rwxr-xr-x  sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm64.pl | 974
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-donna32.c | 205
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-donna64.c | 182
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-mips-glue.c | 37
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-mips.S | 407
-rwxr-xr-x  sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-mips64.pl | 467
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-x86_64-glue.c | 171
-rwxr-xr-x  sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-x86_64.pl | 4266
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305.c | 163
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/selftest/blake2s.c | 2090
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/selftest/chacha20.c | 2703
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/selftest/chacha20poly1305.c | 8443
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/selftest/curve25519.c | 1315
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/selftest/poly1305.c | 1110
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/selftest/run.h | 43
-rw-r--r--  sys/dev/if_wg/module/curve25519.c | 867
-rw-r--r--  sys/dev/if_wg/module/if_wg_session.c | 1985
-rw-r--r--  sys/dev/if_wg/module/module.c | 863
-rw-r--r--  sys/dev/if_wg/module/poly1305-x86_64.S | 3021
-rw-r--r--  sys/dev/if_wg/module/wg_cookie.c | 399
-rw-r--r--  sys/dev/if_wg/module/wg_noise.c | 958
-rw-r--r--  sys/kern/subr_gtaskqueue.c | 13
-rw-r--r--  sys/modules/Makefile | 1
-rw-r--r--  sys/modules/if_wg/Makefile | 38
-rw-r--r--  sys/sys/gtaskqueue.h | 1
58 files changed, 0 insertions, 45545 deletions
diff --git a/sbin/ifconfig/Makefile b/sbin/ifconfig/Makefile
index 61cb8ab933fd..b178dc0c7e6a 100644
--- a/sbin/ifconfig/Makefile
+++ b/sbin/ifconfig/Makefile
@@ -35,7 +35,6 @@ SRCS+= ifvxlan.c # VXLAN support
SRCS+= ifgre.c # GRE keys etc
SRCS+= ifgif.c # GIF reversed header workaround
SRCS+= ifipsec.c # IPsec VTI
-SRCS+= ifwg.c # Wireguard
SRCS+= sfp.c # SFP/SFP+ information
LIBADD+= ifconfig m util
diff --git a/sbin/ifconfig/ifwg.c b/sbin/ifconfig/ifwg.c
deleted file mode 100644
index a2b22d2dfbef..000000000000
--- a/sbin/ifconfig/ifwg.c
+++ /dev/null
@@ -1,618 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause
- *
- * Copyright (c) 2020 Rubicon Communications, LLC (Netgate)
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#ifndef RESCUE
-#include <sys/param.h>
-#include <sys/ioctl.h>
-#include <sys/socket.h>
-#include <sys/sysctl.h>
-#include <sys/time.h>
-#include <sys/nv.h>
-
-#include <net/ethernet.h>
-#include <net/if.h>
-#include <net/if_dl.h>
-#include <net/if_types.h>
-#include <net/if_media.h>
-#include <net/route.h>
-
-#include <netinet/in.h>
-#include <arpa/inet.h>
-
-#include <assert.h>
-#include <ctype.h>
-#include <err.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <inttypes.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <netdb.h>
-#include <string.h>
-#include <unistd.h>
-#include <stdarg.h>
-#include <stddef.h> /* NB: for offsetof */
-#include <locale.h>
-#include <langinfo.h>
-#include <resolv.h>
-
-#include "ifconfig.h"
-
-typedef enum {
- WGC_GET = 0x5,
- WGC_SET = 0x6,
-} wg_cmd_t;
-
-static nvlist_t *nvl_params;
-static bool do_peer;
-static int allowed_ips_count;
-static int allowed_ips_max;
-struct allowedip {
- struct sockaddr_storage a_addr;
- struct sockaddr_storage a_mask;
-};
-struct allowedip *allowed_ips;
-
-#define ALLOWEDIPS_START 16
-#define WG_KEY_LEN 32
-#define WG_KEY_LEN_BASE64 ((((WG_KEY_LEN) + 2) / 3) * 4 + 1)
-#define WG_KEY_LEN_HEX (WG_KEY_LEN * 2 + 1)
-#define WG_MAX_STRLEN 64
-
-static bool
-key_from_base64(uint8_t key[static WG_KEY_LEN], const char *base64)
-{
-
- if (strlen(base64) != WG_KEY_LEN_BASE64 - 1) {
- warnx("bad key len - need %d got %zu\n", WG_KEY_LEN_BASE64 - 1, strlen(base64));
- return false;
- }
- if (base64[WG_KEY_LEN_BASE64 - 2] != '=') {
- warnx("bad key terminator, expected '=' got '%c'", base64[WG_KEY_LEN_BASE64 - 2]);
- return false;
- }
- return (b64_pton(base64, key, WG_KEY_LEN));
-}
-
-static void
-parse_endpoint(const char *endpoint_)
-{
- int err;
- char *base, *endpoint, *port, *colon, *tmp;
- struct addrinfo hints, *res;
-
- endpoint = base = strdup(endpoint_);
- colon = rindex(endpoint, ':');
- if (colon == NULL)
- errx(1, "bad endpoint format %s - no port delimiter found", endpoint);
- *colon = '\0';
- port = colon + 1;
-
- /* [::]:<> */
- if (endpoint[0] == '[') {
- endpoint++;
- tmp = index(endpoint, ']');
- if (tmp == NULL)
- errx(1, "bad endpoint format %s - '[' found with no matching ']'", endpoint);
- *tmp = '\0';
- }
- bzero(&hints, sizeof(hints));
- hints.ai_family = AF_UNSPEC;
- err = getaddrinfo(endpoint, port, &hints, &res);
- if (err)
- errx(1, "%s", gai_strerror(err));
- nvlist_add_binary(nvl_params, "endpoint", res->ai_addr, res->ai_addrlen);
- freeaddrinfo(res);
- free(base);
-}
-
-static void
-in_len2mask(struct in_addr *mask, u_int len)
-{
- u_int i;
- u_char *p;
-
- p = (u_char *)mask;
- memset(mask, 0, sizeof(*mask));
- for (i = 0; i < len / NBBY; i++)
- p[i] = 0xff;
- if (len % NBBY)
- p[i] = (0xff00 >> (len % NBBY)) & 0xff;
-}
-
-static u_int
-in_mask2len(struct in_addr *mask)
-{
- u_int x, y;
- u_char *p;
-
- p = (u_char *)mask;
- for (x = 0; x < sizeof(*mask); x++) {
- if (p[x] != 0xff)
- break;
- }
- y = 0;
- if (x < sizeof(*mask)) {
- for (y = 0; y < NBBY; y++) {
- if ((p[x] & (0x80 >> y)) == 0)
- break;
- }
- }
- return x * NBBY + y;
-}
-
-static void
-in6_prefixlen2mask(struct in6_addr *maskp, int len)
-{
- static const u_char maskarray[NBBY] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};
- int bytelen, bitlen, i;
-
- /* sanity check */
- if (len < 0 || len > 128) {
- errx(1, "in6_prefixlen2mask: invalid prefix length(%d)\n",
- len);
- return;
- }
-
- memset(maskp, 0, sizeof(*maskp));
- bytelen = len / NBBY;
- bitlen = len % NBBY;
- for (i = 0; i < bytelen; i++)
- maskp->s6_addr[i] = 0xff;
- if (bitlen)
- maskp->s6_addr[bytelen] = maskarray[bitlen - 1];
-}
-
-static int
-in6_mask2len(struct in6_addr *mask, u_char *lim0)
-{
- int x = 0, y;
- u_char *lim = lim0, *p;
-
- /* ignore the scope_id part */
- if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask))
- lim = (u_char *)mask + sizeof(*mask);
- for (p = (u_char *)mask; p < lim; x++, p++) {
- if (*p != 0xff)
- break;
- }
- y = 0;
- if (p < lim) {
- for (y = 0; y < NBBY; y++) {
- if ((*p & (0x80 >> y)) == 0)
- break;
- }
- }
-
- /*
- * when the limit pointer is given, do a stricter check on the
- * remaining bits.
- */
- if (p < lim) {
- if (y != 0 && (*p & (0x00ff >> y)) != 0)
- return -1;
- for (p = p + 1; p < lim; p++)
- if (*p != 0)
- return -1;
- }
-
- return x * NBBY + y;
-}
-
-static bool
-parse_ip(struct allowedip *aip, const char *value)
-{
- struct addrinfo hints, *res;
- int err;
-
- bzero(&aip->a_addr, sizeof(aip->a_addr));
- bzero(&hints, sizeof(hints));
- hints.ai_family = AF_UNSPEC;
- hints.ai_flags = AI_NUMERICHOST;
- err = getaddrinfo(value, NULL, &hints, &res);
- if (err)
- errx(1, "%s", gai_strerror(err));
-
- memcpy(&aip->a_addr, res->ai_addr, res->ai_addrlen);
-
- freeaddrinfo(res);
- return (true);
-}
-
-static void
-sa_ntop(const struct sockaddr *sa, char *buf, int *port)
-{
- const struct sockaddr_in *sin;
- const struct sockaddr_in6 *sin6;
- int err;
-
- err = getnameinfo(sa, sa->sa_len, buf, INET6_ADDRSTRLEN, NULL,
- 0, NI_NUMERICHOST);
-
- if (sa->sa_family == AF_INET) {
- sin = (const struct sockaddr_in *)sa;
- if (port)
- *port = sin->sin_port;
- } else if (sa->sa_family == AF_INET6) {
- sin6 = (const struct sockaddr_in6 *)sa;
- if (port)
- *port = sin6->sin6_port;
- }
-
- if (err)
- errx(1, "%s", gai_strerror(err));
-}
-
-static void
-dump_peer(const nvlist_t *nvl_peer)
-{
- const void *key;
- const struct allowedip *aips;
- const struct sockaddr *endpoint;
- char outbuf[WG_MAX_STRLEN];
- char addr_buf[INET6_ADDRSTRLEN];
- size_t size;
- int count, port;
-
- printf("[Peer]\n");
- if (nvlist_exists_binary(nvl_peer, "public-key")) {
- key = nvlist_get_binary(nvl_peer, "public-key", &size);
- b64_ntop((const uint8_t *)key, size, outbuf, WG_MAX_STRLEN);
- printf("PublicKey = %s\n", outbuf);
- }
- if (nvlist_exists_binary(nvl_peer, "endpoint")) {
- endpoint = nvlist_get_binary(nvl_peer, "endpoint", &size);
- sa_ntop(endpoint, addr_buf, &port);
- printf("Endpoint = %s:%d\n", addr_buf, ntohs(port));
- }
-
- if (!nvlist_exists_binary(nvl_peer, "allowed-ips"))
- return;
- aips = nvlist_get_binary(nvl_peer, "allowed-ips", &size);
- if (size == 0 || size % sizeof(struct allowedip) != 0) {
- errx(1, "size %zu not integer multiple of allowedip", size);
- }
- printf("AllowedIPs = ");
- count = size / sizeof(struct allowedip);
- for (int i = 0; i < count; i++) {
- int mask;
- sa_family_t family;
- void *bitmask;
- struct sockaddr *sa;
-
- sa = __DECONST(void *, &aips[i].a_addr);
- bitmask = __DECONST(void *,
- ((const struct sockaddr *)&(&aips[i])->a_mask)->sa_data);
- family = aips[i].a_addr.ss_family;
- getnameinfo(sa, sa->sa_len, addr_buf, INET6_ADDRSTRLEN, NULL,
- 0, NI_NUMERICHOST);
- if (family == AF_INET)
- mask = in_mask2len(bitmask);
- else if (family == AF_INET6)
- mask = in6_mask2len(bitmask, NULL);
- else
- errx(1, "bad family in peer %d\n", family);
- printf("%s/%d", addr_buf, mask);
- if (i < count -1)
- printf(", ");
- }
- printf("\n");
-}
-
-static int
-get_nvl_out_size(int sock, u_long op, size_t *size)
-{
- struct ifdrv ifd;
- int err;
-
- memset(&ifd, 0, sizeof(ifd));
-
- strlcpy(ifd.ifd_name, name, sizeof(ifd.ifd_name));
- ifd.ifd_cmd = op;
- ifd.ifd_len = 0;
- ifd.ifd_data = NULL;
-
- err = ioctl(sock, SIOCGDRVSPEC, &ifd);
- if (err)
- return (err);
- *size = ifd.ifd_len;
- return (0);
-}
-
-static int
-do_cmd(int sock, u_long op, void *arg, size_t argsize, int set)
-{
- struct ifdrv ifd;
-
- memset(&ifd, 0, sizeof(ifd));
-
- strlcpy(ifd.ifd_name, name, sizeof(ifd.ifd_name));
- ifd.ifd_cmd = op;
- ifd.ifd_len = argsize;
- ifd.ifd_data = arg;
-
- return (ioctl(sock, set ? SIOCSDRVSPEC : SIOCGDRVSPEC, &ifd));
-}
-
-static
-DECL_CMD_FUNC(peerlist, val, d)
-{
- size_t size, peercount;
- void *packed;
- const nvlist_t *nvl, *nvl_peer;
- const nvlist_t *const *nvl_peerlist;
-
- if (get_nvl_out_size(s, WGC_GET, &size))
- errx(1, "can't get peer list size");
- if ((packed = malloc(size)) == NULL)
- errx(1, "malloc failed for peer list");
- if (do_cmd(s, WGC_GET, packed, size, 0))
- errx(1, "failed to obtain peer list");
-
- nvl = nvlist_unpack(packed, size, 0);
- if (!nvlist_exists_nvlist_array(nvl, "peer-list"))
- return;
- nvl_peerlist = nvlist_get_nvlist_array(nvl, "peer-list", &peercount);
-
- for (int i = 0; i < peercount; i++, nvl_peerlist++) {
- nvl_peer = *nvl_peerlist;
- dump_peer(nvl_peer);
- }
-}
-
-static void
-peerfinish(int s, void *arg)
-{
- nvlist_t *nvl, **nvl_array;
- void *packed;
- size_t size;
-
- if ((nvl = nvlist_create(0)) == NULL)
- errx(1, "failed to allocate nvlist");
- if ((nvl_array = calloc(sizeof(void *), 1)) == NULL)
- errx(1, "failed to allocate nvl_array");
- if (!nvlist_exists_binary(nvl_params, "public-key"))
- errx(1, "must specify a public-key for adding peer");
- if (!nvlist_exists_binary(nvl_params, "endpoint"))
- errx(1, "must specify an endpoint for adding peer");
- if (allowed_ips_count == 0)
- errx(1, "must specify at least one range of allowed-ips to add a peer");
-
- nvl_array[0] = nvl_params;
- nvlist_add_nvlist_array(nvl, "peer-list", (const nvlist_t * const *)nvl_array, 1);
- packed = nvlist_pack(nvl, &size);
-
- if (do_cmd(s, WGC_SET, packed, size, true))
- errx(1, "failed to install peer");
-}
-
-static
-DECL_CMD_FUNC(peerstart, val, d)
-{
- do_peer = true;
- callback_register(peerfinish, NULL);
- allowed_ips = malloc(ALLOWEDIPS_START * sizeof(struct allowedip));
- allowed_ips_max = ALLOWEDIPS_START;
- if (allowed_ips == NULL)
- errx(1, "failed to allocate array for allowedips");
-}
-
-static
-DECL_CMD_FUNC(setwglistenport, val, d)
-{
- struct addrinfo hints, *res;
- const struct sockaddr_in *sin;
- const struct sockaddr_in6 *sin6;
-
- u_long ul;
- int err;
-
- bzero(&hints, sizeof(hints));
- hints.ai_family = AF_UNSPEC;
- hints.ai_flags = AI_NUMERICHOST;
- err = getaddrinfo(NULL, val, &hints, &res);
- if (err)
- errx(1, "%s", gai_strerror(err));
-
- if (res->ai_family == AF_INET) {
- sin = (struct sockaddr_in *)res->ai_addr;
- ul = sin->sin_port;
- } else if (res->ai_family == AF_INET6) {
- sin6 = (struct sockaddr_in6 *)res->ai_addr;
- ul = sin6->sin6_port;
- } else {
- errx(1, "unknown family");
- }
- ul = ntohs((u_short)ul);
- nvlist_add_number(nvl_params, "listen-port", ul);
-}
-
-static
-DECL_CMD_FUNC(setwgprivkey, val, d)
-{
- uint8_t key[WG_KEY_LEN];
-
- if (!key_from_base64(key, val))
- errx(1, "invalid key %s", val);
- nvlist_add_binary(nvl_params, "private-key", key, WG_KEY_LEN);
-}
-
-static
-DECL_CMD_FUNC(setwgpubkey, val, d)
-{
- uint8_t key[WG_KEY_LEN];
-
- if (!do_peer)
- errx(1, "setting public key only valid when adding peer");
-
- if (!key_from_base64(key, val))
- errx(1, "invalid key %s", val);
- nvlist_add_binary(nvl_params, "public-key", key, WG_KEY_LEN);
-}
-
-static
-DECL_CMD_FUNC(setallowedips, val, d)
-{
- char *base, *allowedip, *mask;
- u_long ul;
- char *endp;
- struct allowedip *aip;
-
- if (!do_peer)
- errx(1, "setting allowed ip only valid when adding peer");
- if (allowed_ips_count == allowed_ips_max) {
- /* XXX grow array */
- }
- aip = &allowed_ips[allowed_ips_count];
- base = allowedip = strdup(val);
- mask = index(allowedip, '/');
- if (mask == NULL)
- errx(1, "mask separator not found in allowedip %s", val);
- *mask = '\0';
- mask++;
- parse_ip(aip, allowedip);
- ul = strtoul(mask, &endp, 0);
- if (*endp != '\0')
- errx(1, "invalid value for allowedip mask");
- bzero(&aip->a_mask, sizeof(aip->a_mask));
- if (aip->a_addr.ss_family == AF_INET)
- in_len2mask((struct in_addr *)&((struct sockaddr *)&aip->a_mask)->sa_data, ul);
- else if (aip->a_addr.ss_family == AF_INET6)
- in6_prefixlen2mask((struct in6_addr *)&((struct sockaddr *)&aip->a_mask)->sa_data, ul);
- else
- errx(1, "invalid address family %d\n", aip->a_addr.ss_family);
- allowed_ips_count++;
- if (allowed_ips_count > 1)
- nvlist_free_binary(nvl_params, "allowed-ips");
- nvlist_add_binary(nvl_params, "allowed-ips", allowed_ips,
- allowed_ips_count*sizeof(*aip));
-
- dump_peer(nvl_params);
- free(base);
-}
-
-static
-DECL_CMD_FUNC(setendpoint, val, d)
-{
- if (!do_peer)
- errx(1, "setting endpoint only valid when adding peer");
- parse_endpoint(val);
-}
-
-static void
-wireguard_status(int s)
-{
- size_t size;
- void *packed;
- nvlist_t *nvl;
- char buf[WG_KEY_LEN_BASE64];
- const void *key;
- uint16_t listen_port;
-
- if (get_nvl_out_size(s, WGC_GET, &size))
- return;
- if ((packed = malloc(size)) == NULL)
- return;
- if (do_cmd(s, WGC_GET, packed, size, 0))
- return;
- nvl = nvlist_unpack(packed, size, 0);
- if (nvlist_exists_number(nvl, "listen-port")) {
- listen_port = nvlist_get_number(nvl, "listen-port");
- printf("\tlisten-port: %d\n", listen_port);
- }
- if (nvlist_exists_binary(nvl, "private-key")) {
- key = nvlist_get_binary(nvl, "private-key", &size);
- b64_ntop((const uint8_t *)key, size, buf, WG_MAX_STRLEN);
- printf("\tprivate-key: %s\n", buf);
- }
- if (nvlist_exists_binary(nvl, "public-key")) {
- key = nvlist_get_binary(nvl, "public-key", &size);
- b64_ntop((const uint8_t *)key, size, buf, WG_MAX_STRLEN);
- printf("\tpublic-key: %s\n", buf);
- }
-}
-
-static struct cmd wireguard_cmds[] = {
- DEF_CLONE_CMD_ARG("listen-port", setwglistenport),
- DEF_CLONE_CMD_ARG("private-key", setwgprivkey),
- DEF_CMD("peer-list", 0, peerlist),
- DEF_CMD("peer", 0, peerstart),
- DEF_CMD_ARG("public-key", setwgpubkey),
- DEF_CMD_ARG("allowed-ips", setallowedips),
- DEF_CMD_ARG("endpoint", setendpoint),
-};
-
-static struct afswtch af_wireguard = {
- .af_name = "af_wireguard",
- .af_af = AF_UNSPEC,
- .af_other_status = wireguard_status,
-};
-
-static void
-wg_create(int s, struct ifreq *ifr)
-{
- struct iovec iov;
- void *packed;
- size_t size;
-
- setproctitle("ifconfig %s create ...\n", name);
- if (!nvlist_exists_number(nvl_params, "listen-port"))
- goto legacy;
- if (!nvlist_exists_binary(nvl_params, "private-key"))
- goto legacy;
-
- packed = nvlist_pack(nvl_params, &size);
- if (packed == NULL)
- errx(1, "failed to setup create request");
- iov.iov_len = size;
- iov.iov_base = packed;
- ifr->ifr_data = (caddr_t)&iov;
- if (ioctl(s, SIOCIFCREATE2, ifr) < 0)
- err(1, "SIOCIFCREATE2");
- return;
-legacy:
-	ifr->ifr_data = NULL;
- if (ioctl(s, SIOCIFCREATE, ifr) < 0)
- err(1, "SIOCIFCREATE");
-}
-
-static __constructor void
-wireguard_ctor(void)
-{
- int i;
-
- nvl_params = nvlist_create(0);
- for (i = 0; i < nitems(wireguard_cmds); i++)
- cmd_register(&wireguard_cmds[i]);
- af_register(&af_wireguard);
- clone_setdefcallback_prefix("wg", wg_create);
-}
-
-#endif
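
The removed ifwg.c above drives all of its configuration through packed nvlists exchanged over SIOCGDRVSPEC/SIOCSDRVSPEC (see get_nvl_out_size() and do_cmd()). For readers unfamiliar with that pattern, the following is a minimal standalone sketch of the read side of the exchange; it is not part of this commit. The WGC_GET value comes from the wg_cmd_t enum above, while the interface name "wg0", the file name, and the standalone-program framing are assumptions. Build with something like: cc wg_get.c -lnv

/*
 * Illustrative sketch only (not part of this commit): query the removed
 * if_wg(4) driver the same way peerlist()/wireguard_status() above did.
 */
#include <sys/param.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/nv.h>

#include <net/if.h>

#include <err.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define WGC_GET 0x5                     /* matches the wg_cmd_t enum above */

int
main(void)
{
    struct ifdrv ifd;
    nvlist_t *nvl;
    void *buf;
    int s;

    if ((s = socket(AF_INET, SOCK_DGRAM, 0)) == -1)
        err(1, "socket");

    /* First pass, NULL buffer: the driver reports the packed size in ifd_len. */
    memset(&ifd, 0, sizeof(ifd));
    strlcpy(ifd.ifd_name, "wg0", sizeof(ifd.ifd_name));    /* assumed name */
    ifd.ifd_cmd = WGC_GET;
    if (ioctl(s, SIOCGDRVSPEC, &ifd) == -1)
        err(1, "SIOCGDRVSPEC (size probe)");

    /* Second pass: fetch the packed nvlist and unpack it. */
    if ((buf = malloc(ifd.ifd_len)) == NULL)
        err(1, "malloc");
    ifd.ifd_data = buf;
    if (ioctl(s, SIOCGDRVSPEC, &ifd) == -1)
        err(1, "SIOCGDRVSPEC (fetch)");
    if ((nvl = nvlist_unpack(buf, ifd.ifd_len, 0)) == NULL)
        errx(1, "nvlist_unpack failed");

    if (nvlist_exists_number(nvl, "listen-port"))
        printf("listen-port: %ju\n",
            (uintmax_t)nvlist_get_number(nvl, "listen-port"));

    nvlist_destroy(nvl);
    free(buf);
    close(s);
    return (0);
}

The first ioctl with a NULL buffer mirrors get_nvl_out_size(): the driver returns the number of bytes the packed nvlist needs, and the second call fetches the data for unpacking.
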
diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
index b66dcf135733..ffc7a08292e9 100644
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -583,7 +583,6 @@ MAN= aac.4 \
vtnet.4 \
watchdog.4 \
${_wbwd.4} \
- wg.4 \
witness.4 \
wlan.4 \
wlan_acl.4 \
diff --git a/share/man/man4/wg.4 b/share/man/man4/wg.4
deleted file mode 100644
index 760584e3a386..000000000000
--- a/share/man/man4/wg.4
+++ /dev/null
@@ -1,255 +0,0 @@
-.\" Copyright (c) 2020 Gordon Bergling <gbe@FreeBSD.org>
-.\"
-.\" Redistribution and use in source and binary forms, with or without
-.\" modification, are permitted provided that the following conditions
-.\" are met:
-.\" 1. Redistributions of source code must retain the above copyright
-.\" notice, this list of conditions and the following disclaimer.
-.\" 2. Redistributions in binary form must reproduce the above copyright
-.\" notice, this list of conditions and the following disclaimer in the
-.\" documentation and/or other materials provided with the distribution.
-.\"
-.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
-.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
-.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-.\" SUCH DAMAGE.
-.\"
-.\" $FreeBSD$
-.\"
-.Dd March 7, 2021
-.Dt WG 4
-.Os
-.Sh NAME
-.Nm wg
-.Nd "WireGuard - pseudo-device"
-.Sh SYNOPSIS
-To load the driver as a module at boot time, place the following line in
-.Xr loader.conf 5 :
-.Bd -literal -offset indent
-if_wg_load="YES"
-.Ed
-.Sh DESCRIPTION
-The
-.Nm
-driver provides Virtual Private Network (VPN) interfaces for the secure
-exchange of layer 3 traffic with other WireGuard peers using the WireGuard
-protocol.
-.Pp
-A
-.Nm
-interface recognises one or more peers, establishes a secure tunnel with
-each on demand, and tracks each peer's UDP endpoint for exchanging encrypted
-traffic with.
-.Pp
-The interfaces can be created at runtime using the
-.Ic ifconfig Cm wg Ns Ar N Cm create
-command.
-The interface itself can be configured with
-.Xr ifconfig 8 .
-.Pp
-The following parameters are available:
-.Bl -tag -width indent
-.It Cm listen-port
-The listening port of the
-.Nm
-interface.
-.It Cm public-key
-The public key of the
-.Nm
-interface.
-.It Cm private-key
-The private key of the
-.Nm
-interface.
-.It Cm pre-shared-key
-Defines a pre-shared key for the
-.Nm
-interface.
-.It Cm allowed-ips
-A list of allowed IP addresses.
-.It Cm endpoint
-The IP address and port of the WireGuard peer to connect to.
-.It Cm peer-list
-The list of peers configured on the interface.
-.El
-.Pp
-The
-.Nm
-interfaces support the following
-.Xr ioctl 2 Ns s :
-.Bl -tag -width Ds -offset indent
-.It Dv SIOCSWG Fa "struct wg_device_io *"
-Set the device configuration.
-.It Dv SIOCGWG Fa "struct wg_device_io *"
-Get the device configuration.
-.El
-.Pp
-The following glossary provides a brief overview of WireGuard
-terminology:
-.Bl -tag -width indent -offset 3n
-.It Peer
-Peers exchange IPv4 or IPv6 traffic over secure tunnels.
-Each
-.Nm
-interface may be configured to recognise one or more peers.
-.It Key
-Each peer uses its private key and corresponding public key to
-identify itself to others.
-A peer configures a
-.Nm
-interface with its own private key and with the public keys of its peers.
-.It Pre-shared key
-In addition to the public keys, each peer pair may be configured with a
-unique pre-shared symmetric key.
-This is used in their handshake to guard against future compromise of the
-peers' encrypted tunnel if a quantum-computational attack on their
-Diffie-Hellman exchange becomes feasible.
-It is optional, but recommended.
-.It Allowed IPs
-A single
-.Nm
-interface may maintain concurrent tunnels connecting diverse networks.
-The interface therefore implements rudimentary routing and reverse-path
-filtering functions for its tunneled traffic.
-These functions reference a set of allowed IP ranges configured against
-each peer.
-.Pp
-The interface will route outbound tunneled traffic to the peer configured
-with the most specific matching allowed IP address range, or drop it
-if no such match exists.
-.Pp
-The interface will accept tunneled traffic only from the peer
-configured with the most specific matching allowed IP address range
-for the incoming traffic, or drop it if no such match exists.
-That is, tunneled traffic routed to a given peer cannot return through
-another peer of the same
-.Nm
-interface.
-This ensures that peers cannot spoof one another's traffic.
-.It Handshake
-Two peers handshake to mutually authenticate each other and to
-establish a shared series of secret ephemeral encryption keys.
-Any peer may initiate a handshake.
-Handshakes occur only when there is traffic to send, and recur every
-two minutes during transfers.
-.It Connectionless
-Due to the handshake behavior, there is no connected or disconnected
-state.
-.El
-.Ss Keys
-Private keys for WireGuard can be generated from any sufficiently
-secure random source.
-The Curve25519 keys and the pre-shared keys are both 32 bytes
-long and are commonly encoded in base64 for ease of use.
-.Pp
-Keys can be generated with
-.Xr openssl 1
-as follows:
-.Pp
-.Dl $ openssl rand -base64 32
-.Pp
-Although a valid Curve25519 key must have 5 bits set to
-specific values, the interface performs this clamping itself and so
-will accept any random 32-byte value encoded in base64.
-.Pp
-When an interface has a private key set with
-.Cm private-key ,
-the corresponding
-public key is shown in the status output of the interface:
-.Bd -literal -offset indent
-# ifconfig wg0 | grep public-key
- public-key: 7lWtsDdqaGB3EY9WNxRN3hVaHMtu1zXw71+bOjNOVUw=
-.Ed
-.Sh EXAMPLES
-Create a
-.Nm
-interface and set a random private key.
-.Bd -literal -offset indent
-# ifconfig wg0 create listen-port 54321 private-key `openssl rand -base64 32`
-.Ed
-.Pp
-Retrieve the associated public key from a
-.Nm
-interface.
-.Bd -literal -offset indent
-$ ifconfig wg0 | awk '/public-key/ { print $2 }'
-.Ed
-.Pp
-Connect to a specific endpoint using its public-key and set the allowed IP address.
-.Bd -literal -offset indent
-# ifconfig wg0 peer public-key '7lWtsDdqaGB3EY9WNxRN3hVaHMtu1zXw71+bOjNOVUw=' endpoint 10.0.1.100:54321 allowed-ips 192.168.2.100/32
-.Ed
-.Sh DIAGNOSTICS
-The
-.Nm
-interface supports runtime debugging, which can be enabled with:
-.Pp
-.D1 Ic ifconfig Cm wg Ns Ar N Cm debug
-.Pp
-Some common error messages include:
-.Bl -diag
-.It "Handshake for peer X did not complete after 5 seconds, retrying"
-Peer X did not reply to our initiation packet, for example because:
-.Bl -bullet
-.It
-The peer does not have the local interface configured as a peer.
-Peers must be able to mutually authenticate each other.
-.It
-The peer endpoint IP address is incorrectly configured.
-.It
-There are firewall rules preventing communication between hosts.
-.El
-.It "Invalid handshake initiation"
-The incoming handshake packet could not be processed.
-This is likely due to the local interface not containing
-the correct public key for the peer.
-.It "Invalid initiation MAC"
-The incoming handshake initiation packet had an invalid MAC.
-This is likely because the initiation sender has the wrong public key
-for the handshake receiver.
-.It "Packet has unallowed src IP from peer X"
-After decryption, an incoming data packet has a source IP address that
-is not assigned to the allowed IPs of Peer X.
-.El
-.Sh SEE ALSO
-.Xr inet 4 ,
-.Xr ip 4 ,
-.Xr netintro 4 ,
-.Xr ipf 5 ,
-.Xr pf.conf 5 ,
-.Xr ifconfig 8 ,
-.Xr ipfw 8
-.Rs
-.%T WireGuard whitepaper
-.%U https://www.wireguard.com/papers/wireguard.pdf
-.Re
-.Sh HISTORY
-The
-.Nm
-device driver first appeared in
-.Fx 13.0 .
-.Sh AUTHORS
-The
-.Nm
-device driver was originally written for
-.Ox
-by
-.An Matt Dunwoodie Aq Mt ncon@nconroy.net
-and ported to
-.Fx
-by
-.An Matt Macy Aq Mt mmacy@FreeBSD.org .
-.Pp
-This manual page was written by
-.An Gordon Bergling Aq Mt gbe@FreeBSD.org
-and is based on the
-.Ox
-manual page written by
-.An David Gwynne Aq Mt dlg@openbsd.org .
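
The Keys subsection of the removed manual page notes that five bits of a Curve25519 private key must hold specific values and that the interface forces them itself, which is why any random base64-encoded 32-byte string is accepted. The operation it refers to is the clamping performed by curve25519_clamp_secret() in the crypto/curve25519.h header removed further below; a self-contained sketch of that transformation (illustrative only, not part of this commit) is:

#include <stdint.h>

#define CURVE25519_KEY_SIZE 32

/*
 * Clamp a 32-byte random buffer into a valid Curve25519 private key:
 * clear the low three bits of the first byte, clear the top bit of the
 * last byte, and set its second-highest bit -- five bits in total.
 */
static void
clamp_secret(uint8_t secret[CURVE25519_KEY_SIZE])
{
    secret[0] &= 248;
    secret[31] = (secret[31] & 127) | 64;
}

Because the driver normalises the key this way before use, the openssl rand -base64 32 recipe in the manual page is sufficient for key generation.
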
diff --git a/sys/dev/if_wg/include/crypto/blake2s.h b/sys/dev/if_wg/include/crypto/blake2s.h
deleted file mode 100644
index 17e6447ebcd8..000000000000
--- a/sys/dev/if_wg/include/crypto/blake2s.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <sys/types.h>
-
-#ifndef _BLAKE2S_H_
-#define _BLAKE2S_H_
-
-
-enum blake2s_lengths {
- BLAKE2S_BLOCK_SIZE = 64,
- BLAKE2S_HASH_SIZE = 32,
- BLAKE2S_KEY_SIZE = 32
-};
-
-struct blake2s_state {
- uint32_t h[8];
- uint32_t t[2];
- uint32_t f[2];
- uint8_t buf[BLAKE2S_BLOCK_SIZE];
- size_t buflen;
- uint8_t last_node;
-};
-
-void blake2s_init(struct blake2s_state *state, const size_t outlen);
-void blake2s_init_key(struct blake2s_state *state, const size_t outlen,
- const void *key, const size_t keylen);
-void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen);
-void blake2s_final(struct blake2s_state *state, uint8_t *out, const size_t outlen);
-
-static inline void blake2s(uint8_t *out, const uint8_t *in, const uint8_t *key,
- const size_t outlen, const size_t inlen,
- const size_t keylen)
-{
- struct blake2s_state state;
-#ifdef __linux___
- WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen ||
- outlen > BLAKE2S_HASH_SIZE || keylen > BLAKE2S_KEY_SIZE ||
- (!key && keylen)));
-#endif
-
- if (keylen)
- blake2s_init_key(&state, outlen, key, keylen);
- else
- blake2s_init(&state, outlen);
-
- blake2s_update(&state, in, inlen);
- blake2s_final(&state, out, outlen);
-}
-
-void blake2s_hmac(uint8_t *out, const uint8_t *in, const uint8_t *key,
- const size_t outlen, const size_t inlen, const size_t keylen);
-
-#endif /* _BLAKE2S_H_ */
diff --git a/sys/dev/if_wg/include/crypto/curve25519.h b/sys/dev/if_wg/include/crypto/curve25519.h
deleted file mode 100644
index 3e90d1b270fe..000000000000
--- a/sys/dev/if_wg/include/crypto/curve25519.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate)
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _CURVE25519_H_
-#define _CURVE25519_H_
-
-#include <sys/systm.h>
-
-#define CURVE25519_KEY_SIZE 32
-
-void curve25519_generic(u8 [CURVE25519_KEY_SIZE],
- const u8 [CURVE25519_KEY_SIZE],
- const u8 [CURVE25519_KEY_SIZE]);
-
-static inline void curve25519_clamp_secret(u8 secret[CURVE25519_KEY_SIZE])
-{
- secret[0] &= 248;
- secret[31] = (secret[31] & 127) | 64;
-}
-
-static const u8 null_point[CURVE25519_KEY_SIZE] = { 0 };
-
-static inline int curve25519(u8 mypublic[CURVE25519_KEY_SIZE],
- const u8 secret[CURVE25519_KEY_SIZE],
- const u8 basepoint[CURVE25519_KEY_SIZE])
-{
- curve25519_generic(mypublic, secret, basepoint);
- return timingsafe_bcmp(mypublic, null_point, CURVE25519_KEY_SIZE);
-}
-
-static inline int curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE],
- const u8 secret[CURVE25519_KEY_SIZE])
-{
- static const u8 basepoint[CURVE25519_KEY_SIZE] __aligned(32) = { 9 };
-
- if (timingsafe_bcmp(secret, null_point, CURVE25519_KEY_SIZE) == 0)
- return 0;
-
- return curve25519(pub, secret, basepoint);
-}
-
-static inline void curve25519_generate_secret(u8 secret[CURVE25519_KEY_SIZE])
-{
- arc4random_buf(secret, CURVE25519_KEY_SIZE);
- curve25519_clamp_secret(secret);
-}
-
-#endif /* _CURVE25519_H_ */
diff --git a/sys/dev/if_wg/include/crypto/zinc.h b/sys/dev/if_wg/include/crypto/zinc.h
deleted file mode 100644
index 9aa1e8d59bf5..000000000000
--- a/sys/dev/if_wg/include/crypto/zinc.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#ifndef _WG_ZINC_H
-#define _WG_ZINC_H
-
-int chacha20_mod_init(void);
-int poly1305_mod_init(void);
-int chacha20poly1305_mod_init(void);
-int blake2s_mod_init(void);
-int curve25519_mod_init(void);
-
-#endif
diff --git a/sys/dev/if_wg/include/sys/if_wg_session.h b/sys/dev/if_wg/include/sys/if_wg_session.h
deleted file mode 100644
index 45399e534364..000000000000
--- a/sys/dev/if_wg/include/sys/if_wg_session.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2019 Matt Dunwoodie <ncon@noconroy.net>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * $FreeBSD$
- */
-
-#ifndef __IF_WG_H__
-#define __IF_WG_H__
-
-#include <net/if.h>
-#include <netinet/in.h>
-
-/*
- * This is the public interface to the WireGuard network interface.
- *
- * It is designed to be used by tools such as ifconfig(8) and wg(4).
- */
-
-#define WG_KEY_SIZE 32
-
-#define WG_DEVICE_HAS_PUBKEY (1 << 0)
-#define WG_DEVICE_HAS_PRIVKEY (1 << 1)
-#define WG_DEVICE_HAS_MASKED_PRIVKEY (1 << 2)
-#define WG_DEVICE_HAS_PORT (1 << 3)
-#define WG_DEVICE_HAS_RDOMAIN (1 << 4)
-#define WG_DEVICE_REPLACE_PEERS (1 << 5)
-
-#define WG_PEER_HAS_PUBKEY (1 << 0)
-#define WG_PEER_HAS_SHAREDKEY (1 << 1)
-#define WG_PEER_HAS_MASKED_SHAREDKEY (1 << 2)
-#define WG_PEER_HAS_ENDPOINT (1 << 3)
-#define WG_PEER_HAS_PERSISTENTKEEPALIVE (1 << 4)
-#define WG_PEER_REPLACE_CIDRS (1 << 5)
-#define WG_PEER_REMOVE (1 << 6)
-
-#define SIOCSWG _IOWR('i', 200, struct wg_device_io)
-#define SIOCGWG _IOWR('i', 201, struct wg_device_io)
-
-#define WG_PEERS_FOREACH(p, d) \
- for (p = (d)->d_peers; p < (d)->d_peers + (d)->d_num_peers; p++)
-#define WG_CIDRS_FOREACH(c, p) \
- for (c = (p)->p_cidrs; c < (p)->p_cidrs + (p)->p_num_cidrs; c++)
-
-struct wg_allowedip {
- struct sockaddr_storage a_addr;
- struct sockaddr_storage a_mask;
-};
-
-enum {
- WG_PEER_CTR_TX_BYTES,
- WG_PEER_CTR_RX_BYTES,
- WG_PEER_CTR_NUM,
-};
-
-struct wg_device_io {
- char d_name[IFNAMSIZ];
- uint8_t d_flags;
- in_port_t d_port;
- int d_rdomain;
- uint8_t d_pubkey[WG_KEY_SIZE];
- uint8_t d_privkey[WG_KEY_SIZE];
- size_t d_num_peers;
- size_t d_num_cidrs;
- struct wg_peer_io *d_peers;
-};
-
-
-#ifndef ENOKEY
-#define ENOKEY ENOTCAPABLE
-#endif
-
-typedef enum {
- WGC_GET = 0x5,
- WGC_SET = 0x6,
-} wg_cmd_t;
-
-#endif /* __IF_WG_H__ */
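
The wg.4 page above documents the SIOCSWG and SIOCGWG ioctls, and this header carries their declarations. Purely as a shape-of-the-call illustration (the driver's handling of these requests is not shown in this section, and ifwg.c above used the SIOCGDRVSPEC nvlist path instead), a hypothetical caller might look like the sketch below; the interface name and the assumption that d_flags reports which fields were filled are mine, not documented driver behaviour.

/*
 * Hypothetical caller, illustrative only; assumes the removed
 * if_wg_session.h is reachable as <sys/if_wg_session.h>.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>

#include <err.h>
#include <stdio.h>
#include <string.h>

#include <sys/if_wg_session.h>      /* SIOCGWG, struct wg_device_io */

static void
wg_query(int s, const char *ifname)
{
    struct wg_device_io dev;

    memset(&dev, 0, sizeof(dev));
    strlcpy(dev.d_name, ifname, sizeof(dev.d_name));
    /* d_peers left NULL: only the fixed-size device fields are requested. */
    if (ioctl(s, SIOCGWG, &dev) == -1)
        err(1, "SIOCGWG");
    if (dev.d_flags & WG_DEVICE_HAS_PORT)
        printf("%s: listen port %u\n", ifname, (unsigned)dev.d_port);
}
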
diff --git a/sys/dev/if_wg/include/sys/if_wg_session_vars.h b/sys/dev/if_wg/include/sys/if_wg_session_vars.h
deleted file mode 100644
index 5fd85d3b7162..000000000000
--- a/sys/dev/if_wg/include/sys/if_wg_session_vars.h
+++ /dev/null
@@ -1,319 +0,0 @@
-/*
- * Copyright (c) 2019 Matt Dunwoodie <ncon@noconroy.net>
- * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate)
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * $FreeBSD$
- */
-
-#ifndef _IF_WG_VARS_H_
-#define _IF_WG_VARS_H_
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <crypto/siphash/siphash.h>
-
-
-#include <net/if.h>
-#include <net/if_var.h>
-#include <net/if_types.h>
-#include <net/ethernet.h>
-#include <net/pfvar.h>
-#include <net/iflib.h>
-
-#include <sys/wg_noise.h>
-#include <sys/wg_cookie.h>
-/* This is only needed for wg_keypair. */
-#include <sys/if_wg_session.h>
-
-#define UNIMPLEMENTED() panic("%s not implemented\n", __func__)
-
-#define WG_KEY_SIZE 32
-#define WG_MSG_PADDING_SIZE 16
-
-
-/* Constant for session */
-#define REKEY_TIMEOUT 5
-#define REKEY_TIMEOUT_JITTER 500 /* TODO ok? jason */
-#define REJECT_AFTER_TIME 180
-#define KEEPALIVE_TIMEOUT 10
-#define MAX_TIMER_HANDSHAKES (90 / REKEY_TIMEOUT)
-#define NEW_HANDSHAKE_TIMEOUT (REKEY_TIMEOUT + KEEPALIVE_TIMEOUT)
-
-#define MAX_QUEUED_INCOMING_HANDSHAKES 4096 /* TODO: replace this with DQL */
-#define MAX_QUEUED_PACKETS 1024 /* TODO: replace this with DQL */
-
-#define HASHTABLE_PEER_SIZE (1 << 6) //1 << 11
-#define HASHTABLE_INDEX_SIZE (HASHTABLE_PEER_SIZE * 3) //1 << 13
-
-#define PEER_MAGIC1 0xCAFEBABEB00FDADDULL
-#define PEER_MAGIC2 0xCAAFD0D0D00DBABEULL
-#define PEER_MAGIC3 0xD00DBABEF00DFADEULL
-
-
-enum message_type {
- MESSAGE_INVALID = 0,
- MESSAGE_HANDSHAKE_INITIATION = 1,
- MESSAGE_HANDSHAKE_RESPONSE = 2,
- MESSAGE_HANDSHAKE_COOKIE = 3,
- MESSAGE_DATA = 4
-};
-
-struct wg_softc;
-
-#if __FreeBSD_version > 1300000
-typedef void timeout_t (void *);
-#endif
-
-/* Socket */
-struct wg_endpoint {
- union wg_remote {
- struct sockaddr r_sa;
- struct sockaddr_in r_sin;
- struct sockaddr_in6 r_sin6;
- } e_remote;
- union wg_source {
- struct in_addr l_in;
- struct in6_pktinfo l_pktinfo6;
-#define l_in6 l_pktinfo6.ipi6_addr
- } e_local;
-};
-
-struct wg_socket {
- struct mtx so_mtx;
- in_port_t so_port;
- struct socket *so_so4;
- struct socket *so_so6;
-};
-
-struct wg_queue {
- struct mtx q_mtx;
- struct mbufq q;
-};
-
-struct wg_index {
- LIST_ENTRY(wg_index) i_entry;
- SLIST_ENTRY(wg_index) i_unused_entry;
- uint32_t i_key;
- struct noise_remote *i_value;
-};
-
-struct wg_timers {
- /* t_lock is for blocking wg_timers_event_* when setting t_disabled. */
- struct rwlock t_lock;
-
- int t_disabled;
- int t_need_another_keepalive;
- uint16_t t_persistent_keepalive_interval;
- struct callout t_new_handshake;
- struct callout t_send_keepalive;
- struct callout t_retry_handshake;
- struct callout t_zero_key_material;
- struct callout t_persistent_keepalive;
-
- struct mtx t_handshake_mtx;
- struct timespec t_handshake_last_sent;
- struct timespec t_handshake_complete;
- volatile int t_handshake_retries;
-
-};
-
-struct wg_peer {
- uint64_t p_magic_1;
- CK_LIST_ENTRY(wg_peer) p_hash_entry;
- CK_LIST_ENTRY(wg_peer) p_entry;
- uint64_t p_id;
- struct wg_softc *p_sc;
-
- struct noise_remote p_remote;
- struct cookie_maker p_cookie;
- struct wg_timers p_timers;
-
- struct rwlock p_endpoint_lock;
- struct wg_endpoint p_endpoint;
-
- uint64_t p_magic_2;
-
- SLIST_HEAD(,wg_index) p_unused_index;
- struct wg_index p_index[3];
-
- struct wg_queue p_encap_queue;
- struct wg_queue p_decap_queue;
-
- struct grouptask p_clear_secrets;
- struct grouptask p_send_initiation;
- struct grouptask p_send_keepalive;
- struct grouptask p_send;
- struct grouptask p_recv;
-
- counter_u64_t p_tx_bytes;
- counter_u64_t p_rx_bytes;
-
- CK_LIST_HEAD(, wg_route) p_routes;
- uint64_t p_magic_3;
- struct mtx p_lock;
- struct epoch_context p_ctx;
-};
-
-
-
-/* Packet */
-
-void wg_softc_decrypt(struct wg_softc *);
-void wg_softc_encrypt(struct wg_softc *);
-
-/* Queue */
-void wg_queue_init(struct wg_queue *, const char *);
-void wg_queue_deinit(struct wg_queue *);
-
-/* Counter */
-
-/* Timers */
-
-/* Route */
-enum route_direction {
- IN,
- OUT,
-};
-
-struct wg_route_table {
- size_t t_count;
- struct radix_node_head *t_ip;
- struct radix_node_head *t_ip6;
-};
-struct wg_peer;
-
-struct wg_route {
- struct radix_node r_nodes[2];
- struct wg_allowedip r_cidr;
- CK_LIST_ENTRY(wg_route) r_entry;
- struct wg_peer *r_peer;
-};
-
-
-int wg_route_add(struct wg_route_table *, struct wg_peer *,
- const struct wg_allowedip *);
-int wg_route_delete(struct wg_route_table *, struct wg_peer *);
-
-/* Noise */
-
-/*
- * Peer
- *
- *
- *
- */
-
-struct wg_softc;
-
-struct wg_hashtable {
- struct mtx h_mtx;
- SIPHASH_KEY h_secret;
- CK_LIST_HEAD(, wg_peer) h_peers_list;
- CK_LIST_HEAD(, wg_peer) *h_peers;
- u_long h_peers_mask;
- size_t h_num_peers;
- LIST_HEAD(, noise_keypair) *h_keys;
- u_long h_keys_mask;
- size_t h_num_keys;
-};
-
-/* Softc */
-struct wg_softc {
- if_softc_ctx_t shared;
- if_ctx_t wg_ctx;
- struct ifnet *sc_ifp;
- uint16_t sc_incoming_port;
- uint32_t sc_user_cookie;
-
- struct wg_socket sc_socket;
- struct wg_hashtable sc_hashtable;
- struct wg_route_table sc_routes;
-
- struct mbufq sc_handshake_queue;
- struct grouptask sc_handshake;
-
- struct noise_local sc_local;
- struct cookie_checker sc_cookie;
-
- struct buf_ring *sc_encap_ring;
- struct buf_ring *sc_decap_ring;
-
- struct grouptask *sc_encrypt;
- struct grouptask *sc_decrypt;
-
- struct rwlock sc_index_lock;
- LIST_HEAD(,wg_index) *sc_index;
- u_long sc_index_mask;
-
- struct mtx sc_mtx;
-};
-
-struct wg_tag {
- struct m_tag wt_tag;
- struct wg_endpoint t_endpoint;
- struct wg_peer *t_peer;
- struct mbuf *t_mbuf;
- sa_family_t t_family;
- int t_done;
- int t_mtu;
-};
-
-struct wg_peer *wg_route_lookup(struct wg_route_table *, struct mbuf *,
- enum route_direction);
-
-void wg_peer_remove_all(struct wg_softc *);
-struct wg_peer *wg_peer_alloc(struct wg_softc *);
-void wg_peer_destroy(struct wg_peer *);
-
-void wg_hashtable_init(struct wg_hashtable *);
-void wg_hashtable_destroy(struct wg_hashtable *);
-void wg_hashtable_peer_insert(struct wg_hashtable *, struct wg_peer *);
-struct wg_peer *wg_peer_lookup(struct wg_softc *,
- const uint8_t [WG_KEY_SIZE]);
-void wg_hashtable_peer_remove(struct wg_hashtable *, struct wg_peer *);
-
-
-int wg_queue_out(struct wg_peer *peer, struct mbuf *m);
-
-int wg_route_init(struct wg_route_table *);
-void wg_route_destroy(struct wg_route_table *);
-
-int wg_socket_init(struct wg_softc *sc);
-void wg_socket_reinit(struct wg_softc *, struct socket *so4,
- struct socket *so6);
-int wg_socket_close(struct wg_socket *so);
-
-void wg_softc_handshake_receive(struct wg_softc *sc);
-
-int wg_timers_get_persistent_keepalive(struct wg_timers *, uint16_t *);
-void wg_timers_set_persistent_keepalive(struct wg_timers *t, uint16_t);
-void wg_timers_get_last_handshake(struct wg_timers *, struct timespec *);
-
-
-struct noise_remote *wg_remote_get(struct wg_softc *, uint8_t [NOISE_KEY_SIZE]);
-uint32_t wg_index_set(struct wg_softc *, struct noise_remote *);
-struct noise_remote *wg_index_get(struct wg_softc *, uint32_t);
-void wg_index_drop(struct wg_softc *, uint32_t);
-void wg_encrypt_dispatch(struct wg_softc *);
-void wg_decrypt_dispatch(struct wg_softc *);
-
-struct wg_tag *wg_tag_get(struct mbuf *m);
-
-
-#endif /* _IF_WG_VARS_H_ */
diff --git a/sys/dev/if_wg/include/sys/simd-x86_64.h b/sys/dev/if_wg/include/sys/simd-x86_64.h
deleted file mode 100644
index 1453083aa273..000000000000
--- a/sys/dev/if_wg/include/sys/simd-x86_64.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate)
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _SIMD_X86_64_H_
-#define _SIMD_X86_64_H_
-
-
-#include <x86/x86_var.h>
-#include <x86/specialreg.h>
-
-static inline uint64_t
-xgetbv(uint32_t index)
-{
- uint32_t eax, edx;
- /* xgetbv - instruction byte code */
- __asm__ __volatile__(".byte 0x0f; .byte 0x01; .byte 0xd0"
- : "=a" (eax), "=d" (edx)
- : "c" (index));
-
- return ((((uint64_t)edx)<<32) | (uint64_t)eax);
-}
-
-
-/*
- * Detect register set support
- */
-static inline boolean_t
-__simd_state_enabled(const uint64_t state)
-{
- boolean_t has_osxsave;
- uint64_t xcr0;
-
- has_osxsave = !!(cpu_feature2 & CPUID2_OSXSAVE);
-
- if (!has_osxsave)
- return (0);
-
- xcr0 = xgetbv(0);
- return ((xcr0 & state) == state);
-}
-
-#define _XSTATE_SSE_AVX (0x2 | 0x4)
-#define _XSTATE_AVX512 (0xE0 | _XSTATE_SSE_AVX)
-
-#define __ymm_enabled() __simd_state_enabled(_XSTATE_SSE_AVX)
-#define __zmm_enabled() __simd_state_enabled(_XSTATE_AVX512)
-#endif
-
diff --git a/sys/dev/if_wg/include/sys/support.h b/sys/dev/if_wg/include/sys/support.h
deleted file mode 100644
index 7874fd9b1524..000000000000
--- a/sys/dev/if_wg/include/sys/support.h
+++ /dev/null
@@ -1,342 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate)
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef SYS_SUPPORT_H_
-#define SYS_SUPPORT_H_
-#ifdef __LOCORE
-#include <machine/asm.h>
-#define SYM_FUNC_START ENTRY
-#define SYM_FUNC_END END
-
-#else
-#include <sys/types.h>
-#include <sys/limits.h>
-#include <sys/endian.h>
-#include <sys/libkern.h>
-#include <sys/malloc.h>
-#include <sys/proc.h>
-#include <sys/lock.h>
-#include <vm/uma.h>
-
-#if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
-#include <machine/fpu.h>
-#endif
-#include <crypto/siphash/siphash.h>
-
-
-#define COMPAT_ZINC_IS_A_MODULE
-MALLOC_DECLARE(M_WG);
-
-#define BUILD_BUG_ON(x) CTASSERT(!(x))
-
-#define BIT(nr) (1UL << (nr))
-#define BIT_ULL(nr) (1ULL << (nr))
-#ifdef __LP64__
-#define BITS_PER_LONG 64
-#else
-#define BITS_PER_LONG 32
-#endif
-
-#define rw_enter_write rw_wlock
-#define rw_exit_write rw_wunlock
-#define rw_enter_read rw_rlock
-#define rw_exit_read rw_runlock
-#define rw_exit rw_unlock
-
-#define ASSERT(x) MPASS(x)
-
-#define ___PASTE(a,b) a##b
-#define __PASTE(a,b) ___PASTE(a,b)
-#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
-
-#define typeof(x) __typeof__(x)
-
-
-#define min_t(t, a, b) ({ t __a = (a); t __b = (b); __a > __b ? __b : __a; })
-
-typedef uint8_t u8;
-typedef uint16_t u16;
-typedef uint32_t u32;
-typedef uint32_t __le32;
-typedef uint64_t u64;
-typedef uint64_t __le64;
-
-#define __must_check __attribute__((__warn_unused_result__))
-#define asmlinkage
-#define __ro_after_init __read_mostly
-
-#define get_unaligned_le32(x) le32dec(x)
-#define get_unaligned_le64(x) le64dec(x)
-
-#define cpu_to_le64(x) htole64(x)
-#define cpu_to_le32(x) htole32(x)
-#define letoh64(x) le64toh(x)
-
-#define need_resched() \
- ((curthread->td_flags & (TDF_NEEDRESCHED|TDF_ASTPENDING)) || \
- curthread->td_owepreempt)
-
-
-#define CONTAINER_OF(a, b, c) __containerof((a), b, c)
-
-typedef struct {
- uint64_t k0;
- uint64_t k1;
-} SIPHASH_KEY;
-
-static inline uint64_t
-siphash24(const SIPHASH_KEY *key, const void *src, size_t len)
-{
- SIPHASH_CTX ctx;
-
- return (SipHashX(&ctx, 2, 4, (const uint8_t *)key, src, len));
-}
-
-static inline void
-put_unaligned_le32(u32 val, void *p)
-{
- *((__le32 *)p) = cpu_to_le32(val);
-}
-
-
-#define rol32(i32, n) ((i32) << (n) | (i32) >> (32 - (n)))
-
-#define memzero_explicit(p, s) explicit_bzero(p, s)
-
-#define EXPORT_SYMBOL(x)
-
-#define U32_MAX ((u32)~0U)
-#if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
-#define kfpu_begin(ctx) { \
- if (ctx->sc_fpu_ctx == NULL) { \
- ctx->sc_fpu_ctx = fpu_kern_alloc_ctx(0); \
- } \
- critical_enter(); \
- fpu_kern_enter(curthread, ctx->sc_fpu_ctx, FPU_KERN_NORMAL); \
-}
-
-#define kfpu_end(ctx) { \
- MPASS(ctx->sc_fpu_ctx != NULL); \
- fpu_kern_leave(curthread, ctx->sc_fpu_ctx); \
- critical_exit(); \
-}
-#else
-#define kfpu_begin(ctx)
-#define kfpu_end(ctx)
-#define fpu_kern_free_ctx(p)
-#endif
-
-typedef enum {
- HAVE_NO_SIMD = 1 << 0,
- HAVE_FULL_SIMD = 1 << 1,
- HAVE_SIMD_IN_USE = 1 << 31
-} simd_context_state_t;
-
-typedef struct {
- simd_context_state_t sc_state;
- struct fpu_kern_ctx *sc_fpu_ctx;
-} simd_context_t;
-
-
-#define DONT_USE_SIMD NULL
-
-static __must_check inline bool
-may_use_simd(void)
-{
-#if defined(__amd64__)
- return true;
-#else
- return false;
-#endif
-}
-
-static inline void
-simd_get(simd_context_t *ctx)
-{
- ctx->sc_state = may_use_simd() ? HAVE_FULL_SIMD : HAVE_NO_SIMD;
-}
-
-static inline void
-simd_put(simd_context_t *ctx)
-{
-#if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
- if (is_fpu_kern_thread(0))
- return;
-#endif
- if (ctx->sc_state & HAVE_SIMD_IN_USE)
- kfpu_end(ctx);
- ctx->sc_state = HAVE_NO_SIMD;
-}
-
-static __must_check inline bool
-simd_use(simd_context_t *ctx)
-{
-#if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
- if (is_fpu_kern_thread(0))
- return true;
-#else
- return false;
-#endif
- if (ctx == NULL)
- return false;
- if (!(ctx->sc_state & HAVE_FULL_SIMD))
- return false;
- if (ctx->sc_state & HAVE_SIMD_IN_USE)
- return true;
- kfpu_begin(ctx);
- ctx->sc_state |= HAVE_SIMD_IN_USE;
- return true;
-}
-
-static inline bool
-simd_relax(simd_context_t *ctx)
-{
- if ((ctx->sc_state & HAVE_SIMD_IN_USE) && need_resched()) {
- simd_put(ctx);
- simd_get(ctx);
- return simd_use(ctx);
- }
- return false;
-}
-
-#define unlikely(x) __predict_false(x)
-#define likely(x) __predict_true(x)
-/* Generic path for arbitrary size */
-
-
-static inline unsigned long
-__crypto_memneq_generic(const void *a, const void *b, size_t size)
-{
- unsigned long neq = 0;
-
- while (size >= sizeof(unsigned long)) {
- neq |= *(const unsigned long *)a ^ *(const unsigned long *)b;
- __compiler_membar();
- a = ((const char *)a + sizeof(unsigned long));
- b = ((const char *)b + sizeof(unsigned long));
- size -= sizeof(unsigned long);
- }
- while (size > 0) {
- neq |= *(const unsigned char *)a ^ *(const unsigned char *)b;
- __compiler_membar();
- a = (const char *)a + 1;
- b = (const char *)b + 1;
- size -= 1;
- }
- return neq;
-}
-
-#define crypto_memneq(a, b, c) __crypto_memneq_generic((a), (b), (c))
-
-static inline void
-__cpu_to_le32s(uint32_t *buf)
-{
- *buf = htole32(*buf);
-}
-
-static inline void cpu_to_le32_array(u32 *buf, unsigned int words)
-{
- while (words--) {
- __cpu_to_le32s(buf);
- buf++;
- }
-}
-
-#define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 1
-void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int len);
-
-static inline void crypto_xor_cpy(u8 *dst, const u8 *src1, const u8 *src2,
- unsigned int size)
-{
- if (CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS &&
- __builtin_constant_p(size) &&
- (size % sizeof(unsigned long)) == 0) {
- unsigned long *d = (unsigned long *)dst;
- const unsigned long *s1 = (const unsigned long *)src1;
- const unsigned long *s2 = (const unsigned long *)src2;
-
- while (size > 0) {
- *d++ = *s1++ ^ *s2++;
- size -= sizeof(unsigned long);
- }
- } else {
- __crypto_xor(dst, src1, src2, size);
- }
-}
-#include <sys/kernel.h>
-#define module_init(fn) \
-static void \
-wrap_ ## fn(void *dummy __unused) \
-{ \
- fn(); \
-} \
-SYSINIT(if_wg_ ## fn, SI_SUB_LAST, SI_ORDER_FIRST, wrap_ ## fn, NULL)
-
-
-#define module_exit(fn) \
-static void \
-wrap_ ## fn(void *dummy __unused) \
-{ \
- fn(); \
-} \
-SYSUNINIT(if_wg_ ## fn, SI_SUB_LAST, SI_ORDER_FIRST, wrap_ ## fn, NULL)
-
-#define module_param(a, b, c)
-#define MODULE_LICENSE(x)
-#define MODULE_DESCRIPTION(x)
-#define MODULE_AUTHOR(x)
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-
-#define __initconst
-#define __initdata
-#define __init
-#define __exit
-#define BUG() panic("%s:%d bug hit!\n", __FILE__, __LINE__)
-
-#define WARN_ON(cond) ({ \
- bool __ret = (cond); \
- if (__ret) { \
- printf("WARNING %s failed at %s:%d\n", \
- __stringify(cond), __FILE__, __LINE__); \
- } \
- unlikely(__ret); \
-})
-
-#define pr_err printf
-#define pr_info printf
-#define IS_ENABLED(x) 0
-#define ___stringify(...) #__VA_ARGS__
-#define __stringify(...) ___stringify(__VA_ARGS__)
-#define kmalloc(size, flag) malloc((size), M_WG, M_WAITOK)
-#define kfree(p) free(p, M_WG)
-#define vzalloc(size) malloc((size), M_WG, M_WAITOK|M_ZERO)
-#define vfree(p) free(p, M_WG)
-#endif
-#endif
diff --git a/sys/dev/if_wg/include/sys/wg_cookie.h b/sys/dev/if_wg/include/sys/wg_cookie.h
deleted file mode 100644
index 0bac8fefaf42..000000000000
--- a/sys/dev/if_wg/include/sys/wg_cookie.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (C) 2015-2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- * Copyright (C) 2019-2020 Matt Dunwoodie <ncon@noconroy.net>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * ======== wg_cookie.h ========
- *
- * This file provides a thread safe interface to the WireGuard cookie
- * mechanism. It is split into three parts:
- *
- * * cookie_maker
- * Used to create MACs for messages.
- * * cookie_checker
- * Used to validate MACs for messages.
- * * cookie_macs
- * The MACs that authenticate the message.
- *
- * The MACs provide two properties:
- * * mac1 - That the remote end knows a value.
- * * mac2 - That the remote end has a specific IP address.
- *
- * void cookie_maker_init(cookie_maker, ipl, input)
- * - Initialise cookie_maker, should only be called once and before use.
- * input is the shared value used for mac1.
- *
- * int cookie_checker_init(cookie_checker, ipl)
- * - Initialise cookie_checker, should only be called once and before use. It
- * will return ENOBUFS if it cannot allocate required memory.
- *
- * void cookie_checker_update(cookie_checker, input)
- * - Set the input value to check mac1 against.
- *
- * void cookie_checker_deinit(cookie_checker)
- * - Destroy all values associated with cookie_checker. cookie_checker must
- * not be used after calling this function.
- *
- * void cookie_checker_create_payload(cookie_checker, cookie_macs, nonce,
- * payload, sockaddr)
- * - Create a specific payload derived from the sockaddr. The payload is an
- *   encrypted shared secret that the cookie_maker will decrypt and use to
- * key the mac2 value.
- *
- * int cookie_maker_consume_payload(cookie_maker, nonce, payload)
- * - Have cookie_maker consume the payload.
- *
- * void cookie_maker_mac(cookie_maker, cookie_macs, message, len)
- * - Create cookie_macs for the message of length len. It will always compute
- *   mac1; however, it will only compute mac2 if we have recently received a
- * payload to key it with.
- *
- * int cookie_checker_validate_macs(cookie_checker, cookie_macs, message, len,
- * busy, sockaddr)
- * - Use cookie_checker to validate the cookie_macs of message with length
- * len. If busy, then ratelimiting will be applied to the sockaddr.
- *
- * ==========================
- * $FreeBSD$
- */
-
-#ifndef __COOKIE_H__
-#define __COOKIE_H__
-
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/rwlock.h>
-#include <sys/queue.h>
-#include <sys/support.h>
-
-#include <netinet/in.h>
-
-#include <crypto/blake2s.h>
-
-#define COOKIE_MAC_SIZE 16
-#define COOKIE_KEY_SIZE 32
-#define COOKIE_XNONCE_SIZE 24
-#define COOKIE_COOKIE_SIZE 16
-#define COOKIE_SECRET_SIZE 32
-#define COOKIE_INPUT_SIZE 32
-#define COOKIE_ENCRYPTED_SIZE (COOKIE_COOKIE_SIZE + COOKIE_MAC_SIZE)
-
-#define COOKIE_MAC1_KEY_LABEL "mac1----"
-#define COOKIE_COOKIE_KEY_LABEL "cookie--"
-#define COOKIE_SECRET_MAX_AGE 120
-#define COOKIE_SECRET_LATENCY 5
-
-/* Constants for initiation rate limiting */
-#define RATELIMIT_SIZE (1 << 10)
-#define RATELIMIT_SIZE_MAX (RATELIMIT_SIZE * 8)
-#define NSEC_PER_SEC 1000000000LL
-#define INITIATIONS_PER_SECOND 50
-#define INITIATIONS_BURSTABLE 10
-#define INITIATION_COST (NSEC_PER_SEC / INITIATIONS_PER_SECOND)
-#define TOKEN_MAX (INITIATION_COST * INITIATIONS_BURSTABLE)
-#define ELEMENT_TIMEOUT 1
-#define IPV4_MASK_SIZE 4 /* Use all 4 bytes of IPv4 address */
-#define IPV6_MASK_SIZE 8 /* Use top 8 bytes (/64) of IPv6 address */
-
-struct cookie_macs {
- uint8_t mac1[COOKIE_MAC_SIZE];
- uint8_t mac2[COOKIE_MAC_SIZE];
-} __packed;
-
-struct ratelimit_entry {
- LIST_ENTRY(ratelimit_entry) r_entry;
- sa_family_t r_af;
- union {
- struct in_addr r_in;
- struct in6_addr r_in6;
- };
- struct timespec r_last_time; /* nanouptime */
- uint64_t r_tokens;
-};
-
-struct ratelimit {
- SIPHASH_KEY rl_secret;
- uma_zone_t rl_zone;
-
- struct rwlock rl_lock;
- LIST_HEAD(, ratelimit_entry) *rl_table;
- u_long rl_table_mask;
- size_t rl_table_num;
- struct timespec rl_last_gc; /* nanouptime */
-};
-
-struct cookie_maker {
- uint8_t cp_mac1_key[COOKIE_KEY_SIZE];
- uint8_t cp_cookie_key[COOKIE_KEY_SIZE];
-
- struct rwlock cp_lock;
- uint8_t cp_cookie[COOKIE_COOKIE_SIZE];
- struct timespec cp_birthdate; /* nanouptime */
- int cp_mac1_valid;
- uint8_t cp_mac1_last[COOKIE_MAC_SIZE];
-};
-
-struct cookie_checker {
- struct ratelimit cc_ratelimit;
-
- struct rwlock cc_key_lock;
- uint8_t cc_mac1_key[COOKIE_KEY_SIZE];
- uint8_t cc_cookie_key[COOKIE_KEY_SIZE];
-
- struct rwlock cc_secret_lock;
- struct timespec cc_secret_birthdate; /* nanouptime */
- uint8_t cc_secret[COOKIE_SECRET_SIZE];
-};
-
-void cookie_maker_init(struct cookie_maker *, const uint8_t[COOKIE_INPUT_SIZE]);
-int cookie_checker_init(struct cookie_checker *, uma_zone_t);
-void cookie_checker_update(struct cookie_checker *,
- uint8_t[COOKIE_INPUT_SIZE]);
-void cookie_checker_deinit(struct cookie_checker *);
-void cookie_checker_create_payload(struct cookie_checker *,
- struct cookie_macs *cm, uint8_t[COOKIE_XNONCE_SIZE],
- uint8_t [COOKIE_ENCRYPTED_SIZE], struct sockaddr *);
-int cookie_maker_consume_payload(struct cookie_maker *,
- uint8_t[COOKIE_XNONCE_SIZE], uint8_t[COOKIE_ENCRYPTED_SIZE]);
-void cookie_maker_mac(struct cookie_maker *, struct cookie_macs *,
- void *, size_t);
-int cookie_checker_validate_macs(struct cookie_checker *,
- struct cookie_macs *, void *, size_t, int, struct sockaddr *);
-
-#endif /* __COOKIE_H__ */
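For orientation, a minimal sketch of how the interface documented in the header
above fits together, assuming its declarations are in scope. The peer_input,
zone, handshake_msg, and src arguments are hypothetical placeholders; this code
is illustrative only and not part of the removed sources.

static void
cookie_mac_roundtrip(uint8_t peer_input[COOKIE_INPUT_SIZE], uma_zone_t zone,
    void *handshake_msg, size_t len, struct sockaddr *src)
{
	struct cookie_maker maker;
	struct cookie_checker checker;
	struct cookie_macs macs;

	/* Initiator side: derive the mac1 key and stamp the message. */
	cookie_maker_init(&maker, peer_input);
	cookie_maker_mac(&maker, &macs, handshake_msg, len);

	/* Responder side: validate mac1 (and mac2/ratelimiting when busy). */
	if (cookie_checker_init(&checker, zone) != 0)
		return;
	cookie_checker_update(&checker, peer_input);
	(void)cookie_checker_validate_macs(&checker, &macs, handshake_msg,
	    len, 0 /* not busy */, src);
	cookie_checker_deinit(&checker);
}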
diff --git a/sys/dev/if_wg/include/sys/wg_module.h b/sys/dev/if_wg/include/sys/wg_module.h
deleted file mode 100644
index cc662104d640..000000000000
--- a/sys/dev/if_wg/include/sys/wg_module.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate)
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-#ifndef MODULE_H_
-#define MODULE_H_
-
-#include <sys/mbuf.h>
-#include <sys/socket.h>
-#include <net/if.h>
-#include <net/if_var.h>
-#include <sys/support.h>
-
-
-#include <sys/types.h>
-#include <sys/epoch.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-
-
-
-#include <crypto/curve25519.h>
-#include <zinc/chacha20poly1305.h>
-#include <crypto/blake2s.h>
-
-
-enum noise_lengths {
- NOISE_PUBLIC_KEY_LEN = CURVE25519_KEY_SIZE,
- NOISE_SYMMETRIC_KEY_LEN = CHACHA20POLY1305_KEY_SIZE,
- NOISE_TIMESTAMP_LEN = sizeof(uint64_t) + sizeof(uint32_t),
- NOISE_AUTHTAG_LEN = CHACHA20POLY1305_AUTHTAG_SIZE,
- NOISE_HASH_LEN = BLAKE2S_HASH_SIZE
-};
-
-#define noise_encrypted_len(plain_len) ((plain_len) + NOISE_AUTHTAG_LEN)
-
-enum cookie_values {
- COOKIE_SECRET_MAX_AGE = 2 * 60,
- COOKIE_SECRET_LATENCY = 5,
- COOKIE_NONCE_LEN = XCHACHA20POLY1305_NONCE_SIZE,
- COOKIE_LEN = 16
-};
-
-enum limits {
- REKEY_TIMEOUT = 5,
- INITIATIONS_PER_SECOND = 50,
- MAX_PEERS_PER_DEVICE = 1U << 20,
- KEEPALIVE_TIMEOUT = 10,
- MAX_TIMER_HANDSHAKES = 90 / REKEY_TIMEOUT,
- MAX_QUEUED_INCOMING_HANDSHAKES = 4096, /* TODO: replace this with DQL */
- MAX_STAGED_PACKETS = 128,
- MAX_QUEUED_PACKETS = 1024 /* TODO: replace this with DQL */
-};
-
-#define zfree(addr, type) \
- do { \
- explicit_bzero(addr, sizeof(*addr)); \
- free(addr, type); \
- } while (0)
-
-struct crypt_queue {
- union {
- struct {
- int last_cpu;
- };
- };
-};
-
-#define __ATOMIC_LOAD_SIZE \
- ({ \
- switch (size) { \
- case 1: *(uint8_t *)res = *(volatile uint8_t *)p; break; \
- case 2: *(uint16_t *)res = *(volatile uint16_t *)p; break; \
- case 4: *(uint32_t *)res = *(volatile uint32_t *)p; break; \
- case 8: *(uint64_t *)res = *(volatile uint64_t *)p; break; \
- } \
-})
-
-static inline void
-__atomic_load_acq_size(volatile void *p, void *res, int size)
-{
- __ATOMIC_LOAD_SIZE;
-}
-
-#define atomic_load_acq(x) \
- ({ \
- union { __typeof(x) __val; char __c[1]; } __u; \
- __atomic_load_acq_size(&(x), __u.__c, sizeof(x)); \
- __u.__val; \
-})
-
-
-int wg_ctx_init(void);
-void wg_ctx_uninit(void);
-
-
-#endif
diff --git a/sys/dev/if_wg/include/sys/wg_noise.h b/sys/dev/if_wg/include/sys/wg_noise.h
deleted file mode 100644
index 40bdab515bc7..000000000000
--- a/sys/dev/if_wg/include/sys/wg_noise.h
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * Copyright (C) 2015-2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- * Copyright (C) 2019-2020 Matt Dunwoodie <ncon@noconroy.net>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * ======== wg_noise.h ========
- *
- * This file provides a thread safe interface to the Noise protocol as used in
- * WireGuard. The three user facing components are:
- *
- * * noise_local
- * Stores the local state for a noise peer.
- * * noise_remote
- * Stores the remote state for a noise peer.
- * * noise_upcall
- * Stores callback routines for index and peers
- *
- * Additionally, a noise_counter, which is invisible to the user, is used to
- * track message nonces to prevent message replay.
- *
- * This module uses Curve25519 for asymmetric crypto, and ChaCha20Poly1305 for
- * symmetric crypto. The handshake uses ephemeral keys, which provide perfect
- * forward secrecy. Keys are NOISE_KEY_SIZE (32) bytes long and can be
- * generated with a CSRNG. While this module will clamp the key to form a valid
- * Curve25519 key, it is recommended that keys are stored in Curve25519 form to
- * preserve interoperability with other systems. Additionally, there is an
- * optional PresharedKey of length NOISE_PSK_SIZE (also 32 bytes), which when
- * used, will provide protection against known quantum attacks. Without it,
- * Curve25519 is broken by Shor's algorithm.
- *
- * -------- noise_local --------
- *
- * void noise_local_init(noise_local *, noise_upcall *)
- * - Initialise noise_local, should only be called once and before use.
- *
- * int noise_local_set_private(noise_local *, uint8_t *private)
- * - Set the local private key. This will also calculate the corresponding
- * public key.
- *
- * int noise_local_keys(noise_local *, uint8_t *public, uint8_t *private)
- * - Get the local keys. It will ensure that a key has been set and if
- * not, will return ENXIO.
- *
- * -------- noise_remote --------
- *
- * void noise_remote_init(noise_remote *, uint8_t *public)
- * - Initialise noise_remote, should only be called once and before use. Key
- * must be provided and it cannot be changed once set.
- *
- * void noise_remote_set_psk(noise_remote *, uint8_t *psk)
- * - Set the shared key. To remove the shared key, set a key of all 0x00.
- *
- * void noise_remote_keys(noise_remote *, uint8_t *public, uint8_t *psk)
- * - Get the remote keys.
- *
- * -------- noise_upcall --------
- *
- * The noise_upcall struct is used to lookup incoming public keys, as well as
- * allocate and deallocate index for a remote. The allocation and deallocation
- * are serialised per noise_remote and guaranteed to only have 3 allocated
- * indexes at once.
- *
- * u_arg - passed to callback functions as void *
- * u_get_remote - lookup noise_remote based on public key.
- * u_set_index - allocate index for noise_remote. Any further packets that
- * arrive with this index should be passed to noise_* functions
- * with the corresponding noise_remote.
- * u_drop_index - deallocate index passed to callback.
- *
- * -------- crypto --------
- *
- * The following functions are used for the crypto side of things:
- *
- * int noise_create_initiation(noise_remote *, noise_initiation *)
- * int noise_consume_initiation(noise_local *, noise_remote **, noise_initiation *)
- * int noise_create_response(noise_remote *, noise_response *)
- * int noise_consume_response(noise_remote *, noise_response *)
- *
- * int noise_remote_promote(noise_remote *)
- * void noise_remote_clear(noise_remote *)
- * void noise_remote_expire_current(noise_remote *)
- * int noise_remote_encrypt(noise_remote *, noise_data *, size_t)
- * int noise_remote_decrypt(noise_remote *, noise_data *, size_t)
- *
- * $FreeBSD$
- */
-
-#ifndef __NOISE_H__
-#define __NOISE_H__
-
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/rwlock.h>
-#include <sys/support.h>
-
-#include <crypto/blake2s.h>
-#include <zinc/chacha20poly1305.h>
-#include <crypto/curve25519.h>
-
-#define NOISE_KEY_SIZE CURVE25519_KEY_SIZE
-#define NOISE_PSK_SIZE 32
-#define NOISE_MAC_SIZE CHACHA20POLY1305_AUTHTAG_SIZE
-#define NOISE_HASH_SIZE BLAKE2S_HASH_SIZE
-#define NOISE_SYMMETRIC_SIZE CHACHA20POLY1305_KEY_SIZE
-#define NOISE_TIMESTAMP_SIZE 12
-
-/* Protocol string constants */
-#define NOISE_HANDSHAKE_NAME "Noise_IKpsk2_25519_ChaChaPoly_BLAKE2s"
-#define NOISE_IDENTIFIER_NAME "WireGuard v1 zx2c4 Jason@zx2c4.com"
-
-/* Constants for the counter */
-#define COUNTER_TYPE size_t
-#define COUNTER_BITS_TOTAL 512
-#define COUNTER_TYPE_BITS (sizeof(COUNTER_TYPE) * 8)
-#define COUNTER_TYPE_NUM (COUNTER_BITS_TOTAL / COUNTER_TYPE_BITS)
-#define COUNTER_WINDOW_SIZE (COUNTER_BITS_TOTAL - COUNTER_TYPE_BITS)
-
-/* Constants for the keypair */
-#define REKEY_AFTER_MESSAGES (1ull << 60)
-#define REJECT_AFTER_MESSAGES (UINT64_MAX - COUNTER_WINDOW_SIZE - 1)
-#define REKEY_AFTER_TIME 120
-#define REKEY_AFTER_TIME_RECV 165
-#define REJECT_AFTER_TIME 180
-#define REJECT_INTERVAL (1000000000 / 50) /* fifty times per sec */
-/* 24 = floor(log2(REJECT_INTERVAL)) */
-#define REJECT_INTERVAL_MASK (~((1ull<<24)-1))
-
-enum noise_state_hs {
- HS_ZEROED = 0,
- CREATED_INITIATION,
- CONSUMED_INITIATION,
- CREATED_RESPONSE,
- CONSUMED_RESPONSE,
-};
-
-struct noise_handshake {
- enum noise_state_hs hs_state;
- uint32_t hs_local_index;
- uint32_t hs_remote_index;
- uint8_t hs_e[NOISE_KEY_SIZE];
- uint8_t hs_hash[NOISE_HASH_SIZE];
- uint8_t hs_ck[NOISE_HASH_SIZE];
-};
-
-struct noise_counter {
- struct rwlock c_lock;
- uint64_t c_send;
- uint64_t c_recv;
- COUNTER_TYPE c_backtrack[COUNTER_TYPE_NUM];
-};
-
-enum noise_state_kp {
- KP_ZEROED = 0,
- INITIATOR,
- RESPONDER,
-};
-
-struct noise_keypair {
- SLIST_ENTRY(noise_keypair) kp_entry;
- int kp_valid;
- int kp_is_initiator;
- uint32_t kp_local_index;
- uint32_t kp_remote_index;
- uint8_t kp_send[NOISE_SYMMETRIC_SIZE];
- uint8_t kp_recv[NOISE_SYMMETRIC_SIZE];
- struct timespec kp_birthdate; /* nanouptime */
- struct noise_counter kp_ctr;
-};
-
-struct noise_remote {
- uint8_t r_public[NOISE_KEY_SIZE];
- struct noise_local *r_local;
- uint8_t r_ss[NOISE_KEY_SIZE];
-
- struct rwlock r_handshake_lock;
- struct noise_handshake r_handshake;
- uint8_t r_psk[NOISE_PSK_SIZE];
- uint8_t r_timestamp[NOISE_TIMESTAMP_SIZE];
- struct timespec r_last_init; /* nanouptime */
-
- struct rwlock r_keypair_lock;
- SLIST_HEAD(,noise_keypair) r_unused_keypairs;
- struct noise_keypair *r_next, *r_current, *r_previous;
- struct noise_keypair r_keypair[3]; /* 3: next, current, previous. */
-
-};
-
-struct noise_local {
- struct rwlock l_identity_lock;
- int l_has_identity;
- uint8_t l_public[NOISE_KEY_SIZE];
- uint8_t l_private[NOISE_KEY_SIZE];
-
- struct noise_upcall {
- void *u_arg;
- struct noise_remote *
- (*u_remote_get)(void *, uint8_t[NOISE_KEY_SIZE]);
- uint32_t
- (*u_index_set)(void *, struct noise_remote *);
- void (*u_index_drop)(void *, uint32_t);
- } l_upcall;
-};
-
-struct noise_initiation {
- uint32_t s_idx;
- uint8_t ue[NOISE_KEY_SIZE];
- uint8_t es[NOISE_KEY_SIZE + NOISE_MAC_SIZE];
- uint8_t ets[NOISE_TIMESTAMP_SIZE + NOISE_MAC_SIZE];
-} __packed;
-
-struct noise_response {
- uint32_t s_idx;
- uint32_t r_idx;
- uint8_t ue[NOISE_KEY_SIZE];
- uint8_t en[0 + NOISE_MAC_SIZE];
-} __packed;
-
-struct noise_data {
- uint32_t r_idx;
- uint64_t nonce;
- uint8_t buf[];
-} __packed;
-
-
-/* Set/Get noise parameters */
-void noise_local_init(struct noise_local *, struct noise_upcall *);
-void noise_local_lock_identity(struct noise_local *);
-void noise_local_unlock_identity(struct noise_local *);
-int noise_local_set_private(struct noise_local *, uint8_t[NOISE_KEY_SIZE]);
-int noise_local_keys(struct noise_local *, uint8_t[NOISE_KEY_SIZE],
- uint8_t[NOISE_KEY_SIZE]);
-
-void noise_remote_init(struct noise_remote *, const uint8_t[NOISE_KEY_SIZE],
- struct noise_local *);
-int noise_remote_set_psk(struct noise_remote *, const uint8_t[NOISE_PSK_SIZE]);
-int noise_remote_keys(struct noise_remote *, uint8_t[NOISE_KEY_SIZE],
- uint8_t[NOISE_PSK_SIZE]);
-
-/* Should be called anytime noise_local_set_private is called */
-void noise_remote_precompute(struct noise_remote *);
-
-/* Cryptographic functions */
-int noise_create_initiation(
- struct noise_remote *,
- struct noise_initiation *);
-
-int noise_consume_initiation(
- struct noise_local *,
- struct noise_remote **,
- struct noise_initiation *);
-
-int noise_create_response(
- struct noise_remote *,
- struct noise_response *);
-
-int noise_consume_response(
- struct noise_remote *,
- struct noise_response *);
-
- int noise_remote_begin_session(struct noise_remote *);
-void noise_remote_clear(struct noise_remote *);
-void noise_remote_expire_current(struct noise_remote *);
-
-int noise_remote_ready(struct noise_remote *);
-
-int noise_remote_encrypt(
- struct noise_remote *,
- struct noise_data *,
- size_t);
-int noise_remote_decrypt(
- struct noise_remote *,
- struct noise_data *,
- size_t);
-
-#endif /* __NOISE_H__ */
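As a companion to the interface comment above, a minimal setup sketch using the
declarations from this header. The upcall table and key buffers are hypothetical
caller-provided placeholders; this code is illustrative only and not part of the
removed sources.

static int
noise_pair_setup(struct noise_local *l, struct noise_remote *r,
    struct noise_upcall *upcall, uint8_t my_private[NOISE_KEY_SIZE],
    const uint8_t their_public[NOISE_KEY_SIZE])
{
	struct noise_initiation init;
	int error;

	noise_local_init(l, upcall);
	if ((error = noise_local_set_private(l, my_private)) != 0)
		return (error);

	noise_remote_init(r, their_public, l);
	/* Required whenever the local private key changes. */
	noise_remote_precompute(r);

	/* Build the first handshake message toward this peer. */
	return (noise_create_initiation(r, &init));
}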
diff --git a/sys/dev/if_wg/include/zinc/blake2s.h b/sys/dev/if_wg/include/zinc/blake2s.h
deleted file mode 100644
index e87bfdbc9f6d..000000000000
--- a/sys/dev/if_wg/include/zinc/blake2s.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#ifndef _ZINC_BLAKE2S_H
-#define _ZINC_BLAKE2S_H
-
-#include <sys/types.h>
-
-enum blake2s_lengths {
- BLAKE2S_BLOCK_SIZE = 64,
- BLAKE2S_HASH_SIZE = 32,
- BLAKE2S_KEY_SIZE = 32
-};
-
-struct blake2s_state {
- uint32_t h[8];
- uint32_t t[2];
- uint32_t f[2];
- uint8_t buf[BLAKE2S_BLOCK_SIZE];
- unsigned int buflen;
- unsigned int outlen;
-};
-
-void blake2s_init(struct blake2s_state *state, const size_t outlen);
-void blake2s_init_key(struct blake2s_state *state, const size_t outlen,
- const void *key, const size_t keylen);
-void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen);
-//void blake2s_final(struct blake2s_state *state, uint8_t *out);
-
-static inline void blake2s(uint8_t *out, const uint8_t *in, const uint8_t *key,
- const size_t outlen, const size_t inlen,
- const size_t keylen)
-{
- struct blake2s_state state;
-
- if (keylen)
- blake2s_init_key(&state, outlen, key, keylen);
- else
- blake2s_init(&state, outlen);
-
- blake2s_update(&state, in, inlen);
- blake2s_final(&state, out);
-}
-
-void blake2s_hmac(uint8_t *out, const uint8_t *in, const uint8_t *key, const size_t outlen,
- const size_t inlen, const size_t keylen);
-
-#endif /* _ZINC_BLAKE2S_H */
diff --git a/sys/dev/if_wg/include/zinc/chacha20.h b/sys/dev/if_wg/include/zinc/chacha20.h
deleted file mode 100644
index 1a9524bdfe85..000000000000
--- a/sys/dev/if_wg/include/zinc/chacha20.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#ifndef _ZINC_CHACHA20_H
-#define _ZINC_CHACHA20_H
-
-#include <sys/param.h>
-#include <sys/support.h>
-
-enum chacha20_lengths {
- CHACHA20_NONCE_SIZE = 16,
- CHACHA20_KEY_SIZE = 32,
- CHACHA20_KEY_WORDS = CHACHA20_KEY_SIZE / sizeof(u32),
- CHACHA20_BLOCK_SIZE = 64,
- CHACHA20_BLOCK_WORDS = CHACHA20_BLOCK_SIZE / sizeof(u32),
- HCHACHA20_NONCE_SIZE = CHACHA20_NONCE_SIZE,
- HCHACHA20_KEY_SIZE = CHACHA20_KEY_SIZE
-};
-
-enum chacha20_constants { /* expand 32-byte k */
- CHACHA20_CONSTANT_EXPA = 0x61707865U,
- CHACHA20_CONSTANT_ND_3 = 0x3320646eU,
- CHACHA20_CONSTANT_2_BY = 0x79622d32U,
- CHACHA20_CONSTANT_TE_K = 0x6b206574U
-};
-
-struct chacha20_ctx {
- union {
- u32 state[16];
- struct {
- u32 constant[4];
- u32 key[8];
- u32 counter[4];
- };
- };
-};
-
-static inline void chacha20_init(struct chacha20_ctx *ctx,
- const u8 key[CHACHA20_KEY_SIZE],
- const u64 nonce)
-{
- ctx->constant[0] = CHACHA20_CONSTANT_EXPA;
- ctx->constant[1] = CHACHA20_CONSTANT_ND_3;
- ctx->constant[2] = CHACHA20_CONSTANT_2_BY;
- ctx->constant[3] = CHACHA20_CONSTANT_TE_K;
- ctx->key[0] = get_unaligned_le32(key + 0);
- ctx->key[1] = get_unaligned_le32(key + 4);
- ctx->key[2] = get_unaligned_le32(key + 8);
- ctx->key[3] = get_unaligned_le32(key + 12);
- ctx->key[4] = get_unaligned_le32(key + 16);
- ctx->key[5] = get_unaligned_le32(key + 20);
- ctx->key[6] = get_unaligned_le32(key + 24);
- ctx->key[7] = get_unaligned_le32(key + 28);
- ctx->counter[0] = 0;
- ctx->counter[1] = 0;
- ctx->counter[2] = nonce & U32_MAX;
- ctx->counter[3] = nonce >> 32;
-}
-void chacha20(struct chacha20_ctx *ctx, u8 *dst, const u8 *src, u32 len,
- simd_context_t *simd_context);
-
-void hchacha20(u32 derived_key[CHACHA20_KEY_WORDS],
- const u8 nonce[HCHACHA20_NONCE_SIZE],
- const u8 key[HCHACHA20_KEY_SIZE], simd_context_t *simd_context);
-
-#endif /* _ZINC_CHACHA20_H */
diff --git a/sys/dev/if_wg/include/zinc/chacha20poly1305.h b/sys/dev/if_wg/include/zinc/chacha20poly1305.h
deleted file mode 100644
index 2d18b0fc3e82..000000000000
--- a/sys/dev/if_wg/include/zinc/chacha20poly1305.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#ifndef _ZINC_CHACHA20POLY1305_H
-#define _ZINC_CHACHA20POLY1305_H
-
-#include <sys/types.h>
-
-struct scatterlist;
-
-enum chacha20poly1305_lengths {
- XCHACHA20POLY1305_NONCE_SIZE = 24,
- CHACHA20POLY1305_KEY_SIZE = 32,
- CHACHA20POLY1305_AUTHTAG_SIZE = 16
-};
-
-void chacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src, const size_t src_len,
- const uint8_t *ad, const size_t ad_len,
- const uint64_t nonce,
- const uint8_t key[CHACHA20POLY1305_KEY_SIZE]);
-
-bool chacha20poly1305_encrypt_sg_inplace(
- struct scatterlist *src, const size_t src_len, const uint8_t *ad,
- const size_t ad_len, const uint64_t nonce,
- const uint8_t key[CHACHA20POLY1305_KEY_SIZE], simd_context_t *simd_context);
-
-bool chacha20poly1305_decrypt(uint8_t *dst, const uint8_t *src, const size_t src_len,
- const uint8_t *ad, const size_t ad_len, const uint64_t nonce,
- const uint8_t key[CHACHA20POLY1305_KEY_SIZE]);
-
-bool chacha20poly1305_decrypt_sg_inplace(
- struct scatterlist *src, size_t src_len, const uint8_t *ad,
- const size_t ad_len, const uint64_t nonce,
- const uint8_t key[CHACHA20POLY1305_KEY_SIZE], simd_context_t *simd_context);
-
-void xchacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src, const size_t src_len,
- const uint8_t *ad, const size_t ad_len,
- const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE],
- const uint8_t key[CHACHA20POLY1305_KEY_SIZE]);
-
-bool xchacha20poly1305_decrypt(
- uint8_t *dst, const uint8_t *src, const size_t src_len, const uint8_t *ad,
- const size_t ad_len, const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE],
- const uint8_t key[CHACHA20POLY1305_KEY_SIZE]);
-
-#endif /* _ZINC_CHACHA20POLY1305_H */
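A small usage sketch for the single-shot AEAD declared above; the buffers are
hypothetical, and this code is illustrative only, not part of the removed
sources.

static void
seal_example(uint8_t *dst, const uint8_t *plain, size_t plain_len,
    uint64_t nonce, const uint8_t key[CHACHA20POLY1305_KEY_SIZE])
{
	/* dst must hold plain_len + CHACHA20POLY1305_AUTHTAG_SIZE bytes. */
	chacha20poly1305_encrypt(dst, plain, plain_len, NULL, 0, nonce, key);
}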
diff --git a/sys/dev/if_wg/include/zinc/curve25519.h b/sys/dev/if_wg/include/zinc/curve25519.h
deleted file mode 100644
index aa32359462da..000000000000
--- a/sys/dev/if_wg/include/zinc/curve25519.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#ifndef _ZINC_CURVE25519_H
-#define _ZINC_CURVE25519_H
-
-#include <sys/types.h>
-
-enum curve25519_lengths {
- CURVE25519_KEY_SIZE = 32
-};
-
-bool curve25519(uint8_t mypublic[CURVE25519_KEY_SIZE],
- const uint8_t secret[CURVE25519_KEY_SIZE],
- const uint8_t basepoint[CURVE25519_KEY_SIZE]);
-void curve25519_generate_secret(uint8_t secret[CURVE25519_KEY_SIZE]);
-bool curve25519_generate_public(
- uint8_t pub[CURVE25519_KEY_SIZE], const uint8_t secret[CURVE25519_KEY_SIZE]);
-
-static inline void curve25519_clamp_secret(uint8_t secret[CURVE25519_KEY_SIZE])
-{
- secret[0] &= 248;
- secret[31] = (secret[31] & 127) | 64;
-}
-
-#endif /* _ZINC_CURVE25519_H */
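A keypair-generation sketch using the declarations above; illustrative only,
not part of the removed sources.

static bool
keypair_example(uint8_t pub[CURVE25519_KEY_SIZE],
    uint8_t secret[CURVE25519_KEY_SIZE])
{
	curve25519_generate_secret(secret);	/* fresh random secret */
	return (curve25519_generate_public(pub, secret));
}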
diff --git a/sys/dev/if_wg/include/zinc/poly1305.h b/sys/dev/if_wg/include/zinc/poly1305.h
deleted file mode 100644
index ca4cc60b41b3..000000000000
--- a/sys/dev/if_wg/include/zinc/poly1305.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#ifndef _ZINC_POLY1305_H
-#define _ZINC_POLY1305_H
-
-
-enum poly1305_lengths {
- POLY1305_BLOCK_SIZE = 16,
- POLY1305_KEY_SIZE = 32,
- POLY1305_MAC_SIZE = 16
-};
-
-struct poly1305_ctx {
- u8 opaque[24 * sizeof(u64)];
- u32 nonce[4];
- u8 data[POLY1305_BLOCK_SIZE];
- size_t num;
-} __aligned(8);
-
-void poly1305_init(struct poly1305_ctx *ctx, const u8 key[POLY1305_KEY_SIZE]);
-void poly1305_update(struct poly1305_ctx *ctx, const u8 *input, size_t len,
- simd_context_t *simd_context);
-void poly1305_final(struct poly1305_ctx *ctx, u8 mac[POLY1305_MAC_SIZE],
- simd_context_t *simd_context);
-
-#endif /* _ZINC_POLY1305_H */
diff --git a/sys/dev/if_wg/module/blake2s.c b/sys/dev/if_wg/module/blake2s.c
deleted file mode 100644
index a362a6b350f1..000000000000
--- a/sys/dev/if_wg/module/blake2s.c
+++ /dev/null
@@ -1,256 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2012 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * This is an implementation of the BLAKE2s hash and PRF functions.
- *
- * Information: https://blake2.net/
- *
- */
-
-#include <sys/types.h>
-#include <sys/systm.h>
-#include <sys/endian.h>
-
-#include <crypto/blake2s.h>
-
-static inline uint32_t
-ror32(uint32_t word, unsigned int shift)
-{
- return (word >> shift) | (word << (32 - shift));
-}
-
-typedef union {
- struct {
- uint8_t digest_length;
- uint8_t key_length;
- uint8_t fanout;
- uint8_t depth;
- uint32_t leaf_length;
- uint32_t node_offset;
- uint16_t xof_length;
- uint8_t node_depth;
- uint8_t inner_length;
- uint8_t salt[8];
- uint8_t personal[8];
- };
- uint32_t words[8];
-} __packed blake2s_param;
-
-static const uint32_t blake2s_iv[8] = {
- 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
- 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
-};
-
-static const uint8_t blake2s_sigma[10][16] = {
- { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
- { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
- { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
- { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
- { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
- { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
- { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
- { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
- { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
- { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
-};
-
-static inline void blake2s_set_lastblock(struct blake2s_state *state)
-{
- if (state->last_node)
- state->f[1] = -1;
- state->f[0] = -1;
-}
-
-static inline void blake2s_increment_counter(struct blake2s_state *state,
- const uint32_t inc)
-{
- state->t[0] += inc;
- state->t[1] += (state->t[0] < inc);
-}
-
-static inline void blake2s_init_param(struct blake2s_state *state,
- const blake2s_param *param)
-{
- int i;
-
- memset(state, 0, sizeof(*state));
- for (i = 0; i < 8; ++i)
- state->h[i] = blake2s_iv[i] ^ le32toh(param->words[i]);
-}
-
-void blake2s_init(struct blake2s_state *state, const size_t outlen)
-{
- blake2s_param param __aligned(__alignof__(uint32_t)) = {
- .digest_length = outlen,
- .fanout = 1,
- .depth = 1
- };
-
- /*WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2S_HASH_SIZE));*/
- blake2s_init_param(state, &param);
-}
-
-void blake2s_init_key(struct blake2s_state *state, const size_t outlen,
- const void *key, const size_t keylen)
-{
- blake2s_param param = { .digest_length = outlen,
- .key_length = keylen,
- .fanout = 1,
- .depth = 1 };
- uint8_t block[BLAKE2S_BLOCK_SIZE] = { 0 };
-
- /*WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2S_HASH_SIZE ||
- !key || !keylen || keylen > BLAKE2S_KEY_SIZE));*/
- blake2s_init_param(state, &param);
- memcpy(block, key, keylen);
- blake2s_update(state, block, BLAKE2S_BLOCK_SIZE);
- explicit_bzero(block, BLAKE2S_BLOCK_SIZE);
-}
-
-static inline void blake2s_compress(struct blake2s_state *state,
- const uint8_t *block, size_t nblocks,
- const uint32_t inc)
-{
- uint32_t m[16];
- uint32_t v[16];
- int i;
-
- /*WARN_ON(IS_ENABLED(DEBUG) &&
- (nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));*/
-
- while (nblocks > 0) {
- blake2s_increment_counter(state, inc);
- memcpy(m, block, BLAKE2S_BLOCK_SIZE);
- for(i = 0; i < (sizeof(m)/sizeof(m[0])); i++)
- (m[i]) = le32toh((m[i]));
- memcpy(v, state->h, 32);
- v[ 8] = blake2s_iv[0];
- v[ 9] = blake2s_iv[1];
- v[10] = blake2s_iv[2];
- v[11] = blake2s_iv[3];
- v[12] = blake2s_iv[4] ^ state->t[0];
- v[13] = blake2s_iv[5] ^ state->t[1];
- v[14] = blake2s_iv[6] ^ state->f[0];
- v[15] = blake2s_iv[7] ^ state->f[1];
-
-#define G(r, i, a, b, c, d) do { \
- a += b + m[blake2s_sigma[r][2 * i + 0]]; \
- d = ror32(d ^ a, 16); \
- c += d; \
- b = ror32(b ^ c, 12); \
- a += b + m[blake2s_sigma[r][2 * i + 1]]; \
- d = ror32(d ^ a, 8); \
- c += d; \
- b = ror32(b ^ c, 7); \
-} while (0)
-
-#define ROUND(r) do { \
- G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
- G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
- G(r, 2, v[2], v[ 6], v[10], v[14]); \
- G(r, 3, v[3], v[ 7], v[11], v[15]); \
- G(r, 4, v[0], v[ 5], v[10], v[15]); \
- G(r, 5, v[1], v[ 6], v[11], v[12]); \
- G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
- G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
-} while (0)
- ROUND(0);
- ROUND(1);
- ROUND(2);
- ROUND(3);
- ROUND(4);
- ROUND(5);
- ROUND(6);
- ROUND(7);
- ROUND(8);
- ROUND(9);
-
-#undef G
-#undef ROUND
-
- for (i = 0; i < 8; ++i)
- state->h[i] ^= v[i] ^ v[i + 8];
-
- block += BLAKE2S_BLOCK_SIZE;
- --nblocks;
- }
-}
-
-void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen)
-{
- const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
-
- if (!inlen)
- return;
- if (inlen > fill) {
- memcpy(state->buf + state->buflen, in, fill);
- blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
- state->buflen = 0;
- in += fill;
- inlen -= fill;
- }
- if (inlen > BLAKE2S_BLOCK_SIZE) {
- const size_t nblocks =
- (inlen + BLAKE2S_BLOCK_SIZE - 1) / BLAKE2S_BLOCK_SIZE;
- /* Hash one less (full) block than strictly possible */
- blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
- in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
- inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
- }
- memcpy(state->buf + state->buflen, in, inlen);
- state->buflen += inlen;
-}
-
-void blake2s_final(struct blake2s_state *state, uint8_t *out, const size_t outlen)
-{
- int i;
- /*WARN_ON(IS_ENABLED(DEBUG) &&
- (!out || !outlen || outlen > BLAKE2S_HASH_SIZE));*/
- blake2s_set_lastblock(state);
- memset(state->buf + state->buflen, 0,
- BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
- blake2s_compress(state, state->buf, 1, state->buflen);
- for(i = 0; i < (sizeof(state->h)/sizeof(state->h[0])); i++)
- (state->h[i]) = htole32((state->h[i]));
-
- memcpy(out, state->h, outlen);
- explicit_bzero(state, sizeof(*state));
-}
-
-void blake2s_hmac(uint8_t *out, const uint8_t *in, const uint8_t *key, const size_t outlen,
- const size_t inlen, const size_t keylen)
-{
- struct blake2s_state state;
- uint8_t x_key[BLAKE2S_BLOCK_SIZE] __aligned(__alignof__(uint32_t)) = { 0 };
- uint8_t i_hash[BLAKE2S_HASH_SIZE] __aligned(__alignof__(uint32_t));
- int i;
-
- if (keylen > BLAKE2S_BLOCK_SIZE) {
- blake2s_init(&state, BLAKE2S_HASH_SIZE);
- blake2s_update(&state, key, keylen);
- blake2s_final(&state, x_key, BLAKE2S_HASH_SIZE);
- } else
- memcpy(x_key, key, keylen);
-
- for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
- x_key[i] ^= 0x36;
-
- blake2s_init(&state, BLAKE2S_HASH_SIZE);
- blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
- blake2s_update(&state, in, inlen);
- blake2s_final(&state, i_hash, BLAKE2S_HASH_SIZE);
-
- for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
- x_key[i] ^= 0x5c ^ 0x36;
-
- blake2s_init(&state, BLAKE2S_HASH_SIZE);
- blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
- blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE);
- blake2s_final(&state, i_hash, BLAKE2S_HASH_SIZE);
-
- memcpy(out, i_hash, outlen);
- explicit_bzero(x_key, BLAKE2S_BLOCK_SIZE);
- explicit_bzero(i_hash, BLAKE2S_HASH_SIZE);
-}
diff --git a/sys/dev/if_wg/module/blake2s.h b/sys/dev/if_wg/module/blake2s.h
deleted file mode 100644
index 865de953fb25..000000000000
--- a/sys/dev/if_wg/module/blake2s.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <sys/types.h>
-
-#ifndef _BLAKE2S_H_
-#define _BLAKE2S_H_
-
-/*#define WARN_ON(a) if(a) printf("%s failed at %s:%d\n", #a, __FILE__, __LINE__)
-#define IS_ENABLED(...) true*/
-
-
-enum blake2s_lengths {
- BLAKE2S_BLOCK_SIZE = 64,
- BLAKE2S_HASH_SIZE = 32,
- BLAKE2S_KEY_SIZE = 32
-};
-
-struct blake2s_state {
- uint32_t h[8];
- uint32_t t[2];
- uint32_t f[2];
- uint8_t buf[BLAKE2S_BLOCK_SIZE];
- size_t buflen;
- uint8_t last_node;
-};
-
-void blake2s_init(struct blake2s_state *state, const size_t outlen);
-void blake2s_init_key(struct blake2s_state *state, const size_t outlen,
- const void *key, const size_t keylen);
-void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen);
-void blake2s_final(struct blake2s_state *state, uint8_t *out, const size_t outlen);
-
-static inline void blake2s(uint8_t *out, const uint8_t *in, const uint8_t *key,
- const size_t outlen, const size_t inlen,
- const size_t keylen)
-{
- struct blake2s_state state;
-
- /*WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen ||
- outlen > BLAKE2S_HASH_SIZE || keylen > BLAKE2S_KEY_SIZE ||
- (!key && keylen)));*/
-
- if (keylen)
- blake2s_init_key(&state, outlen, key, keylen);
- else
- blake2s_init(&state, outlen);
-
- blake2s_update(&state, in, inlen);
- blake2s_final(&state, out, outlen);
-}
-
-void blake2s_hmac(uint8_t *out, const uint8_t *in, const uint8_t *key,
- const size_t outlen, const size_t inlen, const size_t keylen);
-
-#endif /* _BLAKE2S_H_ */
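A one-shot digest sketch using the blake2s() convenience wrapper declared
above; msg and key are hypothetical caller buffers, and this code is
illustrative only, not part of the removed sources.

static void
digest_example(uint8_t out[BLAKE2S_HASH_SIZE], const uint8_t *msg,
    size_t msg_len, const uint8_t key[BLAKE2S_KEY_SIZE])
{
	/* Keyed 32-byte BLAKE2s digest of msg. */
	blake2s(out, msg, key, BLAKE2S_HASH_SIZE, msg_len, BLAKE2S_KEY_SIZE);
}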
diff --git a/sys/dev/if_wg/module/chacha20-x86_64.S b/sys/dev/if_wg/module/chacha20-x86_64.S
deleted file mode 100644
index 0edb79483758..000000000000
--- a/sys/dev/if_wg/module/chacha20-x86_64.S
+++ /dev/null
@@ -1,2834 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-//
-// Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
-// Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
-// Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
-//
-// This code is taken from the OpenSSL project but the author, Andy Polyakov,
-// has relicensed it under the licenses specified in the SPDX header above.
-// The original headers, including the original license headers, are
-// included below for completeness.
-//
-// ====================================================================
-// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-// project. The module is, however, dual licensed under OpenSSL and
-// CRYPTOGAMS licenses depending on where you obtain it. For further
-// details see http://www.openssl.org/~appro/cryptogams/.
-// ====================================================================
-//
-// November 2014
-//
-// ChaCha20 for x86_64.
-//
-// December 2016
-//
-// Add AVX512F code path.
-//
-// December 2017
-//
-// Add AVX512VL code path.
-//
-// Performance in cycles per byte out of large buffer.
-//
-// IALU/gcc 4.8(i) 1x/2xSSSE3(ii) 4xSSSE3 NxAVX(v)
-//
-// P4 9.48/+99% - -
-// Core2 7.83/+55% 7.90/5.76 4.35
-// Westmere 7.19/+50% 5.60/4.50 3.00
-// Sandy Bridge 8.31/+42% 5.45/4.00 2.72
-// Ivy Bridge 6.71/+46% 5.40/? 2.41
-// Haswell 5.92/+43% 5.20/3.45 2.42 1.23
-// Skylake[-X] 5.87/+39% 4.70/3.22 2.31 1.19[0.80(vi)]
-// Silvermont 12.0/+33% 7.75/6.90 7.03(iii)
-// Knights L 11.7/- ? 9.60(iii) 0.80
-// Goldmont 10.6/+17% 5.10/3.52 3.28
-// Sledgehammer 7.28/+52% - -
-// Bulldozer 9.66/+28% 9.85/5.35(iv) 3.06(iv)
-// Ryzen 5.96/+50% 5.19/3.00 2.40 2.09
-// VIA Nano 10.5/+46% 6.72/6.88 6.05
-//
-// (i) compared to older gcc 3.x one can observe >2x improvement on
-// most platforms;
-// (ii) 2xSSSE3 is code path optimized specifically for 128 bytes used
-// by chacha20_poly1305_tls_cipher, results are EVP-free;
-// (iii) this is not optimal result for Atom because of MSROM
-// limitations, SSE2 can do better, but gain is considered too
-// low to justify the [maintenance] effort;
-// (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20
-// and 4.85 for 128-byte inputs;
-// (v) 8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable;
-// (vi) even though Skylake-X can execute AVX512F code and deliver 0.57
-// cpb in single thread, the corresponding capability is suppressed;
-
-//#include <linux/linkage.h>
-.section .rodata.cst16.Lzero, "aM", @progbits, 16
-.align 16
-.Lzero:
-.long 0,0,0,0
-.section .rodata.cst16.Lone, "aM", @progbits, 16
-.align 16
-.Lone:
-.long 1,0,0,0
-.section .rodata.cst16.Linc, "aM", @progbits, 16
-.align 16
-.Linc:
-.long 0,1,2,3
-.section .rodata.cst16.Lfour, "aM", @progbits, 16
-.align 16
-.Lfour:
-.long 4,4,4,4
-.section .rodata.cst32.Lincy, "aM", @progbits, 32
-.align 32
-.Lincy:
-.long 0,2,4,6,1,3,5,7
-.section .rodata.cst32.Leight, "aM", @progbits, 32
-.align 32
-.Leight:
-.long 8,8,8,8,8,8,8,8
-.section .rodata.cst16.Lrot16, "aM", @progbits, 16
-.align 16
-.Lrot16:
-.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
-.section .rodata.cst16.Lrot24, "aM", @progbits, 16
-.align 16
-.Lrot24:
-.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
-.section .rodata.cst32.Ltwoy, "aM", @progbits, 32
-.align 32
-.Ltwoy:
-.long 2,0,0,0, 2,0,0,0
-.section .rodata.cst64.Lzeroz, "aM", @progbits, 64
-.align 64
-.Lzeroz:
-.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
-.section .rodata.cst64.Lfourz, "aM", @progbits, 64
-.align 64
-.Lfourz:
-.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
-.section .rodata.cst64.Lincz, "aM", @progbits, 64
-.align 64
-.Lincz:
-.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
-.section .rodata.cst64.Lsixteen, "aM", @progbits, 64
-.align 64
-.Lsixteen:
-.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
-.section .rodata.cst16.Lsigma, "aM", @progbits, 16
-.align 16
-.Lsigma:
-.ascii "expand 32-byte k"
-.text
-#ifdef CONFIG_AS_SSSE3
-.align 32
-SYM_FUNC_START(hchacha20_ssse3)
-.Lhchacha20_ssse3:
- movdqa .Lsigma(%rip),%xmm0
- movdqu (%rdx),%xmm1
- movdqu 16(%rdx),%xmm2
- movdqu (%rsi),%xmm3
- # This code is only used when targeting kernel.
- # If targeting win64, xmm{6,7} preserving needs to be added.
- movdqa .Lrot16(%rip),%xmm6
- movdqa .Lrot24(%rip),%xmm7
- mov $10,%r8 # reuse %r8
- jmp 1f
-.align 32
-1:
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm6,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $20,%xmm1
- pslld $12,%xmm4
- por %xmm4,%xmm1
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm7,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $25,%xmm1
- pslld $7,%xmm4
- por %xmm4,%xmm1
- pshufd $147,%xmm0,%xmm0
- pshufd $78,%xmm3,%xmm3
- pshufd $57,%xmm2,%xmm2
- nop
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm6,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $20,%xmm1
- pslld $12,%xmm4
- por %xmm4,%xmm1
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm7,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $25,%xmm1
- pslld $7,%xmm4
- por %xmm4,%xmm1
- pshufd $57,%xmm0,%xmm0
- pshufd $78,%xmm3,%xmm3
- pshufd $147,%xmm2,%xmm2
- dec %r8
- jnz 1b
- movdqu %xmm0, (%rdi)
- movdqu %xmm3, 16(%rdi)
- ret
-SYM_FUNC_END(hchacha20_ssse3)
-.align 32
-SYM_FUNC_START(chacha20_ssse3)
-.Lchacha20_ssse3:
- lea 8(%rsp),%r10 # frame pointer
- cmp $128,%rdx # we might throw away some data,
- je .Lchacha20_128
- ja .Lchacha20_4x # but overall it won't be slower
-
-.Ldo_ssse3_after_all:
- sub $64+8,%rsp
- and $-16,%rsp
- movdqa .Lsigma(%rip),%xmm0
- movdqu (%rcx),%xmm1
- movdqu 16(%rcx),%xmm2
- movdqu (%r8),%xmm3
- movdqa .Lrot16(%rip),%xmm6
- movdqa .Lrot24(%rip),%xmm7
-
- movdqa %xmm0,0x00(%rsp)
- movdqa %xmm1,0x10(%rsp)
- movdqa %xmm2,0x20(%rsp)
- movdqa %xmm3,0x30(%rsp)
- mov $10,%r8 # reuse %r8
- jmp .Loop_ssse3
-
-.align 32
-.Loop_outer_ssse3:
- movdqa .Lone(%rip),%xmm3
- movdqa 0x00(%rsp),%xmm0
- movdqa 0x10(%rsp),%xmm1
- movdqa 0x20(%rsp),%xmm2
- paddd 0x30(%rsp),%xmm3
- mov $10,%r8
- movdqa %xmm3,0x30(%rsp)
- jmp .Loop_ssse3
-
-.align 32
-.Loop_ssse3:
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm6,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $20,%xmm1
- pslld $12,%xmm4
- por %xmm4,%xmm1
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm7,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $25,%xmm1
- pslld $7,%xmm4
- por %xmm4,%xmm1
- pshufd $147,%xmm0,%xmm0
- pshufd $78,%xmm3,%xmm3
- pshufd $57,%xmm2,%xmm2
- nop
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm6,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $20,%xmm1
- pslld $12,%xmm4
- por %xmm4,%xmm1
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm7,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $25,%xmm1
- pslld $7,%xmm4
- por %xmm4,%xmm1
- pshufd $57,%xmm0,%xmm0
- pshufd $78,%xmm3,%xmm3
- pshufd $147,%xmm2,%xmm2
- dec %r8
- jnz .Loop_ssse3
- paddd 0x00(%rsp),%xmm0
- paddd 0x10(%rsp),%xmm1
- paddd 0x20(%rsp),%xmm2
- paddd 0x30(%rsp),%xmm3
-
- cmp $64,%rdx
- jb .Ltail_ssse3
-
- movdqu 0x00(%rsi),%xmm4
- movdqu 0x10(%rsi),%xmm5
- pxor %xmm4,%xmm0 # xor with input
- movdqu 0x20(%rsi),%xmm4
- pxor %xmm5,%xmm1
- movdqu 0x30(%rsi),%xmm5
- lea 0x40(%rsi),%rsi # inp+=64
- pxor %xmm4,%xmm2
- pxor %xmm5,%xmm3
-
- movdqu %xmm0,0x00(%rdi) # write output
- movdqu %xmm1,0x10(%rdi)
- movdqu %xmm2,0x20(%rdi)
- movdqu %xmm3,0x30(%rdi)
- lea 0x40(%rdi),%rdi # out+=64
-
- sub $64,%rdx
- jnz .Loop_outer_ssse3
-
- jmp .Ldone_ssse3
-
-.align 16
-.Ltail_ssse3:
- movdqa %xmm0,0x00(%rsp)
- movdqa %xmm1,0x10(%rsp)
- movdqa %xmm2,0x20(%rsp)
- movdqa %xmm3,0x30(%rsp)
- xor %r8,%r8
-
-.Loop_tail_ssse3:
- movzb (%rsi,%r8),%eax
- movzb (%rsp,%r8),%ecx
- lea 1(%r8),%r8
- xor %ecx,%eax
- mov %al,-1(%rdi,%r8)
- dec %rdx
- jnz .Loop_tail_ssse3
-
-.Ldone_ssse3:
- lea -8(%r10),%rsp
-.Lssse3_epilogue:
- ret
-SYM_FUNC_END(chacha20_ssse3)
-.type chacha20_128,@function
-.align 32
-chacha20_128:
-.Lchacha20_128:
- lea 8(%rsp),%r10 # frame pointer
- sub $64+8,%rsp
- and $-16,%rsp
- movdqa .Lsigma(%rip),%xmm8
- movdqu (%rcx),%xmm9
- movdqu 16(%rcx),%xmm2
- movdqu (%r8),%xmm3
- movdqa .Lone(%rip),%xmm1
- movdqa .Lrot16(%rip),%xmm6
- movdqa .Lrot24(%rip),%xmm7
-
- movdqa %xmm8,%xmm10
- movdqa %xmm8,0x00(%rsp)
- movdqa %xmm9,%xmm11
- movdqa %xmm9,0x10(%rsp)
- movdqa %xmm2,%xmm0
- movdqa %xmm2,0x20(%rsp)
- paddd %xmm3,%xmm1
- movdqa %xmm3,0x30(%rsp)
- mov $10,%r8 # reuse %r8
- jmp .Loop_128
-
-.align 32
-.Loop_128:
- paddd %xmm9,%xmm8
- pxor %xmm8,%xmm3
- paddd %xmm11,%xmm10
- pxor %xmm10,%xmm1
- pshufb %xmm6,%xmm3
- pshufb %xmm6,%xmm1
- paddd %xmm3,%xmm2
- paddd %xmm1,%xmm0
- pxor %xmm2,%xmm9
- pxor %xmm0,%xmm11
- movdqa %xmm9,%xmm4
- psrld $20,%xmm9
- movdqa %xmm11,%xmm5
- pslld $12,%xmm4
- psrld $20,%xmm11
- por %xmm4,%xmm9
- pslld $12,%xmm5
- por %xmm5,%xmm11
- paddd %xmm9,%xmm8
- pxor %xmm8,%xmm3
- paddd %xmm11,%xmm10
- pxor %xmm10,%xmm1
- pshufb %xmm7,%xmm3
- pshufb %xmm7,%xmm1
- paddd %xmm3,%xmm2
- paddd %xmm1,%xmm0
- pxor %xmm2,%xmm9
- pxor %xmm0,%xmm11
- movdqa %xmm9,%xmm4
- psrld $25,%xmm9
- movdqa %xmm11,%xmm5
- pslld $7,%xmm4
- psrld $25,%xmm11
- por %xmm4,%xmm9
- pslld $7,%xmm5
- por %xmm5,%xmm11
- pshufd $147,%xmm8,%xmm8
- pshufd $78,%xmm3,%xmm3
- pshufd $57,%xmm2,%xmm2
- pshufd $147,%xmm10,%xmm10
- pshufd $78,%xmm1,%xmm1
- pshufd $57,%xmm0,%xmm0
- paddd %xmm9,%xmm8
- pxor %xmm8,%xmm3
- paddd %xmm11,%xmm10
- pxor %xmm10,%xmm1
- pshufb %xmm6,%xmm3
- pshufb %xmm6,%xmm1
- paddd %xmm3,%xmm2
- paddd %xmm1,%xmm0
- pxor %xmm2,%xmm9
- pxor %xmm0,%xmm11
- movdqa %xmm9,%xmm4
- psrld $20,%xmm9
- movdqa %xmm11,%xmm5
- pslld $12,%xmm4
- psrld $20,%xmm11
- por %xmm4,%xmm9
- pslld $12,%xmm5
- por %xmm5,%xmm11
- paddd %xmm9,%xmm8
- pxor %xmm8,%xmm3
- paddd %xmm11,%xmm10
- pxor %xmm10,%xmm1
- pshufb %xmm7,%xmm3
- pshufb %xmm7,%xmm1
- paddd %xmm3,%xmm2
- paddd %xmm1,%xmm0
- pxor %xmm2,%xmm9
- pxor %xmm0,%xmm11
- movdqa %xmm9,%xmm4
- psrld $25,%xmm9
- movdqa %xmm11,%xmm5
- pslld $7,%xmm4
- psrld $25,%xmm11
- por %xmm4,%xmm9
- pslld $7,%xmm5
- por %xmm5,%xmm11
- pshufd $57,%xmm8,%xmm8
- pshufd $78,%xmm3,%xmm3
- pshufd $147,%xmm2,%xmm2
- pshufd $57,%xmm10,%xmm10
- pshufd $78,%xmm1,%xmm1
- pshufd $147,%xmm0,%xmm0
- dec %r8
- jnz .Loop_128
- paddd 0x00(%rsp),%xmm8
- paddd 0x10(%rsp),%xmm9
- paddd 0x20(%rsp),%xmm2
- paddd 0x30(%rsp),%xmm3
- paddd .Lone(%rip),%xmm1
- paddd 0x00(%rsp),%xmm10
- paddd 0x10(%rsp),%xmm11
- paddd 0x20(%rsp),%xmm0
- paddd 0x30(%rsp),%xmm1
-
- movdqu 0x00(%rsi),%xmm4
- movdqu 0x10(%rsi),%xmm5
- pxor %xmm4,%xmm8 # xor with input
- movdqu 0x20(%rsi),%xmm4
- pxor %xmm5,%xmm9
- movdqu 0x30(%rsi),%xmm5
- pxor %xmm4,%xmm2
- movdqu 0x40(%rsi),%xmm4
- pxor %xmm5,%xmm3
- movdqu 0x50(%rsi),%xmm5
- pxor %xmm4,%xmm10
- movdqu 0x60(%rsi),%xmm4
- pxor %xmm5,%xmm11
- movdqu 0x70(%rsi),%xmm5
- pxor %xmm4,%xmm0
- pxor %xmm5,%xmm1
-
- movdqu %xmm8,0x00(%rdi) # write output
- movdqu %xmm9,0x10(%rdi)
- movdqu %xmm2,0x20(%rdi)
- movdqu %xmm3,0x30(%rdi)
- movdqu %xmm10,0x40(%rdi)
- movdqu %xmm11,0x50(%rdi)
- movdqu %xmm0,0x60(%rdi)
- movdqu %xmm1,0x70(%rdi)
- lea -8(%r10),%rsp
-.L128_epilogue:
- ret
-.size chacha20_128,.-chacha20_128
-.type chacha20_4x,@function
-.align 32
-chacha20_4x:
-.Lchacha20_4x:
- lea 8(%rsp),%r10 # frame pointer
- cmp $192,%rdx
- ja .Lproceed4x
-.Lproceed4x:
- sub $0x140+8,%rsp
- and $-16,%rsp
- movdqa .Lsigma(%rip),%xmm11 # key[0]
- movdqu (%rcx),%xmm15 # key[1]
- movdqu 16(%rcx),%xmm7 # key[2]
- movdqu (%r8),%xmm3 # key[3]
- lea 0x100(%rsp),%rcx # size optimization
- lea .Lrot16(%rip),%r9
- lea .Lrot24(%rip),%r11
-
- pshufd $0x00,%xmm11,%xmm8 # smash key by lanes...
- pshufd $0x55,%xmm11,%xmm9
- movdqa %xmm8,0x40(%rsp) # ... and offload
- pshufd $0xaa,%xmm11,%xmm10
- movdqa %xmm9,0x50(%rsp)
- pshufd $0xff,%xmm11,%xmm11
- movdqa %xmm10,0x60(%rsp)
- movdqa %xmm11,0x70(%rsp)
-
- pshufd $0x00,%xmm15,%xmm12
- pshufd $0x55,%xmm15,%xmm13
- movdqa %xmm12,0x80-0x100(%rcx)
- pshufd $0xaa,%xmm15,%xmm14
- movdqa %xmm13,0x90-0x100(%rcx)
- pshufd $0xff,%xmm15,%xmm15
- movdqa %xmm14,0xa0-0x100(%rcx)
- movdqa %xmm15,0xb0-0x100(%rcx)
-
- pshufd $0x00,%xmm7,%xmm4 # ""
- pshufd $0x55,%xmm7,%xmm5 # ""
- movdqa %xmm4,0xc0-0x100(%rcx)
- pshufd $0xaa,%xmm7,%xmm6 # ""
- movdqa %xmm5,0xd0-0x100(%rcx)
- pshufd $0xff,%xmm7,%xmm7 # ""
- movdqa %xmm6,0xe0-0x100(%rcx)
- movdqa %xmm7,0xf0-0x100(%rcx)
-
- pshufd $0x00,%xmm3,%xmm0
- pshufd $0x55,%xmm3,%xmm1
- paddd .Linc(%rip),%xmm0 # don't save counters yet
- pshufd $0xaa,%xmm3,%xmm2
- movdqa %xmm1,0x110-0x100(%rcx)
- pshufd $0xff,%xmm3,%xmm3
- movdqa %xmm2,0x120-0x100(%rcx)
- movdqa %xmm3,0x130-0x100(%rcx)
-
- jmp .Loop_enter4x
-
-.align 32
-.Loop_outer4x:
- movdqa 0x40(%rsp),%xmm8 # re-load smashed key
- movdqa 0x50(%rsp),%xmm9
- movdqa 0x60(%rsp),%xmm10
- movdqa 0x70(%rsp),%xmm11
- movdqa 0x80-0x100(%rcx),%xmm12
- movdqa 0x90-0x100(%rcx),%xmm13
- movdqa 0xa0-0x100(%rcx),%xmm14
- movdqa 0xb0-0x100(%rcx),%xmm15
- movdqa 0xc0-0x100(%rcx),%xmm4 # ""
- movdqa 0xd0-0x100(%rcx),%xmm5 # ""
- movdqa 0xe0-0x100(%rcx),%xmm6 # ""
- movdqa 0xf0-0x100(%rcx),%xmm7 # ""
- movdqa 0x100-0x100(%rcx),%xmm0
- movdqa 0x110-0x100(%rcx),%xmm1
- movdqa 0x120-0x100(%rcx),%xmm2
- movdqa 0x130-0x100(%rcx),%xmm3
- paddd .Lfour(%rip),%xmm0 # next SIMD counters
-
-.Loop_enter4x:
- movdqa %xmm6,0x20(%rsp) # SIMD equivalent of "%nox"
- movdqa %xmm7,0x30(%rsp) # SIMD equivalent of "%nox"
- movdqa (%r9),%xmm7 # .Lrot16(%rip)
- mov $10,%eax
- movdqa %xmm0,0x100-0x100(%rcx) # save SIMD counters
- jmp .Loop4x
-
-.align 32
-.Loop4x:
- paddd %xmm12,%xmm8
- paddd %xmm13,%xmm9
- pxor %xmm8,%xmm0
- pxor %xmm9,%xmm1
- pshufb %xmm7,%xmm0
- pshufb %xmm7,%xmm1
- paddd %xmm0,%xmm4
- paddd %xmm1,%xmm5
- pxor %xmm4,%xmm12
- pxor %xmm5,%xmm13
- movdqa %xmm12,%xmm6
- pslld $12,%xmm12
- psrld $20,%xmm6
- movdqa %xmm13,%xmm7
- pslld $12,%xmm13
- por %xmm6,%xmm12
- psrld $20,%xmm7
- movdqa (%r11),%xmm6
- por %xmm7,%xmm13
- paddd %xmm12,%xmm8
- paddd %xmm13,%xmm9
- pxor %xmm8,%xmm0
- pxor %xmm9,%xmm1
- pshufb %xmm6,%xmm0
- pshufb %xmm6,%xmm1
- paddd %xmm0,%xmm4
- paddd %xmm1,%xmm5
- pxor %xmm4,%xmm12
- pxor %xmm5,%xmm13
- movdqa %xmm12,%xmm7
- pslld $7,%xmm12
- psrld $25,%xmm7
- movdqa %xmm13,%xmm6
- pslld $7,%xmm13
- por %xmm7,%xmm12
- psrld $25,%xmm6
- movdqa (%r9),%xmm7
- por %xmm6,%xmm13
- movdqa %xmm4,0(%rsp)
- movdqa %xmm5,16(%rsp)
- movdqa 32(%rsp),%xmm4
- movdqa 48(%rsp),%xmm5
- paddd %xmm14,%xmm10
- paddd %xmm15,%xmm11
- pxor %xmm10,%xmm2
- pxor %xmm11,%xmm3
- pshufb %xmm7,%xmm2
- pshufb %xmm7,%xmm3
- paddd %xmm2,%xmm4
- paddd %xmm3,%xmm5
- pxor %xmm4,%xmm14
- pxor %xmm5,%xmm15
- movdqa %xmm14,%xmm6
- pslld $12,%xmm14
- psrld $20,%xmm6
- movdqa %xmm15,%xmm7
- pslld $12,%xmm15
- por %xmm6,%xmm14
- psrld $20,%xmm7
- movdqa (%r11),%xmm6
- por %xmm7,%xmm15
- paddd %xmm14,%xmm10
- paddd %xmm15,%xmm11
- pxor %xmm10,%xmm2
- pxor %xmm11,%xmm3
- pshufb %xmm6,%xmm2
- pshufb %xmm6,%xmm3
- paddd %xmm2,%xmm4
- paddd %xmm3,%xmm5
- pxor %xmm4,%xmm14
- pxor %xmm5,%xmm15
- movdqa %xmm14,%xmm7
- pslld $7,%xmm14
- psrld $25,%xmm7
- movdqa %xmm15,%xmm6
- pslld $7,%xmm15
- por %xmm7,%xmm14
- psrld $25,%xmm6
- movdqa (%r9),%xmm7
- por %xmm6,%xmm15
- paddd %xmm13,%xmm8
- paddd %xmm14,%xmm9
- pxor %xmm8,%xmm3
- pxor %xmm9,%xmm0
- pshufb %xmm7,%xmm3
- pshufb %xmm7,%xmm0
- paddd %xmm3,%xmm4
- paddd %xmm0,%xmm5
- pxor %xmm4,%xmm13
- pxor %xmm5,%xmm14
- movdqa %xmm13,%xmm6
- pslld $12,%xmm13
- psrld $20,%xmm6
- movdqa %xmm14,%xmm7
- pslld $12,%xmm14
- por %xmm6,%xmm13
- psrld $20,%xmm7
- movdqa (%r11),%xmm6
- por %xmm7,%xmm14
- paddd %xmm13,%xmm8
- paddd %xmm14,%xmm9
- pxor %xmm8,%xmm3
- pxor %xmm9,%xmm0
- pshufb %xmm6,%xmm3
- pshufb %xmm6,%xmm0
- paddd %xmm3,%xmm4
- paddd %xmm0,%xmm5
- pxor %xmm4,%xmm13
- pxor %xmm5,%xmm14
- movdqa %xmm13,%xmm7
- pslld $7,%xmm13
- psrld $25,%xmm7
- movdqa %xmm14,%xmm6
- pslld $7,%xmm14
- por %xmm7,%xmm13
- psrld $25,%xmm6
- movdqa (%r9),%xmm7
- por %xmm6,%xmm14
- movdqa %xmm4,32(%rsp)
- movdqa %xmm5,48(%rsp)
- movdqa 0(%rsp),%xmm4
- movdqa 16(%rsp),%xmm5
- paddd %xmm15,%xmm10
- paddd %xmm12,%xmm11
- pxor %xmm10,%xmm1
- pxor %xmm11,%xmm2
- pshufb %xmm7,%xmm1
- pshufb %xmm7,%xmm2
- paddd %xmm1,%xmm4
- paddd %xmm2,%xmm5
- pxor %xmm4,%xmm15
- pxor %xmm5,%xmm12
- movdqa %xmm15,%xmm6
- pslld $12,%xmm15
- psrld $20,%xmm6
- movdqa %xmm12,%xmm7
- pslld $12,%xmm12
- por %xmm6,%xmm15
- psrld $20,%xmm7
- movdqa (%r11),%xmm6
- por %xmm7,%xmm12
- paddd %xmm15,%xmm10
- paddd %xmm12,%xmm11
- pxor %xmm10,%xmm1
- pxor %xmm11,%xmm2
- pshufb %xmm6,%xmm1
- pshufb %xmm6,%xmm2
- paddd %xmm1,%xmm4
- paddd %xmm2,%xmm5
- pxor %xmm4,%xmm15
- pxor %xmm5,%xmm12
- movdqa %xmm15,%xmm7
- pslld $7,%xmm15
- psrld $25,%xmm7
- movdqa %xmm12,%xmm6
- pslld $7,%xmm12
- por %xmm7,%xmm15
- psrld $25,%xmm6
- movdqa (%r9),%xmm7
- por %xmm6,%xmm12
- dec %eax
- jnz .Loop4x
-
- paddd 0x40(%rsp),%xmm8 # accumulate key material
- paddd 0x50(%rsp),%xmm9
- paddd 0x60(%rsp),%xmm10
- paddd 0x70(%rsp),%xmm11
-
- movdqa %xmm8,%xmm6 # "de-interlace" data
- punpckldq %xmm9,%xmm8
- movdqa %xmm10,%xmm7
- punpckldq %xmm11,%xmm10
- punpckhdq %xmm9,%xmm6
- punpckhdq %xmm11,%xmm7
- movdqa %xmm8,%xmm9
- punpcklqdq %xmm10,%xmm8 # "a0"
- movdqa %xmm6,%xmm11
- punpcklqdq %xmm7,%xmm6 # "a2"
- punpckhqdq %xmm10,%xmm9 # "a1"
- punpckhqdq %xmm7,%xmm11 # "a3"
- paddd 0x80-0x100(%rcx),%xmm12
- paddd 0x90-0x100(%rcx),%xmm13
- paddd 0xa0-0x100(%rcx),%xmm14
- paddd 0xb0-0x100(%rcx),%xmm15
-
- movdqa %xmm8,0x00(%rsp) # offload
- movdqa %xmm9,0x10(%rsp)
- movdqa 0x20(%rsp),%xmm8 # "xc2"
- movdqa 0x30(%rsp),%xmm9 # "xc3"
-
- movdqa %xmm12,%xmm10
- punpckldq %xmm13,%xmm12
- movdqa %xmm14,%xmm7
- punpckldq %xmm15,%xmm14
- punpckhdq %xmm13,%xmm10
- punpckhdq %xmm15,%xmm7
- movdqa %xmm12,%xmm13
- punpcklqdq %xmm14,%xmm12 # "b0"
- movdqa %xmm10,%xmm15
- punpcklqdq %xmm7,%xmm10 # "b2"
- punpckhqdq %xmm14,%xmm13 # "b1"
- punpckhqdq %xmm7,%xmm15 # "b3"
- paddd 0xc0-0x100(%rcx),%xmm4
- paddd 0xd0-0x100(%rcx),%xmm5
- paddd 0xe0-0x100(%rcx),%xmm8
- paddd 0xf0-0x100(%rcx),%xmm9
-
- movdqa %xmm6,0x20(%rsp) # keep offloading
- movdqa %xmm11,0x30(%rsp)
-
- movdqa %xmm4,%xmm14
- punpckldq %xmm5,%xmm4
- movdqa %xmm8,%xmm7
- punpckldq %xmm9,%xmm8
- punpckhdq %xmm5,%xmm14
- punpckhdq %xmm9,%xmm7
- movdqa %xmm4,%xmm5
- punpcklqdq %xmm8,%xmm4 # "c0"
- movdqa %xmm14,%xmm9
- punpcklqdq %xmm7,%xmm14 # "c2"
- punpckhqdq %xmm8,%xmm5 # "c1"
- punpckhqdq %xmm7,%xmm9 # "c3"
- paddd 0x100-0x100(%rcx),%xmm0
- paddd 0x110-0x100(%rcx),%xmm1
- paddd 0x120-0x100(%rcx),%xmm2
- paddd 0x130-0x100(%rcx),%xmm3
-
- movdqa %xmm0,%xmm8
- punpckldq %xmm1,%xmm0
- movdqa %xmm2,%xmm7
- punpckldq %xmm3,%xmm2
- punpckhdq %xmm1,%xmm8
- punpckhdq %xmm3,%xmm7
- movdqa %xmm0,%xmm1
- punpcklqdq %xmm2,%xmm0 # "d0"
- movdqa %xmm8,%xmm3
- punpcklqdq %xmm7,%xmm8 # "d2"
- punpckhqdq %xmm2,%xmm1 # "d1"
- punpckhqdq %xmm7,%xmm3 # "d3"
- cmp $64*4,%rdx
- jb .Ltail4x
-
- movdqu 0x00(%rsi),%xmm6 # xor with input
- movdqu 0x10(%rsi),%xmm11
- movdqu 0x20(%rsi),%xmm2
- movdqu 0x30(%rsi),%xmm7
- pxor 0x00(%rsp),%xmm6 # is offloaded, remember?
- pxor %xmm12,%xmm11
- pxor %xmm4,%xmm2
- pxor %xmm0,%xmm7
-
- movdqu %xmm6,0x00(%rdi)
- movdqu 0x40(%rsi),%xmm6
- movdqu %xmm11,0x10(%rdi)
- movdqu 0x50(%rsi),%xmm11
- movdqu %xmm2,0x20(%rdi)
- movdqu 0x60(%rsi),%xmm2
- movdqu %xmm7,0x30(%rdi)
- movdqu 0x70(%rsi),%xmm7
- lea 0x80(%rsi),%rsi # size optimization
- pxor 0x10(%rsp),%xmm6
- pxor %xmm13,%xmm11
- pxor %xmm5,%xmm2
- pxor %xmm1,%xmm7
-
- movdqu %xmm6,0x40(%rdi)
- movdqu 0x00(%rsi),%xmm6
- movdqu %xmm11,0x50(%rdi)
- movdqu 0x10(%rsi),%xmm11
- movdqu %xmm2,0x60(%rdi)
- movdqu 0x20(%rsi),%xmm2
- movdqu %xmm7,0x70(%rdi)
- lea 0x80(%rdi),%rdi # size optimization
- movdqu 0x30(%rsi),%xmm7
- pxor 0x20(%rsp),%xmm6
- pxor %xmm10,%xmm11
- pxor %xmm14,%xmm2
- pxor %xmm8,%xmm7
-
- movdqu %xmm6,0x00(%rdi)
- movdqu 0x40(%rsi),%xmm6
- movdqu %xmm11,0x10(%rdi)
- movdqu 0x50(%rsi),%xmm11
- movdqu %xmm2,0x20(%rdi)
- movdqu 0x60(%rsi),%xmm2
- movdqu %xmm7,0x30(%rdi)
- movdqu 0x70(%rsi),%xmm7
- lea 0x80(%rsi),%rsi # inp+=64*4
- pxor 0x30(%rsp),%xmm6
- pxor %xmm15,%xmm11
- pxor %xmm9,%xmm2
- pxor %xmm3,%xmm7
- movdqu %xmm6,0x40(%rdi)
- movdqu %xmm11,0x50(%rdi)
- movdqu %xmm2,0x60(%rdi)
- movdqu %xmm7,0x70(%rdi)
- lea 0x80(%rdi),%rdi # out+=64*4
-
- sub $64*4,%rdx
- jnz .Loop_outer4x
-
- jmp .Ldone4x
-
-.Ltail4x:
- cmp $192,%rdx
- jae .L192_or_more4x
- cmp $128,%rdx
- jae .L128_or_more4x
- cmp $64,%rdx
- jae .L64_or_more4x
-
- #movdqa 0x00(%rsp),%xmm6 # is offloaded, remember?
- xor %r9,%r9
- #movdqa %xmm6,0x00(%rsp)
- movdqa %xmm12,0x10(%rsp)
- movdqa %xmm4,0x20(%rsp)
- movdqa %xmm0,0x30(%rsp)
- jmp .Loop_tail4x
-
-.align 32
-.L64_or_more4x:
- movdqu 0x00(%rsi),%xmm6 # xor with input
- movdqu 0x10(%rsi),%xmm11
- movdqu 0x20(%rsi),%xmm2
- movdqu 0x30(%rsi),%xmm7
- pxor 0x00(%rsp),%xmm6 # is offloaded, remember?
- pxor %xmm12,%xmm11
- pxor %xmm4,%xmm2
- pxor %xmm0,%xmm7
- movdqu %xmm6,0x00(%rdi)
- movdqu %xmm11,0x10(%rdi)
- movdqu %xmm2,0x20(%rdi)
- movdqu %xmm7,0x30(%rdi)
- je .Ldone4x
-
- movdqa 0x10(%rsp),%xmm6 # is offloaded, remember?
- lea 0x40(%rsi),%rsi # inp+=64*1
- xor %r9,%r9
- movdqa %xmm6,0x00(%rsp)
- movdqa %xmm13,0x10(%rsp)
- lea 0x40(%rdi),%rdi # out+=64*1
- movdqa %xmm5,0x20(%rsp)
- sub $64,%rdx # len-=64*1
- movdqa %xmm1,0x30(%rsp)
- jmp .Loop_tail4x
-
-.align 32
-.L128_or_more4x:
- movdqu 0x00(%rsi),%xmm6 # xor with input
- movdqu 0x10(%rsi),%xmm11
- movdqu 0x20(%rsi),%xmm2
- movdqu 0x30(%rsi),%xmm7
- pxor 0x00(%rsp),%xmm6 # is offloaded, remember?
- pxor %xmm12,%xmm11
- pxor %xmm4,%xmm2
- pxor %xmm0,%xmm7
-
- movdqu %xmm6,0x00(%rdi)
- movdqu 0x40(%rsi),%xmm6
- movdqu %xmm11,0x10(%rdi)
- movdqu 0x50(%rsi),%xmm11
- movdqu %xmm2,0x20(%rdi)
- movdqu 0x60(%rsi),%xmm2
- movdqu %xmm7,0x30(%rdi)
- movdqu 0x70(%rsi),%xmm7
- pxor 0x10(%rsp),%xmm6
- pxor %xmm13,%xmm11
- pxor %xmm5,%xmm2
- pxor %xmm1,%xmm7
- movdqu %xmm6,0x40(%rdi)
- movdqu %xmm11,0x50(%rdi)
- movdqu %xmm2,0x60(%rdi)
- movdqu %xmm7,0x70(%rdi)
- je .Ldone4x
-
- movdqa 0x20(%rsp),%xmm6 # is offloaded, remember?
- lea 0x80(%rsi),%rsi # inp+=64*2
- xor %r9,%r9
- movdqa %xmm6,0x00(%rsp)
- movdqa %xmm10,0x10(%rsp)
- lea 0x80(%rdi),%rdi # out+=64*2
- movdqa %xmm14,0x20(%rsp)
- sub $128,%rdx # len-=64*2
- movdqa %xmm8,0x30(%rsp)
- jmp .Loop_tail4x
-
-.align 32
-.L192_or_more4x:
- movdqu 0x00(%rsi),%xmm6 # xor with input
- movdqu 0x10(%rsi),%xmm11
- movdqu 0x20(%rsi),%xmm2
- movdqu 0x30(%rsi),%xmm7
- pxor 0x00(%rsp),%xmm6 # is offloaded, remember?
- pxor %xmm12,%xmm11
- pxor %xmm4,%xmm2
- pxor %xmm0,%xmm7
-
- movdqu %xmm6,0x00(%rdi)
- movdqu 0x40(%rsi),%xmm6
- movdqu %xmm11,0x10(%rdi)
- movdqu 0x50(%rsi),%xmm11
- movdqu %xmm2,0x20(%rdi)
- movdqu 0x60(%rsi),%xmm2
- movdqu %xmm7,0x30(%rdi)
- movdqu 0x70(%rsi),%xmm7
- lea 0x80(%rsi),%rsi # size optimization
- pxor 0x10(%rsp),%xmm6
- pxor %xmm13,%xmm11
- pxor %xmm5,%xmm2
- pxor %xmm1,%xmm7
-
- movdqu %xmm6,0x40(%rdi)
- movdqu 0x00(%rsi),%xmm6
- movdqu %xmm11,0x50(%rdi)
- movdqu 0x10(%rsi),%xmm11
- movdqu %xmm2,0x60(%rdi)
- movdqu 0x20(%rsi),%xmm2
- movdqu %xmm7,0x70(%rdi)
- lea 0x80(%rdi),%rdi # size optimization
- movdqu 0x30(%rsi),%xmm7
- pxor 0x20(%rsp),%xmm6
- pxor %xmm10,%xmm11
- pxor %xmm14,%xmm2
- pxor %xmm8,%xmm7
- movdqu %xmm6,0x00(%rdi)
- movdqu %xmm11,0x10(%rdi)
- movdqu %xmm2,0x20(%rdi)
- movdqu %xmm7,0x30(%rdi)
- je .Ldone4x
-
- movdqa 0x30(%rsp),%xmm6 # is offloaded, remember?
- lea 0x40(%rsi),%rsi # inp+=64*3
- xor %r9,%r9
- movdqa %xmm6,0x00(%rsp)
- movdqa %xmm15,0x10(%rsp)
- lea 0x40(%rdi),%rdi # out+=64*3
- movdqa %xmm9,0x20(%rsp)
- sub $192,%rdx # len-=64*3
- movdqa %xmm3,0x30(%rsp)
-
-.Loop_tail4x:
- movzb (%rsi,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1(%rdi,%r9)
- dec %rdx
- jnz .Loop_tail4x
-
-.Ldone4x:
- lea -8(%r10),%rsp
-.L4x_epilogue:
- ret
-.size chacha20_4x,.-chacha20_4x
-#endif
-#ifdef CONFIG_AS_AVX2
-.align 32
-SYM_FUNC_START(chacha20_avx2)
-.Lchacha20_avx2:
-.Lchacha20_8x:
- lea 8(%rsp),%r10 # frame register
- sub $0x280+8,%rsp
- and $-32,%rsp
- vzeroupper
-
- ################ stack layout
- # +0x00 SIMD equivalent of %r12d
- # ...
- # +0x80 constant copy of key[0-2] smashed by lanes
- # ...
- # +0x200 SIMD counters (with nonce smashed by lanes)
- # ...
- # +0x280
-
- vbroadcasti128 .Lsigma(%rip),%ymm11 # key[0]
- vbroadcasti128 (%rcx),%ymm3 # key[1]
- vbroadcasti128 16(%rcx),%ymm15 # key[2]
- vbroadcasti128 (%r8),%ymm7 # key[3]
- lea 0x100(%rsp),%rcx # size optimization
- lea 0x200(%rsp),%rax # size optimization
- lea .Lrot16(%rip),%r9
- lea .Lrot24(%rip),%r11
-
- vpshufd $0x00,%ymm11,%ymm8 # smash key by lanes...
- vpshufd $0x55,%ymm11,%ymm9
- vmovdqa %ymm8,0x80-0x100(%rcx) # ... and offload
- vpshufd $0xaa,%ymm11,%ymm10
- vmovdqa %ymm9,0xa0-0x100(%rcx)
- vpshufd $0xff,%ymm11,%ymm11
- vmovdqa %ymm10,0xc0-0x100(%rcx)
- vmovdqa %ymm11,0xe0-0x100(%rcx)
-
- vpshufd $0x00,%ymm3,%ymm0
- vpshufd $0x55,%ymm3,%ymm1
- vmovdqa %ymm0,0x100-0x100(%rcx)
- vpshufd $0xaa,%ymm3,%ymm2
- vmovdqa %ymm1,0x120-0x100(%rcx)
- vpshufd $0xff,%ymm3,%ymm3
- vmovdqa %ymm2,0x140-0x100(%rcx)
- vmovdqa %ymm3,0x160-0x100(%rcx)
-
- vpshufd $0x00,%ymm15,%ymm12 # "xc0"
- vpshufd $0x55,%ymm15,%ymm13 # "xc1"
- vmovdqa %ymm12,0x180-0x200(%rax)
- vpshufd $0xaa,%ymm15,%ymm14 # "xc2"
- vmovdqa %ymm13,0x1a0-0x200(%rax)
- vpshufd $0xff,%ymm15,%ymm15 # "xc3"
- vmovdqa %ymm14,0x1c0-0x200(%rax)
- vmovdqa %ymm15,0x1e0-0x200(%rax)
-
- vpshufd $0x00,%ymm7,%ymm4
- vpshufd $0x55,%ymm7,%ymm5
- vpaddd .Lincy(%rip),%ymm4,%ymm4 # don't save counters yet
- vpshufd $0xaa,%ymm7,%ymm6
- vmovdqa %ymm5,0x220-0x200(%rax)
- vpshufd $0xff,%ymm7,%ymm7
- vmovdqa %ymm6,0x240-0x200(%rax)
- vmovdqa %ymm7,0x260-0x200(%rax)
-
- jmp .Loop_enter8x
-
-.align 32
-.Loop_outer8x:
- vmovdqa 0x80-0x100(%rcx),%ymm8 # re-load smashed key
- vmovdqa 0xa0-0x100(%rcx),%ymm9
- vmovdqa 0xc0-0x100(%rcx),%ymm10
- vmovdqa 0xe0-0x100(%rcx),%ymm11
- vmovdqa 0x100-0x100(%rcx),%ymm0
- vmovdqa 0x120-0x100(%rcx),%ymm1
- vmovdqa 0x140-0x100(%rcx),%ymm2
- vmovdqa 0x160-0x100(%rcx),%ymm3
- vmovdqa 0x180-0x200(%rax),%ymm12 # "xc0"
- vmovdqa 0x1a0-0x200(%rax),%ymm13 # "xc1"
- vmovdqa 0x1c0-0x200(%rax),%ymm14 # "xc2"
- vmovdqa 0x1e0-0x200(%rax),%ymm15 # "xc3"
- vmovdqa 0x200-0x200(%rax),%ymm4
- vmovdqa 0x220-0x200(%rax),%ymm5
- vmovdqa 0x240-0x200(%rax),%ymm6
- vmovdqa 0x260-0x200(%rax),%ymm7
- vpaddd .Leight(%rip),%ymm4,%ymm4 # next SIMD counters
-
-.Loop_enter8x:
- vmovdqa %ymm14,0x40(%rsp) # SIMD equivalent of "%nox"
- vmovdqa %ymm15,0x60(%rsp) # SIMD equivalent of "%nox"
- vbroadcasti128 (%r9),%ymm15
- vmovdqa %ymm4,0x200-0x200(%rax) # save SIMD counters
- mov $10,%eax
- jmp .Loop8x
-
-.align 32
-.Loop8x:
- vpaddd %ymm0,%ymm8,%ymm8
- vpxor %ymm4,%ymm8,%ymm4
- vpshufb %ymm15,%ymm4,%ymm4
- vpaddd %ymm1,%ymm9,%ymm9
- vpxor %ymm5,%ymm9,%ymm5
- vpshufb %ymm15,%ymm5,%ymm5
- vpaddd %ymm4,%ymm12,%ymm12
- vpxor %ymm0,%ymm12,%ymm0
- vpslld $12,%ymm0,%ymm14
- vpsrld $20,%ymm0,%ymm0
- vpor %ymm0,%ymm14,%ymm0
- vbroadcasti128 (%r11),%ymm14
- vpaddd %ymm5,%ymm13,%ymm13
- vpxor %ymm1,%ymm13,%ymm1
- vpslld $12,%ymm1,%ymm15
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm1,%ymm15,%ymm1
- vpaddd %ymm0,%ymm8,%ymm8
- vpxor %ymm4,%ymm8,%ymm4
- vpshufb %ymm14,%ymm4,%ymm4
- vpaddd %ymm1,%ymm9,%ymm9
- vpxor %ymm5,%ymm9,%ymm5
- vpshufb %ymm14,%ymm5,%ymm5
- vpaddd %ymm4,%ymm12,%ymm12
- vpxor %ymm0,%ymm12,%ymm0
- vpslld $7,%ymm0,%ymm15
- vpsrld $25,%ymm0,%ymm0
- vpor %ymm0,%ymm15,%ymm0
- vbroadcasti128 (%r9),%ymm15
- vpaddd %ymm5,%ymm13,%ymm13
- vpxor %ymm1,%ymm13,%ymm1
- vpslld $7,%ymm1,%ymm14
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm1,%ymm14,%ymm1
- vmovdqa %ymm12,0(%rsp)
- vmovdqa %ymm13,32(%rsp)
- vmovdqa 64(%rsp),%ymm12
- vmovdqa 96(%rsp),%ymm13
- vpaddd %ymm2,%ymm10,%ymm10
- vpxor %ymm6,%ymm10,%ymm6
- vpshufb %ymm15,%ymm6,%ymm6
- vpaddd %ymm3,%ymm11,%ymm11
- vpxor %ymm7,%ymm11,%ymm7
- vpshufb %ymm15,%ymm7,%ymm7
- vpaddd %ymm6,%ymm12,%ymm12
- vpxor %ymm2,%ymm12,%ymm2
- vpslld $12,%ymm2,%ymm14
- vpsrld $20,%ymm2,%ymm2
- vpor %ymm2,%ymm14,%ymm2
- vbroadcasti128 (%r11),%ymm14
- vpaddd %ymm7,%ymm13,%ymm13
- vpxor %ymm3,%ymm13,%ymm3
- vpslld $12,%ymm3,%ymm15
- vpsrld $20,%ymm3,%ymm3
- vpor %ymm3,%ymm15,%ymm3
- vpaddd %ymm2,%ymm10,%ymm10
- vpxor %ymm6,%ymm10,%ymm6
- vpshufb %ymm14,%ymm6,%ymm6
- vpaddd %ymm3,%ymm11,%ymm11
- vpxor %ymm7,%ymm11,%ymm7
- vpshufb %ymm14,%ymm7,%ymm7
- vpaddd %ymm6,%ymm12,%ymm12
- vpxor %ymm2,%ymm12,%ymm2
- vpslld $7,%ymm2,%ymm15
- vpsrld $25,%ymm2,%ymm2
- vpor %ymm2,%ymm15,%ymm2
- vbroadcasti128 (%r9),%ymm15
- vpaddd %ymm7,%ymm13,%ymm13
- vpxor %ymm3,%ymm13,%ymm3
- vpslld $7,%ymm3,%ymm14
- vpsrld $25,%ymm3,%ymm3
- vpor %ymm3,%ymm14,%ymm3
- vpaddd %ymm1,%ymm8,%ymm8
- vpxor %ymm7,%ymm8,%ymm7
- vpshufb %ymm15,%ymm7,%ymm7
- vpaddd %ymm2,%ymm9,%ymm9
- vpxor %ymm4,%ymm9,%ymm4
- vpshufb %ymm15,%ymm4,%ymm4
- vpaddd %ymm7,%ymm12,%ymm12
- vpxor %ymm1,%ymm12,%ymm1
- vpslld $12,%ymm1,%ymm14
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm1,%ymm14,%ymm1
- vbroadcasti128 (%r11),%ymm14
- vpaddd %ymm4,%ymm13,%ymm13
- vpxor %ymm2,%ymm13,%ymm2
- vpslld $12,%ymm2,%ymm15
- vpsrld $20,%ymm2,%ymm2
- vpor %ymm2,%ymm15,%ymm2
- vpaddd %ymm1,%ymm8,%ymm8
- vpxor %ymm7,%ymm8,%ymm7
- vpshufb %ymm14,%ymm7,%ymm7
- vpaddd %ymm2,%ymm9,%ymm9
- vpxor %ymm4,%ymm9,%ymm4
- vpshufb %ymm14,%ymm4,%ymm4
- vpaddd %ymm7,%ymm12,%ymm12
- vpxor %ymm1,%ymm12,%ymm1
- vpslld $7,%ymm1,%ymm15
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm1,%ymm15,%ymm1
- vbroadcasti128 (%r9),%ymm15
- vpaddd %ymm4,%ymm13,%ymm13
- vpxor %ymm2,%ymm13,%ymm2
- vpslld $7,%ymm2,%ymm14
- vpsrld $25,%ymm2,%ymm2
- vpor %ymm2,%ymm14,%ymm2
- vmovdqa %ymm12,64(%rsp)
- vmovdqa %ymm13,96(%rsp)
- vmovdqa 0(%rsp),%ymm12
- vmovdqa 32(%rsp),%ymm13
- vpaddd %ymm3,%ymm10,%ymm10
- vpxor %ymm5,%ymm10,%ymm5
- vpshufb %ymm15,%ymm5,%ymm5
- vpaddd %ymm0,%ymm11,%ymm11
- vpxor %ymm6,%ymm11,%ymm6
- vpshufb %ymm15,%ymm6,%ymm6
- vpaddd %ymm5,%ymm12,%ymm12
- vpxor %ymm3,%ymm12,%ymm3
- vpslld $12,%ymm3,%ymm14
- vpsrld $20,%ymm3,%ymm3
- vpor %ymm3,%ymm14,%ymm3
- vbroadcasti128 (%r11),%ymm14
- vpaddd %ymm6,%ymm13,%ymm13
- vpxor %ymm0,%ymm13,%ymm0
- vpslld $12,%ymm0,%ymm15
- vpsrld $20,%ymm0,%ymm0
- vpor %ymm0,%ymm15,%ymm0
- vpaddd %ymm3,%ymm10,%ymm10
- vpxor %ymm5,%ymm10,%ymm5
- vpshufb %ymm14,%ymm5,%ymm5
- vpaddd %ymm0,%ymm11,%ymm11
- vpxor %ymm6,%ymm11,%ymm6
- vpshufb %ymm14,%ymm6,%ymm6
- vpaddd %ymm5,%ymm12,%ymm12
- vpxor %ymm3,%ymm12,%ymm3
- vpslld $7,%ymm3,%ymm15
- vpsrld $25,%ymm3,%ymm3
- vpor %ymm3,%ymm15,%ymm3
- vbroadcasti128 (%r9),%ymm15
- vpaddd %ymm6,%ymm13,%ymm13
- vpxor %ymm0,%ymm13,%ymm0
- vpslld $7,%ymm0,%ymm14
- vpsrld $25,%ymm0,%ymm0
- vpor %ymm0,%ymm14,%ymm0
- dec %eax
- jnz .Loop8x
-
- lea 0x200(%rsp),%rax # size optimization
- vpaddd 0x80-0x100(%rcx),%ymm8,%ymm8 # accumulate key
- vpaddd 0xa0-0x100(%rcx),%ymm9,%ymm9
- vpaddd 0xc0-0x100(%rcx),%ymm10,%ymm10
- vpaddd 0xe0-0x100(%rcx),%ymm11,%ymm11
-
- vpunpckldq %ymm9,%ymm8,%ymm14 # "de-interlace" data
- vpunpckldq %ymm11,%ymm10,%ymm15
- vpunpckhdq %ymm9,%ymm8,%ymm8
- vpunpckhdq %ymm11,%ymm10,%ymm10
- vpunpcklqdq %ymm15,%ymm14,%ymm9 # "a0"
- vpunpckhqdq %ymm15,%ymm14,%ymm14 # "a1"
- vpunpcklqdq %ymm10,%ymm8,%ymm11 # "a2"
- vpunpckhqdq %ymm10,%ymm8,%ymm8 # "a3"
- vpaddd 0x100-0x100(%rcx),%ymm0,%ymm0
- vpaddd 0x120-0x100(%rcx),%ymm1,%ymm1
- vpaddd 0x140-0x100(%rcx),%ymm2,%ymm2
- vpaddd 0x160-0x100(%rcx),%ymm3,%ymm3
-
- vpunpckldq %ymm1,%ymm0,%ymm10
- vpunpckldq %ymm3,%ymm2,%ymm15
- vpunpckhdq %ymm1,%ymm0,%ymm0
- vpunpckhdq %ymm3,%ymm2,%ymm2
- vpunpcklqdq %ymm15,%ymm10,%ymm1 # "b0"
- vpunpckhqdq %ymm15,%ymm10,%ymm10 # "b1"
- vpunpcklqdq %ymm2,%ymm0,%ymm3 # "b2"
- vpunpckhqdq %ymm2,%ymm0,%ymm0 # "b3"
- vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 # "de-interlace" further
- vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
- vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
- vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
- vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
- vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
- vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
- vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
- vmovdqa %ymm15,0x00(%rsp) # offload
- vmovdqa %ymm9,0x20(%rsp)
- vmovdqa 0x40(%rsp),%ymm15 # %ymm15
- vmovdqa 0x60(%rsp),%ymm9 # %ymm9
-
- vpaddd 0x180-0x200(%rax),%ymm12,%ymm12
- vpaddd 0x1a0-0x200(%rax),%ymm13,%ymm13
- vpaddd 0x1c0-0x200(%rax),%ymm15,%ymm15
- vpaddd 0x1e0-0x200(%rax),%ymm9,%ymm9
-
- vpunpckldq %ymm13,%ymm12,%ymm2
- vpunpckldq %ymm9,%ymm15,%ymm8
- vpunpckhdq %ymm13,%ymm12,%ymm12
- vpunpckhdq %ymm9,%ymm15,%ymm15
- vpunpcklqdq %ymm8,%ymm2,%ymm13 # "c0"
- vpunpckhqdq %ymm8,%ymm2,%ymm2 # "c1"
- vpunpcklqdq %ymm15,%ymm12,%ymm9 # "c2"
- vpunpckhqdq %ymm15,%ymm12,%ymm12 # "c3"
- vpaddd 0x200-0x200(%rax),%ymm4,%ymm4
- vpaddd 0x220-0x200(%rax),%ymm5,%ymm5
- vpaddd 0x240-0x200(%rax),%ymm6,%ymm6
- vpaddd 0x260-0x200(%rax),%ymm7,%ymm7
-
- vpunpckldq %ymm5,%ymm4,%ymm15
- vpunpckldq %ymm7,%ymm6,%ymm8
- vpunpckhdq %ymm5,%ymm4,%ymm4
- vpunpckhdq %ymm7,%ymm6,%ymm6
- vpunpcklqdq %ymm8,%ymm15,%ymm5 # "d0"
- vpunpckhqdq %ymm8,%ymm15,%ymm15 # "d1"
- vpunpcklqdq %ymm6,%ymm4,%ymm7 # "d2"
- vpunpckhqdq %ymm6,%ymm4,%ymm4 # "d3"
- vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 # "de-interlace" further
- vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
- vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
- vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
- vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
- vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
- vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
- vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
- vmovdqa 0x00(%rsp),%ymm6 # was offloaded, remember?
- vmovdqa 0x20(%rsp),%ymm12
-
- cmp $64*8,%rdx
- jb .Ltail8x
-
- vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
- vpxor 0x20(%rsi),%ymm8,%ymm8
- vpxor 0x40(%rsi),%ymm1,%ymm1
- vpxor 0x60(%rsi),%ymm5,%ymm5
- lea 0x80(%rsi),%rsi # size optimization
- vmovdqu %ymm6,0x00(%rdi)
- vmovdqu %ymm8,0x20(%rdi)
- vmovdqu %ymm1,0x40(%rdi)
- vmovdqu %ymm5,0x60(%rdi)
- lea 0x80(%rdi),%rdi # size optimization
-
- vpxor 0x00(%rsi),%ymm12,%ymm12
- vpxor 0x20(%rsi),%ymm13,%ymm13
- vpxor 0x40(%rsi),%ymm10,%ymm10
- vpxor 0x60(%rsi),%ymm15,%ymm15
- lea 0x80(%rsi),%rsi # size optimization
- vmovdqu %ymm12,0x00(%rdi)
- vmovdqu %ymm13,0x20(%rdi)
- vmovdqu %ymm10,0x40(%rdi)
- vmovdqu %ymm15,0x60(%rdi)
- lea 0x80(%rdi),%rdi # size optimization
-
- vpxor 0x00(%rsi),%ymm14,%ymm14
- vpxor 0x20(%rsi),%ymm2,%ymm2
- vpxor 0x40(%rsi),%ymm3,%ymm3
- vpxor 0x60(%rsi),%ymm7,%ymm7
- lea 0x80(%rsi),%rsi # size optimization
- vmovdqu %ymm14,0x00(%rdi)
- vmovdqu %ymm2,0x20(%rdi)
- vmovdqu %ymm3,0x40(%rdi)
- vmovdqu %ymm7,0x60(%rdi)
- lea 0x80(%rdi),%rdi # size optimization
-
- vpxor 0x00(%rsi),%ymm11,%ymm11
- vpxor 0x20(%rsi),%ymm9,%ymm9
- vpxor 0x40(%rsi),%ymm0,%ymm0
- vpxor 0x60(%rsi),%ymm4,%ymm4
- lea 0x80(%rsi),%rsi # size optimization
- vmovdqu %ymm11,0x00(%rdi)
- vmovdqu %ymm9,0x20(%rdi)
- vmovdqu %ymm0,0x40(%rdi)
- vmovdqu %ymm4,0x60(%rdi)
- lea 0x80(%rdi),%rdi # size optimization
-
- sub $64*8,%rdx
- jnz .Loop_outer8x
-
- jmp .Ldone8x
-
-.Ltail8x:
- cmp $448,%rdx
- jae .L448_or_more8x
- cmp $384,%rdx
- jae .L384_or_more8x
- cmp $320,%rdx
- jae .L320_or_more8x
- cmp $256,%rdx
- jae .L256_or_more8x
- cmp $192,%rdx
- jae .L192_or_more8x
- cmp $128,%rdx
- jae .L128_or_more8x
- cmp $64,%rdx
- jae .L64_or_more8x
-
- xor %r9,%r9
- vmovdqa %ymm6,0x00(%rsp)
- vmovdqa %ymm8,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L64_or_more8x:
- vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
- vpxor 0x20(%rsi),%ymm8,%ymm8
- vmovdqu %ymm6,0x00(%rdi)
- vmovdqu %ymm8,0x20(%rdi)
- je .Ldone8x
-
- lea 0x40(%rsi),%rsi # inp+=64*1
- xor %r9,%r9
- vmovdqa %ymm1,0x00(%rsp)
- lea 0x40(%rdi),%rdi # out+=64*1
- sub $64,%rdx # len-=64*1
- vmovdqa %ymm5,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L128_or_more8x:
- vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
- vpxor 0x20(%rsi),%ymm8,%ymm8
- vpxor 0x40(%rsi),%ymm1,%ymm1
- vpxor 0x60(%rsi),%ymm5,%ymm5
- vmovdqu %ymm6,0x00(%rdi)
- vmovdqu %ymm8,0x20(%rdi)
- vmovdqu %ymm1,0x40(%rdi)
- vmovdqu %ymm5,0x60(%rdi)
- je .Ldone8x
-
- lea 0x80(%rsi),%rsi # inp+=64*2
- xor %r9,%r9
- vmovdqa %ymm12,0x00(%rsp)
- lea 0x80(%rdi),%rdi # out+=64*2
- sub $128,%rdx # len-=64*2
- vmovdqa %ymm13,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L192_or_more8x:
- vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
- vpxor 0x20(%rsi),%ymm8,%ymm8
- vpxor 0x40(%rsi),%ymm1,%ymm1
- vpxor 0x60(%rsi),%ymm5,%ymm5
- vpxor 0x80(%rsi),%ymm12,%ymm12
- vpxor 0xa0(%rsi),%ymm13,%ymm13
- vmovdqu %ymm6,0x00(%rdi)
- vmovdqu %ymm8,0x20(%rdi)
- vmovdqu %ymm1,0x40(%rdi)
- vmovdqu %ymm5,0x60(%rdi)
- vmovdqu %ymm12,0x80(%rdi)
- vmovdqu %ymm13,0xa0(%rdi)
- je .Ldone8x
-
- lea 0xc0(%rsi),%rsi # inp+=64*3
- xor %r9,%r9
- vmovdqa %ymm10,0x00(%rsp)
- lea 0xc0(%rdi),%rdi # out+=64*3
- sub $192,%rdx # len-=64*3
- vmovdqa %ymm15,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L256_or_more8x:
- vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
- vpxor 0x20(%rsi),%ymm8,%ymm8
- vpxor 0x40(%rsi),%ymm1,%ymm1
- vpxor 0x60(%rsi),%ymm5,%ymm5
- vpxor 0x80(%rsi),%ymm12,%ymm12
- vpxor 0xa0(%rsi),%ymm13,%ymm13
- vpxor 0xc0(%rsi),%ymm10,%ymm10
- vpxor 0xe0(%rsi),%ymm15,%ymm15
- vmovdqu %ymm6,0x00(%rdi)
- vmovdqu %ymm8,0x20(%rdi)
- vmovdqu %ymm1,0x40(%rdi)
- vmovdqu %ymm5,0x60(%rdi)
- vmovdqu %ymm12,0x80(%rdi)
- vmovdqu %ymm13,0xa0(%rdi)
- vmovdqu %ymm10,0xc0(%rdi)
- vmovdqu %ymm15,0xe0(%rdi)
- je .Ldone8x
-
- lea 0x100(%rsi),%rsi # inp+=64*4
- xor %r9,%r9
- vmovdqa %ymm14,0x00(%rsp)
- lea 0x100(%rdi),%rdi # out+=64*4
- sub $256,%rdx # len-=64*4
- vmovdqa %ymm2,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L320_or_more8x:
- vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
- vpxor 0x20(%rsi),%ymm8,%ymm8
- vpxor 0x40(%rsi),%ymm1,%ymm1
- vpxor 0x60(%rsi),%ymm5,%ymm5
- vpxor 0x80(%rsi),%ymm12,%ymm12
- vpxor 0xa0(%rsi),%ymm13,%ymm13
- vpxor 0xc0(%rsi),%ymm10,%ymm10
- vpxor 0xe0(%rsi),%ymm15,%ymm15
- vpxor 0x100(%rsi),%ymm14,%ymm14
- vpxor 0x120(%rsi),%ymm2,%ymm2
- vmovdqu %ymm6,0x00(%rdi)
- vmovdqu %ymm8,0x20(%rdi)
- vmovdqu %ymm1,0x40(%rdi)
- vmovdqu %ymm5,0x60(%rdi)
- vmovdqu %ymm12,0x80(%rdi)
- vmovdqu %ymm13,0xa0(%rdi)
- vmovdqu %ymm10,0xc0(%rdi)
- vmovdqu %ymm15,0xe0(%rdi)
- vmovdqu %ymm14,0x100(%rdi)
- vmovdqu %ymm2,0x120(%rdi)
- je .Ldone8x
-
- lea 0x140(%rsi),%rsi # inp+=64*5
- xor %r9,%r9
- vmovdqa %ymm3,0x00(%rsp)
- lea 0x140(%rdi),%rdi # out+=64*5
- sub $320,%rdx # len-=64*5
- vmovdqa %ymm7,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L384_or_more8x:
- vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
- vpxor 0x20(%rsi),%ymm8,%ymm8
- vpxor 0x40(%rsi),%ymm1,%ymm1
- vpxor 0x60(%rsi),%ymm5,%ymm5
- vpxor 0x80(%rsi),%ymm12,%ymm12
- vpxor 0xa0(%rsi),%ymm13,%ymm13
- vpxor 0xc0(%rsi),%ymm10,%ymm10
- vpxor 0xe0(%rsi),%ymm15,%ymm15
- vpxor 0x100(%rsi),%ymm14,%ymm14
- vpxor 0x120(%rsi),%ymm2,%ymm2
- vpxor 0x140(%rsi),%ymm3,%ymm3
- vpxor 0x160(%rsi),%ymm7,%ymm7
- vmovdqu %ymm6,0x00(%rdi)
- vmovdqu %ymm8,0x20(%rdi)
- vmovdqu %ymm1,0x40(%rdi)
- vmovdqu %ymm5,0x60(%rdi)
- vmovdqu %ymm12,0x80(%rdi)
- vmovdqu %ymm13,0xa0(%rdi)
- vmovdqu %ymm10,0xc0(%rdi)
- vmovdqu %ymm15,0xe0(%rdi)
- vmovdqu %ymm14,0x100(%rdi)
- vmovdqu %ymm2,0x120(%rdi)
- vmovdqu %ymm3,0x140(%rdi)
- vmovdqu %ymm7,0x160(%rdi)
- je .Ldone8x
-
- lea 0x180(%rsi),%rsi # inp+=64*6
- xor %r9,%r9
- vmovdqa %ymm11,0x00(%rsp)
- lea 0x180(%rdi),%rdi # out+=64*6
- sub $384,%rdx # len-=64*6
- vmovdqa %ymm9,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L448_or_more8x:
- vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
- vpxor 0x20(%rsi),%ymm8,%ymm8
- vpxor 0x40(%rsi),%ymm1,%ymm1
- vpxor 0x60(%rsi),%ymm5,%ymm5
- vpxor 0x80(%rsi),%ymm12,%ymm12
- vpxor 0xa0(%rsi),%ymm13,%ymm13
- vpxor 0xc0(%rsi),%ymm10,%ymm10
- vpxor 0xe0(%rsi),%ymm15,%ymm15
- vpxor 0x100(%rsi),%ymm14,%ymm14
- vpxor 0x120(%rsi),%ymm2,%ymm2
- vpxor 0x140(%rsi),%ymm3,%ymm3
- vpxor 0x160(%rsi),%ymm7,%ymm7
- vpxor 0x180(%rsi),%ymm11,%ymm11
- vpxor 0x1a0(%rsi),%ymm9,%ymm9
- vmovdqu %ymm6,0x00(%rdi)
- vmovdqu %ymm8,0x20(%rdi)
- vmovdqu %ymm1,0x40(%rdi)
- vmovdqu %ymm5,0x60(%rdi)
- vmovdqu %ymm12,0x80(%rdi)
- vmovdqu %ymm13,0xa0(%rdi)
- vmovdqu %ymm10,0xc0(%rdi)
- vmovdqu %ymm15,0xe0(%rdi)
- vmovdqu %ymm14,0x100(%rdi)
- vmovdqu %ymm2,0x120(%rdi)
- vmovdqu %ymm3,0x140(%rdi)
- vmovdqu %ymm7,0x160(%rdi)
- vmovdqu %ymm11,0x180(%rdi)
- vmovdqu %ymm9,0x1a0(%rdi)
- je .Ldone8x
-
- lea 0x1c0(%rsi),%rsi # inp+=64*7
- xor %r9,%r9
- vmovdqa %ymm0,0x00(%rsp)
- lea 0x1c0(%rdi),%rdi # out+=64*7
- sub $448,%rdx # len-=64*7
- vmovdqa %ymm4,0x20(%rsp)
-
-.Loop_tail8x:
- movzb (%rsi,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1(%rdi,%r9)
- dec %rdx
- jnz .Loop_tail8x
-
-.Ldone8x:
- vzeroall
- lea -8(%r10),%rsp
-.L8x_epilogue:
- ret
-SYM_FUNC_END(chacha20_avx2)
-#endif
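As an illustrative aside (not part of the removed files): the stack-layout comment at the top of chacha20_avx2 above describes the state being "smashed by lanes". Each of the 16 ChaCha20 state words is replicated across the eight 32-bit lanes of a %ymm register, and only the low word of the block counter differs per lane (.Lincy seeds the lanes with +0..7, .Leight then adds 8 per outer iteration). A rough C picture of that indexing, assuming a 32-bit per-lane counter and using hypothetical names:

#include <stdint.h>

#define LANES 8                     /* eight blocks per %ymm register group */

/*
 * Hypothetical helper: replicate one block's 16-word state across all
 * lanes, then give each lane its own block number in the counter word.
 * The removed kernels keep these vectors in registers and stack slots;
 * this in-memory layout is only meant to show the indexing.
 */
static void smash_state_by_lanes(uint32_t lanes[16][LANES],
                                 const uint32_t state[16])
{
    int w, l;

    for (w = 0; w < 16; w++)
        for (l = 0; l < LANES; l++)
            lanes[w][l] = state[w];

    /* word 12 holds the low 32 bits of the block counter */
    for (l = 0; l < LANES; l++)
        lanes[12][l] += (uint32_t)l;
}

With this layout a single vpaddd/vpxor/vpshufb (or vprold in the AVX-512 variants) advances the same word of eight independent blocks at once, which is why the round body above only touches memory to spill the state rows that do not fit in registers.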
-#ifdef CONFIG_AS_AVX512
-.align 32
-SYM_FUNC_START(chacha20_avx512)
-.Lchacha20_avx512:
- lea 8(%rsp),%r10 # frame pointer
- cmp $512,%rdx
- ja .Lchacha20_16x
-
- sub $64+8,%rsp
- and $-64,%rsp
- vbroadcasti32x4 .Lsigma(%rip),%zmm0
- vbroadcasti32x4 (%rcx),%zmm1
- vbroadcasti32x4 16(%rcx),%zmm2
- vbroadcasti32x4 (%r8),%zmm3
-
- vmovdqa32 %zmm0,%zmm16
- vmovdqa32 %zmm1,%zmm17
- vmovdqa32 %zmm2,%zmm18
- vpaddd .Lzeroz(%rip),%zmm3,%zmm3
- vmovdqa32 .Lfourz(%rip),%zmm20
- mov $10,%r8 # reuse %r8
- vmovdqa32 %zmm3,%zmm19
- jmp .Loop_avx512
-
-.align 16
-.Loop_outer_avx512:
- vmovdqa32 %zmm16,%zmm0
- vmovdqa32 %zmm17,%zmm1
- vmovdqa32 %zmm18,%zmm2
- vpaddd %zmm20,%zmm19,%zmm3
- mov $10,%r8
- vmovdqa32 %zmm3,%zmm19
- jmp .Loop_avx512
-
-.align 32
-.Loop_avx512:
- vpaddd %zmm1,%zmm0,%zmm0
- vpxord %zmm0,%zmm3,%zmm3
- vprold $16,%zmm3,%zmm3
- vpaddd %zmm3,%zmm2,%zmm2
- vpxord %zmm2,%zmm1,%zmm1
- vprold $12,%zmm1,%zmm1
- vpaddd %zmm1,%zmm0,%zmm0
- vpxord %zmm0,%zmm3,%zmm3
- vprold $8,%zmm3,%zmm3
- vpaddd %zmm3,%zmm2,%zmm2
- vpxord %zmm2,%zmm1,%zmm1
- vprold $7,%zmm1,%zmm1
- vpshufd $78,%zmm2,%zmm2
- vpshufd $57,%zmm1,%zmm1
- vpshufd $147,%zmm3,%zmm3
- vpaddd %zmm1,%zmm0,%zmm0
- vpxord %zmm0,%zmm3,%zmm3
- vprold $16,%zmm3,%zmm3
- vpaddd %zmm3,%zmm2,%zmm2
- vpxord %zmm2,%zmm1,%zmm1
- vprold $12,%zmm1,%zmm1
- vpaddd %zmm1,%zmm0,%zmm0
- vpxord %zmm0,%zmm3,%zmm3
- vprold $8,%zmm3,%zmm3
- vpaddd %zmm3,%zmm2,%zmm2
- vpxord %zmm2,%zmm1,%zmm1
- vprold $7,%zmm1,%zmm1
- vpshufd $78,%zmm2,%zmm2
- vpshufd $147,%zmm1,%zmm1
- vpshufd $57,%zmm3,%zmm3
- dec %r8
- jnz .Loop_avx512
- vpaddd %zmm16,%zmm0,%zmm0
- vpaddd %zmm17,%zmm1,%zmm1
- vpaddd %zmm18,%zmm2,%zmm2
- vpaddd %zmm19,%zmm3,%zmm3
-
- sub $64,%rdx
- jb .Ltail64_avx512
-
- vpxor 0x00(%rsi),%xmm0,%xmm4 # xor with input
- vpxor 0x10(%rsi),%xmm1,%xmm5
- vpxor 0x20(%rsi),%xmm2,%xmm6
- vpxor 0x30(%rsi),%xmm3,%xmm7
- lea 0x40(%rsi),%rsi # inp+=64
-
- vmovdqu %xmm4,0x00(%rdi) # write output
- vmovdqu %xmm5,0x10(%rdi)
- vmovdqu %xmm6,0x20(%rdi)
- vmovdqu %xmm7,0x30(%rdi)
- lea 0x40(%rdi),%rdi # out+=64
-
- jz .Ldone_avx512
-
- vextracti32x4 $1,%zmm0,%xmm4
- vextracti32x4 $1,%zmm1,%xmm5
- vextracti32x4 $1,%zmm2,%xmm6
- vextracti32x4 $1,%zmm3,%xmm7
-
- sub $64,%rdx
- jb .Ltail_avx512
-
- vpxor 0x00(%rsi),%xmm4,%xmm4 # xor with input
- vpxor 0x10(%rsi),%xmm5,%xmm5
- vpxor 0x20(%rsi),%xmm6,%xmm6
- vpxor 0x30(%rsi),%xmm7,%xmm7
- lea 0x40(%rsi),%rsi # inp+=64
-
- vmovdqu %xmm4,0x00(%rdi) # write output
- vmovdqu %xmm5,0x10(%rdi)
- vmovdqu %xmm6,0x20(%rdi)
- vmovdqu %xmm7,0x30(%rdi)
- lea 0x40(%rdi),%rdi # out+=64
-
- jz .Ldone_avx512
-
- vextracti32x4 $2,%zmm0,%xmm4
- vextracti32x4 $2,%zmm1,%xmm5
- vextracti32x4 $2,%zmm2,%xmm6
- vextracti32x4 $2,%zmm3,%xmm7
-
- sub $64,%rdx
- jb .Ltail_avx512
-
- vpxor 0x00(%rsi),%xmm4,%xmm4 # xor with input
- vpxor 0x10(%rsi),%xmm5,%xmm5
- vpxor 0x20(%rsi),%xmm6,%xmm6
- vpxor 0x30(%rsi),%xmm7,%xmm7
- lea 0x40(%rsi),%rsi # inp+=64
-
- vmovdqu %xmm4,0x00(%rdi) # write output
- vmovdqu %xmm5,0x10(%rdi)
- vmovdqu %xmm6,0x20(%rdi)
- vmovdqu %xmm7,0x30(%rdi)
- lea 0x40(%rdi),%rdi # out+=64
-
- jz .Ldone_avx512
-
- vextracti32x4 $3,%zmm0,%xmm4
- vextracti32x4 $3,%zmm1,%xmm5
- vextracti32x4 $3,%zmm2,%xmm6
- vextracti32x4 $3,%zmm3,%xmm7
-
- sub $64,%rdx
- jb .Ltail_avx512
-
- vpxor 0x00(%rsi),%xmm4,%xmm4 # xor with input
- vpxor 0x10(%rsi),%xmm5,%xmm5
- vpxor 0x20(%rsi),%xmm6,%xmm6
- vpxor 0x30(%rsi),%xmm7,%xmm7
- lea 0x40(%rsi),%rsi # inp+=64
-
- vmovdqu %xmm4,0x00(%rdi) # write output
- vmovdqu %xmm5,0x10(%rdi)
- vmovdqu %xmm6,0x20(%rdi)
- vmovdqu %xmm7,0x30(%rdi)
- lea 0x40(%rdi),%rdi # out+=64
-
- jnz .Loop_outer_avx512
-
- jmp .Ldone_avx512
-
-.align 16
-.Ltail64_avx512:
- vmovdqa %xmm0,0x00(%rsp)
- vmovdqa %xmm1,0x10(%rsp)
- vmovdqa %xmm2,0x20(%rsp)
- vmovdqa %xmm3,0x30(%rsp)
- add $64,%rdx
- jmp .Loop_tail_avx512
-
-.align 16
-.Ltail_avx512:
- vmovdqa %xmm4,0x00(%rsp)
- vmovdqa %xmm5,0x10(%rsp)
- vmovdqa %xmm6,0x20(%rsp)
- vmovdqa %xmm7,0x30(%rsp)
- add $64,%rdx
-
-.Loop_tail_avx512:
- movzb (%rsi,%r8),%eax
- movzb (%rsp,%r8),%ecx
- lea 1(%r8),%r8
- xor %ecx,%eax
- mov %al,-1(%rdi,%r8)
- dec %rdx
- jnz .Loop_tail_avx512
-
- vmovdqu32 %zmm16,0x00(%rsp)
-
-.Ldone_avx512:
- vzeroall
- lea -8(%r10),%rsp
-.Lavx512_epilogue:
- ret
-SYM_FUNC_END(chacha20_avx512)
-.align 32
-SYM_FUNC_START(chacha20_avx512vl)
-.Lchacha20_avx512vl:
- lea 8(%rsp),%r10 # frame pointer
- cmp $128,%rdx
- ja .Lchacha20_8xvl
-
- sub $64+8,%rsp
- and $-32,%rsp
- vbroadcasti128 .Lsigma(%rip),%ymm0
- vbroadcasti128 (%rcx),%ymm1
- vbroadcasti128 16(%rcx),%ymm2
- vbroadcasti128 (%r8),%ymm3
-
- vmovdqa32 %ymm0,%ymm16
- vmovdqa32 %ymm1,%ymm17
- vmovdqa32 %ymm2,%ymm18
- vpaddd .Lzeroz(%rip),%ymm3,%ymm3
- vmovdqa32 .Ltwoy(%rip),%ymm20
- mov $10,%r8 # reuse %r8
- vmovdqa32 %ymm3,%ymm19
- jmp .Loop_avx512vl
-
-.align 16
-.Loop_outer_avx512vl:
- vmovdqa32 %ymm18,%ymm2
- vpaddd %ymm20,%ymm19,%ymm3
- mov $10,%r8
- vmovdqa32 %ymm3,%ymm19
- jmp .Loop_avx512vl
-
-.align 32
-.Loop_avx512vl:
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
- vpshufd $78,%ymm2,%ymm2
- vpshufd $57,%ymm1,%ymm1
- vpshufd $147,%ymm3,%ymm3
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
- vpshufd $78,%ymm2,%ymm2
- vpshufd $147,%ymm1,%ymm1
- vpshufd $57,%ymm3,%ymm3
- dec %r8
- jnz .Loop_avx512vl
- vpaddd %ymm16,%ymm0,%ymm0
- vpaddd %ymm17,%ymm1,%ymm1
- vpaddd %ymm18,%ymm2,%ymm2
- vpaddd %ymm19,%ymm3,%ymm3
-
- sub $64,%rdx
- jb .Ltail64_avx512vl
-
- vpxor 0x00(%rsi),%xmm0,%xmm4 # xor with input
- vpxor 0x10(%rsi),%xmm1,%xmm5
- vpxor 0x20(%rsi),%xmm2,%xmm6
- vpxor 0x30(%rsi),%xmm3,%xmm7
- lea 0x40(%rsi),%rsi # inp+=64
-
- vmovdqu %xmm4,0x00(%rdi) # write output
- vmovdqu %xmm5,0x10(%rdi)
- vmovdqu %xmm6,0x20(%rdi)
- vmovdqu %xmm7,0x30(%rdi)
- lea 0x40(%rdi),%rdi # out+=64
-
- jz .Ldone_avx512vl
-
- vextracti128 $1,%ymm0,%xmm4
- vextracti128 $1,%ymm1,%xmm5
- vextracti128 $1,%ymm2,%xmm6
- vextracti128 $1,%ymm3,%xmm7
-
- sub $64,%rdx
- jb .Ltail_avx512vl
-
- vpxor 0x00(%rsi),%xmm4,%xmm4 # xor with input
- vpxor 0x10(%rsi),%xmm5,%xmm5
- vpxor 0x20(%rsi),%xmm6,%xmm6
- vpxor 0x30(%rsi),%xmm7,%xmm7
- lea 0x40(%rsi),%rsi # inp+=64
-
- vmovdqu %xmm4,0x00(%rdi) # write output
- vmovdqu %xmm5,0x10(%rdi)
- vmovdqu %xmm6,0x20(%rdi)
- vmovdqu %xmm7,0x30(%rdi)
- lea 0x40(%rdi),%rdi # out+=64
-
- vmovdqa32 %ymm16,%ymm0
- vmovdqa32 %ymm17,%ymm1
- jnz .Loop_outer_avx512vl
-
- jmp .Ldone_avx512vl
-
-.align 16
-.Ltail64_avx512vl:
- vmovdqa %xmm0,0x00(%rsp)
- vmovdqa %xmm1,0x10(%rsp)
- vmovdqa %xmm2,0x20(%rsp)
- vmovdqa %xmm3,0x30(%rsp)
- add $64,%rdx
- jmp .Loop_tail_avx512vl
-
-.align 16
-.Ltail_avx512vl:
- vmovdqa %xmm4,0x00(%rsp)
- vmovdqa %xmm5,0x10(%rsp)
- vmovdqa %xmm6,0x20(%rsp)
- vmovdqa %xmm7,0x30(%rsp)
- add $64,%rdx
-
-.Loop_tail_avx512vl:
- movzb (%rsi,%r8),%eax
- movzb (%rsp,%r8),%ecx
- lea 1(%r8),%r8
- xor %ecx,%eax
- mov %al,-1(%rdi,%r8)
- dec %rdx
- jnz .Loop_tail_avx512vl
-
- vmovdqu32 %ymm16,0x00(%rsp)
- vmovdqu32 %ymm16,0x20(%rsp)
-
-.Ldone_avx512vl:
- vzeroall
- lea -8(%r10),%rsp
-.Lavx512vl_epilogue:
- ret
-SYM_FUNC_END(chacha20_avx512vl)
-.type chacha20_16x,@function
-.align 32
-chacha20_16x:
-.Lchacha20_16x:
- lea 8(%rsp),%r10 # frame register
- sub $64+8,%rsp
- and $-64,%rsp
- vzeroupper
-
- lea .Lsigma(%rip),%r9
- vbroadcasti32x4 (%r9),%zmm3 # key[0]
- vbroadcasti32x4 (%rcx),%zmm7 # key[1]
- vbroadcasti32x4 16(%rcx),%zmm11 # key[2]
- vbroadcasti32x4 (%r8),%zmm15 # key[3]
-
- vpshufd $0x00,%zmm3,%zmm0 # smash key by lanes...
- vpshufd $0x55,%zmm3,%zmm1
- vpshufd $0xaa,%zmm3,%zmm2
- vpshufd $0xff,%zmm3,%zmm3
- vmovdqa64 %zmm0,%zmm16
- vmovdqa64 %zmm1,%zmm17
- vmovdqa64 %zmm2,%zmm18
- vmovdqa64 %zmm3,%zmm19
-
- vpshufd $0x00,%zmm7,%zmm4
- vpshufd $0x55,%zmm7,%zmm5
- vpshufd $0xaa,%zmm7,%zmm6
- vpshufd $0xff,%zmm7,%zmm7
- vmovdqa64 %zmm4,%zmm20
- vmovdqa64 %zmm5,%zmm21
- vmovdqa64 %zmm6,%zmm22
- vmovdqa64 %zmm7,%zmm23
-
- vpshufd $0x00,%zmm11,%zmm8
- vpshufd $0x55,%zmm11,%zmm9
- vpshufd $0xaa,%zmm11,%zmm10
- vpshufd $0xff,%zmm11,%zmm11
- vmovdqa64 %zmm8,%zmm24
- vmovdqa64 %zmm9,%zmm25
- vmovdqa64 %zmm10,%zmm26
- vmovdqa64 %zmm11,%zmm27
-
- vpshufd $0x00,%zmm15,%zmm12
- vpshufd $0x55,%zmm15,%zmm13
- vpshufd $0xaa,%zmm15,%zmm14
- vpshufd $0xff,%zmm15,%zmm15
- vpaddd .Lincz(%rip),%zmm12,%zmm12 # don't save counters yet
- vmovdqa64 %zmm12,%zmm28
- vmovdqa64 %zmm13,%zmm29
- vmovdqa64 %zmm14,%zmm30
- vmovdqa64 %zmm15,%zmm31
-
- mov $10,%eax
- jmp .Loop16x
-
-.align 32
-.Loop_outer16x:
- vpbroadcastd 0(%r9),%zmm0 # reload key
- vpbroadcastd 4(%r9),%zmm1
- vpbroadcastd 8(%r9),%zmm2
- vpbroadcastd 12(%r9),%zmm3
- vpaddd .Lsixteen(%rip),%zmm28,%zmm28 # next SIMD counters
- vmovdqa64 %zmm20,%zmm4
- vmovdqa64 %zmm21,%zmm5
- vmovdqa64 %zmm22,%zmm6
- vmovdqa64 %zmm23,%zmm7
- vmovdqa64 %zmm24,%zmm8
- vmovdqa64 %zmm25,%zmm9
- vmovdqa64 %zmm26,%zmm10
- vmovdqa64 %zmm27,%zmm11
- vmovdqa64 %zmm28,%zmm12
- vmovdqa64 %zmm29,%zmm13
- vmovdqa64 %zmm30,%zmm14
- vmovdqa64 %zmm31,%zmm15
-
- vmovdqa64 %zmm0,%zmm16
- vmovdqa64 %zmm1,%zmm17
- vmovdqa64 %zmm2,%zmm18
- vmovdqa64 %zmm3,%zmm19
-
- mov $10,%eax
- jmp .Loop16x
-
-.align 32
-.Loop16x:
- vpaddd %zmm4,%zmm0,%zmm0
- vpaddd %zmm5,%zmm1,%zmm1
- vpaddd %zmm6,%zmm2,%zmm2
- vpaddd %zmm7,%zmm3,%zmm3
- vpxord %zmm0,%zmm12,%zmm12
- vpxord %zmm1,%zmm13,%zmm13
- vpxord %zmm2,%zmm14,%zmm14
- vpxord %zmm3,%zmm15,%zmm15
- vprold $16,%zmm12,%zmm12
- vprold $16,%zmm13,%zmm13
- vprold $16,%zmm14,%zmm14
- vprold $16,%zmm15,%zmm15
- vpaddd %zmm12,%zmm8,%zmm8
- vpaddd %zmm13,%zmm9,%zmm9
- vpaddd %zmm14,%zmm10,%zmm10
- vpaddd %zmm15,%zmm11,%zmm11
- vpxord %zmm8,%zmm4,%zmm4
- vpxord %zmm9,%zmm5,%zmm5
- vpxord %zmm10,%zmm6,%zmm6
- vpxord %zmm11,%zmm7,%zmm7
- vprold $12,%zmm4,%zmm4
- vprold $12,%zmm5,%zmm5
- vprold $12,%zmm6,%zmm6
- vprold $12,%zmm7,%zmm7
- vpaddd %zmm4,%zmm0,%zmm0
- vpaddd %zmm5,%zmm1,%zmm1
- vpaddd %zmm6,%zmm2,%zmm2
- vpaddd %zmm7,%zmm3,%zmm3
- vpxord %zmm0,%zmm12,%zmm12
- vpxord %zmm1,%zmm13,%zmm13
- vpxord %zmm2,%zmm14,%zmm14
- vpxord %zmm3,%zmm15,%zmm15
- vprold $8,%zmm12,%zmm12
- vprold $8,%zmm13,%zmm13
- vprold $8,%zmm14,%zmm14
- vprold $8,%zmm15,%zmm15
- vpaddd %zmm12,%zmm8,%zmm8
- vpaddd %zmm13,%zmm9,%zmm9
- vpaddd %zmm14,%zmm10,%zmm10
- vpaddd %zmm15,%zmm11,%zmm11
- vpxord %zmm8,%zmm4,%zmm4
- vpxord %zmm9,%zmm5,%zmm5
- vpxord %zmm10,%zmm6,%zmm6
- vpxord %zmm11,%zmm7,%zmm7
- vprold $7,%zmm4,%zmm4
- vprold $7,%zmm5,%zmm5
- vprold $7,%zmm6,%zmm6
- vprold $7,%zmm7,%zmm7
- vpaddd %zmm5,%zmm0,%zmm0
- vpaddd %zmm6,%zmm1,%zmm1
- vpaddd %zmm7,%zmm2,%zmm2
- vpaddd %zmm4,%zmm3,%zmm3
- vpxord %zmm0,%zmm15,%zmm15
- vpxord %zmm1,%zmm12,%zmm12
- vpxord %zmm2,%zmm13,%zmm13
- vpxord %zmm3,%zmm14,%zmm14
- vprold $16,%zmm15,%zmm15
- vprold $16,%zmm12,%zmm12
- vprold $16,%zmm13,%zmm13
- vprold $16,%zmm14,%zmm14
- vpaddd %zmm15,%zmm10,%zmm10
- vpaddd %zmm12,%zmm11,%zmm11
- vpaddd %zmm13,%zmm8,%zmm8
- vpaddd %zmm14,%zmm9,%zmm9
- vpxord %zmm10,%zmm5,%zmm5
- vpxord %zmm11,%zmm6,%zmm6
- vpxord %zmm8,%zmm7,%zmm7
- vpxord %zmm9,%zmm4,%zmm4
- vprold $12,%zmm5,%zmm5
- vprold $12,%zmm6,%zmm6
- vprold $12,%zmm7,%zmm7
- vprold $12,%zmm4,%zmm4
- vpaddd %zmm5,%zmm0,%zmm0
- vpaddd %zmm6,%zmm1,%zmm1
- vpaddd %zmm7,%zmm2,%zmm2
- vpaddd %zmm4,%zmm3,%zmm3
- vpxord %zmm0,%zmm15,%zmm15
- vpxord %zmm1,%zmm12,%zmm12
- vpxord %zmm2,%zmm13,%zmm13
- vpxord %zmm3,%zmm14,%zmm14
- vprold $8,%zmm15,%zmm15
- vprold $8,%zmm12,%zmm12
- vprold $8,%zmm13,%zmm13
- vprold $8,%zmm14,%zmm14
- vpaddd %zmm15,%zmm10,%zmm10
- vpaddd %zmm12,%zmm11,%zmm11
- vpaddd %zmm13,%zmm8,%zmm8
- vpaddd %zmm14,%zmm9,%zmm9
- vpxord %zmm10,%zmm5,%zmm5
- vpxord %zmm11,%zmm6,%zmm6
- vpxord %zmm8,%zmm7,%zmm7
- vpxord %zmm9,%zmm4,%zmm4
- vprold $7,%zmm5,%zmm5
- vprold $7,%zmm6,%zmm6
- vprold $7,%zmm7,%zmm7
- vprold $7,%zmm4,%zmm4
- dec %eax
- jnz .Loop16x
-
- vpaddd %zmm16,%zmm0,%zmm0 # accumulate key
- vpaddd %zmm17,%zmm1,%zmm1
- vpaddd %zmm18,%zmm2,%zmm2
- vpaddd %zmm19,%zmm3,%zmm3
-
- vpunpckldq %zmm1,%zmm0,%zmm18 # "de-interlace" data
- vpunpckldq %zmm3,%zmm2,%zmm19
- vpunpckhdq %zmm1,%zmm0,%zmm0
- vpunpckhdq %zmm3,%zmm2,%zmm2
- vpunpcklqdq %zmm19,%zmm18,%zmm1 # "a0"
- vpunpckhqdq %zmm19,%zmm18,%zmm18 # "a1"
- vpunpcklqdq %zmm2,%zmm0,%zmm3 # "a2"
- vpunpckhqdq %zmm2,%zmm0,%zmm0 # "a3"
- vpaddd %zmm20,%zmm4,%zmm4
- vpaddd %zmm21,%zmm5,%zmm5
- vpaddd %zmm22,%zmm6,%zmm6
- vpaddd %zmm23,%zmm7,%zmm7
-
- vpunpckldq %zmm5,%zmm4,%zmm2
- vpunpckldq %zmm7,%zmm6,%zmm19
- vpunpckhdq %zmm5,%zmm4,%zmm4
- vpunpckhdq %zmm7,%zmm6,%zmm6
- vpunpcklqdq %zmm19,%zmm2,%zmm5 # "b0"
- vpunpckhqdq %zmm19,%zmm2,%zmm2 # "b1"
- vpunpcklqdq %zmm6,%zmm4,%zmm7 # "b2"
- vpunpckhqdq %zmm6,%zmm4,%zmm4 # "b3"
- vshufi32x4 $0x44,%zmm5,%zmm1,%zmm19 # "de-interlace" further
- vshufi32x4 $0xee,%zmm5,%zmm1,%zmm5
- vshufi32x4 $0x44,%zmm2,%zmm18,%zmm1
- vshufi32x4 $0xee,%zmm2,%zmm18,%zmm2
- vshufi32x4 $0x44,%zmm7,%zmm3,%zmm18
- vshufi32x4 $0xee,%zmm7,%zmm3,%zmm7
- vshufi32x4 $0x44,%zmm4,%zmm0,%zmm3
- vshufi32x4 $0xee,%zmm4,%zmm0,%zmm4
- vpaddd %zmm24,%zmm8,%zmm8
- vpaddd %zmm25,%zmm9,%zmm9
- vpaddd %zmm26,%zmm10,%zmm10
- vpaddd %zmm27,%zmm11,%zmm11
-
- vpunpckldq %zmm9,%zmm8,%zmm6
- vpunpckldq %zmm11,%zmm10,%zmm0
- vpunpckhdq %zmm9,%zmm8,%zmm8
- vpunpckhdq %zmm11,%zmm10,%zmm10
- vpunpcklqdq %zmm0,%zmm6,%zmm9 # "c0"
- vpunpckhqdq %zmm0,%zmm6,%zmm6 # "c1"
- vpunpcklqdq %zmm10,%zmm8,%zmm11 # "c2"
- vpunpckhqdq %zmm10,%zmm8,%zmm8 # "c3"
- vpaddd %zmm28,%zmm12,%zmm12
- vpaddd %zmm29,%zmm13,%zmm13
- vpaddd %zmm30,%zmm14,%zmm14
- vpaddd %zmm31,%zmm15,%zmm15
-
- vpunpckldq %zmm13,%zmm12,%zmm10
- vpunpckldq %zmm15,%zmm14,%zmm0
- vpunpckhdq %zmm13,%zmm12,%zmm12
- vpunpckhdq %zmm15,%zmm14,%zmm14
- vpunpcklqdq %zmm0,%zmm10,%zmm13 # "d0"
- vpunpckhqdq %zmm0,%zmm10,%zmm10 # "d1"
- vpunpcklqdq %zmm14,%zmm12,%zmm15 # "d2"
- vpunpckhqdq %zmm14,%zmm12,%zmm12 # "d3"
- vshufi32x4 $0x44,%zmm13,%zmm9,%zmm0 # "de-interlace" further
- vshufi32x4 $0xee,%zmm13,%zmm9,%zmm13
- vshufi32x4 $0x44,%zmm10,%zmm6,%zmm9
- vshufi32x4 $0xee,%zmm10,%zmm6,%zmm10
- vshufi32x4 $0x44,%zmm15,%zmm11,%zmm6
- vshufi32x4 $0xee,%zmm15,%zmm11,%zmm15
- vshufi32x4 $0x44,%zmm12,%zmm8,%zmm11
- vshufi32x4 $0xee,%zmm12,%zmm8,%zmm12
- vshufi32x4 $0x88,%zmm0,%zmm19,%zmm16 # "de-interlace" further
- vshufi32x4 $0xdd,%zmm0,%zmm19,%zmm19
- vshufi32x4 $0x88,%zmm13,%zmm5,%zmm0
- vshufi32x4 $0xdd,%zmm13,%zmm5,%zmm13
- vshufi32x4 $0x88,%zmm9,%zmm1,%zmm17
- vshufi32x4 $0xdd,%zmm9,%zmm1,%zmm1
- vshufi32x4 $0x88,%zmm10,%zmm2,%zmm9
- vshufi32x4 $0xdd,%zmm10,%zmm2,%zmm10
- vshufi32x4 $0x88,%zmm6,%zmm18,%zmm14
- vshufi32x4 $0xdd,%zmm6,%zmm18,%zmm18
- vshufi32x4 $0x88,%zmm15,%zmm7,%zmm6
- vshufi32x4 $0xdd,%zmm15,%zmm7,%zmm15
- vshufi32x4 $0x88,%zmm11,%zmm3,%zmm8
- vshufi32x4 $0xdd,%zmm11,%zmm3,%zmm3
- vshufi32x4 $0x88,%zmm12,%zmm4,%zmm11
- vshufi32x4 $0xdd,%zmm12,%zmm4,%zmm12
- cmp $64*16,%rdx
- jb .Ltail16x
-
- vpxord 0x00(%rsi),%zmm16,%zmm16 # xor with input
- vpxord 0x40(%rsi),%zmm17,%zmm17
- vpxord 0x80(%rsi),%zmm14,%zmm14
- vpxord 0xc0(%rsi),%zmm8,%zmm8
- vmovdqu32 %zmm16,0x00(%rdi)
- vmovdqu32 %zmm17,0x40(%rdi)
- vmovdqu32 %zmm14,0x80(%rdi)
- vmovdqu32 %zmm8,0xc0(%rdi)
-
- vpxord 0x100(%rsi),%zmm19,%zmm19
- vpxord 0x140(%rsi),%zmm1,%zmm1
- vpxord 0x180(%rsi),%zmm18,%zmm18
- vpxord 0x1c0(%rsi),%zmm3,%zmm3
- vmovdqu32 %zmm19,0x100(%rdi)
- vmovdqu32 %zmm1,0x140(%rdi)
- vmovdqu32 %zmm18,0x180(%rdi)
- vmovdqu32 %zmm3,0x1c0(%rdi)
-
- vpxord 0x200(%rsi),%zmm0,%zmm0
- vpxord 0x240(%rsi),%zmm9,%zmm9
- vpxord 0x280(%rsi),%zmm6,%zmm6
- vpxord 0x2c0(%rsi),%zmm11,%zmm11
- vmovdqu32 %zmm0,0x200(%rdi)
- vmovdqu32 %zmm9,0x240(%rdi)
- vmovdqu32 %zmm6,0x280(%rdi)
- vmovdqu32 %zmm11,0x2c0(%rdi)
-
- vpxord 0x300(%rsi),%zmm13,%zmm13
- vpxord 0x340(%rsi),%zmm10,%zmm10
- vpxord 0x380(%rsi),%zmm15,%zmm15
- vpxord 0x3c0(%rsi),%zmm12,%zmm12
- lea 0x400(%rsi),%rsi
- vmovdqu32 %zmm13,0x300(%rdi)
- vmovdqu32 %zmm10,0x340(%rdi)
- vmovdqu32 %zmm15,0x380(%rdi)
- vmovdqu32 %zmm12,0x3c0(%rdi)
- lea 0x400(%rdi),%rdi
-
- sub $64*16,%rdx
- jnz .Loop_outer16x
-
- jmp .Ldone16x
-
-.align 32
-.Ltail16x:
- xor %r9,%r9
- sub %rsi,%rdi
- cmp $64*1,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm16,%zmm16 # xor with input
- vmovdqu32 %zmm16,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm17,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*2,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm17,%zmm17
- vmovdqu32 %zmm17,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm14,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*3,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm14,%zmm14
- vmovdqu32 %zmm14,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm8,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*4,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm8,%zmm8
- vmovdqu32 %zmm8,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm19,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*5,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm19,%zmm19
- vmovdqu32 %zmm19,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm1,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*6,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm1,%zmm1
- vmovdqu32 %zmm1,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm18,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*7,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm18,%zmm18
- vmovdqu32 %zmm18,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm3,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*8,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm3,%zmm3
- vmovdqu32 %zmm3,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm0,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*9,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm0,%zmm0
- vmovdqu32 %zmm0,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm9,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*10,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm9,%zmm9
- vmovdqu32 %zmm9,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm6,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*11,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm6,%zmm6
- vmovdqu32 %zmm6,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm11,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*12,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm11,%zmm11
- vmovdqu32 %zmm11,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm13,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*13,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm13,%zmm13
- vmovdqu32 %zmm13,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm10,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*14,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm10,%zmm10
- vmovdqu32 %zmm10,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm15,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*15,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm15,%zmm15
- vmovdqu32 %zmm15,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm12,%zmm16
- lea 64(%rsi),%rsi
-
-.Less_than_64_16x:
- vmovdqa32 %zmm16,0x00(%rsp)
- lea (%rdi,%rsi),%rdi
- and $63,%rdx
-
-.Loop_tail16x:
- movzb (%rsi,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1(%rdi,%r9)
- dec %rdx
- jnz .Loop_tail16x
-
- vpxord %zmm16,%zmm16,%zmm16
- vmovdqa32 %zmm16,0(%rsp)
-
-.Ldone16x:
- vzeroall
- lea -8(%r10),%rsp
-.L16x_epilogue:
- ret
-.size chacha20_16x,.-chacha20_16x
-.type chacha20_8xvl,@function
-.align 32
-chacha20_8xvl:
-.Lchacha20_8xvl:
- lea 8(%rsp),%r10 # frame register
- sub $64+8,%rsp
- and $-64,%rsp
- vzeroupper
-
- lea .Lsigma(%rip),%r9
- vbroadcasti128 (%r9),%ymm3 # key[0]
- vbroadcasti128 (%rcx),%ymm7 # key[1]
- vbroadcasti128 16(%rcx),%ymm11 # key[2]
- vbroadcasti128 (%r8),%ymm15 # key[3]
-
- vpshufd $0x00,%ymm3,%ymm0 # smash key by lanes...
- vpshufd $0x55,%ymm3,%ymm1
- vpshufd $0xaa,%ymm3,%ymm2
- vpshufd $0xff,%ymm3,%ymm3
- vmovdqa64 %ymm0,%ymm16
- vmovdqa64 %ymm1,%ymm17
- vmovdqa64 %ymm2,%ymm18
- vmovdqa64 %ymm3,%ymm19
-
- vpshufd $0x00,%ymm7,%ymm4
- vpshufd $0x55,%ymm7,%ymm5
- vpshufd $0xaa,%ymm7,%ymm6
- vpshufd $0xff,%ymm7,%ymm7
- vmovdqa64 %ymm4,%ymm20
- vmovdqa64 %ymm5,%ymm21
- vmovdqa64 %ymm6,%ymm22
- vmovdqa64 %ymm7,%ymm23
-
- vpshufd $0x00,%ymm11,%ymm8
- vpshufd $0x55,%ymm11,%ymm9
- vpshufd $0xaa,%ymm11,%ymm10
- vpshufd $0xff,%ymm11,%ymm11
- vmovdqa64 %ymm8,%ymm24
- vmovdqa64 %ymm9,%ymm25
- vmovdqa64 %ymm10,%ymm26
- vmovdqa64 %ymm11,%ymm27
-
- vpshufd $0x00,%ymm15,%ymm12
- vpshufd $0x55,%ymm15,%ymm13
- vpshufd $0xaa,%ymm15,%ymm14
- vpshufd $0xff,%ymm15,%ymm15
- vpaddd .Lincy(%rip),%ymm12,%ymm12 # don't save counters yet
- vmovdqa64 %ymm12,%ymm28
- vmovdqa64 %ymm13,%ymm29
- vmovdqa64 %ymm14,%ymm30
- vmovdqa64 %ymm15,%ymm31
-
- mov $10,%eax
- jmp .Loop8xvl
-
-.align 32
-.Loop_outer8xvl:
- #vpbroadcastd 0(%r9),%ymm0 # reload key
- #vpbroadcastd 4(%r9),%ymm1
- vpbroadcastd 8(%r9),%ymm2
- vpbroadcastd 12(%r9),%ymm3
- vpaddd .Leight(%rip),%ymm28,%ymm28 # next SIMD counters
- vmovdqa64 %ymm20,%ymm4
- vmovdqa64 %ymm21,%ymm5
- vmovdqa64 %ymm22,%ymm6
- vmovdqa64 %ymm23,%ymm7
- vmovdqa64 %ymm24,%ymm8
- vmovdqa64 %ymm25,%ymm9
- vmovdqa64 %ymm26,%ymm10
- vmovdqa64 %ymm27,%ymm11
- vmovdqa64 %ymm28,%ymm12
- vmovdqa64 %ymm29,%ymm13
- vmovdqa64 %ymm30,%ymm14
- vmovdqa64 %ymm31,%ymm15
-
- vmovdqa64 %ymm0,%ymm16
- vmovdqa64 %ymm1,%ymm17
- vmovdqa64 %ymm2,%ymm18
- vmovdqa64 %ymm3,%ymm19
-
- mov $10,%eax
- jmp .Loop8xvl
-
-.align 32
-.Loop8xvl:
- vpaddd %ymm4,%ymm0,%ymm0
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm7,%ymm3,%ymm3
- vpxor %ymm0,%ymm12,%ymm12
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm3,%ymm15,%ymm15
- vprold $16,%ymm12,%ymm12
- vprold $16,%ymm13,%ymm13
- vprold $16,%ymm14,%ymm14
- vprold $16,%ymm15,%ymm15
- vpaddd %ymm12,%ymm8,%ymm8
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm15,%ymm11,%ymm11
- vpxor %ymm8,%ymm4,%ymm4
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm11,%ymm7,%ymm7
- vprold $12,%ymm4,%ymm4
- vprold $12,%ymm5,%ymm5
- vprold $12,%ymm6,%ymm6
- vprold $12,%ymm7,%ymm7
- vpaddd %ymm4,%ymm0,%ymm0
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm7,%ymm3,%ymm3
- vpxor %ymm0,%ymm12,%ymm12
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm3,%ymm15,%ymm15
- vprold $8,%ymm12,%ymm12
- vprold $8,%ymm13,%ymm13
- vprold $8,%ymm14,%ymm14
- vprold $8,%ymm15,%ymm15
- vpaddd %ymm12,%ymm8,%ymm8
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm15,%ymm11,%ymm11
- vpxor %ymm8,%ymm4,%ymm4
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm11,%ymm7,%ymm7
- vprold $7,%ymm4,%ymm4
- vprold $7,%ymm5,%ymm5
- vprold $7,%ymm6,%ymm6
- vprold $7,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpaddd %ymm6,%ymm1,%ymm1
- vpaddd %ymm7,%ymm2,%ymm2
- vpaddd %ymm4,%ymm3,%ymm3
- vpxor %ymm0,%ymm15,%ymm15
- vpxor %ymm1,%ymm12,%ymm12
- vpxor %ymm2,%ymm13,%ymm13
- vpxor %ymm3,%ymm14,%ymm14
- vprold $16,%ymm15,%ymm15
- vprold $16,%ymm12,%ymm12
- vprold $16,%ymm13,%ymm13
- vprold $16,%ymm14,%ymm14
- vpaddd %ymm15,%ymm10,%ymm10
- vpaddd %ymm12,%ymm11,%ymm11
- vpaddd %ymm13,%ymm8,%ymm8
- vpaddd %ymm14,%ymm9,%ymm9
- vpxor %ymm10,%ymm5,%ymm5
- vpxor %ymm11,%ymm6,%ymm6
- vpxor %ymm8,%ymm7,%ymm7
- vpxor %ymm9,%ymm4,%ymm4
- vprold $12,%ymm5,%ymm5
- vprold $12,%ymm6,%ymm6
- vprold $12,%ymm7,%ymm7
- vprold $12,%ymm4,%ymm4
- vpaddd %ymm5,%ymm0,%ymm0
- vpaddd %ymm6,%ymm1,%ymm1
- vpaddd %ymm7,%ymm2,%ymm2
- vpaddd %ymm4,%ymm3,%ymm3
- vpxor %ymm0,%ymm15,%ymm15
- vpxor %ymm1,%ymm12,%ymm12
- vpxor %ymm2,%ymm13,%ymm13
- vpxor %ymm3,%ymm14,%ymm14
- vprold $8,%ymm15,%ymm15
- vprold $8,%ymm12,%ymm12
- vprold $8,%ymm13,%ymm13
- vprold $8,%ymm14,%ymm14
- vpaddd %ymm15,%ymm10,%ymm10
- vpaddd %ymm12,%ymm11,%ymm11
- vpaddd %ymm13,%ymm8,%ymm8
- vpaddd %ymm14,%ymm9,%ymm9
- vpxor %ymm10,%ymm5,%ymm5
- vpxor %ymm11,%ymm6,%ymm6
- vpxor %ymm8,%ymm7,%ymm7
- vpxor %ymm9,%ymm4,%ymm4
- vprold $7,%ymm5,%ymm5
- vprold $7,%ymm6,%ymm6
- vprold $7,%ymm7,%ymm7
- vprold $7,%ymm4,%ymm4
- dec %eax
- jnz .Loop8xvl
-
- vpaddd %ymm16,%ymm0,%ymm0 # accumulate key
- vpaddd %ymm17,%ymm1,%ymm1
- vpaddd %ymm18,%ymm2,%ymm2
- vpaddd %ymm19,%ymm3,%ymm3
-
- vpunpckldq %ymm1,%ymm0,%ymm18 # "de-interlace" data
- vpunpckldq %ymm3,%ymm2,%ymm19
- vpunpckhdq %ymm1,%ymm0,%ymm0
- vpunpckhdq %ymm3,%ymm2,%ymm2
- vpunpcklqdq %ymm19,%ymm18,%ymm1 # "a0"
- vpunpckhqdq %ymm19,%ymm18,%ymm18 # "a1"
- vpunpcklqdq %ymm2,%ymm0,%ymm3 # "a2"
- vpunpckhqdq %ymm2,%ymm0,%ymm0 # "a3"
- vpaddd %ymm20,%ymm4,%ymm4
- vpaddd %ymm21,%ymm5,%ymm5
- vpaddd %ymm22,%ymm6,%ymm6
- vpaddd %ymm23,%ymm7,%ymm7
-
- vpunpckldq %ymm5,%ymm4,%ymm2
- vpunpckldq %ymm7,%ymm6,%ymm19
- vpunpckhdq %ymm5,%ymm4,%ymm4
- vpunpckhdq %ymm7,%ymm6,%ymm6
- vpunpcklqdq %ymm19,%ymm2,%ymm5 # "b0"
- vpunpckhqdq %ymm19,%ymm2,%ymm2 # "b1"
- vpunpcklqdq %ymm6,%ymm4,%ymm7 # "b2"
- vpunpckhqdq %ymm6,%ymm4,%ymm4 # "b3"
- vshufi32x4 $0,%ymm5,%ymm1,%ymm19 # "de-interlace" further
- vshufi32x4 $3,%ymm5,%ymm1,%ymm5
- vshufi32x4 $0,%ymm2,%ymm18,%ymm1
- vshufi32x4 $3,%ymm2,%ymm18,%ymm2
- vshufi32x4 $0,%ymm7,%ymm3,%ymm18
- vshufi32x4 $3,%ymm7,%ymm3,%ymm7
- vshufi32x4 $0,%ymm4,%ymm0,%ymm3
- vshufi32x4 $3,%ymm4,%ymm0,%ymm4
- vpaddd %ymm24,%ymm8,%ymm8
- vpaddd %ymm25,%ymm9,%ymm9
- vpaddd %ymm26,%ymm10,%ymm10
- vpaddd %ymm27,%ymm11,%ymm11
-
- vpunpckldq %ymm9,%ymm8,%ymm6
- vpunpckldq %ymm11,%ymm10,%ymm0
- vpunpckhdq %ymm9,%ymm8,%ymm8
- vpunpckhdq %ymm11,%ymm10,%ymm10
- vpunpcklqdq %ymm0,%ymm6,%ymm9 # "c0"
- vpunpckhqdq %ymm0,%ymm6,%ymm6 # "c1"
- vpunpcklqdq %ymm10,%ymm8,%ymm11 # "c2"
- vpunpckhqdq %ymm10,%ymm8,%ymm8 # "c3"
- vpaddd %ymm28,%ymm12,%ymm12
- vpaddd %ymm29,%ymm13,%ymm13
- vpaddd %ymm30,%ymm14,%ymm14
- vpaddd %ymm31,%ymm15,%ymm15
-
- vpunpckldq %ymm13,%ymm12,%ymm10
- vpunpckldq %ymm15,%ymm14,%ymm0
- vpunpckhdq %ymm13,%ymm12,%ymm12
- vpunpckhdq %ymm15,%ymm14,%ymm14
- vpunpcklqdq %ymm0,%ymm10,%ymm13 # "d0"
- vpunpckhqdq %ymm0,%ymm10,%ymm10 # "d1"
- vpunpcklqdq %ymm14,%ymm12,%ymm15 # "d2"
- vpunpckhqdq %ymm14,%ymm12,%ymm12 # "d3"
- vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 # "de-interlace" further
- vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
- vperm2i128 $0x20,%ymm10,%ymm6,%ymm9
- vperm2i128 $0x31,%ymm10,%ymm6,%ymm10
- vperm2i128 $0x20,%ymm15,%ymm11,%ymm6
- vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
- vperm2i128 $0x20,%ymm12,%ymm8,%ymm11
- vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
- cmp $64*8,%rdx
- jb .Ltail8xvl
-
- mov $0x80,%eax # size optimization
- vpxord 0x00(%rsi),%ymm19,%ymm19 # xor with input
- vpxor 0x20(%rsi),%ymm0,%ymm0
- vpxor 0x40(%rsi),%ymm5,%ymm5
- vpxor 0x60(%rsi),%ymm13,%ymm13
- lea (%rsi,%rax),%rsi # size optimization
- vmovdqu32 %ymm19,0x00(%rdi)
- vmovdqu %ymm0,0x20(%rdi)
- vmovdqu %ymm5,0x40(%rdi)
- vmovdqu %ymm13,0x60(%rdi)
- lea (%rdi,%rax),%rdi # size optimization
-
- vpxor 0x00(%rsi),%ymm1,%ymm1
- vpxor 0x20(%rsi),%ymm9,%ymm9
- vpxor 0x40(%rsi),%ymm2,%ymm2
- vpxor 0x60(%rsi),%ymm10,%ymm10
- lea (%rsi,%rax),%rsi # size optimization
- vmovdqu %ymm1,0x00(%rdi)
- vmovdqu %ymm9,0x20(%rdi)
- vmovdqu %ymm2,0x40(%rdi)
- vmovdqu %ymm10,0x60(%rdi)
- lea (%rdi,%rax),%rdi # size optimization
-
- vpxord 0x00(%rsi),%ymm18,%ymm18
- vpxor 0x20(%rsi),%ymm6,%ymm6
- vpxor 0x40(%rsi),%ymm7,%ymm7
- vpxor 0x60(%rsi),%ymm15,%ymm15
- lea (%rsi,%rax),%rsi # size optimization
- vmovdqu32 %ymm18,0x00(%rdi)
- vmovdqu %ymm6,0x20(%rdi)
- vmovdqu %ymm7,0x40(%rdi)
- vmovdqu %ymm15,0x60(%rdi)
- lea (%rdi,%rax),%rdi # size optimization
-
- vpxor 0x00(%rsi),%ymm3,%ymm3
- vpxor 0x20(%rsi),%ymm11,%ymm11
- vpxor 0x40(%rsi),%ymm4,%ymm4
- vpxor 0x60(%rsi),%ymm12,%ymm12
- lea (%rsi,%rax),%rsi # size optimization
- vmovdqu %ymm3,0x00(%rdi)
- vmovdqu %ymm11,0x20(%rdi)
- vmovdqu %ymm4,0x40(%rdi)
- vmovdqu %ymm12,0x60(%rdi)
- lea (%rdi,%rax),%rdi # size optimization
-
- vpbroadcastd 0(%r9),%ymm0 # reload key
- vpbroadcastd 4(%r9),%ymm1
-
- sub $64*8,%rdx
- jnz .Loop_outer8xvl
-
- jmp .Ldone8xvl
-
-.align 32
-.Ltail8xvl:
- vmovdqa64 %ymm19,%ymm8 # size optimization
- xor %r9,%r9
- sub %rsi,%rdi
- cmp $64*1,%rdx
- jb .Less_than_64_8xvl
- vpxor 0x00(%rsi),%ymm8,%ymm8 # xor with input
- vpxor 0x20(%rsi),%ymm0,%ymm0
- vmovdqu %ymm8,0x00(%rdi,%rsi)
- vmovdqu %ymm0,0x20(%rdi,%rsi)
- je .Ldone8xvl
- vmovdqa %ymm5,%ymm8
- vmovdqa %ymm13,%ymm0
- lea 64(%rsi),%rsi
-
- cmp $64*2,%rdx
- jb .Less_than_64_8xvl
- vpxor 0x00(%rsi),%ymm5,%ymm5
- vpxor 0x20(%rsi),%ymm13,%ymm13
- vmovdqu %ymm5,0x00(%rdi,%rsi)
- vmovdqu %ymm13,0x20(%rdi,%rsi)
- je .Ldone8xvl
- vmovdqa %ymm1,%ymm8
- vmovdqa %ymm9,%ymm0
- lea 64(%rsi),%rsi
-
- cmp $64*3,%rdx
- jb .Less_than_64_8xvl
- vpxor 0x00(%rsi),%ymm1,%ymm1
- vpxor 0x20(%rsi),%ymm9,%ymm9
- vmovdqu %ymm1,0x00(%rdi,%rsi)
- vmovdqu %ymm9,0x20(%rdi,%rsi)
- je .Ldone8xvl
- vmovdqa %ymm2,%ymm8
- vmovdqa %ymm10,%ymm0
- lea 64(%rsi),%rsi
-
- cmp $64*4,%rdx
- jb .Less_than_64_8xvl
- vpxor 0x00(%rsi),%ymm2,%ymm2
- vpxor 0x20(%rsi),%ymm10,%ymm10
- vmovdqu %ymm2,0x00(%rdi,%rsi)
- vmovdqu %ymm10,0x20(%rdi,%rsi)
- je .Ldone8xvl
- vmovdqa32 %ymm18,%ymm8
- vmovdqa %ymm6,%ymm0
- lea 64(%rsi),%rsi
-
- cmp $64*5,%rdx
- jb .Less_than_64_8xvl
- vpxord 0x00(%rsi),%ymm18,%ymm18
- vpxor 0x20(%rsi),%ymm6,%ymm6
- vmovdqu32 %ymm18,0x00(%rdi,%rsi)
- vmovdqu %ymm6,0x20(%rdi,%rsi)
- je .Ldone8xvl
- vmovdqa %ymm7,%ymm8
- vmovdqa %ymm15,%ymm0
- lea 64(%rsi),%rsi
-
- cmp $64*6,%rdx
- jb .Less_than_64_8xvl
- vpxor 0x00(%rsi),%ymm7,%ymm7
- vpxor 0x20(%rsi),%ymm15,%ymm15
- vmovdqu %ymm7,0x00(%rdi,%rsi)
- vmovdqu %ymm15,0x20(%rdi,%rsi)
- je .Ldone8xvl
- vmovdqa %ymm3,%ymm8
- vmovdqa %ymm11,%ymm0
- lea 64(%rsi),%rsi
-
- cmp $64*7,%rdx
- jb .Less_than_64_8xvl
- vpxor 0x00(%rsi),%ymm3,%ymm3
- vpxor 0x20(%rsi),%ymm11,%ymm11
- vmovdqu %ymm3,0x00(%rdi,%rsi)
- vmovdqu %ymm11,0x20(%rdi,%rsi)
- je .Ldone8xvl
- vmovdqa %ymm4,%ymm8
- vmovdqa %ymm12,%ymm0
- lea 64(%rsi),%rsi
-
-.Less_than_64_8xvl:
- vmovdqa %ymm8,0x00(%rsp)
- vmovdqa %ymm0,0x20(%rsp)
- lea (%rdi,%rsi),%rdi
- and $63,%rdx
-
-.Loop_tail8xvl:
- movzb (%rsi,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1(%rdi,%r9)
- dec %rdx
- jnz .Loop_tail8xvl
-
- vpxor %ymm8,%ymm8,%ymm8
- vmovdqa %ymm8,0x00(%rsp)
- vmovdqa %ymm8,0x20(%rsp)
-
-.Ldone8xvl:
- vzeroall
- lea -8(%r10),%rsp
-.L8xvl_epilogue:
- ret
-.size chacha20_8xvl,.-chacha20_8xvl
-#endif
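For orientation, an illustrative reference that is not part of the removed files: all of the SSSE3/AVX2/AVX-512 kernels above compute the same per-block math, four, eight or sixteen blocks at a time. The quarter-round is the usual add/xor/rotate sequence with rotations of 16, 12, 8 and 7 (the .Lrot16/.Lrot24 pshufb tables implement the byte-aligned rotates, pslld/psrld pairs or vprold the others), ten column+diagonal double rounds are run, and the initial state is added back ("accumulate key material") before the 64-byte keystream block is emitted. A minimal portable C sketch, with hypothetical names:

#include <stdint.h>
#include <string.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

#define QR(a, b, c, d)                     \
    do {                                   \
        a += b; d ^= a; d = ROTL32(d, 16); \
        c += d; b ^= c; b = ROTL32(b, 12); \
        a += b; d ^= a; d = ROTL32(d, 8);  \
        c += d; b ^= c; b = ROTL32(b, 7);  \
    } while (0)

/*
 * state[16]: four constants, eight key words, then the block counter and
 * nonce in the last four words (the counter/nonce split varies by
 * construction). Output is one 64-byte keystream block.
 */
static void chacha20_block_ref(const uint32_t state[16], uint8_t out[64])
{
    uint32_t x[16];
    int i;

    memcpy(x, state, sizeof(x));
    for (i = 0; i < 10; i++) {
        /* column round */
        QR(x[0], x[4], x[8],  x[12]);
        QR(x[1], x[5], x[9],  x[13]);
        QR(x[2], x[6], x[10], x[14]);
        QR(x[3], x[7], x[11], x[15]);
        /* diagonal round */
        QR(x[0], x[5], x[10], x[15]);
        QR(x[1], x[6], x[11], x[12]);
        QR(x[2], x[7], x[8],  x[13]);
        QR(x[3], x[4], x[9],  x[14]);
    }
    for (i = 0; i < 16; i++)
        x[i] += state[i];           /* "accumulate key material" */
    memcpy(out, x, 64);             /* little-endian hosts; serialize per word otherwise */
}

The wide kernels differ from this sketch only in running the round loop over many lane-interleaved copies of x[] and then regrouping the results, which is what the punpck*/vperm2i128/vshufi32x4 "de-interlace" sequences above do.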
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm-glue.c b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm-glue.c
deleted file mode 100644
index 41e2e79abb2b..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm-glue.c
+++ /dev/null
@@ -1,98 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#if defined(CONFIG_ZINC_ARCH_ARM)
-#include <asm/system_info.h>
-#include <asm/cputype.h>
-#endif
-
-asmlinkage void chacha20_arm(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-asmlinkage void hchacha20_arm(const u32 state[16], u32 out[8]);
-asmlinkage void chacha20_neon(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-
-static bool chacha20_use_neon __ro_after_init;
-static bool *const chacha20_nobs[] __initconst = { &chacha20_use_neon };
-static void __init chacha20_fpu_init(void)
-{
-#if defined(CONFIG_ZINC_ARCH_ARM64)
- chacha20_use_neon = cpu_have_named_feature(ASIMD);
-#elif defined(CONFIG_ZINC_ARCH_ARM)
- switch (read_cpuid_part()) {
- case ARM_CPU_PART_CORTEX_A7:
- case ARM_CPU_PART_CORTEX_A5:
- /* The Cortex-A7 and Cortex-A5 do not perform well with the NEON
- * implementation but do incredibly with the scalar one and use
- * less power.
- */
- break;
- default:
- chacha20_use_neon = elf_hwcap & HWCAP_NEON;
- }
-#endif
-}
-
-static inline bool chacha20_arch(struct chacha20_ctx *ctx, u8 *dst,
- const u8 *src, size_t len,
- simd_context_t *simd_context)
-{
- /* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(PAGE_SIZE < CHACHA20_BLOCK_SIZE ||
- PAGE_SIZE % CHACHA20_BLOCK_SIZE);
-
- for (;;) {
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && chacha20_use_neon &&
- len >= CHACHA20_BLOCK_SIZE * 3 && simd_use(simd_context)) {
- const size_t bytes = min_t(size_t, len, PAGE_SIZE);
-
- chacha20_neon(dst, src, bytes, ctx->key, ctx->counter);
- ctx->counter[0] += (bytes + 63) / 64;
- len -= bytes;
- if (!len)
- break;
- dst += bytes;
- src += bytes;
- simd_relax(simd_context);
- } else {
- chacha20_arm(dst, src, len, ctx->key, ctx->counter);
- ctx->counter[0] += (len + 63) / 64;
- break;
- }
- }
-
- return true;
-}
-
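One detail of chacha20_arch() above is worth spelling out (illustrative note, not part of the removed file): the BUILD_BUG_ON() and the "(bytes + 63) / 64" bookkeeping encode the invariant that every chunk except possibly the last is a whole number of 64-byte blocks, since PAGE_SIZE is a multiple of CHACHA20_BLOCK_SIZE; that is what makes per-chunk counter advancement produce the same keystream as one uninterrupted call. A tiny self-contained check of the arithmetic (the page size value is hypothetical):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define CHACHA20_BLOCK_SIZE 64
#define PAGE_SIZE_EXAMPLE   4096    /* hypothetical page size, multiple of 64 */

int main(void)
{
    size_t len = 3 * PAGE_SIZE_EXAMPLE + 100;   /* arbitrary message length */
    uint32_t counter_chunked = 0, counter_whole;
    size_t left = len;

    while (left) {
        size_t bytes = left < PAGE_SIZE_EXAMPLE ? left : PAGE_SIZE_EXAMPLE;

        /* per-chunk advance, as in chacha20_arch() above */
        counter_chunked += (bytes + CHACHA20_BLOCK_SIZE - 1) / CHACHA20_BLOCK_SIZE;
        left -= bytes;
    }
    counter_whole = (len + CHACHA20_BLOCK_SIZE - 1) / CHACHA20_BLOCK_SIZE;
    assert(counter_chunked == counter_whole);   /* holds because 4096 % 64 == 0 */
    return 0;
}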
-static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS],
- const u8 nonce[HCHACHA20_NONCE_SIZE],
- const u8 key[HCHACHA20_KEY_SIZE],
- simd_context_t *simd_context)
-{
- if (IS_ENABLED(CONFIG_ZINC_ARCH_ARM)) {
- u32 x[] = { CHACHA20_CONSTANT_EXPA,
- CHACHA20_CONSTANT_ND_3,
- CHACHA20_CONSTANT_2_BY,
- CHACHA20_CONSTANT_TE_K,
- get_unaligned_le32(key + 0),
- get_unaligned_le32(key + 4),
- get_unaligned_le32(key + 8),
- get_unaligned_le32(key + 12),
- get_unaligned_le32(key + 16),
- get_unaligned_le32(key + 20),
- get_unaligned_le32(key + 24),
- get_unaligned_le32(key + 28),
- get_unaligned_le32(nonce + 0),
- get_unaligned_le32(nonce + 4),
- get_unaligned_le32(nonce + 8),
- get_unaligned_le32(nonce + 12)
- };
- hchacha20_arm(x, derived_key);
- return true;
- }
- return false;
-}
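As a companion to hchacha20_arch() above (an illustrative sketch, not part of the removed sources): the x[] array it hands to hchacha20_arm() is the standard ChaCha20 state of four constants ("expand 32-byte k"), eight little-endian key words and four nonce words. HChaCha20 runs the usual ten double rounds but, unlike the stream cipher, skips the final addition of the initial state and returns words 0..3 and 12..15 as the derived key. A self-contained C reference using the same quarter-round as the sketch earlier in this diff; names are hypothetical:

#include <stdint.h>

static uint32_t le32(const uint8_t *p)
{
    return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
           ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))
#define QR(a, b, c, d)                     \
    do {                                   \
        a += b; d ^= a; d = ROTL32(d, 16); \
        c += d; b ^= c; b = ROTL32(b, 12); \
        a += b; d ^= a; d = ROTL32(d, 8);  \
        c += d; b ^= c; b = ROTL32(b, 7);  \
    } while (0)

static void hchacha20_ref(uint32_t out[8], const uint8_t nonce[16],
                          const uint8_t key[32])
{
    uint32_t x[16] = {
        0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, /* "expa" "nd 3" "2-by" "te k" */
        le32(key +  0), le32(key +  4), le32(key +  8), le32(key + 12),
        le32(key + 16), le32(key + 20), le32(key + 24), le32(key + 28),
        le32(nonce + 0), le32(nonce + 4), le32(nonce + 8), le32(nonce + 12),
    };
    int i;

    for (i = 0; i < 10; i++) {
        QR(x[0], x[4], x[8],  x[12]);   /* column round */
        QR(x[1], x[5], x[9],  x[13]);
        QR(x[2], x[6], x[10], x[14]);
        QR(x[3], x[7], x[11], x[15]);
        QR(x[0], x[5], x[10], x[15]);   /* diagonal round */
        QR(x[1], x[6], x[11], x[12]);
        QR(x[2], x[7], x[8],  x[13]);
        QR(x[3], x[4], x[9],  x[14]);
    }
    for (i = 0; i < 4; i++) {
        out[i]     = x[i];              /* words 0..3  */
        out[i + 4] = x[i + 12];         /* words 12..15 */
    }
}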
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm.pl b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm.pl
deleted file mode 100755
index 6785383ab7bb..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm.pl
+++ /dev/null
@@ -1,1227 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# December 2014
-#
-# ChaCha20 for ARMv4.
-#
-# September 2018
-#
-# Improve scalar performance per Eric Biggers' suggestion to eliminate
-# separate rotates. This requires b[0..3] and d[0..3] to be maintained
-# pre-rotated, hence odd twists prior inner loop and when accumulating
-# key material. Since amount of instructions is reduced as result, even
-# NEON performance is improved somewhat, most notably by ~9% on low-end
-# Cortex-A5/A7. Full unroll was shown to provide even better scalar
-# performance on Cortex-A5/A7, naturally at the cost of manyfold size
-# increase. We let it be. Oversized code works in benchmarks, but is not
-# necessarily optimal in real life, when it's likely to be out-of-cache
-# upon entry and evict significant part of cache upon completion.
-#
-# Performance in cycles per byte out of large buffer.
-#
-# IALU/gcc-4.4 1xNEON 3xNEON+1xIALU
-#
-# Cortex-A5 14.2(*)/+160% 21.8 12.9(**)
-# Cortex-A8 10.2(*)/+190% 13.9 6.10
-# Cortex-A9 10.8(*)/+150% 14.3 6.50
-# Cortex-A15 11.0/+40% 16.0 4.90
-# Snapdragon S4 13.9(***)/+90% 13.6 4.90
-#
-# (*) most "favourable" result for aligned data on little-endian
-# processor, result for misaligned data is 10-15% lower;
-# (**) pure 4xNEON [with "vertical" layout] was shown to provide ~8%
-# better performance on Cortex-A5/A7, but not on others;
-# (***) it's 17% slower than original, trade-off is considered
-# acceptable, because of improvement on others, specifically
-# +36% on Cortex-A5/A7 and +20% on Cortex-A9;
-
-$flavour = shift;
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
- open STDOUT,">$output";
-}
-
-sub AUTOLOAD() # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
- my $arg = pop;
- $arg = "#$arg" if ($arg*1 eq $arg);
- $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
-}
-
-my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
-my @t=map("r$_",(8..11));
-
-sub ROUND {
-my ($a0,$b0,$c0,$d0)=@_;
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my $odd = $d0&1;
-my ($xc,$xc_) = (@t[0..1]);
-my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
-my @ret;
-
- # Consider order in which variables are addressed by their
- # index:
- #
- # a b c d
- #
- # 0 4 8 12 < even round
- # 1 5 9 13
- # 2 6 10 14
- # 3 7 11 15
- # 0 5 10 15 < odd round
- # 1 6 11 12
- # 2 7 8 13
- # 3 4 9 14
- #
- # 'a', 'b' are permanently allocated in registers, @x[0..7],
- # while 'c's and pair of 'd's are maintained in memory. If
- # you observe 'c' column, you'll notice that pair of 'c's is
- # invariant between rounds. This means that we have to reload
- # them once per round, in the middle. This is why you'll see
- # bunch of 'c' stores and loads in the middle, but none in
- # the beginning or end. If you observe 'd' column, you'll
- # notice that 15 and 13 are reused in next pair of rounds.
- # This is why these two are chosen for offloading to memory,
- # to make loads count more.
- push @ret,(
- "&add (@x[$a0],@x[$a0],@x[$b0],'ror#13')",
- "&add (@x[$a1],@x[$a1],@x[$b1],'ror#13')",
- "&eor ($xd,@x[$a0],$xd,'ror#24')",
- "&eor ($xd_,@x[$a1],$xd_,'ror#24')",
-
- "&add ($xc,$xc,$xd,'ror#16')",
- "&add ($xc_,$xc_,$xd_,'ror#16')",
- "&eor (@x[$b0],$xc, @x[$b0],'ror#13')",
- "&eor (@x[$b1],$xc_,@x[$b1],'ror#13')",
-
- "&add (@x[$a0],@x[$a0],@x[$b0],'ror#20')",
- "&add (@x[$a1],@x[$a1],@x[$b1],'ror#20')",
- "&eor ($xd,@x[$a0],$xd,'ror#16')",
- "&eor ($xd_,@x[$a1],$xd_,'ror#16')" );
- push @ret,(
- "&str ($xd,'[sp,#4*(16+$d0)]')" ) if ($odd);
- push @ret,(
- "&add ($xc,$xc,$xd,'ror#24')" );
- push @ret,(
- "&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd);
- push @ret,(
- "&str ($xd_,'[sp,#4*(16+$d1)]')" ) if (!$odd);
- push @ret,(
- "&add ($xc_,$xc_,$xd_,'ror#24')" );
- push @ret,(
- "&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd);
- push @ret,(
- "&str ($xc,'[sp,#4*(16+$c0)]')",
- "&eor (@x[$b0],@x[$b0],$xc,'ror#12')",
- "&str ($xc_,'[sp,#4*(16+$c1)]')",
- "&eor (@x[$b1],@x[$b1],$xc_,'ror#12')" );
-
- $xd=@x[$d2] if (!$odd);
- $xd_=@x[$d3] if ($odd);
- push @ret,(
- "&ldr ($xc,'[sp,#4*(16+$c2)]')",
- "&add (@x[$a2],@x[$a2],@x[$b2],'ror#13')",
- "&ldr ($xc_,'[sp,#4*(16+$c3)]')",
- "&add (@x[$a3],@x[$a3],@x[$b3],'ror#13')",
- "&eor ($xd,@x[$a2],$xd,'ror#24')",
- "&eor ($xd_,@x[$a3],$xd_,'ror#24')",
-
- "&add ($xc,$xc,$xd,'ror#16')",
- "&add ($xc_,$xc_,$xd_,'ror#16')",
- "&eor (@x[$b2],$xc, @x[$b2],'ror#13')",
- "&eor (@x[$b3],$xc_,@x[$b3],'ror#13')",
-
- "&add (@x[$a2],@x[$a2],@x[$b2],'ror#20')",
- "&add (@x[$a3],@x[$a3],@x[$b3],'ror#20')",
- "&eor ($xd,@x[$a2],$xd,'ror#16')",
- "&eor ($xd_,@x[$a3],$xd_,'ror#16')",
-
- "&add ($xc,$xc,$xd,'ror#24')",
- "&add ($xc_,$xc_,$xd_,'ror#24')",
- "&eor (@x[$b2],@x[$b2],$xc,'ror#12')",
- "&eor (@x[$b3],@x[$b3],$xc_,'ror#12')" );
-
- @ret;
-}
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
-# define ChaCha20_ctr32 chacha20_arm_cryptogams
-# define ChaCha20_neon chacha20_neon
-#endif
-
-.text
-#if defined(__thumb2__) || defined(__clang__)
-.syntax unified
-# define ldrhsb ldrbhs
-#endif
-#if defined(__thumb2__)
-.thumb
-#else
-.code 32
-#endif
-
-.align 5
-.Lsigma:
-.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
-.Lone:
-.long 1,0,0,0
-.Lrot8:
-.long 0x02010003,0x06050407
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.LChaCha20_ctr32
-#else
-.word -1
-#endif
-
-.globl ChaCha20_ctr32
-.type ChaCha20_ctr32,%function
-.align 5
-ChaCha20_ctr32:
-.LChaCha20_ctr32:
- ldr r12,[sp,#0] @ pull pointer to counter and nonce
- stmdb sp!,{r0-r2,r4-r11,lr}
-#if __ARM_ARCH__<7 && !defined(__thumb2__)
- sub r14,pc,#16 @ ChaCha20_ctr32
-#else
- adr r14,.LChaCha20_ctr32
-#endif
- cmp r2,#0 @ len==0?
-#ifdef __thumb2__
- itt eq
-#endif
- addeq sp,sp,#4*3
- beq .Lno_data
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- cmp r2,#192 @ test len
- bls .Lshort
- ldr r4,[r14,#-24]
- ldr r4,[r14,r4]
-# ifdef __APPLE__
- ldr r4,[r4]
-# endif
- tst r4,#ARMV7_NEON
- bne .LChaCha20_neon
-.Lshort:
-#endif
- ldmia r12,{r4-r7} @ load counter and nonce
- sub sp,sp,#4*(16) @ off-load area
- sub r14,r14,#64 @ .Lsigma
- stmdb sp!,{r4-r7} @ copy counter and nonce
- ldmia r3,{r4-r11} @ load key
- ldmia r14,{r0-r3} @ load sigma
- stmdb sp!,{r4-r11} @ copy key
- stmdb sp!,{r0-r3} @ copy sigma
- str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
- str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
- b .Loop_outer_enter
-
-.align 4
-.Loop_outer:
- ldmia sp,{r0-r9} @ load key material
- str @t[3],[sp,#4*(32+2)] @ save len
- str r12, [sp,#4*(32+1)] @ save inp
- str r14, [sp,#4*(32+0)] @ save out
-.Loop_outer_enter:
- ldr @t[3], [sp,#4*(15)]
- mov @x[4],@x[4],ror#19 @ twist b[0..3]
- ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
- mov @x[5],@x[5],ror#19
- ldr @t[2], [sp,#4*(13)]
- mov @x[6],@x[6],ror#19
- ldr @x[14],[sp,#4*(14)]
- mov @x[7],@x[7],ror#19
- mov @t[3],@t[3],ror#8 @ twist d[0..3]
- mov @x[12],@x[12],ror#8
- mov @t[2],@t[2],ror#8
- mov @x[14],@x[14],ror#8
- str @t[3], [sp,#4*(16+15)]
- mov @t[3],#10
- b .Loop
-
-.align 4
-.Loop:
- subs @t[3],@t[3],#1
-___
- foreach (&ROUND(0, 4, 8,12)) { eval; }
- foreach (&ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- bne .Loop
-
- ldr @t[3],[sp,#4*(32+2)] @ load len
-
- str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
- str @t[1], [sp,#4*(16+9)]
- str @x[12],[sp,#4*(16+12)]
- str @t[2], [sp,#4*(16+13)]
- str @x[14],[sp,#4*(16+14)]
-
- @ at this point we have first half of 512-bit result in
- @ @x[0-7] and second half at sp+4*(16+8)
-
- cmp @t[3],#64 @ done yet?
-#ifdef __thumb2__
- itete lo
-#endif
- addlo r12,sp,#4*(0) @ shortcut or ...
- ldrhs r12,[sp,#4*(32+1)] @ ... load inp
- addlo r14,sp,#4*(0) @ shortcut or ...
- ldrhs r14,[sp,#4*(32+0)] @ ... load out
-
- ldr @t[0],[sp,#4*(0)] @ load key material
- ldr @t[1],[sp,#4*(1)]
-
-#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
-# if __ARM_ARCH__<7
- orr @t[2],r12,r14
- tst @t[2],#3 @ are input and output aligned?
- ldr @t[2],[sp,#4*(2)]
- bne .Lunaligned
- cmp @t[3],#64 @ restore flags
-# else
- ldr @t[2],[sp,#4*(2)]
-# endif
- ldr @t[3],[sp,#4*(3)]
-
- add @x[0],@x[0],@t[0] @ accumulate key material
- add @x[1],@x[1],@t[1]
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[0],[r12],#16 @ load input
- ldrhs @t[1],[r12,#-12]
-
- add @x[2],@x[2],@t[2]
- add @x[3],@x[3],@t[3]
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[2],[r12,#-8]
- ldrhs @t[3],[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
- rev @x[0],@x[0]
- rev @x[1],@x[1]
- rev @x[2],@x[2]
- rev @x[3],@x[3]
-# endif
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[0],@x[0],@t[0] @ xor with input
- eorhs @x[1],@x[1],@t[1]
- add @t[0],sp,#4*(4)
- str @x[0],[r14],#16 @ store output
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[2],@x[2],@t[2]
- eorhs @x[3],@x[3],@t[3]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
- str @x[1],[r14,#-12]
- str @x[2],[r14,#-8]
- str @x[3],[r14,#-4]
-
- add @x[4],@t[0],@x[4],ror#13 @ accumulate key material
- add @x[5],@t[1],@x[5],ror#13
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[0],[r12],#16 @ load input
- ldrhs @t[1],[r12,#-12]
- add @x[6],@t[2],@x[6],ror#13
- add @x[7],@t[3],@x[7],ror#13
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[2],[r12,#-8]
- ldrhs @t[3],[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
- rev @x[4],@x[4]
- rev @x[5],@x[5]
- rev @x[6],@x[6]
- rev @x[7],@x[7]
-# endif
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[4],@x[4],@t[0]
- eorhs @x[5],@x[5],@t[1]
- add @t[0],sp,#4*(8)
- str @x[4],[r14],#16 @ store output
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[6],@x[6],@t[2]
- eorhs @x[7],@x[7],@t[3]
- str @x[5],[r14,#-12]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
- str @x[6],[r14,#-8]
- add @x[0],sp,#4*(16+8)
- str @x[7],[r14,#-4]
-
- ldmia @x[0],{@x[0]-@x[7]} @ load second half
-
- add @x[0],@x[0],@t[0] @ accumulate key material
- add @x[1],@x[1],@t[1]
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[0],[r12],#16 @ load input
- ldrhs @t[1],[r12,#-12]
-# ifdef __thumb2__
- itt hi
-# endif
- strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
- strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
- add @x[2],@x[2],@t[2]
- add @x[3],@x[3],@t[3]
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[2],[r12,#-8]
- ldrhs @t[3],[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
- rev @x[0],@x[0]
- rev @x[1],@x[1]
- rev @x[2],@x[2]
- rev @x[3],@x[3]
-# endif
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[0],@x[0],@t[0]
- eorhs @x[1],@x[1],@t[1]
- add @t[0],sp,#4*(12)
- str @x[0],[r14],#16 @ store output
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[2],@x[2],@t[2]
- eorhs @x[3],@x[3],@t[3]
- str @x[1],[r14,#-12]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
- str @x[2],[r14,#-8]
- str @x[3],[r14,#-4]
-
- add @x[4],@t[0],@x[4],ror#24 @ accumulate key material
- add @x[5],@t[1],@x[5],ror#24
-# ifdef __thumb2__
- itt hi
-# endif
- addhi @t[0],@t[0],#1 @ next counter value
- strhi @t[0],[sp,#4*(12)] @ save next counter value
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[0],[r12],#16 @ load input
- ldrhs @t[1],[r12,#-12]
- add @x[6],@t[2],@x[6],ror#24
- add @x[7],@t[3],@x[7],ror#24
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[2],[r12,#-8]
- ldrhs @t[3],[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
- rev @x[4],@x[4]
- rev @x[5],@x[5]
- rev @x[6],@x[6]
- rev @x[7],@x[7]
-# endif
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[4],@x[4],@t[0]
- eorhs @x[5],@x[5],@t[1]
-# ifdef __thumb2__
- it ne
-# endif
- ldrne @t[0],[sp,#4*(32+2)] @ re-load len
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[6],@x[6],@t[2]
- eorhs @x[7],@x[7],@t[3]
- str @x[4],[r14],#16 @ store output
- str @x[5],[r14,#-12]
-# ifdef __thumb2__
- it hs
-# endif
- subhs @t[3],@t[0],#64 @ len-=64
- str @x[6],[r14,#-8]
- str @x[7],[r14,#-4]
- bhi .Loop_outer
-
- beq .Ldone
-# if __ARM_ARCH__<7
- b .Ltail
-
-.align 4
-.Lunaligned: @ unaligned endian-neutral path
- cmp @t[3],#64 @ restore flags
-# endif
-#endif
-#if __ARM_ARCH__<7
- ldr @t[3],[sp,#4*(3)]
-___
-for ($i=0;$i<16;$i+=4) {
-my $j=$i&0x7;
-my $twist="";
-if ($i==4) { $twist = ",ror#13"; }
-elsif ($i==12) { $twist = ",ror#24"; }
-
-$code.=<<___ if ($i==4);
- add @x[0],sp,#4*(16+8)
-___
-$code.=<<___ if ($i==8);
- ldmia @x[0],{@x[0]-@x[7]} @ load second half
-# ifdef __thumb2__
- itt hi
-# endif
- strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]"
- strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]"
-___
-$code.=<<___;
- add @x[$j+0],@t[0],@x[$j+0]$twist @ accumulate key material
-___
-$code.=<<___ if ($i==12);
-# ifdef __thumb2__
- itt hi
-# endif
- addhi @t[0],@t[0],#1 @ next counter value
- strhi @t[0],[sp,#4*(12)] @ save next counter value
-___
-$code.=<<___;
- add @x[$j+1],@t[1],@x[$j+1]$twist
- add @x[$j+2],@t[2],@x[$j+2]$twist
-# ifdef __thumb2__
- itete lo
-# endif
- eorlo @t[0],@t[0],@t[0] @ zero or ...
- ldrhsb @t[0],[r12],#16 @ ... load input
- eorlo @t[1],@t[1],@t[1]
- ldrhsb @t[1],[r12,#-12]
-
- add @x[$j+3],@t[3],@x[$j+3]$twist
-# ifdef __thumb2__
- itete lo
-# endif
- eorlo @t[2],@t[2],@t[2]
- ldrhsb @t[2],[r12,#-8]
- eorlo @t[3],@t[3],@t[3]
- ldrhsb @t[3],[r12,#-4]
-
- eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero)
- eor @x[$j+1],@t[1],@x[$j+1]
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb @t[0],[r12,#-15] @ load more input
- ldrhsb @t[1],[r12,#-11]
- eor @x[$j+2],@t[2],@x[$j+2]
- strb @x[$j+0],[r14],#16 @ store output
- eor @x[$j+3],@t[3],@x[$j+3]
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb @t[2],[r12,#-7]
- ldrhsb @t[3],[r12,#-3]
- strb @x[$j+1],[r14,#-12]
- eor @x[$j+0],@t[0],@x[$j+0],lsr#8
- strb @x[$j+2],[r14,#-8]
- eor @x[$j+1],@t[1],@x[$j+1],lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb @t[0],[r12,#-14] @ load more input
- ldrhsb @t[1],[r12,#-10]
- strb @x[$j+3],[r14,#-4]
- eor @x[$j+2],@t[2],@x[$j+2],lsr#8
- strb @x[$j+0],[r14,#-15]
- eor @x[$j+3],@t[3],@x[$j+3],lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb @t[2],[r12,#-6]
- ldrhsb @t[3],[r12,#-2]
- strb @x[$j+1],[r14,#-11]
- eor @x[$j+0],@t[0],@x[$j+0],lsr#8
- strb @x[$j+2],[r14,#-7]
- eor @x[$j+1],@t[1],@x[$j+1],lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb @t[0],[r12,#-13] @ load more input
- ldrhsb @t[1],[r12,#-9]
- strb @x[$j+3],[r14,#-3]
- eor @x[$j+2],@t[2],@x[$j+2],lsr#8
- strb @x[$j+0],[r14,#-14]
- eor @x[$j+3],@t[3],@x[$j+3],lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb @t[2],[r12,#-5]
- ldrhsb @t[3],[r12,#-1]
- strb @x[$j+1],[r14,#-10]
- strb @x[$j+2],[r14,#-6]
- eor @x[$j+0],@t[0],@x[$j+0],lsr#8
- strb @x[$j+3],[r14,#-2]
- eor @x[$j+1],@t[1],@x[$j+1],lsr#8
- strb @x[$j+0],[r14,#-13]
- eor @x[$j+2],@t[2],@x[$j+2],lsr#8
- strb @x[$j+1],[r14,#-9]
- eor @x[$j+3],@t[3],@x[$j+3],lsr#8
- strb @x[$j+2],[r14,#-5]
- strb @x[$j+3],[r14,#-1]
-___
-$code.=<<___ if ($i<12);
- add @t[0],sp,#4*(4+$i)
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
-___
-}
-$code.=<<___;
-# ifdef __thumb2__
- it ne
-# endif
- ldrne @t[0],[sp,#4*(32+2)] @ re-load len
-# ifdef __thumb2__
- it hs
-# endif
- subhs @t[3],@t[0],#64 @ len-=64
- bhi .Loop_outer
-
- beq .Ldone
-#endif
-
-.Ltail:
- ldr r12,[sp,#4*(32+1)] @ load inp
- add @t[1],sp,#4*(0)
- ldr r14,[sp,#4*(32+0)] @ load out
-
-.Loop_tail:
- ldrb @t[2],[@t[1]],#1 @ read buffer on stack
- ldrb @t[3],[r12],#1 @ read input
- subs @t[0],@t[0],#1
- eor @t[3],@t[3],@t[2]
- strb @t[3],[r14],#1 @ store output
- bne .Loop_tail
-
-.Ldone:
- add sp,sp,#4*(32+3)
-.Lno_data:
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4-r11,pc}
-#else
- ldmia sp!,{r4-r12,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- .long 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-.size ChaCha20_ctr32,.-ChaCha20_ctr32
-___
-
-{{{
-my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
- map("q$_",(0..15));
-
-# This can replace vshr-by-24+vsli-by-8. It gives ~3% improvement on
-# Cortex-A5/A7, but hurts Cortex-A9 by 5% and Snapdragon S4 by 14%!
-sub vperm()
-{ my ($dst,$src,$tbl) = @_;
- $code .= " vtbl.8 $dst#lo,{$src#lo},$tbl#lo\n";
- $code .= " vtbl.8 $dst#hi,{$src#hi},$tbl#lo\n";
-}
-
-sub NEONROUND {
-my $odd = pop;
-my ($a,$b,$c,$d,$t)=@_;
-
- (
- "&vadd_i32 ($a,$a,$b)",
- "&veor ($d,$d,$a)",
- "&vrev32_16 ($d,$d)", # vrot ($d,16)
-
- "&vadd_i32 ($c,$c,$d)",
- "&veor ($t,$b,$c)",
- "&vshr_u32 ($b,$t,20)",
- "&vsli_32 ($b,$t,12)",
-
- "&vadd_i32 ($a,$a,$b)",
- "&veor ($t,$d,$a)",
- "&vshr_u32 ($d,$t,24)",
- "&vsli_32 ($d,$t,8)",
- #"&vperm ($d,$t,$t3)",
-
- "&vadd_i32 ($c,$c,$d)",
- "&veor ($t,$b,$c)",
- "&vshr_u32 ($b,$t,25)",
- "&vsli_32 ($b,$t,7)",
-
- "&vext_8 ($a,$a,$a,$odd?4:12)",
- "&vext_8 ($d,$d,$d,8)",
- "&vext_8 ($c,$c,$c,$odd?12:4)"
- );
-}
-
-$code.=<<___;
-#if (defined(__KERNEL__) && defined(CONFIG_KERNEL_MODE_NEON)) || (!defined(__KERNEL__) && __ARM_MAX_ARCH__>=7)
-.arch armv7-a
-.fpu neon
-
-# ifdef __KERNEL__
-.globl ChaCha20_neon
-@ For optimal performance it's appropriate for caller to enforce
-@ minimum input length, 193 bytes is suggested.
-# endif
-.type ChaCha20_neon,%function
-.align 5
-ChaCha20_neon:
- ldr r12,[sp,#0] @ pull pointer to counter and nonce
- stmdb sp!,{r0-r2,r4-r11,lr}
-.LChaCha20_neon:
- adr r14,.Lsigma
- vstmdb sp!,{d8-d15} @ ABI spec says so
- stmdb sp!,{r0-r3}
-
- vld1.32 {$b0-$c0},[r3] @ load key
- ldmia r3,{r4-r11} @ load key
-
- sub sp,sp,#4*(16+16)
- vld1.32 {$d0},[r12] @ load counter and nonce
- add r12,sp,#4*8
- ldmia r14,{r0-r3} @ load sigma
- vld1.32 {$a0},[r14]! @ load sigma
- vld1.32 {$t0},[r14]! @ one
- @ vld1.32 {$t3#lo},[r14] @ rot8
- vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce
- vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key
-
- str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
- str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
- vshl.i32 $t1#lo,$t0#lo,#1 @ two
- vstr $t0#lo,[sp,#4*(16+0)]
- vshl.i32 $t2#lo,$t0#lo,#2 @ four
- vstr $t1#lo,[sp,#4*(16+2)]
- vmov $a1,$a0
- vstr $t2#lo,[sp,#4*(16+4)]
- vmov $a2,$a0
- @ vstr $t3#lo,[sp,#4*(16+6)]
- vmov $b1,$b0
- vmov $b2,$b0
- b .Loop_neon_enter
-
-.align 4
-.Loop_neon_outer:
- ldmia sp,{r0-r9} @ load key material
- cmp @t[3],#64*2 @ if len<=64*2
- bls .Lbreak_neon @ switch to integer-only
- @ vldr $t3#lo,[sp,#4*(16+6)] @ rot8
- vmov $a1,$a0
- str @t[3],[sp,#4*(32+2)] @ save len
- vmov $a2,$a0
- str r12, [sp,#4*(32+1)] @ save inp
- vmov $b1,$b0
- str r14, [sp,#4*(32+0)] @ save out
- vmov $b2,$b0
-.Loop_neon_enter:
- ldr @t[3], [sp,#4*(15)]
- mov @x[4],@x[4],ror#19 @ twist b[0..3]
- vadd.i32 $d1,$d0,$t0 @ counter+1
- ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
- mov @x[5],@x[5],ror#19
- vmov $c1,$c0
- ldr @t[2], [sp,#4*(13)]
- mov @x[6],@x[6],ror#19
- vmov $c2,$c0
- ldr @x[14],[sp,#4*(14)]
- mov @x[7],@x[7],ror#19
- vadd.i32 $d2,$d1,$t0 @ counter+2
- add @x[12],@x[12],#3 @ counter+3
- mov @t[3],@t[3],ror#8 @ twist d[0..3]
- mov @x[12],@x[12],ror#8
- mov @t[2],@t[2],ror#8
- mov @x[14],@x[14],ror#8
- str @t[3], [sp,#4*(16+15)]
- mov @t[3],#10
- b .Loop_neon
-
-.align 4
-.Loop_neon:
- subs @t[3],@t[3],#1
-___
- my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
- my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
- my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
- my @thread3=&ROUND(0,4,8,12);
-
- foreach (@thread0) {
- eval; eval(shift(@thread3));
- eval(shift(@thread1)); eval(shift(@thread3));
- eval(shift(@thread2)); eval(shift(@thread3));
- }
-
- @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
- @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
- @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
- @thread3=&ROUND(0,5,10,15);
-
- foreach (@thread0) {
- eval; eval(shift(@thread3));
- eval(shift(@thread1)); eval(shift(@thread3));
- eval(shift(@thread2)); eval(shift(@thread3));
- }
-$code.=<<___;
- bne .Loop_neon
-
- add @t[3],sp,#32
- vld1.32 {$t0-$t1},[sp] @ load key material
- vld1.32 {$t2-$t3},[@t[3]]
-
- ldr @t[3],[sp,#4*(32+2)] @ load len
-
- str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
- str @t[1], [sp,#4*(16+9)]
- str @x[12],[sp,#4*(16+12)]
- str @t[2], [sp,#4*(16+13)]
- str @x[14],[sp,#4*(16+14)]
-
- @ at this point we have first half of 512-bit result in
- @ @x[0-7] and second half at sp+4*(16+8)
-
- ldr r12,[sp,#4*(32+1)] @ load inp
- ldr r14,[sp,#4*(32+0)] @ load out
-
- vadd.i32 $a0,$a0,$t0 @ accumulate key material
- vadd.i32 $a1,$a1,$t0
- vadd.i32 $a2,$a2,$t0
- vldr $t0#lo,[sp,#4*(16+0)] @ one
-
- vadd.i32 $b0,$b0,$t1
- vadd.i32 $b1,$b1,$t1
- vadd.i32 $b2,$b2,$t1
- vldr $t1#lo,[sp,#4*(16+2)] @ two
-
- vadd.i32 $c0,$c0,$t2
- vadd.i32 $c1,$c1,$t2
- vadd.i32 $c2,$c2,$t2
- vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1
- vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2
-
- vadd.i32 $d0,$d0,$t3
- vadd.i32 $d1,$d1,$t3
- vadd.i32 $d2,$d2,$t3
-
- cmp @t[3],#64*4
- blo .Ltail_neon
-
- vld1.8 {$t0-$t1},[r12]! @ load input
- mov @t[3],sp
- vld1.8 {$t2-$t3},[r12]!
- veor $a0,$a0,$t0 @ xor with input
- veor $b0,$b0,$t1
- vld1.8 {$t0-$t1},[r12]!
- veor $c0,$c0,$t2
- veor $d0,$d0,$t3
- vld1.8 {$t2-$t3},[r12]!
-
- veor $a1,$a1,$t0
- vst1.8 {$a0-$b0},[r14]! @ store output
- veor $b1,$b1,$t1
- vld1.8 {$t0-$t1},[r12]!
- veor $c1,$c1,$t2
- vst1.8 {$c0-$d0},[r14]!
- veor $d1,$d1,$t3
- vld1.8 {$t2-$t3},[r12]!
-
- veor $a2,$a2,$t0
- vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration
- veor $t0#hi,$t0#hi,$t0#hi
- vldr $t0#lo,[sp,#4*(16+4)] @ four
- veor $b2,$b2,$t1
- vld1.32 {$c0-$d0},[@t[3]]
- veor $c2,$c2,$t2
- vst1.8 {$a1-$b1},[r14]!
- veor $d2,$d2,$t3
- vst1.8 {$c1-$d1},[r14]!
-
- vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value
- vldr $t0#lo,[sp,#4*(16+0)] @ one
-
- ldmia sp,{@t[0]-@t[3]} @ load key material
- add @x[0],@x[0],@t[0] @ accumulate key material
- ldr @t[0],[r12],#16 @ load input
- vst1.8 {$a2-$b2},[r14]!
- add @x[1],@x[1],@t[1]
- ldr @t[1],[r12,#-12]
- vst1.8 {$c2-$d2},[r14]!
- add @x[2],@x[2],@t[2]
- ldr @t[2],[r12,#-8]
- add @x[3],@x[3],@t[3]
- ldr @t[3],[r12,#-4]
-# ifdef __ARMEB__
- rev @x[0],@x[0]
- rev @x[1],@x[1]
- rev @x[2],@x[2]
- rev @x[3],@x[3]
-# endif
- eor @x[0],@x[0],@t[0] @ xor with input
- add @t[0],sp,#4*(4)
- eor @x[1],@x[1],@t[1]
- str @x[0],[r14],#16 @ store output
- eor @x[2],@x[2],@t[2]
- str @x[1],[r14,#-12]
- eor @x[3],@x[3],@t[3]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
- str @x[2],[r14,#-8]
- str @x[3],[r14,#-4]
-
- add @x[4],@t[0],@x[4],ror#13 @ accumulate key material
- ldr @t[0],[r12],#16 @ load input
- add @x[5],@t[1],@x[5],ror#13
- ldr @t[1],[r12,#-12]
- add @x[6],@t[2],@x[6],ror#13
- ldr @t[2],[r12,#-8]
- add @x[7],@t[3],@x[7],ror#13
- ldr @t[3],[r12,#-4]
-# ifdef __ARMEB__
- rev @x[4],@x[4]
- rev @x[5],@x[5]
- rev @x[6],@x[6]
- rev @x[7],@x[7]
-# endif
- eor @x[4],@x[4],@t[0]
- add @t[0],sp,#4*(8)
- eor @x[5],@x[5],@t[1]
- str @x[4],[r14],#16 @ store output
- eor @x[6],@x[6],@t[2]
- str @x[5],[r14,#-12]
- eor @x[7],@x[7],@t[3]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
- str @x[6],[r14,#-8]
- add @x[0],sp,#4*(16+8)
- str @x[7],[r14,#-4]
-
- ldmia @x[0],{@x[0]-@x[7]} @ load second half
-
- add @x[0],@x[0],@t[0] @ accumulate key material
- ldr @t[0],[r12],#16 @ load input
- add @x[1],@x[1],@t[1]
- ldr @t[1],[r12,#-12]
-# ifdef __thumb2__
- it hi
-# endif
- strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
- add @x[2],@x[2],@t[2]
- ldr @t[2],[r12,#-8]
-# ifdef __thumb2__
- it hi
-# endif
- strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
- add @x[3],@x[3],@t[3]
- ldr @t[3],[r12,#-4]
-# ifdef __ARMEB__
- rev @x[0],@x[0]
- rev @x[1],@x[1]
- rev @x[2],@x[2]
- rev @x[3],@x[3]
-# endif
- eor @x[0],@x[0],@t[0]
- add @t[0],sp,#4*(12)
- eor @x[1],@x[1],@t[1]
- str @x[0],[r14],#16 @ store output
- eor @x[2],@x[2],@t[2]
- str @x[1],[r14,#-12]
- eor @x[3],@x[3],@t[3]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
- str @x[2],[r14,#-8]
- str @x[3],[r14,#-4]
-
- add @x[4],@t[0],@x[4],ror#24 @ accumulate key material
- add @t[0],@t[0],#4 @ next counter value
- add @x[5],@t[1],@x[5],ror#24
- str @t[0],[sp,#4*(12)] @ save next counter value
- ldr @t[0],[r12],#16 @ load input
- add @x[6],@t[2],@x[6],ror#24
- add @x[4],@x[4],#3 @ counter+3
- ldr @t[1],[r12,#-12]
- add @x[7],@t[3],@x[7],ror#24
- ldr @t[2],[r12,#-8]
- ldr @t[3],[r12,#-4]
-# ifdef __ARMEB__
- rev @x[4],@x[4]
- rev @x[5],@x[5]
- rev @x[6],@x[6]
- rev @x[7],@x[7]
-# endif
- eor @x[4],@x[4],@t[0]
-# ifdef __thumb2__
- it hi
-# endif
- ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
- eor @x[5],@x[5],@t[1]
- eor @x[6],@x[6],@t[2]
- str @x[4],[r14],#16 @ store output
- eor @x[7],@x[7],@t[3]
- str @x[5],[r14,#-12]
- sub @t[3],@t[0],#64*4 @ len-=64*4
- str @x[6],[r14,#-8]
- str @x[7],[r14,#-4]
- bhi .Loop_neon_outer
-
- b .Ldone_neon
-
-.align 4
-.Lbreak_neon:
- @ harmonize NEON and integer-only stack frames: load data
- @ from NEON frame, but save to integer-only one; distance
- @ between the two is 4*(32+4+16-32)=4*(20).
-
- str @t[3], [sp,#4*(20+32+2)] @ save len
- add @t[3],sp,#4*(32+4)
- str r12, [sp,#4*(20+32+1)] @ save inp
- str r14, [sp,#4*(20+32+0)] @ save out
-
- ldr @x[12],[sp,#4*(16+10)]
- ldr @x[14],[sp,#4*(16+11)]
- vldmia @t[3],{d8-d15} @ fulfill ABI requirement
- str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]"
- str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]"
-
- ldr @t[3], [sp,#4*(15)]
- mov @x[4],@x[4],ror#19 @ twist b[0..3]
- ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
- mov @x[5],@x[5],ror#19
- ldr @t[2], [sp,#4*(13)]
- mov @x[6],@x[6],ror#19
- ldr @x[14],[sp,#4*(14)]
- mov @x[7],@x[7],ror#19
- mov @t[3],@t[3],ror#8 @ twist d[0..3]
- mov @x[12],@x[12],ror#8
- mov @t[2],@t[2],ror#8
- mov @x[14],@x[14],ror#8
- str @t[3], [sp,#4*(20+16+15)]
- add @t[3],sp,#4*(20)
- vst1.32 {$a0-$b0},[@t[3]]! @ copy key
- add sp,sp,#4*(20) @ switch frame
- vst1.32 {$c0-$d0},[@t[3]]
- mov @t[3],#10
- b .Loop @ go integer-only
-
-.align 4
-.Ltail_neon:
- cmp @t[3],#64*3
- bhs .L192_or_more_neon
- cmp @t[3],#64*2
- bhs .L128_or_more_neon
- cmp @t[3],#64*1
- bhs .L64_or_more_neon
-
- add @t[0],sp,#4*(8)
- vst1.8 {$a0-$b0},[sp]
- add @t[2],sp,#4*(0)
- vst1.8 {$c0-$d0},[@t[0]]
- b .Loop_tail_neon
-
-.align 4
-.L64_or_more_neon:
- vld1.8 {$t0-$t1},[r12]!
- vld1.8 {$t2-$t3},[r12]!
- veor $a0,$a0,$t0
- veor $b0,$b0,$t1
- veor $c0,$c0,$t2
- veor $d0,$d0,$t3
- vst1.8 {$a0-$b0},[r14]!
- vst1.8 {$c0-$d0},[r14]!
-
- beq .Ldone_neon
-
- add @t[0],sp,#4*(8)
- vst1.8 {$a1-$b1},[sp]
- add @t[2],sp,#4*(0)
- vst1.8 {$c1-$d1},[@t[0]]
- sub @t[3],@t[3],#64*1 @ len-=64*1
- b .Loop_tail_neon
-
-.align 4
-.L128_or_more_neon:
- vld1.8 {$t0-$t1},[r12]!
- vld1.8 {$t2-$t3},[r12]!
- veor $a0,$a0,$t0
- veor $b0,$b0,$t1
- vld1.8 {$t0-$t1},[r12]!
- veor $c0,$c0,$t2
- veor $d0,$d0,$t3
- vld1.8 {$t2-$t3},[r12]!
-
- veor $a1,$a1,$t0
- veor $b1,$b1,$t1
- vst1.8 {$a0-$b0},[r14]!
- veor $c1,$c1,$t2
- vst1.8 {$c0-$d0},[r14]!
- veor $d1,$d1,$t3
- vst1.8 {$a1-$b1},[r14]!
- vst1.8 {$c1-$d1},[r14]!
-
- beq .Ldone_neon
-
- add @t[0],sp,#4*(8)
- vst1.8 {$a2-$b2},[sp]
- add @t[2],sp,#4*(0)
- vst1.8 {$c2-$d2},[@t[0]]
- sub @t[3],@t[3],#64*2 @ len-=64*2
- b .Loop_tail_neon
-
-.align 4
-.L192_or_more_neon:
- vld1.8 {$t0-$t1},[r12]!
- vld1.8 {$t2-$t3},[r12]!
- veor $a0,$a0,$t0
- veor $b0,$b0,$t1
- vld1.8 {$t0-$t1},[r12]!
- veor $c0,$c0,$t2
- veor $d0,$d0,$t3
- vld1.8 {$t2-$t3},[r12]!
-
- veor $a1,$a1,$t0
- veor $b1,$b1,$t1
- vld1.8 {$t0-$t1},[r12]!
- veor $c1,$c1,$t2
- vst1.8 {$a0-$b0},[r14]!
- veor $d1,$d1,$t3
- vld1.8 {$t2-$t3},[r12]!
-
- veor $a2,$a2,$t0
- vst1.8 {$c0-$d0},[r14]!
- veor $b2,$b2,$t1
- vst1.8 {$a1-$b1},[r14]!
- veor $c2,$c2,$t2
- vst1.8 {$c1-$d1},[r14]!
- veor $d2,$d2,$t3
- vst1.8 {$a2-$b2},[r14]!
- vst1.8 {$c2-$d2},[r14]!
-
- beq .Ldone_neon
-
- ldmia sp,{@t[0]-@t[3]} @ load key material
- add @x[0],@x[0],@t[0] @ accumulate key material
- add @t[0],sp,#4*(4)
- add @x[1],@x[1],@t[1]
- add @x[2],@x[2],@t[2]
- add @x[3],@x[3],@t[3]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
-
- add @x[4],@t[0],@x[4],ror#13 @ accumulate key material
- add @t[0],sp,#4*(8)
- add @x[5],@t[1],@x[5],ror#13
- add @x[6],@t[2],@x[6],ror#13
- add @x[7],@t[3],@x[7],ror#13
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
-# ifdef __ARMEB__
- rev @x[0],@x[0]
- rev @x[1],@x[1]
- rev @x[2],@x[2]
- rev @x[3],@x[3]
- rev @x[4],@x[4]
- rev @x[5],@x[5]
- rev @x[6],@x[6]
- rev @x[7],@x[7]
-# endif
- stmia sp,{@x[0]-@x[7]}
- add @x[0],sp,#4*(16+8)
-
- ldmia @x[0],{@x[0]-@x[7]} @ load second half
-
- add @x[0],@x[0],@t[0] @ accumulate key material
- add @t[0],sp,#4*(12)
- add @x[1],@x[1],@t[1]
- add @x[2],@x[2],@t[2]
- add @x[3],@x[3],@t[3]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
-
- add @x[4],@t[0],@x[4],ror#24 @ accumulate key material
- add @t[0],sp,#4*(8)
- add @x[5],@t[1],@x[5],ror#24
- add @x[4],@x[4],#3 @ counter+3
- add @x[6],@t[2],@x[6],ror#24
- add @x[7],@t[3],@x[7],ror#24
- ldr @t[3],[sp,#4*(32+2)] @ re-load len
-# ifdef __ARMEB__
- rev @x[0],@x[0]
- rev @x[1],@x[1]
- rev @x[2],@x[2]
- rev @x[3],@x[3]
- rev @x[4],@x[4]
- rev @x[5],@x[5]
- rev @x[6],@x[6]
- rev @x[7],@x[7]
-# endif
- stmia @t[0],{@x[0]-@x[7]}
- add @t[2],sp,#4*(0)
- sub @t[3],@t[3],#64*3 @ len-=64*3
-
-.Loop_tail_neon:
- ldrb @t[0],[@t[2]],#1 @ read buffer on stack
- ldrb @t[1],[r12],#1 @ read input
- subs @t[3],@t[3],#1
- eor @t[0],@t[0],@t[1]
- strb @t[0],[r14],#1 @ store output
- bne .Loop_tail_neon
-
-.Ldone_neon:
- add sp,sp,#4*(32+4)
- vldmia sp,{d8-d15}
- add sp,sp,#4*(16+3)
- ldmia sp!,{r4-r11,pc}
-.size ChaCha20_neon,.-ChaCha20_neon
-# ifndef __KERNEL__
-.comm OPENSSL_armcap_P,4,4
-# endif
-#endif
-___
-}}}
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/@/ and !/^$/);
- print;
-}
-close SELF;
-
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/geo;
-
- s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
-
- print $_,"\n";
-}
-close STDOUT;
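
Both the scalar and the NEON paths in the script above compute the standard ChaCha20 quarter-round; the scalar code keeps b[0..3] and d[0..3] pre-rotated so each rotate folds into the following add/eor, and the NEON code expresses the 12-, 8- and 7-bit rotates as shift-and-insert pairs (with vrev32.16 covering the 16-bit rotate). For reference, here is a minimal C sketch of the quarter-round as specified in RFC 8439; rotl32() is a hypothetical helper, not a symbol from the deleted sources.

    /* Reference sketch of the ChaCha20 quarter-round (RFC 8439). */
    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t v, int n)
    {
        return (v << n) | (v >> (32 - n));
    }

    static void chacha20_quarter_round(uint32_t x[16], int a, int b, int c, int d)
    {
        x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d], 16);
        x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b], 12);
        x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d], 8);
        x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b], 7);
    }

Even rounds apply the quarter-round to the columns (0,4,8,12) through (3,7,11,15) and odd rounds to the diagonals (0,5,10,15) through (3,4,9,14), which is the ROUND(0,4,8,12)/ROUND(0,5,10,15) pairing emitted above.
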
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm64.pl b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm64.pl
deleted file mode 100755
index ac14a9924165..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm64.pl
+++ /dev/null
@@ -1,1163 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# June 2015
-#
-# ChaCha20 for ARMv8.
-#
-# Performance in cycles per byte out of large buffer.
-#
-# IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU(*)
-#
-# Apple A7 5.50/+49% 3.33 1.70
-# Cortex-A53 8.40/+80% 4.72 4.72(**)
-# Cortex-A57 8.06/+43% 4.90 4.43(***)
-# Denver 4.50/+82% 2.63 2.67(**)
-# X-Gene 9.50/+46% 8.82 8.89(**)
-# Mongoose 8.00/+44% 3.64 3.25(***)
-# Kryo 8.17/+50% 4.83 4.65(***)
-#
-# (*) since no non-Apple processor exhibits significantly better
-# performance, the code path is #ifdef __APPLE__-ed;
-# (**) it's expected that doubling interleave factor doesn't help
-# all processors, only those with higher NEON latency and
-# higher instruction issue rate;
-# (***) expected improvement was actually higher;
-
-$flavour=shift;
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
- open STDOUT,">$output";
-}
-
-sub AUTOLOAD() # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
- my $arg = pop;
- $arg = "#$arg" if ($arg*1 eq $arg);
- $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
-}
-
-my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));
-
-my @x=map("x$_",(5..17,19..21));
-my @d=map("x$_",(22..28,30));
-
-sub ROUND {
-my ($a0,$b0,$c0,$d0)=@_;
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-
- (
- "&add_32 (@x[$a0],@x[$a0],@x[$b0])",
- "&add_32 (@x[$a1],@x[$a1],@x[$b1])",
- "&add_32 (@x[$a2],@x[$a2],@x[$b2])",
- "&add_32 (@x[$a3],@x[$a3],@x[$b3])",
- "&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
- "&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
- "&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
- "&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
- "&ror_32 (@x[$d0],@x[$d0],16)",
- "&ror_32 (@x[$d1],@x[$d1],16)",
- "&ror_32 (@x[$d2],@x[$d2],16)",
- "&ror_32 (@x[$d3],@x[$d3],16)",
-
- "&add_32 (@x[$c0],@x[$c0],@x[$d0])",
- "&add_32 (@x[$c1],@x[$c1],@x[$d1])",
- "&add_32 (@x[$c2],@x[$c2],@x[$d2])",
- "&add_32 (@x[$c3],@x[$c3],@x[$d3])",
- "&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
- "&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
- "&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
- "&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
- "&ror_32 (@x[$b0],@x[$b0],20)",
- "&ror_32 (@x[$b1],@x[$b1],20)",
- "&ror_32 (@x[$b2],@x[$b2],20)",
- "&ror_32 (@x[$b3],@x[$b3],20)",
-
- "&add_32 (@x[$a0],@x[$a0],@x[$b0])",
- "&add_32 (@x[$a1],@x[$a1],@x[$b1])",
- "&add_32 (@x[$a2],@x[$a2],@x[$b2])",
- "&add_32 (@x[$a3],@x[$a3],@x[$b3])",
- "&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
- "&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
- "&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
- "&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
- "&ror_32 (@x[$d0],@x[$d0],24)",
- "&ror_32 (@x[$d1],@x[$d1],24)",
- "&ror_32 (@x[$d2],@x[$d2],24)",
- "&ror_32 (@x[$d3],@x[$d3],24)",
-
- "&add_32 (@x[$c0],@x[$c0],@x[$d0])",
- "&add_32 (@x[$c1],@x[$c1],@x[$d1])",
- "&add_32 (@x[$c2],@x[$c2],@x[$d2])",
- "&add_32 (@x[$c3],@x[$c3],@x[$d3])",
- "&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
- "&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
- "&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
- "&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
- "&ror_32 (@x[$b0],@x[$b0],25)",
- "&ror_32 (@x[$b1],@x[$b1],25)",
- "&ror_32 (@x[$b2],@x[$b2],25)",
- "&ror_32 (@x[$b3],@x[$b3],25)"
- );
-}
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-.extern OPENSSL_armcap_P
-#else
-# define ChaCha20_ctr32 chacha20_arm
-# define ChaCha20_neon chacha20_neon
-#endif
-
-.text
-
-.align 5
-.Lsigma:
-.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
-.Lone:
-.long 1,0,0,0
-#ifndef __KERNEL__
-.LOPENSSL_armcap_P:
-# ifdef __ILP32__
-.long OPENSSL_armcap_P-.
-# else
-.quad OPENSSL_armcap_P-.
-# endif
-#endif
-
-.globl ChaCha20_ctr32
-.type ChaCha20_ctr32,%function
-.align 5
-ChaCha20_ctr32:
- cbz $len,.Labort
-#ifndef __KERNEL__
- adr @x[0],.LOPENSSL_armcap_P
- cmp $len,#192
- b.lo .Lshort
-# ifdef __ILP32__
- ldrsw @x[1],[@x[0]]
-# else
- ldr @x[1],[@x[0]]
-# endif
- ldr w17,[@x[1],@x[0]]
- tst w17,#ARMV7_NEON
- b.ne ChaCha20_neon
-
-.Lshort:
-#endif
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adr @x[0],.Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#64
-
- ldp @d[0],@d[1],[@x[0]] // load sigma
- ldp @d[2],@d[3],[$key] // load key
- ldp @d[4],@d[5],[$key,#16]
- ldp @d[6],@d[7],[$ctr] // load counter
-#ifdef __AARCH64EB__
- ror @d[2],@d[2],#32
- ror @d[3],@d[3],#32
- ror @d[4],@d[4],#32
- ror @d[5],@d[5],#32
- ror @d[6],@d[6],#32
- ror @d[7],@d[7],#32
-#endif
-
-.Loop_outer:
- mov.32 @x[0],@d[0] // unpack key block
- lsr @x[1],@d[0],#32
- mov.32 @x[2],@d[1]
- lsr @x[3],@d[1],#32
- mov.32 @x[4],@d[2]
- lsr @x[5],@d[2],#32
- mov.32 @x[6],@d[3]
- lsr @x[7],@d[3],#32
- mov.32 @x[8],@d[4]
- lsr @x[9],@d[4],#32
- mov.32 @x[10],@d[5]
- lsr @x[11],@d[5],#32
- mov.32 @x[12],@d[6]
- lsr @x[13],@d[6],#32
- mov.32 @x[14],@d[7]
- lsr @x[15],@d[7],#32
-
- mov $ctr,#10
- subs $len,$len,#64
-.Loop:
- sub $ctr,$ctr,#1
-___
- foreach (&ROUND(0, 4, 8,12)) { eval; }
- foreach (&ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- cbnz $ctr,.Loop
-
- add.32 @x[0],@x[0],@d[0] // accumulate key block
- add @x[1],@x[1],@d[0],lsr#32
- add.32 @x[2],@x[2],@d[1]
- add @x[3],@x[3],@d[1],lsr#32
- add.32 @x[4],@x[4],@d[2]
- add @x[5],@x[5],@d[2],lsr#32
- add.32 @x[6],@x[6],@d[3]
- add @x[7],@x[7],@d[3],lsr#32
- add.32 @x[8],@x[8],@d[4]
- add @x[9],@x[9],@d[4],lsr#32
- add.32 @x[10],@x[10],@d[5]
- add @x[11],@x[11],@d[5],lsr#32
- add.32 @x[12],@x[12],@d[6]
- add @x[13],@x[13],@d[6],lsr#32
- add.32 @x[14],@x[14],@d[7]
- add @x[15],@x[15],@d[7],lsr#32
-
- b.lo .Ltail
-
- add @x[0],@x[0],@x[1],lsl#32 // pack
- add @x[2],@x[2],@x[3],lsl#32
- ldp @x[1],@x[3],[$inp,#0] // load input
- add @x[4],@x[4],@x[5],lsl#32
- add @x[6],@x[6],@x[7],lsl#32
- ldp @x[5],@x[7],[$inp,#16]
- add @x[8],@x[8],@x[9],lsl#32
- add @x[10],@x[10],@x[11],lsl#32
- ldp @x[9],@x[11],[$inp,#32]
- add @x[12],@x[12],@x[13],lsl#32
- add @x[14],@x[14],@x[15],lsl#32
- ldp @x[13],@x[15],[$inp,#48]
- add $inp,$inp,#64
-#ifdef __AARCH64EB__
- rev @x[0],@x[0]
- rev @x[2],@x[2]
- rev @x[4],@x[4]
- rev @x[6],@x[6]
- rev @x[8],@x[8]
- rev @x[10],@x[10]
- rev @x[12],@x[12]
- rev @x[14],@x[14]
-#endif
- eor @x[0],@x[0],@x[1]
- eor @x[2],@x[2],@x[3]
- eor @x[4],@x[4],@x[5]
- eor @x[6],@x[6],@x[7]
- eor @x[8],@x[8],@x[9]
- eor @x[10],@x[10],@x[11]
- eor @x[12],@x[12],@x[13]
- eor @x[14],@x[14],@x[15]
-
- stp @x[0],@x[2],[$out,#0] // store output
- add @d[6],@d[6],#1 // increment counter
- stp @x[4],@x[6],[$out,#16]
- stp @x[8],@x[10],[$out,#32]
- stp @x[12],@x[14],[$out,#48]
- add $out,$out,#64
-
- b.hi .Loop_outer
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
-.Labort:
- ret
-
-.align 4
-.Ltail:
- add $len,$len,#64
-.Less_than_64:
- sub $out,$out,#1
- add $inp,$inp,$len
- add $out,$out,$len
- add $ctr,sp,$len
- neg $len,$len
-
- add @x[0],@x[0],@x[1],lsl#32 // pack
- add @x[2],@x[2],@x[3],lsl#32
- add @x[4],@x[4],@x[5],lsl#32
- add @x[6],@x[6],@x[7],lsl#32
- add @x[8],@x[8],@x[9],lsl#32
- add @x[10],@x[10],@x[11],lsl#32
- add @x[12],@x[12],@x[13],lsl#32
- add @x[14],@x[14],@x[15],lsl#32
-#ifdef __AARCH64EB__
- rev @x[0],@x[0]
- rev @x[2],@x[2]
- rev @x[4],@x[4]
- rev @x[6],@x[6]
- rev @x[8],@x[8]
- rev @x[10],@x[10]
- rev @x[12],@x[12]
- rev @x[14],@x[14]
-#endif
- stp @x[0],@x[2],[sp,#0]
- stp @x[4],@x[6],[sp,#16]
- stp @x[8],@x[10],[sp,#32]
- stp @x[12],@x[14],[sp,#48]
-
-.Loop_tail:
- ldrb w10,[$inp,$len]
- ldrb w11,[$ctr,$len]
- add $len,$len,#1
- eor w10,w10,w11
- strb w10,[$out,$len]
- cbnz $len,.Loop_tail
-
- stp xzr,xzr,[sp,#0]
- stp xzr,xzr,[sp,#16]
- stp xzr,xzr,[sp,#32]
- stp xzr,xzr,[sp,#48]
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- ret
-.size ChaCha20_ctr32,.-ChaCha20_ctr32
-___
-
-{{{
-my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
- map("v$_.4s",(0..7,16..23));
-my (@K)=map("v$_.4s",(24..30));
-my $ONE="v31.4s";
-
-sub NEONROUND {
-my $odd = pop;
-my ($a,$b,$c,$d,$t)=@_;
-
- (
- "&add ('$a','$a','$b')",
- "&eor ('$d','$d','$a')",
- "&rev32_16 ('$d','$d')", # vrot ($d,16)
-
- "&add ('$c','$c','$d')",
- "&eor ('$t','$b','$c')",
- "&ushr ('$b','$t',20)",
- "&sli ('$b','$t',12)",
-
- "&add ('$a','$a','$b')",
- "&eor ('$t','$d','$a')",
- "&ushr ('$d','$t',24)",
- "&sli ('$d','$t',8)",
-
- "&add ('$c','$c','$d')",
- "&eor ('$t','$b','$c')",
- "&ushr ('$b','$t',25)",
- "&sli ('$b','$t',7)",
-
- "&ext ('$a','$a','$a',$odd?4:12)",
- "&ext ('$d','$d','$d',8)",
- "&ext ('$c','$c','$c',$odd?12:4)"
- );
-}
-
-$code.=<<___;
-#if !defined(__KERNEL__) || defined(CONFIG_KERNEL_MODE_NEON)
-#ifdef __KERNEL__
-.globl ChaCha20_neon
-.type ChaCha20_neon,%function
-#endif
-.type ChaCha20_neon,%function
-.align 5
-ChaCha20_neon:
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adr @x[0],.Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
-#ifdef __APPLE__
- cmp $len,#512
- b.hs .L512_or_more_neon
-#endif
-
- sub sp,sp,#64
-
- ldp @d[0],@d[1],[@x[0]] // load sigma
- ld1 {@K[0]},[@x[0]],#16
- ldp @d[2],@d[3],[$key] // load key
- ldp @d[4],@d[5],[$key,#16]
- ld1 {@K[1],@K[2]},[$key]
- ldp @d[6],@d[7],[$ctr] // load counter
- ld1 {@K[3]},[$ctr]
- ld1 {$ONE},[@x[0]]
-#ifdef __AARCH64EB__
- rev64 @K[0],@K[0]
- ror @d[2],@d[2],#32
- ror @d[3],@d[3],#32
- ror @d[4],@d[4],#32
- ror @d[5],@d[5],#32
- ror @d[6],@d[6],#32
- ror @d[7],@d[7],#32
-#endif
- add @K[3],@K[3],$ONE // += 1
- add @K[4],@K[3],$ONE
- add @K[5],@K[4],$ONE
- shl $ONE,$ONE,#2 // 1 -> 4
-
-.Loop_outer_neon:
- mov.32 @x[0],@d[0] // unpack key block
- lsr @x[1],@d[0],#32
- mov $A0,@K[0]
- mov.32 @x[2],@d[1]
- lsr @x[3],@d[1],#32
- mov $A1,@K[0]
- mov.32 @x[4],@d[2]
- lsr @x[5],@d[2],#32
- mov $A2,@K[0]
- mov.32 @x[6],@d[3]
- mov $B0,@K[1]
- lsr @x[7],@d[3],#32
- mov $B1,@K[1]
- mov.32 @x[8],@d[4]
- mov $B2,@K[1]
- lsr @x[9],@d[4],#32
- mov $D0,@K[3]
- mov.32 @x[10],@d[5]
- mov $D1,@K[4]
- lsr @x[11],@d[5],#32
- mov $D2,@K[5]
- mov.32 @x[12],@d[6]
- mov $C0,@K[2]
- lsr @x[13],@d[6],#32
- mov $C1,@K[2]
- mov.32 @x[14],@d[7]
- mov $C2,@K[2]
- lsr @x[15],@d[7],#32
-
- mov $ctr,#10
- subs $len,$len,#256
-.Loop_neon:
- sub $ctr,$ctr,#1
-___
- my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
- my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
- my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
- my @thread3=&ROUND(0,4,8,12);
-
- foreach (@thread0) {
- eval; eval(shift(@thread3));
- eval(shift(@thread1)); eval(shift(@thread3));
- eval(shift(@thread2)); eval(shift(@thread3));
- }
-
- @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
- @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
- @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
- @thread3=&ROUND(0,5,10,15);
-
- foreach (@thread0) {
- eval; eval(shift(@thread3));
- eval(shift(@thread1)); eval(shift(@thread3));
- eval(shift(@thread2)); eval(shift(@thread3));
- }
-$code.=<<___;
- cbnz $ctr,.Loop_neon
-
- add.32 @x[0],@x[0],@d[0] // accumulate key block
- add $A0,$A0,@K[0]
- add @x[1],@x[1],@d[0],lsr#32
- add $A1,$A1,@K[0]
- add.32 @x[2],@x[2],@d[1]
- add $A2,$A2,@K[0]
- add @x[3],@x[3],@d[1],lsr#32
- add $C0,$C0,@K[2]
- add.32 @x[4],@x[4],@d[2]
- add $C1,$C1,@K[2]
- add @x[5],@x[5],@d[2],lsr#32
- add $C2,$C2,@K[2]
- add.32 @x[6],@x[6],@d[3]
- add $D0,$D0,@K[3]
- add @x[7],@x[7],@d[3],lsr#32
- add.32 @x[8],@x[8],@d[4]
- add $D1,$D1,@K[4]
- add @x[9],@x[9],@d[4],lsr#32
- add.32 @x[10],@x[10],@d[5]
- add $D2,$D2,@K[5]
- add @x[11],@x[11],@d[5],lsr#32
- add.32 @x[12],@x[12],@d[6]
- add $B0,$B0,@K[1]
- add @x[13],@x[13],@d[6],lsr#32
- add.32 @x[14],@x[14],@d[7]
- add $B1,$B1,@K[1]
- add @x[15],@x[15],@d[7],lsr#32
- add $B2,$B2,@K[1]
-
- b.lo .Ltail_neon
-
- add @x[0],@x[0],@x[1],lsl#32 // pack
- add @x[2],@x[2],@x[3],lsl#32
- ldp @x[1],@x[3],[$inp,#0] // load input
- add @x[4],@x[4],@x[5],lsl#32
- add @x[6],@x[6],@x[7],lsl#32
- ldp @x[5],@x[7],[$inp,#16]
- add @x[8],@x[8],@x[9],lsl#32
- add @x[10],@x[10],@x[11],lsl#32
- ldp @x[9],@x[11],[$inp,#32]
- add @x[12],@x[12],@x[13],lsl#32
- add @x[14],@x[14],@x[15],lsl#32
- ldp @x[13],@x[15],[$inp,#48]
- add $inp,$inp,#64
-#ifdef __AARCH64EB__
- rev @x[0],@x[0]
- rev @x[2],@x[2]
- rev @x[4],@x[4]
- rev @x[6],@x[6]
- rev @x[8],@x[8]
- rev @x[10],@x[10]
- rev @x[12],@x[12]
- rev @x[14],@x[14]
-#endif
- ld1.8 {$T0-$T3},[$inp],#64
- eor @x[0],@x[0],@x[1]
- eor @x[2],@x[2],@x[3]
- eor @x[4],@x[4],@x[5]
- eor @x[6],@x[6],@x[7]
- eor @x[8],@x[8],@x[9]
- eor $A0,$A0,$T0
- eor @x[10],@x[10],@x[11]
- eor $B0,$B0,$T1
- eor @x[12],@x[12],@x[13]
- eor $C0,$C0,$T2
- eor @x[14],@x[14],@x[15]
- eor $D0,$D0,$T3
- ld1.8 {$T0-$T3},[$inp],#64
-
- stp @x[0],@x[2],[$out,#0] // store output
- add @d[6],@d[6],#4 // increment counter
- stp @x[4],@x[6],[$out,#16]
- add @K[3],@K[3],$ONE // += 4
- stp @x[8],@x[10],[$out,#32]
- add @K[4],@K[4],$ONE
- stp @x[12],@x[14],[$out,#48]
- add @K[5],@K[5],$ONE
- add $out,$out,#64
-
- st1.8 {$A0-$D0},[$out],#64
- ld1.8 {$A0-$D0},[$inp],#64
-
- eor $A1,$A1,$T0
- eor $B1,$B1,$T1
- eor $C1,$C1,$T2
- eor $D1,$D1,$T3
- st1.8 {$A1-$D1},[$out],#64
-
- eor $A2,$A2,$A0
- eor $B2,$B2,$B0
- eor $C2,$C2,$C0
- eor $D2,$D2,$D0
- st1.8 {$A2-$D2},[$out],#64
-
- b.hi .Loop_outer_neon
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- ret
-
-.Ltail_neon:
- add $len,$len,#256
- cmp $len,#64
- b.lo .Less_than_64
-
- add @x[0],@x[0],@x[1],lsl#32 // pack
- add @x[2],@x[2],@x[3],lsl#32
- ldp @x[1],@x[3],[$inp,#0] // load input
- add @x[4],@x[4],@x[5],lsl#32
- add @x[6],@x[6],@x[7],lsl#32
- ldp @x[5],@x[7],[$inp,#16]
- add @x[8],@x[8],@x[9],lsl#32
- add @x[10],@x[10],@x[11],lsl#32
- ldp @x[9],@x[11],[$inp,#32]
- add @x[12],@x[12],@x[13],lsl#32
- add @x[14],@x[14],@x[15],lsl#32
- ldp @x[13],@x[15],[$inp,#48]
- add $inp,$inp,#64
-#ifdef __AARCH64EB__
- rev @x[0],@x[0]
- rev @x[2],@x[2]
- rev @x[4],@x[4]
- rev @x[6],@x[6]
- rev @x[8],@x[8]
- rev @x[10],@x[10]
- rev @x[12],@x[12]
- rev @x[14],@x[14]
-#endif
- eor @x[0],@x[0],@x[1]
- eor @x[2],@x[2],@x[3]
- eor @x[4],@x[4],@x[5]
- eor @x[6],@x[6],@x[7]
- eor @x[8],@x[8],@x[9]
- eor @x[10],@x[10],@x[11]
- eor @x[12],@x[12],@x[13]
- eor @x[14],@x[14],@x[15]
-
- stp @x[0],@x[2],[$out,#0] // store output
- add @d[6],@d[6],#4 // increment counter
- stp @x[4],@x[6],[$out,#16]
- stp @x[8],@x[10],[$out,#32]
- stp @x[12],@x[14],[$out,#48]
- add $out,$out,#64
- b.eq .Ldone_neon
- sub $len,$len,#64
- cmp $len,#64
- b.lo .Less_than_128
-
- ld1.8 {$T0-$T3},[$inp],#64
- eor $A0,$A0,$T0
- eor $B0,$B0,$T1
- eor $C0,$C0,$T2
- eor $D0,$D0,$T3
- st1.8 {$A0-$D0},[$out],#64
- b.eq .Ldone_neon
- sub $len,$len,#64
- cmp $len,#64
- b.lo .Less_than_192
-
- ld1.8 {$T0-$T3},[$inp],#64
- eor $A1,$A1,$T0
- eor $B1,$B1,$T1
- eor $C1,$C1,$T2
- eor $D1,$D1,$T3
- st1.8 {$A1-$D1},[$out],#64
- b.eq .Ldone_neon
- sub $len,$len,#64
-
- st1.8 {$A2-$D2},[sp]
- b .Last_neon
-
-.Less_than_128:
- st1.8 {$A0-$D0},[sp]
- b .Last_neon
-.Less_than_192:
- st1.8 {$A1-$D1},[sp]
- b .Last_neon
-
-.align 4
-.Last_neon:
- sub $out,$out,#1
- add $inp,$inp,$len
- add $out,$out,$len
- add $ctr,sp,$len
- neg $len,$len
-
-.Loop_tail_neon:
- ldrb w10,[$inp,$len]
- ldrb w11,[$ctr,$len]
- add $len,$len,#1
- eor w10,w10,w11
- strb w10,[$out,$len]
- cbnz $len,.Loop_tail_neon
-
- stp xzr,xzr,[sp,#0]
- stp xzr,xzr,[sp,#16]
- stp xzr,xzr,[sp,#32]
- stp xzr,xzr,[sp,#48]
-
-.Ldone_neon:
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- ret
-.size ChaCha20_neon,.-ChaCha20_neon
-___
-{
-my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
-my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
- $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));
-
-$code.=<<___;
-#ifdef __APPLE__
-.type ChaCha20_512_neon,%function
-.align 5
-ChaCha20_512_neon:
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adr @x[0],.Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
-
-.L512_or_more_neon:
- sub sp,sp,#128+64
-
- ldp @d[0],@d[1],[@x[0]] // load sigma
- ld1 {@K[0]},[@x[0]],#16
- ldp @d[2],@d[3],[$key] // load key
- ldp @d[4],@d[5],[$key,#16]
- ld1 {@K[1],@K[2]},[$key]
- ldp @d[6],@d[7],[$ctr] // load counter
- ld1 {@K[3]},[$ctr]
- ld1 {$ONE},[@x[0]]
-# ifdef __AARCH64EB__
- rev64 @K[0],@K[0]
- ror @d[2],@d[2],#32
- ror @d[3],@d[3],#32
- ror @d[4],@d[4],#32
- ror @d[5],@d[5],#32
- ror @d[6],@d[6],#32
- ror @d[7],@d[7],#32
-# endif
- add @K[3],@K[3],$ONE // += 1
- stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part
- add @K[3],@K[3],$ONE // not typo
- str @K[2],[sp,#32]
- add @K[4],@K[3],$ONE
- add @K[5],@K[4],$ONE
- add @K[6],@K[5],$ONE
- shl $ONE,$ONE,#2 // 1 -> 4
-
- stp d8,d9,[sp,#128+0] // meet ABI requirements
- stp d10,d11,[sp,#128+16]
- stp d12,d13,[sp,#128+32]
- stp d14,d15,[sp,#128+48]
-
- sub $len,$len,#512 // not typo
-
-.Loop_outer_512_neon:
- mov $A0,@K[0]
- mov $A1,@K[0]
- mov $A2,@K[0]
- mov $A3,@K[0]
- mov $A4,@K[0]
- mov $A5,@K[0]
- mov $B0,@K[1]
- mov.32 @x[0],@d[0] // unpack key block
- mov $B1,@K[1]
- lsr @x[1],@d[0],#32
- mov $B2,@K[1]
- mov.32 @x[2],@d[1]
- mov $B3,@K[1]
- lsr @x[3],@d[1],#32
- mov $B4,@K[1]
- mov.32 @x[4],@d[2]
- mov $B5,@K[1]
- lsr @x[5],@d[2],#32
- mov $D0,@K[3]
- mov.32 @x[6],@d[3]
- mov $D1,@K[4]
- lsr @x[7],@d[3],#32
- mov $D2,@K[5]
- mov.32 @x[8],@d[4]
- mov $D3,@K[6]
- lsr @x[9],@d[4],#32
- mov $C0,@K[2]
- mov.32 @x[10],@d[5]
- mov $C1,@K[2]
- lsr @x[11],@d[5],#32
- add $D4,$D0,$ONE // +4
- mov.32 @x[12],@d[6]
- add $D5,$D1,$ONE // +4
- lsr @x[13],@d[6],#32
- mov $C2,@K[2]
- mov.32 @x[14],@d[7]
- mov $C3,@K[2]
- lsr @x[15],@d[7],#32
- mov $C4,@K[2]
- stp @K[3],@K[4],[sp,#48] // off-load key block, variable part
- mov $C5,@K[2]
- str @K[5],[sp,#80]
-
- mov $ctr,#5
- subs $len,$len,#512
-.Loop_upper_neon:
- sub $ctr,$ctr,#1
-___
- my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
- my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
- my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
- my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
- my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
- my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
- my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
- my $diff = ($#thread0+1)*6 - $#thread67 - 1;
- my $i = 0;
-
- foreach (@thread0) {
- eval; eval(shift(@thread67));
- eval(shift(@thread1)); eval(shift(@thread67));
- eval(shift(@thread2)); eval(shift(@thread67));
- eval(shift(@thread3)); eval(shift(@thread67));
- eval(shift(@thread4)); eval(shift(@thread67));
- eval(shift(@thread5)); eval(shift(@thread67));
- }
-
- @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
- @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
- @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
- @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
- @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
- @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
- @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
-
- foreach (@thread0) {
- eval; eval(shift(@thread67));
- eval(shift(@thread1)); eval(shift(@thread67));
- eval(shift(@thread2)); eval(shift(@thread67));
- eval(shift(@thread3)); eval(shift(@thread67));
- eval(shift(@thread4)); eval(shift(@thread67));
- eval(shift(@thread5)); eval(shift(@thread67));
- }
-$code.=<<___;
- cbnz $ctr,.Loop_upper_neon
-
- add.32 @x[0],@x[0],@d[0] // accumulate key block
- add @x[1],@x[1],@d[0],lsr#32
- add.32 @x[2],@x[2],@d[1]
- add @x[3],@x[3],@d[1],lsr#32
- add.32 @x[4],@x[4],@d[2]
- add @x[5],@x[5],@d[2],lsr#32
- add.32 @x[6],@x[6],@d[3]
- add @x[7],@x[7],@d[3],lsr#32
- add.32 @x[8],@x[8],@d[4]
- add @x[9],@x[9],@d[4],lsr#32
- add.32 @x[10],@x[10],@d[5]
- add @x[11],@x[11],@d[5],lsr#32
- add.32 @x[12],@x[12],@d[6]
- add @x[13],@x[13],@d[6],lsr#32
- add.32 @x[14],@x[14],@d[7]
- add @x[15],@x[15],@d[7],lsr#32
-
- add @x[0],@x[0],@x[1],lsl#32 // pack
- add @x[2],@x[2],@x[3],lsl#32
- ldp @x[1],@x[3],[$inp,#0] // load input
- add @x[4],@x[4],@x[5],lsl#32
- add @x[6],@x[6],@x[7],lsl#32
- ldp @x[5],@x[7],[$inp,#16]
- add @x[8],@x[8],@x[9],lsl#32
- add @x[10],@x[10],@x[11],lsl#32
- ldp @x[9],@x[11],[$inp,#32]
- add @x[12],@x[12],@x[13],lsl#32
- add @x[14],@x[14],@x[15],lsl#32
- ldp @x[13],@x[15],[$inp,#48]
- add $inp,$inp,#64
-# ifdef __AARCH64EB__
- rev @x[0],@x[0]
- rev @x[2],@x[2]
- rev @x[4],@x[4]
- rev @x[6],@x[6]
- rev @x[8],@x[8]
- rev @x[10],@x[10]
- rev @x[12],@x[12]
- rev @x[14],@x[14]
-# endif
- eor @x[0],@x[0],@x[1]
- eor @x[2],@x[2],@x[3]
- eor @x[4],@x[4],@x[5]
- eor @x[6],@x[6],@x[7]
- eor @x[8],@x[8],@x[9]
- eor @x[10],@x[10],@x[11]
- eor @x[12],@x[12],@x[13]
- eor @x[14],@x[14],@x[15]
-
- stp @x[0],@x[2],[$out,#0] // store output
- add @d[6],@d[6],#1 // increment counter
- mov.32 @x[0],@d[0] // unpack key block
- lsr @x[1],@d[0],#32
- stp @x[4],@x[6],[$out,#16]
- mov.32 @x[2],@d[1]
- lsr @x[3],@d[1],#32
- stp @x[8],@x[10],[$out,#32]
- mov.32 @x[4],@d[2]
- lsr @x[5],@d[2],#32
- stp @x[12],@x[14],[$out,#48]
- add $out,$out,#64
- mov.32 @x[6],@d[3]
- lsr @x[7],@d[3],#32
- mov.32 @x[8],@d[4]
- lsr @x[9],@d[4],#32
- mov.32 @x[10],@d[5]
- lsr @x[11],@d[5],#32
- mov.32 @x[12],@d[6]
- lsr @x[13],@d[6],#32
- mov.32 @x[14],@d[7]
- lsr @x[15],@d[7],#32
-
- mov $ctr,#5
-.Loop_lower_neon:
- sub $ctr,$ctr,#1
-___
- @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
- @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
- @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
- @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
- @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
- @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
- @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
-
- foreach (@thread0) {
- eval; eval(shift(@thread67));
- eval(shift(@thread1)); eval(shift(@thread67));
- eval(shift(@thread2)); eval(shift(@thread67));
- eval(shift(@thread3)); eval(shift(@thread67));
- eval(shift(@thread4)); eval(shift(@thread67));
- eval(shift(@thread5)); eval(shift(@thread67));
- }
-
- @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
- @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
- @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
- @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
- @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
- @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
- @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
-
- foreach (@thread0) {
- eval; eval(shift(@thread67));
- eval(shift(@thread1)); eval(shift(@thread67));
- eval(shift(@thread2)); eval(shift(@thread67));
- eval(shift(@thread3)); eval(shift(@thread67));
- eval(shift(@thread4)); eval(shift(@thread67));
- eval(shift(@thread5)); eval(shift(@thread67));
- }
-$code.=<<___;
- cbnz $ctr,.Loop_lower_neon
-
- add.32 @x[0],@x[0],@d[0] // accumulate key block
- ldp @K[0],@K[1],[sp,#0]
- add @x[1],@x[1],@d[0],lsr#32
- ldp @K[2],@K[3],[sp,#32]
- add.32 @x[2],@x[2],@d[1]
- ldp @K[4],@K[5],[sp,#64]
- add @x[3],@x[3],@d[1],lsr#32
- add $A0,$A0,@K[0]
- add.32 @x[4],@x[4],@d[2]
- add $A1,$A1,@K[0]
- add @x[5],@x[5],@d[2],lsr#32
- add $A2,$A2,@K[0]
- add.32 @x[6],@x[6],@d[3]
- add $A3,$A3,@K[0]
- add @x[7],@x[7],@d[3],lsr#32
- add $A4,$A4,@K[0]
- add.32 @x[8],@x[8],@d[4]
- add $A5,$A5,@K[0]
- add @x[9],@x[9],@d[4],lsr#32
- add $C0,$C0,@K[2]
- add.32 @x[10],@x[10],@d[5]
- add $C1,$C1,@K[2]
- add @x[11],@x[11],@d[5],lsr#32
- add $C2,$C2,@K[2]
- add.32 @x[12],@x[12],@d[6]
- add $C3,$C3,@K[2]
- add @x[13],@x[13],@d[6],lsr#32
- add $C4,$C4,@K[2]
- add.32 @x[14],@x[14],@d[7]
- add $C5,$C5,@K[2]
- add @x[15],@x[15],@d[7],lsr#32
- add $D4,$D4,$ONE // +4
- add @x[0],@x[0],@x[1],lsl#32 // pack
- add $D5,$D5,$ONE // +4
- add @x[2],@x[2],@x[3],lsl#32
- add $D0,$D0,@K[3]
- ldp @x[1],@x[3],[$inp,#0] // load input
- add $D1,$D1,@K[4]
- add @x[4],@x[4],@x[5],lsl#32
- add $D2,$D2,@K[5]
- add @x[6],@x[6],@x[7],lsl#32
- add $D3,$D3,@K[6]
- ldp @x[5],@x[7],[$inp,#16]
- add $D4,$D4,@K[3]
- add @x[8],@x[8],@x[9],lsl#32
- add $D5,$D5,@K[4]
- add @x[10],@x[10],@x[11],lsl#32
- add $B0,$B0,@K[1]
- ldp @x[9],@x[11],[$inp,#32]
- add $B1,$B1,@K[1]
- add @x[12],@x[12],@x[13],lsl#32
- add $B2,$B2,@K[1]
- add @x[14],@x[14],@x[15],lsl#32
- add $B3,$B3,@K[1]
- ldp @x[13],@x[15],[$inp,#48]
- add $B4,$B4,@K[1]
- add $inp,$inp,#64
- add $B5,$B5,@K[1]
-
-# ifdef __AARCH64EB__
- rev @x[0],@x[0]
- rev @x[2],@x[2]
- rev @x[4],@x[4]
- rev @x[6],@x[6]
- rev @x[8],@x[8]
- rev @x[10],@x[10]
- rev @x[12],@x[12]
- rev @x[14],@x[14]
-# endif
- ld1.8 {$T0-$T3},[$inp],#64
- eor @x[0],@x[0],@x[1]
- eor @x[2],@x[2],@x[3]
- eor @x[4],@x[4],@x[5]
- eor @x[6],@x[6],@x[7]
- eor @x[8],@x[8],@x[9]
- eor $A0,$A0,$T0
- eor @x[10],@x[10],@x[11]
- eor $B0,$B0,$T1
- eor @x[12],@x[12],@x[13]
- eor $C0,$C0,$T2
- eor @x[14],@x[14],@x[15]
- eor $D0,$D0,$T3
- ld1.8 {$T0-$T3},[$inp],#64
-
- stp @x[0],@x[2],[$out,#0] // store output
- add @d[6],@d[6],#7 // increment counter
- stp @x[4],@x[6],[$out,#16]
- stp @x[8],@x[10],[$out,#32]
- stp @x[12],@x[14],[$out,#48]
- add $out,$out,#64
- st1.8 {$A0-$D0},[$out],#64
-
- ld1.8 {$A0-$D0},[$inp],#64
- eor $A1,$A1,$T0
- eor $B1,$B1,$T1
- eor $C1,$C1,$T2
- eor $D1,$D1,$T3
- st1.8 {$A1-$D1},[$out],#64
-
- ld1.8 {$A1-$D1},[$inp],#64
- eor $A2,$A2,$A0
- ldp @K[0],@K[1],[sp,#0]
- eor $B2,$B2,$B0
- ldp @K[2],@K[3],[sp,#32]
- eor $C2,$C2,$C0
- eor $D2,$D2,$D0
- st1.8 {$A2-$D2},[$out],#64
-
- ld1.8 {$A2-$D2},[$inp],#64
- eor $A3,$A3,$A1
- eor $B3,$B3,$B1
- eor $C3,$C3,$C1
- eor $D3,$D3,$D1
- st1.8 {$A3-$D3},[$out],#64
-
- ld1.8 {$A3-$D3},[$inp],#64
- eor $A4,$A4,$A2
- eor $B4,$B4,$B2
- eor $C4,$C4,$C2
- eor $D4,$D4,$D2
- st1.8 {$A4-$D4},[$out],#64
-
- shl $A0,$ONE,#1 // 4 -> 8
- eor $A5,$A5,$A3
- eor $B5,$B5,$B3
- eor $C5,$C5,$C3
- eor $D5,$D5,$D3
- st1.8 {$A5-$D5},[$out],#64
-
- add @K[3],@K[3],$A0 // += 8
- add @K[4],@K[4],$A0
- add @K[5],@K[5],$A0
- add @K[6],@K[6],$A0
-
- b.hs .Loop_outer_512_neon
-
- adds $len,$len,#512
- ushr $A0,$ONE,#2 // 4 -> 1
-
- ldp d8,d9,[sp,#128+0] // meet ABI requirements
- ldp d10,d11,[sp,#128+16]
- ldp d12,d13,[sp,#128+32]
- ldp d14,d15,[sp,#128+48]
-
- stp @K[0],$ONE,[sp,#0] // wipe off-load area
- stp @K[0],$ONE,[sp,#32]
- stp @K[0],$ONE,[sp,#64]
-
- b.eq .Ldone_512_neon
-
- cmp $len,#192
- sub @K[3],@K[3],$A0 // -= 1
- sub @K[4],@K[4],$A0
- sub @K[5],@K[5],$A0
- add sp,sp,#128
- b.hs .Loop_outer_neon
-
- eor @K[1],@K[1],@K[1]
- eor @K[2],@K[2],@K[2]
- eor @K[3],@K[3],@K[3]
- eor @K[4],@K[4],@K[4]
- eor @K[5],@K[5],@K[5]
- eor @K[6],@K[6],@K[6]
- b .Loop_outer
-
-.Ldone_512_neon:
- ldp x19,x20,[x29,#16]
- add sp,sp,#128+64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- ret
-.size ChaCha20_512_neon,.-ChaCha20_512_neon
-#endif
-#endif
-___
-}
-}}}
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/\/\// and !/^$/);
- print;
-}
-close SELF;
-
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/geo;
-
- (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or
- (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or
- (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or
- (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or
- (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));
-
- print $_,"\n";
-}
-close STDOUT; # flush
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips-glue.c b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips-glue.c
deleted file mode 100644
index 96ce01e2c133..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips-glue.c
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-asmlinkage void chacha20_mips(u32 state[16], u8 *out, const u8 *in,
- const size_t len);
-static bool *const chacha20_nobs[] __initconst = { };
-static void __init chacha20_fpu_init(void)
-{
-}
-
-static inline bool chacha20_arch(struct chacha20_ctx *ctx, u8 *dst,
- const u8 *src, size_t len,
- simd_context_t *simd_context)
-{
- chacha20_mips(ctx->state, dst, src, len);
- return true;
-}
-
-static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS],
- const u8 nonce[HCHACHA20_NONCE_SIZE],
- const u8 key[HCHACHA20_KEY_SIZE],
- simd_context_t *simd_context)
-{
- return false;
-}
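
The glue file above follows the zinc dispatch convention: chacha20_arch() returns true when an architecture-specific routine handled the request, and false when the caller should fall back to the portable implementation (on MIPS, hchacha20_arch() always defers). A minimal C sketch of that dispatch, assuming the kernel typedefs used above and a hypothetical portable fallback named chacha20_generic():

	static void chacha20_dispatch(struct chacha20_ctx *ctx, u8 *dst, const u8 *src,
	                              size_t len, simd_context_t *simd_context)
	{
		/* Try the architecture-specific path first; it reports whether
		 * it consumed the data. */
		if (chacha20_arch(ctx, dst, src, len, simd_context))
			return;
		/* Hypothetical portable fallback, named here for illustration only. */
		chacha20_generic(ctx, dst, src, len);
	}
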
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips.S b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips.S
deleted file mode 100644
index a81e02db95e7..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips.S
+++ /dev/null
@@ -1,424 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#define MASK_U32 0x3c
-#define CHACHA20_BLOCK_SIZE 64
-#define STACK_SIZE 32
-
-#define X0 $t0
-#define X1 $t1
-#define X2 $t2
-#define X3 $t3
-#define X4 $t4
-#define X5 $t5
-#define X6 $t6
-#define X7 $t7
-#define X8 $t8
-#define X9 $t9
-#define X10 $v1
-#define X11 $s6
-#define X12 $s5
-#define X13 $s4
-#define X14 $s3
-#define X15 $s2
-/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
-#define T0 $s1
-#define T1 $s0
-#define T(n) T ## n
-#define X(n) X ## n
-
-/* Input arguments */
-#define STATE $a0
-#define OUT $a1
-#define IN $a2
-#define BYTES $a3
-
-/* Output argument */
-/* NONCE[0] is kept in a register and not in memory.
- * We don't want to touch the original value in memory.
- * It must be incremented on every loop iteration.
- */
-#define NONCE_0 $v0
-
-/* SAVED_X and SAVED_CA are set in the jump table.
- * Use regs which are overwritten on exit so we don't leak clear data.
- * They are used to handle the last bytes, which are not a multiple of 4.
- */
-#define SAVED_X X15
-#define SAVED_CA $s7
-
-#define IS_UNALIGNED $s7
-
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-#define MSB 0
-#define LSB 3
-#define ROTx rotl
-#define ROTR(n) rotr n, 24
-#define CPU_TO_LE32(n) \
- wsbh n; \
- rotr n, 16;
-#else
-#define MSB 3
-#define LSB 0
-#define ROTx rotr
-#define CPU_TO_LE32(n)
-#define ROTR(n)
-#endif
-
-#define FOR_EACH_WORD(x) \
- x( 0); \
- x( 1); \
- x( 2); \
- x( 3); \
- x( 4); \
- x( 5); \
- x( 6); \
- x( 7); \
- x( 8); \
- x( 9); \
- x(10); \
- x(11); \
- x(12); \
- x(13); \
- x(14); \
- x(15);
-
-#define FOR_EACH_WORD_REV(x) \
- x(15); \
- x(14); \
- x(13); \
- x(12); \
- x(11); \
- x(10); \
- x( 9); \
- x( 8); \
- x( 7); \
- x( 6); \
- x( 5); \
- x( 4); \
- x( 3); \
- x( 2); \
- x( 1); \
- x( 0);
-
-#define PLUS_ONE_0 1
-#define PLUS_ONE_1 2
-#define PLUS_ONE_2 3
-#define PLUS_ONE_3 4
-#define PLUS_ONE_4 5
-#define PLUS_ONE_5 6
-#define PLUS_ONE_6 7
-#define PLUS_ONE_7 8
-#define PLUS_ONE_8 9
-#define PLUS_ONE_9 10
-#define PLUS_ONE_10 11
-#define PLUS_ONE_11 12
-#define PLUS_ONE_12 13
-#define PLUS_ONE_13 14
-#define PLUS_ONE_14 15
-#define PLUS_ONE_15 16
-#define PLUS_ONE(x) PLUS_ONE_ ## x
-#define _CONCAT3(a,b,c) a ## b ## c
-#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
-
-#define STORE_UNALIGNED(x) \
-CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
- .if (x != 12); \
- lw T0, (x*4)(STATE); \
- .endif; \
- lwl T1, (x*4)+MSB ## (IN); \
- lwr T1, (x*4)+LSB ## (IN); \
- .if (x == 12); \
- addu X ## x, NONCE_0; \
- .else; \
- addu X ## x, T0; \
- .endif; \
- CPU_TO_LE32(X ## x); \
- xor X ## x, T1; \
- swl X ## x, (x*4)+MSB ## (OUT); \
- swr X ## x, (x*4)+LSB ## (OUT);
-
-#define STORE_ALIGNED(x) \
-CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
- .if (x != 12); \
- lw T0, (x*4)(STATE); \
- .endif; \
- lw T1, (x*4) ## (IN); \
- .if (x == 12); \
- addu X ## x, NONCE_0; \
- .else; \
- addu X ## x, T0; \
- .endif; \
- CPU_TO_LE32(X ## x); \
- xor X ## x, T1; \
- sw X ## x, (x*4) ## (OUT);
-
-/* Jump table macro.
- * Used for setup and for handling the last bytes, which are not a multiple of 4.
- * X15 is free to store Xn.
- * Every jump table entry must be equal in size.
- */
-#define JMPTBL_ALIGNED(x) \
-.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
- .set noreorder; \
- b .Lchacha20_mips_xor_aligned_ ## x ## _b; \
- .if (x == 12); \
- addu SAVED_X, X ## x, NONCE_0; \
- .else; \
- addu SAVED_X, X ## x, SAVED_CA; \
- .endif; \
- .set reorder
-
-#define JMPTBL_UNALIGNED(x) \
-.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
- .set noreorder; \
- b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \
- .if (x == 12); \
- addu SAVED_X, X ## x, NONCE_0; \
- .else; \
- addu SAVED_X, X ## x, SAVED_CA; \
- .endif; \
- .set reorder
-
-#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
- addu X(A), X(K); \
- addu X(B), X(L); \
- addu X(C), X(M); \
- addu X(D), X(N); \
- xor X(V), X(A); \
- xor X(W), X(B); \
- xor X(Y), X(C); \
- xor X(Z), X(D); \
- rotl X(V), S; \
- rotl X(W), S; \
- rotl X(Y), S; \
- rotl X(Z), S;
-
-.text
-.set reorder
-.set noat
-.globl chacha20_mips
-.ent chacha20_mips
-chacha20_mips:
- .frame $sp, STACK_SIZE, $ra
-
- addiu $sp, -STACK_SIZE
-
-	/* Return if BYTES == 0. */
- beqz BYTES, .Lchacha20_mips_end
-
- lw NONCE_0, 48(STATE)
-
- /* Save s0-s7 */
- sw $s0, 0($sp)
- sw $s1, 4($sp)
- sw $s2, 8($sp)
- sw $s3, 12($sp)
- sw $s4, 16($sp)
- sw $s5, 20($sp)
- sw $s6, 24($sp)
- sw $s7, 28($sp)
-
-	/* Test whether IN or OUT is unaligned.
- * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
- */
- or IS_UNALIGNED, IN, OUT
- andi IS_UNALIGNED, 0x3
-
- /* Set number of rounds */
- li $at, 20
-
- b .Lchacha20_rounds_start
-
-.align 4
-.Loop_chacha20_rounds:
- addiu IN, CHACHA20_BLOCK_SIZE
- addiu OUT, CHACHA20_BLOCK_SIZE
- addiu NONCE_0, 1
-
-.Lchacha20_rounds_start:
- lw X0, 0(STATE)
- lw X1, 4(STATE)
- lw X2, 8(STATE)
- lw X3, 12(STATE)
-
- lw X4, 16(STATE)
- lw X5, 20(STATE)
- lw X6, 24(STATE)
- lw X7, 28(STATE)
- lw X8, 32(STATE)
- lw X9, 36(STATE)
- lw X10, 40(STATE)
- lw X11, 44(STATE)
-
- move X12, NONCE_0
- lw X13, 52(STATE)
- lw X14, 56(STATE)
- lw X15, 60(STATE)
-
-.Loop_chacha20_xor_rounds:
- addiu $at, -2
- AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
- AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
- AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
- AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
- AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
- AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
- AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
- AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
- bnez $at, .Loop_chacha20_xor_rounds
-
- addiu BYTES, -(CHACHA20_BLOCK_SIZE)
-
- /* Is data src/dst unaligned? Jump */
- bnez IS_UNALIGNED, .Loop_chacha20_unaligned
-
-	/* Set the number of rounds here to fill the delay slot. */
- li $at, 20
-
- /* BYTES < 0, it has no full block. */
- bltz BYTES, .Lchacha20_mips_no_full_block_aligned
-
- FOR_EACH_WORD_REV(STORE_ALIGNED)
-
- /* BYTES > 0? Loop again. */
- bgtz BYTES, .Loop_chacha20_rounds
-
- /* Place this here to fill delay slot */
- addiu NONCE_0, 1
-
- /* BYTES < 0? Handle last bytes */
- bltz BYTES, .Lchacha20_mips_xor_bytes
-
-.Lchacha20_mips_xor_done:
- /* Restore used registers */
- lw $s0, 0($sp)
- lw $s1, 4($sp)
- lw $s2, 8($sp)
- lw $s3, 12($sp)
- lw $s4, 16($sp)
- lw $s5, 20($sp)
- lw $s6, 24($sp)
- lw $s7, 28($sp)
-
- /* Write NONCE_0 back to right location in state */
- sw NONCE_0, 48(STATE)
-
-.Lchacha20_mips_end:
- addiu $sp, STACK_SIZE
- jr $ra
-
-.Lchacha20_mips_no_full_block_aligned:
- /* Restore the offset on BYTES */
- addiu BYTES, CHACHA20_BLOCK_SIZE
-
- /* Get number of full WORDS */
- andi $at, BYTES, MASK_U32
-
- /* Load upper half of jump table addr */
- lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
-
- /* Calculate lower half jump table offset */
- ins T0, $at, 1, 6
-
- /* Add offset to STATE */
- addu T1, STATE, $at
-
- /* Add lower half jump table addr */
- addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
-
- /* Read value from STATE */
- lw SAVED_CA, 0(T1)
-
- /* Store remaining bytecounter as negative value */
- subu BYTES, $at, BYTES
-
- jr T0
-
- /* Jump table */
- FOR_EACH_WORD(JMPTBL_ALIGNED)
-
-
-.Loop_chacha20_unaligned:
-	/* Set the number of rounds here to fill the delay slot. */
- li $at, 20
-
-	/* BYTES < 0, it has no full block. */
- bltz BYTES, .Lchacha20_mips_no_full_block_unaligned
-
- FOR_EACH_WORD_REV(STORE_UNALIGNED)
-
- /* BYTES > 0? Loop again. */
- bgtz BYTES, .Loop_chacha20_rounds
-
- /* Write NONCE_0 back to right location in state */
- sw NONCE_0, 48(STATE)
-
- .set noreorder
- /* Fall through to byte handling */
- bgez BYTES, .Lchacha20_mips_xor_done
-.Lchacha20_mips_xor_unaligned_0_b:
-.Lchacha20_mips_xor_aligned_0_b:
- /* Place this here to fill delay slot */
- addiu NONCE_0, 1
- .set reorder
-
-.Lchacha20_mips_xor_bytes:
- addu IN, $at
- addu OUT, $at
- /* First byte */
- lbu T1, 0(IN)
- addiu $at, BYTES, 1
- CPU_TO_LE32(SAVED_X)
- ROTR(SAVED_X)
- xor T1, SAVED_X
- sb T1, 0(OUT)
- beqz $at, .Lchacha20_mips_xor_done
- /* Second byte */
- lbu T1, 1(IN)
- addiu $at, BYTES, 2
- ROTx SAVED_X, 8
- xor T1, SAVED_X
- sb T1, 1(OUT)
- beqz $at, .Lchacha20_mips_xor_done
- /* Third byte */
- lbu T1, 2(IN)
- ROTx SAVED_X, 8
- xor T1, SAVED_X
- sb T1, 2(OUT)
- b .Lchacha20_mips_xor_done
-
-.Lchacha20_mips_no_full_block_unaligned:
- /* Restore the offset on BYTES */
- addiu BYTES, CHACHA20_BLOCK_SIZE
-
- /* Get number of full WORDS */
- andi $at, BYTES, MASK_U32
-
- /* Load upper half of jump table addr */
- lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
-
- /* Calculate lower half jump table offset */
- ins T0, $at, 1, 6
-
- /* Add offset to STATE */
- addu T1, STATE, $at
-
- /* Add lower half jump table addr */
- addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
-
- /* Read value from STATE */
- lw SAVED_CA, 0(T1)
-
- /* Store remaining bytecounter as negative value */
- subu BYTES, $at, BYTES
-
- jr T0
-
- /* Jump table */
- FOR_EACH_WORD(JMPTBL_UNALIGNED)
-.end chacha20_mips
-.set at
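
The AXR macro above implements ChaCha20's add/xor/rotate step across four columns at once, using the standard rotation amounts 16, 12, 8 and 7; the first four AXR invocations in the round loop form the column rounds and the next four the diagonal rounds. As an illustrative reference only (not part of the removed file), a single scalar quarter-round over 32-bit words is:

	#include <stdint.h>

	static inline uint32_t rotl32(uint32_t v, unsigned int n)
	{
		return (v << n) | (v >> (32 - n));
	}

	/* One ChaCha20 quarter-round on words a, b, c, d of the 16-word state. */
	static void chacha20_quarter_round(uint32_t x[16], int a, int b, int c, int d)
	{
		x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d], 16);
		x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b], 12);
		x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d], 8);
		x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b], 7);
	}
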
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64-glue.c b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64-glue.c
deleted file mode 100644
index 1bccec70845c..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64-glue.c
+++ /dev/null
@@ -1,132 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-#ifdef __linux__
-#include <asm/fpu/api.h>
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-#include <asm/intel-family.h>
-#else
-#include <sys/simd-x86_64.h>
-#endif
-
-asmlinkage void hchacha20_ssse3(u32 *derived_key, const u8 *nonce,
- const u8 *key);
-asmlinkage void chacha20_ssse3(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-asmlinkage void chacha20_avx2(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-asmlinkage void chacha20_avx512(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-asmlinkage void chacha20_avx512vl(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-
-static bool chacha20_use_ssse3 __ro_after_init;
-static bool chacha20_use_avx2 __ro_after_init;
-static bool chacha20_use_avx512 __ro_after_init;
-static bool chacha20_use_avx512vl __ro_after_init;
-static bool *const chacha20_nobs[] __initconst = {
- &chacha20_use_ssse3, &chacha20_use_avx2, &chacha20_use_avx512,
- &chacha20_use_avx512vl };
-
-static void __init chacha20_fpu_init(void)
-{
-#ifdef __linux__
- chacha20_use_ssse3 = boot_cpu_has(X86_FEATURE_SSSE3);
- chacha20_use_avx2 =
- boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
-#ifndef COMPAT_CANNOT_USE_AVX512
- chacha20_use_avx512 =
- boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX512F) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
- XFEATURE_MASK_AVX512, NULL) &&
- /* Skylake downclocks unacceptably much when using zmm. */
- boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X;
- chacha20_use_avx512vl =
- boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX512F) &&
- boot_cpu_has(X86_FEATURE_AVX512VL) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
- XFEATURE_MASK_AVX512, NULL);
-#endif
-#else
- chacha20_use_ssse3 = !!(cpu_feature2 & CPUID2_SSSE3);
- chacha20_use_avx2 = !!(cpu_feature2 & CPUID2_AVX) &&
- !!(cpu_stdext_feature & CPUID_STDEXT_AVX2) &&
- __ymm_enabled();
- chacha20_use_avx512 = chacha20_use_avx2 &&
- !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) &&
- __zmm_enabled();
- chacha20_use_avx512vl = chacha20_use_avx512 &&
- !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) &&
- !!(cpu_stdext_feature & CPUID_STDEXT_AVX512VL);
-#endif
- if (bootverbose)
- printf("ssse3: %d avx2: %d avx512: %d avx512vl: %d\n",
- chacha20_use_ssse3,
- chacha20_use_avx2,
- chacha20_use_avx512,
- chacha20_use_avx512vl);
-}
-
-static inline bool chacha20_arch(struct chacha20_ctx *ctx, u8 *dst,
- const u8 *src, size_t len,
- simd_context_t *simd_context)
-{
- /* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(PAGE_SIZE < CHACHA20_BLOCK_SIZE ||
- PAGE_SIZE % CHACHA20_BLOCK_SIZE);
-
- if (!chacha20_use_ssse3) {
- return false;
- }
- if (len <= CHACHA20_BLOCK_SIZE) {
- return false;
- }
- if (!simd_use(simd_context)) {
- return false;
- }
- for (;;) {
- const size_t bytes = min_t(size_t, len, PAGE_SIZE);
-
- if (chacha20_use_avx512 &&
- len >= CHACHA20_BLOCK_SIZE * 8)
- chacha20_avx512(dst, src, bytes, ctx->key, ctx->counter);
- else if (chacha20_use_avx512vl &&
- len >= CHACHA20_BLOCK_SIZE * 4)
- chacha20_avx512vl(dst, src, bytes, ctx->key, ctx->counter);
- else if (chacha20_use_avx2 &&
- len >= CHACHA20_BLOCK_SIZE * 4)
- chacha20_avx2(dst, src, bytes, ctx->key, ctx->counter);
- else
- chacha20_ssse3(dst, src, bytes, ctx->key, ctx->counter);
- ctx->counter[0] += (bytes + 63) / 64;
- len -= bytes;
- if (!len)
- break;
- dst += bytes;
- src += bytes;
- simd_relax(simd_context);
- }
-
- return true;
-}
-
-static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS],
- const u8 nonce[HCHACHA20_NONCE_SIZE],
- const u8 key[HCHACHA20_KEY_SIZE],
- simd_context_t *simd_context)
-{
- if (IS_ENABLED(CONFIG_AS_SSSE3) && chacha20_use_ssse3 &&
- simd_use(simd_context)) {
- hchacha20_ssse3(derived_key, nonce, key);
- return true;
- }
- return false;
-}
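
chacha20_arch() above walks the input one page at a time so that simd_relax() can be called between pages (SIMD use disables preemption), advancing the 32-bit block counter by the number of 64-byte blocks each chunk produced. A simplified, self-contained sketch of that chunking arithmetic (illustrative only; process_chunk() stands in for whichever SIMD routine was selected, and the PAGE_SIZE value is assumed):

	#include <stddef.h>
	#include <stdint.h>

	#define CHACHA20_BLOCK_SIZE 64u
	#define PAGE_SIZE 4096u	/* assumed for this sketch; the driver uses the kernel's PAGE_SIZE */

	static void chacha20_in_pages(uint32_t counter[4], uint8_t *dst, const uint8_t *src,
	                              size_t len,
	                              void (*process_chunk)(uint8_t *, const uint8_t *,
	                                                    size_t, uint32_t *))
	{
		while (len > 0) {
			size_t bytes = len < PAGE_SIZE ? len : PAGE_SIZE;

			process_chunk(dst, src, bytes, counter);
			/* A partial trailing block still consumes one counter value. */
			counter[0] += (bytes + CHACHA20_BLOCK_SIZE - 1) / CHACHA20_BLOCK_SIZE;
			dst += bytes;
			src += bytes;
			len -= bytes;
			/* The kernel code calls simd_relax() here between pages. */
		}
	}
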
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64.pl b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64.pl
deleted file mode 100755
index 29906a66b8b7..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64.pl
+++ /dev/null
@@ -1,4106 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
-# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
-# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# November 2014
-#
-# ChaCha20 for x86_64.
-#
-# December 2016
-#
-# Add AVX512F code path.
-#
-# December 2017
-#
-# Add AVX512VL code path.
-#
-# Performance in cycles per byte out of large buffer.
-#
-# IALU/gcc 4.8(i) 1x/2xSSSE3(ii) 4xSSSE3 NxAVX(v)
-#
-# P4 9.48/+99% - -
-# Core2 7.83/+55% 7.90/5.76 4.35
-# Westmere 7.19/+50% 5.60/4.50 3.00
-# Sandy Bridge 8.31/+42% 5.45/4.00 2.72
-# Ivy Bridge 6.71/+46% 5.40/? 2.41
-# Haswell 5.92/+43% 5.20/3.45 2.42 1.23
-# Skylake[-X] 5.87/+39% 4.70/3.22 2.31 1.19[0.80(vi)]
-# Silvermont 12.0/+33% 7.75/6.90 7.03(iii)
-# Knights L 11.7/- ? 9.60(iii) 0.80
-# Goldmont 10.6/+17% 5.10/3.52 3.28
-# Sledgehammer 7.28/+52% - -
-# Bulldozer 9.66/+28% 9.85/5.35(iv) 3.06(iv)
-# Ryzen 5.96/+50% 5.19/3.00 2.40 2.09
-# VIA Nano 10.5/+46% 6.72/6.88 6.05
-#
-# (i) compared to older gcc 3.x one can observe >2x improvement on
-# most platforms;
-# (ii) 2xSSSE3 is code path optimized specifically for 128 bytes used
-# by chacha20_poly1305_tls_cipher, results are EVP-free;
-# (iii) this is not optimal result for Atom because of MSROM
-# limitations, SSE2 can do better, but gain is considered too
-# low to justify the [maintenance] effort;
-# (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20
-# and 4.85 for 128-byte inputs;
-# (v) 8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable;
-# (vi) even though Skylake-X can execute AVX512F code and deliver 0.57
-# cpb in single thread, the corresponding capability is suppressed;
-
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-$kernel=0; $kernel=1 if (!$flavour && !$output);
-
-if (!$kernel) {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
- die "can't locate x86_64-xlate.pl";
-
- open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
- *STDOUT=*OUT;
-
- if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
- =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
- }
-
- if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
- `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
- $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
- $avx += 1 if ($1==2.11 && $2>=8);
- }
-
- if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
- `ml64 2>&1` =~ /Version ([0-9]+)\./) {
- $avx = ($1>=10) + ($1>=11);
- }
-
- if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
- $avx = ($2>=3.0) + ($2>3.0);
- }
-} else {
- $avx = 4; # The kernel uses ifdefs for this.
-}
-
-# input parameter block
-($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
-
-$code.=<<___ if $kernel;
-#include <linux/linkage.h>
-___
-
-sub declare_variable() {
- my ($name, $size, $type, $payload) = @_;
- if($kernel) {
- $code.=".section .rodata.cst$size.L$name, \"aM\", \@progbits, $size\n";
- $code.=".align $size\n";
- $code.=".L$name:\n";
- $code.=".$type $payload\n";
- } else {
- $code.=".L$name:\n";
- $code.=".$type $payload\n";
- }
-}
-
-sub declare_function() {
- my ($name, $align, $nargs) = @_;
- if($kernel) {
- $code .= ".align $align\n";
- $code .= "SYM_FUNC_START($name)\n";
- $code .= ".L$name:\n";
- } else {
- $code .= ".globl $name\n";
- $code .= ".type $name,\@function,$nargs\n";
- $code .= ".align $align\n";
- $code .= "$name:\n";
- }
-}
-
-sub end_function() {
- my ($name) = @_;
- if($kernel) {
- $code .= "SYM_FUNC_END($name)\n";
- } else {
- $code .= ".size $name,.-$name\n";
- }
-}
-
-if(!$kernel) {
- $code .= ".text\n";
-}
-&declare_variable('zero', 16, 'long', '0,0,0,0');
-&declare_variable('one', 16, 'long', '1,0,0,0');
-&declare_variable('inc', 16, 'long', '0,1,2,3');
-&declare_variable('four', 16, 'long', '4,4,4,4');
-&declare_variable('incy', 32, 'long', '0,2,4,6,1,3,5,7');
-&declare_variable('eight', 32, 'long', '8,8,8,8,8,8,8,8');
-&declare_variable('rot16', 16, 'byte', '0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd');
-&declare_variable('rot24', 16, 'byte', '0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe');
-&declare_variable('twoy', 32, 'long', '2,0,0,0, 2,0,0,0');
-&declare_variable('zeroz', 64, 'long', '0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0');
-&declare_variable('fourz', 64, 'long', '4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0');
-&declare_variable('incz', 64, 'long', '0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15');
-&declare_variable('sixteen', 64, 'long', '16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16');
-&declare_variable('sigma', 16, 'ascii', '"expand 32-byte k"');
-
-$code.=<<___ if !$kernel;
-.asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-___
-$code.=".text\n";
-
-sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
- my $arg = pop;
- $arg = "\$$arg" if ($arg*1 eq $arg);
- $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
-}
-
-@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
- "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
-@t=("%esi","%edi");
-
-sub ROUND { # critical path is 24 cycles per round
-my ($a0,$b0,$c0,$d0)=@_;
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my ($xc,$xc_)=map("\"$_\"",@t);
-my @x=map("\"$_\"",@x);
-
- # Consider order in which variables are addressed by their
- # index:
- #
- # a b c d
- #
- # 0 4 8 12 < even round
- # 1 5 9 13
- # 2 6 10 14
- # 3 7 11 15
- # 0 5 10 15 < odd round
- # 1 6 11 12
- # 2 7 8 13
- # 3 4 9 14
- #
-	# 'a', 'b' and 'd's are permanently allocated in registers,
-	# @x[0..7,12..15], while the 'c's are maintained in memory. If
-	# you observe the 'c' column, you'll notice that a pair of 'c's
-	# is invariant between rounds. This means that we have to reload
-	# them once per round, in the middle. This is why you'll see a
-	# bunch of 'c' stores and loads in the middle, but none at the
-	# beginning or end.
-
-	# Normally the instructions would be interleaved to favour
-	# in-order execution. Out-of-order cores generally manage it
-	# gracefully, but not this time for some reason. As in-order
-	# cores are a dying breed and old Atom is the only one around,
-	# the instructions are left uninterleaved. Besides, Atom is
-	# better off executing the 1xSSSE3 code anyway...
-
- (
- "&add (@x[$a0],@x[$b0])", # Q1
- "&xor (@x[$d0],@x[$a0])",
- "&rol (@x[$d0],16)",
- "&add (@x[$a1],@x[$b1])", # Q2
- "&xor (@x[$d1],@x[$a1])",
- "&rol (@x[$d1],16)",
-
- "&add ($xc,@x[$d0])",
- "&xor (@x[$b0],$xc)",
- "&rol (@x[$b0],12)",
- "&add ($xc_,@x[$d1])",
- "&xor (@x[$b1],$xc_)",
- "&rol (@x[$b1],12)",
-
- "&add (@x[$a0],@x[$b0])",
- "&xor (@x[$d0],@x[$a0])",
- "&rol (@x[$d0],8)",
- "&add (@x[$a1],@x[$b1])",
- "&xor (@x[$d1],@x[$a1])",
- "&rol (@x[$d1],8)",
-
- "&add ($xc,@x[$d0])",
- "&xor (@x[$b0],$xc)",
- "&rol (@x[$b0],7)",
- "&add ($xc_,@x[$d1])",
- "&xor (@x[$b1],$xc_)",
- "&rol (@x[$b1],7)",
-
- "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
- "&mov (\"4*$c1(%rsp)\",$xc_)",
- "&mov ($xc,\"4*$c2(%rsp)\")",
- "&mov ($xc_,\"4*$c3(%rsp)\")",
-
- "&add (@x[$a2],@x[$b2])", # Q3
- "&xor (@x[$d2],@x[$a2])",
- "&rol (@x[$d2],16)",
- "&add (@x[$a3],@x[$b3])", # Q4
- "&xor (@x[$d3],@x[$a3])",
- "&rol (@x[$d3],16)",
-
- "&add ($xc,@x[$d2])",
- "&xor (@x[$b2],$xc)",
- "&rol (@x[$b2],12)",
- "&add ($xc_,@x[$d3])",
- "&xor (@x[$b3],$xc_)",
- "&rol (@x[$b3],12)",
-
- "&add (@x[$a2],@x[$b2])",
- "&xor (@x[$d2],@x[$a2])",
- "&rol (@x[$d2],8)",
- "&add (@x[$a3],@x[$b3])",
- "&xor (@x[$d3],@x[$a3])",
- "&rol (@x[$d3],8)",
-
- "&add ($xc,@x[$d2])",
- "&xor (@x[$b2],$xc)",
- "&rol (@x[$b2],7)",
- "&add ($xc_,@x[$d3])",
- "&xor (@x[$b3],$xc_)",
- "&rol (@x[$b3],7)"
- );
-}
-
-########################################################################
-# Generic code path that handles all lengths on pre-SSSE3 processors.
-if(!$kernel) {
-&declare_function("chacha20_ctr32", 64, 5);
-$code.=<<___;
-.cfi_startproc
- cmp \$0,$len
- je .Lno_data
- mov OPENSSL_ia32cap_P+4(%rip),%r9
-___
-$code.=<<___ if ($avx>2);
- bt \$48,%r9 # check for AVX512F
- jc .Lchacha20_avx512
- test %r9,%r9 # check for AVX512VL
- js .Lchacha20_avx512vl
-___
-$code.=<<___;
- test \$`1<<(41-32)`,%r9d
- jnz .Lchacha20_ssse3
-___
-$code.=<<___;
- push %rbx
-.cfi_push %rbx
- push %rbp
-.cfi_push %rbp
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
- sub \$64+24,%rsp
-.cfi_adjust_cfa_offset 64+24
-.Lctr32_body:
-
- #movdqa .Lsigma(%rip),%xmm0
- movdqu ($key),%xmm1
- movdqu 16($key),%xmm2
- movdqu ($counter),%xmm3
- movdqa .Lone(%rip),%xmm4
-
- #movdqa %xmm0,4*0(%rsp) # key[0]
- movdqa %xmm1,4*4(%rsp) # key[1]
- movdqa %xmm2,4*8(%rsp) # key[2]
- movdqa %xmm3,4*12(%rsp) # key[3]
- mov $len,%rbp # reassign $len
- jmp .Loop_outer
-
-.align 32
-.Loop_outer:
- mov \$0x61707865,@x[0] # 'expa'
- mov \$0x3320646e,@x[1] # 'nd 3'
- mov \$0x79622d32,@x[2] # '2-by'
- mov \$0x6b206574,@x[3] # 'te k'
- mov 4*4(%rsp),@x[4]
- mov 4*5(%rsp),@x[5]
- mov 4*6(%rsp),@x[6]
- mov 4*7(%rsp),@x[7]
- movd %xmm3,@x[12]
- mov 4*13(%rsp),@x[13]
- mov 4*14(%rsp),@x[14]
- mov 4*15(%rsp),@x[15]
-
- mov %rbp,64+0(%rsp) # save len
- mov \$10,%ebp
- mov $inp,64+8(%rsp) # save inp
- movq %xmm2,%rsi # "@x[8]"
- mov $out,64+16(%rsp) # save out
- mov %rsi,%rdi
- shr \$32,%rdi # "@x[9]"
- jmp .Loop
-
-.align 32
-.Loop:
-___
- foreach (&ROUND (0, 4, 8,12)) { eval; }
- foreach (&ROUND (0, 5,10,15)) { eval; }
- &dec ("%ebp");
- &jnz (".Loop");
-
-$code.=<<___;
- mov @t[1],4*9(%rsp) # modulo-scheduled
- mov @t[0],4*8(%rsp)
- mov 64(%rsp),%rbp # load len
- movdqa %xmm2,%xmm1
- mov 64+8(%rsp),$inp # load inp
- paddd %xmm4,%xmm3 # increment counter
- mov 64+16(%rsp),$out # load out
-
- add \$0x61707865,@x[0] # 'expa'
- add \$0x3320646e,@x[1] # 'nd 3'
- add \$0x79622d32,@x[2] # '2-by'
- add \$0x6b206574,@x[3] # 'te k'
- add 4*4(%rsp),@x[4]
- add 4*5(%rsp),@x[5]
- add 4*6(%rsp),@x[6]
- add 4*7(%rsp),@x[7]
- add 4*12(%rsp),@x[12]
- add 4*13(%rsp),@x[13]
- add 4*14(%rsp),@x[14]
- add 4*15(%rsp),@x[15]
- paddd 4*8(%rsp),%xmm1
-
- cmp \$64,%rbp
- jb .Ltail
-
- xor 4*0($inp),@x[0] # xor with input
- xor 4*1($inp),@x[1]
- xor 4*2($inp),@x[2]
- xor 4*3($inp),@x[3]
- xor 4*4($inp),@x[4]
- xor 4*5($inp),@x[5]
- xor 4*6($inp),@x[6]
- xor 4*7($inp),@x[7]
- movdqu 4*8($inp),%xmm0
- xor 4*12($inp),@x[12]
- xor 4*13($inp),@x[13]
- xor 4*14($inp),@x[14]
- xor 4*15($inp),@x[15]
- lea 4*16($inp),$inp # inp+=64
- pxor %xmm1,%xmm0
-
- movdqa %xmm2,4*8(%rsp)
- movd %xmm3,4*12(%rsp)
-
- mov @x[0],4*0($out) # write output
- mov @x[1],4*1($out)
- mov @x[2],4*2($out)
- mov @x[3],4*3($out)
- mov @x[4],4*4($out)
- mov @x[5],4*5($out)
- mov @x[6],4*6($out)
- mov @x[7],4*7($out)
- movdqu %xmm0,4*8($out)
- mov @x[12],4*12($out)
- mov @x[13],4*13($out)
- mov @x[14],4*14($out)
- mov @x[15],4*15($out)
- lea 4*16($out),$out # out+=64
-
- sub \$64,%rbp
- jnz .Loop_outer
-
- jmp .Ldone
-
-.align 16
-.Ltail:
- mov @x[0],4*0(%rsp)
- mov @x[1],4*1(%rsp)
- xor %rbx,%rbx
- mov @x[2],4*2(%rsp)
- mov @x[3],4*3(%rsp)
- mov @x[4],4*4(%rsp)
- mov @x[5],4*5(%rsp)
- mov @x[6],4*6(%rsp)
- mov @x[7],4*7(%rsp)
- movdqa %xmm1,4*8(%rsp)
- mov @x[12],4*12(%rsp)
- mov @x[13],4*13(%rsp)
- mov @x[14],4*14(%rsp)
- mov @x[15],4*15(%rsp)
-
-.Loop_tail:
- movzb ($inp,%rbx),%eax
- movzb (%rsp,%rbx),%edx
- lea 1(%rbx),%rbx
- xor %edx,%eax
- mov %al,-1($out,%rbx)
- dec %rbp
- jnz .Loop_tail
-
-.Ldone:
- add \$64+24,%rsp
-.cfi_adjust_cfa_offset -64-24
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbp
-.cfi_restore %rbp
- pop %rbx
-.cfi_restore %rbx
-.Lno_data:
- ret
-.cfi_endproc
-___
-&end_function("chacha20_ctr32");
-}
-
-########################################################################
-# SSSE3 code path that handles shorter lengths
-{
-my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
-
-sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
- &paddd ($a,$b);
- &pxor ($d,$a);
- &pshufb ($d,$rot16);
-
- &paddd ($c,$d);
- &pxor ($b,$c);
- &movdqa ($t,$b);
- &psrld ($b,20);
- &pslld ($t,12);
- &por ($b,$t);
-
- &paddd ($a,$b);
- &pxor ($d,$a);
- &pshufb ($d,$rot24);
-
- &paddd ($c,$d);
- &pxor ($b,$c);
- &movdqa ($t,$b);
- &psrld ($b,25);
- &pslld ($t,7);
- &por ($b,$t);
-}
-
-my $xframe = $win64 ? 32+8 : 8;
-
-if($kernel) {
- $code .= "#ifdef CONFIG_AS_SSSE3\n";
-}
-
-if($kernel) {
-&declare_function("hchacha20_ssse3", 32, 5);
-$code.=<<___;
- movdqa .Lsigma(%rip),$a
- movdqu ($len),$b
- movdqu 16($len),$c
- movdqu ($inp),$d
-	# This code is only used when targeting the kernel.
-	# If targeting win64, xmm{6,7} preservation would need to be added.
- movdqa .Lrot16(%rip),$rot16
- movdqa .Lrot24(%rip),$rot24
- mov \$10,$counter # reuse $counter
- jmp 1f
-.align 32
-1:
-___
- &SSSE3ROUND();
- &pshufd ($a,$a,0b10010011);
- &pshufd ($d,$d,0b01001110);
- &pshufd ($c,$c,0b00111001);
- &nop ();
-
- &SSSE3ROUND();
- &pshufd ($a,$a,0b00111001);
- &pshufd ($d,$d,0b01001110);
- &pshufd ($c,$c,0b10010011);
-
- &dec ($counter);
- &jnz ("1b");
-
-$code.=<<___;
- movdqu $a, ($out)
- movdqu $d, 16($out)
- ret
-___
-&end_function("hchacha20_ssse3");
-}
-
-&declare_function("chacha20_ssse3", 32, 5);
-$code.=<<___;
-.cfi_startproc
- lea 8(%rsp),%r10 # frame pointer
-.cfi_def_cfa_register %r10
-___
-$code.=<<___ if ($avx && !$kernel);
- test \$`1<<(43-32)`,%r10d
- jnz .Lchacha20_4xop # XOP is fastest even if we use 1/4
-___
-$code.=<<___;
- cmp \$128,$len # we might throw away some data,
- je .Lchacha20_128
- ja .Lchacha20_4x # but overall it won't be slower
-
-.Ldo_ssse3_after_all:
- sub \$64+$xframe,%rsp
- and \$-16,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0x30(%r10)
- movaps %xmm7,-0x20(%r10)
-.Lssse3_body:
-___
-$code.=<<___;
- movdqa .Lsigma(%rip),$a
- movdqu ($key),$b
- movdqu 16($key),$c
- movdqu ($counter),$d
- movdqa .Lrot16(%rip),$rot16
- movdqa .Lrot24(%rip),$rot24
-
- movdqa $a,0x00(%rsp)
- movdqa $b,0x10(%rsp)
- movdqa $c,0x20(%rsp)
- movdqa $d,0x30(%rsp)
- mov \$10,$counter # reuse $counter
- jmp .Loop_ssse3
-
-.align 32
-.Loop_outer_ssse3:
- movdqa .Lone(%rip),$d
- movdqa 0x00(%rsp),$a
- movdqa 0x10(%rsp),$b
- movdqa 0x20(%rsp),$c
- paddd 0x30(%rsp),$d
- mov \$10,$counter
- movdqa $d,0x30(%rsp)
- jmp .Loop_ssse3
-
-.align 32
-.Loop_ssse3:
-___
- &SSSE3ROUND();
- &pshufd ($a,$a,0b10010011);
- &pshufd ($d,$d,0b01001110);
- &pshufd ($c,$c,0b00111001);
- &nop ();
-
- &SSSE3ROUND();
- &pshufd ($a,$a,0b00111001);
- &pshufd ($d,$d,0b01001110);
- &pshufd ($c,$c,0b10010011);
-
- &dec ($counter);
- &jnz (".Loop_ssse3");
-
-$code.=<<___;
- paddd 0x00(%rsp),$a
- paddd 0x10(%rsp),$b
- paddd 0x20(%rsp),$c
- paddd 0x30(%rsp),$d
-
- cmp \$64,$len
- jb .Ltail_ssse3
-
- movdqu 0x00($inp),$t
- movdqu 0x10($inp),$t1
- pxor $t,$a # xor with input
- movdqu 0x20($inp),$t
- pxor $t1,$b
- movdqu 0x30($inp),$t1
- lea 0x40($inp),$inp # inp+=64
- pxor $t,$c
- pxor $t1,$d
-
- movdqu $a,0x00($out) # write output
- movdqu $b,0x10($out)
- movdqu $c,0x20($out)
- movdqu $d,0x30($out)
- lea 0x40($out),$out # out+=64
-
- sub \$64,$len
- jnz .Loop_outer_ssse3
-
- jmp .Ldone_ssse3
-
-.align 16
-.Ltail_ssse3:
- movdqa $a,0x00(%rsp)
- movdqa $b,0x10(%rsp)
- movdqa $c,0x20(%rsp)
- movdqa $d,0x30(%rsp)
- xor $counter,$counter
-
-.Loop_tail_ssse3:
- movzb ($inp,$counter),%eax
- movzb (%rsp,$counter),%ecx
- lea 1($counter),$counter
- xor %ecx,%eax
- mov %al,-1($out,$counter)
- dec $len
- jnz .Loop_tail_ssse3
-
-.Ldone_ssse3:
-___
-$code.=<<___ if ($win64);
- movaps -0x30(%r10),%xmm6
- movaps -0x20(%r10),%xmm7
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.Lssse3_epilogue:
- ret
-.cfi_endproc
-___
-}
-&end_function("chacha20_ssse3");
-
-########################################################################
-# SSSE3 code path that handles 128-byte inputs
-{
-my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
-my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));
-
-sub SSSE3ROUND_2x {
- &paddd ($a,$b);
- &pxor ($d,$a);
- &paddd ($a1,$b1);
- &pxor ($d1,$a1);
- &pshufb ($d,$rot16);
- &pshufb($d1,$rot16);
-
- &paddd ($c,$d);
- &paddd ($c1,$d1);
- &pxor ($b,$c);
- &pxor ($b1,$c1);
- &movdqa ($t,$b);
- &psrld ($b,20);
- &movdqa($t1,$b1);
- &pslld ($t,12);
- &psrld ($b1,20);
- &por ($b,$t);
- &pslld ($t1,12);
- &por ($b1,$t1);
-
- &paddd ($a,$b);
- &pxor ($d,$a);
- &paddd ($a1,$b1);
- &pxor ($d1,$a1);
- &pshufb ($d,$rot24);
- &pshufb($d1,$rot24);
-
- &paddd ($c,$d);
- &paddd ($c1,$d1);
- &pxor ($b,$c);
- &pxor ($b1,$c1);
- &movdqa ($t,$b);
- &psrld ($b,25);
- &movdqa($t1,$b1);
- &pslld ($t,7);
- &psrld ($b1,25);
- &por ($b,$t);
- &pslld ($t1,7);
- &por ($b1,$t1);
-}
-
-my $xframe = $win64 ? 0x68 : 8;
-
-$code.=<<___;
-.type chacha20_128,\@function,5
-.align 32
-chacha20_128:
-.cfi_startproc
-.Lchacha20_128:
- lea 8(%rsp),%r10 # frame pointer
-.cfi_def_cfa_register %r10
- sub \$64+$xframe,%rsp
- and \$-16,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0x70(%r10)
- movaps %xmm7,-0x60(%r10)
- movaps %xmm8,-0x50(%r10)
- movaps %xmm9,-0x40(%r10)
- movaps %xmm10,-0x30(%r10)
- movaps %xmm11,-0x20(%r10)
-.L128_body:
-___
-$code.=<<___;
- movdqa .Lsigma(%rip),$a
- movdqu ($key),$b
- movdqu 16($key),$c
- movdqu ($counter),$d
- movdqa .Lone(%rip),$d1
- movdqa .Lrot16(%rip),$rot16
- movdqa .Lrot24(%rip),$rot24
-
- movdqa $a,$a1
- movdqa $a,0x00(%rsp)
- movdqa $b,$b1
- movdqa $b,0x10(%rsp)
- movdqa $c,$c1
- movdqa $c,0x20(%rsp)
- paddd $d,$d1
- movdqa $d,0x30(%rsp)
- mov \$10,$counter # reuse $counter
- jmp .Loop_128
-
-.align 32
-.Loop_128:
-___
- &SSSE3ROUND_2x();
- &pshufd ($a,$a,0b10010011);
- &pshufd ($d,$d,0b01001110);
- &pshufd ($c,$c,0b00111001);
- &pshufd ($a1,$a1,0b10010011);
- &pshufd ($d1,$d1,0b01001110);
- &pshufd ($c1,$c1,0b00111001);
-
- &SSSE3ROUND_2x();
- &pshufd ($a,$a,0b00111001);
- &pshufd ($d,$d,0b01001110);
- &pshufd ($c,$c,0b10010011);
- &pshufd ($a1,$a1,0b00111001);
- &pshufd ($d1,$d1,0b01001110);
- &pshufd ($c1,$c1,0b10010011);
-
- &dec ($counter);
- &jnz (".Loop_128");
-
-$code.=<<___;
- paddd 0x00(%rsp),$a
- paddd 0x10(%rsp),$b
- paddd 0x20(%rsp),$c
- paddd 0x30(%rsp),$d
- paddd .Lone(%rip),$d1
- paddd 0x00(%rsp),$a1
- paddd 0x10(%rsp),$b1
- paddd 0x20(%rsp),$c1
- paddd 0x30(%rsp),$d1
-
- movdqu 0x00($inp),$t
- movdqu 0x10($inp),$t1
- pxor $t,$a # xor with input
- movdqu 0x20($inp),$t
- pxor $t1,$b
- movdqu 0x30($inp),$t1
- pxor $t,$c
- movdqu 0x40($inp),$t
- pxor $t1,$d
- movdqu 0x50($inp),$t1
- pxor $t,$a1
- movdqu 0x60($inp),$t
- pxor $t1,$b1
- movdqu 0x70($inp),$t1
- pxor $t,$c1
- pxor $t1,$d1
-
- movdqu $a,0x00($out) # write output
- movdqu $b,0x10($out)
- movdqu $c,0x20($out)
- movdqu $d,0x30($out)
- movdqu $a1,0x40($out)
- movdqu $b1,0x50($out)
- movdqu $c1,0x60($out)
- movdqu $d1,0x70($out)
-___
-$code.=<<___ if ($win64);
- movaps -0x70(%r10),%xmm6
- movaps -0x60(%r10),%xmm7
- movaps -0x50(%r10),%xmm8
- movaps -0x40(%r10),%xmm9
- movaps -0x30(%r10),%xmm10
- movaps -0x20(%r10),%xmm11
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.L128_epilogue:
- ret
-.cfi_endproc
-.size chacha20_128,.-chacha20_128
-___
-}
-
-########################################################################
-# SSSE3 code path that handles longer messages.
-{
-# assign variables to favor Atom front-end
-my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
- $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
-my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
-
-sub SSSE3_lane_ROUND {
-my ($a0,$b0,$c0,$d0)=@_;
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
-my @x=map("\"$_\"",@xx);
-
- # Consider order in which variables are addressed by their
- # index:
- #
- # a b c d
- #
- # 0 4 8 12 < even round
- # 1 5 9 13
- # 2 6 10 14
- # 3 7 11 15
- # 0 5 10 15 < odd round
- # 1 6 11 12
- # 2 7 8 13
- # 3 4 9 14
- #
-	# 'a', 'b' and 'd's are permanently allocated in registers,
-	# @x[0..7,12..15], while the 'c's are maintained in memory. If
-	# you observe the 'c' column, you'll notice that a pair of 'c's
-	# is invariant between rounds. This means that we have to reload
-	# them once per round, in the middle. This is why you'll see a
-	# bunch of 'c' stores and loads in the middle, but none at the
-	# beginning or end.
-
- (
- "&paddd (@x[$a0],@x[$b0])", # Q1
- "&paddd (@x[$a1],@x[$b1])", # Q2
- "&pxor (@x[$d0],@x[$a0])",
- "&pxor (@x[$d1],@x[$a1])",
- "&pshufb (@x[$d0],$t1)",
- "&pshufb (@x[$d1],$t1)",
-
- "&paddd ($xc,@x[$d0])",
- "&paddd ($xc_,@x[$d1])",
- "&pxor (@x[$b0],$xc)",
- "&pxor (@x[$b1],$xc_)",
- "&movdqa ($t0,@x[$b0])",
- "&pslld (@x[$b0],12)",
- "&psrld ($t0,20)",
- "&movdqa ($t1,@x[$b1])",
- "&pslld (@x[$b1],12)",
- "&por (@x[$b0],$t0)",
- "&psrld ($t1,20)",
- "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
- "&por (@x[$b1],$t1)",
-
- "&paddd (@x[$a0],@x[$b0])",
- "&paddd (@x[$a1],@x[$b1])",
- "&pxor (@x[$d0],@x[$a0])",
- "&pxor (@x[$d1],@x[$a1])",
- "&pshufb (@x[$d0],$t0)",
- "&pshufb (@x[$d1],$t0)",
-
- "&paddd ($xc,@x[$d0])",
- "&paddd ($xc_,@x[$d1])",
- "&pxor (@x[$b0],$xc)",
- "&pxor (@x[$b1],$xc_)",
- "&movdqa ($t1,@x[$b0])",
- "&pslld (@x[$b0],7)",
- "&psrld ($t1,25)",
- "&movdqa ($t0,@x[$b1])",
- "&pslld (@x[$b1],7)",
- "&por (@x[$b0],$t1)",
- "&psrld ($t0,25)",
- "&movdqa ($t1,'(%r9)')", # .Lrot16(%rip)
- "&por (@x[$b1],$t0)",
-
- "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
- "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
- "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
- "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",
-
- "&paddd (@x[$a2],@x[$b2])", # Q3
- "&paddd (@x[$a3],@x[$b3])", # Q4
- "&pxor (@x[$d2],@x[$a2])",
- "&pxor (@x[$d3],@x[$a3])",
- "&pshufb (@x[$d2],$t1)",
- "&pshufb (@x[$d3],$t1)",
-
- "&paddd ($xc,@x[$d2])",
- "&paddd ($xc_,@x[$d3])",
- "&pxor (@x[$b2],$xc)",
- "&pxor (@x[$b3],$xc_)",
- "&movdqa ($t0,@x[$b2])",
- "&pslld (@x[$b2],12)",
- "&psrld ($t0,20)",
- "&movdqa ($t1,@x[$b3])",
- "&pslld (@x[$b3],12)",
- "&por (@x[$b2],$t0)",
- "&psrld ($t1,20)",
- "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
- "&por (@x[$b3],$t1)",
-
- "&paddd (@x[$a2],@x[$b2])",
- "&paddd (@x[$a3],@x[$b3])",
- "&pxor (@x[$d2],@x[$a2])",
- "&pxor (@x[$d3],@x[$a3])",
- "&pshufb (@x[$d2],$t0)",
- "&pshufb (@x[$d3],$t0)",
-
- "&paddd ($xc,@x[$d2])",
- "&paddd ($xc_,@x[$d3])",
- "&pxor (@x[$b2],$xc)",
- "&pxor (@x[$b3],$xc_)",
- "&movdqa ($t1,@x[$b2])",
- "&pslld (@x[$b2],7)",
- "&psrld ($t1,25)",
- "&movdqa ($t0,@x[$b3])",
- "&pslld (@x[$b3],7)",
- "&por (@x[$b2],$t1)",
- "&psrld ($t0,25)",
- "&movdqa ($t1,'(%r9)')", # .Lrot16(%rip)
- "&por (@x[$b3],$t0)"
- );
-}
-
-my $xframe = $win64 ? 0xa8 : 8;
-
-$code.=<<___;
-.type chacha20_4x,\@function,5
-.align 32
-chacha20_4x:
-.cfi_startproc
-.Lchacha20_4x:
- lea 8(%rsp),%r10 # frame pointer
-.cfi_def_cfa_register %r10
-___
-$code.=<<___ if (!$kernel);
- mov %r9,%r11
-___
-$code.=<<___ if ($avx>1 && !$kernel);
- shr \$32,%r9 # OPENSSL_ia32cap_P+8
- test \$`1<<5`,%r9 # test AVX2
- jnz .Lchacha20_8x
-___
-$code.=<<___;
- cmp \$192,$len
- ja .Lproceed4x
-___
-$code.=<<___ if (!$kernel);
- and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
- cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
- je .Ldo_ssse3_after_all # to detect Atom
-___
-$code.=<<___;
-.Lproceed4x:
- sub \$0x140+$xframe,%rsp
- and \$-16,%rsp
-___
- ################ stack layout
- # +0x00 SIMD equivalent of @x[8-12]
- # ...
- # +0x40 constant copy of key[0-2] smashed by lanes
- # ...
- # +0x100 SIMD counters (with nonce smashed by lanes)
- # ...
- # +0x140
-$code.=<<___ if ($win64);
- movaps %xmm6,-0xb0(%r10)
- movaps %xmm7,-0xa0(%r10)
- movaps %xmm8,-0x90(%r10)
- movaps %xmm9,-0x80(%r10)
- movaps %xmm10,-0x70(%r10)
- movaps %xmm11,-0x60(%r10)
- movaps %xmm12,-0x50(%r10)
- movaps %xmm13,-0x40(%r10)
- movaps %xmm14,-0x30(%r10)
- movaps %xmm15,-0x20(%r10)
-.L4x_body:
-___
-$code.=<<___;
- movdqa .Lsigma(%rip),$xa3 # key[0]
- movdqu ($key),$xb3 # key[1]
- movdqu 16($key),$xt3 # key[2]
- movdqu ($counter),$xd3 # key[3]
- lea 0x100(%rsp),%rcx # size optimization
- lea .Lrot16(%rip),%r9
- lea .Lrot24(%rip),%r11
-
- pshufd \$0x00,$xa3,$xa0 # smash key by lanes...
- pshufd \$0x55,$xa3,$xa1
- movdqa $xa0,0x40(%rsp) # ... and offload
- pshufd \$0xaa,$xa3,$xa2
- movdqa $xa1,0x50(%rsp)
- pshufd \$0xff,$xa3,$xa3
- movdqa $xa2,0x60(%rsp)
- movdqa $xa3,0x70(%rsp)
-
- pshufd \$0x00,$xb3,$xb0
- pshufd \$0x55,$xb3,$xb1
- movdqa $xb0,0x80-0x100(%rcx)
- pshufd \$0xaa,$xb3,$xb2
- movdqa $xb1,0x90-0x100(%rcx)
- pshufd \$0xff,$xb3,$xb3
- movdqa $xb2,0xa0-0x100(%rcx)
- movdqa $xb3,0xb0-0x100(%rcx)
-
- pshufd \$0x00,$xt3,$xt0 # "$xc0"
- pshufd \$0x55,$xt3,$xt1 # "$xc1"
- movdqa $xt0,0xc0-0x100(%rcx)
- pshufd \$0xaa,$xt3,$xt2 # "$xc2"
- movdqa $xt1,0xd0-0x100(%rcx)
- pshufd \$0xff,$xt3,$xt3 # "$xc3"
- movdqa $xt2,0xe0-0x100(%rcx)
- movdqa $xt3,0xf0-0x100(%rcx)
-
- pshufd \$0x00,$xd3,$xd0
- pshufd \$0x55,$xd3,$xd1
- paddd .Linc(%rip),$xd0 # don't save counters yet
- pshufd \$0xaa,$xd3,$xd2
- movdqa $xd1,0x110-0x100(%rcx)
- pshufd \$0xff,$xd3,$xd3
- movdqa $xd2,0x120-0x100(%rcx)
- movdqa $xd3,0x130-0x100(%rcx)
-
- jmp .Loop_enter4x
-
-.align 32
-.Loop_outer4x:
- movdqa 0x40(%rsp),$xa0 # re-load smashed key
- movdqa 0x50(%rsp),$xa1
- movdqa 0x60(%rsp),$xa2
- movdqa 0x70(%rsp),$xa3
- movdqa 0x80-0x100(%rcx),$xb0
- movdqa 0x90-0x100(%rcx),$xb1
- movdqa 0xa0-0x100(%rcx),$xb2
- movdqa 0xb0-0x100(%rcx),$xb3
- movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
- movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
- movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
- movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
- movdqa 0x100-0x100(%rcx),$xd0
- movdqa 0x110-0x100(%rcx),$xd1
- movdqa 0x120-0x100(%rcx),$xd2
- movdqa 0x130-0x100(%rcx),$xd3
- paddd .Lfour(%rip),$xd0 # next SIMD counters
-
-.Loop_enter4x:
- movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]"
- movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]"
- movdqa (%r9),$xt3 # .Lrot16(%rip)
- mov \$10,%eax
- movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
- jmp .Loop4x
-
-.align 32
-.Loop4x:
-___
- foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
- foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- dec %eax
- jnz .Loop4x
-
- paddd 0x40(%rsp),$xa0 # accumulate key material
- paddd 0x50(%rsp),$xa1
- paddd 0x60(%rsp),$xa2
- paddd 0x70(%rsp),$xa3
-
- movdqa $xa0,$xt2 # "de-interlace" data
- punpckldq $xa1,$xa0
- movdqa $xa2,$xt3
- punpckldq $xa3,$xa2
- punpckhdq $xa1,$xt2
- punpckhdq $xa3,$xt3
- movdqa $xa0,$xa1
- punpcklqdq $xa2,$xa0 # "a0"
- movdqa $xt2,$xa3
- punpcklqdq $xt3,$xt2 # "a2"
- punpckhqdq $xa2,$xa1 # "a1"
- punpckhqdq $xt3,$xa3 # "a3"
-___
- ($xa2,$xt2)=($xt2,$xa2);
-$code.=<<___;
- paddd 0x80-0x100(%rcx),$xb0
- paddd 0x90-0x100(%rcx),$xb1
- paddd 0xa0-0x100(%rcx),$xb2
- paddd 0xb0-0x100(%rcx),$xb3
-
- movdqa $xa0,0x00(%rsp) # offload $xaN
- movdqa $xa1,0x10(%rsp)
- movdqa 0x20(%rsp),$xa0 # "xc2"
- movdqa 0x30(%rsp),$xa1 # "xc3"
-
- movdqa $xb0,$xt2
- punpckldq $xb1,$xb0
- movdqa $xb2,$xt3
- punpckldq $xb3,$xb2
- punpckhdq $xb1,$xt2
- punpckhdq $xb3,$xt3
- movdqa $xb0,$xb1
- punpcklqdq $xb2,$xb0 # "b0"
- movdqa $xt2,$xb3
- punpcklqdq $xt3,$xt2 # "b2"
- punpckhqdq $xb2,$xb1 # "b1"
- punpckhqdq $xt3,$xb3 # "b3"
-___
- ($xb2,$xt2)=($xt2,$xb2);
- my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
-$code.=<<___;
- paddd 0xc0-0x100(%rcx),$xc0
- paddd 0xd0-0x100(%rcx),$xc1
- paddd 0xe0-0x100(%rcx),$xc2
- paddd 0xf0-0x100(%rcx),$xc3
-
- movdqa $xa2,0x20(%rsp) # keep offloading $xaN
- movdqa $xa3,0x30(%rsp)
-
- movdqa $xc0,$xt2
- punpckldq $xc1,$xc0
- movdqa $xc2,$xt3
- punpckldq $xc3,$xc2
- punpckhdq $xc1,$xt2
- punpckhdq $xc3,$xt3
- movdqa $xc0,$xc1
- punpcklqdq $xc2,$xc0 # "c0"
- movdqa $xt2,$xc3
- punpcklqdq $xt3,$xt2 # "c2"
- punpckhqdq $xc2,$xc1 # "c1"
- punpckhqdq $xt3,$xc3 # "c3"
-___
- ($xc2,$xt2)=($xt2,$xc2);
- ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary
-$code.=<<___;
- paddd 0x100-0x100(%rcx),$xd0
- paddd 0x110-0x100(%rcx),$xd1
- paddd 0x120-0x100(%rcx),$xd2
- paddd 0x130-0x100(%rcx),$xd3
-
- movdqa $xd0,$xt2
- punpckldq $xd1,$xd0
- movdqa $xd2,$xt3
- punpckldq $xd3,$xd2
- punpckhdq $xd1,$xt2
- punpckhdq $xd3,$xt3
- movdqa $xd0,$xd1
- punpcklqdq $xd2,$xd0 # "d0"
- movdqa $xt2,$xd3
- punpcklqdq $xt3,$xt2 # "d2"
- punpckhqdq $xd2,$xd1 # "d1"
- punpckhqdq $xt3,$xd3 # "d3"
-___
- ($xd2,$xt2)=($xt2,$xd2);
-$code.=<<___;
- cmp \$64*4,$len
- jb .Ltail4x
-
- movdqu 0x00($inp),$xt0 # xor with input
- movdqu 0x10($inp),$xt1
- movdqu 0x20($inp),$xt2
- movdqu 0x30($inp),$xt3
- pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
- pxor $xb0,$xt1
- pxor $xc0,$xt2
- pxor $xd0,$xt3
-
- movdqu $xt0,0x00($out)
- movdqu 0x40($inp),$xt0
- movdqu $xt1,0x10($out)
- movdqu 0x50($inp),$xt1
- movdqu $xt2,0x20($out)
- movdqu 0x60($inp),$xt2
- movdqu $xt3,0x30($out)
- movdqu 0x70($inp),$xt3
- lea 0x80($inp),$inp # size optimization
- pxor 0x10(%rsp),$xt0
- pxor $xb1,$xt1
- pxor $xc1,$xt2
- pxor $xd1,$xt3
-
- movdqu $xt0,0x40($out)
- movdqu 0x00($inp),$xt0
- movdqu $xt1,0x50($out)
- movdqu 0x10($inp),$xt1
- movdqu $xt2,0x60($out)
- movdqu 0x20($inp),$xt2
- movdqu $xt3,0x70($out)
- lea 0x80($out),$out # size optimization
- movdqu 0x30($inp),$xt3
- pxor 0x20(%rsp),$xt0
- pxor $xb2,$xt1
- pxor $xc2,$xt2
- pxor $xd2,$xt3
-
- movdqu $xt0,0x00($out)
- movdqu 0x40($inp),$xt0
- movdqu $xt1,0x10($out)
- movdqu 0x50($inp),$xt1
- movdqu $xt2,0x20($out)
- movdqu 0x60($inp),$xt2
- movdqu $xt3,0x30($out)
- movdqu 0x70($inp),$xt3
- lea 0x80($inp),$inp # inp+=64*4
- pxor 0x30(%rsp),$xt0
- pxor $xb3,$xt1
- pxor $xc3,$xt2
- pxor $xd3,$xt3
- movdqu $xt0,0x40($out)
- movdqu $xt1,0x50($out)
- movdqu $xt2,0x60($out)
- movdqu $xt3,0x70($out)
- lea 0x80($out),$out # out+=64*4
-
- sub \$64*4,$len
- jnz .Loop_outer4x
-
- jmp .Ldone4x
-
-.Ltail4x:
- cmp \$192,$len
- jae .L192_or_more4x
- cmp \$128,$len
- jae .L128_or_more4x
- cmp \$64,$len
- jae .L64_or_more4x
-
- #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
- xor %r9,%r9
- #movdqa $xt0,0x00(%rsp)
- movdqa $xb0,0x10(%rsp)
- movdqa $xc0,0x20(%rsp)
- movdqa $xd0,0x30(%rsp)
- jmp .Loop_tail4x
-
-.align 32
-.L64_or_more4x:
- movdqu 0x00($inp),$xt0 # xor with input
- movdqu 0x10($inp),$xt1
- movdqu 0x20($inp),$xt2
- movdqu 0x30($inp),$xt3
-	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
- pxor $xb0,$xt1
- pxor $xc0,$xt2
- pxor $xd0,$xt3
- movdqu $xt0,0x00($out)
- movdqu $xt1,0x10($out)
- movdqu $xt2,0x20($out)
- movdqu $xt3,0x30($out)
- je .Ldone4x
-
- movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember?
- lea 0x40($inp),$inp # inp+=64*1
- xor %r9,%r9
- movdqa $xt0,0x00(%rsp)
- movdqa $xb1,0x10(%rsp)
- lea 0x40($out),$out # out+=64*1
- movdqa $xc1,0x20(%rsp)
- sub \$64,$len # len-=64*1
- movdqa $xd1,0x30(%rsp)
- jmp .Loop_tail4x
-
-.align 32
-.L128_or_more4x:
- movdqu 0x00($inp),$xt0 # xor with input
- movdqu 0x10($inp),$xt1
- movdqu 0x20($inp),$xt2
- movdqu 0x30($inp),$xt3
- pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
- pxor $xb0,$xt1
- pxor $xc0,$xt2
- pxor $xd0,$xt3
-
- movdqu $xt0,0x00($out)
- movdqu 0x40($inp),$xt0
- movdqu $xt1,0x10($out)
- movdqu 0x50($inp),$xt1
- movdqu $xt2,0x20($out)
- movdqu 0x60($inp),$xt2
- movdqu $xt3,0x30($out)
- movdqu 0x70($inp),$xt3
- pxor 0x10(%rsp),$xt0
- pxor $xb1,$xt1
- pxor $xc1,$xt2
- pxor $xd1,$xt3
- movdqu $xt0,0x40($out)
- movdqu $xt1,0x50($out)
- movdqu $xt2,0x60($out)
- movdqu $xt3,0x70($out)
- je .Ldone4x
-
- movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember?
- lea 0x80($inp),$inp # inp+=64*2
- xor %r9,%r9
- movdqa $xt0,0x00(%rsp)
- movdqa $xb2,0x10(%rsp)
- lea 0x80($out),$out # out+=64*2
- movdqa $xc2,0x20(%rsp)
- sub \$128,$len # len-=64*2
- movdqa $xd2,0x30(%rsp)
- jmp .Loop_tail4x
-
-.align 32
-.L192_or_more4x:
- movdqu 0x00($inp),$xt0 # xor with input
- movdqu 0x10($inp),$xt1
- movdqu 0x20($inp),$xt2
- movdqu 0x30($inp),$xt3
- pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
- pxor $xb0,$xt1
- pxor $xc0,$xt2
- pxor $xd0,$xt3
-
- movdqu $xt0,0x00($out)
- movdqu 0x40($inp),$xt0
- movdqu $xt1,0x10($out)
- movdqu 0x50($inp),$xt1
- movdqu $xt2,0x20($out)
- movdqu 0x60($inp),$xt2
- movdqu $xt3,0x30($out)
- movdqu 0x70($inp),$xt3
- lea 0x80($inp),$inp # size optimization
- pxor 0x10(%rsp),$xt0
- pxor $xb1,$xt1
- pxor $xc1,$xt2
- pxor $xd1,$xt3
-
- movdqu $xt0,0x40($out)
- movdqu 0x00($inp),$xt0
- movdqu $xt1,0x50($out)
- movdqu 0x10($inp),$xt1
- movdqu $xt2,0x60($out)
- movdqu 0x20($inp),$xt2
- movdqu $xt3,0x70($out)
- lea 0x80($out),$out # size optimization
- movdqu 0x30($inp),$xt3
- pxor 0x20(%rsp),$xt0
- pxor $xb2,$xt1
- pxor $xc2,$xt2
- pxor $xd2,$xt3
- movdqu $xt0,0x00($out)
- movdqu $xt1,0x10($out)
- movdqu $xt2,0x20($out)
- movdqu $xt3,0x30($out)
- je .Ldone4x
-
- movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember?
- lea 0x40($inp),$inp # inp+=64*3
- xor %r9,%r9
- movdqa $xt0,0x00(%rsp)
- movdqa $xb3,0x10(%rsp)
- lea 0x40($out),$out # out+=64*3
- movdqa $xc3,0x20(%rsp)
- sub \$192,$len # len-=64*3
- movdqa $xd3,0x30(%rsp)
-
-.Loop_tail4x:
- movzb ($inp,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1($out,%r9)
- dec $len
- jnz .Loop_tail4x
-
-.Ldone4x:
-___
-$code.=<<___ if ($win64);
- movaps -0xb0(%r10),%xmm6
- movaps -0xa0(%r10),%xmm7
- movaps -0x90(%r10),%xmm8
- movaps -0x80(%r10),%xmm9
- movaps -0x70(%r10),%xmm10
- movaps -0x60(%r10),%xmm11
- movaps -0x50(%r10),%xmm12
- movaps -0x40(%r10),%xmm13
- movaps -0x30(%r10),%xmm14
- movaps -0x20(%r10),%xmm15
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.L4x_epilogue:
- ret
-.cfi_endproc
-.size chacha20_4x,.-chacha20_4x
-___
-}
-if($kernel) {
- $code .= "#endif\n";
-}
-
-########################################################################
-# XOP code path that handles all lengths.
-if ($avx && !$kernel) {
-# There is some "anomaly" observed depending on instructions' size or
-# alignment. If you look closely at the code below you'll notice that
-# the argument order sometimes varies. The order affects instruction
-# encoding by making it larger, and such fiddling gives a 5% performance
-# improvement. This is on FX-4100...
-
-my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
- $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
-my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
-
-sub XOP_lane_ROUND {
-my ($a0,$b0,$c0,$d0)=@_;
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my @x=map("\"$_\"",@xx);
-
- (
- "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
- "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
- "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
- "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
- "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
- "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
- "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
- "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
- "&vprotd (@x[$d0],@x[$d0],16)",
- "&vprotd (@x[$d1],@x[$d1],16)",
- "&vprotd (@x[$d2],@x[$d2],16)",
- "&vprotd (@x[$d3],@x[$d3],16)",
-
- "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
- "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
- "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
- "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
- "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
- "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
- "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
- "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
- "&vprotd (@x[$b0],@x[$b0],12)",
- "&vprotd (@x[$b1],@x[$b1],12)",
- "&vprotd (@x[$b2],@x[$b2],12)",
- "&vprotd (@x[$b3],@x[$b3],12)",
-
- "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip
- "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip
- "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
- "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
- "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
- "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
- "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
- "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
- "&vprotd (@x[$d0],@x[$d0],8)",
- "&vprotd (@x[$d1],@x[$d1],8)",
- "&vprotd (@x[$d2],@x[$d2],8)",
- "&vprotd (@x[$d3],@x[$d3],8)",
-
- "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
- "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
- "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
- "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
- "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
- "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
- "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
- "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
- "&vprotd (@x[$b0],@x[$b0],7)",
- "&vprotd (@x[$b1],@x[$b1],7)",
- "&vprotd (@x[$b2],@x[$b2],7)",
- "&vprotd (@x[$b3],@x[$b3],7)"
- );
-}
-
-my $xframe = $win64 ? 0xa8 : 8;
-
-&declare_function("chacha20_xop", 32, 5);
-$code.=<<___;
-.cfi_startproc
-.Lchacha20_4xop:
- lea 8(%rsp),%r10 # frame pointer
-.cfi_def_cfa_register %r10
- sub \$0x140+$xframe,%rsp
- and \$-16,%rsp
-___
- ################ stack layout
- # +0x00 SIMD equivalent of @x[8-12]
- # ...
- # +0x40 constant copy of key[0-2] smashed by lanes
- # ...
- # +0x100 SIMD counters (with nonce smashed by lanes)
- # ...
- # +0x140
-$code.=<<___ if ($win64);
- movaps %xmm6,-0xb0(%r10)
- movaps %xmm7,-0xa0(%r10)
- movaps %xmm8,-0x90(%r10)
- movaps %xmm9,-0x80(%r10)
- movaps %xmm10,-0x70(%r10)
- movaps %xmm11,-0x60(%r10)
- movaps %xmm12,-0x50(%r10)
- movaps %xmm13,-0x40(%r10)
- movaps %xmm14,-0x30(%r10)
- movaps %xmm15,-0x20(%r10)
-.L4xop_body:
-___
-$code.=<<___;
- vzeroupper
-
- vmovdqa .Lsigma(%rip),$xa3 # key[0]
- vmovdqu ($key),$xb3 # key[1]
- vmovdqu 16($key),$xt3 # key[2]
- vmovdqu ($counter),$xd3 # key[3]
- lea 0x100(%rsp),%rcx # size optimization
-
- vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
- vpshufd \$0x55,$xa3,$xa1
- vmovdqa $xa0,0x40(%rsp) # ... and offload
- vpshufd \$0xaa,$xa3,$xa2
- vmovdqa $xa1,0x50(%rsp)
- vpshufd \$0xff,$xa3,$xa3
- vmovdqa $xa2,0x60(%rsp)
- vmovdqa $xa3,0x70(%rsp)
-
- vpshufd \$0x00,$xb3,$xb0
- vpshufd \$0x55,$xb3,$xb1
- vmovdqa $xb0,0x80-0x100(%rcx)
- vpshufd \$0xaa,$xb3,$xb2
- vmovdqa $xb1,0x90-0x100(%rcx)
- vpshufd \$0xff,$xb3,$xb3
- vmovdqa $xb2,0xa0-0x100(%rcx)
- vmovdqa $xb3,0xb0-0x100(%rcx)
-
- vpshufd \$0x00,$xt3,$xt0 # "$xc0"
- vpshufd \$0x55,$xt3,$xt1 # "$xc1"
- vmovdqa $xt0,0xc0-0x100(%rcx)
- vpshufd \$0xaa,$xt3,$xt2 # "$xc2"
- vmovdqa $xt1,0xd0-0x100(%rcx)
- vpshufd \$0xff,$xt3,$xt3 # "$xc3"
- vmovdqa $xt2,0xe0-0x100(%rcx)
- vmovdqa $xt3,0xf0-0x100(%rcx)
-
- vpshufd \$0x00,$xd3,$xd0
- vpshufd \$0x55,$xd3,$xd1
- vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet
- vpshufd \$0xaa,$xd3,$xd2
- vmovdqa $xd1,0x110-0x100(%rcx)
- vpshufd \$0xff,$xd3,$xd3
- vmovdqa $xd2,0x120-0x100(%rcx)
- vmovdqa $xd3,0x130-0x100(%rcx)
-
- jmp .Loop_enter4xop
-
-.align 32
-.Loop_outer4xop:
- vmovdqa 0x40(%rsp),$xa0 # re-load smashed key
- vmovdqa 0x50(%rsp),$xa1
- vmovdqa 0x60(%rsp),$xa2
- vmovdqa 0x70(%rsp),$xa3
- vmovdqa 0x80-0x100(%rcx),$xb0
- vmovdqa 0x90-0x100(%rcx),$xb1
- vmovdqa 0xa0-0x100(%rcx),$xb2
- vmovdqa 0xb0-0x100(%rcx),$xb3
- vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
- vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
- vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
- vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
- vmovdqa 0x100-0x100(%rcx),$xd0
- vmovdqa 0x110-0x100(%rcx),$xd1
- vmovdqa 0x120-0x100(%rcx),$xd2
- vmovdqa 0x130-0x100(%rcx),$xd3
- vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters
-
-.Loop_enter4xop:
- mov \$10,%eax
- vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
- jmp .Loop4xop
-
-.align 32
-.Loop4xop:
-___
- foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
- foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- dec %eax
- jnz .Loop4xop
-
- vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material
- vpaddd 0x50(%rsp),$xa1,$xa1
- vpaddd 0x60(%rsp),$xa2,$xa2
- vpaddd 0x70(%rsp),$xa3,$xa3
-
- vmovdqa $xt2,0x20(%rsp) # offload $xc2,3
- vmovdqa $xt3,0x30(%rsp)
-
- vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
- vpunpckldq $xa3,$xa2,$xt3
- vpunpckhdq $xa1,$xa0,$xa0
- vpunpckhdq $xa3,$xa2,$xa2
- vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
- vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
- vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
-___
- ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
-$code.=<<___;
- vpaddd 0x80-0x100(%rcx),$xb0,$xb0
- vpaddd 0x90-0x100(%rcx),$xb1,$xb1
- vpaddd 0xa0-0x100(%rcx),$xb2,$xb2
- vpaddd 0xb0-0x100(%rcx),$xb3,$xb3
-
- vmovdqa $xa0,0x00(%rsp) # offload $xa0,1
- vmovdqa $xa1,0x10(%rsp)
- vmovdqa 0x20(%rsp),$xa0 # "xc2"
- vmovdqa 0x30(%rsp),$xa1 # "xc3"
-
- vpunpckldq $xb1,$xb0,$xt2
- vpunpckldq $xb3,$xb2,$xt3
- vpunpckhdq $xb1,$xb0,$xb0
- vpunpckhdq $xb3,$xb2,$xb2
- vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
- vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
- vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
-___
- ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
- my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
-$code.=<<___;
- vpaddd 0xc0-0x100(%rcx),$xc0,$xc0
- vpaddd 0xd0-0x100(%rcx),$xc1,$xc1
- vpaddd 0xe0-0x100(%rcx),$xc2,$xc2
- vpaddd 0xf0-0x100(%rcx),$xc3,$xc3
-
- vpunpckldq $xc1,$xc0,$xt2
- vpunpckldq $xc3,$xc2,$xt3
- vpunpckhdq $xc1,$xc0,$xc0
- vpunpckhdq $xc3,$xc2,$xc2
- vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
- vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
- vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
-___
- ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
-$code.=<<___;
- vpaddd 0x100-0x100(%rcx),$xd0,$xd0
- vpaddd 0x110-0x100(%rcx),$xd1,$xd1
- vpaddd 0x120-0x100(%rcx),$xd2,$xd2
- vpaddd 0x130-0x100(%rcx),$xd3,$xd3
-
- vpunpckldq $xd1,$xd0,$xt2
- vpunpckldq $xd3,$xd2,$xt3
- vpunpckhdq $xd1,$xd0,$xd0
- vpunpckhdq $xd3,$xd2,$xd2
- vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
- vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
- vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
-___
- ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
- ($xa0,$xa1)=($xt2,$xt3);
-$code.=<<___;
- vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1
- vmovdqa 0x10(%rsp),$xa1
-
- cmp \$64*4,$len
- jb .Ltail4xop
-
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x10($inp),$xb0,$xb0
- vpxor 0x20($inp),$xc0,$xc0
- vpxor 0x30($inp),$xd0,$xd0
- vpxor 0x40($inp),$xa1,$xa1
- vpxor 0x50($inp),$xb1,$xb1
- vpxor 0x60($inp),$xc1,$xc1
- vpxor 0x70($inp),$xd1,$xd1
- lea 0x80($inp),$inp # size optimization
- vpxor 0x00($inp),$xa2,$xa2
- vpxor 0x10($inp),$xb2,$xb2
- vpxor 0x20($inp),$xc2,$xc2
- vpxor 0x30($inp),$xd2,$xd2
- vpxor 0x40($inp),$xa3,$xa3
- vpxor 0x50($inp),$xb3,$xb3
- vpxor 0x60($inp),$xc3,$xc3
- vpxor 0x70($inp),$xd3,$xd3
- lea 0x80($inp),$inp # inp+=64*4
-
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x10($out)
- vmovdqu $xc0,0x20($out)
- vmovdqu $xd0,0x30($out)
- vmovdqu $xa1,0x40($out)
- vmovdqu $xb1,0x50($out)
- vmovdqu $xc1,0x60($out)
- vmovdqu $xd1,0x70($out)
- lea 0x80($out),$out # size optimization
- vmovdqu $xa2,0x00($out)
- vmovdqu $xb2,0x10($out)
- vmovdqu $xc2,0x20($out)
- vmovdqu $xd2,0x30($out)
- vmovdqu $xa3,0x40($out)
- vmovdqu $xb3,0x50($out)
- vmovdqu $xc3,0x60($out)
- vmovdqu $xd3,0x70($out)
- lea 0x80($out),$out # out+=64*4
-
- sub \$64*4,$len
- jnz .Loop_outer4xop
-
- jmp .Ldone4xop
-
-.align 32
-.Ltail4xop:
- cmp \$192,$len
- jae .L192_or_more4xop
- cmp \$128,$len
- jae .L128_or_more4xop
- cmp \$64,$len
- jae .L64_or_more4xop
-
- xor %r9,%r9
- vmovdqa $xa0,0x00(%rsp)
- vmovdqa $xb0,0x10(%rsp)
- vmovdqa $xc0,0x20(%rsp)
- vmovdqa $xd0,0x30(%rsp)
- jmp .Loop_tail4xop
-
-.align 32
-.L64_or_more4xop:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x10($inp),$xb0,$xb0
- vpxor 0x20($inp),$xc0,$xc0
- vpxor 0x30($inp),$xd0,$xd0
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x10($out)
- vmovdqu $xc0,0x20($out)
- vmovdqu $xd0,0x30($out)
- je .Ldone4xop
-
- lea 0x40($inp),$inp # inp+=64*1
- vmovdqa $xa1,0x00(%rsp)
- xor %r9,%r9
- vmovdqa $xb1,0x10(%rsp)
- lea 0x40($out),$out # out+=64*1
- vmovdqa $xc1,0x20(%rsp)
- sub \$64,$len # len-=64*1
- vmovdqa $xd1,0x30(%rsp)
- jmp .Loop_tail4xop
-
-.align 32
-.L128_or_more4xop:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x10($inp),$xb0,$xb0
- vpxor 0x20($inp),$xc0,$xc0
- vpxor 0x30($inp),$xd0,$xd0
- vpxor 0x40($inp),$xa1,$xa1
- vpxor 0x50($inp),$xb1,$xb1
- vpxor 0x60($inp),$xc1,$xc1
- vpxor 0x70($inp),$xd1,$xd1
-
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x10($out)
- vmovdqu $xc0,0x20($out)
- vmovdqu $xd0,0x30($out)
- vmovdqu $xa1,0x40($out)
- vmovdqu $xb1,0x50($out)
- vmovdqu $xc1,0x60($out)
- vmovdqu $xd1,0x70($out)
- je .Ldone4xop
-
- lea 0x80($inp),$inp # inp+=64*2
- vmovdqa $xa2,0x00(%rsp)
- xor %r9,%r9
- vmovdqa $xb2,0x10(%rsp)
- lea 0x80($out),$out # out+=64*2
- vmovdqa $xc2,0x20(%rsp)
- sub \$128,$len # len-=64*2
- vmovdqa $xd2,0x30(%rsp)
- jmp .Loop_tail4xop
-
-.align 32
-.L192_or_more4xop:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x10($inp),$xb0,$xb0
- vpxor 0x20($inp),$xc0,$xc0
- vpxor 0x30($inp),$xd0,$xd0
- vpxor 0x40($inp),$xa1,$xa1
- vpxor 0x50($inp),$xb1,$xb1
- vpxor 0x60($inp),$xc1,$xc1
- vpxor 0x70($inp),$xd1,$xd1
- lea 0x80($inp),$inp # size optimization
- vpxor 0x00($inp),$xa2,$xa2
- vpxor 0x10($inp),$xb2,$xb2
- vpxor 0x20($inp),$xc2,$xc2
- vpxor 0x30($inp),$xd2,$xd2
-
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x10($out)
- vmovdqu $xc0,0x20($out)
- vmovdqu $xd0,0x30($out)
- vmovdqu $xa1,0x40($out)
- vmovdqu $xb1,0x50($out)
- vmovdqu $xc1,0x60($out)
- vmovdqu $xd1,0x70($out)
- lea 0x80($out),$out # size optimization
- vmovdqu $xa2,0x00($out)
- vmovdqu $xb2,0x10($out)
- vmovdqu $xc2,0x20($out)
- vmovdqu $xd2,0x30($out)
- je .Ldone4xop
-
- lea 0x40($inp),$inp # inp+=64*3
- vmovdqa $xa3,0x00(%rsp)
- xor %r9,%r9
- vmovdqa $xb3,0x10(%rsp)
- lea 0x40($out),$out # out+=64*3
- vmovdqa $xc3,0x20(%rsp)
- sub \$192,$len # len-=64*3
- vmovdqa $xd3,0x30(%rsp)
-
-.Loop_tail4xop:
- movzb ($inp,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1($out,%r9)
- dec $len
- jnz .Loop_tail4xop
-
-.Ldone4xop:
- vzeroupper
-___
-$code.=<<___ if ($win64);
- movaps -0xb0(%r10),%xmm6
- movaps -0xa0(%r10),%xmm7
- movaps -0x90(%r10),%xmm8
- movaps -0x80(%r10),%xmm9
- movaps -0x70(%r10),%xmm10
- movaps -0x60(%r10),%xmm11
- movaps -0x50(%r10),%xmm12
- movaps -0x40(%r10),%xmm13
- movaps -0x30(%r10),%xmm14
- movaps -0x20(%r10),%xmm15
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.L4xop_epilogue:
- ret
-.cfi_endproc
-___
-&end_function("chacha20_xop");
-}
-
-########################################################################
-# AVX2 code path
-if ($avx>1) {
-
-if($kernel) {
- $code .= "#ifdef CONFIG_AS_AVX2\n";
-}
-
-my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
- $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
-my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
-
-sub AVX2_lane_ROUND {
-my ($a0,$b0,$c0,$d0)=@_;
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
-my @x=map("\"$_\"",@xx);
-
- # Consider order in which variables are addressed by their
- # index:
- #
- # a b c d
- #
- # 0 4 8 12 < even round
- # 1 5 9 13
- # 2 6 10 14
- # 3 7 11 15
- # 0 5 10 15 < odd round
- # 1 6 11 12
- # 2 7 8 13
- # 3 4 9 14
- #
- # 'a', 'b' and 'd's are permanently allocated in registers,
- # @x[0..7,12..15], while 'c's are maintained in memory. If
-	# you observe the 'c' column, you'll notice that each pair of
-	# 'c's is invariant between rounds. This means that we have to
-	# reload them once per round, in the middle. This is why you'll
-	# see a bunch of 'c' stores and loads in the middle, but none
-	# at the beginning or end.
-
- (
- "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
- "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
- "&vpshufb (@x[$d0],@x[$d0],$t1)",
- "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
- "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
- "&vpshufb (@x[$d1],@x[$d1],$t1)",
-
- "&vpaddd ($xc,$xc,@x[$d0])",
- "&vpxor (@x[$b0],$xc,@x[$b0])",
- "&vpslld ($t0,@x[$b0],12)",
- "&vpsrld (@x[$b0],@x[$b0],20)",
- "&vpor (@x[$b0],$t0,@x[$b0])",
- "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
- "&vpaddd ($xc_,$xc_,@x[$d1])",
- "&vpxor (@x[$b1],$xc_,@x[$b1])",
- "&vpslld ($t1,@x[$b1],12)",
- "&vpsrld (@x[$b1],@x[$b1],20)",
- "&vpor (@x[$b1],$t1,@x[$b1])",
-
- "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
- "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
- "&vpshufb (@x[$d0],@x[$d0],$t0)",
- "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
- "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
- "&vpshufb (@x[$d1],@x[$d1],$t0)",
-
- "&vpaddd ($xc,$xc,@x[$d0])",
- "&vpxor (@x[$b0],$xc,@x[$b0])",
- "&vpslld ($t1,@x[$b0],7)",
- "&vpsrld (@x[$b0],@x[$b0],25)",
- "&vpor (@x[$b0],$t1,@x[$b0])",
- "&vbroadcasti128($t1,'(%r9)')", # .Lrot16(%rip)
- "&vpaddd ($xc_,$xc_,@x[$d1])",
- "&vpxor (@x[$b1],$xc_,@x[$b1])",
- "&vpslld ($t0,@x[$b1],7)",
- "&vpsrld (@x[$b1],@x[$b1],25)",
- "&vpor (@x[$b1],$t0,@x[$b1])",
-
- "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
- "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
- "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
- "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
-
- "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
- "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
- "&vpshufb (@x[$d2],@x[$d2],$t1)",
- "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
- "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
- "&vpshufb (@x[$d3],@x[$d3],$t1)",
-
- "&vpaddd ($xc,$xc,@x[$d2])",
- "&vpxor (@x[$b2],$xc,@x[$b2])",
- "&vpslld ($t0,@x[$b2],12)",
- "&vpsrld (@x[$b2],@x[$b2],20)",
- "&vpor (@x[$b2],$t0,@x[$b2])",
- "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
- "&vpaddd ($xc_,$xc_,@x[$d3])",
- "&vpxor (@x[$b3],$xc_,@x[$b3])",
- "&vpslld ($t1,@x[$b3],12)",
- "&vpsrld (@x[$b3],@x[$b3],20)",
- "&vpor (@x[$b3],$t1,@x[$b3])",
-
- "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
- "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
- "&vpshufb (@x[$d2],@x[$d2],$t0)",
- "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
- "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
- "&vpshufb (@x[$d3],@x[$d3],$t0)",
-
- "&vpaddd ($xc,$xc,@x[$d2])",
- "&vpxor (@x[$b2],$xc,@x[$b2])",
- "&vpslld ($t1,@x[$b2],7)",
- "&vpsrld (@x[$b2],@x[$b2],25)",
- "&vpor (@x[$b2],$t1,@x[$b2])",
- "&vbroadcasti128($t1,'(%r9)')", # .Lrot16(%rip)
- "&vpaddd ($xc_,$xc_,@x[$d3])",
- "&vpxor (@x[$b3],$xc_,@x[$b3])",
- "&vpslld ($t0,@x[$b3],7)",
- "&vpsrld (@x[$b3],@x[$b3],25)",
- "&vpor (@x[$b3],$t0,@x[$b3])"
- );
-}
-
-my $xframe = $win64 ? 0xa8 : 8;
-
-&declare_function("chacha20_avx2", 32, 5);
-$code.=<<___;
-.cfi_startproc
-.Lchacha20_8x:
- lea 8(%rsp),%r10 # frame register
-.cfi_def_cfa_register %r10
- sub \$0x280+$xframe,%rsp
- and \$-32,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0xb0(%r10)
- movaps %xmm7,-0xa0(%r10)
- movaps %xmm8,-0x90(%r10)
- movaps %xmm9,-0x80(%r10)
- movaps %xmm10,-0x70(%r10)
- movaps %xmm11,-0x60(%r10)
- movaps %xmm12,-0x50(%r10)
- movaps %xmm13,-0x40(%r10)
- movaps %xmm14,-0x30(%r10)
- movaps %xmm15,-0x20(%r10)
-.L8x_body:
-___
-$code.=<<___;
- vzeroupper
-
- ################ stack layout
- # +0x00 SIMD equivalent of @x[8-12]
- # ...
- # +0x80 constant copy of key[0-2] smashed by lanes
- # ...
- # +0x200 SIMD counters (with nonce smashed by lanes)
- # ...
- # +0x280
-
- vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
- vbroadcasti128 ($key),$xb3 # key[1]
- vbroadcasti128 16($key),$xt3 # key[2]
- vbroadcasti128 ($counter),$xd3 # key[3]
- lea 0x100(%rsp),%rcx # size optimization
- lea 0x200(%rsp),%rax # size optimization
- lea .Lrot16(%rip),%r9
- lea .Lrot24(%rip),%r11
-
- vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
- vpshufd \$0x55,$xa3,$xa1
- vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload
- vpshufd \$0xaa,$xa3,$xa2
- vmovdqa $xa1,0xa0-0x100(%rcx)
- vpshufd \$0xff,$xa3,$xa3
- vmovdqa $xa2,0xc0-0x100(%rcx)
- vmovdqa $xa3,0xe0-0x100(%rcx)
-
- vpshufd \$0x00,$xb3,$xb0
- vpshufd \$0x55,$xb3,$xb1
- vmovdqa $xb0,0x100-0x100(%rcx)
- vpshufd \$0xaa,$xb3,$xb2
- vmovdqa $xb1,0x120-0x100(%rcx)
- vpshufd \$0xff,$xb3,$xb3
- vmovdqa $xb2,0x140-0x100(%rcx)
- vmovdqa $xb3,0x160-0x100(%rcx)
-
- vpshufd \$0x00,$xt3,$xt0 # "xc0"
- vpshufd \$0x55,$xt3,$xt1 # "xc1"
- vmovdqa $xt0,0x180-0x200(%rax)
- vpshufd \$0xaa,$xt3,$xt2 # "xc2"
- vmovdqa $xt1,0x1a0-0x200(%rax)
- vpshufd \$0xff,$xt3,$xt3 # "xc3"
- vmovdqa $xt2,0x1c0-0x200(%rax)
- vmovdqa $xt3,0x1e0-0x200(%rax)
-
- vpshufd \$0x00,$xd3,$xd0
- vpshufd \$0x55,$xd3,$xd1
- vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
- vpshufd \$0xaa,$xd3,$xd2
- vmovdqa $xd1,0x220-0x200(%rax)
- vpshufd \$0xff,$xd3,$xd3
- vmovdqa $xd2,0x240-0x200(%rax)
- vmovdqa $xd3,0x260-0x200(%rax)
-
- jmp .Loop_enter8x
-
-.align 32
-.Loop_outer8x:
- vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key
- vmovdqa 0xa0-0x100(%rcx),$xa1
- vmovdqa 0xc0-0x100(%rcx),$xa2
- vmovdqa 0xe0-0x100(%rcx),$xa3
- vmovdqa 0x100-0x100(%rcx),$xb0
- vmovdqa 0x120-0x100(%rcx),$xb1
- vmovdqa 0x140-0x100(%rcx),$xb2
- vmovdqa 0x160-0x100(%rcx),$xb3
- vmovdqa 0x180-0x200(%rax),$xt0 # "xc0"
- vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1"
- vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2"
- vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3"
- vmovdqa 0x200-0x200(%rax),$xd0
- vmovdqa 0x220-0x200(%rax),$xd1
- vmovdqa 0x240-0x200(%rax),$xd2
- vmovdqa 0x260-0x200(%rax),$xd3
- vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters
-
-.Loop_enter8x:
- vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
- vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
- vbroadcasti128 (%r9),$xt3
- vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
- mov \$10,%eax
- jmp .Loop8x
-
-.align 32
-.Loop8x:
-___
- foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
- foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- dec %eax
- jnz .Loop8x
-
- lea 0x200(%rsp),%rax # size optimization
- vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key
- vpaddd 0xa0-0x100(%rcx),$xa1,$xa1
- vpaddd 0xc0-0x100(%rcx),$xa2,$xa2
- vpaddd 0xe0-0x100(%rcx),$xa3,$xa3
-
- vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
- vpunpckldq $xa3,$xa2,$xt3
- vpunpckhdq $xa1,$xa0,$xa0
- vpunpckhdq $xa3,$xa2,$xa2
- vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
- vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
- vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
-___
- ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
-$code.=<<___;
- vpaddd 0x100-0x100(%rcx),$xb0,$xb0
- vpaddd 0x120-0x100(%rcx),$xb1,$xb1
- vpaddd 0x140-0x100(%rcx),$xb2,$xb2
- vpaddd 0x160-0x100(%rcx),$xb3,$xb3
-
- vpunpckldq $xb1,$xb0,$xt2
- vpunpckldq $xb3,$xb2,$xt3
- vpunpckhdq $xb1,$xb0,$xb0
- vpunpckhdq $xb3,$xb2,$xb2
- vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
- vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
- vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
-___
- ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
-$code.=<<___;
- vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further
- vperm2i128 \$0x31,$xb0,$xa0,$xb0
- vperm2i128 \$0x20,$xb1,$xa1,$xa0
- vperm2i128 \$0x31,$xb1,$xa1,$xb1
- vperm2i128 \$0x20,$xb2,$xa2,$xa1
- vperm2i128 \$0x31,$xb2,$xa2,$xb2
- vperm2i128 \$0x20,$xb3,$xa3,$xa2
- vperm2i128 \$0x31,$xb3,$xa3,$xb3
-___
- ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
- my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
-$code.=<<___;
- vmovdqa $xa0,0x00(%rsp) # offload $xaN
- vmovdqa $xa1,0x20(%rsp)
- vmovdqa 0x40(%rsp),$xc2 # $xa0
- vmovdqa 0x60(%rsp),$xc3 # $xa1
-
- vpaddd 0x180-0x200(%rax),$xc0,$xc0
- vpaddd 0x1a0-0x200(%rax),$xc1,$xc1
- vpaddd 0x1c0-0x200(%rax),$xc2,$xc2
- vpaddd 0x1e0-0x200(%rax),$xc3,$xc3
-
- vpunpckldq $xc1,$xc0,$xt2
- vpunpckldq $xc3,$xc2,$xt3
- vpunpckhdq $xc1,$xc0,$xc0
- vpunpckhdq $xc3,$xc2,$xc2
- vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
- vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
- vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
-___
- ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
-$code.=<<___;
- vpaddd 0x200-0x200(%rax),$xd0,$xd0
- vpaddd 0x220-0x200(%rax),$xd1,$xd1
- vpaddd 0x240-0x200(%rax),$xd2,$xd2
- vpaddd 0x260-0x200(%rax),$xd3,$xd3
-
- vpunpckldq $xd1,$xd0,$xt2
- vpunpckldq $xd3,$xd2,$xt3
- vpunpckhdq $xd1,$xd0,$xd0
- vpunpckhdq $xd3,$xd2,$xd2
- vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
- vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
- vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
-___
- ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
-$code.=<<___;
- vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
- vperm2i128 \$0x31,$xd0,$xc0,$xd0
- vperm2i128 \$0x20,$xd1,$xc1,$xc0
- vperm2i128 \$0x31,$xd1,$xc1,$xd1
- vperm2i128 \$0x20,$xd2,$xc2,$xc1
- vperm2i128 \$0x31,$xd2,$xc2,$xd2
- vperm2i128 \$0x20,$xd3,$xc3,$xc2
- vperm2i128 \$0x31,$xd3,$xc3,$xd3
-___
- ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
- ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
- ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
- ($xa0,$xa1)=($xt2,$xt3);
-$code.=<<___;
- vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember?
- vmovdqa 0x20(%rsp),$xa1
-
- cmp \$64*8,$len
- jb .Ltail8x
-
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- lea 0x80($inp),$inp # size optimization
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- lea 0x80($out),$out # size optimization
-
- vpxor 0x00($inp),$xa1,$xa1
- vpxor 0x20($inp),$xb1,$xb1
- vpxor 0x40($inp),$xc1,$xc1
- vpxor 0x60($inp),$xd1,$xd1
- lea 0x80($inp),$inp # size optimization
- vmovdqu $xa1,0x00($out)
- vmovdqu $xb1,0x20($out)
- vmovdqu $xc1,0x40($out)
- vmovdqu $xd1,0x60($out)
- lea 0x80($out),$out # size optimization
-
- vpxor 0x00($inp),$xa2,$xa2
- vpxor 0x20($inp),$xb2,$xb2
- vpxor 0x40($inp),$xc2,$xc2
- vpxor 0x60($inp),$xd2,$xd2
- lea 0x80($inp),$inp # size optimization
- vmovdqu $xa2,0x00($out)
- vmovdqu $xb2,0x20($out)
- vmovdqu $xc2,0x40($out)
- vmovdqu $xd2,0x60($out)
- lea 0x80($out),$out # size optimization
-
- vpxor 0x00($inp),$xa3,$xa3
- vpxor 0x20($inp),$xb3,$xb3
- vpxor 0x40($inp),$xc3,$xc3
- vpxor 0x60($inp),$xd3,$xd3
- lea 0x80($inp),$inp # size optimization
- vmovdqu $xa3,0x00($out)
- vmovdqu $xb3,0x20($out)
- vmovdqu $xc3,0x40($out)
- vmovdqu $xd3,0x60($out)
- lea 0x80($out),$out # size optimization
-
- sub \$64*8,$len
- jnz .Loop_outer8x
-
- jmp .Ldone8x
-
-.Ltail8x:
- cmp \$448,$len
- jae .L448_or_more8x
- cmp \$384,$len
- jae .L384_or_more8x
- cmp \$320,$len
- jae .L320_or_more8x
- cmp \$256,$len
- jae .L256_or_more8x
- cmp \$192,$len
- jae .L192_or_more8x
- cmp \$128,$len
- jae .L128_or_more8x
- cmp \$64,$len
- jae .L64_or_more8x
-
- xor %r9,%r9
- vmovdqa $xa0,0x00(%rsp)
- vmovdqa $xb0,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L64_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- je .Ldone8x
-
- lea 0x40($inp),$inp # inp+=64*1
- xor %r9,%r9
- vmovdqa $xc0,0x00(%rsp)
- lea 0x40($out),$out # out+=64*1
- sub \$64,$len # len-=64*1
- vmovdqa $xd0,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L128_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- je .Ldone8x
-
- lea 0x80($inp),$inp # inp+=64*2
- xor %r9,%r9
- vmovdqa $xa1,0x00(%rsp)
- lea 0x80($out),$out # out+=64*2
- sub \$128,$len # len-=64*2
- vmovdqa $xb1,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L192_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- vpxor 0x80($inp),$xa1,$xa1
- vpxor 0xa0($inp),$xb1,$xb1
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- vmovdqu $xa1,0x80($out)
- vmovdqu $xb1,0xa0($out)
- je .Ldone8x
-
- lea 0xc0($inp),$inp # inp+=64*3
- xor %r9,%r9
- vmovdqa $xc1,0x00(%rsp)
- lea 0xc0($out),$out # out+=64*3
- sub \$192,$len # len-=64*3
- vmovdqa $xd1,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L256_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- vpxor 0x80($inp),$xa1,$xa1
- vpxor 0xa0($inp),$xb1,$xb1
- vpxor 0xc0($inp),$xc1,$xc1
- vpxor 0xe0($inp),$xd1,$xd1
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- vmovdqu $xa1,0x80($out)
- vmovdqu $xb1,0xa0($out)
- vmovdqu $xc1,0xc0($out)
- vmovdqu $xd1,0xe0($out)
- je .Ldone8x
-
- lea 0x100($inp),$inp # inp+=64*4
- xor %r9,%r9
- vmovdqa $xa2,0x00(%rsp)
- lea 0x100($out),$out # out+=64*4
- sub \$256,$len # len-=64*4
- vmovdqa $xb2,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L320_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- vpxor 0x80($inp),$xa1,$xa1
- vpxor 0xa0($inp),$xb1,$xb1
- vpxor 0xc0($inp),$xc1,$xc1
- vpxor 0xe0($inp),$xd1,$xd1
- vpxor 0x100($inp),$xa2,$xa2
- vpxor 0x120($inp),$xb2,$xb2
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- vmovdqu $xa1,0x80($out)
- vmovdqu $xb1,0xa0($out)
- vmovdqu $xc1,0xc0($out)
- vmovdqu $xd1,0xe0($out)
- vmovdqu $xa2,0x100($out)
- vmovdqu $xb2,0x120($out)
- je .Ldone8x
-
- lea 0x140($inp),$inp # inp+=64*5
- xor %r9,%r9
- vmovdqa $xc2,0x00(%rsp)
- lea 0x140($out),$out # out+=64*5
- sub \$320,$len # len-=64*5
- vmovdqa $xd2,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L384_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- vpxor 0x80($inp),$xa1,$xa1
- vpxor 0xa0($inp),$xb1,$xb1
- vpxor 0xc0($inp),$xc1,$xc1
- vpxor 0xe0($inp),$xd1,$xd1
- vpxor 0x100($inp),$xa2,$xa2
- vpxor 0x120($inp),$xb2,$xb2
- vpxor 0x140($inp),$xc2,$xc2
- vpxor 0x160($inp),$xd2,$xd2
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- vmovdqu $xa1,0x80($out)
- vmovdqu $xb1,0xa0($out)
- vmovdqu $xc1,0xc0($out)
- vmovdqu $xd1,0xe0($out)
- vmovdqu $xa2,0x100($out)
- vmovdqu $xb2,0x120($out)
- vmovdqu $xc2,0x140($out)
- vmovdqu $xd2,0x160($out)
- je .Ldone8x
-
- lea 0x180($inp),$inp # inp+=64*6
- xor %r9,%r9
- vmovdqa $xa3,0x00(%rsp)
- lea 0x180($out),$out # out+=64*6
- sub \$384,$len # len-=64*6
- vmovdqa $xb3,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L448_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- vpxor 0x80($inp),$xa1,$xa1
- vpxor 0xa0($inp),$xb1,$xb1
- vpxor 0xc0($inp),$xc1,$xc1
- vpxor 0xe0($inp),$xd1,$xd1
- vpxor 0x100($inp),$xa2,$xa2
- vpxor 0x120($inp),$xb2,$xb2
- vpxor 0x140($inp),$xc2,$xc2
- vpxor 0x160($inp),$xd2,$xd2
- vpxor 0x180($inp),$xa3,$xa3
- vpxor 0x1a0($inp),$xb3,$xb3
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- vmovdqu $xa1,0x80($out)
- vmovdqu $xb1,0xa0($out)
- vmovdqu $xc1,0xc0($out)
- vmovdqu $xd1,0xe0($out)
- vmovdqu $xa2,0x100($out)
- vmovdqu $xb2,0x120($out)
- vmovdqu $xc2,0x140($out)
- vmovdqu $xd2,0x160($out)
- vmovdqu $xa3,0x180($out)
- vmovdqu $xb3,0x1a0($out)
- je .Ldone8x
-
- lea 0x1c0($inp),$inp # inp+=64*7
- xor %r9,%r9
- vmovdqa $xc3,0x00(%rsp)
- lea 0x1c0($out),$out # out+=64*7
- sub \$448,$len # len-=64*7
- vmovdqa $xd3,0x20(%rsp)
-
-.Loop_tail8x:
- movzb ($inp,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1($out,%r9)
- dec $len
- jnz .Loop_tail8x
-
-.Ldone8x:
- vzeroall
-___
-$code.=<<___ if ($win64);
- movaps -0xb0(%r10),%xmm6
- movaps -0xa0(%r10),%xmm7
- movaps -0x90(%r10),%xmm8
- movaps -0x80(%r10),%xmm9
- movaps -0x70(%r10),%xmm10
- movaps -0x60(%r10),%xmm11
- movaps -0x50(%r10),%xmm12
- movaps -0x40(%r10),%xmm13
- movaps -0x30(%r10),%xmm14
- movaps -0x20(%r10),%xmm15
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.L8x_epilogue:
- ret
-.cfi_endproc
-___
-&end_function("chacha20_avx2");
-if($kernel) {
- $code .= "#endif\n";
-}
-}
-
-########################################################################
-# AVX512 code paths
-if ($avx>2) {
-# This one handles shorter inputs...
-if($kernel) {
- $code .= "#ifdef CONFIG_AS_AVX512\n";
-}
-
-my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
-my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
-
-sub vpxord() # size optimization
-{ my $opcode = "vpxor"; # adhere to vpxor when possible
-
- foreach (@_) {
- if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) {
- $opcode = "vpxord";
- last;
- }
- }
-
- $code .= "\t$opcode\t".join(',',reverse @_)."\n";
-}
-
-sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round
- &vpaddd ($a,$a,$b);
- &vpxord ($d,$d,$a);
- &vprold ($d,$d,16);
-
- &vpaddd ($c,$c,$d);
- &vpxord ($b,$b,$c);
- &vprold ($b,$b,12);
-
- &vpaddd ($a,$a,$b);
- &vpxord ($d,$d,$a);
- &vprold ($d,$d,8);
-
- &vpaddd ($c,$c,$d);
- &vpxord ($b,$b,$c);
- &vprold ($b,$b,7);
-}
-
-my $xframe = $win64 ? 32+8 : 8;
-
-&declare_function("chacha20_avx512", 32, 5);
-$code.=<<___;
-.cfi_startproc
-.Lchacha20_avx512:
- lea 8(%rsp),%r10 # frame pointer
-.cfi_def_cfa_register %r10
- cmp \$512,$len
- ja .Lchacha20_16x
-
- sub \$64+$xframe,%rsp
- and \$-64,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0x30(%r10)
- movaps %xmm7,-0x20(%r10)
-.Lavx512_body:
-___
-$code.=<<___;
- vbroadcasti32x4 .Lsigma(%rip),$a
- vbroadcasti32x4 ($key),$b
- vbroadcasti32x4 16($key),$c
- vbroadcasti32x4 ($counter),$d
-
- vmovdqa32 $a,$a_
- vmovdqa32 $b,$b_
- vmovdqa32 $c,$c_
- vpaddd .Lzeroz(%rip),$d,$d
- vmovdqa32 .Lfourz(%rip),$fourz
- mov \$10,$counter # reuse $counter
- vmovdqa32 $d,$d_
- jmp .Loop_avx512
-
-.align 16
-.Loop_outer_avx512:
- vmovdqa32 $a_,$a
- vmovdqa32 $b_,$b
- vmovdqa32 $c_,$c
- vpaddd $fourz,$d_,$d
- mov \$10,$counter
- vmovdqa32 $d,$d_
- jmp .Loop_avx512
-
-.align 32
-.Loop_avx512:
-___
- &AVX512ROUND();
- &vpshufd ($c,$c,0b01001110);
- &vpshufd ($b,$b,0b00111001);
- &vpshufd ($d,$d,0b10010011);
-
- &AVX512ROUND();
- &vpshufd ($c,$c,0b01001110);
- &vpshufd ($b,$b,0b10010011);
- &vpshufd ($d,$d,0b00111001);
-
- &dec ($counter);
- &jnz (".Loop_avx512");
-
-$code.=<<___;
- vpaddd $a_,$a,$a
- vpaddd $b_,$b,$b
- vpaddd $c_,$c,$c
- vpaddd $d_,$d,$d
-
- sub \$64,$len
- jb .Ltail64_avx512
-
- vpxor 0x00($inp),%x#$a,$t0 # xor with input
- vpxor 0x10($inp),%x#$b,$t1
- vpxor 0x20($inp),%x#$c,$t2
- vpxor 0x30($inp),%x#$d,$t3
- lea 0x40($inp),$inp # inp+=64
-
- vmovdqu $t0,0x00($out) # write output
- vmovdqu $t1,0x10($out)
- vmovdqu $t2,0x20($out)
- vmovdqu $t3,0x30($out)
- lea 0x40($out),$out # out+=64
-
- jz .Ldone_avx512
-
- vextracti32x4 \$1,$a,$t0
- vextracti32x4 \$1,$b,$t1
- vextracti32x4 \$1,$c,$t2
- vextracti32x4 \$1,$d,$t3
-
- sub \$64,$len
- jb .Ltail_avx512
-
- vpxor 0x00($inp),$t0,$t0 # xor with input
- vpxor 0x10($inp),$t1,$t1
- vpxor 0x20($inp),$t2,$t2
- vpxor 0x30($inp),$t3,$t3
- lea 0x40($inp),$inp # inp+=64
-
- vmovdqu $t0,0x00($out) # write output
- vmovdqu $t1,0x10($out)
- vmovdqu $t2,0x20($out)
- vmovdqu $t3,0x30($out)
- lea 0x40($out),$out # out+=64
-
- jz .Ldone_avx512
-
- vextracti32x4 \$2,$a,$t0
- vextracti32x4 \$2,$b,$t1
- vextracti32x4 \$2,$c,$t2
- vextracti32x4 \$2,$d,$t3
-
- sub \$64,$len
- jb .Ltail_avx512
-
- vpxor 0x00($inp),$t0,$t0 # xor with input
- vpxor 0x10($inp),$t1,$t1
- vpxor 0x20($inp),$t2,$t2
- vpxor 0x30($inp),$t3,$t3
- lea 0x40($inp),$inp # inp+=64
-
- vmovdqu $t0,0x00($out) # write output
- vmovdqu $t1,0x10($out)
- vmovdqu $t2,0x20($out)
- vmovdqu $t3,0x30($out)
- lea 0x40($out),$out # out+=64
-
- jz .Ldone_avx512
-
- vextracti32x4 \$3,$a,$t0
- vextracti32x4 \$3,$b,$t1
- vextracti32x4 \$3,$c,$t2
- vextracti32x4 \$3,$d,$t3
-
- sub \$64,$len
- jb .Ltail_avx512
-
- vpxor 0x00($inp),$t0,$t0 # xor with input
- vpxor 0x10($inp),$t1,$t1
- vpxor 0x20($inp),$t2,$t2
- vpxor 0x30($inp),$t3,$t3
- lea 0x40($inp),$inp # inp+=64
-
- vmovdqu $t0,0x00($out) # write output
- vmovdqu $t1,0x10($out)
- vmovdqu $t2,0x20($out)
- vmovdqu $t3,0x30($out)
- lea 0x40($out),$out # out+=64
-
- jnz .Loop_outer_avx512
-
- jmp .Ldone_avx512
-
-.align 16
-.Ltail64_avx512:
- vmovdqa %x#$a,0x00(%rsp)
- vmovdqa %x#$b,0x10(%rsp)
- vmovdqa %x#$c,0x20(%rsp)
- vmovdqa %x#$d,0x30(%rsp)
- add \$64,$len
- jmp .Loop_tail_avx512
-
-.align 16
-.Ltail_avx512:
- vmovdqa $t0,0x00(%rsp)
- vmovdqa $t1,0x10(%rsp)
- vmovdqa $t2,0x20(%rsp)
- vmovdqa $t3,0x30(%rsp)
- add \$64,$len
-
-.Loop_tail_avx512:
- movzb ($inp,$counter),%eax
- movzb (%rsp,$counter),%ecx
- lea 1($counter),$counter
- xor %ecx,%eax
- mov %al,-1($out,$counter)
- dec $len
- jnz .Loop_tail_avx512
-
- vmovdqu32 $a_,0x00(%rsp)
-
-.Ldone_avx512:
- vzeroall
-___
-$code.=<<___ if ($win64);
- movaps -0x30(%r10),%xmm6
- movaps -0x20(%r10),%xmm7
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.Lavx512_epilogue:
- ret
-.cfi_endproc
-___
-&end_function("chacha20_avx512");
-
-map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz);
-
-&declare_function("chacha20_avx512vl", 32, 5);
-$code.=<<___;
-.cfi_startproc
-.Lchacha20_avx512vl:
- lea 8(%rsp),%r10 # frame pointer
-.cfi_def_cfa_register %r10
- cmp \$128,$len
- ja .Lchacha20_8xvl
-
- sub \$64+$xframe,%rsp
- and \$-32,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0x30(%r10)
- movaps %xmm7,-0x20(%r10)
-.Lavx512vl_body:
-___
-$code.=<<___;
- vbroadcasti128 .Lsigma(%rip),$a
- vbroadcasti128 ($key),$b
- vbroadcasti128 16($key),$c
- vbroadcasti128 ($counter),$d
-
- vmovdqa32 $a,$a_
- vmovdqa32 $b,$b_
- vmovdqa32 $c,$c_
- vpaddd .Lzeroz(%rip),$d,$d
- vmovdqa32 .Ltwoy(%rip),$fourz
- mov \$10,$counter # reuse $counter
- vmovdqa32 $d,$d_
- jmp .Loop_avx512vl
-
-.align 16
-.Loop_outer_avx512vl:
- vmovdqa32 $c_,$c
- vpaddd $fourz,$d_,$d
- mov \$10,$counter
- vmovdqa32 $d,$d_
- jmp .Loop_avx512vl
-
-.align 32
-.Loop_avx512vl:
-___
- &AVX512ROUND();
- &vpshufd ($c,$c,0b01001110);
- &vpshufd ($b,$b,0b00111001);
- &vpshufd ($d,$d,0b10010011);
-
- &AVX512ROUND();
- &vpshufd ($c,$c,0b01001110);
- &vpshufd ($b,$b,0b10010011);
- &vpshufd ($d,$d,0b00111001);
-
- &dec ($counter);
- &jnz (".Loop_avx512vl");
-
-$code.=<<___;
- vpaddd $a_,$a,$a
- vpaddd $b_,$b,$b
- vpaddd $c_,$c,$c
- vpaddd $d_,$d,$d
-
- sub \$64,$len
- jb .Ltail64_avx512vl
-
- vpxor 0x00($inp),%x#$a,$t0 # xor with input
- vpxor 0x10($inp),%x#$b,$t1
- vpxor 0x20($inp),%x#$c,$t2
- vpxor 0x30($inp),%x#$d,$t3
- lea 0x40($inp),$inp # inp+=64
-
- vmovdqu $t0,0x00($out) # write output
- vmovdqu $t1,0x10($out)
- vmovdqu $t2,0x20($out)
- vmovdqu $t3,0x30($out)
- lea 0x40($out),$out # out+=64
-
- jz .Ldone_avx512vl
-
- vextracti128 \$1,$a,$t0
- vextracti128 \$1,$b,$t1
- vextracti128 \$1,$c,$t2
- vextracti128 \$1,$d,$t3
-
- sub \$64,$len
- jb .Ltail_avx512vl
-
- vpxor 0x00($inp),$t0,$t0 # xor with input
- vpxor 0x10($inp),$t1,$t1
- vpxor 0x20($inp),$t2,$t2
- vpxor 0x30($inp),$t3,$t3
- lea 0x40($inp),$inp # inp+=64
-
- vmovdqu $t0,0x00($out) # write output
- vmovdqu $t1,0x10($out)
- vmovdqu $t2,0x20($out)
- vmovdqu $t3,0x30($out)
- lea 0x40($out),$out # out+=64
-
- vmovdqa32 $a_,$a
- vmovdqa32 $b_,$b
- jnz .Loop_outer_avx512vl
-
- jmp .Ldone_avx512vl
-
-.align 16
-.Ltail64_avx512vl:
- vmovdqa %x#$a,0x00(%rsp)
- vmovdqa %x#$b,0x10(%rsp)
- vmovdqa %x#$c,0x20(%rsp)
- vmovdqa %x#$d,0x30(%rsp)
- add \$64,$len
- jmp .Loop_tail_avx512vl
-
-.align 16
-.Ltail_avx512vl:
- vmovdqa $t0,0x00(%rsp)
- vmovdqa $t1,0x10(%rsp)
- vmovdqa $t2,0x20(%rsp)
- vmovdqa $t3,0x30(%rsp)
- add \$64,$len
-
-.Loop_tail_avx512vl:
- movzb ($inp,$counter),%eax
- movzb (%rsp,$counter),%ecx
- lea 1($counter),$counter
- xor %ecx,%eax
- mov %al,-1($out,$counter)
- dec $len
- jnz .Loop_tail_avx512vl
-
- vmovdqu32 $a_,0x00(%rsp)
- vmovdqu32 $a_,0x20(%rsp)
-
-.Ldone_avx512vl:
- vzeroall
-___
-$code.=<<___ if ($win64);
- movaps -0x30(%r10),%xmm6
- movaps -0x20(%r10),%xmm7
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.Lavx512vl_epilogue:
- ret
-.cfi_endproc
-___
-&end_function("chacha20_avx512vl");
-
-# This one handles longer inputs...
-
-my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
-my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
-my @key=map("%zmm$_",(16..31));
-my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
-
-sub AVX512_lane_ROUND {
-my ($a0,$b0,$c0,$d0)=@_;
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my @x=map("\"$_\"",@xx);
-
- (
- "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
- "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
- "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
- "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
- "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
- "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
- "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
- "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
- "&vprold (@x[$d0],@x[$d0],16)",
- "&vprold (@x[$d1],@x[$d1],16)",
- "&vprold (@x[$d2],@x[$d2],16)",
- "&vprold (@x[$d3],@x[$d3],16)",
-
- "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
- "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
- "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
- "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
- "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
- "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
- "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
- "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
- "&vprold (@x[$b0],@x[$b0],12)",
- "&vprold (@x[$b1],@x[$b1],12)",
- "&vprold (@x[$b2],@x[$b2],12)",
- "&vprold (@x[$b3],@x[$b3],12)",
-
- "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
- "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
- "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
- "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
- "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
- "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
- "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
- "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
- "&vprold (@x[$d0],@x[$d0],8)",
- "&vprold (@x[$d1],@x[$d1],8)",
- "&vprold (@x[$d2],@x[$d2],8)",
- "&vprold (@x[$d3],@x[$d3],8)",
-
- "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
- "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
- "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
- "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
- "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
- "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
- "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
- "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
- "&vprold (@x[$b0],@x[$b0],7)",
- "&vprold (@x[$b1],@x[$b1],7)",
- "&vprold (@x[$b2],@x[$b2],7)",
- "&vprold (@x[$b3],@x[$b3],7)"
- );
-}
-
-my $xframe = $win64 ? 0xa8 : 8;
-
-$code.=<<___;
-.type chacha20_16x,\@function,5
-.align 32
-chacha20_16x:
-.cfi_startproc
-.Lchacha20_16x:
- lea 8(%rsp),%r10 # frame register
-.cfi_def_cfa_register %r10
- sub \$64+$xframe,%rsp
- and \$-64,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0xb0(%r10)
- movaps %xmm7,-0xa0(%r10)
- movaps %xmm8,-0x90(%r10)
- movaps %xmm9,-0x80(%r10)
- movaps %xmm10,-0x70(%r10)
- movaps %xmm11,-0x60(%r10)
- movaps %xmm12,-0x50(%r10)
- movaps %xmm13,-0x40(%r10)
- movaps %xmm14,-0x30(%r10)
- movaps %xmm15,-0x20(%r10)
-.L16x_body:
-___
-$code.=<<___;
- vzeroupper
-
- lea .Lsigma(%rip),%r9
- vbroadcasti32x4 (%r9),$xa3 # key[0]
- vbroadcasti32x4 ($key),$xb3 # key[1]
- vbroadcasti32x4 16($key),$xc3 # key[2]
- vbroadcasti32x4 ($counter),$xd3 # key[3]
-
- vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
- vpshufd \$0x55,$xa3,$xa1
- vpshufd \$0xaa,$xa3,$xa2
- vpshufd \$0xff,$xa3,$xa3
- vmovdqa64 $xa0,@key[0]
- vmovdqa64 $xa1,@key[1]
- vmovdqa64 $xa2,@key[2]
- vmovdqa64 $xa3,@key[3]
-
- vpshufd \$0x00,$xb3,$xb0
- vpshufd \$0x55,$xb3,$xb1
- vpshufd \$0xaa,$xb3,$xb2
- vpshufd \$0xff,$xb3,$xb3
- vmovdqa64 $xb0,@key[4]
- vmovdqa64 $xb1,@key[5]
- vmovdqa64 $xb2,@key[6]
- vmovdqa64 $xb3,@key[7]
-
- vpshufd \$0x00,$xc3,$xc0
- vpshufd \$0x55,$xc3,$xc1
- vpshufd \$0xaa,$xc3,$xc2
- vpshufd \$0xff,$xc3,$xc3
- vmovdqa64 $xc0,@key[8]
- vmovdqa64 $xc1,@key[9]
- vmovdqa64 $xc2,@key[10]
- vmovdqa64 $xc3,@key[11]
-
- vpshufd \$0x00,$xd3,$xd0
- vpshufd \$0x55,$xd3,$xd1
- vpshufd \$0xaa,$xd3,$xd2
- vpshufd \$0xff,$xd3,$xd3
- vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet
- vmovdqa64 $xd0,@key[12]
- vmovdqa64 $xd1,@key[13]
- vmovdqa64 $xd2,@key[14]
- vmovdqa64 $xd3,@key[15]
-
- mov \$10,%eax
- jmp .Loop16x
-
-.align 32
-.Loop_outer16x:
- vpbroadcastd 0(%r9),$xa0 # reload key
- vpbroadcastd 4(%r9),$xa1
- vpbroadcastd 8(%r9),$xa2
- vpbroadcastd 12(%r9),$xa3
- vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters
- vmovdqa64 @key[4],$xb0
- vmovdqa64 @key[5],$xb1
- vmovdqa64 @key[6],$xb2
- vmovdqa64 @key[7],$xb3
- vmovdqa64 @key[8],$xc0
- vmovdqa64 @key[9],$xc1
- vmovdqa64 @key[10],$xc2
- vmovdqa64 @key[11],$xc3
- vmovdqa64 @key[12],$xd0
- vmovdqa64 @key[13],$xd1
- vmovdqa64 @key[14],$xd2
- vmovdqa64 @key[15],$xd3
-
- vmovdqa64 $xa0,@key[0]
- vmovdqa64 $xa1,@key[1]
- vmovdqa64 $xa2,@key[2]
- vmovdqa64 $xa3,@key[3]
-
- mov \$10,%eax
- jmp .Loop16x
-
-.align 32
-.Loop16x:
-___
- foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
- foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- dec %eax
- jnz .Loop16x
-
- vpaddd @key[0],$xa0,$xa0 # accumulate key
- vpaddd @key[1],$xa1,$xa1
- vpaddd @key[2],$xa2,$xa2
- vpaddd @key[3],$xa3,$xa3
-
- vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
- vpunpckldq $xa3,$xa2,$xt3
- vpunpckhdq $xa1,$xa0,$xa0
- vpunpckhdq $xa3,$xa2,$xa2
- vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
- vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
- vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
-___
- ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
-$code.=<<___;
- vpaddd @key[4],$xb0,$xb0
- vpaddd @key[5],$xb1,$xb1
- vpaddd @key[6],$xb2,$xb2
- vpaddd @key[7],$xb3,$xb3
-
- vpunpckldq $xb1,$xb0,$xt2
- vpunpckldq $xb3,$xb2,$xt3
- vpunpckhdq $xb1,$xb0,$xb0
- vpunpckhdq $xb3,$xb2,$xb2
- vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
- vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
- vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
-___
- ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
-$code.=<<___;
- vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further
- vshufi32x4 \$0xee,$xb0,$xa0,$xb0
- vshufi32x4 \$0x44,$xb1,$xa1,$xa0
- vshufi32x4 \$0xee,$xb1,$xa1,$xb1
- vshufi32x4 \$0x44,$xb2,$xa2,$xa1
- vshufi32x4 \$0xee,$xb2,$xa2,$xb2
- vshufi32x4 \$0x44,$xb3,$xa3,$xa2
- vshufi32x4 \$0xee,$xb3,$xa3,$xb3
-___
- ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
-$code.=<<___;
- vpaddd @key[8],$xc0,$xc0
- vpaddd @key[9],$xc1,$xc1
- vpaddd @key[10],$xc2,$xc2
- vpaddd @key[11],$xc3,$xc3
-
- vpunpckldq $xc1,$xc0,$xt2
- vpunpckldq $xc3,$xc2,$xt3
- vpunpckhdq $xc1,$xc0,$xc0
- vpunpckhdq $xc3,$xc2,$xc2
- vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
- vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
- vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
-___
- ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
-$code.=<<___;
- vpaddd @key[12],$xd0,$xd0
- vpaddd @key[13],$xd1,$xd1
- vpaddd @key[14],$xd2,$xd2
- vpaddd @key[15],$xd3,$xd3
-
- vpunpckldq $xd1,$xd0,$xt2
- vpunpckldq $xd3,$xd2,$xt3
- vpunpckhdq $xd1,$xd0,$xd0
- vpunpckhdq $xd3,$xd2,$xd2
- vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
- vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
- vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
-___
- ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
-$code.=<<___;
- vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further
- vshufi32x4 \$0xee,$xd0,$xc0,$xd0
- vshufi32x4 \$0x44,$xd1,$xc1,$xc0
- vshufi32x4 \$0xee,$xd1,$xc1,$xd1
- vshufi32x4 \$0x44,$xd2,$xc2,$xc1
- vshufi32x4 \$0xee,$xd2,$xc2,$xd2
- vshufi32x4 \$0x44,$xd3,$xc3,$xc2
- vshufi32x4 \$0xee,$xd3,$xc3,$xd3
-___
- ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
-$code.=<<___;
- vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further
- vshufi32x4 \$0xdd,$xc0,$xa0,$xa0
- vshufi32x4 \$0x88,$xd0,$xb0,$xc0
- vshufi32x4 \$0xdd,$xd0,$xb0,$xd0
- vshufi32x4 \$0x88,$xc1,$xa1,$xt1
- vshufi32x4 \$0xdd,$xc1,$xa1,$xa1
- vshufi32x4 \$0x88,$xd1,$xb1,$xc1
- vshufi32x4 \$0xdd,$xd1,$xb1,$xd1
- vshufi32x4 \$0x88,$xc2,$xa2,$xt2
- vshufi32x4 \$0xdd,$xc2,$xa2,$xa2
- vshufi32x4 \$0x88,$xd2,$xb2,$xc2
- vshufi32x4 \$0xdd,$xd2,$xb2,$xd2
- vshufi32x4 \$0x88,$xc3,$xa3,$xt3
- vshufi32x4 \$0xdd,$xc3,$xa3,$xa3
- vshufi32x4 \$0x88,$xd3,$xb3,$xc3
- vshufi32x4 \$0xdd,$xd3,$xb3,$xd3
-___
- ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
- ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
-
- ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
- $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
- ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
-$code.=<<___;
- cmp \$64*16,$len
- jb .Ltail16x
-
- vpxord 0x00($inp),$xa0,$xa0 # xor with input
- vpxord 0x40($inp),$xb0,$xb0
- vpxord 0x80($inp),$xc0,$xc0
- vpxord 0xc0($inp),$xd0,$xd0
- vmovdqu32 $xa0,0x00($out)
- vmovdqu32 $xb0,0x40($out)
- vmovdqu32 $xc0,0x80($out)
- vmovdqu32 $xd0,0xc0($out)
-
- vpxord 0x100($inp),$xa1,$xa1
- vpxord 0x140($inp),$xb1,$xb1
- vpxord 0x180($inp),$xc1,$xc1
- vpxord 0x1c0($inp),$xd1,$xd1
- vmovdqu32 $xa1,0x100($out)
- vmovdqu32 $xb1,0x140($out)
- vmovdqu32 $xc1,0x180($out)
- vmovdqu32 $xd1,0x1c0($out)
-
- vpxord 0x200($inp),$xa2,$xa2
- vpxord 0x240($inp),$xb2,$xb2
- vpxord 0x280($inp),$xc2,$xc2
- vpxord 0x2c0($inp),$xd2,$xd2
- vmovdqu32 $xa2,0x200($out)
- vmovdqu32 $xb2,0x240($out)
- vmovdqu32 $xc2,0x280($out)
- vmovdqu32 $xd2,0x2c0($out)
-
- vpxord 0x300($inp),$xa3,$xa3
- vpxord 0x340($inp),$xb3,$xb3
- vpxord 0x380($inp),$xc3,$xc3
- vpxord 0x3c0($inp),$xd3,$xd3
- lea 0x400($inp),$inp
- vmovdqu32 $xa3,0x300($out)
- vmovdqu32 $xb3,0x340($out)
- vmovdqu32 $xc3,0x380($out)
- vmovdqu32 $xd3,0x3c0($out)
- lea 0x400($out),$out
-
- sub \$64*16,$len
- jnz .Loop_outer16x
-
- jmp .Ldone16x
-
-.align 32
-.Ltail16x:
- xor %r9,%r9
- sub $inp,$out
- cmp \$64*1,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xa0,$xa0 # xor with input
- vmovdqu32 $xa0,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xb0,$xa0
- lea 64($inp),$inp
-
- cmp \$64*2,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xb0,$xb0
- vmovdqu32 $xb0,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xc0,$xa0
- lea 64($inp),$inp
-
- cmp \$64*3,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xc0,$xc0
- vmovdqu32 $xc0,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xd0,$xa0
- lea 64($inp),$inp
-
- cmp \$64*4,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xd0,$xd0
- vmovdqu32 $xd0,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xa1,$xa0
- lea 64($inp),$inp
-
- cmp \$64*5,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xa1,$xa1
- vmovdqu32 $xa1,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xb1,$xa0
- lea 64($inp),$inp
-
- cmp \$64*6,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xb1,$xb1
- vmovdqu32 $xb1,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xc1,$xa0
- lea 64($inp),$inp
-
- cmp \$64*7,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xc1,$xc1
- vmovdqu32 $xc1,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xd1,$xa0
- lea 64($inp),$inp
-
- cmp \$64*8,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xd1,$xd1
- vmovdqu32 $xd1,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xa2,$xa0
- lea 64($inp),$inp
-
- cmp \$64*9,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xa2,$xa2
- vmovdqu32 $xa2,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xb2,$xa0
- lea 64($inp),$inp
-
- cmp \$64*10,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xb2,$xb2
- vmovdqu32 $xb2,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xc2,$xa0
- lea 64($inp),$inp
-
- cmp \$64*11,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xc2,$xc2
- vmovdqu32 $xc2,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xd2,$xa0
- lea 64($inp),$inp
-
- cmp \$64*12,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xd2,$xd2
- vmovdqu32 $xd2,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xa3,$xa0
- lea 64($inp),$inp
-
- cmp \$64*13,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xa3,$xa3
- vmovdqu32 $xa3,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xb3,$xa0
- lea 64($inp),$inp
-
- cmp \$64*14,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xb3,$xb3
- vmovdqu32 $xb3,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xc3,$xa0
- lea 64($inp),$inp
-
- cmp \$64*15,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xc3,$xc3
- vmovdqu32 $xc3,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xd3,$xa0
- lea 64($inp),$inp
-
-.Less_than_64_16x:
- vmovdqa32 $xa0,0x00(%rsp)
- lea ($out,$inp),$out
- and \$63,$len
-
-.Loop_tail16x:
- movzb ($inp,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1($out,%r9)
- dec $len
- jnz .Loop_tail16x
-
- vpxord $xa0,$xa0,$xa0
- vmovdqa32 $xa0,0(%rsp)
-
-.Ldone16x:
- vzeroall
-___
-$code.=<<___ if ($win64);
- movaps -0xb0(%r10),%xmm6
- movaps -0xa0(%r10),%xmm7
- movaps -0x90(%r10),%xmm8
- movaps -0x80(%r10),%xmm9
- movaps -0x70(%r10),%xmm10
- movaps -0x60(%r10),%xmm11
- movaps -0x50(%r10),%xmm12
- movaps -0x40(%r10),%xmm13
- movaps -0x30(%r10),%xmm14
- movaps -0x20(%r10),%xmm15
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.L16x_epilogue:
- ret
-.cfi_endproc
-.size chacha20_16x,.-chacha20_16x
-___
-
-# switch to %ymm domain
-($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15));
-@xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
-@key=map("%ymm$_",(16..31));
-($xt0,$xt1,$xt2,$xt3)=@key[0..3];
-
-$code.=<<___;
-.type chacha20_8xvl,\@function,5
-.align 32
-chacha20_8xvl:
-.cfi_startproc
-.Lchacha20_8xvl:
- lea 8(%rsp),%r10 # frame register
-.cfi_def_cfa_register %r10
- sub \$64+$xframe,%rsp
- and \$-64,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0xb0(%r10)
- movaps %xmm7,-0xa0(%r10)
- movaps %xmm8,-0x90(%r10)
- movaps %xmm9,-0x80(%r10)
- movaps %xmm10,-0x70(%r10)
- movaps %xmm11,-0x60(%r10)
- movaps %xmm12,-0x50(%r10)
- movaps %xmm13,-0x40(%r10)
- movaps %xmm14,-0x30(%r10)
- movaps %xmm15,-0x20(%r10)
-.L8xvl_body:
-___
-$code.=<<___;
- vzeroupper
-
- lea .Lsigma(%rip),%r9
- vbroadcasti128 (%r9),$xa3 # key[0]
- vbroadcasti128 ($key),$xb3 # key[1]
- vbroadcasti128 16($key),$xc3 # key[2]
- vbroadcasti128 ($counter),$xd3 # key[3]
-
- vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
- vpshufd \$0x55,$xa3,$xa1
- vpshufd \$0xaa,$xa3,$xa2
- vpshufd \$0xff,$xa3,$xa3
- vmovdqa64 $xa0,@key[0]
- vmovdqa64 $xa1,@key[1]
- vmovdqa64 $xa2,@key[2]
- vmovdqa64 $xa3,@key[3]
-
- vpshufd \$0x00,$xb3,$xb0
- vpshufd \$0x55,$xb3,$xb1
- vpshufd \$0xaa,$xb3,$xb2
- vpshufd \$0xff,$xb3,$xb3
- vmovdqa64 $xb0,@key[4]
- vmovdqa64 $xb1,@key[5]
- vmovdqa64 $xb2,@key[6]
- vmovdqa64 $xb3,@key[7]
-
- vpshufd \$0x00,$xc3,$xc0
- vpshufd \$0x55,$xc3,$xc1
- vpshufd \$0xaa,$xc3,$xc2
- vpshufd \$0xff,$xc3,$xc3
- vmovdqa64 $xc0,@key[8]
- vmovdqa64 $xc1,@key[9]
- vmovdqa64 $xc2,@key[10]
- vmovdqa64 $xc3,@key[11]
-
- vpshufd \$0x00,$xd3,$xd0
- vpshufd \$0x55,$xd3,$xd1
- vpshufd \$0xaa,$xd3,$xd2
- vpshufd \$0xff,$xd3,$xd3
- vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
- vmovdqa64 $xd0,@key[12]
- vmovdqa64 $xd1,@key[13]
- vmovdqa64 $xd2,@key[14]
- vmovdqa64 $xd3,@key[15]
-
- mov \$10,%eax
- jmp .Loop8xvl
-
-.align 32
-.Loop_outer8xvl:
- #vpbroadcastd 0(%r9),$xa0 # reload key
- #vpbroadcastd 4(%r9),$xa1
- vpbroadcastd 8(%r9),$xa2
- vpbroadcastd 12(%r9),$xa3
- vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters
- vmovdqa64 @key[4],$xb0
- vmovdqa64 @key[5],$xb1
- vmovdqa64 @key[6],$xb2
- vmovdqa64 @key[7],$xb3
- vmovdqa64 @key[8],$xc0
- vmovdqa64 @key[9],$xc1
- vmovdqa64 @key[10],$xc2
- vmovdqa64 @key[11],$xc3
- vmovdqa64 @key[12],$xd0
- vmovdqa64 @key[13],$xd1
- vmovdqa64 @key[14],$xd2
- vmovdqa64 @key[15],$xd3
-
- vmovdqa64 $xa0,@key[0]
- vmovdqa64 $xa1,@key[1]
- vmovdqa64 $xa2,@key[2]
- vmovdqa64 $xa3,@key[3]
-
- mov \$10,%eax
- jmp .Loop8xvl
-
-.align 32
-.Loop8xvl:
-___
- foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
- foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- dec %eax
- jnz .Loop8xvl
-
- vpaddd @key[0],$xa0,$xa0 # accumulate key
- vpaddd @key[1],$xa1,$xa1
- vpaddd @key[2],$xa2,$xa2
- vpaddd @key[3],$xa3,$xa3
-
- vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
- vpunpckldq $xa3,$xa2,$xt3
- vpunpckhdq $xa1,$xa0,$xa0
- vpunpckhdq $xa3,$xa2,$xa2
- vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
- vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
- vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
-___
- ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
-$code.=<<___;
- vpaddd @key[4],$xb0,$xb0
- vpaddd @key[5],$xb1,$xb1
- vpaddd @key[6],$xb2,$xb2
- vpaddd @key[7],$xb3,$xb3
-
- vpunpckldq $xb1,$xb0,$xt2
- vpunpckldq $xb3,$xb2,$xt3
- vpunpckhdq $xb1,$xb0,$xb0
- vpunpckhdq $xb3,$xb2,$xb2
- vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
- vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
- vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
-___
- ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
-$code.=<<___;
- vshufi32x4 \$0,$xb0,$xa0,$xt3 # "de-interlace" further
- vshufi32x4 \$3,$xb0,$xa0,$xb0
- vshufi32x4 \$0,$xb1,$xa1,$xa0
- vshufi32x4 \$3,$xb1,$xa1,$xb1
- vshufi32x4 \$0,$xb2,$xa2,$xa1
- vshufi32x4 \$3,$xb2,$xa2,$xb2
- vshufi32x4 \$0,$xb3,$xa3,$xa2
- vshufi32x4 \$3,$xb3,$xa3,$xb3
-___
- ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
-$code.=<<___;
- vpaddd @key[8],$xc0,$xc0
- vpaddd @key[9],$xc1,$xc1
- vpaddd @key[10],$xc2,$xc2
- vpaddd @key[11],$xc3,$xc3
-
- vpunpckldq $xc1,$xc0,$xt2
- vpunpckldq $xc3,$xc2,$xt3
- vpunpckhdq $xc1,$xc0,$xc0
- vpunpckhdq $xc3,$xc2,$xc2
- vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
- vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
- vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
-___
- ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
-$code.=<<___;
- vpaddd @key[12],$xd0,$xd0
- vpaddd @key[13],$xd1,$xd1
- vpaddd @key[14],$xd2,$xd2
- vpaddd @key[15],$xd3,$xd3
-
- vpunpckldq $xd1,$xd0,$xt2
- vpunpckldq $xd3,$xd2,$xt3
- vpunpckhdq $xd1,$xd0,$xd0
- vpunpckhdq $xd3,$xd2,$xd2
- vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
- vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
- vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
-___
- ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
-$code.=<<___;
- vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
- vperm2i128 \$0x31,$xd0,$xc0,$xd0
- vperm2i128 \$0x20,$xd1,$xc1,$xc0
- vperm2i128 \$0x31,$xd1,$xc1,$xd1
- vperm2i128 \$0x20,$xd2,$xc2,$xc1
- vperm2i128 \$0x31,$xd2,$xc2,$xd2
- vperm2i128 \$0x20,$xd3,$xc3,$xc2
- vperm2i128 \$0x31,$xd3,$xc3,$xd3
-___
- ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
- ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
- ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
-$code.=<<___;
- cmp \$64*8,$len
- jb .Ltail8xvl
-
- mov \$0x80,%eax # size optimization
- vpxord 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- lea ($inp,%rax),$inp # size optimization
- vmovdqu32 $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- lea ($out,%rax),$out # size optimization
-
- vpxor 0x00($inp),$xa1,$xa1
- vpxor 0x20($inp),$xb1,$xb1
- vpxor 0x40($inp),$xc1,$xc1
- vpxor 0x60($inp),$xd1,$xd1
- lea ($inp,%rax),$inp # size optimization
- vmovdqu $xa1,0x00($out)
- vmovdqu $xb1,0x20($out)
- vmovdqu $xc1,0x40($out)
- vmovdqu $xd1,0x60($out)
- lea ($out,%rax),$out # size optimization
-
- vpxord 0x00($inp),$xa2,$xa2
- vpxor 0x20($inp),$xb2,$xb2
- vpxor 0x40($inp),$xc2,$xc2
- vpxor 0x60($inp),$xd2,$xd2
- lea ($inp,%rax),$inp # size optimization
- vmovdqu32 $xa2,0x00($out)
- vmovdqu $xb2,0x20($out)
- vmovdqu $xc2,0x40($out)
- vmovdqu $xd2,0x60($out)
- lea ($out,%rax),$out # size optimization
-
- vpxor 0x00($inp),$xa3,$xa3
- vpxor 0x20($inp),$xb3,$xb3
- vpxor 0x40($inp),$xc3,$xc3
- vpxor 0x60($inp),$xd3,$xd3
- lea ($inp,%rax),$inp # size optimization
- vmovdqu $xa3,0x00($out)
- vmovdqu $xb3,0x20($out)
- vmovdqu $xc3,0x40($out)
- vmovdqu $xd3,0x60($out)
- lea ($out,%rax),$out # size optimization
-
- vpbroadcastd 0(%r9),%ymm0 # reload key
- vpbroadcastd 4(%r9),%ymm1
-
- sub \$64*8,$len
- jnz .Loop_outer8xvl
-
- jmp .Ldone8xvl
-
-.align 32
-.Ltail8xvl:
- vmovdqa64 $xa0,%ymm8 # size optimization
-___
-$xa0 = "%ymm8";
-$code.=<<___;
- xor %r9,%r9
- sub $inp,$out
- cmp \$64*1,$len
- jb .Less_than_64_8xvl
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vmovdqu $xa0,0x00($out,$inp)
- vmovdqu $xb0,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa $xc0,$xa0
- vmovdqa $xd0,$xb0
- lea 64($inp),$inp
-
- cmp \$64*2,$len
- jb .Less_than_64_8xvl
- vpxor 0x00($inp),$xc0,$xc0
- vpxor 0x20($inp),$xd0,$xd0
- vmovdqu $xc0,0x00($out,$inp)
- vmovdqu $xd0,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa $xa1,$xa0
- vmovdqa $xb1,$xb0
- lea 64($inp),$inp
-
- cmp \$64*3,$len
- jb .Less_than_64_8xvl
- vpxor 0x00($inp),$xa1,$xa1
- vpxor 0x20($inp),$xb1,$xb1
- vmovdqu $xa1,0x00($out,$inp)
- vmovdqu $xb1,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa $xc1,$xa0
- vmovdqa $xd1,$xb0
- lea 64($inp),$inp
-
- cmp \$64*4,$len
- jb .Less_than_64_8xvl
- vpxor 0x00($inp),$xc1,$xc1
- vpxor 0x20($inp),$xd1,$xd1
- vmovdqu $xc1,0x00($out,$inp)
- vmovdqu $xd1,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa32 $xa2,$xa0
- vmovdqa $xb2,$xb0
- lea 64($inp),$inp
-
- cmp \$64*5,$len
- jb .Less_than_64_8xvl
- vpxord 0x00($inp),$xa2,$xa2
- vpxor 0x20($inp),$xb2,$xb2
- vmovdqu32 $xa2,0x00($out,$inp)
- vmovdqu $xb2,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa $xc2,$xa0
- vmovdqa $xd2,$xb0
- lea 64($inp),$inp
-
- cmp \$64*6,$len
- jb .Less_than_64_8xvl
- vpxor 0x00($inp),$xc2,$xc2
- vpxor 0x20($inp),$xd2,$xd2
- vmovdqu $xc2,0x00($out,$inp)
- vmovdqu $xd2,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa $xa3,$xa0
- vmovdqa $xb3,$xb0
- lea 64($inp),$inp
-
- cmp \$64*7,$len
- jb .Less_than_64_8xvl
- vpxor 0x00($inp),$xa3,$xa3
- vpxor 0x20($inp),$xb3,$xb3
- vmovdqu $xa3,0x00($out,$inp)
- vmovdqu $xb3,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa $xc3,$xa0
- vmovdqa $xd3,$xb0
- lea 64($inp),$inp
-
-.Less_than_64_8xvl:
- vmovdqa $xa0,0x00(%rsp)
- vmovdqa $xb0,0x20(%rsp)
- lea ($out,$inp),$out
- and \$63,$len
-
-.Loop_tail8xvl:
- movzb ($inp,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1($out,%r9)
- dec $len
- jnz .Loop_tail8xvl
-
- vpxor $xa0,$xa0,$xa0
- vmovdqa $xa0,0x00(%rsp)
- vmovdqa $xa0,0x20(%rsp)
-
-.Ldone8xvl:
- vzeroall
-___
-$code.=<<___ if ($win64);
- movaps -0xb0(%r10),%xmm6
- movaps -0xa0(%r10),%xmm7
- movaps -0x90(%r10),%xmm8
- movaps -0x80(%r10),%xmm9
- movaps -0x70(%r10),%xmm10
- movaps -0x60(%r10),%xmm11
- movaps -0x50(%r10),%xmm12
- movaps -0x40(%r10),%xmm13
- movaps -0x30(%r10),%xmm14
- movaps -0x20(%r10),%xmm15
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.L8xvl_epilogue:
- ret
-.cfi_endproc
-.size chacha20_8xvl,.-chacha20_8xvl
-___
-if($kernel) {
- $code .= "#endif\n";
-}
-}
-
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-# CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {
-$rec="%rcx";
-$frame="%rdx";
-$context="%r8";
-$disp="%r9";
-
-$code.=<<___;
-.extern __imp_RtlVirtualUnwind
-.type se_handler,\@abi-omnipotent
-.align 16
-se_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- lea .Lctr32_body(%rip),%r10
- cmp %r10,%rbx # context->Rip<.Lprologue
- jb .Lcommon_seh_tail
-
- mov 152($context),%rax # pull context->Rsp
-
- lea .Lno_data(%rip),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=.Lepilogue
- jae .Lcommon_seh_tail
-
- lea 64+24+48(%rax),%rax
-
- mov -8(%rax),%rbx
- mov -16(%rax),%rbp
- mov -24(%rax),%r12
- mov -32(%rax),%r13
- mov -40(%rax),%r14
- mov -48(%rax),%r15
- mov %rbx,144($context) # restore context->Rbx
- mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore context->R12
- mov %r13,224($context) # restore context->R13
- mov %r14,232($context) # restore context->R14
-	mov	%r15,240($context)	# restore context->R15
-
-.Lcommon_seh_tail:
- mov 8(%rax),%rdi
- mov 16(%rax),%rsi
- mov %rax,152($context) # restore context->Rsp
- mov %rsi,168($context) # restore context->Rsi
- mov %rdi,176($context) # restore context->Rdi
-
- mov 40($disp),%rdi # disp->ContextRecord
- mov $context,%rsi # context
- mov \$154,%ecx # sizeof(CONTEXT)
- .long 0xa548f3fc # cld; rep movsq
-
- mov $disp,%rsi
- xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
- mov 8(%rsi),%rdx # arg2, disp->ImageBase
- mov 0(%rsi),%r8 # arg3, disp->ControlPc
- mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
- mov 40(%rsi),%r10 # disp->ContextRecord
- lea 56(%rsi),%r11 # &disp->HandlerData
- lea 24(%rsi),%r12 # &disp->EstablisherFrame
- mov %r10,32(%rsp) # arg5
- mov %r11,40(%rsp) # arg6
- mov %r12,48(%rsp) # arg7
- mov %rcx,56(%rsp) # arg8, (NULL)
- call *__imp_RtlVirtualUnwind(%rip)
-
- mov \$1,%eax # ExceptionContinueSearch
- add \$64,%rsp
- popfq
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- pop %rdi
- pop %rsi
- ret
-.size se_handler,.-se_handler
-
-.type simd_handler,\@abi-omnipotent
-.align 16
-simd_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<prologue label
- jb .Lcommon_seh_tail
-
- mov 192($context),%rax # pull context->R9
-
- mov 4(%r11),%r10d # HandlerData[1]
- mov 8(%r11),%ecx # HandlerData[2]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=epilogue label
- jae .Lcommon_seh_tail
-
- neg %rcx
- lea -8(%rax,%rcx),%rsi
- lea 512($context),%rdi # &context.Xmm6
- neg %ecx
- shr \$3,%ecx
- .long 0xa548f3fc # cld; rep movsq
-
- jmp .Lcommon_seh_tail
-.size simd_handler,.-simd_handler
-
-.section .pdata
-.align 4
- .rva .LSEH_begin_chacha20_ctr32
- .rva .LSEH_end_chacha20_ctr32
- .rva .LSEH_info_chacha20_ctr32
-
- .rva .LSEH_begin_chacha20_ssse3
- .rva .LSEH_end_chacha20_ssse3
- .rva .LSEH_info_chacha20_ssse3
-
- .rva .LSEH_begin_chacha20_128
- .rva .LSEH_end_chacha20_128
- .rva .LSEH_info_chacha20_128
-
- .rva .LSEH_begin_chacha20_4x
- .rva .LSEH_end_chacha20_4x
- .rva .LSEH_info_chacha20_4x
-___
-$code.=<<___ if ($avx);
- .rva .LSEH_begin_chacha20_xop
- .rva .LSEH_end_chacha20_xop
- .rva .LSEH_info_chacha20_xop
-___
-$code.=<<___ if ($avx>1);
- .rva .LSEH_begin_chacha20_avx2
- .rva .LSEH_end_chacha20_avx2
- .rva .LSEH_info_chacha20_avx2
-___
-$code.=<<___ if ($avx>2);
- .rva .LSEH_begin_chacha20_avx512
- .rva .LSEH_end_chacha20_avx512
- .rva .LSEH_info_chacha20_avx512
-
- .rva .LSEH_begin_chacha20_avx512vl
- .rva .LSEH_end_chacha20_avx512vl
- .rva .LSEH_info_chacha20_avx512vl
-
- .rva .LSEH_begin_chacha20_16x
- .rva .LSEH_end_chacha20_16x
- .rva .LSEH_info_chacha20_16x
-
- .rva .LSEH_begin_chacha20_8xvl
- .rva .LSEH_end_chacha20_8xvl
- .rva .LSEH_info_chacha20_8xvl
-___
-$code.=<<___;
-.section .xdata
-.align 8
-.LSEH_info_chacha20_ctr32:
- .byte 9,0,0,0
- .rva se_handler
-
-.LSEH_info_chacha20_ssse3:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .Lssse3_body,.Lssse3_epilogue
- .long 0x20,0
-
-.LSEH_info_chacha20_128:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .L128_body,.L128_epilogue
- .long 0x60,0
-
-.LSEH_info_chacha20_4x:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .L4x_body,.L4x_epilogue
- .long 0xa0,0
-___
-$code.=<<___ if ($avx);
-.LSEH_info_chacha20_xop:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .L4xop_body,.L4xop_epilogue # HandlerData[]
- .long 0xa0,0
-___
-$code.=<<___ if ($avx>1);
-.LSEH_info_chacha20_avx2:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .L8x_body,.L8x_epilogue # HandlerData[]
- .long 0xa0,0
-___
-$code.=<<___ if ($avx>2);
-.LSEH_info_chacha20_avx512:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[]
- .long 0x20,0
-
-.LSEH_info_chacha20_avx512vl:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .Lavx512vl_body,.Lavx512vl_epilogue # HandlerData[]
- .long 0x20,0
-
-.LSEH_info_chacha20_16x:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .L16x_body,.L16x_epilogue # HandlerData[]
- .long 0xa0,0
-
-.LSEH_info_chacha20_8xvl:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .L8xvl_body,.L8xvl_epilogue # HandlerData[]
- .long 0xa0,0
-___
-}
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/\/\// and !/^$/);
- print;
-}
-close SELF;
-
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/ge;
-
- s/%x#%[yz]/%x/g; # "down-shift"
-
- if ($kernel) {
- s/(^\.type.*),[0-9]+$/\1/;
- next if /^\.cfi.*/;
- }
-
- print $_,"\n";
-}
-
-close STDOUT;
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20.c b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20.c
deleted file mode 100644
index b78f19975b1d..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20.c
+++ /dev/null
@@ -1,238 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * Implementation of the ChaCha20 stream cipher.
- *
- * Information: https://cr.yp.to/chacha.html
- */
-
-#include <zinc/chacha20.h>
-#include "../selftest/run.h"
-#define IS_ENABLED_CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 1
-
-#define IS_ENABLED_CONFIG_64BIT (sizeof(void*) == 8)
-
-void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int len)
-{
- int relalign = 0;
-
- if (!IS_ENABLED_CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) {
- int size = sizeof(unsigned long);
- int d = (((unsigned long)dst ^ (unsigned long)src1) |
- ((unsigned long)dst ^ (unsigned long)src2)) &
- (size - 1);
-
- relalign = d ? 1 << ffs(d) : size;
-
- /*
- * If we care about alignment, process as many bytes as
- * needed to advance dst and src to values whose alignments
- * equal their relative alignment. This will allow us to
- * process the remainder of the input using optimal strides.
- */
- while (((unsigned long)dst & (relalign - 1)) && len > 0) {
- *dst++ = *src1++ ^ *src2++;
- len--;
- }
- }
-
- while (IS_ENABLED(CONFIG_64BIT) && len >= 8 && !(relalign & 7)) {
- *(u64 *)dst = *(const u64 *)src1 ^ *(const u64 *)src2;
- dst += 8;
- src1 += 8;
- src2 += 8;
- len -= 8;
- }
-
- while (len >= 4 && !(relalign & 3)) {
- *(u32 *)dst = *(const u32 *)src1 ^ *(const u32 *)src2;
- dst += 4;
- src1 += 4;
- src2 += 4;
- len -= 4;
- }
-
- while (len >= 2 && !(relalign & 1)) {
- *(u16 *)dst = *(const u16 *)src1 ^ *(const u16 *)src2;
- dst += 2;
- src1 += 2;
- src2 += 2;
- len -= 2;
- }
-
- while (len--)
- *dst++ = *src1++ ^ *src2++;
-}
-
-#if defined(CONFIG_ZINC_ARCH_X86_64)
-#include "chacha20-x86_64-glue.c"
-#elif defined(CONFIG_ZINC_ARCH_ARM) || defined(CONFIG_ZINC_ARCH_ARM64)
-#include "chacha20-arm-glue.c"
-#elif defined(CONFIG_ZINC_ARCH_MIPS)
-#include "chacha20-mips-glue.c"
-#else
-static bool *const chacha20_nobs[] __initconst = { };
-static void __init chacha20_fpu_init(void)
-{
-}
-static inline bool chacha20_arch(struct chacha20_ctx *ctx, u8 *dst,
- const u8 *src, size_t len,
- simd_context_t *simd_context)
-{
- return false;
-}
-static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS],
- const u8 nonce[HCHACHA20_NONCE_SIZE],
- const u8 key[HCHACHA20_KEY_SIZE],
- simd_context_t *simd_context)
-{
- return false;
-}
-#endif
-
-#define QUARTER_ROUND(x, a, b, c, d) ( \
- x[a] += x[b], \
- x[d] = rol32((x[d] ^ x[a]), 16), \
- x[c] += x[d], \
- x[b] = rol32((x[b] ^ x[c]), 12), \
- x[a] += x[b], \
- x[d] = rol32((x[d] ^ x[a]), 8), \
- x[c] += x[d], \
- x[b] = rol32((x[b] ^ x[c]), 7) \
-)
-
-#define C(i, j) (i * 4 + j)
-
-#define DOUBLE_ROUND(x) ( \
- /* Column Round */ \
- QUARTER_ROUND(x, C(0, 0), C(1, 0), C(2, 0), C(3, 0)), \
- QUARTER_ROUND(x, C(0, 1), C(1, 1), C(2, 1), C(3, 1)), \
- QUARTER_ROUND(x, C(0, 2), C(1, 2), C(2, 2), C(3, 2)), \
- QUARTER_ROUND(x, C(0, 3), C(1, 3), C(2, 3), C(3, 3)), \
- /* Diagonal Round */ \
- QUARTER_ROUND(x, C(0, 0), C(1, 1), C(2, 2), C(3, 3)), \
- QUARTER_ROUND(x, C(0, 1), C(1, 2), C(2, 3), C(3, 0)), \
- QUARTER_ROUND(x, C(0, 2), C(1, 3), C(2, 0), C(3, 1)), \
- QUARTER_ROUND(x, C(0, 3), C(1, 0), C(2, 1), C(3, 2)) \
-)
-
-#define TWENTY_ROUNDS(x) ( \
- DOUBLE_ROUND(x), \
- DOUBLE_ROUND(x), \
- DOUBLE_ROUND(x), \
- DOUBLE_ROUND(x), \
- DOUBLE_ROUND(x), \
- DOUBLE_ROUND(x), \
- DOUBLE_ROUND(x), \
- DOUBLE_ROUND(x), \
- DOUBLE_ROUND(x), \
- DOUBLE_ROUND(x) \
-)
-
-static void chacha20_block_generic(struct chacha20_ctx *ctx, __le32 *stream)
-{
- u32 x[CHACHA20_BLOCK_WORDS];
- int i;
-
- for (i = 0; i < ARRAY_SIZE(x); ++i)
- x[i] = ctx->state[i];
-
- TWENTY_ROUNDS(x);
-
- for (i = 0; i < ARRAY_SIZE(x); ++i)
- stream[i] = cpu_to_le32(x[i] + ctx->state[i]);
-
- ctx->counter[0] += 1;
-}
-
-static void chacha20_generic(struct chacha20_ctx *ctx, u8 *out, const u8 *in,
- u32 len)
-{
- __le32 buf[CHACHA20_BLOCK_WORDS];
-
- while (len >= CHACHA20_BLOCK_SIZE) {
- chacha20_block_generic(ctx, buf);
- crypto_xor_cpy(out, in, (u8 *)buf, CHACHA20_BLOCK_SIZE);
- len -= CHACHA20_BLOCK_SIZE;
- out += CHACHA20_BLOCK_SIZE;
- in += CHACHA20_BLOCK_SIZE;
- }
- if (len) {
- chacha20_block_generic(ctx, buf);
- crypto_xor_cpy(out, in, (u8 *)buf, len);
- }
-}
-
-void chacha20(struct chacha20_ctx *ctx, u8 *dst, const u8 *src, u32 len,
- simd_context_t *simd_context)
-{
- if (!chacha20_arch(ctx, dst, src, len, simd_context))
- chacha20_generic(ctx, dst, src, len);
-}
-EXPORT_SYMBOL(chacha20);
-
-static void hchacha20_generic(u32 derived_key[CHACHA20_KEY_WORDS],
- const u8 nonce[HCHACHA20_NONCE_SIZE],
- const u8 key[HCHACHA20_KEY_SIZE])
-{
- u32 x[] = { CHACHA20_CONSTANT_EXPA,
- CHACHA20_CONSTANT_ND_3,
- CHACHA20_CONSTANT_2_BY,
- CHACHA20_CONSTANT_TE_K,
- get_unaligned_le32(key + 0),
- get_unaligned_le32(key + 4),
- get_unaligned_le32(key + 8),
- get_unaligned_le32(key + 12),
- get_unaligned_le32(key + 16),
- get_unaligned_le32(key + 20),
- get_unaligned_le32(key + 24),
- get_unaligned_le32(key + 28),
- get_unaligned_le32(nonce + 0),
- get_unaligned_le32(nonce + 4),
- get_unaligned_le32(nonce + 8),
- get_unaligned_le32(nonce + 12)
- };
-
- TWENTY_ROUNDS(x);
-
- memcpy(derived_key + 0, x + 0, sizeof(u32) * 4);
- memcpy(derived_key + 4, x + 12, sizeof(u32) * 4);
-}
-
-/* Derived key should be 32-bit aligned */
-void hchacha20(u32 derived_key[CHACHA20_KEY_WORDS],
- const u8 nonce[HCHACHA20_NONCE_SIZE],
- const u8 key[HCHACHA20_KEY_SIZE], simd_context_t *simd_context)
-{
- if (!hchacha20_arch(derived_key, nonce, key, simd_context))
- hchacha20_generic(derived_key, nonce, key);
-}
-EXPORT_SYMBOL(hchacha20);
-
-#include "../selftest/chacha20.c"
-
-static bool nosimd __initdata = false;
-
-#ifndef COMPAT_ZINC_IS_A_MODULE
-int __init chacha20_mod_init(void)
-#else
-static int __init mod_init(void)
-#endif
-{
- if (!nosimd)
- chacha20_fpu_init();
- if (!selftest_run("chacha20", chacha20_selftest, chacha20_nobs,
- ARRAY_SIZE(chacha20_nobs)))
- return -ENOTRECOVERABLE;
- return 0;
-}
-
-#ifdef COMPAT_ZINC_IS_A_MODULE
-static void __exit mod_exit(void)
-{
-}
-
-module_init(mod_init);
-module_exit(mod_exit);
-#endif
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20poly1305.c b/sys/dev/if_wg/module/crypto/zinc/chacha20poly1305.c
deleted file mode 100644
index 701666c78eb8..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20poly1305.c
+++ /dev/null
@@ -1,196 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * This is an implementation of the ChaCha20Poly1305 AEAD construction.
- *
- * Information: https://tools.ietf.org/html/rfc8439
- */
-
-#include <sys/support.h>
-#include <zinc/chacha20poly1305.h>
-#include <zinc/chacha20.h>
-#include <zinc/poly1305.h>
-#include "selftest/run.h"
-
-static const u8 pad0[CHACHA20_BLOCK_SIZE] = { 0 };
-
-static inline void
-__chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
- const u8 *ad, const size_t ad_len, const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE],
- simd_context_t *simd_context)
-{
- struct poly1305_ctx poly1305_state;
- struct chacha20_ctx chacha20_state;
- union {
- u8 block0[POLY1305_KEY_SIZE];
- __le64 lens[2];
- } b = { { 0 } };
-
- chacha20_init(&chacha20_state, key, nonce);
- chacha20(&chacha20_state, b.block0, b.block0, sizeof(b.block0),
- simd_context);
- poly1305_init(&poly1305_state, b.block0);
-
- poly1305_update(&poly1305_state, ad, ad_len, simd_context);
- poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf,
- simd_context);
-
- chacha20(&chacha20_state, dst, src, src_len, simd_context);
-
- poly1305_update(&poly1305_state, dst, src_len, simd_context);
- poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf,
- simd_context);
-
- b.lens[0] = cpu_to_le64(ad_len);
- b.lens[1] = cpu_to_le64(src_len);
- poly1305_update(&poly1305_state, (u8 *)b.lens, sizeof(b.lens),
- simd_context);
-
- poly1305_final(&poly1305_state, dst + src_len, simd_context);
-
- memzero_explicit(&chacha20_state, sizeof(chacha20_state));
- memzero_explicit(&b, sizeof(b));
-}
-
-void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
- const u8 *ad, const size_t ad_len,
- const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
-{
- simd_context_t simd_context;
-
- simd_get(&simd_context);
- __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, nonce, key,
- &simd_context);
- simd_put(&simd_context);
-}
-EXPORT_SYMBOL(chacha20poly1305_encrypt);
-static inline bool
-__chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
- const u8 *ad, const size_t ad_len, const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE],
- simd_context_t *simd_context)
-{
- struct poly1305_ctx poly1305_state;
- struct chacha20_ctx chacha20_state;
- int ret;
- size_t dst_len;
- union {
- u8 block0[POLY1305_KEY_SIZE];
- u8 mac[POLY1305_MAC_SIZE];
- __le64 lens[2];
- } b = { { 0 } };
-
- if (unlikely(src_len < POLY1305_MAC_SIZE)) {
- printf("src_len too short\n");
- return false;
- }
-
- chacha20_init(&chacha20_state, key, nonce);
- chacha20(&chacha20_state, b.block0, b.block0, sizeof(b.block0),
- simd_context);
- poly1305_init(&poly1305_state, b.block0);
-
- poly1305_update(&poly1305_state, ad, ad_len, simd_context);
- poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf,
- simd_context);
-
- dst_len = src_len - POLY1305_MAC_SIZE;
- poly1305_update(&poly1305_state, src, dst_len, simd_context);
- poly1305_update(&poly1305_state, pad0, (0x10 - dst_len) & 0xf,
- simd_context);
-
- b.lens[0] = cpu_to_le64(ad_len);
- b.lens[1] = cpu_to_le64(dst_len);
- poly1305_update(&poly1305_state, (u8 *)b.lens, sizeof(b.lens),
- simd_context);
-
- poly1305_final(&poly1305_state, b.mac, simd_context);
-
- ret = crypto_memneq(b.mac, src + dst_len, POLY1305_MAC_SIZE);
- if (likely(!ret))
- chacha20(&chacha20_state, dst, src, dst_len, simd_context);
- else {
- printf("calculated: %16D\n", b.mac, "");
- printf("sent : %16D\n", src + dst_len, "");
- }
- memzero_explicit(&chacha20_state, sizeof(chacha20_state));
- memzero_explicit(&b, sizeof(b));
-
- return !ret;
-}
-
-bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
- const u8 *ad, const size_t ad_len,
- const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
-{
- simd_context_t simd_context;
- bool ret;
-
- simd_get(&simd_context);
- ret = __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len, nonce,
- key, &simd_context);
- simd_put(&simd_context);
- return ret;
-}
-EXPORT_SYMBOL(chacha20poly1305_decrypt);
-
-void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
- const u8 *ad, const size_t ad_len,
- const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
-{
- simd_context_t simd_context;
- u32 derived_key[CHACHA20_KEY_WORDS] __aligned(16);
-
- simd_get(&simd_context);
- hchacha20(derived_key, nonce, key, &simd_context);
- cpu_to_le32_array(derived_key, ARRAY_SIZE(derived_key));
- __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len,
- get_unaligned_le64(nonce + 16),
- (u8 *)derived_key, &simd_context);
- memzero_explicit(derived_key, CHACHA20POLY1305_KEY_SIZE);
- simd_put(&simd_context);
-}
-EXPORT_SYMBOL(xchacha20poly1305_encrypt);
-
-bool xchacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
- const u8 *ad, const size_t ad_len,
- const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
-{
- bool ret;
- simd_context_t simd_context;
- u32 derived_key[CHACHA20_KEY_WORDS] __aligned(16);
-
- simd_get(&simd_context);
- hchacha20(derived_key, nonce, key, &simd_context);
- cpu_to_le32_array(derived_key, ARRAY_SIZE(derived_key));
- ret = __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len,
- get_unaligned_le64(nonce + 16),
- (u8 *)derived_key, &simd_context);
- memzero_explicit(derived_key, CHACHA20POLY1305_KEY_SIZE);
- simd_put(&simd_context);
- return ret;
-}
-EXPORT_SYMBOL(xchacha20poly1305_decrypt);
-
-#include "selftest/chacha20poly1305.c"
-
-static int __init mod_init(void)
-{
- if (!selftest_run("chacha20poly1305", chacha20poly1305_selftest,
- NULL, 0))
- return -ENOTRECOVERABLE;
- return 0;
-}
-
-static void __exit mod_exit(void)
-{
-}
-
-module_init(mod_init);
-module_exit(mod_exit);
diff --git a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm-glue.c b/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm-glue.c
deleted file mode 100644
index 291fe4ba98b0..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm-glue.c
+++ /dev/null
@@ -1,140 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-
-asmlinkage void poly1305_init_arm(void *ctx, const u8 key[16]);
-asmlinkage void poly1305_blocks_arm(void *ctx, const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_emit_arm(void *ctx, u8 mac[16], const u32 nonce[4]);
-asmlinkage void poly1305_blocks_neon(void *ctx, const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_emit_neon(void *ctx, u8 mac[16], const u32 nonce[4]);
-
-static bool poly1305_use_neon __ro_after_init;
-static bool *const poly1305_nobs[] __initconst = { &poly1305_use_neon };
-
-static void __init poly1305_fpu_init(void)
-{
-#if defined(CONFIG_ZINC_ARCH_ARM64)
- poly1305_use_neon = cpu_have_named_feature(ASIMD);
-#elif defined(CONFIG_ZINC_ARCH_ARM)
- poly1305_use_neon = elf_hwcap & HWCAP_NEON;
-#endif
-}
-
-#if defined(CONFIG_ZINC_ARCH_ARM64)
-struct poly1305_arch_internal {
- union {
- u32 h[5];
- struct {
- u64 h0, h1, h2;
- };
- };
- u64 is_base2_26;
- u64 r[2];
-};
-#elif defined(CONFIG_ZINC_ARCH_ARM)
-struct poly1305_arch_internal {
- union {
- u32 h[5];
- struct {
- u64 h0, h1;
- u32 h2;
- } __packed;
- };
- u32 r[4];
- u32 is_base2_26;
-};
-#endif
-
-/* The NEON code uses base 2^26, while the scalar code uses base 2^64 on 64-bit
- * and base 2^32 on 32-bit. If we hit the unfortunate situation of using NEON
- * and then having to go back to scalar -- because the user is silly and has
- * called the update function from two separate contexts -- then we need to
- * convert back to the original base before proceeding. The below function is
- * written for 64-bit integers, and so we have to swap words at the end on
- * big-endian 32-bit. It is possible to reason that the initial reduction below
- * is sufficient given the implementation invariants. However, for an avoidance
- * of doubt and because this is not performance critical, we do the full
- * reduction anyway.
- */
-static void convert_to_base2_64(void *ctx)
-{
- struct poly1305_arch_internal *state = ctx;
- u32 cy;
-
- if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !state->is_base2_26)
- return;
-
- cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
- cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
- cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
- cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
- state->h0 = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
- state->h1 = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
- state->h2 = state->h[4] >> 24;
- if (IS_ENABLED(CONFIG_ZINC_ARCH_ARM) && IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) {
- state->h0 = rol64(state->h0, 32);
- state->h1 = rol64(state->h1, 32);
- }
-#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
- cy = (state->h2 >> 2) + (state->h2 & ~3ULL);
- state->h2 &= 3;
- state->h0 += cy;
- state->h1 += (cy = ULT(state->h0, cy));
- state->h2 += ULT(state->h1, cy);
-#undef ULT
- state->is_base2_26 = 0;
-}
-
-static inline bool poly1305_init_arch(void *ctx,
- const u8 key[POLY1305_KEY_SIZE])
-{
- poly1305_init_arm(ctx, key);
- return true;
-}
-
-static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
- size_t len, const u32 padbit,
- simd_context_t *simd_context)
-{
- /* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
- PAGE_SIZE % POLY1305_BLOCK_SIZE);
-
- if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !poly1305_use_neon ||
- !simd_use(simd_context)) {
- convert_to_base2_64(ctx);
- poly1305_blocks_arm(ctx, inp, len, padbit);
- return true;
- }
-
- for (;;) {
- const size_t bytes = min_t(size_t, len, PAGE_SIZE);
-
- poly1305_blocks_neon(ctx, inp, bytes, padbit);
- len -= bytes;
- if (!len)
- break;
- inp += bytes;
- simd_relax(simd_context);
- }
- return true;
-}
-
-static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
- const u32 nonce[4],
- simd_context_t *simd_context)
-{
- if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !poly1305_use_neon ||
- !simd_use(simd_context)) {
- convert_to_base2_64(ctx);
- poly1305_emit_arm(ctx, mac, nonce);
- } else
- poly1305_emit_neon(ctx, mac, nonce);
- return true;
-}
diff --git a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm.pl b/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm.pl
deleted file mode 100755
index 468f41b76fbd..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm.pl
+++ /dev/null
@@ -1,1276 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# IALU(*)/gcc-4.4 NEON
-#
-# ARM11xx(ARMv6) 7.78/+100% -
-# Cortex-A5 6.35/+130% 3.00
-# Cortex-A8 6.25/+115% 2.36
-# Cortex-A9 5.10/+95% 2.55
-# Cortex-A15 3.85/+85% 1.25(**)
-# Snapdragon S4 5.70/+100% 1.48(**)
-#
-# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data;
-# (**) these are trade-off results, they can be improved by ~8% but at
-# the cost of 15/12% regression on Cortex-A5/A7, it's even possible
-#	to improve Cortex-A9 result, but then A5/A7 lose more than 20%;
-
-$flavour = shift;
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
- open STDOUT,">$output";
-}
-
-($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
-# define poly1305_init poly1305_init_arm
-# define poly1305_blocks poly1305_blocks_arm
-# define poly1305_emit poly1305_emit_arm
-#endif
-
-.text
-#if defined(__thumb2__)
-.syntax unified
-.thumb
-#else
-.code 32
-#endif
-
-.globl poly1305_emit
-.globl poly1305_blocks
-.globl poly1305_init
-.type poly1305_init,%function
-.align 5
-poly1305_init:
-.Lpoly1305_init:
- stmdb sp!,{r4-r11}
-
- eor r3,r3,r3
- cmp $inp,#0
- str r3,[$ctx,#0] @ zero hash value
- str r3,[$ctx,#4]
- str r3,[$ctx,#8]
- str r3,[$ctx,#12]
- str r3,[$ctx,#16]
- str r3,[$ctx,#36] @ is_base2_26
- add $ctx,$ctx,#20
-
-#ifdef __thumb2__
- it eq
-#endif
- moveq r0,#0
- beq .Lno_key
-
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- adr r11,.Lpoly1305_init
- ldr r12,.LOPENSSL_armcap
-#endif
- ldrb r4,[$inp,#0]
- mov r10,#0x0fffffff
- ldrb r5,[$inp,#1]
- and r3,r10,#-4 @ 0x0ffffffc
- ldrb r6,[$inp,#2]
- ldrb r7,[$inp,#3]
- orr r4,r4,r5,lsl#8
- ldrb r5,[$inp,#4]
- orr r4,r4,r6,lsl#16
- ldrb r6,[$inp,#5]
- orr r4,r4,r7,lsl#24
- ldrb r7,[$inp,#6]
- and r4,r4,r10
-
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,[r11,r12] @ OPENSSL_armcap_P
-# ifdef __APPLE__
- ldr r12,[r12]
-# endif
-#endif
- ldrb r8,[$inp,#7]
- orr r5,r5,r6,lsl#8
- ldrb r6,[$inp,#8]
- orr r5,r5,r7,lsl#16
- ldrb r7,[$inp,#9]
- orr r5,r5,r8,lsl#24
- ldrb r8,[$inp,#10]
- and r5,r5,r3
-
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- tst r12,#ARMV7_NEON @ check for NEON
-# ifdef __APPLE__
- adr r9,poly1305_blocks_neon
- adr r11,poly1305_blocks
-# ifdef __thumb2__
- it ne
-# endif
- movne r11,r9
- adr r12,poly1305_emit
- adr r10,poly1305_emit_neon
-# ifdef __thumb2__
- it ne
-# endif
- movne r12,r10
-# else
-# ifdef __thumb2__
- itete eq
-# endif
- addeq r12,r11,#(poly1305_emit-.Lpoly1305_init)
- addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
- addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init)
- addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
-# endif
-# ifdef __thumb2__
- orr r12,r12,#1 @ thumb-ify address
- orr r11,r11,#1
-# endif
-#endif
- ldrb r9,[$inp,#11]
- orr r6,r6,r7,lsl#8
- ldrb r7,[$inp,#12]
- orr r6,r6,r8,lsl#16
- ldrb r8,[$inp,#13]
- orr r6,r6,r9,lsl#24
- ldrb r9,[$inp,#14]
- and r6,r6,r3
-
- ldrb r10,[$inp,#15]
- orr r7,r7,r8,lsl#8
- str r4,[$ctx,#0]
- orr r7,r7,r9,lsl#16
- str r5,[$ctx,#4]
- orr r7,r7,r10,lsl#24
- str r6,[$ctx,#8]
- and r7,r7,r3
- str r7,[$ctx,#12]
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- stmia r2,{r11,r12} @ fill functions table
- mov r0,#1
-#else
- mov r0,#0
-#endif
-.Lno_key:
- ldmia sp!,{r4-r11}
-#if __ARM_ARCH__>=5
- ret @ bx lr
-#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_init,.-poly1305_init
-___
-{
-my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
-my ($s1,$s2,$s3)=($r1,$r2,$r3);
-
-$code.=<<___;
-.type poly1305_blocks,%function
-.align 5
-poly1305_blocks:
-.Lpoly1305_blocks:
- stmdb sp!,{r3-r11,lr}
-
- ands $len,$len,#-16
- beq .Lno_data
-
- cmp $padbit,#0
- add $len,$len,$inp @ end pointer
- sub sp,sp,#32
-
- ldmia $ctx,{$h0-$r3} @ load context
-
- str $ctx,[sp,#12] @ offload stuff
- mov lr,$inp
- str $len,[sp,#16]
- str $r1,[sp,#20]
- str $r2,[sp,#24]
- str $r3,[sp,#28]
- b .Loop
-
-.Loop:
-#if __ARM_ARCH__<7
- ldrb r0,[lr],#16 @ load input
-# ifdef __thumb2__
- it hi
-# endif
- addhi $h4,$h4,#1 @ 1<<128
- ldrb r1,[lr,#-15]
- ldrb r2,[lr,#-14]
- ldrb r3,[lr,#-13]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-12]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-11]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-10]
- adds $h0,$h0,r3 @ accumulate input
-
- ldrb r3,[lr,#-9]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-8]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-7]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-6]
- adcs $h1,$h1,r3
-
- ldrb r3,[lr,#-5]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-4]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-3]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-2]
- adcs $h2,$h2,r3
-
- ldrb r3,[lr,#-1]
- orr r1,r0,r1,lsl#8
- str lr,[sp,#8] @ offload input pointer
- orr r2,r1,r2,lsl#16
- add $s1,$r1,$r1,lsr#2
- orr r3,r2,r3,lsl#24
-#else
- ldr r0,[lr],#16 @ load input
-# ifdef __thumb2__
- it hi
-# endif
- addhi $h4,$h4,#1 @ padbit
- ldr r1,[lr,#-12]
- ldr r2,[lr,#-8]
- ldr r3,[lr,#-4]
-# ifdef __ARMEB__
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
-# endif
- adds $h0,$h0,r0 @ accumulate input
- str lr,[sp,#8] @ offload input pointer
- adcs $h1,$h1,r1
- add $s1,$r1,$r1,lsr#2
- adcs $h2,$h2,r2
-#endif
- add $s2,$r2,$r2,lsr#2
- adcs $h3,$h3,r3
- add $s3,$r3,$r3,lsr#2
-
- umull r2,r3,$h1,$r0
- adc $h4,$h4,#0
- umull r0,r1,$h0,$r0
- umlal r2,r3,$h4,$s1
- umlal r0,r1,$h3,$s1
- ldr $r1,[sp,#20] @ reload $r1
- umlal r2,r3,$h2,$s3
- umlal r0,r1,$h1,$s3
- umlal r2,r3,$h3,$s2
- umlal r0,r1,$h2,$s2
- umlal r2,r3,$h0,$r1
- str r0,[sp,#0] @ future $h0
- mul r0,$s2,$h4
- ldr $r2,[sp,#24] @ reload $r2
- adds r2,r2,r1 @ d1+=d0>>32
- eor r1,r1,r1
- adc lr,r3,#0 @ future $h2
- str r2,[sp,#4] @ future $h1
-
- mul r2,$s3,$h4
- eor r3,r3,r3
- umlal r0,r1,$h3,$s3
- ldr $r3,[sp,#28] @ reload $r3
- umlal r2,r3,$h3,$r0
- umlal r0,r1,$h2,$r0
- umlal r2,r3,$h2,$r1
- umlal r0,r1,$h1,$r1
- umlal r2,r3,$h1,$r2
- umlal r0,r1,$h0,$r2
- umlal r2,r3,$h0,$r3
- ldr $h0,[sp,#0]
- mul $h4,$r0,$h4
- ldr $h1,[sp,#4]
-
- adds $h2,lr,r0 @ d2+=d1>>32
- ldr lr,[sp,#8] @ reload input pointer
- adc r1,r1,#0
- adds $h3,r2,r1 @ d3+=d2>>32
- ldr r0,[sp,#16] @ reload end pointer
- adc r3,r3,#0
- add $h4,$h4,r3 @ h4+=d3>>32
-
- and r1,$h4,#-4
- and $h4,$h4,#3
- add r1,r1,r1,lsr#2 @ *=5
- adds $h0,$h0,r1
- adcs $h1,$h1,#0
- adcs $h2,$h2,#0
- adcs $h3,$h3,#0
- adc $h4,$h4,#0
-
- cmp r0,lr @ done yet?
- bhi .Loop
-
- ldr $ctx,[sp,#12]
- add sp,sp,#32
- stmia $ctx,{$h0-$h4} @ store the result
-
-.Lno_data:
-#if __ARM_ARCH__>=5
- ldmia sp!,{r3-r11,pc}
-#else
- ldmia sp!,{r3-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_blocks,.-poly1305_blocks
-___
-}
-{
-my ($ctx,$mac,$nonce)=map("r$_",(0..2));
-my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
-my $g4=$h4;
-
-$code.=<<___;
-.type poly1305_emit,%function
-.align 5
-poly1305_emit:
- stmdb sp!,{r4-r11}
-.Lpoly1305_emit_enter:
-
- ldmia $ctx,{$h0-$h4}
- adds $g0,$h0,#5 @ compare to modulus
- adcs $g1,$h1,#0
- adcs $g2,$h2,#0
- adcs $g3,$h3,#0
- adc $g4,$h4,#0
- tst $g4,#4 @ did it carry/borrow?
-
-#ifdef __thumb2__
- it ne
-#endif
- movne $h0,$g0
- ldr $g0,[$nonce,#0]
-#ifdef __thumb2__
- it ne
-#endif
- movne $h1,$g1
- ldr $g1,[$nonce,#4]
-#ifdef __thumb2__
- it ne
-#endif
- movne $h2,$g2
- ldr $g2,[$nonce,#8]
-#ifdef __thumb2__
- it ne
-#endif
- movne $h3,$g3
- ldr $g3,[$nonce,#12]
-
- adds $h0,$h0,$g0
- adcs $h1,$h1,$g1
- adcs $h2,$h2,$g2
- adc $h3,$h3,$g3
-
-#if __ARM_ARCH__>=7
-# ifdef __ARMEB__
- rev $h0,$h0
- rev $h1,$h1
- rev $h2,$h2
- rev $h3,$h3
-# endif
- str $h0,[$mac,#0]
- str $h1,[$mac,#4]
- str $h2,[$mac,#8]
- str $h3,[$mac,#12]
-#else
- strb $h0,[$mac,#0]
- mov $h0,$h0,lsr#8
- strb $h1,[$mac,#4]
- mov $h1,$h1,lsr#8
- strb $h2,[$mac,#8]
- mov $h2,$h2,lsr#8
- strb $h3,[$mac,#12]
- mov $h3,$h3,lsr#8
-
- strb $h0,[$mac,#1]
- mov $h0,$h0,lsr#8
- strb $h1,[$mac,#5]
- mov $h1,$h1,lsr#8
- strb $h2,[$mac,#9]
- mov $h2,$h2,lsr#8
- strb $h3,[$mac,#13]
- mov $h3,$h3,lsr#8
-
- strb $h0,[$mac,#2]
- mov $h0,$h0,lsr#8
- strb $h1,[$mac,#6]
- mov $h1,$h1,lsr#8
- strb $h2,[$mac,#10]
- mov $h2,$h2,lsr#8
- strb $h3,[$mac,#14]
- mov $h3,$h3,lsr#8
-
- strb $h0,[$mac,#3]
- strb $h1,[$mac,#7]
- strb $h2,[$mac,#11]
- strb $h3,[$mac,#15]
-#endif
- ldmia sp!,{r4-r11}
-#if __ARM_ARCH__>=5
- ret @ bx lr
-#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_emit,.-poly1305_emit
-___
-{
-my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
-my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
-my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
-
-my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
-
-$code.=<<___;
-#if (defined(__KERNEL__) && defined(CONFIG_KERNEL_MODE_NEON)) || (!defined(__KERNEL__) && __ARM_MAX_ARCH__>=7)
-.fpu neon
-
-.type poly1305_init_neon,%function
-.align 5
-poly1305_init_neon:
-.Lpoly1305_init_neon:
- ldr r4,[$ctx,#20] @ load key base 2^32
- ldr r5,[$ctx,#24]
- ldr r6,[$ctx,#28]
- ldr r7,[$ctx,#32]
-
- and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
- mov r3,r4,lsr#26
- mov r4,r5,lsr#20
- orr r3,r3,r5,lsl#6
- mov r5,r6,lsr#14
- orr r4,r4,r6,lsl#12
- mov r6,r7,lsr#8
- orr r5,r5,r7,lsl#18
- and r3,r3,#0x03ffffff
- and r4,r4,#0x03ffffff
- and r5,r5,#0x03ffffff
-
- vdup.32 $R0,r2 @ r^1 in both lanes
- add r2,r3,r3,lsl#2 @ *5
- vdup.32 $R1,r3
- add r3,r4,r4,lsl#2
- vdup.32 $S1,r2
- vdup.32 $R2,r4
- add r4,r5,r5,lsl#2
- vdup.32 $S2,r3
- vdup.32 $R3,r5
- add r5,r6,r6,lsl#2
- vdup.32 $S3,r4
- vdup.32 $R4,r6
- vdup.32 $S4,r5
-
- mov $zeros,#2 @ counter
-
-.Lsquare_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
-
- vmull.u32 $D0,$R0,${R0}[1]
- vmull.u32 $D1,$R1,${R0}[1]
- vmull.u32 $D2,$R2,${R0}[1]
- vmull.u32 $D3,$R3,${R0}[1]
- vmull.u32 $D4,$R4,${R0}[1]
-
- vmlal.u32 $D0,$R4,${S1}[1]
- vmlal.u32 $D1,$R0,${R1}[1]
- vmlal.u32 $D2,$R1,${R1}[1]
- vmlal.u32 $D3,$R2,${R1}[1]
- vmlal.u32 $D4,$R3,${R1}[1]
-
- vmlal.u32 $D0,$R3,${S2}[1]
- vmlal.u32 $D1,$R4,${S2}[1]
- vmlal.u32 $D3,$R1,${R2}[1]
- vmlal.u32 $D2,$R0,${R2}[1]
- vmlal.u32 $D4,$R2,${R2}[1]
-
- vmlal.u32 $D0,$R2,${S3}[1]
- vmlal.u32 $D3,$R0,${R3}[1]
- vmlal.u32 $D1,$R3,${S3}[1]
- vmlal.u32 $D2,$R4,${S3}[1]
- vmlal.u32 $D4,$R1,${R3}[1]
-
- vmlal.u32 $D3,$R4,${S4}[1]
- vmlal.u32 $D0,$R1,${S4}[1]
- vmlal.u32 $D1,$R2,${S4}[1]
- vmlal.u32 $D2,$R3,${S4}[1]
- vmlal.u32 $D4,$R0,${R4}[1]
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- @ and P. Schwabe
- @
- @ H0>>+H1>>+H2>>+H3>>+H4
- @ H3>>+H4>>*5+H0>>+H1
- @
- @ Trivia.
- @
- @ Result of multiplication of n-bit number by m-bit number is
- @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
- @ m-bit number multiplied by 2^n is still n+m bits wide.
- @
- @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
- @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
- @ one is n+1 bits wide.
- @
- @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
- @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
- @ can be 27. However! In cases when their width exceeds 26 bits
- @ they are limited by 2^26+2^6. This in turn means that *sum*
- @ of the products with these values can still be viewed as sum
- @ of 52-bit numbers as long as the amount of addends is not a
- @ power of 2. For example,
- @
- @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
- @
- @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
- @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
- @ 8 * (2^52) or 2^55. However, the value is then multiplied by
- @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
- @ which is less than 32 * (2^52) or 2^57. And when processing
- @ data we are looking at triple as many addends...
- @
- @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
- @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
- @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
- @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
- @ instruction accepts 2x32-bit input and writes 2x64-bit result.
- @ This means that result of reduction have to be compressed upon
- @ loop wrap-around. This can be done in the process of reduction
- @ to minimize amount of instructions [as well as amount of
- @ 128-bit instructions, which benefits low-end processors], but
- @ one has to watch for H2 (which is narrower than H0) and 5*H4
- @ not being wider than 58 bits, so that result of right shift
- @ by 26 bits fits in 32 bits. This is also useful on x86,
- @ because it allows to use paddd in place for paddq, which
- @ benefits Atom, where paddq is ridiculously slow.
-
- vshr.u64 $T0,$D3,#26
- vmovn.i64 $D3#lo,$D3
- vshr.u64 $T1,$D0,#26
- vmovn.i64 $D0#lo,$D0
- vadd.i64 $D4,$D4,$T0 @ h3 -> h4
- vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
- vadd.i64 $D1,$D1,$T1 @ h0 -> h1
- vbic.i32 $D0#lo,#0xfc000000
-
- vshrn.u64 $T0#lo,$D4,#26
- vmovn.i64 $D4#lo,$D4
- vshr.u64 $T1,$D1,#26
- vmovn.i64 $D1#lo,$D1
- vadd.i64 $D2,$D2,$T1 @ h1 -> h2
- vbic.i32 $D4#lo,#0xfc000000
- vbic.i32 $D1#lo,#0xfc000000
-
- vadd.i32 $D0#lo,$D0#lo,$T0#lo
- vshl.u32 $T0#lo,$T0#lo,#2
- vshrn.u64 $T1#lo,$D2,#26
- vmovn.i64 $D2#lo,$D2
- vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
- vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
- vbic.i32 $D2#lo,#0xfc000000
-
- vshr.u32 $T0#lo,$D0#lo,#26
- vbic.i32 $D0#lo,#0xfc000000
- vshr.u32 $T1#lo,$D3#lo,#26
- vbic.i32 $D3#lo,#0xfc000000
- vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
- vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
-
- subs $zeros,$zeros,#1
- beq .Lsquare_break_neon
-
- add $tbl0,$ctx,#(48+0*9*4)
- add $tbl1,$ctx,#(48+1*9*4)
-
- vtrn.32 $R0,$D0#lo @ r^2:r^1
- vtrn.32 $R2,$D2#lo
- vtrn.32 $R3,$D3#lo
- vtrn.32 $R1,$D1#lo
- vtrn.32 $R4,$D4#lo
-
- vshl.u32 $S2,$R2,#2 @ *5
- vshl.u32 $S3,$R3,#2
- vshl.u32 $S1,$R1,#2
- vshl.u32 $S4,$R4,#2
- vadd.i32 $S2,$S2,$R2
- vadd.i32 $S1,$S1,$R1
- vadd.i32 $S3,$S3,$R3
- vadd.i32 $S4,$S4,$R4
-
- vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
- vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
- vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vst1.32 {${S4}[0]},[$tbl0,:32]
- vst1.32 {${S4}[1]},[$tbl1,:32]
-
- b .Lsquare_neon
-
-.align 4
-.Lsquare_break_neon:
- add $tbl0,$ctx,#(48+2*4*9)
- add $tbl1,$ctx,#(48+3*4*9)
-
- vmov $R0,$D0#lo @ r^4:r^3
- vshl.u32 $S1,$D1#lo,#2 @ *5
- vmov $R1,$D1#lo
- vshl.u32 $S2,$D2#lo,#2
- vmov $R2,$D2#lo
- vshl.u32 $S3,$D3#lo,#2
- vmov $R3,$D3#lo
- vshl.u32 $S4,$D4#lo,#2
- vmov $R4,$D4#lo
- vadd.i32 $S1,$S1,$D1#lo
- vadd.i32 $S2,$S2,$D2#lo
- vadd.i32 $S3,$S3,$D3#lo
- vadd.i32 $S4,$S4,$D4#lo
-
- vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
- vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
- vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vst1.32 {${S4}[0]},[$tbl0]
- vst1.32 {${S4}[1]},[$tbl1]
-
- ret @ bx lr
-.size poly1305_init_neon,.-poly1305_init_neon
-
-#ifdef __KERNEL__
-.globl poly1305_blocks_neon
-#endif
-.type poly1305_blocks_neon,%function
-.align 5
-poly1305_blocks_neon:
- ldr ip,[$ctx,#36] @ is_base2_26
- ands $len,$len,#-16
- beq .Lno_data_neon
-
- cmp $len,#64
- bhs .Lenter_neon
- tst ip,ip @ is_base2_26?
- beq .Lpoly1305_blocks
-
-.Lenter_neon:
- stmdb sp!,{r4-r7}
- vstmdb sp!,{d8-d15} @ ABI specification says so
-
- tst ip,ip @ is_base2_26?
- bne .Lbase2_26_neon
-
- stmdb sp!,{r1-r3,lr}
- bl .Lpoly1305_init_neon
-
- ldr r4,[$ctx,#0] @ load hash value base 2^32
- ldr r5,[$ctx,#4]
- ldr r6,[$ctx,#8]
- ldr r7,[$ctx,#12]
- ldr ip,[$ctx,#16]
-
- and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
- mov r3,r4,lsr#26
- veor $D0#lo,$D0#lo,$D0#lo
- mov r4,r5,lsr#20
- orr r3,r3,r5,lsl#6
- veor $D1#lo,$D1#lo,$D1#lo
- mov r5,r6,lsr#14
- orr r4,r4,r6,lsl#12
- veor $D2#lo,$D2#lo,$D2#lo
- mov r6,r7,lsr#8
- orr r5,r5,r7,lsl#18
- veor $D3#lo,$D3#lo,$D3#lo
- and r3,r3,#0x03ffffff
- orr r6,r6,ip,lsl#24
- veor $D4#lo,$D4#lo,$D4#lo
- and r4,r4,#0x03ffffff
- mov r1,#1
- and r5,r5,#0x03ffffff
- str r1,[$ctx,#36] @ is_base2_26
-
- vmov.32 $D0#lo[0],r2
- vmov.32 $D1#lo[0],r3
- vmov.32 $D2#lo[0],r4
- vmov.32 $D3#lo[0],r5
- vmov.32 $D4#lo[0],r6
- adr $zeros,.Lzeros
-
- ldmia sp!,{r1-r3,lr}
- b .Lbase2_32_neon
-
-.align 4
-.Lbase2_26_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ load hash value
-
- veor $D0#lo,$D0#lo,$D0#lo
- veor $D1#lo,$D1#lo,$D1#lo
- veor $D2#lo,$D2#lo,$D2#lo
- veor $D3#lo,$D3#lo,$D3#lo
- veor $D4#lo,$D4#lo,$D4#lo
- vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
- adr $zeros,.Lzeros
- vld1.32 {$D4#lo[0]},[$ctx]
- sub $ctx,$ctx,#16 @ rewind
-
-.Lbase2_32_neon:
- add $in2,$inp,#32
- mov $padbit,$padbit,lsl#24
- tst $len,#31
- beq .Leven
-
- vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
- vmov.32 $H4#lo[0],$padbit
- sub $len,$len,#16
- add $in2,$inp,#32
-
-# ifdef __ARMEB__
- vrev32.8 $H0,$H0
- vrev32.8 $H3,$H3
- vrev32.8 $H1,$H1
- vrev32.8 $H2,$H2
-# endif
- vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
- vshl.u32 $H3#lo,$H3#lo,#18
-
- vsri.u32 $H3#lo,$H2#lo,#14
- vshl.u32 $H2#lo,$H2#lo,#12
- vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
-
- vbic.i32 $H3#lo,#0xfc000000
- vsri.u32 $H2#lo,$H1#lo,#20
- vshl.u32 $H1#lo,$H1#lo,#6
-
- vbic.i32 $H2#lo,#0xfc000000
- vsri.u32 $H1#lo,$H0#lo,#26
- vadd.i32 $H3#hi,$H3#lo,$D3#lo
-
- vbic.i32 $H0#lo,#0xfc000000
- vbic.i32 $H1#lo,#0xfc000000
- vadd.i32 $H2#hi,$H2#lo,$D2#lo
-
- vadd.i32 $H0#hi,$H0#lo,$D0#lo
- vadd.i32 $H1#hi,$H1#lo,$D1#lo
-
- mov $tbl1,$zeros
- add $tbl0,$ctx,#48
-
- cmp $len,$len
- b .Long_tail
-
-.align 4
-.Leven:
- subs $len,$len,#64
- it lo
- movlo $in2,$zeros
-
- vmov.i32 $H4,#1<<24 @ padbit, yes, always
- vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
- add $inp,$inp,#64
- vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
- add $in2,$in2,#64
- itt hi
- addhi $tbl1,$ctx,#(48+1*9*4)
- addhi $tbl0,$ctx,#(48+3*9*4)
-
-# ifdef __ARMEB__
- vrev32.8 $H0,$H0
- vrev32.8 $H3,$H3
- vrev32.8 $H1,$H1
- vrev32.8 $H2,$H2
-# endif
- vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
- vshl.u32 $H3,$H3,#18
-
- vsri.u32 $H3,$H2,#14
- vshl.u32 $H2,$H2,#12
-
- vbic.i32 $H3,#0xfc000000
- vsri.u32 $H2,$H1,#20
- vshl.u32 $H1,$H1,#6
-
- vbic.i32 $H2,#0xfc000000
- vsri.u32 $H1,$H0,#26
-
- vbic.i32 $H0,#0xfc000000
- vbic.i32 $H1,#0xfc000000
-
- bls .Lskip_loop
-
- vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
- vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
- vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- b .Loop_neon
-
-.align 5
-.Loop_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- @ \___________________/
- @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- @ \___________________/ \____________________/
- @
- @ Note that we start with inp[2:3]*r^2. This is because it
- @ doesn't depend on reduction in previous iteration.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ inp[2:3]*r^2
-
- vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
- vmull.u32 $D2,$H2#hi,${R0}[1]
- vadd.i32 $H0#lo,$H0#lo,$D0#lo
- vmull.u32 $D0,$H0#hi,${R0}[1]
- vadd.i32 $H3#lo,$H3#lo,$D3#lo
- vmull.u32 $D3,$H3#hi,${R0}[1]
- vmlal.u32 $D2,$H1#hi,${R1}[1]
- vadd.i32 $H1#lo,$H1#lo,$D1#lo
- vmull.u32 $D1,$H1#hi,${R0}[1]
-
- vadd.i32 $H4#lo,$H4#lo,$D4#lo
- vmull.u32 $D4,$H4#hi,${R0}[1]
- subs $len,$len,#64
- vmlal.u32 $D0,$H4#hi,${S1}[1]
- it lo
- movlo $in2,$zeros
- vmlal.u32 $D3,$H2#hi,${R1}[1]
- vld1.32 ${S4}[1],[$tbl1,:32]
- vmlal.u32 $D1,$H0#hi,${R1}[1]
- vmlal.u32 $D4,$H3#hi,${R1}[1]
-
- vmlal.u32 $D0,$H3#hi,${S2}[1]
- vmlal.u32 $D3,$H1#hi,${R2}[1]
- vmlal.u32 $D4,$H2#hi,${R2}[1]
- vmlal.u32 $D1,$H4#hi,${S2}[1]
- vmlal.u32 $D2,$H0#hi,${R2}[1]
-
- vmlal.u32 $D3,$H0#hi,${R3}[1]
- vmlal.u32 $D0,$H2#hi,${S3}[1]
- vmlal.u32 $D4,$H1#hi,${R3}[1]
- vmlal.u32 $D1,$H3#hi,${S3}[1]
- vmlal.u32 $D2,$H4#hi,${S3}[1]
-
- vmlal.u32 $D3,$H4#hi,${S4}[1]
- vmlal.u32 $D0,$H1#hi,${S4}[1]
- vmlal.u32 $D4,$H0#hi,${R4}[1]
- vmlal.u32 $D1,$H2#hi,${S4}[1]
- vmlal.u32 $D2,$H3#hi,${S4}[1]
-
- vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
- add $in2,$in2,#64
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ (hash+inp[0:1])*r^4 and accumulate
-
- vmlal.u32 $D3,$H3#lo,${R0}[0]
- vmlal.u32 $D0,$H0#lo,${R0}[0]
- vmlal.u32 $D4,$H4#lo,${R0}[0]
- vmlal.u32 $D1,$H1#lo,${R0}[0]
- vmlal.u32 $D2,$H2#lo,${R0}[0]
- vld1.32 ${S4}[0],[$tbl0,:32]
-
- vmlal.u32 $D3,$H2#lo,${R1}[0]
- vmlal.u32 $D0,$H4#lo,${S1}[0]
- vmlal.u32 $D4,$H3#lo,${R1}[0]
- vmlal.u32 $D1,$H0#lo,${R1}[0]
- vmlal.u32 $D2,$H1#lo,${R1}[0]
-
- vmlal.u32 $D3,$H1#lo,${R2}[0]
- vmlal.u32 $D0,$H3#lo,${S2}[0]
- vmlal.u32 $D4,$H2#lo,${R2}[0]
- vmlal.u32 $D1,$H4#lo,${S2}[0]
- vmlal.u32 $D2,$H0#lo,${R2}[0]
-
- vmlal.u32 $D3,$H0#lo,${R3}[0]
- vmlal.u32 $D0,$H2#lo,${S3}[0]
- vmlal.u32 $D4,$H1#lo,${R3}[0]
- vmlal.u32 $D1,$H3#lo,${S3}[0]
- vmlal.u32 $D3,$H4#lo,${S4}[0]
-
- vmlal.u32 $D2,$H4#lo,${S3}[0]
- vmlal.u32 $D0,$H1#lo,${S4}[0]
- vmlal.u32 $D4,$H0#lo,${R4}[0]
- vmov.i32 $H4,#1<<24 @ padbit, yes, always
- vmlal.u32 $D1,$H2#lo,${S4}[0]
- vmlal.u32 $D2,$H3#lo,${S4}[0]
-
- vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
- add $inp,$inp,#64
-# ifdef __ARMEB__
- vrev32.8 $H0,$H0
- vrev32.8 $H1,$H1
- vrev32.8 $H2,$H2
- vrev32.8 $H3,$H3
-# endif
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction interleaved with base 2^32 -> base 2^26 of
- @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
-
- vshr.u64 $T0,$D3,#26
- vmovn.i64 $D3#lo,$D3
- vshr.u64 $T1,$D0,#26
- vmovn.i64 $D0#lo,$D0
- vadd.i64 $D4,$D4,$T0 @ h3 -> h4
- vbic.i32 $D3#lo,#0xfc000000
- vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
- vadd.i64 $D1,$D1,$T1 @ h0 -> h1
- vshl.u32 $H3,$H3,#18
- vbic.i32 $D0#lo,#0xfc000000
-
- vshrn.u64 $T0#lo,$D4,#26
- vmovn.i64 $D4#lo,$D4
- vshr.u64 $T1,$D1,#26
- vmovn.i64 $D1#lo,$D1
- vadd.i64 $D2,$D2,$T1 @ h1 -> h2
- vsri.u32 $H3,$H2,#14
- vbic.i32 $D4#lo,#0xfc000000
- vshl.u32 $H2,$H2,#12
- vbic.i32 $D1#lo,#0xfc000000
-
- vadd.i32 $D0#lo,$D0#lo,$T0#lo
- vshl.u32 $T0#lo,$T0#lo,#2
- vbic.i32 $H3,#0xfc000000
- vshrn.u64 $T1#lo,$D2,#26
- vmovn.i64 $D2#lo,$D2
- vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
- vsri.u32 $H2,$H1,#20
- vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
- vshl.u32 $H1,$H1,#6
- vbic.i32 $D2#lo,#0xfc000000
- vbic.i32 $H2,#0xfc000000
-
- vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
- vmovn.i64 $D0#lo,$D0
- vsri.u32 $H1,$H0,#26
- vbic.i32 $H0,#0xfc000000
- vshr.u32 $T1#lo,$D3#lo,#26
- vbic.i32 $D3#lo,#0xfc000000
- vbic.i32 $D0#lo,#0xfc000000
- vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
- vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
- vbic.i32 $H1,#0xfc000000
-
- bhi .Loop_neon
-
-.Lskip_loop:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- add $tbl1,$ctx,#(48+0*9*4)
- add $tbl0,$ctx,#(48+1*9*4)
- adds $len,$len,#32
- it ne
- movne $len,#0
- bne .Long_tail
-
- vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
- vadd.i32 $H0#hi,$H0#lo,$D0#lo
- vadd.i32 $H3#hi,$H3#lo,$D3#lo
- vadd.i32 $H1#hi,$H1#lo,$D1#lo
- vadd.i32 $H4#hi,$H4#lo,$D4#lo
-
-.Long_tail:
- vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
- vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
-
- vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
- vmull.u32 $D2,$H2#hi,$R0
- vadd.i32 $H0#lo,$H0#lo,$D0#lo
- vmull.u32 $D0,$H0#hi,$R0
- vadd.i32 $H3#lo,$H3#lo,$D3#lo
- vmull.u32 $D3,$H3#hi,$R0
- vadd.i32 $H1#lo,$H1#lo,$D1#lo
- vmull.u32 $D1,$H1#hi,$R0
- vadd.i32 $H4#lo,$H4#lo,$D4#lo
- vmull.u32 $D4,$H4#hi,$R0
-
- vmlal.u32 $D0,$H4#hi,$S1
- vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vmlal.u32 $D3,$H2#hi,$R1
- vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vmlal.u32 $D1,$H0#hi,$R1
- vmlal.u32 $D4,$H3#hi,$R1
- vmlal.u32 $D2,$H1#hi,$R1
-
- vmlal.u32 $D3,$H1#hi,$R2
- vld1.32 ${S4}[1],[$tbl1,:32]
- vmlal.u32 $D0,$H3#hi,$S2
- vld1.32 ${S4}[0],[$tbl0,:32]
- vmlal.u32 $D4,$H2#hi,$R2
- vmlal.u32 $D1,$H4#hi,$S2
- vmlal.u32 $D2,$H0#hi,$R2
-
- vmlal.u32 $D3,$H0#hi,$R3
- it ne
- addne $tbl1,$ctx,#(48+2*9*4)
- vmlal.u32 $D0,$H2#hi,$S3
- it ne
- addne $tbl0,$ctx,#(48+3*9*4)
- vmlal.u32 $D4,$H1#hi,$R3
- vmlal.u32 $D1,$H3#hi,$S3
- vmlal.u32 $D2,$H4#hi,$S3
-
- vmlal.u32 $D3,$H4#hi,$S4
- vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
- vmlal.u32 $D0,$H1#hi,$S4
- vshr.u64 $MASK,$MASK,#38
- vmlal.u32 $D4,$H0#hi,$R4
- vmlal.u32 $D1,$H2#hi,$S4
- vmlal.u32 $D2,$H3#hi,$S4
-
- beq .Lshort_tail
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ (hash+inp[0:1])*r^4:r^3 and accumulate
-
- vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
- vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
-
- vmlal.u32 $D2,$H2#lo,$R0
- vmlal.u32 $D0,$H0#lo,$R0
- vmlal.u32 $D3,$H3#lo,$R0
- vmlal.u32 $D1,$H1#lo,$R0
- vmlal.u32 $D4,$H4#lo,$R0
-
- vmlal.u32 $D0,$H4#lo,$S1
- vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vmlal.u32 $D3,$H2#lo,$R1
- vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vmlal.u32 $D1,$H0#lo,$R1
- vmlal.u32 $D4,$H3#lo,$R1
- vmlal.u32 $D2,$H1#lo,$R1
-
- vmlal.u32 $D3,$H1#lo,$R2
- vld1.32 ${S4}[1],[$tbl1,:32]
- vmlal.u32 $D0,$H3#lo,$S2
- vld1.32 ${S4}[0],[$tbl0,:32]
- vmlal.u32 $D4,$H2#lo,$R2
- vmlal.u32 $D1,$H4#lo,$S2
- vmlal.u32 $D2,$H0#lo,$R2
-
- vmlal.u32 $D3,$H0#lo,$R3
- vmlal.u32 $D0,$H2#lo,$S3
- vmlal.u32 $D4,$H1#lo,$R3
- vmlal.u32 $D1,$H3#lo,$S3
- vmlal.u32 $D2,$H4#lo,$S3
-
- vmlal.u32 $D3,$H4#lo,$S4
- vorn $MASK,$MASK,$MASK @ all-ones
- vmlal.u32 $D0,$H1#lo,$S4
- vshr.u64 $MASK,$MASK,#38
- vmlal.u32 $D4,$H0#lo,$R4
- vmlal.u32 $D1,$H2#lo,$S4
- vmlal.u32 $D2,$H3#lo,$S4
-
-.Lshort_tail:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ horizontal addition
-
- vadd.i64 $D3#lo,$D3#lo,$D3#hi
- vadd.i64 $D0#lo,$D0#lo,$D0#hi
- vadd.i64 $D4#lo,$D4#lo,$D4#hi
- vadd.i64 $D1#lo,$D1#lo,$D1#hi
- vadd.i64 $D2#lo,$D2#lo,$D2#hi
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction, but without narrowing
-
- vshr.u64 $T0,$D3,#26
- vand.i64 $D3,$D3,$MASK
- vshr.u64 $T1,$D0,#26
- vand.i64 $D0,$D0,$MASK
- vadd.i64 $D4,$D4,$T0 @ h3 -> h4
- vadd.i64 $D1,$D1,$T1 @ h0 -> h1
-
- vshr.u64 $T0,$D4,#26
- vand.i64 $D4,$D4,$MASK
- vshr.u64 $T1,$D1,#26
- vand.i64 $D1,$D1,$MASK
- vadd.i64 $D2,$D2,$T1 @ h1 -> h2
-
- vadd.i64 $D0,$D0,$T0
- vshl.u64 $T0,$T0,#2
- vshr.u64 $T1,$D2,#26
- vand.i64 $D2,$D2,$MASK
- vadd.i64 $D0,$D0,$T0 @ h4 -> h0
- vadd.i64 $D3,$D3,$T1 @ h2 -> h3
-
- vshr.u64 $T0,$D0,#26
- vand.i64 $D0,$D0,$MASK
- vshr.u64 $T1,$D3,#26
- vand.i64 $D3,$D3,$MASK
- vadd.i64 $D1,$D1,$T0 @ h0 -> h1
- vadd.i64 $D4,$D4,$T1 @ h3 -> h4
-
- cmp $len,#0
- bne .Leven
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ store hash value
-
- vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
- vst1.32 {$D4#lo[0]},[$ctx]
-
- vldmia sp!,{d8-d15} @ epilogue
- ldmia sp!,{r4-r7}
-.Lno_data_neon:
- ret @ bx lr
-.size poly1305_blocks_neon,.-poly1305_blocks_neon
-
-#ifdef __KERNEL__
-.globl poly1305_emit_neon
-#endif
-.type poly1305_emit_neon,%function
-.align 5
-poly1305_emit_neon:
- ldr ip,[$ctx,#36] @ is_base2_26
-
- stmdb sp!,{r4-r11}
-
- tst ip,ip
- beq .Lpoly1305_emit_enter
-
- ldmia $ctx,{$h0-$h4}
- eor $g0,$g0,$g0
-
- adds $h0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
- mov $h1,$h1,lsr#6
- adcs $h1,$h1,$h2,lsl#20
- mov $h2,$h2,lsr#12
- adcs $h2,$h2,$h3,lsl#14
- mov $h3,$h3,lsr#18
- adcs $h3,$h3,$h4,lsl#8
- adc $h4,$g0,$h4,lsr#24 @ can be partially reduced ...
-
- and $g0,$h4,#-4 @ ... so reduce
- and $h4,$h3,#3
- add $g0,$g0,$g0,lsr#2 @ *= 5
- adds $h0,$h0,$g0
- adcs $h1,$h1,#0
- adcs $h2,$h2,#0
- adcs $h3,$h3,#0
- adc $h4,$h4,#0
-
- adds $g0,$h0,#5 @ compare to modulus
- adcs $g1,$h1,#0
- adcs $g2,$h2,#0
- adcs $g3,$h3,#0
- adc $g4,$h4,#0
- tst $g4,#4 @ did it carry/borrow?
-
- it ne
- movne $h0,$g0
- ldr $g0,[$nonce,#0]
- it ne
- movne $h1,$g1
- ldr $g1,[$nonce,#4]
- it ne
- movne $h2,$g2
- ldr $g2,[$nonce,#8]
- it ne
- movne $h3,$g3
- ldr $g3,[$nonce,#12]
-
- adds $h0,$h0,$g0 @ accumulate nonce
- adcs $h1,$h1,$g1
- adcs $h2,$h2,$g2
- adc $h3,$h3,$g3
-
-# ifdef __ARMEB__
- rev $h0,$h0
- rev $h1,$h1
- rev $h2,$h2
- rev $h3,$h3
-# endif
- str $h0,[$mac,#0] @ store the result
- str $h1,[$mac,#4]
- str $h2,[$mac,#8]
- str $h3,[$mac,#12]
-
- ldmia sp!,{r4-r11}
- ret @ bx lr
-.size poly1305_emit_neon,.-poly1305_emit_neon
-
-.align 5
-.Lzeros:
-.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-# ifndef __KERNEL__
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.Lpoly1305_init
-# endif
-#endif
-___
-} }
-$code.=<<___;
-.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm OPENSSL_armcap_P,4,4
-#endif
-___
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/@/ and !/^$/);
- print;
-}
-close SELF;
-
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/geo;
-
- s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
- s/\bret\b/bx lr/go or
- s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
-
- print $_,"\n";
-}
-close STDOUT; # enforce flush
diff --git a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm64.pl b/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm64.pl
deleted file mode 100755
index d513b45a149b..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm64.pl
+++ /dev/null
@@ -1,974 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# This module implements Poly1305 hash for ARMv8.
-#
-# June 2015
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone.
-#
-# IALU/gcc-4.9 NEON
-#
-# Apple A7 1.86/+5% 0.72
-# Cortex-A53 2.69/+58% 1.47
-# Cortex-A57 2.70/+7% 1.14
-# Denver 1.64/+50% 1.18(*)
-# X-Gene 2.13/+68% 2.27
-# Mongoose 1.77/+75% 1.12
-# Kryo 2.70/+55% 1.13
-#
-# (*) estimate based on resources availability is less than 1.0,
-# i.e. measured result is worse than expected, presumably binary
-# translator is not almighty;
-
-$flavour=shift;
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
- open STDOUT,">$output";
-}
-
-my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
-my ($mac,$nonce)=($inp,$len);
-
-my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-.extern OPENSSL_armcap_P
-#else
-# define poly1305_init poly1305_init_arm
-# define poly1305_blocks poly1305_blocks_arm
-# define poly1305_emit poly1305_emit_arm
-#endif
-
-.text
-
-// forward "declarations" are required for Apple
-.globl poly1305_blocks
-.globl poly1305_emit
-.globl poly1305_init
-.type poly1305_init,%function
-.align 5
-poly1305_init:
- cmp $inp,xzr
- stp xzr,xzr,[$ctx] // zero hash value
- stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
-
- csel x0,xzr,x0,eq
- b.eq .Lno_key
-
-#ifndef __KERNEL__
-# ifdef __ILP32__
- ldrsw $t1,.LOPENSSL_armcap_P
-# else
- ldr $t1,.LOPENSSL_armcap_P
-# endif
- adr $t0,.LOPENSSL_armcap_P
- ldr w17,[$t0,$t1]
-#endif
-
- ldp $r0,$r1,[$inp] // load key
- mov $s1,#0xfffffffc0fffffff
- movk $s1,#0x0fff,lsl#48
-#ifdef __AARCH64EB__
- rev $r0,$r0 // flip bytes
- rev $r1,$r1
-#endif
- and $r0,$r0,$s1 // &=0ffffffc0fffffff
- and $s1,$s1,#-4
- and $r1,$r1,$s1 // &=0ffffffc0ffffffc
- stp $r0,$r1,[$ctx,#32] // save key value
-
-#ifndef __KERNEL__
- tst w17,#ARMV7_NEON
-
- adr $d0,poly1305_blocks
- adr $r0,poly1305_blocks_neon
- adr $d1,poly1305_emit
- adr $r1,poly1305_emit_neon
-
- csel $d0,$d0,$r0,eq
- csel $d1,$d1,$r1,eq
-
-# ifdef __ILP32__
- stp w12,w13,[$len]
-# else
- stp $d0,$d1,[$len]
-# endif
-
- mov x0,#1
-#else
- mov x0,#0
-#endif
-.Lno_key:
- ret
-.size poly1305_init,.-poly1305_init
-
-.type poly1305_blocks,%function
-.align 5
-poly1305_blocks:
- ands $len,$len,#-16
- b.eq .Lno_data
-
- ldp $h0,$h1,[$ctx] // load hash value
- ldp $r0,$r1,[$ctx,#32] // load key value
- ldr $h2,[$ctx,#16]
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
- b .Loop
-
-.align 5
-.Loop:
- ldp $t0,$t1,[$inp],#16 // load input
- sub $len,$len,#16
-#ifdef __AARCH64EB__
- rev $t0,$t0
- rev $t1,$t1
-#endif
- adds $h0,$h0,$t0 // accumulate input
- adcs $h1,$h1,$t1
-
- mul $d0,$h0,$r0 // h0*r0
- adc $h2,$h2,$padbit
- umulh $d1,$h0,$r0
-
- mul $t0,$h1,$s1 // h1*5*r1
- umulh $t1,$h1,$s1
-
- adds $d0,$d0,$t0
- mul $t0,$h0,$r1 // h0*r1
- adc $d1,$d1,$t1
- umulh $d2,$h0,$r1
-
- adds $d1,$d1,$t0
- mul $t0,$h1,$r0 // h1*r0
- adc $d2,$d2,xzr
- umulh $t1,$h1,$r0
-
- adds $d1,$d1,$t0
- mul $t0,$h2,$s1 // h2*5*r1
- adc $d2,$d2,$t1
- mul $t1,$h2,$r0 // h2*r0
-
- adds $d1,$d1,$t0
- adc $d2,$d2,$t1
-
- and $t0,$d2,#-4 // final reduction
- and $h2,$d2,#3
- add $t0,$t0,$d2,lsr#2
- adds $h0,$d0,$t0
- adcs $h1,$d1,xzr
- adc $h2,$h2,xzr
-
- cbnz $len,.Loop
-
- stp $h0,$h1,[$ctx] // store hash value
- str $h2,[$ctx,#16]
-
-.Lno_data:
- ret
-.size poly1305_blocks,.-poly1305_blocks
-
-.type poly1305_emit,%function
-.align 5
-poly1305_emit:
- ldp $h0,$h1,[$ctx] // load hash base 2^64
- ldr $h2,[$ctx,#16]
- ldp $t0,$t1,[$nonce] // load nonce
-
- adds $d0,$h0,#5 // compare to modulus
- adcs $d1,$h1,xzr
- adc $d2,$h2,xzr
-
- tst $d2,#-4 // see if it's carried/borrowed
-
- csel $h0,$h0,$d0,eq
- csel $h1,$h1,$d1,eq
-
-#ifdef __AARCH64EB__
- ror $t0,$t0,#32 // flip nonce words
- ror $t1,$t1,#32
-#endif
- adds $h0,$h0,$t0 // accumulate nonce
- adc $h1,$h1,$t1
-#ifdef __AARCH64EB__
- rev $h0,$h0 // flip output bytes
- rev $h1,$h1
-#endif
- stp $h0,$h1,[$mac] // write result
-
- ret
-.size poly1305_emit,.-poly1305_emit
-___
-my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
-my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
-my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
-my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
-my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
-my ($T0,$T1,$MASK) = map("v$_",(29..31));
-
-my ($in2,$zeros)=("x16","x17");
-my $is_base2_26 = $zeros; # borrow
-
-$code.=<<___;
-.type __poly1305_mult,%function
-.align 5
-__poly1305_mult:
- mul $d0,$h0,$r0 // h0*r0
- umulh $d1,$h0,$r0
-
- mul $t0,$h1,$s1 // h1*5*r1
- umulh $t1,$h1,$s1
-
- adds $d0,$d0,$t0
- mul $t0,$h0,$r1 // h0*r1
- adc $d1,$d1,$t1
- umulh $d2,$h0,$r1
-
- adds $d1,$d1,$t0
- mul $t0,$h1,$r0 // h1*r0
- adc $d2,$d2,xzr
- umulh $t1,$h1,$r0
-
- adds $d1,$d1,$t0
- mul $t0,$h2,$s1 // h2*5*r1
- adc $d2,$d2,$t1
- mul $t1,$h2,$r0 // h2*r0
-
- adds $d1,$d1,$t0
- adc $d2,$d2,$t1
-
- and $t0,$d2,#-4 // final reduction
- and $h2,$d2,#3
- add $t0,$t0,$d2,lsr#2
- adds $h0,$d0,$t0
- adcs $h1,$d1,xzr
- adc $h2,$h2,xzr
-
- ret
-.size __poly1305_mult,.-__poly1305_mult
-
-.type __poly1305_splat,%function
-.align 5
-__poly1305_splat:
- and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x13,$h0,#26,#26
- extr x14,$h1,$h0,#52
- and x14,x14,#0x03ffffff
- ubfx x15,$h1,#14,#26
- extr x16,$h2,$h1,#40
-
- str w12,[$ctx,#16*0] // r0
- add w12,w13,w13,lsl#2 // r1*5
- str w13,[$ctx,#16*1] // r1
- add w13,w14,w14,lsl#2 // r2*5
- str w12,[$ctx,#16*2] // s1
- str w14,[$ctx,#16*3] // r2
- add w14,w15,w15,lsl#2 // r3*5
- str w13,[$ctx,#16*4] // s2
- str w15,[$ctx,#16*5] // r3
- add w15,w16,w16,lsl#2 // r4*5
- str w14,[$ctx,#16*6] // s3
- str w16,[$ctx,#16*7] // r4
- str w15,[$ctx,#16*8] // s4
-
- ret
-.size __poly1305_splat,.-__poly1305_splat
-
-#if !defined(__KERNEL__) || defined(CONFIG_KERNEL_MODE_NEON)
-#ifdef __KERNEL__
-.globl poly1305_blocks_neon
-.globl poly1305_emit_neon
-#endif
-
-.type poly1305_blocks_neon,%function
-.align 5
-poly1305_blocks_neon:
- ldr $is_base2_26,[$ctx,#24]
- cmp $len,#128
- b.hs .Lblocks_neon
- cbz $is_base2_26,poly1305_blocks
-
-.Lblocks_neon:
- stp x29,x30,[sp,#-80]!
- add x29,sp,#0
-
- ands $len,$len,#-16
- b.eq .Lno_data_neon
-
- cbz $is_base2_26,.Lbase2_64_neon
-
- ldp w10,w11,[$ctx] // load hash value base 2^26
- ldp w12,w13,[$ctx,#8]
- ldr w14,[$ctx,#16]
-
- tst $len,#31
- b.eq .Leven_neon
-
- ldp $r0,$r1,[$ctx,#32] // load key value
-
- add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
- lsr $h1,x12,#12
- adds $h0,$h0,x12,lsl#52
- add $h1,$h1,x13,lsl#14
- adc $h1,$h1,xzr
- lsr $h2,x14,#24
- adds $h1,$h1,x14,lsl#40
- adc $d2,$h2,xzr // can be partially reduced...
-
- ldp $d0,$d1,[$inp],#16 // load input
- sub $len,$len,#16
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
-
- and $t0,$d2,#-4 // ... so reduce
- and $h2,$d2,#3
- add $t0,$t0,$d2,lsr#2
- adds $h0,$h0,$t0
- adcs $h1,$h1,xzr
- adc $h2,$h2,xzr
-
-#ifdef __AARCH64EB__
- rev $d0,$d0
- rev $d1,$d1
-#endif
- adds $h0,$h0,$d0 // accumulate input
- adcs $h1,$h1,$d1
- adc $h2,$h2,$padbit
-
- bl __poly1305_mult
- ldr x30,[sp,#8]
-
- cbz $padbit,.Lstore_base2_64_neon
-
- and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x11,$h0,#26,#26
- extr x12,$h1,$h0,#52
- and x12,x12,#0x03ffffff
- ubfx x13,$h1,#14,#26
- extr x14,$h2,$h1,#40
-
- cbnz $len,.Leven_neon
-
- stp w10,w11,[$ctx] // store hash value base 2^26
- stp w12,w13,[$ctx,#8]
- str w14,[$ctx,#16]
- b .Lno_data_neon
-
-.align 4
-.Lstore_base2_64_neon:
- stp $h0,$h1,[$ctx] // store hash value base 2^64
- stp $h2,xzr,[$ctx,#16] // note that is_base2_26 is zeroed
- b .Lno_data_neon
-
-.align 4
-.Lbase2_64_neon:
- ldp $r0,$r1,[$ctx,#32] // load key value
-
- ldp $h0,$h1,[$ctx] // load hash value base 2^64
- ldr $h2,[$ctx,#16]
-
- tst $len,#31
- b.eq .Linit_neon
-
- ldp $d0,$d1,[$inp],#16 // load input
- sub $len,$len,#16
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
-#ifdef __AARCH64EB__
- rev $d0,$d0
- rev $d1,$d1
-#endif
- adds $h0,$h0,$d0 // accumulate input
- adcs $h1,$h1,$d1
- adc $h2,$h2,$padbit
-
- bl __poly1305_mult
-
-.Linit_neon:
- and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x11,$h0,#26,#26
- extr x12,$h1,$h0,#52
- and x12,x12,#0x03ffffff
- ubfx x13,$h1,#14,#26
- extr x14,$h2,$h1,#40
-
- stp d8,d9,[sp,#16] // meet ABI requirements
- stp d10,d11,[sp,#32]
- stp d12,d13,[sp,#48]
- stp d14,d15,[sp,#64]
-
- fmov ${H0},x10
- fmov ${H1},x11
- fmov ${H2},x12
- fmov ${H3},x13
- fmov ${H4},x14
-
- ////////////////////////////////// initialize r^n table
- mov $h0,$r0 // r^1
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
- mov $h1,$r1
- mov $h2,xzr
- add $ctx,$ctx,#48+12
- bl __poly1305_splat
-
- bl __poly1305_mult // r^2
- sub $ctx,$ctx,#4
- bl __poly1305_splat
-
- bl __poly1305_mult // r^3
- sub $ctx,$ctx,#4
- bl __poly1305_splat
-
- bl __poly1305_mult // r^4
- sub $ctx,$ctx,#4
- bl __poly1305_splat
- ldr x30,[sp,#8]
-
- add $in2,$inp,#32
- adr $zeros,.Lzeros
- subs $len,$len,#64
- csel $in2,$zeros,$in2,lo
-
- mov x4,#1
- str x4,[$ctx,#-24] // set is_base2_26
- sub $ctx,$ctx,#48 // restore original $ctx
- b .Ldo_neon
-
-.align 4
-.Leven_neon:
- add $in2,$inp,#32
- adr $zeros,.Lzeros
- subs $len,$len,#64
- csel $in2,$zeros,$in2,lo
-
- stp d8,d9,[sp,#16] // meet ABI requirements
- stp d10,d11,[sp,#32]
- stp d12,d13,[sp,#48]
- stp d14,d15,[sp,#64]
-
- fmov ${H0},x10
- fmov ${H1},x11
- fmov ${H2},x12
- fmov ${H3},x13
- fmov ${H4},x14
-
-.Ldo_neon:
- ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
- ldp x9,x13,[$in2],#48
-
- lsl $padbit,$padbit,#24
- add x15,$ctx,#48
-
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- and x5,x9,#0x03ffffff
- ubfx x6,x8,#26,#26
- ubfx x7,x9,#26,#26
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- extr x8,x12,x8,#52
- extr x9,x13,x9,#52
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- fmov $IN23_0,x4
- and x8,x8,#0x03ffffff
- and x9,x9,#0x03ffffff
- ubfx x10,x12,#14,#26
- ubfx x11,x13,#14,#26
- add x12,$padbit,x12,lsr#40
- add x13,$padbit,x13,lsr#40
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- fmov $IN23_1,x6
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- fmov $IN23_2,x8
- fmov $IN23_3,x10
- fmov $IN23_4,x12
-
- ldp x8,x12,[$inp],#16 // inp[0:1]
- ldp x9,x13,[$inp],#48
-
- ld1 {$R0,$R1,$S1,$R2},[x15],#64
- ld1 {$S2,$R3,$S3,$R4},[x15],#64
- ld1 {$S4},[x15]
-
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- and x5,x9,#0x03ffffff
- ubfx x6,x8,#26,#26
- ubfx x7,x9,#26,#26
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- extr x8,x12,x8,#52
- extr x9,x13,x9,#52
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- fmov $IN01_0,x4
- and x8,x8,#0x03ffffff
- and x9,x9,#0x03ffffff
- ubfx x10,x12,#14,#26
- ubfx x11,x13,#14,#26
- add x12,$padbit,x12,lsr#40
- add x13,$padbit,x13,lsr#40
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- fmov $IN01_1,x6
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- movi $MASK.2d,#-1
- fmov $IN01_2,x8
- fmov $IN01_3,x10
- fmov $IN01_4,x12
- ushr $MASK.2d,$MASK.2d,#38
-
- b.ls .Lskip_loop
-
-.align 4
-.Loop_neon:
- ////////////////////////////////////////////////////////////////
- // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- // \___________________/
- // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- // \___________________/ \____________________/
- //
- // Note that we start with inp[2:3]*r^2. This is because it
- // doesn't depend on reduction in previous iteration.
- ////////////////////////////////////////////////////////////////
- // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
- // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
- // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
- // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
- // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
-
- subs $len,$len,#64
- umull $ACC4,$IN23_0,${R4}[2]
- csel $in2,$zeros,$in2,lo
- umull $ACC3,$IN23_0,${R3}[2]
- umull $ACC2,$IN23_0,${R2}[2]
- ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
- umull $ACC1,$IN23_0,${R1}[2]
- ldp x9,x13,[$in2],#48
- umull $ACC0,$IN23_0,${R0}[2]
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
-
- umlal $ACC4,$IN23_1,${R3}[2]
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- umlal $ACC3,$IN23_1,${R2}[2]
- and x5,x9,#0x03ffffff
- umlal $ACC2,$IN23_1,${R1}[2]
- ubfx x6,x8,#26,#26
- umlal $ACC1,$IN23_1,${R0}[2]
- ubfx x7,x9,#26,#26
- umlal $ACC0,$IN23_1,${S4}[2]
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
-
- umlal $ACC4,$IN23_2,${R2}[2]
- extr x8,x12,x8,#52
- umlal $ACC3,$IN23_2,${R1}[2]
- extr x9,x13,x9,#52
- umlal $ACC2,$IN23_2,${R0}[2]
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- umlal $ACC1,$IN23_2,${S4}[2]
- fmov $IN23_0,x4
- umlal $ACC0,$IN23_2,${S3}[2]
- and x8,x8,#0x03ffffff
-
- umlal $ACC4,$IN23_3,${R1}[2]
- and x9,x9,#0x03ffffff
- umlal $ACC3,$IN23_3,${R0}[2]
- ubfx x10,x12,#14,#26
- umlal $ACC2,$IN23_3,${S4}[2]
- ubfx x11,x13,#14,#26
- umlal $ACC1,$IN23_3,${S3}[2]
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- umlal $ACC0,$IN23_3,${S2}[2]
- fmov $IN23_1,x6
-
- add $IN01_2,$IN01_2,$H2
- add x12,$padbit,x12,lsr#40
- umlal $ACC4,$IN23_4,${R0}[2]
- add x13,$padbit,x13,lsr#40
- umlal $ACC3,$IN23_4,${S4}[2]
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- umlal $ACC2,$IN23_4,${S3}[2]
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- umlal $ACC1,$IN23_4,${S2}[2]
- fmov $IN23_2,x8
- umlal $ACC0,$IN23_4,${S1}[2]
- fmov $IN23_3,x10
-
- ////////////////////////////////////////////////////////////////
- // (hash+inp[0:1])*r^4 and accumulate
-
- add $IN01_0,$IN01_0,$H0
- fmov $IN23_4,x12
- umlal $ACC3,$IN01_2,${R1}[0]
- ldp x8,x12,[$inp],#16 // inp[0:1]
- umlal $ACC0,$IN01_2,${S3}[0]
- ldp x9,x13,[$inp],#48
- umlal $ACC4,$IN01_2,${R2}[0]
- umlal $ACC1,$IN01_2,${S4}[0]
- umlal $ACC2,$IN01_2,${R0}[0]
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
-
- add $IN01_1,$IN01_1,$H1
- umlal $ACC3,$IN01_0,${R3}[0]
- umlal $ACC4,$IN01_0,${R4}[0]
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- umlal $ACC2,$IN01_0,${R2}[0]
- and x5,x9,#0x03ffffff
- umlal $ACC0,$IN01_0,${R0}[0]
- ubfx x6,x8,#26,#26
- umlal $ACC1,$IN01_0,${R1}[0]
- ubfx x7,x9,#26,#26
-
- add $IN01_3,$IN01_3,$H3
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- umlal $ACC3,$IN01_1,${R2}[0]
- extr x8,x12,x8,#52
- umlal $ACC4,$IN01_1,${R3}[0]
- extr x9,x13,x9,#52
- umlal $ACC0,$IN01_1,${S4}[0]
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- umlal $ACC2,$IN01_1,${R1}[0]
- fmov $IN01_0,x4
- umlal $ACC1,$IN01_1,${R0}[0]
- and x8,x8,#0x03ffffff
-
- add $IN01_4,$IN01_4,$H4
- and x9,x9,#0x03ffffff
- umlal $ACC3,$IN01_3,${R0}[0]
- ubfx x10,x12,#14,#26
- umlal $ACC0,$IN01_3,${S2}[0]
- ubfx x11,x13,#14,#26
- umlal $ACC4,$IN01_3,${R1}[0]
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- umlal $ACC1,$IN01_3,${S3}[0]
- fmov $IN01_1,x6
- umlal $ACC2,$IN01_3,${S4}[0]
- add x12,$padbit,x12,lsr#40
-
- umlal $ACC3,$IN01_4,${S4}[0]
- add x13,$padbit,x13,lsr#40
- umlal $ACC0,$IN01_4,${S1}[0]
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- umlal $ACC4,$IN01_4,${R0}[0]
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- umlal $ACC1,$IN01_4,${S2}[0]
- fmov $IN01_2,x8
- umlal $ACC2,$IN01_4,${S3}[0]
- fmov $IN01_3,x10
- fmov $IN01_4,x12
-
- /////////////////////////////////////////////////////////////////
- // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- // and P. Schwabe
- //
- // [see discussion in poly1305-armv4 module]
-
- ushr $T0.2d,$ACC3,#26
- xtn $H3,$ACC3
- ushr $T1.2d,$ACC0,#26
- and $ACC0,$ACC0,$MASK.2d
- add $ACC4,$ACC4,$T0.2d // h3 -> h4
- bic $H3,#0xfc,lsl#24 // &=0x03ffffff
- add $ACC1,$ACC1,$T1.2d // h0 -> h1
-
- ushr $T0.2d,$ACC4,#26
- xtn $H4,$ACC4
- ushr $T1.2d,$ACC1,#26
- xtn $H1,$ACC1
- bic $H4,#0xfc,lsl#24
- add $ACC2,$ACC2,$T1.2d // h1 -> h2
-
- add $ACC0,$ACC0,$T0.2d
- shl $T0.2d,$T0.2d,#2
- shrn $T1.2s,$ACC2,#26
- xtn $H2,$ACC2
- add $ACC0,$ACC0,$T0.2d // h4 -> h0
- bic $H1,#0xfc,lsl#24
- add $H3,$H3,$T1.2s // h2 -> h3
- bic $H2,#0xfc,lsl#24
-
- shrn $T0.2s,$ACC0,#26
- xtn $H0,$ACC0
- ushr $T1.2s,$H3,#26
- bic $H3,#0xfc,lsl#24
- bic $H0,#0xfc,lsl#24
- add $H1,$H1,$T0.2s // h0 -> h1
- add $H4,$H4,$T1.2s // h3 -> h4
-
- b.hi .Loop_neon
-
-.Lskip_loop:
- dup $IN23_2,${IN23_2}[0]
- add $IN01_2,$IN01_2,$H2
-
- ////////////////////////////////////////////////////////////////
- // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- adds $len,$len,#32
- b.ne .Long_tail
-
- dup $IN23_2,${IN01_2}[0]
- add $IN23_0,$IN01_0,$H0
- add $IN23_3,$IN01_3,$H3
- add $IN23_1,$IN01_1,$H1
- add $IN23_4,$IN01_4,$H4
-
-.Long_tail:
- dup $IN23_0,${IN23_0}[0]
- umull2 $ACC0,$IN23_2,${S3}
- umull2 $ACC3,$IN23_2,${R1}
- umull2 $ACC4,$IN23_2,${R2}
- umull2 $ACC2,$IN23_2,${R0}
- umull2 $ACC1,$IN23_2,${S4}
-
- dup $IN23_1,${IN23_1}[0]
- umlal2 $ACC0,$IN23_0,${R0}
- umlal2 $ACC2,$IN23_0,${R2}
- umlal2 $ACC3,$IN23_0,${R3}
- umlal2 $ACC4,$IN23_0,${R4}
- umlal2 $ACC1,$IN23_0,${R1}
-
- dup $IN23_3,${IN23_3}[0]
- umlal2 $ACC0,$IN23_1,${S4}
- umlal2 $ACC3,$IN23_1,${R2}
- umlal2 $ACC2,$IN23_1,${R1}
- umlal2 $ACC4,$IN23_1,${R3}
- umlal2 $ACC1,$IN23_1,${R0}
-
- dup $IN23_4,${IN23_4}[0]
- umlal2 $ACC3,$IN23_3,${R0}
- umlal2 $ACC4,$IN23_3,${R1}
- umlal2 $ACC0,$IN23_3,${S2}
- umlal2 $ACC1,$IN23_3,${S3}
- umlal2 $ACC2,$IN23_3,${S4}
-
- umlal2 $ACC3,$IN23_4,${S4}
- umlal2 $ACC0,$IN23_4,${S1}
- umlal2 $ACC4,$IN23_4,${R0}
- umlal2 $ACC1,$IN23_4,${S2}
- umlal2 $ACC2,$IN23_4,${S3}
-
- b.eq .Lshort_tail
-
- ////////////////////////////////////////////////////////////////
- // (hash+inp[0:1])*r^4:r^3 and accumulate
-
- add $IN01_0,$IN01_0,$H0
- umlal $ACC3,$IN01_2,${R1}
- umlal $ACC0,$IN01_2,${S3}
- umlal $ACC4,$IN01_2,${R2}
- umlal $ACC1,$IN01_2,${S4}
- umlal $ACC2,$IN01_2,${R0}
-
- add $IN01_1,$IN01_1,$H1
- umlal $ACC3,$IN01_0,${R3}
- umlal $ACC0,$IN01_0,${R0}
- umlal $ACC4,$IN01_0,${R4}
- umlal $ACC1,$IN01_0,${R1}
- umlal $ACC2,$IN01_0,${R2}
-
- add $IN01_3,$IN01_3,$H3
- umlal $ACC3,$IN01_1,${R2}
- umlal $ACC0,$IN01_1,${S4}
- umlal $ACC4,$IN01_1,${R3}
- umlal $ACC1,$IN01_1,${R0}
- umlal $ACC2,$IN01_1,${R1}
-
- add $IN01_4,$IN01_4,$H4
- umlal $ACC3,$IN01_3,${R0}
- umlal $ACC0,$IN01_3,${S2}
- umlal $ACC4,$IN01_3,${R1}
- umlal $ACC1,$IN01_3,${S3}
- umlal $ACC2,$IN01_3,${S4}
-
- umlal $ACC3,$IN01_4,${S4}
- umlal $ACC0,$IN01_4,${S1}
- umlal $ACC4,$IN01_4,${R0}
- umlal $ACC1,$IN01_4,${S2}
- umlal $ACC2,$IN01_4,${S3}
-
-.Lshort_tail:
- ////////////////////////////////////////////////////////////////
- // horizontal add
-
- addp $ACC3,$ACC3,$ACC3
- ldp d8,d9,[sp,#16] // meet ABI requirements
- addp $ACC0,$ACC0,$ACC0
- ldp d10,d11,[sp,#32]
- addp $ACC4,$ACC4,$ACC4
- ldp d12,d13,[sp,#48]
- addp $ACC1,$ACC1,$ACC1
- ldp d14,d15,[sp,#64]
- addp $ACC2,$ACC2,$ACC2
-
- ////////////////////////////////////////////////////////////////
- // lazy reduction, but without narrowing
-
- ushr $T0.2d,$ACC3,#26
- and $ACC3,$ACC3,$MASK.2d
- ushr $T1.2d,$ACC0,#26
- and $ACC0,$ACC0,$MASK.2d
-
- add $ACC4,$ACC4,$T0.2d // h3 -> h4
- add $ACC1,$ACC1,$T1.2d // h0 -> h1
-
- ushr $T0.2d,$ACC4,#26
- and $ACC4,$ACC4,$MASK.2d
- ushr $T1.2d,$ACC1,#26
- and $ACC1,$ACC1,$MASK.2d
- add $ACC2,$ACC2,$T1.2d // h1 -> h2
-
- add $ACC0,$ACC0,$T0.2d
- shl $T0.2d,$T0.2d,#2
- ushr $T1.2d,$ACC2,#26
- and $ACC2,$ACC2,$MASK.2d
- add $ACC0,$ACC0,$T0.2d // h4 -> h0
- add $ACC3,$ACC3,$T1.2d // h2 -> h3
-
- ushr $T0.2d,$ACC0,#26
- and $ACC0,$ACC0,$MASK.2d
- ushr $T1.2d,$ACC3,#26
- and $ACC3,$ACC3,$MASK.2d
- add $ACC1,$ACC1,$T0.2d // h0 -> h1
- add $ACC4,$ACC4,$T1.2d // h3 -> h4
-
- ////////////////////////////////////////////////////////////////
- // write the result, can be partially reduced
-
- st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
- st1 {$ACC4}[0],[$ctx]
-
-.Lno_data_neon:
- ldr x29,[sp],#80
- ret
-.size poly1305_blocks_neon,.-poly1305_blocks_neon
-
-.type poly1305_emit_neon,%function
-.align 5
-poly1305_emit_neon:
- ldr $is_base2_26,[$ctx,#24]
- cbz $is_base2_26,poly1305_emit
-
- ldp w10,w11,[$ctx] // load hash value base 2^26
- ldp w12,w13,[$ctx,#8]
- ldr w14,[$ctx,#16]
-
- add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
- lsr $h1,x12,#12
- adds $h0,$h0,x12,lsl#52
- add $h1,$h1,x13,lsl#14
- adc $h1,$h1,xzr
- lsr $h2,x14,#24
- adds $h1,$h1,x14,lsl#40
- adc $h2,$h2,xzr // can be partially reduced...
-
- ldp $t0,$t1,[$nonce] // load nonce
-
- and $d0,$h2,#-4 // ... so reduce
- add $d0,$d0,$h2,lsr#2
- and $h2,$h2,#3
- adds $h0,$h0,$d0
- adcs $h1,$h1,xzr
- adc $h2,$h2,xzr
-
- adds $d0,$h0,#5 // compare to modulus
- adcs $d1,$h1,xzr
- adc $d2,$h2,xzr
-
- tst $d2,#-4 // see if it's carried/borrowed
-
- csel $h0,$h0,$d0,eq
- csel $h1,$h1,$d1,eq
-
-#ifdef __AARCH64EB__
- ror $t0,$t0,#32 // flip nonce words
- ror $t1,$t1,#32
-#endif
- adds $h0,$h0,$t0 // accumulate nonce
- adc $h1,$h1,$t1
-#ifdef __AARCH64EB__
- rev $h0,$h0 // flip output bytes
- rev $h1,$h1
-#endif
- stp $h0,$h1,[$mac] // write result
-
- ret
-.size poly1305_emit_neon,.-poly1305_emit_neon
-#endif
-
-.align 5
-.Lzeros:
-.long 0,0,0,0,0,0,0,0
-#ifndef __KERNEL__
-.LOPENSSL_armcap_P:
-#ifdef __ILP32__
-.long OPENSSL_armcap_P-.
-#else
-.quad OPENSSL_armcap_P-.
-#endif
-#endif
-.align 2
-___
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/\/\// and !/^$/);
- print;
-}
-close SELF;
-
-foreach (split("\n",$code)) {
- s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
- s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
- (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
- (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
- (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
- (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
- (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
-
- s/\.[124]([sd])\[/.$1\[/;
-
- print $_,"\n";
-}
-close STDOUT;
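For readers following the AArch64 code above: poly1305_blocks keeps the accumulator in two full 64-bit limbs plus a small top limb and folds the wrap-around of 2^130 == 5 (mod p) back in through s1 = r1 + (r1 >> 2). Below is a minimal C sketch of the same per-block step, assuming a compiler with unsigned __int128; the helper name and array layout are illustrative only and are not part of the removed driver.

#include <stdint.h>

/*
 * Sketch of one scalar Poly1305 block in base 2^64, mirroring the math
 * in poly1305_blocks/__poly1305_mult above; h[2] never grows beyond a
 * few bits, so its products fit comfortably in the temporaries.
 */
static void poly1305_block_base2_64(uint64_t h[3], const uint64_t m[2],
				    const uint64_t r[2], uint64_t padbit)
{
	uint64_t r0 = r[0], r1 = r[1];
	uint64_t s1 = r1 + (r1 >> 2);		/* s1 = r1 + (r1 >> 2) */
	unsigned __int128 d0, d1;
	uint64_t d2, t0;

	/* h += m, with the pad bit landing in the top limb */
	d0 = (unsigned __int128)h[0] + m[0];
	d1 = (unsigned __int128)h[1] + m[1] + (uint64_t)(d0 >> 64);
	h[0] = (uint64_t)d0;
	h[1] = (uint64_t)d1;
	h[2] += padbit + (uint64_t)(d1 >> 64);

	/* h *= r, with the terms above 2^128 pre-folded through s1 */
	d0 = (unsigned __int128)h[0] * r0 + (unsigned __int128)h[1] * s1;
	d1 = (unsigned __int128)h[0] * r1 + (unsigned __int128)h[1] * r0 +
	     (unsigned __int128)h[2] * s1 + (uint64_t)(d0 >> 64);
	d2 = (uint64_t)(d1 >> 64) + h[2] * r0;

	/* partial reduction: 2^130 == 5 (mod p) */
	t0 = (d2 & ~(uint64_t)3) + (d2 >> 2);
	d0 = (unsigned __int128)(uint64_t)d0 + t0;
	d1 = (unsigned __int128)(uint64_t)d1 + (uint64_t)(d0 >> 64);
	h[0] = (uint64_t)d0;
	h[1] = (uint64_t)d1;
	h[2] = (d2 & 3) + (uint64_t)(d1 >> 64);
}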
diff --git a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-donna32.c b/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-donna32.c
deleted file mode 100644
index 527ccc3b59cc..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-donna32.c
+++ /dev/null
@@ -1,205 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * This is based in part on Andrew Moon's poly1305-donna, which is in the
- * public domain.
- */
-
-struct poly1305_internal {
- u32 h[5];
- u32 r[5];
- u32 s[4];
-};
-
-static void poly1305_init_generic(void *ctx, const u8 key[16])
-{
- struct poly1305_internal *st = (struct poly1305_internal *)ctx;
-
- /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
- st->r[0] = (get_unaligned_le32(&key[0])) & 0x3ffffff;
- st->r[1] = (get_unaligned_le32(&key[3]) >> 2) & 0x3ffff03;
- st->r[2] = (get_unaligned_le32(&key[6]) >> 4) & 0x3ffc0ff;
- st->r[3] = (get_unaligned_le32(&key[9]) >> 6) & 0x3f03fff;
- st->r[4] = (get_unaligned_le32(&key[12]) >> 8) & 0x00fffff;
-
- /* s = 5*r */
- st->s[0] = st->r[1] * 5;
- st->s[1] = st->r[2] * 5;
- st->s[2] = st->r[3] * 5;
- st->s[3] = st->r[4] * 5;
-
- /* h = 0 */
- st->h[0] = 0;
- st->h[1] = 0;
- st->h[2] = 0;
- st->h[3] = 0;
- st->h[4] = 0;
-}
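The per-limb masks above are just the standard Poly1305 key clamp (r &= 0xffffffc0ffffffc0ffffffc0fffffff, i.e. clear the top four bits of key bytes 3, 7, 11 and 15 and the bottom two bits of bytes 4, 8 and 12) expressed in the radix-2^26 view of r:

	r0: bits   0..25  -> no clamped bits fall here,      mask 0x3ffffff
	r1: bits  26..51  -> limb bits 2..7 are clamped,     mask 0x3ffff03
	r2: bits  52..77  -> limb bits 8..13 are clamped,    mask 0x3ffc0ff
	r3: bits  78..103 -> limb bits 14..19 are clamped,   mask 0x3f03fff
	r4: bits 104..127 -> limb bits 20..25 clamped/absent, mask 0x00fffff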
-
-static void poly1305_blocks_generic(void *ctx, const u8 *input, size_t len,
- const u32 padbit)
-{
- struct poly1305_internal *st = (struct poly1305_internal *)ctx;
- const u32 hibit = padbit << 24;
- u32 r0, r1, r2, r3, r4;
- u32 s1, s2, s3, s4;
- u32 h0, h1, h2, h3, h4;
- u64 d0, d1, d2, d3, d4;
- u32 c;
-
- r0 = st->r[0];
- r1 = st->r[1];
- r2 = st->r[2];
- r3 = st->r[3];
- r4 = st->r[4];
-
- s1 = st->s[0];
- s2 = st->s[1];
- s3 = st->s[2];
- s4 = st->s[3];
-
- h0 = st->h[0];
- h1 = st->h[1];
- h2 = st->h[2];
- h3 = st->h[3];
- h4 = st->h[4];
-
- while (len >= POLY1305_BLOCK_SIZE) {
- /* h += m[i] */
- h0 += (get_unaligned_le32(&input[0])) & 0x3ffffff;
- h1 += (get_unaligned_le32(&input[3]) >> 2) & 0x3ffffff;
- h2 += (get_unaligned_le32(&input[6]) >> 4) & 0x3ffffff;
- h3 += (get_unaligned_le32(&input[9]) >> 6) & 0x3ffffff;
- h4 += (get_unaligned_le32(&input[12]) >> 8) | hibit;
-
- /* h *= r */
- d0 = ((u64)h0 * r0) + ((u64)h1 * s4) +
- ((u64)h2 * s3) + ((u64)h3 * s2) +
- ((u64)h4 * s1);
- d1 = ((u64)h0 * r1) + ((u64)h1 * r0) +
- ((u64)h2 * s4) + ((u64)h3 * s3) +
- ((u64)h4 * s2);
- d2 = ((u64)h0 * r2) + ((u64)h1 * r1) +
- ((u64)h2 * r0) + ((u64)h3 * s4) +
- ((u64)h4 * s3);
- d3 = ((u64)h0 * r3) + ((u64)h1 * r2) +
- ((u64)h2 * r1) + ((u64)h3 * r0) +
- ((u64)h4 * s4);
- d4 = ((u64)h0 * r4) + ((u64)h1 * r3) +
- ((u64)h2 * r2) + ((u64)h3 * r1) +
- ((u64)h4 * r0);
-
- /* (partial) h %= p */
- c = (u32)(d0 >> 26);
- h0 = (u32)d0 & 0x3ffffff;
- d1 += c;
- c = (u32)(d1 >> 26);
- h1 = (u32)d1 & 0x3ffffff;
- d2 += c;
- c = (u32)(d2 >> 26);
- h2 = (u32)d2 & 0x3ffffff;
- d3 += c;
- c = (u32)(d3 >> 26);
- h3 = (u32)d3 & 0x3ffffff;
- d4 += c;
- c = (u32)(d4 >> 26);
- h4 = (u32)d4 & 0x3ffffff;
- h0 += c * 5;
- c = (h0 >> 26);
- h0 = h0 & 0x3ffffff;
- h1 += c;
-
- input += POLY1305_BLOCK_SIZE;
- len -= POLY1305_BLOCK_SIZE;
- }
-
- st->h[0] = h0;
- st->h[1] = h1;
- st->h[2] = h2;
- st->h[3] = h3;
- st->h[4] = h4;
-}
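The five unaligned loads at byte offsets 0, 3, 6, 9 and 12 in the loop above are simply a radix-2^26 view of the 128-bit little-endian block m, with the pad bit supplying bit 128:

	h0 += m          & 0x3ffffff		(bits   0..25)
	h1 += (m >> 26)  & 0x3ffffff		(bits  26..51  = (le32(&input[3])  >> 2) & 0x3ffffff)
	h2 += (m >> 52)  & 0x3ffffff		(bits  52..77  = (le32(&input[6])  >> 4) & 0x3ffffff)
	h3 += (m >> 78)  & 0x3ffffff		(bits  78..103 = (le32(&input[9])  >> 6) & 0x3ffffff)
	h4 += (m >> 104) | padbit << 24		(bits 104..128 = (le32(&input[12]) >> 8) | hibit)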
-
-static void poly1305_emit_generic(void *ctx, u8 mac[16], const u32 nonce[4])
-{
- struct poly1305_internal *st = (struct poly1305_internal *)ctx;
- u32 h0, h1, h2, h3, h4, c;
- u32 g0, g1, g2, g3, g4;
- u64 f;
- u32 mask;
-
- /* fully carry h */
- h0 = st->h[0];
- h1 = st->h[1];
- h2 = st->h[2];
- h3 = st->h[3];
- h4 = st->h[4];
-
- c = h1 >> 26;
- h1 = h1 & 0x3ffffff;
- h2 += c;
- c = h2 >> 26;
- h2 = h2 & 0x3ffffff;
- h3 += c;
- c = h3 >> 26;
- h3 = h3 & 0x3ffffff;
- h4 += c;
- c = h4 >> 26;
- h4 = h4 & 0x3ffffff;
- h0 += c * 5;
- c = h0 >> 26;
- h0 = h0 & 0x3ffffff;
- h1 += c;
-
- /* compute h + -p */
- g0 = h0 + 5;
- c = g0 >> 26;
- g0 &= 0x3ffffff;
- g1 = h1 + c;
- c = g1 >> 26;
- g1 &= 0x3ffffff;
- g2 = h2 + c;
- c = g2 >> 26;
- g2 &= 0x3ffffff;
- g3 = h3 + c;
- c = g3 >> 26;
- g3 &= 0x3ffffff;
- g4 = h4 + c - (1UL << 26);
-
- /* select h if h < p, or h + -p if h >= p */
- mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
- g0 &= mask;
- g1 &= mask;
- g2 &= mask;
- g3 &= mask;
- g4 &= mask;
- mask = ~mask;
-
- h0 = (h0 & mask) | g0;
- h1 = (h1 & mask) | g1;
- h2 = (h2 & mask) | g2;
- h3 = (h3 & mask) | g3;
- h4 = (h4 & mask) | g4;
-
- /* h = h % (2^128) */
- h0 = ((h0) | (h1 << 26)) & 0xffffffff;
- h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff;
- h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
- h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;
-
- /* mac = (h + nonce) % (2^128) */
- f = (u64)h0 + nonce[0];
- h0 = (u32)f;
- f = (u64)h1 + nonce[1] + (f >> 32);
- h1 = (u32)f;
- f = (u64)h2 + nonce[2] + (f >> 32);
- h2 = (u32)f;
- f = (u64)h3 + nonce[3] + (f >> 32);
- h3 = (u32)f;
-
- put_unaligned_le32(h0, &mac[0]);
- put_unaligned_le32(h1, &mac[4]);
- put_unaligned_le32(h2, &mac[8]);
- put_unaligned_le32(h3, &mac[12]);
-}
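The generic routines above only consume whole 16-byte blocks; buffering of the tail and the trailing 0x01 padding byte live in the shared zinc poly1305.c front end, which is not part of this hunk. The hypothetical one-shot wrapper below is shown only to make the calling convention concrete; it assumes the module's support headers for u8/u32, POLY1305_BLOCK_SIZE, memcpy/memset and get_unaligned_le32, and its name is made up for illustration.

static void poly1305_mac_generic(u8 mac[16], const u8 *msg, size_t len,
				 const u8 key[32])
{
	struct poly1305_internal st;
	u8 block[POLY1305_BLOCK_SIZE];
	u32 nonce[4];
	size_t full = len & ~(size_t)(POLY1305_BLOCK_SIZE - 1);

	poly1305_init_generic(&st, key);	/* r = clamp(key[0..15]) */
	if (full)
		poly1305_blocks_generic(&st, msg, full, 1);
	if (len != full) {
		memset(block, 0, sizeof(block));
		memcpy(block, msg + full, len - full);
		block[len - full] = 1;		/* 10...0 padding, pad bit cleared */
		poly1305_blocks_generic(&st, block, sizeof(block), 0);
	}
	nonce[0] = get_unaligned_le32(key + 16);	/* s = key[16..31] */
	nonce[1] = get_unaligned_le32(key + 20);
	nonce[2] = get_unaligned_le32(key + 24);
	nonce[3] = get_unaligned_le32(key + 28);
	poly1305_emit_generic(&st, mac, nonce);
}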
diff --git a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-donna64.c b/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-donna64.c
deleted file mode 100644
index 131f1dda1b1d..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-donna64.c
+++ /dev/null
@@ -1,182 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * This is based in part on Andrew Moon's poly1305-donna, which is in the
- * public domain.
- */
-
-typedef __uint128_t u128;
-
-struct poly1305_internal {
- u64 r[3];
- u64 h[3];
- u64 s[2];
-};
-
-static void poly1305_init_generic(void *ctx, const u8 key[16])
-{
- struct poly1305_internal *st = (struct poly1305_internal *)ctx;
- u64 t0, t1;
-
- /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
- t0 = get_unaligned_le64(&key[0]);
- t1 = get_unaligned_le64(&key[8]);
-
- st->r[0] = t0 & 0xffc0fffffffULL;
- st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffffULL;
- st->r[2] = ((t1 >> 24)) & 0x00ffffffc0fULL;
-
- /* s = 20*r */
- st->s[0] = st->r[1] * 20;
- st->s[1] = st->r[2] * 20;
-
- /* h = 0 */
- st->h[0] = 0;
- st->h[1] = 0;
- st->h[2] = 0;
-}
-
-static void poly1305_blocks_generic(void *ctx, const u8 *input, size_t len,
- const u32 padbit)
-{
- struct poly1305_internal *st = (struct poly1305_internal *)ctx;
- const u64 hibit = ((u64)padbit) << 40;
- u64 r0, r1, r2;
- u64 s1, s2;
- u64 h0, h1, h2;
- u64 c;
- u128 d0, d1, d2, d;
-
- r0 = st->r[0];
- r1 = st->r[1];
- r2 = st->r[2];
-
- h0 = st->h[0];
- h1 = st->h[1];
- h2 = st->h[2];
-
- s1 = st->s[0];
- s2 = st->s[1];
-
- while (len >= POLY1305_BLOCK_SIZE) {
- u64 t0, t1;
-
- /* h += m[i] */
- t0 = get_unaligned_le64(&input[0]);
- t1 = get_unaligned_le64(&input[8]);
-
- h0 += t0 & 0xfffffffffffULL;
- h1 += ((t0 >> 44) | (t1 << 20)) & 0xfffffffffffULL;
- h2 += (((t1 >> 24)) & 0x3ffffffffffULL) | hibit;
-
- /* h *= r */
- d0 = (u128)h0 * r0;
- d = (u128)h1 * s2;
- d0 += d;
- d = (u128)h2 * s1;
- d0 += d;
- d1 = (u128)h0 * r1;
- d = (u128)h1 * r0;
- d1 += d;
- d = (u128)h2 * s2;
- d1 += d;
- d2 = (u128)h0 * r2;
- d = (u128)h1 * r1;
- d2 += d;
- d = (u128)h2 * r0;
- d2 += d;
-
- /* (partial) h %= p */
- c = (u64)(d0 >> 44);
- h0 = (u64)d0 & 0xfffffffffffULL;
- d1 += c;
- c = (u64)(d1 >> 44);
- h1 = (u64)d1 & 0xfffffffffffULL;
- d2 += c;
- c = (u64)(d2 >> 42);
- h2 = (u64)d2 & 0x3ffffffffffULL;
- h0 += c * 5;
- c = h0 >> 44;
- h0 = h0 & 0xfffffffffffULL;
- h1 += c;
-
- input += POLY1305_BLOCK_SIZE;
- len -= POLY1305_BLOCK_SIZE;
- }
-
- st->h[0] = h0;
- st->h[1] = h1;
- st->h[2] = h2;
-}
-
-static void poly1305_emit_generic(void *ctx, u8 mac[16], const u32 nonce[4])
-{
- struct poly1305_internal *st = (struct poly1305_internal *)ctx;
- u64 h0, h1, h2, c;
- u64 g0, g1, g2;
- u64 t0, t1;
-
- /* fully carry h */
- h0 = st->h[0];
- h1 = st->h[1];
- h2 = st->h[2];
-
- c = h1 >> 44;
- h1 &= 0xfffffffffffULL;
- h2 += c;
- c = h2 >> 42;
- h2 &= 0x3ffffffffffULL;
- h0 += c * 5;
- c = h0 >> 44;
- h0 &= 0xfffffffffffULL;
- h1 += c;
- c = h1 >> 44;
- h1 &= 0xfffffffffffULL;
- h2 += c;
- c = h2 >> 42;
- h2 &= 0x3ffffffffffULL;
- h0 += c * 5;
- c = h0 >> 44;
- h0 &= 0xfffffffffffULL;
- h1 += c;
-
- /* compute h + -p */
- g0 = h0 + 5;
- c = g0 >> 44;
- g0 &= 0xfffffffffffULL;
- g1 = h1 + c;
- c = g1 >> 44;
- g1 &= 0xfffffffffffULL;
- g2 = h2 + c - (1ULL << 42);
-
- /* select h if h < p, or h + -p if h >= p */
- c = (g2 >> ((sizeof(u64) * 8) - 1)) - 1;
- g0 &= c;
- g1 &= c;
- g2 &= c;
- c = ~c;
- h0 = (h0 & c) | g0;
- h1 = (h1 & c) | g1;
- h2 = (h2 & c) | g2;
-
- /* h = (h + nonce) */
- t0 = ((u64)nonce[1] << 32) | nonce[0];
- t1 = ((u64)nonce[3] << 32) | nonce[2];
-
- h0 += t0 & 0xfffffffffffULL;
- c = h0 >> 44;
- h0 &= 0xfffffffffffULL;
- h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffffULL) + c;
- c = h1 >> 44;
- h1 &= 0xfffffffffffULL;
- h2 += (((t1 >> 24)) & 0x3ffffffffffULL) + c;
- h2 &= 0x3ffffffffffULL;
-
- /* mac = h % (2^128) */
- h0 = h0 | (h1 << 44);
- h1 = (h1 >> 20) | (h2 << 24);
-
- put_unaligned_le64(h0, &mac[0]);
- put_unaligned_le64(h1, &mac[8]);
-}
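The 64-bit variant above packs r and h into 44/44/42-bit limbs, so any product term that spills past bit 130 does so by at least two extra bits; with 2^130 == 5 (mod 2^130 - 5) the spill therefore folds back with a factor of 20 rather than 5, which is why this file precomputes s = 20*r where the 32-bit code used 5*r:

	2^132 = 2^2  * 2^130 == 4 * 5    = 20			-> h1*r2 and h2*r1 enter d0 as h1*s2 and h2*s1
	2^176 = 2^46 * 2^130 == 5 * 2^46 = 20 * 2^44		-> h2*r2 enters d1 as h2*s2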
diff --git a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-mips-glue.c b/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-mips-glue.c
deleted file mode 100644
index a540e9c4eee8..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-mips-glue.c
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-asmlinkage void poly1305_init_mips(void *ctx, const u8 key[16]);
-asmlinkage void poly1305_blocks_mips(void *ctx, const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_emit_mips(void *ctx, u8 mac[16], const u32 nonce[4]);
-
-static bool *const poly1305_nobs[] __initconst = { };
-static void __init poly1305_fpu_init(void)
-{
-}
-
-static inline bool poly1305_init_arch(void *ctx,
- const u8 key[POLY1305_KEY_SIZE])
-{
- poly1305_init_mips(ctx, key);
- return true;
-}
-
-static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
- size_t len, const u32 padbit,
- simd_context_t *simd_context)
-{
- poly1305_blocks_mips(ctx, inp, len, padbit);
- return true;
-}
-
-static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
- const u32 nonce[4],
- simd_context_t *simd_context)
-{
- poly1305_emit_mips(ctx, mac, nonce);
- return true;
-}
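Each per-architecture glue file implements the same inline hooks (poly1305_{init,blocks,emit}_arch, poly1305_fpu_init and the poly1305_nobs table); the shared zinc poly1305.c front end, not shown in this hunk, tries the _arch hook first and falls back to the generic donna code when it returns false. A rough sketch of that dispatch shape, with a made-up wrapper name:

/* Illustrative only: the real front end is crypto/zinc/poly1305/poly1305.c.
 * Architectures with glue (like the MIPS file above) return true from the
 * _arch hooks; platforms without accelerated code provide stubs that return
 * false, so the generic donna implementation picks up the work. */
static void poly1305_blocks_dispatch(void *ctx, const u8 *inp, size_t len,
				     const u32 padbit,
				     simd_context_t *simd_context)
{
	if (!poly1305_blocks_arch(ctx, inp, len, padbit, simd_context))
		poly1305_blocks_generic(ctx, inp, len, padbit);
}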
diff --git a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-mips.S b/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-mips.S
deleted file mode 100644
index 4291c156815b..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-mips.S
+++ /dev/null
@@ -1,407 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com> All Rights Reserved.
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-#define MSB 0
-#define LSB 3
-#else
-#define MSB 3
-#define LSB 0
-#endif
-
-#define POLY1305_BLOCK_SIZE 16
-.text
-#define H0 $t0
-#define H1 $t1
-#define H2 $t2
-#define H3 $t3
-#define H4 $t4
-
-#define R0 $t5
-#define R1 $t6
-#define R2 $t7
-#define R3 $t8
-
-#define O0 $s0
-#define O1 $s4
-#define O2 $v1
-#define O3 $t9
-#define O4 $s5
-
-#define S1 $s1
-#define S2 $s2
-#define S3 $s3
-
-#define SC $at
-#define CA $v0
-
-/* Input arguments */
-#define poly $a0
-#define src $a1
-#define srclen $a2
-#define hibit $a3
-
-/* Location in the opaque buffer
- * R[0..3], CA, H[0..4]
- */
-#define PTR_POLY1305_R(n) ( 0 + (n*4)) ## ($a0)
-#define PTR_POLY1305_CA (16 ) ## ($a0)
-#define PTR_POLY1305_H(n) (20 + (n*4)) ## ($a0)
-
-#define POLY1305_BLOCK_SIZE 16
-#define POLY1305_STACK_SIZE 32
-
-.set noat
-.align 4
-.globl poly1305_blocks_mips
-.ent poly1305_blocks_mips
-poly1305_blocks_mips:
- .frame $sp, POLY1305_STACK_SIZE, $ra
- /* srclen &= 0xFFFFFFF0 */
- ins srclen, $zero, 0, 4
-
- addiu $sp, -(POLY1305_STACK_SIZE)
-
- /* check srclen >= 16 bytes */
- beqz srclen, .Lpoly1305_blocks_mips_end
-
- /* Calculate last round based on src address pointer.
- * last round src ptr (srclen) = src + (srclen & 0xFFFFFFF0)
- */
- addu srclen, src
-
- lw R0, PTR_POLY1305_R(0)
- lw R1, PTR_POLY1305_R(1)
- lw R2, PTR_POLY1305_R(2)
- lw R3, PTR_POLY1305_R(3)
-
- /* store the used save registers. */
- sw $s0, 0($sp)
- sw $s1, 4($sp)
- sw $s2, 8($sp)
- sw $s3, 12($sp)
- sw $s4, 16($sp)
- sw $s5, 20($sp)
-
- /* load Hx and Carry */
- lw CA, PTR_POLY1305_CA
- lw H0, PTR_POLY1305_H(0)
- lw H1, PTR_POLY1305_H(1)
- lw H2, PTR_POLY1305_H(2)
- lw H3, PTR_POLY1305_H(3)
- lw H4, PTR_POLY1305_H(4)
-
- /* Sx = Rx + (Rx >> 2) */
- srl S1, R1, 2
- srl S2, R2, 2
- srl S3, R3, 2
- addu S1, R1
- addu S2, R2
- addu S3, R3
-
- addiu SC, $zero, 1
-
-.Lpoly1305_loop:
- lwl O0, 0+MSB(src)
- lwl O1, 4+MSB(src)
- lwl O2, 8+MSB(src)
- lwl O3,12+MSB(src)
- lwr O0, 0+LSB(src)
- lwr O1, 4+LSB(src)
- lwr O2, 8+LSB(src)
- lwr O3,12+LSB(src)
-
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- wsbh O0
- wsbh O1
- wsbh O2
- wsbh O3
- rotr O0, 16
- rotr O1, 16
- rotr O2, 16
- rotr O3, 16
-#endif
-
- /* h0 = (u32)(d0 = (u64)h0 + inp[0] + c 'Carry_previous cycle'); */
- addu H0, CA
- sltu CA, H0, CA
- addu O0, H0
- sltu H0, O0, H0
- addu CA, H0
-
- /* h1 = (u32)(d1 = (u64)h1 + (d0 >> 32) + inp[4]); */
- addu H1, CA
- sltu CA, H1, CA
- addu O1, H1
- sltu H1, O1, H1
- addu CA, H1
-
- /* h2 = (u32)(d2 = (u64)h2 + (d1 >> 32) + inp[8]); */
- addu H2, CA
- sltu CA, H2, CA
- addu O2, H2
- sltu H2, O2, H2
- addu CA, H2
-
- /* h3 = (u32)(d3 = (u64)h3 + (d2 >> 32) + inp[12]); */
- addu H3, CA
- sltu CA, H3, CA
- addu O3, H3
- sltu H3, O3, H3
- addu CA, H3
-
- /* h4 += (u32)(d3 >> 32) + padbit; */
- addu H4, hibit
- addu O4, H4, CA
-
- /* D0 */
- multu O0, R0
- maddu O1, S3
- maddu O2, S2
- maddu O3, S1
- mfhi CA
- mflo H0
-
- /* D1 */
- multu O0, R1
- maddu O1, R0
- maddu O2, S3
- maddu O3, S2
- maddu O4, S1
- maddu CA, SC
- mfhi CA
- mflo H1
-
- /* D2 */
- multu O0, R2
- maddu O1, R1
- maddu O2, R0
- maddu O3, S3
- maddu O4, S2
- maddu CA, SC
- mfhi CA
- mflo H2
-
- /* D4 */
- mul H4, O4, R0
-
- /* D3 */
- multu O0, R3
- maddu O1, R2
- maddu O2, R1
- maddu O3, R0
- maddu O4, S3
- maddu CA, SC
- mfhi CA
- mflo H3
-
- addiu src, POLY1305_BLOCK_SIZE
-
- /* h4 += (u32)(d3 >> 32); */
- addu O4, H4, CA
- /* h4 &= 3 */
- andi H4, O4, 3
- /* c = (h4 >> 2) + (h4 & ~3U); */
- srl CA, O4, 2
- ins O4, $zero, 0, 2
-
- addu CA, O4
-
-	/* loop while another full 16-byte block remains. */
- bne src, srclen, .Lpoly1305_loop
-
- /* restore the used save registers. */
- lw $s0, 0($sp)
- lw $s1, 4($sp)
- lw $s2, 8($sp)
- lw $s3, 12($sp)
- lw $s4, 16($sp)
- lw $s5, 20($sp)
-
- /* store Hx and Carry */
- sw CA, PTR_POLY1305_CA
- sw H0, PTR_POLY1305_H(0)
- sw H1, PTR_POLY1305_H(1)
- sw H2, PTR_POLY1305_H(2)
- sw H3, PTR_POLY1305_H(3)
- sw H4, PTR_POLY1305_H(4)
-
-.Lpoly1305_blocks_mips_end:
- addiu $sp, POLY1305_STACK_SIZE
-
- /* Jump Back */
- jr $ra
-.end poly1305_blocks_mips
-.set at
-
-/* Input arguments CTX=$a0, MAC=$a1, NONCE=$a2 */
-#define MAC $a1
-#define NONCE $a2
-
-#define G0 $t5
-#define G1 $t6
-#define G2 $t7
-#define G3 $t8
-#define G4 $t9
-
-.set noat
-.align 4
-.globl poly1305_emit_mips
-.ent poly1305_emit_mips
-poly1305_emit_mips:
- /* load Hx and Carry */
- lw CA, PTR_POLY1305_CA
- lw H0, PTR_POLY1305_H(0)
- lw H1, PTR_POLY1305_H(1)
- lw H2, PTR_POLY1305_H(2)
- lw H3, PTR_POLY1305_H(3)
- lw H4, PTR_POLY1305_H(4)
-
- /* Add left over carry */
- addu H0, CA
- sltu CA, H0, CA
- addu H1, CA
- sltu CA, H1, CA
- addu H2, CA
- sltu CA, H2, CA
- addu H3, CA
- sltu CA, H3, CA
- addu H4, CA
-
- /* compare to modulus by computing h + -p */
- addiu G0, H0, 5
- sltu CA, G0, H0
- addu G1, H1, CA
- sltu CA, G1, H1
- addu G2, H2, CA
- sltu CA, G2, H2
- addu G3, H3, CA
- sltu CA, G3, H3
- addu G4, H4, CA
-
- srl SC, G4, 2
-
- /* if there was carry into 131st bit, h3:h0 = g3:g0 */
- movn H0, G0, SC
- movn H1, G1, SC
- movn H2, G2, SC
- movn H3, G3, SC
-
- lwl G0, 0+MSB(NONCE)
- lwl G1, 4+MSB(NONCE)
- lwl G2, 8+MSB(NONCE)
- lwl G3,12+MSB(NONCE)
- lwr G0, 0+LSB(NONCE)
- lwr G1, 4+LSB(NONCE)
- lwr G2, 8+LSB(NONCE)
- lwr G3,12+LSB(NONCE)
-
- /* mac = (h + nonce) % (2^128) */
- addu H0, G0
- sltu CA, H0, G0
-
- /* H1 */
- addu H1, CA
- sltu CA, H1, CA
- addu H1, G1
- sltu G1, H1, G1
- addu CA, G1
-
- /* H2 */
- addu H2, CA
- sltu CA, H2, CA
- addu H2, G2
- sltu G2, H2, G2
- addu CA, G2
-
- /* H3 */
- addu H3, CA
- addu H3, G3
-
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- wsbh H0
- wsbh H1
- wsbh H2
- wsbh H3
- rotr H0, 16
- rotr H1, 16
- rotr H2, 16
- rotr H3, 16
-#endif
-
- /* store MAC */
- swl H0, 0+MSB(MAC)
- swl H1, 4+MSB(MAC)
- swl H2, 8+MSB(MAC)
- swl H3,12+MSB(MAC)
- swr H0, 0+LSB(MAC)
- swr H1, 4+LSB(MAC)
- swr H2, 8+LSB(MAC)
- swr H3,12+LSB(MAC)
-
- jr $ra
-.end poly1305_emit_mips
-
-#define PR0 $t0
-#define PR1 $t1
-#define PR2 $t2
-#define PR3 $t3
-#define PT0 $t4
-
-/* Input arguments CTX=$a0, KEY=$a1 */
-
-.align 4
-.globl poly1305_init_mips
-.ent poly1305_init_mips
-poly1305_init_mips:
- lwl PR0, 0+MSB($a1)
- lwl PR1, 4+MSB($a1)
- lwl PR2, 8+MSB($a1)
- lwl PR3,12+MSB($a1)
- lwr PR0, 0+LSB($a1)
- lwr PR1, 4+LSB($a1)
- lwr PR2, 8+LSB($a1)
- lwr PR3,12+LSB($a1)
-
- /* store Hx and Carry */
- sw $zero, PTR_POLY1305_CA
- sw $zero, PTR_POLY1305_H(0)
- sw $zero, PTR_POLY1305_H(1)
- sw $zero, PTR_POLY1305_H(2)
- sw $zero, PTR_POLY1305_H(3)
- sw $zero, PTR_POLY1305_H(4)
-
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- wsbh PR0
- wsbh PR1
- wsbh PR2
- wsbh PR3
- rotr PR0, 16
- rotr PR1, 16
- rotr PR2, 16
- rotr PR3, 16
-#endif
-
- lui PT0, 0x0FFF
- ori PT0, 0xFFFC
-
- /* AND 0x0fffffff; */
- ext PR0, PR0, 0, (32-4)
-
- /* AND 0x0ffffffc; */
- and PR1, PT0
- and PR2, PT0
- and PR3, PT0
-
- /* store Rx */
- sw PR0, PTR_POLY1305_R(0)
- sw PR1, PTR_POLY1305_R(1)
- sw PR2, PTR_POLY1305_R(2)
- sw PR3, PTR_POLY1305_R(3)
-
- /* Jump Back */
- jr $ra
-.end poly1305_init_mips
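MIPS32 has no add-with-carry instruction, so every multi-word accumulation above is an addu followed by an sltu that recovers the carry from the wrapped sum. The C equivalent of that idiom (an illustrative helper, not part of the tree):

#include <stdint.h>

/* Mirror of the "addu X, Y; sltu CA, X, Y" pairs in poly1305_blocks_mips:
 * add two 32-bit words and report whether the result wrapped around. */
static inline uint32_t add32_carry(uint32_t *sum, uint32_t a, uint32_t b)
{
	*sum = a + b;
	return *sum < a;	/* 1 on carry out, 0 otherwise */
}

Each CA update in the block loop is one such pair, with CA acting as the running carry that is finally folded back into h0 via c = (h4 >> 2) + (h4 & ~3U).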
diff --git a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-mips64.pl b/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-mips64.pl
deleted file mode 100755
index d30a03d79177..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-mips64.pl
+++ /dev/null
@@ -1,467 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# Poly1305 hash for MIPS64.
-#
-# May 2016
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone.
-#
-# IALU/gcc
-# R1x000 5.64/+120% (big-endian)
-# Octeon II 3.80/+280% (little-endian)
-
-######################################################################
-# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
-# widely used. Then there is a new contender: NUBI. It appears that if
-# one picks the latter, it is possible to arrange the code in an
-# ABI-neutral manner. Therefore let's stick to the NUBI register layout:
-#
-($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
-($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
-($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
-($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
-#
-# The return value is placed in $a0. Following coding rules facilitate
-# interoperability:
-#
-# - never ever touch $tp, "thread pointer", former $gp [o32 can be
-# excluded from the rule, because it's specified volatile];
-# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
-# old code];
-# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
-#
-# For reference here is register layout for N32/64 MIPS ABIs:
-#
-# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
-# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
-# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
-# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
-# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
-#
-# <appro@openssl.org>
-#
-######################################################################
-
-$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
-
-die "MIPS64 only" unless ($flavour =~ /64|n32/i);
-
-$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
-$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
-
-($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
-($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
-
-$code.=<<___;
-#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
- defined(_MIPS_ARCH_MIPS64R6)) \\
- && !defined(_MIPS_ARCH_MIPS64R2)
-# define _MIPS_ARCH_MIPS64R2
-#endif
-
-#if defined(_MIPS_ARCH_MIPS64R6)
-# define dmultu(rs,rt)
-# define mflo(rd,rs,rt) dmulu rd,rs,rt
-# define mfhi(rd,rs,rt) dmuhu rd,rs,rt
-#else
-# define dmultu(rs,rt) dmultu rs,rt
-# define mflo(rd,rs,rt) mflo rd
-# define mfhi(rd,rs,rt) mfhi rd
-#endif
-
-#ifdef __KERNEL__
-# define poly1305_init poly1305_init_mips
-# define poly1305_blocks poly1305_blocks_mips
-# define poly1305_emit poly1305_emit_mips
-#endif
-
-#if defined(__MIPSEB__) && !defined(MIPSEB)
-# define MIPSEB
-#endif
-
-#ifdef MIPSEB
-# define MSB 0
-# define LSB 7
-#else
-# define MSB 7
-# define LSB 0
-#endif
-
-.text
-.set noat
-.set noreorder
-
-.align 5
-.globl poly1305_init
-.ent poly1305_init
-poly1305_init:
- .frame $sp,0,$ra
- .set reorder
-
- sd $zero,0($ctx)
- sd $zero,8($ctx)
- sd $zero,16($ctx)
-
- beqz $inp,.Lno_key
-
-#if defined(_MIPS_ARCH_MIPS64R6)
- ld $in0,0($inp)
- ld $in1,8($inp)
-#else
- ldl $in0,0+MSB($inp)
- ldl $in1,8+MSB($inp)
- ldr $in0,0+LSB($inp)
- ldr $in1,8+LSB($inp)
-#endif
-#ifdef MIPSEB
-# if defined(_MIPS_ARCH_MIPS64R2)
- dsbh $in0,$in0 # byte swap
- dsbh $in1,$in1
- dshd $in0,$in0
- dshd $in1,$in1
-# else
- ori $tmp0,$zero,0xFF
- dsll $tmp2,$tmp0,32
- or $tmp0,$tmp2 # 0x000000FF000000FF
-
- and $tmp1,$in0,$tmp0 # byte swap
- and $tmp3,$in1,$tmp0
- dsrl $tmp2,$in0,24
- dsrl $tmp4,$in1,24
- dsll $tmp1,24
- dsll $tmp3,24
- and $tmp2,$tmp0
- and $tmp4,$tmp0
- dsll $tmp0,8 # 0x0000FF000000FF00
- or $tmp1,$tmp2
- or $tmp3,$tmp4
- and $tmp2,$in0,$tmp0
- and $tmp4,$in1,$tmp0
- dsrl $in0,8
- dsrl $in1,8
- dsll $tmp2,8
- dsll $tmp4,8
- and $in0,$tmp0
- and $in1,$tmp0
- or $tmp1,$tmp2
- or $tmp3,$tmp4
- or $in0,$tmp1
- or $in1,$tmp3
- dsrl $tmp1,$in0,32
- dsrl $tmp3,$in1,32
- dsll $in0,32
- dsll $in1,32
- or $in0,$tmp1
- or $in1,$tmp3
-# endif
-#endif
- li $tmp0,1
- dsll $tmp0,32
- daddiu $tmp0,-63
- dsll $tmp0,28
- daddiu $tmp0,-1 # 0ffffffc0fffffff
-
- and $in0,$tmp0
- daddiu $tmp0,-3 # 0ffffffc0ffffffc
- and $in1,$tmp0
-
- sd $in0,24($ctx)
- dsrl $tmp0,$in1,2
- sd $in1,32($ctx)
- daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
- sd $tmp0,40($ctx)
-
-.Lno_key:
- li $v0,0 # return 0
- jr $ra
-.end poly1305_init
-___
-{
-my ($h0,$h1,$h2,$r0,$r1,$s1,$d0,$d1,$d2) =
- ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
-
-$code.=<<___;
-.align 5
-.globl poly1305_blocks
-.ent poly1305_blocks
-poly1305_blocks:
- .set noreorder
- dsrl $len,4 # number of complete blocks
- bnez $len,poly1305_blocks_internal
- nop
- jr $ra
- nop
-.end poly1305_blocks
-
-.align 5
-.ent poly1305_blocks_internal
-poly1305_blocks_internal:
- .frame $sp,6*8,$ra
- .mask $SAVED_REGS_MASK,-8
- .set noreorder
- dsubu $sp,6*8
- sd $s5,40($sp)
- sd $s4,32($sp)
-___
-$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
- sd $s3,24($sp)
- sd $s2,16($sp)
- sd $s1,8($sp)
- sd $s0,0($sp)
-___
-$code.=<<___;
- .set reorder
-
- ld $h0,0($ctx) # load hash value
- ld $h1,8($ctx)
- ld $h2,16($ctx)
-
- ld $r0,24($ctx) # load key
- ld $r1,32($ctx)
- ld $s1,40($ctx)
-
-.Loop:
-#if defined(_MIPS_ARCH_MIPS64R6)
- ld $in0,0($inp) # load input
- ld $in1,8($inp)
-#else
- ldl $in0,0+MSB($inp) # load input
- ldl $in1,8+MSB($inp)
- ldr $in0,0+LSB($inp)
- ldr $in1,8+LSB($inp)
-#endif
- daddiu $len,-1
- daddiu $inp,16
-#ifdef MIPSEB
-# if defined(_MIPS_ARCH_MIPS64R2)
- dsbh $in0,$in0 # byte swap
- dsbh $in1,$in1
- dshd $in0,$in0
- dshd $in1,$in1
-# else
- ori $tmp0,$zero,0xFF
- dsll $tmp2,$tmp0,32
- or $tmp0,$tmp2 # 0x000000FF000000FF
-
- and $tmp1,$in0,$tmp0 # byte swap
- and $tmp3,$in1,$tmp0
- dsrl $tmp2,$in0,24
- dsrl $tmp4,$in1,24
- dsll $tmp1,24
- dsll $tmp3,24
- and $tmp2,$tmp0
- and $tmp4,$tmp0
- dsll $tmp0,8 # 0x0000FF000000FF00
- or $tmp1,$tmp2
- or $tmp3,$tmp4
- and $tmp2,$in0,$tmp0
- and $tmp4,$in1,$tmp0
- dsrl $in0,8
- dsrl $in1,8
- dsll $tmp2,8
- dsll $tmp4,8
- and $in0,$tmp0
- and $in1,$tmp0
- or $tmp1,$tmp2
- or $tmp3,$tmp4
- or $in0,$tmp1
- or $in1,$tmp3
- dsrl $tmp1,$in0,32
- dsrl $tmp3,$in1,32
- dsll $in0,32
- dsll $in1,32
- or $in0,$tmp1
- or $in1,$tmp3
-# endif
-#endif
- daddu $h0,$in0 # accumulate input
- daddu $h1,$in1
- sltu $tmp0,$h0,$in0
- sltu $tmp1,$h1,$in1
- daddu $h1,$tmp0
-
- dmultu ($r0,$h0) # h0*r0
- daddu $h2,$padbit
- sltu $tmp0,$h1,$tmp0
- mflo ($d0,$r0,$h0)
- mfhi ($d1,$r0,$h0)
-
- dmultu ($s1,$h1) # h1*5*r1
- daddu $tmp0,$tmp1
- daddu $h2,$tmp0
- mflo ($tmp0,$s1,$h1)
- mfhi ($tmp1,$s1,$h1)
-
- dmultu ($r1,$h0) # h0*r1
- daddu $d0,$tmp0
- daddu $d1,$tmp1
- mflo ($tmp2,$r1,$h0)
- mfhi ($d2,$r1,$h0)
- sltu $tmp0,$d0,$tmp0
- daddu $d1,$tmp0
-
- dmultu ($r0,$h1) # h1*r0
- daddu $d1,$tmp2
- sltu $tmp2,$d1,$tmp2
- mflo ($tmp0,$r0,$h1)
- mfhi ($tmp1,$r0,$h1)
- daddu $d2,$tmp2
-
- dmultu ($s1,$h2) # h2*5*r1
- daddu $d1,$tmp0
- daddu $d2,$tmp1
- mflo ($tmp2,$s1,$h2)
-
- dmultu ($r0,$h2) # h2*r0
- sltu $tmp0,$d1,$tmp0
- daddu $d2,$tmp0
- mflo ($tmp3,$r0,$h2)
-
- daddu $d1,$tmp2
- daddu $d2,$tmp3
- sltu $tmp2,$d1,$tmp2
- daddu $d2,$tmp2
-
- li $tmp0,-4 # final reduction
- and $tmp0,$d2
- dsrl $tmp1,$d2,2
- andi $h2,$d2,3
- daddu $tmp0,$tmp1
- daddu $h0,$d0,$tmp0
- sltu $tmp0,$h0,$tmp0
- daddu $h1,$d1,$tmp0
- sltu $tmp0,$h1,$tmp0
- daddu $h2,$h2,$tmp0
-
- bnez $len,.Loop
-
- sd $h0,0($ctx) # store hash value
- sd $h1,8($ctx)
- sd $h2,16($ctx)
-
- .set noreorder
- ld $s5,40($sp) # epilogue
- ld $s4,32($sp)
-___
-$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
- ld $s3,24($sp)
- ld $s2,16($sp)
- ld $s1,8($sp)
- ld $s0,0($sp)
-___
-$code.=<<___;
- jr $ra
- daddu $sp,6*8
-.end poly1305_blocks_internal
-___
-}
-{
-my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
-
-$code.=<<___;
-.align 5
-.globl poly1305_emit
-.ent poly1305_emit
-poly1305_emit:
- .frame $sp,0,$ra
- .set reorder
-
- ld $tmp0,0($ctx)
- ld $tmp1,8($ctx)
- ld $tmp2,16($ctx)
-
- daddiu $in0,$tmp0,5 # compare to modulus
- sltiu $tmp3,$in0,5
- daddu $in1,$tmp1,$tmp3
- sltu $tmp3,$in1,$tmp3
- daddu $tmp2,$tmp2,$tmp3
-
- dsrl $tmp2,2 # see if it carried/borrowed
- dsubu $tmp2,$zero,$tmp2
- nor $tmp3,$zero,$tmp2
-
- and $in0,$tmp2
- and $tmp0,$tmp3
- and $in1,$tmp2
- and $tmp1,$tmp3
- or $in0,$tmp0
- or $in1,$tmp1
-
- lwu $tmp0,0($nonce) # load nonce
- lwu $tmp1,4($nonce)
- lwu $tmp2,8($nonce)
- lwu $tmp3,12($nonce)
- dsll $tmp1,32
- dsll $tmp3,32
- or $tmp0,$tmp1
- or $tmp2,$tmp3
-
- daddu $in0,$tmp0 # accumulate nonce
- daddu $in1,$tmp2
- sltu $tmp0,$in0,$tmp0
- daddu $in1,$tmp0
-
- dsrl $tmp0,$in0,8 # write mac value
- dsrl $tmp1,$in0,16
- dsrl $tmp2,$in0,24
- sb $in0,0($mac)
- dsrl $tmp3,$in0,32
- sb $tmp0,1($mac)
- dsrl $tmp0,$in0,40
- sb $tmp1,2($mac)
- dsrl $tmp1,$in0,48
- sb $tmp2,3($mac)
- dsrl $tmp2,$in0,56
- sb $tmp3,4($mac)
- dsrl $tmp3,$in1,8
- sb $tmp0,5($mac)
- dsrl $tmp0,$in1,16
- sb $tmp1,6($mac)
- dsrl $tmp1,$in1,24
- sb $tmp2,7($mac)
-
- sb $in1,8($mac)
- dsrl $tmp2,$in1,32
- sb $tmp3,9($mac)
- dsrl $tmp3,$in1,40
- sb $tmp0,10($mac)
- dsrl $tmp0,$in1,48
- sb $tmp1,11($mac)
- dsrl $tmp1,$in1,56
- sb $tmp2,12($mac)
- sb $tmp3,13($mac)
- sb $tmp0,14($mac)
- sb $tmp1,15($mac)
-
- jr $ra
-.end poly1305_emit
-.rdata
-.align 2
-___
-}
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/\/\// and !/^$/);
- print;
-}
-close SELF;
-
-$output=pop and open STDOUT,">$output";
-print $code;
-close STDOUT;
-
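poly1305_emit above compares h with p = 2^130 - 5 without branching: it adds 5, lets the carry ripple into bit 130, and turns that bit into an all-ones/all-zeroes mask that selects either h or h + 5 - 2^130. The same selection in C, assuming a three-limb base-2^64 state as in this file; the helper name is made up for illustration.

#include <stdint.h>

/* Constant-time final reduction as done in poly1305_emit above: g = h + 5;
 * if that carried into bit 130, the reduced value is g (the 2^130 simply
 * drops off the top), otherwise it is h. */
static void poly1305_final_select(uint64_t h[3])	/* h[2] holds bits 128.. */
{
	uint64_t g0, g1, g2, c, mask;

	g0 = h[0] + 5;	c = (g0 < 5);
	g1 = h[1] + c;	c = (g1 < c);
	g2 = h[2] + c;

	mask = 0 - (g2 >> 2);		/* all ones iff h >= 2^130 - 5 */
	h[0] = (h[0] & ~mask) | (g0 & mask);
	h[1] = (h[1] & ~mask) | (g1 & mask);
	/* the top limb is then discarded: the tag is (h + nonce) mod 2^128 */
}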
diff --git a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-x86_64-glue.c b/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-x86_64-glue.c
deleted file mode 100644
index 874877e3fe3b..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-x86_64-glue.c
+++ /dev/null
@@ -1,171 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#ifdef __linux__
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-#include <asm/intel-family.h>
-#else
-#include <sys/simd-x86_64.h>
-#endif
-
-asmlinkage void poly1305_init_x86_64(void *ctx,
- const u8 key[POLY1305_KEY_SIZE]);
-asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
- const size_t len, const u32 padbit);
-asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_MAC_SIZE],
- const u32 nonce[4]);
-asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_MAC_SIZE],
- const u32 nonce[4]);
-asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
- const size_t len, const u32 padbit);
-
-static bool poly1305_use_avx __ro_after_init;
-static bool poly1305_use_avx2 __ro_after_init;
-static bool poly1305_use_avx512 __ro_after_init;
-static bool *const poly1305_nobs[] __initconst = {
- &poly1305_use_avx, &poly1305_use_avx2, &poly1305_use_avx512 };
-
-static void __init poly1305_fpu_init(void)
-{
-#ifdef __linux__
- poly1305_use_avx =
- boot_cpu_has(X86_FEATURE_AVX) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
- poly1305_use_avx2 =
- boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
-#ifndef COMPAT_CANNOT_USE_AVX512
- poly1305_use_avx512 =
- boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX512F) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
- XFEATURE_MASK_AVX512, NULL) &&
- /* Skylake downclocks unacceptably much when using zmm. */
- boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X;
-#endif
-#else
-
- poly1305_use_avx = !!(cpu_feature2 & CPUID2_AVX) &&
- __ymm_enabled();
- poly1305_use_avx2 = poly1305_use_avx &&
- !!(cpu_stdext_feature & CPUID_STDEXT_AVX2);
- poly1305_use_avx512 = poly1305_use_avx2 &&
- !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) &&
- __zmm_enabled();
-#endif
-}
-
-static inline bool poly1305_init_arch(void *ctx,
- const u8 key[POLY1305_KEY_SIZE])
-{
- poly1305_init_x86_64(ctx, key);
- return true;
-}
-
-struct poly1305_arch_internal {
- union {
- struct {
- u32 h[5];
- u32 is_base2_26;
- };
- u64 hs[3];
- };
- u64 r[2];
- u64 pad;
- struct { u32 r2, r1, r4, r3; } rn[9];
-};
-
-/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
- * the unfortunate situation of using AVX and then having to go back to scalar
- * -- because the user is silly and has called the update function from two
- * separate contexts -- then we need to convert back to the original base before
- * proceeding. It is possible to reason that the initial reduction below is
- * sufficient given the implementation invariants. However, for an avoidance of
- * doubt and because this is not performance critical, we do the full reduction
- * anyway.
- */
-static void convert_to_base2_64(void *ctx)
-{
- struct poly1305_arch_internal *state = ctx;
- u32 cy;
-
- if (!state->is_base2_26)
- return;
-
- cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
- cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
- cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
- cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
- state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
- state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
- state->hs[2] = state->h[4] >> 24;
-#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
- cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
- state->hs[2] &= 3;
- state->hs[0] += cy;
- state->hs[1] += (cy = ULT(state->hs[0], cy));
- state->hs[2] += ULT(state->hs[1], cy);
-#undef ULT
- state->is_base2_26 = 0;
-}
-
-static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
- size_t len, const u32 padbit,
- simd_context_t *simd_context)
-{
- struct poly1305_arch_internal *state = ctx;
-
- /* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
- PAGE_SIZE % POLY1305_BLOCK_SIZE);
-
- if (!poly1305_use_avx ||
- (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
- !simd_use(simd_context)) {
- convert_to_base2_64(ctx);
- poly1305_blocks_x86_64(ctx, inp, len, padbit);
- return true;
- }
-
- for (;;) {
- const size_t bytes = min_t(size_t, len, PAGE_SIZE);
-
- if (poly1305_use_avx512)
- poly1305_blocks_avx512(ctx, inp, bytes, padbit);
- else if (poly1305_use_avx2)
- poly1305_blocks_avx2(ctx, inp, bytes, padbit);
- else
- poly1305_blocks_avx(ctx, inp, bytes, padbit);
- len -= bytes;
- if (!len)
- break;
- inp += bytes;
- simd_relax(simd_context);
- }
-
- return true;
-}
-
-static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
- const u32 nonce[4],
- simd_context_t *simd_context)
-{
- struct poly1305_arch_internal *state = ctx;
-
- if (!IS_ENABLED(CONFIG_AS_AVX) || !poly1305_use_avx ||
- !state->is_base2_26 || !simd_use(simd_context)) {
- convert_to_base2_64(ctx);
- poly1305_emit_x86_64(ctx, mac, nonce);
- } else
- poly1305_emit_avx(ctx, mac, nonce);
- return true;
-}
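convert_to_base2_64 above repacks the five 26-bit limbs maintained by the AVX code (after the carry pass, limb i holds bits 26*i .. 26*i+25 of h) into the three 64-bit words the scalar code expects; the shifts follow directly from those bit positions:

	hs[0] = bits   0..63  = h0       | h1 << 26 | h2 << 52
	hs[1] = bits  64..127 = h2 >> 12 | h3 << 14 | h4 << 40
	hs[2] = bits 128..    = h4 >> 24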
diff --git a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-x86_64.pl b/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-x86_64.pl
deleted file mode 100755
index 94c3c42f89f2..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-x86_64.pl
+++ /dev/null
@@ -1,4266 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
-# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
-# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# This module implements Poly1305 hash for x86_64.
-#
-# March 2015
-#
-# Initial release.
-#
-# December 2016
-#
-# Add AVX512F+VL+BW code path.
-#
-# November 2017
-#
-# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
-# executed even on Knights Landing. Trigger for modification was
-# observation that AVX512 code paths can negatively affect overall
-# Skylake-X system performance. Since we are likely to suppress
-# AVX512F capability flag [at least on Skylake-X], conversion serves
-# as kind of "investment protection". Note that next *lake processor,
-# Cannolake, has AVX512IFMA code path to execute...
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone,
-# measured with rdtsc at fixed clock frequency.
-#
-# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512
-# P4 4.46/+120% -
-# Core 2 2.41/+90% -
-# Westmere 1.88/+120% -
-# Sandy Bridge 1.39/+140% 1.10
-# Haswell 1.14/+175% 1.11 0.65
-# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35]
-# Silvermont 2.83/+95% -
-# Knights L 3.60/? 1.65 1.10 0.41(***)
-# Goldmont 1.70/+180% -
-# VIA Nano 1.82/+150% -
-# Sledgehammer 1.38/+160% -
-# Bulldozer 2.30/+130% 0.97
-# Ryzen 1.15/+200% 1.08 1.18
-#
-# (*) improvement coefficients relative to clang are more modest and
-# are ~50% on most processors, in both cases we are comparing to
-# __int128 code;
-# (**) an SSE2 implementation was attempted, but among non-AVX processors
-#	it was faster than integer-only code only on the older Intel P4 and
-#	Core processors, by 30-50% (the newer the processor, the smaller
-#	the gain), but slower on contemporary ones, for example almost 2x
-#	slower on Atom; as the former are naturally disappearing, SSE2 is
-#	deemed unnecessary;
-# (***) strangely enough performance seems to vary from core to core,
-# listed result is best case;
-
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-$kernel=0; $kernel=1 if (!$flavour && !$output);
-
-if (!$kernel) {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
- die "can't locate x86_64-xlate.pl";
-
- open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
- *STDOUT=*OUT;
-
- if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
- =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
- }
-
- if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
- `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
- $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
- $avx += 1 if ($1==2.11 && $2>=8);
- }
-
- if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
- `ml64 2>&1` =~ /Version ([0-9]+)\./) {
- $avx = ($1>=10) + ($1>=11);
- }
-
- if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
- $avx = ($2>=3.0) + ($2>3.0);
- }
-} else {
- $avx = 4; # The kernel uses ifdefs for this.
-}
-
-sub declare_function() {
- my ($name, $align, $nargs) = @_;
- if($kernel) {
- $code .= ".align $align\n";
- $code .= "SYM_FUNC_START($name)\n";
- $code .= ".L$name:\n";
- } else {
- $code .= ".globl $name\n";
- $code .= ".type $name,\@function,$nargs\n";
- $code .= ".align $align\n";
- $code .= "$name:\n";
- }
-}
-
-sub end_function() {
- my ($name) = @_;
- if($kernel) {
- $code .= "SYM_FUNC_END($name)\n";
- } else {
- $code .= ".size $name,.-$name\n";
- }
-}
-
-$code.=<<___ if $kernel;
-#include <linux/linkage.h>
-___
-
-if ($avx) {
-$code.=<<___ if $kernel;
-.section .rodata
-___
-$code.=<<___;
-.align 64
-.Lconst:
-.Lmask24:
-.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
-.L129:
-.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
-.Lmask26:
-.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
-.Lpermd_avx2:
-.long 2,2,2,3,2,0,2,1
-.Lpermd_avx512:
-.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
-
-.L2_44_inp_permd:
-.long 0,1,1,2,2,3,7,7
-.L2_44_inp_shift:
-.quad 0,12,24,64
-.L2_44_mask:
-.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
-.L2_44_shift_rgt:
-.quad 44,44,42,64
-.L2_44_shift_lft:
-.quad 8,8,10,64
-
-.align 64
-.Lx_mask44:
-.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
-.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
-.Lx_mask42:
-.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
-.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
-___
-}
-$code.=<<___ if (!$kernel);
-.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-.align 16
-___
-
-my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
-my ($mac,$nonce)=($inp,$len); # *_emit arguments
-my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
-my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
-
-sub poly1305_iteration {
-# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
-# output: $h0-$h2 *= $r0-$r1
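-#
-# The multiplication is the schoolbook 2x2-limb product reduced mod
-# 2^130-5.  Because r is clamped so that $r1 is divisible by 4,
-# $s1 = $r1 + ($r1 >> 2) equals 5*$r1/4, and cross terms landing at or
-# above bit 128 fold back using 2^130 == 5 (mod 2^130-5):
-# h1*r1*2^128 == h1*s1 and h2*r1*2^192 == h2*s1*2^64.  The "last
-# reduction step" below splits the remaining high word d3 into (d3 & 3),
-# kept as the new $h2, and 5*(d3 >> 2), which is added back into $h0.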
-$code.=<<___;
- mulq $h0 # h0*r1
- mov %rax,$d2
- mov $r0,%rax
- mov %rdx,$d3
-
- mulq $h0 # h0*r0
- mov %rax,$h0 # future $h0
- mov $r0,%rax
- mov %rdx,$d1
-
- mulq $h1 # h1*r0
- add %rax,$d2
- mov $s1,%rax
- adc %rdx,$d3
-
- mulq $h1 # h1*s1
- mov $h2,$h1 # borrow $h1
- add %rax,$h0
- adc %rdx,$d1
-
- imulq $s1,$h1 # h2*s1
- add $h1,$d2
- mov $d1,$h1
- adc \$0,$d3
-
- imulq $r0,$h2 # h2*r0
- add $d2,$h1
- mov \$-4,%rax # mask value
- adc $h2,$d3
-
- and $d3,%rax # last reduction step
- mov $d3,$h2
- shr \$2,$d3
- and \$3,$h2
- add $d3,%rax
- add %rax,$h0
- adc \$0,$h1
- adc \$0,$h2
-___
-}
-
-########################################################################
-# The layout of the opaque area is as follows.
-#
-# unsigned __int64 h[3]; # current hash value base 2^64
-# unsigned __int64 r[2]; # key value base 2^64
-
-$code.=<<___;
-.text
-___
-$code.=<<___ if (!$kernel);
-.extern OPENSSL_ia32cap_P
-
-.globl poly1305_init_x86_64
-.hidden poly1305_init_x86_64
-.globl poly1305_blocks_x86_64
-.hidden poly1305_blocks_x86_64
-.globl poly1305_emit_x86_64
-.hidden poly1305_emit_x86_64
-___
-&declare_function("poly1305_init_x86_64", 32, 3);
-$code.=<<___;
- xor %rax,%rax
- mov %rax,0($ctx) # initialize hash value
- mov %rax,8($ctx)
- mov %rax,16($ctx)
-
- cmp \$0,$inp
- je .Lno_key
-___
-$code.=<<___ if (!$kernel);
- lea poly1305_blocks_x86_64(%rip),%r10
- lea poly1305_emit_x86_64(%rip),%r11
-___
-$code.=<<___ if (!$kernel && $avx);
- mov OPENSSL_ia32cap_P+4(%rip),%r9
- lea poly1305_blocks_avx(%rip),%rax
- lea poly1305_emit_avx(%rip),%rcx
- bt \$`60-32`,%r9 # AVX?
- cmovc %rax,%r10
- cmovc %rcx,%r11
-___
-$code.=<<___ if (!$kernel && $avx>1);
- lea poly1305_blocks_avx2(%rip),%rax
- bt \$`5+32`,%r9 # AVX2?
- cmovc %rax,%r10
-___
-$code.=<<___ if (!$kernel && $avx>3);
- mov \$`(1<<31|1<<21|1<<16)`,%rax
- shr \$32,%r9
- and %rax,%r9
- cmp %rax,%r9
- je .Linit_base2_44
-___
-$code.=<<___;
- mov \$0x0ffffffc0fffffff,%rax
- mov \$0x0ffffffc0ffffffc,%rcx
- and 0($inp),%rax
- and 8($inp),%rcx
- mov %rax,24($ctx)
- mov %rcx,32($ctx)
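-	# The masks above apply the standard Poly1305 clamping to r: the top
-	# 4 bits of every 32-bit word are cleared, and the bottom 2 bits of
-	# the upper three words are cleared, so the high 64-bit half of r is
-	# divisible by 4 (a prerequisite for the s1 = r1 + (r1 >> 2) trick).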
-___
-$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
- mov %r10,0(%rdx)
- mov %r11,8(%rdx)
-___
-$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
- mov %r10d,0(%rdx)
- mov %r11d,4(%rdx)
-___
-$code.=<<___;
- mov \$1,%eax
-.Lno_key:
- ret
-___
-&end_function("poly1305_init_x86_64");
-
-&declare_function("poly1305_blocks_x86_64", 32, 4);
-$code.=<<___;
-.cfi_startproc
-.Lblocks:
- shr \$4,$len
- jz .Lno_data # too short
-
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
- push $ctx
-.cfi_push $ctx
-.Lblocks_body:
-
- mov $len,%r15 # reassign $len
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- mov 0($ctx),$h0 # load hash value
- mov 8($ctx),$h1
- mov 16($ctx),$h2
-
- mov $s1,$r1
- shr \$2,$s1
- mov $r1,%rax
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
- jmp .Loop
-
-.align 32
-.Loop:
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
-___
-
- &poly1305_iteration();
-
-$code.=<<___;
- mov $r1,%rax
- dec %r15 # len-=16
- jnz .Loop
-
- mov 0(%rsp),$ctx
-.cfi_restore $ctx
-
- mov $h0,0($ctx) # store hash value
- mov $h1,8($ctx)
- mov $h2,16($ctx)
-
- mov 8(%rsp),%r15
-.cfi_restore %r15
- mov 16(%rsp),%r14
-.cfi_restore %r14
- mov 24(%rsp),%r13
-.cfi_restore %r13
- mov 32(%rsp),%r12
-.cfi_restore %r12
- mov 40(%rsp),%rbx
-.cfi_restore %rbx
- lea 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
-.Lno_data:
-.Lblocks_epilogue:
- ret
-.cfi_endproc
-___
-&end_function("poly1305_blocks_x86_64");
-
-&declare_function("poly1305_emit_x86_64", 32, 3);
-$code.=<<___;
-.Lemit:
- mov 0($ctx),%r8 # load hash value
- mov 8($ctx),%r9
- mov 16($ctx),%r10
-
- mov %r8,%rax
- add \$5,%r8 # compare to modulus
- mov %r9,%rcx
- adc \$0,%r9
- adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overflow?
- cmovnz %r8,%rax
- cmovnz %r9,%rcx
-
- add 0($nonce),%rax # accumulate nonce
- adc 8($nonce),%rcx
- mov %rax,0($mac) # write result
- mov %rcx,8($mac)
-
- ret
-___
-&end_function("poly1305_emit_x86_64");
-if ($avx) {
-
-if($kernel) {
- $code .= "#ifdef CONFIG_AS_AVX\n";
-}
-
-########################################################################
-# The layout of the opaque area is as follows.
-#
-# unsigned __int32 h[5]; # current hash value base 2^26
-# unsigned __int32 is_base2_26;
-# unsigned __int64 r[2]; # key value base 2^64
-# unsigned __int64 pad;
-# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
-#
-# where r^n are the base 2^26 digits of the powers of the multiplier key.
-# There are 5 digits, but the last four are interleaved with their
-# multiples of 5, for a total of 9 elements: r0, r1, 5*r1, r2, 5*r2, r3,
-# 5*r3, r4, 5*r4.
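-#
-# The 5*r digits are kept because reduction mod 2^130-5 folds any partial
-# product whose limb positions sum to 5 or more back to the low end
-# multiplied by 5 (since 2^130 == 5 mod 2^130-5); having 5*r1..5*r4
-# precomputed turns those wrap-around terms into plain vpmuludq operands.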
-
-my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
- map("%xmm$_",(0..15));
-
-$code.=<<___;
-.type __poly1305_block,\@abi-omnipotent
-.align 32
-__poly1305_block:
- push $ctx
-___
- &poly1305_iteration();
-$code.=<<___;
- pop $ctx
- ret
-.size __poly1305_block,.-__poly1305_block
-
-.type __poly1305_init_avx,\@abi-omnipotent
-.align 32
-__poly1305_init_avx:
- push %rbp
- mov %rsp,%rbp
- mov $r0,$h0
- mov $r1,$h1
- xor $h2,$h2
-
- lea 48+64($ctx),$ctx # size optimization
-
- mov $r1,%rax
- call __poly1305_block # r^2
-
- mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
- mov \$0x3ffffff,%edx
- mov $h0,$d1
- and $h0#d,%eax
- mov $r0,$d2
- and $r0#d,%edx
- mov %eax,`16*0+0-64`($ctx)
- shr \$26,$d1
- mov %edx,`16*0+4-64`($ctx)
- shr \$26,$d2
-
- mov \$0x3ffffff,%eax
- mov \$0x3ffffff,%edx
- and $d1#d,%eax
- and $d2#d,%edx
- mov %eax,`16*1+0-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov %edx,`16*1+4-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- mov %eax,`16*2+0-64`($ctx)
- shr \$26,$d1
- mov %edx,`16*2+4-64`($ctx)
- shr \$26,$d2
-
- mov $h1,%rax
- mov $r1,%rdx
- shl \$12,%rax
- shl \$12,%rdx
- or $d1,%rax
- or $d2,%rdx
- and \$0x3ffffff,%eax
- and \$0x3ffffff,%edx
- mov %eax,`16*3+0-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov %edx,`16*3+4-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- mov %eax,`16*4+0-64`($ctx)
- mov $h1,$d1
- mov %edx,`16*4+4-64`($ctx)
- mov $r1,$d2
-
- mov \$0x3ffffff,%eax
- mov \$0x3ffffff,%edx
- shr \$14,$d1
- shr \$14,$d2
- and $d1#d,%eax
- and $d2#d,%edx
- mov %eax,`16*5+0-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov %edx,`16*5+4-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- mov %eax,`16*6+0-64`($ctx)
- shr \$26,$d1
- mov %edx,`16*6+4-64`($ctx)
- shr \$26,$d2
-
- mov $h2,%rax
- shl \$24,%rax
- or %rax,$d1
- mov $d1#d,`16*7+0-64`($ctx)
- lea ($d1,$d1,4),$d1 # *5
- mov $d2#d,`16*7+4-64`($ctx)
- lea ($d2,$d2,4),$d2 # *5
- mov $d1#d,`16*8+0-64`($ctx)
- mov $d2#d,`16*8+4-64`($ctx)
-
- mov $r1,%rax
- call __poly1305_block # r^3
-
- mov \$0x3ffffff,%eax # save r^3 base 2^26
- mov $h0,$d1
- and $h0#d,%eax
- shr \$26,$d1
- mov %eax,`16*0+12-64`($ctx)
-
- mov \$0x3ffffff,%edx
- and $d1#d,%edx
- mov %edx,`16*1+12-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*2+12-64`($ctx)
-
- mov $h1,%rax
- shl \$12,%rax
- or $d1,%rax
- and \$0x3ffffff,%eax
- mov %eax,`16*3+12-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov $h1,$d1
- mov %eax,`16*4+12-64`($ctx)
-
- mov \$0x3ffffff,%edx
- shr \$14,$d1
- and $d1#d,%edx
- mov %edx,`16*5+12-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*6+12-64`($ctx)
-
- mov $h2,%rax
- shl \$24,%rax
- or %rax,$d1
- mov $d1#d,`16*7+12-64`($ctx)
- lea ($d1,$d1,4),$d1 # *5
- mov $d1#d,`16*8+12-64`($ctx)
-
- mov $r1,%rax
- call __poly1305_block # r^4
-
- mov \$0x3ffffff,%eax # save r^4 base 2^26
- mov $h0,$d1
- and $h0#d,%eax
- shr \$26,$d1
- mov %eax,`16*0+8-64`($ctx)
-
- mov \$0x3ffffff,%edx
- and $d1#d,%edx
- mov %edx,`16*1+8-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*2+8-64`($ctx)
-
- mov $h1,%rax
- shl \$12,%rax
- or $d1,%rax
- and \$0x3ffffff,%eax
- mov %eax,`16*3+8-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov $h1,$d1
- mov %eax,`16*4+8-64`($ctx)
-
- mov \$0x3ffffff,%edx
- shr \$14,$d1
- and $d1#d,%edx
- mov %edx,`16*5+8-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*6+8-64`($ctx)
-
- mov $h2,%rax
- shl \$24,%rax
- or %rax,$d1
- mov $d1#d,`16*7+8-64`($ctx)
- lea ($d1,$d1,4),$d1 # *5
- mov $d1#d,`16*8+8-64`($ctx)
-
- lea -48-64($ctx),$ctx # size [de-]optimization
- pop %rbp
- ret
-.size __poly1305_init_avx,.-__poly1305_init_avx
-___
-
-&declare_function("poly1305_blocks_avx", 32, 4);
-$code.=<<___;
-.cfi_startproc
- mov 20($ctx),%r8d # is_base2_26
- cmp \$128,$len
- jae .Lblocks_avx
- test %r8d,%r8d
- jz .Lblocks
-
-.Lblocks_avx:
- and \$-16,$len
- jz .Lno_data_avx
-
- vzeroupper
-
- test %r8d,%r8d
- jz .Lbase2_64_avx
-
- test \$31,$len
- jz .Leven_avx
-
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lblocks_avx_body:
-
- mov $len,%r15 # reassign $len
-
- mov 0($ctx),$d1 # load hash value
- mov 8($ctx),$d2
- mov 16($ctx),$h2#d
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- ################################# base 2^26 -> base 2^64
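-	# i.e. recombine the five 26-bit limbs h[0..4] (stored as 32-bit
-	# words) into h[0] + h[1]<<26 + h[2]<<52 + h[3]<<78 + h[4]<<104,
-	# held in $h0:$h1:$h2; $h2 may pick up a few extra bits, hence the
-	# partial reduction below.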
- mov $d1#d,$h0#d
- and \$`-1*(1<<31)`,$d1
- mov $d2,$r1 # borrow $r1
- mov $d2#d,$h1#d
- and \$`-1*(1<<31)`,$d2
-
- shr \$6,$d1
- shl \$52,$r1
- add $d1,$h0
- shr \$12,$h1
- shr \$18,$d2
- add $r1,$h0
- adc $d2,$h1
-
- mov $h2,$d1
- shl \$40,$d1
- shr \$24,$h2
- add $d1,$h1
- adc \$0,$h2 # can be partially reduced...
-
- mov \$-4,$d2 # ... so reduce
- mov $h2,$d1
- and $h2,$d2
- shr \$2,$d1
- and \$3,$h2
- add $d2,$d1 # =*5
- add $d1,$h0
- adc \$0,$h1
- adc \$0,$h2
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
-
- call __poly1305_block
-
- test $padbit,$padbit # if $padbit is zero,
- jz .Lstore_base2_64_avx # store hash in base 2^64 format
-
- ################################# base 2^64 -> base 2^26
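-	# i.e. split the 130-bit value in $h0:$h1:$h2 back into five 26-bit
-	# limbs: h[0] = h0 & m, h[1] = (h0>>26) & m, h[2] = (h0>>52|h1<<12) & m,
-	# h[3] = (h1>>14) & m, h[4] = h1>>40 | h2<<24, where m = 2^26-1.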
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$r0
- mov $h1,$r1
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$r0
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $r0,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$r1
- and \$0x3ffffff,$h1 # h[3]
- or $r1,$h2 # h[4]
-
- sub \$16,%r15
- jz .Lstore_base2_26_avx
-
- vmovd %rax#d,$H0
- vmovd %rdx#d,$H1
- vmovd $h0#d,$H2
- vmovd $h1#d,$H3
- vmovd $h2#d,$H4
- jmp .Lproceed_avx
-
-.align 32
-.Lstore_base2_64_avx:
- mov $h0,0($ctx)
- mov $h1,8($ctx)
- mov $h2,16($ctx) # note that is_base2_26 is zeroed
- jmp .Ldone_avx
-
-.align 16
-.Lstore_base2_26_avx:
- mov %rax#d,0($ctx) # store hash value base 2^26
- mov %rdx#d,4($ctx)
- mov $h0#d,8($ctx)
- mov $h1#d,12($ctx)
- mov $h2#d,16($ctx)
-.align 16
-.Ldone_avx:
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lno_data_avx:
-.Lblocks_avx_epilogue:
- ret
-.cfi_endproc
-
-.align 32
-.Lbase2_64_avx:
-.cfi_startproc
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lbase2_64_avx_body:
-
- mov $len,%r15 # reassign $len
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- mov 0($ctx),$h0 # load hash value
- mov 8($ctx),$h1
- mov 16($ctx),$h2#d
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
- test \$31,$len
- jz .Linit_avx
-
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
- sub \$16,%r15
-
- call __poly1305_block
-
-.Linit_avx:
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$d1
- mov $h1,$d2
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$d1
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $d1,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$d2
- and \$0x3ffffff,$h1 # h[3]
- or $d2,$h2 # h[4]
-
- vmovd %rax#d,$H0
- vmovd %rdx#d,$H1
- vmovd $h0#d,$H2
- vmovd $h1#d,$H3
- vmovd $h2#d,$H4
- movl \$1,20($ctx) # set is_base2_26
-
- call __poly1305_init_avx
-
-.Lproceed_avx:
- mov %r15,$len
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lbase2_64_avx_epilogue:
- jmp .Ldo_avx
-.cfi_endproc
-
-.align 32
-.Leven_avx:
-.cfi_startproc
- vmovd 4*0($ctx),$H0 # load hash value
- vmovd 4*1($ctx),$H1
- vmovd 4*2($ctx),$H2
- vmovd 4*3($ctx),$H3
- vmovd 4*4($ctx),$H4
-
-.Ldo_avx:
-___
-$code.=<<___ if (!$win64);
- lea 8(%rsp),%r10
-.cfi_def_cfa_register %r10
- and \$-32,%rsp
- sub \$-8,%rsp
- lea -0x58(%rsp),%r11
- sub \$0x178,%rsp
-
-___
-$code.=<<___ if ($win64);
- lea -0xf8(%rsp),%r11
- sub \$0x218,%rsp
- vmovdqa %xmm6,0x50(%r11)
- vmovdqa %xmm7,0x60(%r11)
- vmovdqa %xmm8,0x70(%r11)
- vmovdqa %xmm9,0x80(%r11)
- vmovdqa %xmm10,0x90(%r11)
- vmovdqa %xmm11,0xa0(%r11)
- vmovdqa %xmm12,0xb0(%r11)
- vmovdqa %xmm13,0xc0(%r11)
- vmovdqa %xmm14,0xd0(%r11)
- vmovdqa %xmm15,0xe0(%r11)
-.Ldo_avx_body:
-___
-$code.=<<___;
- sub \$64,$len
- lea -32($inp),%rax
- cmovc %rax,$inp
-
- vmovdqu `16*3`($ctx),$D4 # preload r0^2
- lea `16*3+64`($ctx),$ctx # size optimization
- lea .Lconst(%rip),%rcx
-
- ################################################################
- # load input
- vmovdqu 16*2($inp),$T0
- vmovdqu 16*3($inp),$T1
- vmovdqa 64(%rcx),$MASK # .Lmask26
-
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpunpcklqdq $T3,$T2,$T3 # 2:3
-
- vpsrlq \$40,$T4,$T4 # 4
- vpsrlq \$26,$T0,$T1
- vpand $MASK,$T0,$T0 # 0
- vpsrlq \$4,$T3,$T2
- vpand $MASK,$T1,$T1 # 1
- vpsrlq \$30,$T3,$T3
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- jbe .Lskip_loop_avx
-
- # expand and copy pre-calculated table to stack
- vmovdqu `16*1-64`($ctx),$D1
- vmovdqu `16*2-64`($ctx),$D2
- vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
- vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
- vmovdqa $D3,-0x90(%r11)
- vmovdqa $D0,0x00(%rsp)
- vpshufd \$0xEE,$D1,$D4
- vmovdqu `16*3-64`($ctx),$D0
- vpshufd \$0x44,$D1,$D1
- vmovdqa $D4,-0x80(%r11)
- vmovdqa $D1,0x10(%rsp)
- vpshufd \$0xEE,$D2,$D3
- vmovdqu `16*4-64`($ctx),$D1
- vpshufd \$0x44,$D2,$D2
- vmovdqa $D3,-0x70(%r11)
- vmovdqa $D2,0x20(%rsp)
- vpshufd \$0xEE,$D0,$D4
- vmovdqu `16*5-64`($ctx),$D2
- vpshufd \$0x44,$D0,$D0
- vmovdqa $D4,-0x60(%r11)
- vmovdqa $D0,0x30(%rsp)
- vpshufd \$0xEE,$D1,$D3
- vmovdqu `16*6-64`($ctx),$D0
- vpshufd \$0x44,$D1,$D1
- vmovdqa $D3,-0x50(%r11)
- vmovdqa $D1,0x40(%rsp)
- vpshufd \$0xEE,$D2,$D4
- vmovdqu `16*7-64`($ctx),$D1
- vpshufd \$0x44,$D2,$D2
- vmovdqa $D4,-0x40(%r11)
- vmovdqa $D2,0x50(%rsp)
- vpshufd \$0xEE,$D0,$D3
- vmovdqu `16*8-64`($ctx),$D2
- vpshufd \$0x44,$D0,$D0
- vmovdqa $D3,-0x30(%r11)
- vmovdqa $D0,0x60(%rsp)
- vpshufd \$0xEE,$D1,$D4
- vpshufd \$0x44,$D1,$D1
- vmovdqa $D4,-0x20(%r11)
- vmovdqa $D1,0x70(%rsp)
- vpshufd \$0xEE,$D2,$D3
- vmovdqa 0x00(%rsp),$D4 # preload r0^2
- vpshufd \$0x44,$D2,$D2
- vmovdqa $D3,-0x10(%r11)
- vmovdqa $D2,0x80(%rsp)
-
- jmp .Loop_avx
-
-.align 32
-.Loop_avx:
- ################################################################
- # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- # \___________________/
- # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- # \___________________/ \____________________/
- #
-	# Note that we start with inp[2:3]*r^2. This is because it
-	# doesn't depend on the reduction in the previous iteration.
- ################################################################
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- #
- # though note that $Tx and $Hx are "reversed" in this section,
- # and $D4 is preloaded with r0^2...
-
- vpmuludq $T0,$D4,$D0 # d0 = h0*r0
- vpmuludq $T1,$D4,$D1 # d1 = h1*r0
- vmovdqa $H2,0x20(%r11) # offload hash
- vpmuludq $T2,$D4,$D2 # d3 = h2*r0
- vmovdqa 0x10(%rsp),$H2 # r1^2
- vpmuludq $T3,$D4,$D3 # d3 = h3*r0
- vpmuludq $T4,$D4,$D4 # d4 = h4*r0
-
- vmovdqa $H0,0x00(%r11) #
- vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
- vmovdqa $H1,0x10(%r11) #
- vpmuludq $T3,$H2,$H1 # h3*r1
- vpaddq $H0,$D0,$D0 # d0 += h4*s1
- vpaddq $H1,$D4,$D4 # d4 += h3*r1
- vmovdqa $H3,0x30(%r11) #
- vpmuludq $T2,$H2,$H0 # h2*r1
- vpmuludq $T1,$H2,$H1 # h1*r1
- vpaddq $H0,$D3,$D3 # d3 += h2*r1
- vmovdqa 0x30(%rsp),$H3 # r2^2
- vpaddq $H1,$D2,$D2 # d2 += h1*r1
- vmovdqa $H4,0x40(%r11) #
- vpmuludq $T0,$H2,$H2 # h0*r1
- vpmuludq $T2,$H3,$H0 # h2*r2
- vpaddq $H2,$D1,$D1 # d1 += h0*r1
-
- vmovdqa 0x40(%rsp),$H4 # s2^2
- vpaddq $H0,$D4,$D4 # d4 += h2*r2
- vpmuludq $T1,$H3,$H1 # h1*r2
- vpmuludq $T0,$H3,$H3 # h0*r2
- vpaddq $H1,$D3,$D3 # d3 += h1*r2
- vmovdqa 0x50(%rsp),$H2 # r3^2
- vpaddq $H3,$D2,$D2 # d2 += h0*r2
- vpmuludq $T4,$H4,$H0 # h4*s2
- vpmuludq $T3,$H4,$H4 # h3*s2
- vpaddq $H0,$D1,$D1 # d1 += h4*s2
- vmovdqa 0x60(%rsp),$H3 # s3^2
- vpaddq $H4,$D0,$D0 # d0 += h3*s2
-
- vmovdqa 0x80(%rsp),$H4 # s4^2
- vpmuludq $T1,$H2,$H1 # h1*r3
- vpmuludq $T0,$H2,$H2 # h0*r3
- vpaddq $H1,$D4,$D4 # d4 += h1*r3
- vpaddq $H2,$D3,$D3 # d3 += h0*r3
- vpmuludq $T4,$H3,$H0 # h4*s3
- vpmuludq $T3,$H3,$H1 # h3*s3
- vpaddq $H0,$D2,$D2 # d2 += h4*s3
- vmovdqu 16*0($inp),$H0 # load input
- vpaddq $H1,$D1,$D1 # d1 += h3*s3
- vpmuludq $T2,$H3,$H3 # h2*s3
- vpmuludq $T2,$H4,$T2 # h2*s4
- vpaddq $H3,$D0,$D0 # d0 += h2*s3
-
- vmovdqu 16*1($inp),$H1 #
- vpaddq $T2,$D1,$D1 # d1 += h2*s4
- vpmuludq $T3,$H4,$T3 # h3*s4
- vpmuludq $T4,$H4,$T4 # h4*s4
- vpsrldq \$6,$H0,$H2 # splat input
- vpaddq $T3,$D2,$D2 # d2 += h3*s4
- vpaddq $T4,$D3,$D3 # d3 += h4*s4
- vpsrldq \$6,$H1,$H3 #
- vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
- vpmuludq $T1,$H4,$T0 # h1*s4
- vpunpckhqdq $H1,$H0,$H4 # 4
- vpaddq $T4,$D4,$D4 # d4 += h0*r4
- vmovdqa -0x90(%r11),$T4 # r0^4
- vpaddq $T0,$D0,$D0 # d0 += h1*s4
-
- vpunpcklqdq $H1,$H0,$H0 # 0:1
- vpunpcklqdq $H3,$H2,$H3 # 2:3
-
- #vpsrlq \$40,$H4,$H4 # 4
- vpsrldq \$`40/8`,$H4,$H4 # 4
- vpsrlq \$26,$H0,$H1
- vpand $MASK,$H0,$H0 # 0
- vpsrlq \$4,$H3,$H2
- vpand $MASK,$H1,$H1 # 1
- vpand 0(%rcx),$H4,$H4 # .Lmask24
- vpsrlq \$30,$H3,$H3
- vpand $MASK,$H2,$H2 # 2
- vpand $MASK,$H3,$H3 # 3
- vpor 32(%rcx),$H4,$H4 # padbit, yes, always
-
- vpaddq 0x00(%r11),$H0,$H0 # add hash value
- vpaddq 0x10(%r11),$H1,$H1
- vpaddq 0x20(%r11),$H2,$H2
- vpaddq 0x30(%r11),$H3,$H3
- vpaddq 0x40(%r11),$H4,$H4
-
- lea 16*2($inp),%rax
- lea 16*4($inp),$inp
- sub \$64,$len
- cmovc %rax,$inp
-
- ################################################################
- # Now we accumulate (inp[0:1]+hash)*r^4
- ################################################################
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- vpmuludq $H0,$T4,$T0 # h0*r0
- vpmuludq $H1,$T4,$T1 # h1*r0
- vpaddq $T0,$D0,$D0
- vpaddq $T1,$D1,$D1
- vmovdqa -0x80(%r11),$T2 # r1^4
- vpmuludq $H2,$T4,$T0 # h2*r0
- vpmuludq $H3,$T4,$T1 # h3*r0
- vpaddq $T0,$D2,$D2
- vpaddq $T1,$D3,$D3
- vpmuludq $H4,$T4,$T4 # h4*r0
- vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
- vpaddq $T4,$D4,$D4
-
- vpaddq $T0,$D0,$D0 # d0 += h4*s1
- vpmuludq $H2,$T2,$T1 # h2*r1
- vpmuludq $H3,$T2,$T0 # h3*r1
- vpaddq $T1,$D3,$D3 # d3 += h2*r1
- vmovdqa -0x60(%r11),$T3 # r2^4
- vpaddq $T0,$D4,$D4 # d4 += h3*r1
- vpmuludq $H1,$T2,$T1 # h1*r1
- vpmuludq $H0,$T2,$T2 # h0*r1
- vpaddq $T1,$D2,$D2 # d2 += h1*r1
- vpaddq $T2,$D1,$D1 # d1 += h0*r1
-
- vmovdqa -0x50(%r11),$T4 # s2^4
- vpmuludq $H2,$T3,$T0 # h2*r2
- vpmuludq $H1,$T3,$T1 # h1*r2
- vpaddq $T0,$D4,$D4 # d4 += h2*r2
- vpaddq $T1,$D3,$D3 # d3 += h1*r2
- vmovdqa -0x40(%r11),$T2 # r3^4
- vpmuludq $H0,$T3,$T3 # h0*r2
- vpmuludq $H4,$T4,$T0 # h4*s2
- vpaddq $T3,$D2,$D2 # d2 += h0*r2
- vpaddq $T0,$D1,$D1 # d1 += h4*s2
- vmovdqa -0x30(%r11),$T3 # s3^4
- vpmuludq $H3,$T4,$T4 # h3*s2
- vpmuludq $H1,$T2,$T1 # h1*r3
- vpaddq $T4,$D0,$D0 # d0 += h3*s2
-
- vmovdqa -0x10(%r11),$T4 # s4^4
- vpaddq $T1,$D4,$D4 # d4 += h1*r3
- vpmuludq $H0,$T2,$T2 # h0*r3
- vpmuludq $H4,$T3,$T0 # h4*s3
- vpaddq $T2,$D3,$D3 # d3 += h0*r3
- vpaddq $T0,$D2,$D2 # d2 += h4*s3
- vmovdqu 16*2($inp),$T0 # load input
- vpmuludq $H3,$T3,$T2 # h3*s3
- vpmuludq $H2,$T3,$T3 # h2*s3
- vpaddq $T2,$D1,$D1 # d1 += h3*s3
- vmovdqu 16*3($inp),$T1 #
- vpaddq $T3,$D0,$D0 # d0 += h2*s3
-
- vpmuludq $H2,$T4,$H2 # h2*s4
- vpmuludq $H3,$T4,$H3 # h3*s4
- vpsrldq \$6,$T0,$T2 # splat input
- vpaddq $H2,$D1,$D1 # d1 += h2*s4
- vpmuludq $H4,$T4,$H4 # h4*s4
- vpsrldq \$6,$T1,$T3 #
- vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
- vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
- vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
- vpmuludq $H1,$T4,$H0
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
- vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
-
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpunpcklqdq $T3,$T2,$T3 # 2:3
-
- #vpsrlq \$40,$T4,$T4 # 4
- vpsrldq \$`40/8`,$T4,$T4 # 4
- vpsrlq \$26,$T0,$T1
- vmovdqa 0x00(%rsp),$D4 # preload r0^2
- vpand $MASK,$T0,$T0 # 0
- vpsrlq \$4,$T3,$T2
- vpand $MASK,$T1,$T1 # 1
- vpand 0(%rcx),$T4,$T4 # .Lmask24
- vpsrlq \$30,$T3,$T3
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- ################################################################
- # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- # and P. Schwabe
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$D1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D0
- vpand $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D0,$H0,$H0
- vpsllq \$2,$D0,$D0
- vpaddq $D0,$H0,$H0 # h4 -> h0
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- ja .Loop_avx
-
-.Lskip_loop_avx:
- ################################################################
- # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
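-	# i.e. give each of the two 64-bit lanes its own power (r^2 for one,
-	# r for the other) so that the horizontal addition in .Lshort_tail_avx
-	# can fold the two interleaved accumulators into a single hash value.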
-
- vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
- add \$32,$len
- jnz .Long_tail_avx
-
- vpaddq $H2,$T2,$T2
- vpaddq $H0,$T0,$T0
- vpaddq $H1,$T1,$T1
- vpaddq $H3,$T3,$T3
- vpaddq $H4,$T4,$T4
-
-.Long_tail_avx:
- vmovdqa $H2,0x20(%r11)
- vmovdqa $H0,0x00(%r11)
- vmovdqa $H1,0x10(%r11)
- vmovdqa $H3,0x30(%r11)
- vmovdqa $H4,0x40(%r11)
-
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- vpmuludq $T2,$D4,$D2 # d2 = h2*r0
- vpmuludq $T0,$D4,$D0 # d0 = h0*r0
- vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
- vpmuludq $T1,$D4,$D1 # d1 = h1*r0
- vpmuludq $T3,$D4,$D3 # d3 = h3*r0
- vpmuludq $T4,$D4,$D4 # d4 = h4*r0
-
- vpmuludq $T3,$H2,$H0 # h3*r1
- vpaddq $H0,$D4,$D4 # d4 += h3*r1
- vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
- vpmuludq $T2,$H2,$H1 # h2*r1
- vpaddq $H1,$D3,$D3 # d3 += h2*r1
- vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
- vpmuludq $T1,$H2,$H0 # h1*r1
- vpaddq $H0,$D2,$D2 # d2 += h1*r1
- vpmuludq $T0,$H2,$H2 # h0*r1
- vpaddq $H2,$D1,$D1 # d1 += h0*r1
- vpmuludq $T4,$H3,$H3 # h4*s1
- vpaddq $H3,$D0,$D0 # d0 += h4*s1
-
- vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
- vpmuludq $T2,$H4,$H1 # h2*r2
- vpaddq $H1,$D4,$D4 # d4 += h2*r2
- vpmuludq $T1,$H4,$H0 # h1*r2
- vpaddq $H0,$D3,$D3 # d3 += h1*r2
- vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
- vpmuludq $T0,$H4,$H4 # h0*r2
- vpaddq $H4,$D2,$D2 # d2 += h0*r2
- vpmuludq $T4,$H2,$H1 # h4*s2
- vpaddq $H1,$D1,$D1 # d1 += h4*s2
- vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
- vpmuludq $T3,$H2,$H2 # h3*s2
- vpaddq $H2,$D0,$D0 # d0 += h3*s2
-
- vpmuludq $T1,$H3,$H0 # h1*r3
- vpaddq $H0,$D4,$D4 # d4 += h1*r3
- vpmuludq $T0,$H3,$H3 # h0*r3
- vpaddq $H3,$D3,$D3 # d3 += h0*r3
- vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
- vpmuludq $T4,$H4,$H1 # h4*s3
- vpaddq $H1,$D2,$D2 # d2 += h4*s3
- vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
- vpmuludq $T3,$H4,$H0 # h3*s3
- vpaddq $H0,$D1,$D1 # d1 += h3*s3
- vpmuludq $T2,$H4,$H4 # h2*s3
- vpaddq $H4,$D0,$D0 # d0 += h2*s3
-
- vpmuludq $T0,$H2,$H2 # h0*r4
- vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
- vpmuludq $T4,$H3,$H1 # h4*s4
- vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
- vpmuludq $T3,$H3,$H0 # h3*s4
- vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
- vpmuludq $T2,$H3,$H1 # h2*s4
- vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
- vpmuludq $T1,$H3,$H3 # h1*s4
- vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
-
- jz .Lshort_tail_avx
-
- vmovdqu 16*0($inp),$H0 # load input
- vmovdqu 16*1($inp),$H1
-
- vpsrldq \$6,$H0,$H2 # splat input
- vpsrldq \$6,$H1,$H3
- vpunpckhqdq $H1,$H0,$H4 # 4
- vpunpcklqdq $H1,$H0,$H0 # 0:1
- vpunpcklqdq $H3,$H2,$H3 # 2:3
-
- vpsrlq \$40,$H4,$H4 # 4
- vpsrlq \$26,$H0,$H1
- vpand $MASK,$H0,$H0 # 0
- vpsrlq \$4,$H3,$H2
- vpand $MASK,$H1,$H1 # 1
- vpsrlq \$30,$H3,$H3
- vpand $MASK,$H2,$H2 # 2
- vpand $MASK,$H3,$H3 # 3
- vpor 32(%rcx),$H4,$H4 # padbit, yes, always
-
- vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
- vpaddq 0x00(%r11),$H0,$H0
- vpaddq 0x10(%r11),$H1,$H1
- vpaddq 0x20(%r11),$H2,$H2
- vpaddq 0x30(%r11),$H3,$H3
- vpaddq 0x40(%r11),$H4,$H4
-
- ################################################################
- # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
-
- vpmuludq $H0,$T4,$T0 # h0*r0
- vpaddq $T0,$D0,$D0 # d0 += h0*r0
- vpmuludq $H1,$T4,$T1 # h1*r0
- vpaddq $T1,$D1,$D1 # d1 += h1*r0
- vpmuludq $H2,$T4,$T0 # h2*r0
- vpaddq $T0,$D2,$D2 # d2 += h2*r0
- vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
- vpmuludq $H3,$T4,$T1 # h3*r0
- vpaddq $T1,$D3,$D3 # d3 += h3*r0
- vpmuludq $H4,$T4,$T4 # h4*r0
- vpaddq $T4,$D4,$D4 # d4 += h4*r0
-
- vpmuludq $H3,$T2,$T0 # h3*r1
- vpaddq $T0,$D4,$D4 # d4 += h3*r1
- vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
- vpmuludq $H2,$T2,$T1 # h2*r1
- vpaddq $T1,$D3,$D3 # d3 += h2*r1
- vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
- vpmuludq $H1,$T2,$T0 # h1*r1
- vpaddq $T0,$D2,$D2 # d2 += h1*r1
- vpmuludq $H0,$T2,$T2 # h0*r1
- vpaddq $T2,$D1,$D1 # d1 += h0*r1
- vpmuludq $H4,$T3,$T3 # h4*s1
- vpaddq $T3,$D0,$D0 # d0 += h4*s1
-
- vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
- vpmuludq $H2,$T4,$T1 # h2*r2
- vpaddq $T1,$D4,$D4 # d4 += h2*r2
- vpmuludq $H1,$T4,$T0 # h1*r2
- vpaddq $T0,$D3,$D3 # d3 += h1*r2
- vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
- vpmuludq $H0,$T4,$T4 # h0*r2
- vpaddq $T4,$D2,$D2 # d2 += h0*r2
- vpmuludq $H4,$T2,$T1 # h4*s2
- vpaddq $T1,$D1,$D1 # d1 += h4*s2
- vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
- vpmuludq $H3,$T2,$T2 # h3*s2
- vpaddq $T2,$D0,$D0 # d0 += h3*s2
-
- vpmuludq $H1,$T3,$T0 # h1*r3
- vpaddq $T0,$D4,$D4 # d4 += h1*r3
- vpmuludq $H0,$T3,$T3 # h0*r3
- vpaddq $T3,$D3,$D3 # d3 += h0*r3
- vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
- vpmuludq $H4,$T4,$T1 # h4*s3
- vpaddq $T1,$D2,$D2 # d2 += h4*s3
- vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
- vpmuludq $H3,$T4,$T0 # h3*s3
- vpaddq $T0,$D1,$D1 # d1 += h3*s3
- vpmuludq $H2,$T4,$T4 # h2*s3
- vpaddq $T4,$D0,$D0 # d0 += h2*s3
-
- vpmuludq $H0,$T2,$T2 # h0*r4
- vpaddq $T2,$D4,$D4 # d4 += h0*r4
- vpmuludq $H4,$T3,$T1 # h4*s4
- vpaddq $T1,$D3,$D3 # d3 += h4*s4
- vpmuludq $H3,$T3,$T0 # h3*s4
- vpaddq $T0,$D2,$D2 # d2 += h3*s4
- vpmuludq $H2,$T3,$T1 # h2*s4
- vpaddq $T1,$D1,$D1 # d1 += h2*s4
- vpmuludq $H1,$T3,$T3 # h1*s4
- vpaddq $T3,$D0,$D0 # d0 += h1*s4
-
-.Lshort_tail_avx:
- ################################################################
- # horizontal addition
-
- vpsrldq \$8,$D4,$T4
- vpsrldq \$8,$D3,$T3
- vpsrldq \$8,$D1,$T1
- vpsrldq \$8,$D0,$T0
- vpsrldq \$8,$D2,$T2
- vpaddq $T3,$D3,$D3
- vpaddq $T4,$D4,$D4
- vpaddq $T0,$D0,$D0
- vpaddq $T1,$D1,$D1
- vpaddq $T2,$D2,$D2
-
- ################################################################
- # lazy reduction
-
- vpsrlq \$26,$D3,$H3
- vpand $MASK,$D3,$D3
- vpaddq $H3,$D4,$D4 # h3 -> h4
-
- vpsrlq \$26,$D0,$H0
- vpand $MASK,$D0,$D0
- vpaddq $H0,$D1,$D1 # h0 -> h1
-
- vpsrlq \$26,$D4,$H4
- vpand $MASK,$D4,$D4
-
- vpsrlq \$26,$D1,$H1
- vpand $MASK,$D1,$D1
- vpaddq $H1,$D2,$D2 # h1 -> h2
-
- vpaddq $H4,$D0,$D0
- vpsllq \$2,$H4,$H4
- vpaddq $H4,$D0,$D0 # h4 -> h0
-
- vpsrlq \$26,$D2,$H2
- vpand $MASK,$D2,$D2
- vpaddq $H2,$D3,$D3 # h2 -> h3
-
- vpsrlq \$26,$D0,$H0
- vpand $MASK,$D0,$D0
- vpaddq $H0,$D1,$D1 # h0 -> h1
-
- vpsrlq \$26,$D3,$H3
- vpand $MASK,$D3,$D3
- vpaddq $H3,$D4,$D4 # h3 -> h4
-
- vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
- vmovd $D1,`4*1-48-64`($ctx)
- vmovd $D2,`4*2-48-64`($ctx)
- vmovd $D3,`4*3-48-64`($ctx)
- vmovd $D4,`4*4-48-64`($ctx)
-___
-$code.=<<___ if ($win64);
- vmovdqa 0x50(%r11),%xmm6
- vmovdqa 0x60(%r11),%xmm7
- vmovdqa 0x70(%r11),%xmm8
- vmovdqa 0x80(%r11),%xmm9
- vmovdqa 0x90(%r11),%xmm10
- vmovdqa 0xa0(%r11),%xmm11
- vmovdqa 0xb0(%r11),%xmm12
- vmovdqa 0xc0(%r11),%xmm13
- vmovdqa 0xd0(%r11),%xmm14
- vmovdqa 0xe0(%r11),%xmm15
- lea 0xf8(%r11),%rsp
-.Ldo_avx_epilogue:
-___
-$code.=<<___ if (!$win64);
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-___
-$code.=<<___;
- vzeroupper
- ret
-.cfi_endproc
-___
-&end_function("poly1305_blocks_avx");
-
-&declare_function("poly1305_emit_avx", 32, 3);
-$code.=<<___;
- cmpl \$0,20($ctx) # is_base2_26?
- je .Lemit
-
- mov 0($ctx),%eax # load hash value base 2^26
- mov 4($ctx),%ecx
- mov 8($ctx),%r8d
- mov 12($ctx),%r11d
- mov 16($ctx),%r10d
-
- shl \$26,%rcx # base 2^26 -> base 2^64
- mov %r8,%r9
- shl \$52,%r8
- add %rcx,%rax
- shr \$12,%r9
- add %rax,%r8 # h0
- adc \$0,%r9
-
- shl \$14,%r11
- mov %r10,%rax
- shr \$24,%r10
- add %r11,%r9
- shl \$40,%rax
- add %rax,%r9 # h1
- adc \$0,%r10 # h2
-
- mov %r10,%rax # could be partially reduced, so reduce
- mov %r10,%rcx
- and \$3,%r10
- shr \$2,%rax
- and \$-4,%rcx
- add %rcx,%rax
- add %rax,%r8
- adc \$0,%r9
- adc \$0,%r10
-
- mov %r8,%rax
- add \$5,%r8 # compare to modulus
- mov %r9,%rcx
- adc \$0,%r9
- adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overflow?
- cmovnz %r8,%rax
- cmovnz %r9,%rcx
-
- add 0($nonce),%rax # accumulate nonce
- adc 8($nonce),%rcx
- mov %rax,0($mac) # write result
- mov %rcx,8($mac)
-
- ret
-___
-&end_function("poly1305_emit_avx");
-
-if ($kernel) {
- $code .= "#endif\n";
-}
-
-if ($avx>1) {
-
-if ($kernel) {
- $code .= "#ifdef CONFIG_AS_AVX2\n";
-}
-
-my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
- map("%ymm$_",(0..15));
-my $S4=$MASK;
-
-sub poly1305_blocks_avxN {
- my ($avx512) = @_;
- my $suffix = $avx512 ? "_avx512" : "";
-$code.=<<___;
-.cfi_startproc
- mov 20($ctx),%r8d # is_base2_26
- cmp \$128,$len
- jae .Lblocks_avx2$suffix
- test %r8d,%r8d
- jz .Lblocks
-
-.Lblocks_avx2$suffix:
- and \$-16,$len
- jz .Lno_data_avx2$suffix
-
- vzeroupper
-
- test %r8d,%r8d
- jz .Lbase2_64_avx2$suffix
-
- test \$63,$len
- jz .Leven_avx2$suffix
-
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lblocks_avx2_body$suffix:
-
- mov $len,%r15 # reassign $len
-
- mov 0($ctx),$d1 # load hash value
- mov 8($ctx),$d2
- mov 16($ctx),$h2#d
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- ################################# base 2^26 -> base 2^64
- mov $d1#d,$h0#d
- and \$`-1*(1<<31)`,$d1
- mov $d2,$r1 # borrow $r1
- mov $d2#d,$h1#d
- and \$`-1*(1<<31)`,$d2
-
- shr \$6,$d1
- shl \$52,$r1
- add $d1,$h0
- shr \$12,$h1
- shr \$18,$d2
- add $r1,$h0
- adc $d2,$h1
-
- mov $h2,$d1
- shl \$40,$d1
- shr \$24,$h2
- add $d1,$h1
- adc \$0,$h2 # can be partially reduced...
-
- mov \$-4,$d2 # ... so reduce
- mov $h2,$d1
- and $h2,$d2
- shr \$2,$d1
- and \$3,$h2
- add $d2,$d1 # =*5
- add $d1,$h0
- adc \$0,$h1
- adc \$0,$h2
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
-.Lbase2_26_pre_avx2$suffix:
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
- sub \$16,%r15
-
- call __poly1305_block
- mov $r1,%rax
-
- test \$63,%r15
- jnz .Lbase2_26_pre_avx2$suffix
-
- test $padbit,$padbit # if $padbit is zero,
- jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format
-
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$r0
- mov $h1,$r1
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$r0
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $r0,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$r1
- and \$0x3ffffff,$h1 # h[3]
- or $r1,$h2 # h[4]
-
- test %r15,%r15
- jz .Lstore_base2_26_avx2$suffix
-
- vmovd %rax#d,%x#$H0
- vmovd %rdx#d,%x#$H1
- vmovd $h0#d,%x#$H2
- vmovd $h1#d,%x#$H3
- vmovd $h2#d,%x#$H4
- jmp .Lproceed_avx2$suffix
-
-.align 32
-.Lstore_base2_64_avx2$suffix:
- mov $h0,0($ctx)
- mov $h1,8($ctx)
- mov $h2,16($ctx) # note that is_base2_26 is zeroed
- jmp .Ldone_avx2$suffix
-
-.align 16
-.Lstore_base2_26_avx2$suffix:
- mov %rax#d,0($ctx) # store hash value base 2^26
- mov %rdx#d,4($ctx)
- mov $h0#d,8($ctx)
- mov $h1#d,12($ctx)
- mov $h2#d,16($ctx)
-.align 16
-.Ldone_avx2$suffix:
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lno_data_avx2$suffix:
-.Lblocks_avx2_epilogue$suffix:
- ret
-.cfi_endproc
-
-.align 32
-.Lbase2_64_avx2$suffix:
-.cfi_startproc
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lbase2_64_avx2_body$suffix:
-
- mov $len,%r15 # reassign $len
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- mov 0($ctx),$h0 # load hash value
- mov 8($ctx),$h1
- mov 16($ctx),$h2#d
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
- test \$63,$len
- jz .Linit_avx2$suffix
-
-.Lbase2_64_pre_avx2$suffix:
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
- sub \$16,%r15
-
- call __poly1305_block
- mov $r1,%rax
-
- test \$63,%r15
- jnz .Lbase2_64_pre_avx2$suffix
-
-.Linit_avx2$suffix:
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$d1
- mov $h1,$d2
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$d1
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $d1,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$d2
- and \$0x3ffffff,$h1 # h[3]
- or $d2,$h2 # h[4]
-
- vmovd %rax#d,%x#$H0
- vmovd %rdx#d,%x#$H1
- vmovd $h0#d,%x#$H2
- vmovd $h1#d,%x#$H3
- vmovd $h2#d,%x#$H4
- movl \$1,20($ctx) # set is_base2_26
-
- call __poly1305_init_avx
-
-.Lproceed_avx2$suffix:
- mov %r15,$len # restore $len
-___
-$code.=<<___ if (!$kernel);
- mov OPENSSL_ia32cap_P+8(%rip),%r9d
- mov \$`(1<<31|1<<30|1<<16)`,%r11d
-___
-$code.=<<___;
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lbase2_64_avx2_epilogue$suffix:
- jmp .Ldo_avx2$suffix
-.cfi_endproc
-
-.align 32
-.Leven_avx2$suffix:
-.cfi_startproc
-___
-$code.=<<___ if (!$kernel);
- mov OPENSSL_ia32cap_P+8(%rip),%r9d
-___
-$code.=<<___;
- vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
- vmovd 4*1($ctx),%x#$H1
- vmovd 4*2($ctx),%x#$H2
- vmovd 4*3($ctx),%x#$H3
- vmovd 4*4($ctx),%x#$H4
-
-.Ldo_avx2$suffix:
-___
-$code.=<<___ if (!$kernel && $avx>2);
- cmp \$512,$len
- jb .Lskip_avx512
- and %r11d,%r9d
- test \$`1<<16`,%r9d # check for AVX512F
- jnz .Lblocks_avx512
-.Lskip_avx512$suffix:
-___
-$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
- cmp \$512,$len
- jae .Lblocks_avx512
-___
-$code.=<<___ if (!$win64);
- lea 8(%rsp),%r10
-.cfi_def_cfa_register %r10
- sub \$0x128,%rsp
-___
-$code.=<<___ if ($win64);
- lea 8(%rsp),%r10
- sub \$0x1c8,%rsp
- vmovdqa %xmm6,-0xb0(%r10)
- vmovdqa %xmm7,-0xa0(%r10)
- vmovdqa %xmm8,-0x90(%r10)
- vmovdqa %xmm9,-0x80(%r10)
- vmovdqa %xmm10,-0x70(%r10)
- vmovdqa %xmm11,-0x60(%r10)
- vmovdqa %xmm12,-0x50(%r10)
- vmovdqa %xmm13,-0x40(%r10)
- vmovdqa %xmm14,-0x30(%r10)
- vmovdqa %xmm15,-0x20(%r10)
-.Ldo_avx2_body$suffix:
-___
-$code.=<<___;
- lea .Lconst(%rip),%rcx
- lea 48+64($ctx),$ctx # size optimization
- vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
-
- # expand and copy pre-calculated table to stack
- vmovdqu `16*0-64`($ctx),%x#$T2
- and \$-512,%rsp
- vmovdqu `16*1-64`($ctx),%x#$T3
- vmovdqu `16*2-64`($ctx),%x#$T4
- vmovdqu `16*3-64`($ctx),%x#$D0
- vmovdqu `16*4-64`($ctx),%x#$D1
- vmovdqu `16*5-64`($ctx),%x#$D2
- lea 0x90(%rsp),%rax # size optimization
- vmovdqu `16*6-64`($ctx),%x#$D3
- vpermd $T2,$T0,$T2 # 00003412 -> 14243444
- vmovdqu `16*7-64`($ctx),%x#$D4
- vpermd $T3,$T0,$T3
- vmovdqu `16*8-64`($ctx),%x#$MASK
- vpermd $T4,$T0,$T4
- vmovdqa $T2,0x00(%rsp)
- vpermd $D0,$T0,$D0
- vmovdqa $T3,0x20-0x90(%rax)
- vpermd $D1,$T0,$D1
- vmovdqa $T4,0x40-0x90(%rax)
- vpermd $D2,$T0,$D2
- vmovdqa $D0,0x60-0x90(%rax)
- vpermd $D3,$T0,$D3
- vmovdqa $D1,0x80-0x90(%rax)
- vpermd $D4,$T0,$D4
- vmovdqa $D2,0xa0-0x90(%rax)
- vpermd $MASK,$T0,$MASK
- vmovdqa $D3,0xc0-0x90(%rax)
- vmovdqa $D4,0xe0-0x90(%rax)
- vmovdqa $MASK,0x100-0x90(%rax)
- vmovdqa 64(%rcx),$MASK # .Lmask26
-
- ################################################################
- # load input
- vmovdqu 16*0($inp),%x#$T0
- vmovdqu 16*1($inp),%x#$T1
- vinserti128 \$1,16*2($inp),$T0,$T0
- vinserti128 \$1,16*3($inp),$T1,$T1
- lea 16*4($inp),$inp
-
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpunpcklqdq $T3,$T2,$T2 # 2:3
- vpunpcklqdq $T1,$T0,$T0 # 0:1
-
- vpsrlq \$30,$T2,$T3
- vpsrlq \$4,$T2,$T2
- vpsrlq \$26,$T0,$T1
- vpsrlq \$40,$T4,$T4 # 4
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T0,$T0 # 0
- vpand $MASK,$T1,$T1 # 1
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- vpaddq $H2,$T2,$H2 # accumulate input
- sub \$64,$len
- jz .Ltail_avx2$suffix
- jmp .Loop_avx2$suffix
-
-.align 32
-.Loop_avx2$suffix:
- ################################################################
- # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
- # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
- # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
- # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
- # \________/\__________/
- ################################################################
- #vpaddq $H2,$T2,$H2 # accumulate input
- vpaddq $H0,$T0,$H0
- vmovdqa `32*0`(%rsp),$T0 # r0^4
- vpaddq $H1,$T1,$H1
- vmovdqa `32*1`(%rsp),$T1 # r1^4
- vpaddq $H3,$T3,$H3
- vmovdqa `32*3`(%rsp),$T2 # r2^4
- vpaddq $H4,$T4,$H4
- vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
- vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
-
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- #
-	# however, as h2 is "chronologically" the first one available, we
-	# pull the corresponding operations up, so it's
- #
- # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
- # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
- # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
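-	# ($H2 already has its input limb added: that accumulation is
-	# modulo-scheduled, performed before the loop is entered and at the
-	# end of the previous iteration, which is what makes h2 available
-	# first.)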
-
- vpmuludq $H2,$T0,$D2 # d2 = h2*r0
- vpmuludq $H2,$T1,$D3 # d3 = h2*r1
- vpmuludq $H2,$T2,$D4 # d4 = h2*r2
- vpmuludq $H2,$T3,$D0 # d0 = h2*s3
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
-
- vpmuludq $H0,$T1,$T4 # h0*r1
- vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
- vpaddq $T4,$D1,$D1 # d1 += h0*r1
- vpaddq $H2,$D2,$D2 # d2 += h1*r1
- vpmuludq $H3,$T1,$T4 # h3*r1
- vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
- vpaddq $T4,$D4,$D4 # d4 += h3*r1
- vpaddq $H2,$D0,$D0 # d0 += h4*s1
- vmovdqa `32*4-0x90`(%rax),$T1 # s2
-
- vpmuludq $H0,$T0,$T4 # h0*r0
- vpmuludq $H1,$T0,$H2 # h1*r0
- vpaddq $T4,$D0,$D0 # d0 += h0*r0
- vpaddq $H2,$D1,$D1 # d1 += h1*r0
- vpmuludq $H3,$T0,$T4 # h3*r0
- vpmuludq $H4,$T0,$H2 # h4*r0
- vmovdqu 16*0($inp),%x#$T0 # load input
- vpaddq $T4,$D3,$D3 # d3 += h3*r0
- vpaddq $H2,$D4,$D4 # d4 += h4*r0
- vinserti128 \$1,16*2($inp),$T0,$T0
-
- vpmuludq $H3,$T1,$T4 # h3*s2
- vpmuludq $H4,$T1,$H2 # h4*s2
- vmovdqu 16*1($inp),%x#$T1
- vpaddq $T4,$D0,$D0 # d0 += h3*s2
- vpaddq $H2,$D1,$D1 # d1 += h4*s2
- vmovdqa `32*5-0x90`(%rax),$H2 # r3
- vpmuludq $H1,$T2,$T4 # h1*r2
- vpmuludq $H0,$T2,$T2 # h0*r2
- vpaddq $T4,$D3,$D3 # d3 += h1*r2
- vpaddq $T2,$D2,$D2 # d2 += h0*r2
- vinserti128 \$1,16*3($inp),$T1,$T1
- lea 16*4($inp),$inp
-
- vpmuludq $H1,$H2,$T4 # h1*r3
- vpmuludq $H0,$H2,$H2 # h0*r3
- vpsrldq \$6,$T0,$T2 # splat input
- vpaddq $T4,$D4,$D4 # d4 += h1*r3
- vpaddq $H2,$D3,$D3 # d3 += h0*r3
- vpmuludq $H3,$T3,$T4 # h3*s3
- vpmuludq $H4,$T3,$H2 # h4*s3
- vpsrldq \$6,$T1,$T3
- vpaddq $T4,$D1,$D1 # d1 += h3*s3
- vpaddq $H2,$D2,$D2 # d2 += h4*s3
- vpunpckhqdq $T1,$T0,$T4 # 4
-
- vpmuludq $H3,$S4,$H3 # h3*s4
- vpmuludq $H4,$S4,$H4 # h4*s4
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
- vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
- vpunpcklqdq $T3,$T2,$T3 # 2:3
- vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
- vpmuludq $H1,$S4,$H0 # h1*s4
- vmovdqa 64(%rcx),$MASK # .Lmask26
- vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
- vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
-
- ################################################################
- # lazy reduction (interleaved with tail of input splat)
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$D1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D4
- vpand $MASK,$H4,$H4
-
- vpsrlq \$4,$T3,$T2
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpand $MASK,$T2,$T2 # 2
- vpsrlq \$26,$T0,$T1
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpaddq $T2,$H2,$H2 # modulo-scheduled
- vpsrlq \$30,$T3,$T3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$40,$T4,$T4 # 4
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpand $MASK,$T0,$T0 # 0
- vpand $MASK,$T1,$T1 # 1
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- sub \$64,$len
- jnz .Loop_avx2$suffix
-
- .byte 0x66,0x90
-.Ltail_avx2$suffix:
- ################################################################
-	# while the multiplications above were by r^4 in all lanes, in the
-	# last iteration we multiply the least significant lane by r^4 and
-	# the most significant one by r, so this is a copy of the above,
-	# except that references to the precomputed table are displaced by 4...
-
- #vpaddq $H2,$T2,$H2 # accumulate input
- vpaddq $H0,$T0,$H0
- vmovdqu `32*0+4`(%rsp),$T0 # r0^4
- vpaddq $H1,$T1,$H1
- vmovdqu `32*1+4`(%rsp),$T1 # r1^4
- vpaddq $H3,$T3,$H3
- vmovdqu `32*3+4`(%rsp),$T2 # r2^4
- vpaddq $H4,$T4,$H4
- vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
- vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
-
- vpmuludq $H2,$T0,$D2 # d2 = h2*r0
- vpmuludq $H2,$T1,$D3 # d3 = h2*r1
- vpmuludq $H2,$T2,$D4 # d4 = h2*r2
- vpmuludq $H2,$T3,$D0 # d0 = h2*s3
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
-
- vpmuludq $H0,$T1,$T4 # h0*r1
- vpmuludq $H1,$T1,$H2 # h1*r1
- vpaddq $T4,$D1,$D1 # d1 += h0*r1
- vpaddq $H2,$D2,$D2 # d2 += h1*r1
- vpmuludq $H3,$T1,$T4 # h3*r1
- vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
- vpaddq $T4,$D4,$D4 # d4 += h3*r1
- vpaddq $H2,$D0,$D0 # d0 += h4*s1
-
- vpmuludq $H0,$T0,$T4 # h0*r0
- vpmuludq $H1,$T0,$H2 # h1*r0
- vpaddq $T4,$D0,$D0 # d0 += h0*r0
- vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
- vpaddq $H2,$D1,$D1 # d1 += h1*r0
- vpmuludq $H3,$T0,$T4 # h3*r0
- vpmuludq $H4,$T0,$H2 # h4*r0
- vpaddq $T4,$D3,$D3 # d3 += h3*r0
- vpaddq $H2,$D4,$D4 # d4 += h4*r0
-
- vpmuludq $H3,$T1,$T4 # h3*s2
- vpmuludq $H4,$T1,$H2 # h4*s2
- vpaddq $T4,$D0,$D0 # d0 += h3*s2
- vpaddq $H2,$D1,$D1 # d1 += h4*s2
- vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
- vpmuludq $H1,$T2,$T4 # h1*r2
- vpmuludq $H0,$T2,$T2 # h0*r2
- vpaddq $T4,$D3,$D3 # d3 += h1*r2
- vpaddq $T2,$D2,$D2 # d2 += h0*r2
-
- vpmuludq $H1,$H2,$T4 # h1*r3
- vpmuludq $H0,$H2,$H2 # h0*r3
- vpaddq $T4,$D4,$D4 # d4 += h1*r3
- vpaddq $H2,$D3,$D3 # d3 += h0*r3
- vpmuludq $H3,$T3,$T4 # h3*s3
- vpmuludq $H4,$T3,$H2 # h4*s3
- vpaddq $T4,$D1,$D1 # d1 += h3*s3
- vpaddq $H2,$D2,$D2 # d2 += h4*s3
-
- vpmuludq $H3,$S4,$H3 # h3*s4
- vpmuludq $H4,$S4,$H4 # h4*s4
- vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
- vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
- vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
- vpmuludq $H1,$S4,$H0 # h1*s4
- vmovdqa 64(%rcx),$MASK # .Lmask26
- vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
- vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
-
- ################################################################
- # horizontal addition
-
- vpsrldq \$8,$D1,$T1
- vpsrldq \$8,$H2,$T2
- vpsrldq \$8,$H3,$T3
- vpsrldq \$8,$H4,$T4
- vpsrldq \$8,$H0,$T0
- vpaddq $T1,$D1,$D1
- vpaddq $T2,$H2,$H2
- vpaddq $T3,$H3,$H3
- vpaddq $T4,$H4,$H4
- vpaddq $T0,$H0,$H0
-
- vpermq \$0x2,$H3,$T3
- vpermq \$0x2,$H4,$T4
- vpermq \$0x2,$H0,$T0
- vpermq \$0x2,$D1,$T1
- vpermq \$0x2,$H2,$T2
- vpaddq $T3,$H3,$H3
- vpaddq $T4,$H4,$H4
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$D1,$D1
- vpaddq $T2,$H2,$H2
-
- ################################################################
- # lazy reduction
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$D1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D4
- vpand $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
- vmovd %x#$H1,`4*1-48-64`($ctx)
- vmovd %x#$H2,`4*2-48-64`($ctx)
- vmovd %x#$H3,`4*3-48-64`($ctx)
- vmovd %x#$H4,`4*4-48-64`($ctx)
-___
-$code.=<<___ if ($win64);
- vmovdqa -0xb0(%r10),%xmm6
- vmovdqa -0xa0(%r10),%xmm7
- vmovdqa -0x90(%r10),%xmm8
- vmovdqa -0x80(%r10),%xmm9
- vmovdqa -0x70(%r10),%xmm10
- vmovdqa -0x60(%r10),%xmm11
- vmovdqa -0x50(%r10),%xmm12
- vmovdqa -0x40(%r10),%xmm13
- vmovdqa -0x30(%r10),%xmm14
- vmovdqa -0x20(%r10),%xmm15
- lea -8(%r10),%rsp
-.Ldo_avx2_epilogue$suffix:
-___
-$code.=<<___ if (!$win64);
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-___
-$code.=<<___;
- vzeroupper
- ret
-.cfi_endproc
-___
-if($avx > 2 && $avx512) {
-my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
-my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
-my $PADBIT="%zmm30";
-
-map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
-map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
-map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
-map(s/%y/%z/,($MASK));
-
-$code.=<<___;
-.cfi_startproc
-.Lblocks_avx512:
- mov \$15,%eax
- kmovw %eax,%k2
-___
-$code.=<<___ if (!$win64);
- lea 8(%rsp),%r10
-.cfi_def_cfa_register %r10
- sub \$0x128,%rsp
-___
-$code.=<<___ if ($win64);
- lea 8(%rsp),%r10
- sub \$0x1c8,%rsp
- vmovdqa %xmm6,-0xb0(%r10)
- vmovdqa %xmm7,-0xa0(%r10)
- vmovdqa %xmm8,-0x90(%r10)
- vmovdqa %xmm9,-0x80(%r10)
- vmovdqa %xmm10,-0x70(%r10)
- vmovdqa %xmm11,-0x60(%r10)
- vmovdqa %xmm12,-0x50(%r10)
- vmovdqa %xmm13,-0x40(%r10)
- vmovdqa %xmm14,-0x30(%r10)
- vmovdqa %xmm15,-0x20(%r10)
-.Ldo_avx512_body:
-___
-$code.=<<___;
- lea .Lconst(%rip),%rcx
- lea 48+64($ctx),$ctx # size optimization
- vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
-
- # expand pre-calculated table
- vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
- and \$-512,%rsp
- vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
- mov \$0x20,%rax
- vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
- vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
- vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
- vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
- vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
- vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
- vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
- vpermd $D0,$T2,$R0 # 00003412 -> 14243444
- vpbroadcastq 64(%rcx),$MASK # .Lmask26
- vpermd $D1,$T2,$R1
- vpermd $T0,$T2,$S1
- vpermd $D2,$T2,$R2
- vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
- vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
- vpermd $T1,$T2,$S2
- vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
- vpsrlq \$32,$R1,$T1
- vpermd $D3,$T2,$R3
- vmovdqa64 $S1,0x40(%rsp){%k2}
- vpermd $T3,$T2,$S3
- vpermd $D4,$T2,$R4
- vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
- vpermd $T4,$T2,$S4
- vmovdqa64 $S2,0x80(%rsp){%k2}
- vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
- vmovdqa64 $S3,0xc0(%rsp){%k2}
- vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
- vmovdqa64 $S4,0x100(%rsp){%k2}
-
- ################################################################
- # calculate 5th through 8th powers of the key
- #
- # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
- # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
- # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
- # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
- # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
-
- vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
- vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
- vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
- vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
- vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
- vpsrlq \$32,$R2,$T2
-
- vpmuludq $T1,$S4,$M0
- vpmuludq $T1,$R0,$M1
- vpmuludq $T1,$R1,$M2
- vpmuludq $T1,$R2,$M3
- vpmuludq $T1,$R3,$M4
- vpsrlq \$32,$R3,$T3
- vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
- vpaddq $M1,$D1,$D1 # d1 += r1'*r0
- vpaddq $M2,$D2,$D2 # d2 += r1'*r1
- vpaddq $M3,$D3,$D3 # d3 += r1'*r2
- vpaddq $M4,$D4,$D4 # d4 += r1'*r3
-
- vpmuludq $T2,$S3,$M0
- vpmuludq $T2,$S4,$M1
- vpmuludq $T2,$R1,$M3
- vpmuludq $T2,$R2,$M4
- vpmuludq $T2,$R0,$M2
- vpsrlq \$32,$R4,$T4
- vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
- vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
- vpaddq $M3,$D3,$D3 # d3 += r2'*r1
- vpaddq $M4,$D4,$D4 # d4 += r2'*r2
- vpaddq $M2,$D2,$D2 # d2 += r2'*r0
-
- vpmuludq $T3,$S2,$M0
- vpmuludq $T3,$R0,$M3
- vpmuludq $T3,$R1,$M4
- vpmuludq $T3,$S3,$M1
- vpmuludq $T3,$S4,$M2
- vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
- vpaddq $M3,$D3,$D3 # d3 += r3'*r0
- vpaddq $M4,$D4,$D4 # d4 += r3'*r1
- vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
- vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
-
- vpmuludq $T4,$S4,$M3
- vpmuludq $T4,$R0,$M4
- vpmuludq $T4,$S1,$M0
- vpmuludq $T4,$S2,$M1
- vpmuludq $T4,$S3,$M2
- vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4
- vpaddq $M4,$D4,$D4 # d4 += r2'*r0
- vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1
- vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2
- vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3
-
- ################################################################
- # load input
- vmovdqu64 16*0($inp),%z#$T3
- vmovdqu64 16*4($inp),%z#$T4
- lea 16*8($inp),$inp
-
- ################################################################
- # lazy reduction
-
- vpsrlq \$26,$D3,$M3
- vpandq $MASK,$D3,$D3
- vpaddq $M3,$D4,$D4 # d3 -> d4
-
- vpsrlq \$26,$D0,$M0
- vpandq $MASK,$D0,$D0
- vpaddq $M0,$D1,$D1 # d0 -> d1
-
- vpsrlq \$26,$D4,$M4
- vpandq $MASK,$D4,$D4
-
- vpsrlq \$26,$D1,$M1
- vpandq $MASK,$D1,$D1
- vpaddq $M1,$D2,$D2 # d1 -> d2
-
- vpaddq $M4,$D0,$D0
- vpsllq \$2,$M4,$M4
- vpaddq $M4,$D0,$D0 # d4 -> d0
-
- vpsrlq \$26,$D2,$M2
- vpandq $MASK,$D2,$D2
- vpaddq $M2,$D3,$D3 # d2 -> d3
-
- vpsrlq \$26,$D0,$M0
- vpandq $MASK,$D0,$D0
- vpaddq $M0,$D1,$D1 # d0 -> d1
-
- vpsrlq \$26,$D3,$M3
- vpandq $MASK,$D3,$D3
- vpaddq $M3,$D4,$D4 # d3 -> d4
-
- ################################################################
- # at this point we have 14243444 in $R0-$S4 and 05060708 in
- # $D0-$D4, ...
-
- vpunpcklqdq $T4,$T3,$T0 # transpose input
- vpunpckhqdq $T4,$T3,$T4
-
- # ... since input 64-bit lanes are ordered as 73625140, we could
- # "vperm" it to 76543210 (here and in each loop iteration), *or*
- # we could just flow along, hence the goal for $R0-$S4 is
- # 1858286838784888 ...
-
- vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
- mov \$0x7777,%eax
- kmovw %eax,%k1
-
- vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
- vpermd $R1,$M0,$R1
- vpermd $R2,$M0,$R2
- vpermd $R3,$M0,$R3
- vpermd $R4,$M0,$R4
-
- vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
- vpermd $D1,$M0,${R1}{%k1}
- vpermd $D2,$M0,${R2}{%k1}
- vpermd $D3,$M0,${R3}{%k1}
- vpermd $D4,$M0,${R4}{%k1}
-
- vpslld \$2,$R1,$S1 # *5
- vpslld \$2,$R2,$S2
- vpslld \$2,$R3,$S3
- vpslld \$2,$R4,$S4
- vpaddd $R1,$S1,$S1
- vpaddd $R2,$S2,$S2
- vpaddd $R3,$S3,$S3
- vpaddd $R4,$S4,$S4
-
- vpbroadcastq 32(%rcx),$PADBIT # .L129
-
- vpsrlq \$52,$T0,$T2 # splat input
- vpsllq \$12,$T4,$T3
- vporq $T3,$T2,$T2
- vpsrlq \$26,$T0,$T1
- vpsrlq \$14,$T4,$T3
- vpsrlq \$40,$T4,$T4 # 4
- vpandq $MASK,$T2,$T2 # 2
- vpandq $MASK,$T0,$T0 # 0
- #vpandq $MASK,$T1,$T1 # 1
- #vpandq $MASK,$T3,$T3 # 3
- #vporq $PADBIT,$T4,$T4 # padbit, yes, always
-
- vpaddq $H2,$T2,$H2 # accumulate input
- sub \$192,$len
- jbe .Ltail_avx512
- jmp .Loop_avx512
-
-.align 32
-.Loop_avx512:
- ################################################################
- # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
- # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
- # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
- # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
- # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
- # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
- # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
- # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
- # \________/\___________/
- ################################################################
- #vpaddq $H2,$T2,$H2 # accumulate input
-
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- #
-	# however, as h2 is "chronologically" the first one available, we
-	# pull the corresponding operations up, so it's
- #
- # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
- # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
- # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
- # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
- # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
-
- vpmuludq $H2,$R1,$D3 # d3 = h2*r1
- vpaddq $H0,$T0,$H0
- vpmuludq $H2,$R2,$D4 # d4 = h2*r2
- vpandq $MASK,$T1,$T1 # 1
- vpmuludq $H2,$S3,$D0 # d0 = h2*s3
- vpandq $MASK,$T3,$T3 # 3
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
- vporq $PADBIT,$T4,$T4 # padbit, yes, always
- vpmuludq $H2,$R0,$D2 # d2 = h2*r0
- vpaddq $H1,$T1,$H1 # accumulate input
- vpaddq $H3,$T3,$H3
- vpaddq $H4,$T4,$H4
-
- vmovdqu64 16*0($inp),$T3 # load input
- vmovdqu64 16*4($inp),$T4
- lea 16*8($inp),$inp
- vpmuludq $H0,$R3,$M3
- vpmuludq $H0,$R4,$M4
- vpmuludq $H0,$R0,$M0
- vpmuludq $H0,$R1,$M1
- vpaddq $M3,$D3,$D3 # d3 += h0*r3
- vpaddq $M4,$D4,$D4 # d4 += h0*r4
- vpaddq $M0,$D0,$D0 # d0 += h0*r0
- vpaddq $M1,$D1,$D1 # d1 += h0*r1
-
- vpmuludq $H1,$R2,$M3
- vpmuludq $H1,$R3,$M4
- vpmuludq $H1,$S4,$M0
- vpmuludq $H0,$R2,$M2
- vpaddq $M3,$D3,$D3 # d3 += h1*r2
- vpaddq $M4,$D4,$D4 # d4 += h1*r3
- vpaddq $M0,$D0,$D0 # d0 += h1*s4
- vpaddq $M2,$D2,$D2 # d2 += h0*r2
-
- vpunpcklqdq $T4,$T3,$T0 # transpose input
- vpunpckhqdq $T4,$T3,$T4
-
- vpmuludq $H3,$R0,$M3
- vpmuludq $H3,$R1,$M4
- vpmuludq $H1,$R0,$M1
- vpmuludq $H1,$R1,$M2
- vpaddq $M3,$D3,$D3 # d3 += h3*r0
- vpaddq $M4,$D4,$D4 # d4 += h3*r1
- vpaddq $M1,$D1,$D1 # d1 += h1*r0
- vpaddq $M2,$D2,$D2 # d2 += h1*r1
-
- vpmuludq $H4,$S4,$M3
- vpmuludq $H4,$R0,$M4
- vpmuludq $H3,$S2,$M0
- vpmuludq $H3,$S3,$M1
- vpaddq $M3,$D3,$D3 # d3 += h4*s4
- vpmuludq $H3,$S4,$M2
- vpaddq $M4,$D4,$D4 # d4 += h4*r0
- vpaddq $M0,$D0,$D0 # d0 += h3*s2
- vpaddq $M1,$D1,$D1 # d1 += h3*s3
- vpaddq $M2,$D2,$D2 # d2 += h3*s4
-
- vpmuludq $H4,$S1,$M0
- vpmuludq $H4,$S2,$M1
- vpmuludq $H4,$S3,$M2
- vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
-	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
-	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
-
- ################################################################
- # lazy reduction (interleaved with input splat)
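-	# (roughly: each limb is trimmed to 26 bits and its carry is added
-	#  into the next limb; the carry out of h4 is folded back into h0
-	#  multiplied by 5, i.e. carry + (carry<<2), since 2^130 == 5
-	#  (mod 2^130-5).  The value stays only partially reduced and is
-	#  stored as such, cf. "save partially reduced" below.)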
-
- vpsrlq \$52,$T0,$T2 # splat input
- vpsllq \$12,$T4,$T3
-
- vpsrlq \$26,$D3,$H3
- vpandq $MASK,$D3,$D3
- vpaddq $H3,$D4,$H4 # h3 -> h4
-
- vporq $T3,$T2,$T2
-
- vpsrlq \$26,$H0,$D0
- vpandq $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpandq $MASK,$T2,$T2 # 2
-
- vpsrlq \$26,$H4,$D4
- vpandq $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpandq $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpaddq $T2,$H2,$H2 # modulo-scheduled
- vpsrlq \$26,$T0,$T1
-
- vpsrlq \$26,$H2,$D2
- vpandq $MASK,$H2,$H2
- vpaddq $D2,$D3,$H3 # h2 -> h3
-
- vpsrlq \$14,$T4,$T3
-
- vpsrlq \$26,$H0,$D0
- vpandq $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$40,$T4,$T4 # 4
-
- vpsrlq \$26,$H3,$D3
- vpandq $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpandq $MASK,$T0,$T0 # 0
- #vpandq $MASK,$T1,$T1 # 1
- #vpandq $MASK,$T3,$T3 # 3
- #vporq $PADBIT,$T4,$T4 # padbit, yes, always
-
- sub \$128,$len
- ja .Loop_avx512
-
-.Ltail_avx512:
- ################################################################
-	# while the multiplications above were by r^8 in all lanes, in the
-	# last iteration we multiply the least significant lane by r^8 and
-	# the most significant one by r, which is why the power table gets
-	# shifted...
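-	# (illustrative aside: lane j's last input block is the (8-j)-th
-	#  block from the end of the message, so it still needs a factor of
-	#  r^(8-j); the vpsrlq \$32 below realigns the packed power table so
-	#  that each lane picks up the right power.)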
-
- vpsrlq \$32,$R0,$R0 # 0105020603070408
- vpsrlq \$32,$R1,$R1
- vpsrlq \$32,$R2,$R2
- vpsrlq \$32,$S3,$S3
- vpsrlq \$32,$S4,$S4
- vpsrlq \$32,$R3,$R3
- vpsrlq \$32,$R4,$R4
- vpsrlq \$32,$S1,$S1
- vpsrlq \$32,$S2,$S2
-
- ################################################################
-	# load either next or last 64 bytes of input
- lea ($inp,$len),$inp
-
- #vpaddq $H2,$T2,$H2 # accumulate input
- vpaddq $H0,$T0,$H0
-
- vpmuludq $H2,$R1,$D3 # d3 = h2*r1
- vpmuludq $H2,$R2,$D4 # d4 = h2*r2
- vpmuludq $H2,$S3,$D0 # d0 = h2*s3
- vpandq $MASK,$T1,$T1 # 1
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
- vpandq $MASK,$T3,$T3 # 3
- vpmuludq $H2,$R0,$D2 # d2 = h2*r0
- vporq $PADBIT,$T4,$T4 # padbit, yes, always
- vpaddq $H1,$T1,$H1 # accumulate input
- vpaddq $H3,$T3,$H3
- vpaddq $H4,$T4,$H4
-
- vmovdqu 16*0($inp),%x#$T0
- vpmuludq $H0,$R3,$M3
- vpmuludq $H0,$R4,$M4
- vpmuludq $H0,$R0,$M0
- vpmuludq $H0,$R1,$M1
- vpaddq $M3,$D3,$D3 # d3 += h0*r3
- vpaddq $M4,$D4,$D4 # d4 += h0*r4
- vpaddq $M0,$D0,$D0 # d0 += h0*r0
- vpaddq $M1,$D1,$D1 # d1 += h0*r1
-
- vmovdqu 16*1($inp),%x#$T1
- vpmuludq $H1,$R2,$M3
- vpmuludq $H1,$R3,$M4
- vpmuludq $H1,$S4,$M0
- vpmuludq $H0,$R2,$M2
- vpaddq $M3,$D3,$D3 # d3 += h1*r2
- vpaddq $M4,$D4,$D4 # d4 += h1*r3
- vpaddq $M0,$D0,$D0 # d0 += h1*s4
- vpaddq $M2,$D2,$D2 # d2 += h0*r2
-
- vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
- vpmuludq $H3,$R0,$M3
- vpmuludq $H3,$R1,$M4
- vpmuludq $H1,$R0,$M1
- vpmuludq $H1,$R1,$M2
- vpaddq $M3,$D3,$D3 # d3 += h3*r0
- vpaddq $M4,$D4,$D4 # d4 += h3*r1
- vpaddq $M1,$D1,$D1 # d1 += h1*r0
- vpaddq $M2,$D2,$D2 # d2 += h1*r1
-
- vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
- vpmuludq $H4,$S4,$M3
- vpmuludq $H4,$R0,$M4
- vpmuludq $H3,$S2,$M0
- vpmuludq $H3,$S3,$M1
- vpmuludq $H3,$S4,$M2
- vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
- vpaddq $M4,$D4,$D4 # d4 += h4*r0
- vpaddq $M0,$D0,$D0 # d0 += h3*s2
- vpaddq $M1,$D1,$D1 # d1 += h3*s3
- vpaddq $M2,$D2,$D2 # d2 += h3*s4
-
- vpmuludq $H4,$S1,$M0
- vpmuludq $H4,$S2,$M1
- vpmuludq $H4,$S3,$M2
- vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
-	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
-	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
-
- ################################################################
- # horizontal addition
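-	# (roughly: the three permute/add rounds below collapse the 8 lanes
-	#  pairwise: first within each 128-bit half, then across the 128-bit
-	#  halves, then across the two 256-bit halves, leaving the lane sum
-	#  in the bottom qword; the %k3 = 1 mask keeps that qword and zeroes
-	#  the rest.)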
-
- mov \$1,%eax
- vpermq \$0xb1,$H3,$D3
- vpermq \$0xb1,$D4,$H4
- vpermq \$0xb1,$H0,$D0
- vpermq \$0xb1,$H1,$D1
- vpermq \$0xb1,$H2,$D2
- vpaddq $D3,$H3,$H3
- vpaddq $D4,$H4,$H4
- vpaddq $D0,$H0,$H0
- vpaddq $D1,$H1,$H1
- vpaddq $D2,$H2,$H2
-
- kmovw %eax,%k3
- vpermq \$0x2,$H3,$D3
- vpermq \$0x2,$H4,$D4
- vpermq \$0x2,$H0,$D0
- vpermq \$0x2,$H1,$D1
- vpermq \$0x2,$H2,$D2
- vpaddq $D3,$H3,$H3
- vpaddq $D4,$H4,$H4
- vpaddq $D0,$H0,$H0
- vpaddq $D1,$H1,$H1
- vpaddq $D2,$H2,$H2
-
- vextracti64x4 \$0x1,$H3,%y#$D3
- vextracti64x4 \$0x1,$H4,%y#$D4
- vextracti64x4 \$0x1,$H0,%y#$D0
- vextracti64x4 \$0x1,$H1,%y#$D1
- vextracti64x4 \$0x1,$H2,%y#$D2
- vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
- vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
- vpaddq $D0,$H0,${H0}{%k3}{z}
- vpaddq $D1,$H1,${H1}{%k3}{z}
- vpaddq $D2,$H2,${H2}{%k3}{z}
-___
-map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
-map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
-$code.=<<___;
- ################################################################
- # lazy reduction (interleaved with input splat)
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpunpcklqdq $T3,$T2,$T2 # 2:3
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D4
- vpand $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpsrlq \$30,$T2,$T3
- vpsrlq \$4,$T2,$T2
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpsrlq \$26,$T0,$T1
- vpsrlq \$40,$T4,$T4 # 4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T0,$T0 # 0
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
- vpand $MASK,$T1,$T1 # 1
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
- add \$64,$len
- jnz .Ltail_avx2$suffix
-
- vpsubq $T2,$H2,$H2 # undo input accumulation
- vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
- vmovd %x#$H1,`4*1-48-64`($ctx)
- vmovd %x#$H2,`4*2-48-64`($ctx)
- vmovd %x#$H3,`4*3-48-64`($ctx)
- vmovd %x#$H4,`4*4-48-64`($ctx)
- vzeroall
-___
-$code.=<<___ if ($win64);
- movdqa -0xb0(%r10),%xmm6
- movdqa -0xa0(%r10),%xmm7
- movdqa -0x90(%r10),%xmm8
- movdqa -0x80(%r10),%xmm9
- movdqa -0x70(%r10),%xmm10
- movdqa -0x60(%r10),%xmm11
- movdqa -0x50(%r10),%xmm12
- movdqa -0x40(%r10),%xmm13
- movdqa -0x30(%r10),%xmm14
- movdqa -0x20(%r10),%xmm15
- lea -8(%r10),%rsp
-.Ldo_avx512_epilogue:
-___
-$code.=<<___ if (!$win64);
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-___
-$code.=<<___;
- ret
-.cfi_endproc
-___
-
-}
-
-}
-
-&declare_function("poly1305_blocks_avx2", 32, 4);
-poly1305_blocks_avxN(0);
-&end_function("poly1305_blocks_avx2");
-
-if($kernel) {
- $code .= "#endif\n";
-}
-
-#######################################################################
-if ($avx>2) {
-# On entry we have input length divisible by 64. But since the inner
-# loop processes 128 bytes per iteration, cases when the length is not
-# divisible by 128 are handled by passing the tail 64 bytes to
-# .Ltail_avx2. For this reason the stack layout is kept identical to
-# poly1305_blocks_avx2. If not for this tail, we wouldn't even have to
-# allocate a stack frame...
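-# (Roughly: a length divisible by 64 but not by 128 leaves one 64-byte
-# chunk over after the 128-byte loop; it is handed off via the
-# "add \$64,$len / jnz .Ltail_avx2" at the end of the shared body, which
-# is why the stack layout must match poly1305_blocks_avx2's.)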
-
-if($kernel) {
- $code .= "#ifdef CONFIG_AS_AVX512\n";
-}
-
-&declare_function("poly1305_blocks_avx512", 32, 4);
-poly1305_blocks_avxN(1);
-&end_function("poly1305_blocks_avx512");
-
-if ($kernel) {
- $code .= "#endif\n";
-}
-
-if (!$kernel && $avx>3) {
-########################################################################
-# VPMADD52 version using 2^44 radix.
-#
-# One can argue that base 2^52 would be more natural. Well, even though
-# some operations would be more natural, one has to recognize a couple
-# of things. First, base 2^52 doesn't provide an advantage over base
-# 2^44 if you look at the amount of multiply-and-accumulate operations.
-# Second, it makes it impossible to pre-compute multiples of 5 [referred
-# to as s[]/sN in reference implementations], which means that more such
-# operations would have to be performed in the inner loop, which in turn
-# makes the critical path longer. In other words, even though base 2^44
-# reduction might look less elegant, the overall critical path is
-# actually shorter...
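-# (Illustrative aside: in base 2^44 a 130-bit value is held as three
-# limbs of 44, 44 and 42 bits with weights 2^0, 2^44 and 2^88. Partial
-# products landing at weight 2^132 wrap around since
-# 2^132 == 4*2^130 == 20 (mod 2^130-5), hence the pre-computed s[]
-# values are r*20, formed below as "*5" followed by the "magic <<2".)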
-
-########################################################################
-# Layout of opaque area is following.
-#
-# unsigned __int64 h[3]; # current hash value base 2^44
-# unsigned __int64 s[2]; # key value*20 base 2^44
-# unsigned __int64 r[3]; # key value base 2^44
-# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
-# # r^n positions reflect
-# # placement in register, not
-# # memory, R[3] is R[1]*20
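-# (For orientation, the byte offsets used below are: h[0..2] at 0/8/16,
-# s1 at 24, s2 at 32, r0 at 40, r1 at 48, r2 at 56, and the key-power
-# marker at 64, which stays at the "impossible" value -1 until the R[4]
-# powers have been computed.)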
-
-$code.=<<___;
-.type poly1305_init_base2_44,\@function,3
-.align 32
-poly1305_init_base2_44:
- xor %rax,%rax
- mov %rax,0($ctx) # initialize hash value
- mov %rax,8($ctx)
- mov %rax,16($ctx)
-
-.Linit_base2_44:
- lea poly1305_blocks_vpmadd52(%rip),%r10
- lea poly1305_emit_base2_44(%rip),%r11
-
- mov \$0x0ffffffc0fffffff,%rax
- mov \$0x0ffffffc0ffffffc,%rcx
- and 0($inp),%rax
- mov \$0x00000fffffffffff,%r8
- and 8($inp),%rcx
- mov \$0x00000fffffffffff,%r9
- and %rax,%r8
- shrd \$44,%rcx,%rax
- mov %r8,40($ctx) # r0
- and %r9,%rax
- shr \$24,%rcx
- mov %rax,48($ctx) # r1
- lea (%rax,%rax,4),%rax # *5
- mov %rcx,56($ctx) # r2
- shl \$2,%rax # magic <<2
- lea (%rcx,%rcx,4),%rcx # *5
- shl \$2,%rcx # magic <<2
- mov %rax,24($ctx) # s1
- mov %rcx,32($ctx) # s2
- movq \$-1,64($ctx) # write impossible value
-___
-$code.=<<___ if ($flavour !~ /elf32/);
- mov %r10,0(%rdx)
- mov %r11,8(%rdx)
-___
-$code.=<<___ if ($flavour =~ /elf32/);
- mov %r10d,0(%rdx)
- mov %r11d,4(%rdx)
-___
-$code.=<<___;
- mov \$1,%eax
- ret
-.size poly1305_init_base2_44,.-poly1305_init_base2_44
-___
-{
-my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
-my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
-my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
-
-$code.=<<___;
-.type poly1305_blocks_vpmadd52,\@function,4
-.align 32
-poly1305_blocks_vpmadd52:
- shr \$4,$len
- jz .Lno_data_vpmadd52 # too short
-
- shl \$40,$padbit
-	mov		64($ctx),%r8		# peek at the power of the key
-
-	# if the powers of the key have not been calculated yet, process up
-	# to 3 blocks with this single-block subroutine; otherwise ensure
-	# that the length is divisible by 2 blocks and pass the rest down to
-	# the next subroutine...
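-	# (roughly, the selection below amounts to:
-	#      mask = (len >= 4 blocks || powers already computed) ? 1 : 3;
-	#      peel = len & mask;        # blocks handled by .Loop_vpmadd52
-	#      if (peel == 0) goto .Lblocks_vpmadd52_4x;
-	#  leaving an even (possibly zero) number of blocks for the 4x code.)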
-
- mov \$3,%rax
- mov \$1,%r10
-	cmp		\$4,$len		# is input long enough?
- cmovae %r10,%rax
- test %r8,%r8 # is power value impossible?
- cmovns %r10,%rax
-
- and $len,%rax # is input of favourable length?
- jz .Lblocks_vpmadd52_4x
-
- sub %rax,$len
- mov \$7,%r10d
- mov \$1,%r11d
- kmovw %r10d,%k7
- lea .L2_44_inp_permd(%rip),%r10
- kmovw %r11d,%k1
-
- vmovq $padbit,%x#$PAD
- vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
- vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
- vpermq \$0xcf,$PAD,$PAD
- vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
-
- vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
- vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
- vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
- vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
-
- vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
- vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
-
- jmp .Loop_vpmadd52
-
-.align 32
-.Loop_vpmadd52:
- vmovdqu32 0($inp),%x#$T0 # load input as ----3210
- lea 16($inp),$inp
-
- vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
- vpsrlvq $inp_shift,$T0,$T0
- vpandq $reduc_mask,$T0,$T0
- vporq $PAD,$T0,$T0
-
- vpaddq $T0,$Dlo,$Dlo # accumulate input
-
- vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
- vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
- vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
-
- vpxord $Dlo,$Dlo,$Dlo
- vpxord $Dhi,$Dhi,$Dhi
-
- vpmadd52luq $r2r1r0,$H0,$Dlo
- vpmadd52huq $r2r1r0,$H0,$Dhi
-
- vpmadd52luq $r1r0s2,$H1,$Dlo
- vpmadd52huq $r1r0s2,$H1,$Dhi
-
- vpmadd52luq $r0s2s1,$H2,$Dlo
- vpmadd52huq $r0s2s1,$H2,$Dhi
-
- vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
- vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
- vpandq $reduc_mask,$Dlo,$Dlo
-
- vpaddq $T0,$Dhi,$Dhi
-
- vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
-
- vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
-
-	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost qword
- vpandq $reduc_mask,$Dlo,$Dlo
-
- vpermq \$0b10010011,$T0,$T0
-
- vpaddq $T0,$Dlo,$Dlo
-
- vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
-
- vpaddq $T0,$Dlo,$Dlo
- vpsllq \$2,$T0,$T0
-
- vpaddq $T0,$Dlo,$Dlo
-
- dec %rax # len-=16
- jnz .Loop_vpmadd52
-
- vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
-
- test $len,$len
- jnz .Lblocks_vpmadd52_4x
-
-.Lno_data_vpmadd52:
- ret
-.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
-___
-}
-{
-########################################################################
-# As implied by its name, the 4x subroutine processes 4 blocks in
-# parallel (though it also handles lengths of 4*n+2 blocks). It takes up
-# to the 4th key power and operates on 256-bit %ymm registers.
-
-my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
-my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
-my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
-
-$code.=<<___;
-.type poly1305_blocks_vpmadd52_4x,\@function,4
-.align 32
-poly1305_blocks_vpmadd52_4x:
- shr \$4,$len
- jz .Lno_data_vpmadd52_4x # too short
-
- shl \$40,$padbit
-	mov		64($ctx),%r8		# peek at the power of the key
-
-.Lblocks_vpmadd52_4x:
- vpbroadcastq $padbit,$PAD
-
- vmovdqa64 .Lx_mask44(%rip),$mask44
- mov \$5,%eax
- vmovdqa64 .Lx_mask42(%rip),$mask42
- kmovw %eax,%k1 # used in 2x path
-
- test %r8,%r8 # is power value impossible?
- js .Linit_vpmadd52 # if it is, then init R[4]