author    | Kyle Evans <kevans@FreeBSD.org> | 2021-03-15 02:25:40 +0000
committer | Kyle Evans <kevans@FreeBSD.org> | 2021-03-15 04:52:04 +0000
commit    | 74ae3f3e33b810248da19004c58b3581cd367843 (patch)
tree      | b17ce98b77a3a1a86e8255dad7861d9c160222a9
parent    | 3e5e9939cda3b24df37c37da5f195415a894d9fd (diff)
if_wg: import latest fixup work from the wireguard-freebsd project
This is the culmination of about a week of work from three developers to
fix a number of functional and security issues. This patch consists of
work done by the following folks:
- Jason A. Donenfeld <Jason@zx2c4.com>
- Matt Dunwoodie <ncon@noconroy.net>
- Kyle Evans <kevans@FreeBSD.org>
Notable changes include:
- Packets are now correctly staged for processing once the handshake has
completed, resulting in less packet loss in the interim.
- Various race conditions have been resolved, particularly w.r.t. socket
  and packet lifetimes, which previously led to panics
- Various tests have been added to ensure correct functionality and
  tooling conformance
- Many security issues have been addressed
- if_wg now maintains jail-friendly semantics: sockets are created in
the interface's home vnet so that it can act as the sole network
connection for a jail
- if_wg no longer fails to remove peer allowed-ips of 0.0.0.0/0
- if_wg now exports via ioctl a format that is future-proof and
  complete (see the sketch after this list); it is additionally
  supported by the upstream wireguard-tools (which we plan to merge
  into base soon)
- if_wg now conforms to the WireGuard protocol and is more closely
aligned with security auditing guidelines
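The new export format deserves a quick illustration. Below is a minimal
sketch of querying it from userland, following the same
size-query-then-fetch pattern that get_nvl_out_size()/do_cmd() use in
the ifwg.c changes further down. It assumes only the struct wg_data_io
and SIOCGWG definitions visible in this diff plus the stock libnv API,
and abbreviates error handling:

/*
 * Minimal sketch: query the packed-nvlist export over SIOCGWG.
 * Only struct wg_data_io and SIOCGWG from this diff are assumed,
 * plus the stock libnv API; error handling is abbreviated.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/nv.h>		/* libnv; link with -lnv */

#include <dev/if_wg/if_wg.h>	/* struct wg_data_io, SIOCGWG */

#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static void
wg_dump(const char *ifname)
{
	struct wg_data_io wgd;
	nvlist_t *nvl;
	const nvlist_t *const *peers;
	size_t keylen, npeers;
	int s;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0)
		err(1, "socket");

	/* Pass 1: size-only query; the kernel fills in wgd_size. */
	memset(&wgd, 0, sizeof(wgd));
	strlcpy(wgd.wgd_name, ifname, sizeof(wgd.wgd_name));
	if (ioctl(s, SIOCGWG, &wgd) < 0)
		err(1, "SIOCGWG (size)");

	/* Pass 2: fetch the packed nvlist and unpack it. */
	if ((wgd.wgd_data = malloc(wgd.wgd_size)) == NULL)
		err(1, "malloc");
	if (ioctl(s, SIOCGWG, &wgd) < 0)
		err(1, "SIOCGWG (data)");
	if ((nvl = nvlist_unpack(wgd.wgd_data, wgd.wgd_size, 0)) == NULL)
		errx(1, "nvlist_unpack");

	if (nvlist_exists_number(nvl, "listen-port"))
		printf("listen-port: %ju\n",
		    (uintmax_t)nvlist_get_number(nvl, "listen-port"));
	if (nvlist_exists_nvlist_array(nvl, "peers")) {
		peers = nvlist_get_nvlist_array(nvl, "peers", &npeers);
		for (size_t i = 0; i < npeers; i++) {
			if (!nvlist_exists_binary(peers[i], "public-key"))
				continue;
			/* 32-byte raw key; base64-encode for display. */
			(void)nvlist_get_binary(peers[i], "public-key",
			    &keylen);
		}
	}
	nvlist_destroy(nvl);
	free(wgd.wgd_data);
	close(s);
}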
Note that the driver has been rebased away from using iflib. iflib
poses a number of non-trivial challenges for a cloned device trying to
operate in a vnet, and it adds complexity to the implementation for
little gain.
The crypto implementation previously added to the tree was a super
complex integration of code that once appeared in an old out-of-tree
Linux module; it has been reduced to crypto.c, containing simple,
boring reference implementations. This is part of a near-to-mid term
goal to work with FreeBSD kernel crypto folks and take advantage of,
or improve, the accelerated crypto already offered elsewhere.
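As a quick illustration of how boring the resulting API is, here is a
round-trip sketch against the ChaCha20-Poly1305 AEAD functions defined
in crypto.c below. The CHACHA20POLY1305_KEY_SIZE constant and the
16-byte tag length come from crypto.h, which this diff does not show;
the values assumed here (32 and 16) match the upstream
wireguard-freebsd sources:

/*
 * Round-trip sketch for the reference AEAD in crypto.c below.
 * crypto.h is assumed to define CHACHA20POLY1305_KEY_SIZE (32) and
 * the function prototypes; the tag length matches POLY1305_MAC_SIZE
 * (16) in crypto.c.
 */
#include <stdbool.h>
#include <stdint.h>

#include "crypto.h"	/* chacha20poly1305_{encrypt,decrypt} */

#define AUTHTAG_SIZE	16	/* POLY1305_MAC_SIZE in crypto.c */

static bool
aead_roundtrip(void)
{
	uint8_t key[CHACHA20POLY1305_KEY_SIZE] = { 0 };	/* demo key only */
	uint8_t ad[4] = { 1, 2, 3, 4 };		/* additional data */
	uint8_t msg[] = "boring is good";
	uint8_t ct[sizeof(msg) + AUTHTAG_SIZE];	/* ciphertext + tag */
	uint8_t pt[sizeof(msg)];
	uint64_t nonce = 1;	/* per-key counter; must never repeat */

	/* Seal: writes sizeof(msg) ciphertext bytes plus the tag. */
	chacha20poly1305_encrypt(ct, msg, sizeof(msg), ad, sizeof(ad),
	    nonce, key);

	/* Open: returns false, writing no plaintext, if the tag is bad. */
	return (chacha20poly1305_decrypt(pt, ct, sizeof(ct), ad, sizeof(ad),
	    nonce, key));
}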
There's additional test suite effort underway out-of-tree taking
advantage of the aforementioned jail-friendly semantics to test a number
of real-world topologies, based on netns.sh.
Also note that this is still a work in progress; further work will be
much smaller in nature.
MFC after: 1 month (maybe)
70 files changed, 6333 insertions, 43677 deletions
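For orientation before the diff itself, the write side of the same
nvlist scheme: configuration is a packed nvlist pushed through SIOCSWG,
as wgfinish() in the ifwg.c changes below does. A sketch of adding one
peer with one allowed IP, using the same headers as the query sketch
above plus <netinet/in.h>; the key names ("peers", "public-key",
"allowed-ips", "ipv4", "cidr") are those used in this diff:

/*
 * Sketch of the write side: pack a device nvlist and push it through
 * SIOCSWG, mirroring wgfinish()/setallowedips() in the ifwg.c changes
 * below.  WG_KEY_SIZE is assumed to come from <dev/if_wg/if_wg.h>.
 */
static void
wg_add_peer(int s, const char *ifname,
    const uint8_t pubkey[WG_KEY_SIZE], struct in_addr ip4, uint8_t cidr)
{
	struct wg_data_io wgd;
	nvlist_t *nvl_dev, *nvl_peer, *nvl_aip;
	const nvlist_t *aips[1], *peers[1];

	/* One allowed-ip entry: address plus prefix length. */
	nvl_aip = nvlist_create(0);
	nvlist_add_number(nvl_aip, "cidr", cidr);
	nvlist_add_binary(nvl_aip, "ipv4", &ip4, sizeof(ip4));

	/* The peer: public key plus its allowed-ips array. */
	nvl_peer = nvlist_create(0);
	nvlist_add_binary(nvl_peer, "public-key", pubkey, WG_KEY_SIZE);
	aips[0] = nvl_aip;
	nvlist_add_nvlist_array(nvl_peer, "allowed-ips", aips, 1);

	/* The device config: a one-element "peers" array. */
	nvl_dev = nvlist_create(0);
	peers[0] = nvl_peer;
	nvlist_add_nvlist_array(nvl_dev, "peers", peers, 1);
	/* nvlist_add_nvlist_array() copies, so the originals can go. */
	nvlist_destroy(nvl_aip);
	nvlist_destroy(nvl_peer);

	memset(&wgd, 0, sizeof(wgd));
	strlcpy(wgd.wgd_name, ifname, sizeof(wgd.wgd_name));
	wgd.wgd_data = nvlist_pack(nvl_dev, &wgd.wgd_size);
	if (wgd.wgd_data == NULL || ioctl(s, SIOCSWG, &wgd) < 0)
		err(1, "SIOCSWG");
	nvlist_destroy(nvl_dev);
	free(wgd.wgd_data);
}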
diff --git a/etc/mtree/BSD.include.dist b/etc/mtree/BSD.include.dist index e7784cbb0a47..0f85798815d5 100644 --- a/etc/mtree/BSD.include.dist +++ b/etc/mtree/BSD.include.dist @@ -64,6 +64,8 @@ .. iicbus .. + if_wg + .. io .. mfi diff --git a/include/Makefile b/include/Makefile index 3a34ddb8aa18..31e207f6b199 100644 --- a/include/Makefile +++ b/include/Makefile @@ -44,7 +44,7 @@ LDIRS= bsm cam geom net net80211 netgraph netinet netinet6 \ LSUBDIRS= cam/ata cam/mmc cam/nvme cam/scsi \ dev/acpica dev/agp dev/an dev/ciss dev/filemon dev/firewire \ dev/hwpmc dev/hyperv \ - dev/ic dev/iicbus dev/io dev/mfi dev/mmc dev/nvme \ + dev/ic dev/iicbus dev/if_wg dev/io dev/mfi dev/mmc dev/nvme \ dev/ofw dev/pbio dev/pci ${_dev_powermac_nvram} dev/ppbus dev/pwm \ dev/smbus dev/speaker dev/tcp_log dev/veriexec dev/vkbd \ fs/devfs fs/fdescfs fs/msdosfs fs/nfs fs/nullfs \ @@ -170,6 +170,10 @@ NVPAIRDIR= ${INCLUDEDIR}/sys MLX5= mlx5io.h MLX5DIR= ${INCLUDEDIR}/dev/mlx5 +.PATH: ${SRCTOP}/sys/dev/if_wg +WG= if_wg.h +WGDIR= ${INCLUDEDIR}/dev/if_wg + INCSGROUPS= INCS \ ACPICA \ AGP \ @@ -182,7 +186,8 @@ INCSGROUPS= INCS \ PCI \ RPC \ TEKEN \ - VERIEXEC + VERIEXEC \ + WG .if ${MK_IPFILTER} != "no" INCSGROUPS+= IPFILTER diff --git a/sbin/ifconfig/ifwg.c b/sbin/ifconfig/ifwg.c index 86bacc59f50d..a102f392cf80 100644 --- a/sbin/ifconfig/ifwg.c +++ b/sbin/ifconfig/ifwg.c @@ -46,6 +46,8 @@ __FBSDID("$FreeBSD$"); #include <netinet/in.h> #include <arpa/inet.h> +#include <dev/if_wg/if_wg.h> + #include <assert.h> #include <ctype.h> #include <err.h> @@ -65,40 +67,60 @@ __FBSDID("$FreeBSD$"); #include "ifconfig.h" -typedef enum { - WGC_GET = 0x5, - WGC_SET = 0x6, -} wg_cmd_t; +static void wgfinish(int s, void *arg); + +static bool wgfinish_registered; -static nvlist_t *nvl_params; -static bool do_peer; static int allowed_ips_count; static int allowed_ips_max; -struct allowedip { - struct sockaddr_storage a_addr; - struct sockaddr_storage a_mask; -}; -struct allowedip *allowed_ips; +static nvlist_t **allowed_ips, *nvl_peer; #define ALLOWEDIPS_START 16 -#define WG_KEY_LEN 32 -#define WG_KEY_LEN_BASE64 ((((WG_KEY_LEN) + 2) / 3) * 4 + 1) -#define WG_KEY_LEN_HEX (WG_KEY_LEN * 2 + 1) +#define WG_KEY_SIZE_BASE64 ((((WG_KEY_SIZE) + 2) / 3) * 4 + 1) +#define WG_KEY_SIZE_HEX (WG_KEY_SIZE * 2 + 1) #define WG_MAX_STRLEN 64 +struct allowedip { + union { + struct in_addr ip4; + struct in6_addr ip6; + }; +}; + +static void +register_wgfinish(void) +{ + + if (wgfinish_registered) + return; + callback_register(wgfinish, NULL); + wgfinish_registered = true; +} + +static nvlist_t * +nvl_device(void) +{ + static nvlist_t *_nvl_device; + + if (_nvl_device == NULL) + _nvl_device = nvlist_create(0); + register_wgfinish(); + return (_nvl_device); +} + static bool -key_from_base64(uint8_t key[static WG_KEY_LEN], const char *base64) +key_from_base64(uint8_t key[static WG_KEY_SIZE], const char *base64) { - if (strlen(base64) != WG_KEY_LEN_BASE64 - 1) { - warnx("bad key len - need %d got %zu\n", WG_KEY_LEN_BASE64 - 1, strlen(base64)); + if (strlen(base64) != WG_KEY_SIZE_BASE64 - 1) { + warnx("bad key len - need %d got %zu\n", WG_KEY_SIZE_BASE64 - 1, strlen(base64)); return false; } - if (base64[WG_KEY_LEN_BASE64 - 2] != '=') { - warnx("bad key terminator, expected '=' got '%c'", base64[WG_KEY_LEN_BASE64 - 2]); + if (base64[WG_KEY_SIZE_BASE64 - 2] != '=') { + warnx("bad key terminator, expected '=' got '%c'", base64[WG_KEY_SIZE_BASE64 - 2]); return false; } - return (b64_pton(base64, key, WG_KEY_LEN)); + return (b64_pton(base64, key, WG_KEY_SIZE)); } 
static void @@ -128,7 +150,7 @@ parse_endpoint(const char *endpoint_) err = getaddrinfo(endpoint, port, &hints, &res); if (err) errx(1, "%s", gai_strerror(err)); - nvlist_add_binary(nvl_params, "endpoint", res->ai_addr, res->ai_addrlen); + nvlist_add_binary(nvl_peer, "endpoint", res->ai_addr, res->ai_addrlen); freeaddrinfo(res); free(base); } @@ -227,12 +249,14 @@ in6_mask2len(struct in6_addr *mask, u_char *lim0) } static bool -parse_ip(struct allowedip *aip, const char *value) +parse_ip(struct allowedip *aip, uint16_t *family, const char *value) { struct addrinfo hints, *res; int err; + bool ret; - bzero(&aip->a_addr, sizeof(aip->a_addr)); + ret = true; + bzero(aip, sizeof(*aip)); bzero(&hints, sizeof(hints)); hints.ai_family = AF_UNSPEC; hints.ai_flags = AI_NUMERICHOST; @@ -240,10 +264,21 @@ parse_ip(struct allowedip *aip, const char *value) if (err) errx(1, "%s", gai_strerror(err)); - memcpy(&aip->a_addr, res->ai_addr, res->ai_addrlen); + *family = res->ai_family; + if (res->ai_family == AF_INET) { + struct sockaddr_in *sin = (struct sockaddr_in *)res->ai_addr; + + aip->ip4 = sin->sin_addr; + } else if (res->ai_family == AF_INET6) { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)res->ai_addr; + + aip->ip6 = sin6->sin6_addr; + } else { + ret = false; + } freeaddrinfo(res); - return (true); + return (ret); } static void @@ -271,61 +306,84 @@ sa_ntop(const struct sockaddr *sa, char *buf, int *port) } static void -dump_peer(const nvlist_t *nvl_peer) +dump_peer(const nvlist_t *nvl_peer_cfg) { const void *key; - const struct allowedip *aips; const struct sockaddr *endpoint; char outbuf[WG_MAX_STRLEN]; char addr_buf[INET6_ADDRSTRLEN]; - size_t size; - int count, port; + size_t aip_count, size; + int port; uint16_t persistent_keepalive; + const nvlist_t * const *nvl_aips; printf("[Peer]\n"); - if (nvlist_exists_binary(nvl_peer, "public-key")) { - key = nvlist_get_binary(nvl_peer, "public-key", &size); + if (nvlist_exists_binary(nvl_peer_cfg, "public-key")) { + key = nvlist_get_binary(nvl_peer_cfg, "public-key", &size); b64_ntop((const uint8_t *)key, size, outbuf, WG_MAX_STRLEN); printf("PublicKey = %s\n", outbuf); } - if (nvlist_exists_binary(nvl_peer, "endpoint")) { - endpoint = nvlist_get_binary(nvl_peer, "endpoint", &size); + if (nvlist_exists_binary(nvl_peer_cfg, "preshared-key")) { + key = nvlist_get_binary(nvl_peer_cfg, "preshared-key", &size); + b64_ntop((const uint8_t *)key, size, outbuf, WG_MAX_STRLEN); + printf("PresharedKey = %s\n", outbuf); + } + if (nvlist_exists_binary(nvl_peer_cfg, "endpoint")) { + endpoint = nvlist_get_binary(nvl_peer_cfg, "endpoint", &size); sa_ntop(endpoint, addr_buf, &port); printf("Endpoint = %s:%d\n", addr_buf, ntohs(port)); } - if (nvlist_exists_number(nvl_peer, "persistent-keepalive-interval")) { - persistent_keepalive = nvlist_get_number(nvl_peer, + if (nvlist_exists_number(nvl_peer_cfg, + "persistent-keepalive-interval")) { + persistent_keepalive = nvlist_get_number(nvl_peer_cfg, "persistent-keepalive-interval"); printf("PersistentKeepalive = %d\n", persistent_keepalive); } - if (!nvlist_exists_binary(nvl_peer, "allowed-ips")) + if (!nvlist_exists_nvlist_array(nvl_peer_cfg, "allowed-ips")) return; - aips = nvlist_get_binary(nvl_peer, "allowed-ips", &size); - if (size == 0 || size % sizeof(struct allowedip) != 0) { - errx(1, "size %zu not integer multiple of allowedip", size); - } + + nvl_aips = nvlist_get_nvlist_array(nvl_peer_cfg, "allowed-ips", &aip_count); + if (nvl_aips == NULL || aip_count == 0) + return; + printf("AllowedIPs = "); - count = 
size / sizeof(struct allowedip); - for (int i = 0; i < count; i++) { - int mask; + for (size_t i = 0; i < aip_count; i++) { + uint8_t cidr; + struct sockaddr_storage ss; sa_family_t family; - void *bitmask; - struct sockaddr *sa; - - sa = __DECONST(void *, &aips[i].a_addr); - bitmask = __DECONST(void *, - ((const struct sockaddr *)&(&aips[i])->a_mask)->sa_data); - family = aips[i].a_addr.ss_family; - getnameinfo(sa, sa->sa_len, addr_buf, INET6_ADDRSTRLEN, NULL, - 0, NI_NUMERICHOST); - if (family == AF_INET) - mask = in_mask2len(bitmask); - else if (family == AF_INET6) - mask = in6_mask2len(bitmask, NULL); - else - errx(1, "bad family in peer %d\n", family); - printf("%s/%d", addr_buf, mask); - if (i < count -1) + + if (!nvlist_exists_number(nvl_aips[i], "cidr")) + continue; + cidr = nvlist_get_number(nvl_aips[i], "cidr"); + if (nvlist_exists_binary(nvl_aips[i], "ipv4")) { + struct sockaddr_in *sin = (struct sockaddr_in *)&ss; + const struct in_addr *ip4; + + ip4 = nvlist_get_binary(nvl_aips[i], "ipv4", &size); + if (ip4 == NULL || cidr > 32) + continue; + sin->sin_len = sizeof(*sin); + sin->sin_family = AF_INET; + sin->sin_addr = *ip4; + } else if (nvlist_exists_binary(nvl_aips[i], "ipv6")) { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ss; + const struct in6_addr *ip6; + + ip6 = nvlist_get_binary(nvl_aips[i], "ipv6", &size); + if (ip6 == NULL || cidr > 128) + continue; + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = *ip6; + } else { + continue; + } + + family = ss.ss_family; + getnameinfo((struct sockaddr *)&ss, ss.ss_len, addr_buf, + INET6_ADDRSTRLEN, NULL, 0, NI_NUMERICHOST); + printf("%s/%d", addr_buf, cidr); + if (i < aip_count - 1) printf(", "); } printf("\n"); @@ -334,36 +392,34 @@ dump_peer(const nvlist_t *nvl_peer) static int get_nvl_out_size(int sock, u_long op, size_t *size) { - struct ifdrv ifd; + struct wg_data_io wgd; int err; - memset(&ifd, 0, sizeof(ifd)); + memset(&wgd, 0, sizeof(wgd)); - strlcpy(ifd.ifd_name, name, sizeof(ifd.ifd_name)); - ifd.ifd_cmd = op; - ifd.ifd_len = 0; - ifd.ifd_data = NULL; + strlcpy(wgd.wgd_name, name, sizeof(wgd.wgd_name)); + wgd.wgd_size = 0; + wgd.wgd_data = NULL; - err = ioctl(sock, SIOCGDRVSPEC, &ifd); + err = ioctl(sock, op, &wgd); if (err) return (err); - *size = ifd.ifd_len; + *size = wgd.wgd_size; return (0); } static int do_cmd(int sock, u_long op, void *arg, size_t argsize, int set) { - struct ifdrv ifd; + struct wg_data_io wgd; - memset(&ifd, 0, sizeof(ifd)); + memset(&wgd, 0, sizeof(wgd)); - strlcpy(ifd.ifd_name, name, sizeof(ifd.ifd_name)); - ifd.ifd_cmd = op; - ifd.ifd_len = argsize; - ifd.ifd_data = arg; + strlcpy(wgd.wgd_name, name, sizeof(wgd.wgd_name)); + wgd.wgd_size = argsize; + wgd.wgd_data = arg; - return (ioctl(sock, set ? 
SIOCSDRVSPEC : SIOCGDRVSPEC, &ifd)); + return (ioctl(sock, op, &wgd)); } static @@ -371,63 +427,84 @@ DECL_CMD_FUNC(peerlist, val, d) { size_t size, peercount; void *packed; - const nvlist_t *nvl, *nvl_peer; + const nvlist_t *nvl; const nvlist_t *const *nvl_peerlist; - if (get_nvl_out_size(s, WGC_GET, &size)) + if (get_nvl_out_size(s, SIOCGWG, &size)) errx(1, "can't get peer list size"); if ((packed = malloc(size)) == NULL) errx(1, "malloc failed for peer list"); - if (do_cmd(s, WGC_GET, packed, size, 0)) + if (do_cmd(s, SIOCGWG, packed, size, 0)) errx(1, "failed to obtain peer list"); nvl = nvlist_unpack(packed, size, 0); - if (!nvlist_exists_nvlist_array(nvl, "peer-list")) + if (!nvlist_exists_nvlist_array(nvl, "peers")) return; - nvl_peerlist = nvlist_get_nvlist_array(nvl, "peer-list", &peercount); + nvl_peerlist = nvlist_get_nvlist_array(nvl, "peers", &peercount); for (int i = 0; i < peercount; i++, nvl_peerlist++) { - nvl_peer = *nvl_peerlist; - dump_peer(nvl_peer); + dump_peer(*nvl_peerlist); } } static void -peerfinish(int s, void *arg) +wgfinish(int s, void *arg) { - nvlist_t *nvl, **nvl_array; void *packed; size_t size; + static nvlist_t *nvl_dev; + + nvl_dev = nvl_device(); + if (nvl_peer != NULL) { + if (!nvlist_exists_binary(nvl_peer, "public-key")) + errx(1, "must specify a public-key for adding peer"); + if (allowed_ips_count != 0) { + nvlist_add_nvlist_array(nvl_peer, "allowed-ips", + (const nvlist_t * const *)allowed_ips, + allowed_ips_count); + for (size_t i = 0; i < allowed_ips_count; i++) { + nvlist_destroy(allowed_ips[i]); + } + + free(allowed_ips); + } + + nvlist_add_nvlist_array(nvl_dev, "peers", + (const nvlist_t * const *)&nvl_peer, 1); + } + + packed = nvlist_pack(nvl_dev, &size); - if ((nvl = nvlist_create(0)) == NULL) - errx(1, "failed to allocate nvlist"); - if ((nvl_array = calloc(sizeof(void *), 1)) == NULL) - errx(1, "failed to allocate nvl_array"); - if (!nvlist_exists_binary(nvl_params, "public-key")) - errx(1, "must specify a public-key for adding peer"); - if (allowed_ips_count == 0) - errx(1, "must specify at least one range of allowed-ips to add a peer"); - - nvl_array[0] = nvl_params; - nvlist_add_nvlist_array(nvl, "peer-list", (const nvlist_t * const *)nvl_array, 1); - packed = nvlist_pack(nvl, &size); - - if (do_cmd(s, WGC_SET, packed, size, true)) - errx(1, "failed to install peer"); + if (do_cmd(s, SIOCSWG, packed, size, true)) + errx(1, "failed to configure"); } static DECL_CMD_FUNC(peerstart, val, d) { - do_peer = true; - callback_register(peerfinish, NULL); - allowed_ips = malloc(ALLOWEDIPS_START * sizeof(struct allowedip)); + + if (nvl_peer != NULL) + errx(1, "cannot both add and remove a peer"); + register_wgfinish(); + nvl_peer = nvlist_create(0); + allowed_ips = calloc(ALLOWEDIPS_START, sizeof(*allowed_ips)); allowed_ips_max = ALLOWEDIPS_START; if (allowed_ips == NULL) errx(1, "failed to allocate array for allowedips"); } static +DECL_CMD_FUNC(peerdel, val, d) +{ + + if (nvl_peer != NULL) + errx(1, "cannot both add and remove a peer"); + register_wgfinish(); + nvl_peer = nvlist_create(0); + nvlist_add_bool(nvl_peer, "remove", true); +} + +static DECL_CMD_FUNC(setwglistenport, val, d) { struct addrinfo hints, *res; @@ -454,39 +531,53 @@ DECL_CMD_FUNC(setwglistenport, val, d) errx(1, "unknown family"); } ul = ntohs((u_short)ul); - nvlist_add_number(nvl_params, "listen-port", ul); + nvlist_add_number(nvl_device(), "listen-port", ul); } static DECL_CMD_FUNC(setwgprivkey, val, d) { - uint8_t key[WG_KEY_LEN]; + uint8_t key[WG_KEY_SIZE]; if 
(!key_from_base64(key, val)) errx(1, "invalid key %s", val); - nvlist_add_binary(nvl_params, "private-key", key, WG_KEY_LEN); + nvlist_add_binary(nvl_device(), "private-key", key, WG_KEY_SIZE); } static DECL_CMD_FUNC(setwgpubkey, val, d) { - uint8_t key[WG_KEY_LEN]; + uint8_t key[WG_KEY_SIZE]; - if (!do_peer) + if (nvl_peer == NULL) errx(1, "setting public key only valid when adding peer"); if (!key_from_base64(key, val)) errx(1, "invalid key %s", val); - nvlist_add_binary(nvl_params, "public-key", key, WG_KEY_LEN); + nvlist_add_binary(nvl_peer, "public-key", key, WG_KEY_SIZE); } static +DECL_CMD_FUNC(setwgpresharedkey, val, d) +{ + uint8_t key[WG_KEY_SIZE]; + + if (nvl_peer == NULL) + errx(1, "setting preshared-key only valid when adding peer"); + + if (!key_from_base64(key, val)) + errx(1, "invalid key %s", val); + nvlist_add_binary(nvl_peer, "preshared-key", key, WG_KEY_SIZE); +} + + +static DECL_CMD_FUNC(setwgpersistentkeepalive, val, d) { unsigned long persistent_keepalive; char *endp; - if (!do_peer) + if (nvl_peer == NULL) errx(1, "setting persistent keepalive only valid when adding peer"); errno = 0; @@ -496,7 +587,7 @@ DECL_CMD_FUNC(setwgpersistentkeepalive, val, d) if (persistent_keepalive > USHRT_MAX) errx(1, "persistent-keepalive '%lu' too large", persistent_keepalive); - nvlist_add_number(nvl_params, "persistent-keepalive-interval", + nvlist_add_number(nvl_peer, "persistent-keepalive-interval", persistent_keepalive); } @@ -506,45 +597,57 @@ DECL_CMD_FUNC(setallowedips, val, d) char *base, *allowedip, *mask; u_long ul; char *endp; - struct allowedip *aip; + struct allowedip aip; + nvlist_t *nvl_aip; + uint16_t family; - if (!do_peer) + if (nvl_peer == NULL) errx(1, "setting allowed ip only valid when adding peer"); if (allowed_ips_count == allowed_ips_max) { - /* XXX grow array */ + allowed_ips_max *= 2; + allowed_ips = reallocarray(allowed_ips, allowed_ips_max, + sizeof(*allowed_ips)); + if (allowed_ips == NULL) + errx(1, "failed to grow allowed ip array"); } - aip = &allowed_ips[allowed_ips_count]; + + allowed_ips[allowed_ips_count] = nvl_aip = nvlist_create(0); + if (nvl_aip == NULL) + errx(1, "failed to create new allowedip nvlist"); + base = allowedip = strdup(val); mask = index(allowedip, '/'); if (mask == NULL) errx(1, "mask separator not found in allowedip %s", val); *mask = '\0'; mask++; - parse_ip(aip, allowedip); + + parse_ip(&aip, &family, allowedip); ul = strtoul(mask, &endp, 0); if (*endp != '\0') errx(1, "invalid value for allowedip mask"); - bzero(&aip->a_mask, sizeof(aip->a_mask)); - if (aip->a_addr.ss_family == AF_INET) - in_len2mask((struct in_addr *)&((struct sockaddr *)&aip->a_mask)->sa_data, ul); - else if (aip->a_addr.ss_family == AF_INET6) - in6_prefixlen2mask((struct in6_addr *)&((struct sockaddr *)&aip->a_mask)->sa_data, ul); - else - errx(1, "invalid address family %d\n", aip->a_addr.ss_family); + + nvlist_add_number(nvl_aip, "cidr", ul); + if (family == AF_INET) { + nvlist_add_binary(nvl_aip, "ipv4", &aip.ip4, sizeof(aip.ip4)); + } else if (family == AF_INET6) { + nvlist_add_binary(nvl_aip, "ipv6", &aip.ip6, sizeof(aip.ip6)); + } else { + /* Shouldn't happen */ + nvlist_destroy(nvl_aip); + goto out; + } + allowed_ips_count++; - if (allowed_ips_count > 1) - nvlist_free_binary(nvl_params, "allowed-ips"); - nvlist_add_binary(nvl_params, "allowed-ips", allowed_ips, - allowed_ips_count*sizeof(*aip)); - dump_peer(nvl_params); +out: free(base); } static DECL_CMD_FUNC(setendpoint, val, d) { - if (!do_peer) + if (nvl_peer == NULL) errx(1, "setting endpoint 
only valid when adding peer"); parse_endpoint(val); } @@ -555,15 +658,15 @@ wireguard_status(int s) size_t size; void *packed; nvlist_t *nvl; - char buf[WG_KEY_LEN_BASE64]; + char buf[WG_KEY_SIZE_BASE64]; const void *key; uint16_t listen_port; - if (get_nvl_out_size(s, WGC_GET, &size)) + if (get_nvl_out_size(s, SIOCGWG, &size)) return; if ((packed = malloc(size)) == NULL) return; - if (do_cmd(s, WGC_GET, packed, size, 0)) + if (do_cmd(s, SIOCGWG, packed, size, 0)) return; nvl = nvlist_unpack(packed, size, 0); if (nvlist_exists_number(nvl, "listen-port")) { @@ -583,10 +686,14 @@ wireguard_status(int s) } static struct cmd wireguard_cmds[] = { - DEF_CLONE_CMD_ARG("listen-port", setwglistenport), - DEF_CLONE_CMD_ARG("private-key", setwgprivkey), + DEF_CMD_ARG("listen-port", setwglistenport), + DEF_CMD_ARG("private-key", setwgprivkey), + /* XXX peer-list is deprecated. */ DEF_CMD("peer-list", 0, peerlist), + DEF_CMD("peers", 0, peerlist), DEF_CMD("peer", 0, peerstart), + DEF_CMD("-peer", 0, peerdel), + DEF_CMD_ARG("preshared-key", setwgpresharedkey), DEF_CMD_ARG("public-key", setwgpubkey), DEF_CMD_ARG("persistent-keepalive", setwgpersistentkeepalive), DEF_CMD_ARG("allowed-ips", setallowedips), @@ -602,27 +709,10 @@ static struct afswtch af_wireguard = { static void wg_create(int s, struct ifreq *ifr) { - struct iovec iov; - void *packed; - size_t size; setproctitle("ifconfig %s create ...\n", name); - if (!nvlist_exists_number(nvl_params, "listen-port")) - goto legacy; - if (!nvlist_exists_binary(nvl_params, "private-key")) - goto legacy; - - packed = nvlist_pack(nvl_params, &size); - if (packed == NULL) - errx(1, "failed to setup create request"); - iov.iov_len = size; - iov.iov_base = packed; - ifr->ifr_data = (caddr_t)&iov; - if (ioctl(s, SIOCIFCREATE2, ifr) < 0) - err(1, "SIOCIFCREATE2"); - return; -legacy: - ifr->ifr_data == NULL; + + ifr->ifr_data = NULL; if (ioctl(s, SIOCIFCREATE, ifr) < 0) err(1, "SIOCIFCREATE"); } @@ -632,7 +722,6 @@ wireguard_ctor(void) { int i; - nvl_params = nvlist_create(0); for (i = 0; i < nitems(wireguard_cmds); i++) cmd_register(&wireguard_cmds[i]); af_register(&af_wireguard); diff --git a/share/man/man4/wg.4 b/share/man/man4/wg.4 index 335d3e70b64a..29215bd438ff 100644 --- a/share/man/man4/wg.4 +++ b/share/man/man4/wg.4 @@ -23,7 +23,7 @@ .\" .\" $FreeBSD$ .\" -.Dd March 9, 2021 +.Dd March 12, 2021 .Dt WG 4 .Os .Sh NAME @@ -68,7 +68,7 @@ interface. The private key of the .Nm interface. -.It Cm pre-shared-key +.It Cm preshared-key Defines a pre-shared key for the .Nm interface. @@ -76,9 +76,9 @@ interface. A list of allowed IP addresses. .It Cm endpoint The IP address of the WiredGuard to connect to. -.It Cm peer-list +.It Cm peers A list of peering IP addresses to connect to. -.It Cm persistent-keepalive +.It Cm persistent-keepalive-interval Interval, in seconds, at which to send persistent keepalive packets. 
.El .Pp @@ -188,6 +188,11 @@ Connect to a specific endpoint using its public-key and set the allowed IP addre .Bd -literal -offset indent # ifconfig wg0 peer public-key '7lWtsDdqaGB3EY9WNxRN3hVaHMtu1zXw71+bOjNOVUw=' endpoint 10.0.1.100:54321 allowed-ips 192.168.2.100/32 .Ed +.Pp +Remove a peer +.Bd -literal -offset indent +# ifconfig wg0 -peer public-key '7lWtsDdqaGB3EY9WNxRN3hVaHMtu1zXw71+bOjNOVUw=' +.Ed .Sh DIAGNOSTICS The .Nm @@ -240,14 +245,11 @@ device driver first appeared in .Sh AUTHORS The .Nm -device driver was originally written for -.Ox -by -.An Matt Dunwoodie Aq Mt ncon@nconroy.net -and ported to -.Fx -by -.An Matt Macy Aq Mt mmacy@FreeBSD.org . +device driver written by +.An Jason A. Donenfeld Aq Mt Jason@zx2c4.com , +.An Matt Dunwoodie Aq Mt ncon@nconroy.net , +and +.An Kyle Evans Aq Mt kevans@FreeBSD.org . .Pp This manual page was written by .An Gordon Bergling Aq Mt gbe@FreeBSD.org diff --git a/sys/dev/if_wg/crypto.c b/sys/dev/if_wg/crypto.c new file mode 100644 index 000000000000..f28585429272 --- /dev/null +++ b/sys/dev/if_wg/crypto.c @@ -0,0 +1,1705 @@ +/* + * Copyright (C) 2015-2021 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include <sys/types.h> +#include <sys/endian.h> +#include <sys/systm.h> + +#include "crypto.h" + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#endif +#ifndef noinline +#define noinline __attribute__((noinline)) +#endif +#ifndef __aligned +#define __aligned(x) __attribute__((aligned(x))) +#endif +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) +#endif + +#define le32_to_cpup(a) le32toh(*(a)) +#define le64_to_cpup(a) le64toh(*(a)) +#define cpu_to_le32(a) htole32(a) +#define cpu_to_le64(a) htole64(a) + +static inline uint32_t get_unaligned_le32(const uint8_t *a) +{ + uint32_t l; + __builtin_memcpy(&l, a, sizeof(l)); + return le32_to_cpup(&l); +} +static inline uint64_t get_unaligned_le64(const uint8_t *a) +{ + uint64_t l; + __builtin_memcpy(&l, a, sizeof(l)); + return le64_to_cpup(&l); +} +static inline void put_unaligned_le32(uint32_t s, uint8_t *d) +{ + uint32_t l = cpu_to_le32(s); + __builtin_memcpy(d, &l, sizeof(l)); +} +static inline void cpu_to_le32_array(uint32_t *buf, unsigned int words) +{ + while (words--) { + *buf = cpu_to_le32(*buf); + ++buf; + } +} +static inline void le32_to_cpu_array(uint32_t *buf, unsigned int words) +{ + while (words--) { + *buf = le32_to_cpup(buf); + ++buf; + } +} + +static inline uint32_t rol32(uint32_t word, unsigned int shift) +{ + return (word << (shift & 31)) | (word >> ((-shift) & 31)); +} +static inline uint32_t ror32(uint32_t word, unsigned int shift) +{ + return (word >> (shift & 31)) | (word << ((-shift) & 31)); +} + +static void xor_cpy(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, + size_t len) +{ + size_t i; + + for (i = 0; i < len; ++i) + dst[i] = src1[i] ^ src2[i]; +} + +#define QUARTER_ROUND(x, a, b, c, d) ( \ + x[a] += x[b], \ + x[d] = rol32((x[d] ^ x[a]), 16), \ + x[c] += x[d], \ + x[b] = rol32((x[b] ^ x[c]), 12), \ + x[a] += x[b], \ + x[d] = rol32((x[d] ^ x[a]), 8), \ + x[c] += x[d], \ + x[b] = rol32((x[b] ^ x[c]), 7) \ +) + +#define C(i, j) (i * 4 + j) + +#define DOUBLE_ROUND(x) ( \ + /* Column Round */ \ + QUARTER_ROUND(x, C(0, 0), C(1, 0), C(2, 0), C(3, 0)), \ + QUARTER_ROUND(x, C(0, 1), C(1, 1), C(2, 1), C(3, 1)), \ + QUARTER_ROUND(x, C(0, 2), C(1, 2), C(2, 2), C(3, 2)), \ + QUARTER_ROUND(x, C(0, 3), C(1, 3), C(2, 3), C(3, 3)), \ + /* Diagonal Round */ \ + QUARTER_ROUND(x, C(0, 0), C(1, 1), C(2, 2), C(3, 3)), \ + QUARTER_ROUND(x, C(0, 1), C(1, 2), C(2, 3), C(3, 0)), \ + QUARTER_ROUND(x, C(0, 2), C(1, 3), C(2, 0), C(3, 1)), \ + QUARTER_ROUND(x, C(0, 3), C(1, 0), C(2, 1), C(3, 2)) \ +) + +#define TWENTY_ROUNDS(x) ( \ + DOUBLE_ROUND(x), \ + DOUBLE_ROUND(x), \ + DOUBLE_ROUND(x), \ + DOUBLE_ROUND(x), \ + DOUBLE_ROUND(x), \ + DOUBLE_ROUND(x), \ + DOUBLE_ROUND(x), \ + DOUBLE_ROUND(x), \ + DOUBLE_ROUND(x), \ + DOUBLE_ROUND(x) \ +) + +enum chacha20_lengths { + CHACHA20_NONCE_SIZE = 16, + CHACHA20_KEY_SIZE = 32, + CHACHA20_KEY_WORDS = CHACHA20_KEY_SIZE / sizeof(uint32_t), + CHACHA20_BLOCK_SIZE = 64, + CHACHA20_BLOCK_WORDS = CHACHA20_BLOCK_SIZE / sizeof(uint32_t), + HCHACHA20_NONCE_SIZE = CHACHA20_NONCE_SIZE, + HCHACHA20_KEY_SIZE = CHACHA20_KEY_SIZE +}; + +enum chacha20_constants { /* expand 32-byte k */ + CHACHA20_CONSTANT_EXPA = 0x61707865U, + CHACHA20_CONSTANT_ND_3 = 0x3320646eU, + CHACHA20_CONSTANT_2_BY = 0x79622d32U, + CHACHA20_CONSTANT_TE_K = 0x6b206574U +}; + +struct chacha20_ctx { + union { + uint32_t state[16]; + struct { + uint32_t constant[4]; + uint32_t key[8]; + uint32_t counter[4]; + }; + }; +}; + +static void chacha20_init(struct chacha20_ctx *ctx, + const 
uint8_t key[CHACHA20_KEY_SIZE], + const uint64_t nonce) +{ + ctx->constant[0] = CHACHA20_CONSTANT_EXPA; + ctx->constant[1] = CHACHA20_CONSTANT_ND_3; + ctx->constant[2] = CHACHA20_CONSTANT_2_BY; + ctx->constant[3] = CHACHA20_CONSTANT_TE_K; + ctx->key[0] = get_unaligned_le32(key + 0); + ctx->key[1] = get_unaligned_le32(key + 4); + ctx->key[2] = get_unaligned_le32(key + 8); + ctx->key[3] = get_unaligned_le32(key + 12); + ctx->key[4] = get_unaligned_le32(key + 16); + ctx->key[5] = get_unaligned_le32(key + 20); + ctx->key[6] = get_unaligned_le32(key + 24); + ctx->key[7] = get_unaligned_le32(key + 28); + ctx->counter[0] = 0; + ctx->counter[1] = 0; + ctx->counter[2] = nonce & 0xffffffffU; + ctx->counter[3] = nonce >> 32; +} + +static void chacha20_block(struct chacha20_ctx *ctx, uint32_t *stream) +{ + uint32_t x[CHACHA20_BLOCK_WORDS]; + int i; + + for (i = 0; i < ARRAY_SIZE(x); ++i) + x[i] = ctx->state[i]; + + TWENTY_ROUNDS(x); + + for (i = 0; i < ARRAY_SIZE(x); ++i) + stream[i] = cpu_to_le32(x[i] + ctx->state[i]); + + ctx->counter[0] += 1; +} + +static void chacha20(struct chacha20_ctx *ctx, uint8_t *out, const uint8_t *in, + uint32_t len) +{ + uint32_t buf[CHACHA20_BLOCK_WORDS]; + + while (len >= CHACHA20_BLOCK_SIZE) { + chacha20_block(ctx, buf); + xor_cpy(out, in, (uint8_t *)buf, CHACHA20_BLOCK_SIZE); + len -= CHACHA20_BLOCK_SIZE; + out += CHACHA20_BLOCK_SIZE; + in += CHACHA20_BLOCK_SIZE; + } + if (len) { + chacha20_block(ctx, buf); + xor_cpy(out, in, (uint8_t *)buf, len); + } +} + +static void hchacha20(uint32_t derived_key[CHACHA20_KEY_WORDS], + const uint8_t nonce[HCHACHA20_NONCE_SIZE], + const uint8_t key[HCHACHA20_KEY_SIZE]) +{ + uint32_t x[] = { CHACHA20_CONSTANT_EXPA, + CHACHA20_CONSTANT_ND_3, + CHACHA20_CONSTANT_2_BY, + CHACHA20_CONSTANT_TE_K, + get_unaligned_le32(key + 0), + get_unaligned_le32(key + 4), + get_unaligned_le32(key + 8), + get_unaligned_le32(key + 12), + get_unaligned_le32(key + 16), + get_unaligned_le32(key + 20), + get_unaligned_le32(key + 24), + get_unaligned_le32(key + 28), + get_unaligned_le32(nonce + 0), + get_unaligned_le32(nonce + 4), + get_unaligned_le32(nonce + 8), + get_unaligned_le32(nonce + 12) + }; + + TWENTY_ROUNDS(x); + + memcpy(derived_key + 0, x + 0, sizeof(uint32_t) * 4); + memcpy(derived_key + 4, x + 12, sizeof(uint32_t) * 4); +} + +enum poly1305_lengths { + POLY1305_BLOCK_SIZE = 16, + POLY1305_KEY_SIZE = 32, + POLY1305_MAC_SIZE = 16 +}; + +struct poly1305_internal { + uint32_t h[5]; + uint32_t r[5]; + uint32_t s[4]; +}; + +struct poly1305_ctx { + struct poly1305_internal state; + uint32_t nonce[4]; + uint8_t data[POLY1305_BLOCK_SIZE]; + size_t num; +}; + +static void poly1305_init_core(struct poly1305_internal *st, + const uint8_t key[16]) +{ + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + st->r[0] = (get_unaligned_le32(&key[0])) & 0x3ffffff; + st->r[1] = (get_unaligned_le32(&key[3]) >> 2) & 0x3ffff03; + st->r[2] = (get_unaligned_le32(&key[6]) >> 4) & 0x3ffc0ff; + st->r[3] = (get_unaligned_le32(&key[9]) >> 6) & 0x3f03fff; + st->r[4] = (get_unaligned_le32(&key[12]) >> 8) & 0x00fffff; + + /* s = 5*r */ + st->s[0] = st->r[1] * 5; + st->s[1] = st->r[2] * 5; + st->s[2] = st->r[3] * 5; + st->s[3] = st->r[4] * 5; + + /* h = 0 */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + st->h[3] = 0; + st->h[4] = 0; +} + +static void poly1305_blocks_core(struct poly1305_internal *st, + const uint8_t *input, size_t len, + const uint32_t padbit) +{ + const uint32_t hibit = padbit << 24; + uint32_t r0, r1, r2, r3, r4; + uint32_t s1, s2, s3, s4; + uint32_t h0, h1, h2, 
h3, h4; + uint64_t d0, d1, d2, d3, d4; + uint32_t c; + + r0 = st->r[0]; + r1 = st->r[1]; + r2 = st->r[2]; + r3 = st->r[3]; + r4 = st->r[4]; + + s1 = st->s[0]; + s2 = st->s[1]; + s3 = st->s[2]; + s4 = st->s[3]; + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + h3 = st->h[3]; + h4 = st->h[4]; + + while (len >= POLY1305_BLOCK_SIZE) { + /* h += m[i] */ + h0 += (get_unaligned_le32(&input[0])) & 0x3ffffff; + h1 += (get_unaligned_le32(&input[3]) >> 2) & 0x3ffffff; + h2 += (get_unaligned_le32(&input[6]) >> 4) & 0x3ffffff; + h3 += (get_unaligned_le32(&input[9]) >> 6) & 0x3ffffff; + h4 += (get_unaligned_le32(&input[12]) >> 8) | hibit; + + /* h *= r */ + d0 = ((uint64_t)h0 * r0) + ((uint64_t)h1 * s4) + + ((uint64_t)h2 * s3) + ((uint64_t)h3 * s2) + + ((uint64_t)h4 * s1); + d1 = ((uint64_t)h0 * r1) + ((uint64_t)h1 * r0) + + ((uint64_t)h2 * s4) + ((uint64_t)h3 * s3) + + ((uint64_t)h4 * s2); + d2 = ((uint64_t)h0 * r2) + ((uint64_t)h1 * r1) + + ((uint64_t)h2 * r0) + ((uint64_t)h3 * s4) + + ((uint64_t)h4 * s3); + d3 = ((uint64_t)h0 * r3) + ((uint64_t)h1 * r2) + + ((uint64_t)h2 * r1) + ((uint64_t)h3 * r0) + + ((uint64_t)h4 * s4); + d4 = ((uint64_t)h0 * r4) + ((uint64_t)h1 * r3) + + ((uint64_t)h2 * r2) + ((uint64_t)h3 * r1) + + ((uint64_t)h4 * r0); + + /* (partial) h %= p */ + c = (uint32_t)(d0 >> 26); + h0 = (uint32_t)d0 & 0x3ffffff; + d1 += c; + c = (uint32_t)(d1 >> 26); + h1 = (uint32_t)d1 & 0x3ffffff; + d2 += c; + c = (uint32_t)(d2 >> 26); + h2 = (uint32_t)d2 & 0x3ffffff; + d3 += c; + c = (uint32_t)(d3 >> 26); + h3 = (uint32_t)d3 & 0x3ffffff; + d4 += c; + c = (uint32_t)(d4 >> 26); + h4 = (uint32_t)d4 & 0x3ffffff; + h0 += c * 5; + c = (h0 >> 26); + h0 = h0 & 0x3ffffff; + h1 += c; + + input += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + } + + st->h[0] = h0; + st->h[1] = h1; + st->h[2] = h2; + st->h[3] = h3; + st->h[4] = h4; +} + +static void poly1305_emit_core(struct poly1305_internal *st, uint8_t mac[16], + const uint32_t nonce[4]) +{ + uint32_t h0, h1, h2, h3, h4, c; + uint32_t g0, g1, g2, g3, g4; + uint64_t f; + uint32_t mask; + + /* fully carry h */ + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + h3 = st->h[3]; + h4 = st->h[4]; + + c = h1 >> 26; + h1 = h1 & 0x3ffffff; + h2 += c; + c = h2 >> 26; + h2 = h2 & 0x3ffffff; + h3 += c; + c = h3 >> 26; + h3 = h3 & 0x3ffffff; + h4 += c; + c = h4 >> 26; + h4 = h4 & 0x3ffffff; + h0 += c * 5; + c = h0 >> 26; + h0 = h0 & 0x3ffffff; + h1 += c; + + /* compute h + -p */ + g0 = h0 + 5; + c = g0 >> 26; + g0 &= 0x3ffffff; + g1 = h1 + c; + c = g1 >> 26; + g1 &= 0x3ffffff; + g2 = h2 + c; + c = g2 >> 26; + g2 &= 0x3ffffff; + g3 = h3 + c; + c = g3 >> 26; + g3 &= 0x3ffffff; + g4 = h4 + c - (1UL << 26); + + /* select h if h < p, or h + -p if h >= p */ + mask = (g4 >> ((sizeof(uint32_t) * 8) - 1)) - 1; + g0 &= mask; + g1 &= mask; + g2 &= mask; + g3 &= mask; + g4 &= mask; + mask = ~mask; + + h0 = (h0 & mask) | g0; + h1 = (h1 & mask) | g1; + h2 = (h2 & mask) | g2; + h3 = (h3 & mask) | g3; + h4 = (h4 & mask) | g4; + + /* h = h % (2^128) */ + h0 = ((h0) | (h1 << 26)) & 0xffffffff; + h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff; + h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff; + h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff; + + /* mac = (h + nonce) % (2^128) */ + f = (uint64_t)h0 + nonce[0]; + h0 = (uint32_t)f; + f = (uint64_t)h1 + nonce[1] + (f >> 32); + h1 = (uint32_t)f; + f = (uint64_t)h2 + nonce[2] + (f >> 32); + h2 = (uint32_t)f; + f = (uint64_t)h3 + nonce[3] + (f >> 32); + h3 = (uint32_t)f; + + put_unaligned_le32(h0, &mac[0]); + put_unaligned_le32(h1, &mac[4]); + 
put_unaligned_le32(h2, &mac[8]); + put_unaligned_le32(h3, &mac[12]); +} + +static void poly1305_init(struct poly1305_ctx *ctx, + const uint8_t key[POLY1305_KEY_SIZE]) +{ + ctx->nonce[0] = get_unaligned_le32(&key[16]); + ctx->nonce[1] = get_unaligned_le32(&key[20]); + ctx->nonce[2] = get_unaligned_le32(&key[24]); + ctx->nonce[3] = get_unaligned_le32(&key[28]); + + poly1305_init_core(&ctx->state, key); + + ctx->num = 0; +} + +static void poly1305_update(struct poly1305_ctx *ctx, const uint8_t *input, + size_t len) +{ + const size_t num = ctx->num; + size_t rem; + + if (num) { + rem = POLY1305_BLOCK_SIZE - num; + if (len < rem) { + memcpy(ctx->data + num, input, len); + ctx->num = num + len; + return; + } + memcpy(ctx->data + num, input, rem); + poly1305_blocks_core(&ctx->state, ctx->data, + POLY1305_BLOCK_SIZE, 1); + input += rem; + len -= rem; + } + + rem = len % POLY1305_BLOCK_SIZE; + len -= rem; + + if (len >= POLY1305_BLOCK_SIZE) { + poly1305_blocks_core(&ctx->state, input, len, 1); + input += len; + } + + if (rem) + memcpy(ctx->data, input, rem); + + ctx->num = rem; +} + +static void poly1305_final(struct poly1305_ctx *ctx, + uint8_t mac[POLY1305_MAC_SIZE]) +{ + size_t num = ctx->num; + + if (num) { + ctx->data[num++] = 1; + while (num < POLY1305_BLOCK_SIZE) + ctx->data[num++] = 0; + poly1305_blocks_core(&ctx->state, ctx->data, + POLY1305_BLOCK_SIZE, 0); + } + + poly1305_emit_core(&ctx->state, mac, ctx->nonce); + + explicit_bzero(ctx, sizeof(*ctx)); +} + + +static const uint8_t pad0[16] = { 0 }; + +void +chacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src, const size_t src_len, + const uint8_t *ad, const size_t ad_len, + const uint64_t nonce, + const uint8_t key[CHACHA20POLY1305_KEY_SIZE]) +{ + struct poly1305_ctx poly1305_state; + struct chacha20_ctx chacha20_state; + union { + uint8_t block0[POLY1305_KEY_SIZE]; + uint64_t lens[2]; + } b = { { 0 } }; + + chacha20_init(&chacha20_state, key, nonce); + chacha20(&chacha20_state, b.block0, b.block0, sizeof(b.block0)); + poly1305_init(&poly1305_state, b.block0); + + poly1305_update(&poly1305_state, ad, ad_len); + poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf); + + chacha20(&chacha20_state, dst, src, src_len); + + poly1305_update(&poly1305_state, dst, src_len); + poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf); + + b.lens[0] = cpu_to_le64(ad_len); + b.lens[1] = cpu_to_le64(src_len); + poly1305_update(&poly1305_state, (uint8_t *)b.lens, sizeof(b.lens)); + + poly1305_final(&poly1305_state, dst + src_len); + + explicit_bzero(&chacha20_state, sizeof(chacha20_state)); + explicit_bzero(&b, sizeof(b)); +} + +bool +chacha20poly1305_decrypt(uint8_t *dst, const uint8_t *src, const size_t src_len, + const uint8_t *ad, const size_t ad_len, + const uint64_t nonce, + const uint8_t key[CHACHA20POLY1305_KEY_SIZE]) +{ + struct poly1305_ctx poly1305_state; + struct chacha20_ctx chacha20_state; + bool ret; + size_t dst_len; + union { + uint8_t block0[POLY1305_KEY_SIZE]; + uint8_t mac[POLY1305_MAC_SIZE]; + uint64_t lens[2]; + } b = { { 0 } }; + + if (src_len < POLY1305_MAC_SIZE) + return false; + + chacha20_init(&chacha20_state, key, nonce); + chacha20(&chacha20_state, b.block0, b.block0, sizeof(b.block0)); + poly1305_init(&poly1305_state, b.block0); + + poly1305_update(&poly1305_state, ad, ad_len); + poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf); + + dst_len = src_len - POLY1305_MAC_SIZE; + poly1305_update(&poly1305_state, src, dst_len); + poly1305_update(&poly1305_state, pad0, (0x10 - dst_len) & 0xf); + + 
b.lens[0] = cpu_to_le64(ad_len); + b.lens[1] = cpu_to_le64(dst_len); + poly1305_update(&poly1305_state, (uint8_t *)b.lens, sizeof(b.lens)); + + poly1305_final(&poly1305_state, b.mac); + + ret = timingsafe_bcmp(b.mac, src + dst_len, POLY1305_MAC_SIZE) == 0; + if (ret) + chacha20(&chacha20_state, dst, src, dst_len); + + explicit_bzero(&chacha20_state, sizeof(chacha20_state)); + explicit_bzero(&b, sizeof(b)); + + return ret; +} + +void +xchacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src, + const size_t src_len, const uint8_t *ad, + const size_t ad_len, + const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE], + const uint8_t key[CHACHA20POLY1305_KEY_SIZE]) +{ + uint32_t derived_key[CHACHA20_KEY_WORDS]; + + hchacha20(derived_key, nonce, key); + cpu_to_le32_array(derived_key, ARRAY_SIZE(derived_key)); + chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, + get_unaligned_le64(nonce + 16), + (uint8_t *)derived_key); + explicit_bzero(derived_key, CHACHA20POLY1305_KEY_SIZE); +} + +bool +xchacha20poly1305_decrypt(uint8_t *dst, const uint8_t *src, + const size_t src_len, const uint8_t *ad, + const size_t ad_len, + const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE], + const uint8_t key[CHACHA20POLY1305_KEY_SIZE]) +{ + bool ret; + uint32_t derived_key[CHACHA20_KEY_WORDS]; + + hchacha20(derived_key, nonce, key); + cpu_to_le32_array(derived_key, ARRAY_SIZE(derived_key)); + ret = chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len, + get_unaligned_le64(nonce + 16), + (uint8_t *)derived_key); + explicit_bzero(derived_key, CHACHA20POLY1305_KEY_SIZE); + return ret; +} + + +static const uint32_t blake2s_iv[8] = { + 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL +}; + +static const uint8_t blake2s_sigma[10][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, +}; + +static inline void blake2s_set_lastblock(struct blake2s_state *state) +{ + state->f[0] = -1; +} + +static inline void blake2s_increment_counter(struct blake2s_state *state, + const uint32_t inc) +{ + state->t[0] += inc; + state->t[1] += (state->t[0] < inc); +} + +static inline void blake2s_init_param(struct blake2s_state *state, + const uint32_t param) +{ + int i; + + memset(state, 0, sizeof(*state)); + for (i = 0; i < 8; ++i) + state->h[i] = blake2s_iv[i]; + state->h[0] ^= param; +} + +void blake2s_init(struct blake2s_state *state, const size_t outlen) +{ + blake2s_init_param(state, 0x01010000 | outlen); + state->outlen = outlen; +} + +void blake2s_init_key(struct blake2s_state *state, const size_t outlen, + const uint8_t *key, const size_t keylen) +{ + uint8_t block[BLAKE2S_BLOCK_SIZE] = { 0 }; + + blake2s_init_param(state, 0x01010000 | keylen << 8 | outlen); + state->outlen = outlen; + memcpy(block, key, keylen); + blake2s_update(state, block, BLAKE2S_BLOCK_SIZE); + explicit_bzero(block, BLAKE2S_BLOCK_SIZE); +} + +static inline void blake2s_compress(struct blake2s_state *state, + const uint8_t *block, size_t nblocks, + const 
uint32_t inc) +{ + uint32_t m[16]; + uint32_t v[16]; + int i; + + while (nblocks > 0) { + blake2s_increment_counter(state, inc); + memcpy(m, block, BLAKE2S_BLOCK_SIZE); + le32_to_cpu_array(m, ARRAY_SIZE(m)); + memcpy(v, state->h, 32); + v[ 8] = blake2s_iv[0]; + v[ 9] = blake2s_iv[1]; + v[10] = blake2s_iv[2]; + v[11] = blake2s_iv[3]; + v[12] = blake2s_iv[4] ^ state->t[0]; + v[13] = blake2s_iv[5] ^ state->t[1]; + v[14] = blake2s_iv[6] ^ state->f[0]; + v[15] = blake2s_iv[7] ^ state->f[1]; + +#define G(r, i, a, b, c, d) do { \ + a += b + m[blake2s_sigma[r][2 * i + 0]]; \ + d = ror32(d ^ a, 16); \ + c += d; \ + b = ror32(b ^ c, 12); \ + a += b + m[blake2s_sigma[r][2 * i + 1]]; \ + d = ror32(d ^ a, 8); \ + c += d; \ + b = ror32(b ^ c, 7); \ +} while (0) + +#define ROUND(r) do { \ + G(r, 0, v[0], v[ 4], v[ 8], v[12]); \ + G(r, 1, v[1], v[ 5], v[ 9], v[13]); \ + G(r, 2, v[2], v[ 6], v[10], v[14]); \ + G(r, 3, v[3], v[ 7], v[11], v[15]); \ + G(r, 4, v[0], v[ 5], v[10], v[15]); \ + G(r, 5, v[1], v[ 6], v[11], v[12]); \ + G(r, 6, v[2], v[ 7], v[ 8], v[13]); \ + G(r, 7, v[3], v[ 4], v[ 9], v[14]); \ +} while (0) + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + +#undef G +#undef ROUND + + for (i = 0; i < 8; ++i) + state->h[i] ^= v[i] ^ v[i + 8]; + + block += BLAKE2S_BLOCK_SIZE; + --nblocks; + } +} + +void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen) +{ + const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; + + if (!inlen) + return; + if (inlen > fill) { + memcpy(state->buf + state->buflen, in, fill); + blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); + state->buflen = 0; + in += fill; + inlen -= fill; + } + if (inlen > BLAKE2S_BLOCK_SIZE) { + const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); + /* Hash one less (full) block than strictly possible */ + blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); + in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); + inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); + } + memcpy(state->buf + state->buflen, in, inlen); + state->buflen += inlen; +} + +void blake2s_final(struct blake2s_state *state, uint8_t *out) +{ + blake2s_set_lastblock(state); + memset(state->buf + state->buflen, 0, + BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ + blake2s_compress(state, state->buf, 1, state->buflen); + cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); + memcpy(out, state->h, state->outlen); + explicit_bzero(state, sizeof(*state)); +} + +void blake2s(uint8_t *out, const uint8_t *in, const uint8_t *key, + const size_t outlen, const size_t inlen, const size_t keylen) +{ + struct blake2s_state state; + + if (keylen) + blake2s_init_key(&state, outlen, key, keylen); + else + blake2s_init(&state, outlen); + + blake2s_update(&state, in, inlen); + blake2s_final(&state, out); +} + +void blake2s_hmac(uint8_t *out, const uint8_t *in, const uint8_t *key, const size_t outlen, + const size_t inlen, const size_t keylen) +{ + struct blake2s_state state; + uint8_t x_key[BLAKE2S_BLOCK_SIZE] __aligned(sizeof(uint32_t)) = { 0 }; + uint8_t i_hash[BLAKE2S_HASH_SIZE] __aligned(sizeof(uint32_t)); + int i; + + if (keylen > BLAKE2S_BLOCK_SIZE) { + blake2s_init(&state, BLAKE2S_HASH_SIZE); + blake2s_update(&state, key, keylen); + blake2s_final(&state, x_key); + } else + memcpy(x_key, key, keylen); + + for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i) + x_key[i] ^= 0x36; + + blake2s_init(&state, BLAKE2S_HASH_SIZE); + blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE); + 
blake2s_update(&state, in, inlen); + blake2s_final(&state, i_hash); + + for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i) + x_key[i] ^= 0x5c ^ 0x36; + + blake2s_init(&state, BLAKE2S_HASH_SIZE); + blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE); + blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE); + blake2s_final(&state, i_hash); + + memcpy(out, i_hash, outlen); + explicit_bzero(x_key, BLAKE2S_BLOCK_SIZE); + explicit_bzero(i_hash, BLAKE2S_HASH_SIZE); +} + + +/* Below here is fiat's implementation of x25519. + * + * Copyright (C) 2015-2016 The fiat-crypto Authors. + * Copyright (C) 2018-2021 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * + * This is a machine-generated formally verified implementation of Curve25519 + * ECDH from: <https://github.com/mit-plv/fiat-crypto>. Though originally + * machine generated, it has been tweaked to be suitable for use in the kernel. + * It is optimized for 32-bit machines and machines that cannot work efficiently + * with 128-bit integer types. + */ + +/* fe means field element. Here the field is \Z/(2^255-19). An element t, + * entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77 + * t[3]+2^102 t[4]+...+2^230 t[9]. + * fe limbs are bounded by 1.125*2^26,1.125*2^25,1.125*2^26,1.125*2^25,etc. + * Multiplication and carrying produce fe from fe_loose. + */ +typedef struct fe { uint32_t v[10]; } fe; + +/* fe_loose limbs are bounded by 3.375*2^26,3.375*2^25,3.375*2^26,3.375*2^25,etc + * Addition and subtraction produce fe_loose from (fe, fe). + */ +typedef struct fe_loose { uint32_t v[10]; } fe_loose; + +static inline void fe_frombytes_impl(uint32_t h[10], const uint8_t *s) +{ + /* Ignores top bit of s. */ + uint32_t a0 = get_unaligned_le32(s); + uint32_t a1 = get_unaligned_le32(s+4); + uint32_t a2 = get_unaligned_le32(s+8); + uint32_t a3 = get_unaligned_le32(s+12); + uint32_t a4 = get_unaligned_le32(s+16); + uint32_t a5 = get_unaligned_le32(s+20); + uint32_t a6 = get_unaligned_le32(s+24); + uint32_t a7 = get_unaligned_le32(s+28); + h[0] = a0&((1<<26)-1); /* 26 used, 32-26 left. 26 */ + h[1] = (a0>>26) | ((a1&((1<<19)-1))<< 6); /* (32-26) + 19 = 6+19 = 25 */ + h[2] = (a1>>19) | ((a2&((1<<13)-1))<<13); /* (32-19) + 13 = 13+13 = 26 */ + h[3] = (a2>>13) | ((a3&((1<< 6)-1))<<19); /* (32-13) + 6 = 19+ 6 = 25 */ + h[4] = (a3>> 6); /* (32- 6) = 26 */ + h[5] = a4&((1<<25)-1); /* 25 */ + h[6] = (a4>>25) | ((a5&((1<<19)-1))<< 7); /* (32-25) + 19 = 7+19 = 26 */ + h[7] = (a5>>19) | ((a6&((1<<12)-1))<<13); /* (32-19) + 12 = 13+12 = 25 */ + h[8] = (a6>>12) | ((a7&((1<< 6)-1))<<20); /* (32-12) + 6 = 20+ 6 = 26 */ + h[9] = (a7>> 6)&((1<<25)-1); /* 25 */ +} + +static inline void fe_frombytes(fe *h, const uint8_t *s) +{ + fe_frombytes_impl(h->v, s); +} + +static inline uint8_t /*bool*/ +addcarryx_u25(uint8_t /*bool*/ c, uint32_t a, uint32_t b, uint32_t *low) +{ + /* This function extracts 25 bits of result and 1 bit of carry + * (26 total), so a 32-bit intermediate is sufficient. + */ + uint32_t x = a + b + c; + *low = x & ((1 << 25) - 1); + return (x >> 25) & 1; +} + +static inline uint8_t /*bool*/ +addcarryx_u26(uint8_t /*bool*/ c, uint32_t a, uint32_t b, uint32_t *low) +{ + /* This function extracts 26 bits of result and 1 bit of carry + * (27 total), so a 32-bit intermediate is sufficient. 
+ */ + uint32_t x = a + b + c; + *low = x & ((1 << 26) - 1); + return (x >> 26) & 1; +} + +static inline uint8_t /*bool*/ +subborrow_u25(uint8_t /*bool*/ c, uint32_t a, uint32_t b, uint32_t *low) +{ + /* This function extracts 25 bits of result and 1 bit of borrow + * (26 total), so a 32-bit intermediate is sufficient. + */ + uint32_t x = a - b - c; + *low = x & ((1 << 25) - 1); + return x >> 31; +} + +static inline uint8_t /*bool*/ +subborrow_u26(uint8_t /*bool*/ c, uint32_t a, uint32_t b, uint32_t *low) +{ + /* This function extracts 26 bits of result and 1 bit of borrow + *(27 total), so a 32-bit intermediate is sufficient. + */ + uint32_t x = a - b - c; + *low = x & ((1 << 26) - 1); + return x >> 31; +} + +static inline uint32_t cmovznz32(uint32_t t, uint32_t z, uint32_t nz) +{ + t = -!!t; /* all set if nonzero, 0 if 0 */ + return (t&nz) | ((~t)&z); +} + +static inline void fe_freeze(uint32_t out[10], const uint32_t in1[10]) +{ + const uint32_t x17 = in1[9]; + const uint32_t x18 = in1[8]; + const uint32_t x16 = in1[7]; + const uint32_t x14 = in1[6]; + const uint32_t x12 = in1[5]; + const uint32_t x10 = in1[4]; + const uint32_t x8 = in1[3]; + const uint32_t x6 = in1[2]; + const uint32_t x4 = in1[1]; + const uint32_t x2 = in1[0]; + uint32_t x20; uint8_t/*bool*/ x21 = subborrow_u26(0x0, x2, 0x3ffffed, &x20); + uint32_t x23; uint8_t/*bool*/ x24 = subborrow_u25(x21, x4, 0x1ffffff, &x23); + uint32_t x26; uint8_t/*bool*/ x27 = subborrow_u26(x24, x6, 0x3ffffff, &x26); + uint32_t x29; uint8_t/*bool*/ x30 = subborrow_u25(x27, x8, 0x1ffffff, &x29); + uint32_t x32; uint8_t/*bool*/ x33 = subborrow_u26(x30, x10, 0x3ffffff, &x32); + uint32_t x35; uint8_t/*bool*/ x36 = subborrow_u25(x33, x12, 0x1ffffff, &x35); + uint32_t x38; uint8_t/*bool*/ x39 = subborrow_u26(x36, x14, 0x3ffffff, &x38); + uint32_t x41; uint8_t/*bool*/ x42 = subborrow_u25(x39, x16, 0x1ffffff, &x41); + uint32_t x44; uint8_t/*bool*/ x45 = subborrow_u26(x42, x18, 0x3ffffff, &x44); + uint32_t x47; uint8_t/*bool*/ x48 = subborrow_u25(x45, x17, 0x1ffffff, &x47); + uint32_t x49 = cmovznz32(x48, 0x0, 0xffffffff); + uint32_t x50 = (x49 & 0x3ffffed); + uint32_t x52; uint8_t/*bool*/ x53 = addcarryx_u26(0x0, x20, x50, &x52); + uint32_t x54 = (x49 & 0x1ffffff); + uint32_t x56; uint8_t/*bool*/ x57 = addcarryx_u25(x53, x23, x54, &x56); + uint32_t x58 = (x49 & 0x3ffffff); + uint32_t x60; uint8_t/*bool*/ x61 = addcarryx_u26(x57, x26, x58, &x60); + uint32_t x62 = (x49 & 0x1ffffff); + uint32_t x64; uint8_t/*bool*/ x65 = addcarryx_u25(x61, x29, x62, &x64); + uint32_t x66 = (x49 & 0x3ffffff); + uint32_t x68; uint8_t/*bool*/ x69 = addcarryx_u26(x65, x32, x66, &x68); + uint32_t x70 = (x49 & 0x1ffffff); + uint32_t x72; uint8_t/*bool*/ x73 = addcarryx_u25(x69, x35, x70, &x72); + uint32_t x74 = (x49 & 0x3ffffff); + uint32_t x76; uint8_t/*bool*/ x77 = addcarryx_u26(x73, x38, x74, &x76); + uint32_t x78 = (x49 & 0x1ffffff); + uint32_t x80; uint8_t/*bool*/ x81 = addcarryx_u25(x77, x41, x78, &x80); + uint32_t x82 = (x49 & 0x3ffffff); + uint32_t x84; uint8_t/*bool*/ x85 = addcarryx_u26(x81, x44, x82, &x84); + uint32_t x86 = (x49 & 0x1ffffff); + uint32_t x88; addcarryx_u25(x85, x47, x86, &x88); + out[0] = x52; + out[1] = x56; + out[2] = x60; + out[3] = x64; + out[4] = x68; + out[5] = x72; + out[6] = x76; + out[7] = x80; + out[8] = x84; + out[9] = x88; +} + +static inline void fe_tobytes(uint8_t s[32], const fe *f) +{ + uint32_t h[10]; + fe_freeze(h, f->v); + s[0] = h[0] >> 0; + s[1] = h[0] >> 8; + s[2] = h[0] >> 16; + s[3] = (h[0] >> 24) | (h[1] << 2); + s[4] = h[1] 
>> 6; + s[5] = h[1] >> 14; + s[6] = (h[1] >> 22) | (h[2] << 3); + s[7] = h[2] >> 5; + s[8] = h[2] >> 13; + s[9] = (h[2] >> 21) | (h[3] << 5); + s[10] = h[3] >> 3; + s[11] = h[3] >> 11; + s[12] = (h[3] >> 19) | (h[4] << 6); + s[13] = h[4] >> 2; + s[14] = h[4] >> 10; + s[15] = h[4] >> 18; + s[16] = h[5] >> 0; + s[17] = h[5] >> 8; + s[18] = h[5] >> 16; + s[19] = (h[5] >> 24) | (h[6] << 1); + s[20] = h[6] >> 7; + s[21] = h[6] >> 15; + s[22] = (h[6] >> 23) | (h[7] << 3); + s[23] = h[7] >> 5; + s[24] = h[7] >> 13; + s[25] = (h[7] >> 21) | (h[8] << 4); + s[26] = h[8] >> 4; + s[27] = h[8] >> 12; + s[28] = (h[8] >> 20) | (h[9] << 6); + s[29] = h[9] >> 2; + s[30] = h[9] >> 10; + s[31] = h[9] >> 18; +} + +/* h = f */ +static inline void fe_copy(fe *h, const fe *f) +{ + memmove(h, f, sizeof(uint32_t) * 10); +} + +static inline void fe_copy_lt(fe_loose *h, const fe *f) +{ + memmove(h, f, sizeof(uint32_t) * 10); +} + +/* h = 0 */ +static inline void fe_0(fe *h) +{ + memset(h, 0, sizeof(uint32_t) * 10); +} + +/* h = 1 */ +static inline void fe_1(fe *h) +{ + memset(h, 0, sizeof(uint32_t) * 10); + h->v[0] = 1; +} + +static void fe_add_impl(uint32_t out[10], const uint32_t in1[10], const uint32_t in2[10]) +{ + const uint32_t x20 = in1[9]; + const uint32_t x21 = in1[8]; + const uint32_t x19 = in1[7]; + const uint32_t x17 = in1[6]; + const uint32_t x15 = in1[5]; + const uint32_t x13 = in1[4]; + const uint32_t x11 = in1[3]; + const uint32_t x9 = in1[2]; + const uint32_t x7 = in1[1]; + const uint32_t x5 = in1[0]; + const uint32_t x38 = in2[9]; + const uint32_t x39 = in2[8]; + const uint32_t x37 = in2[7]; + const uint32_t x35 = in2[6]; + const uint32_t x33 = in2[5]; + const uint32_t x31 = in2[4]; + const uint32_t x29 = in2[3]; + const uint32_t x27 = in2[2]; + const uint32_t x25 = in2[1]; + const uint32_t x23 = in2[0]; + out[0] = (x5 + x23); + out[1] = (x7 + x25); + out[2] = (x9 + x27); + out[3] = (x11 + x29); + out[4] = (x13 + x31); + out[5] = (x15 + x33); + out[6] = (x17 + x35); + out[7] = (x19 + x37); + out[8] = (x21 + x39); + out[9] = (x20 + x38); +} + +/* h = f + g + * Can overlap h with f or g. + */ +static inline void fe_add(fe_loose *h, const fe *f, const fe *g) +{ + fe_add_impl(h->v, f->v, g->v); +} + +static void fe_sub_impl(uint32_t out[10], const uint32_t in1[10], const uint32_t in2[10]) +{ + const uint32_t x20 = in1[9]; + const uint32_t x21 = in1[8]; + const uint32_t x19 = in1[7]; + const uint32_t x17 = in1[6]; + const uint32_t x15 = in1[5]; + const uint32_t x13 = in1[4]; + const uint32_t x11 = in1[3]; + const uint32_t x9 = in1[2]; + const uint32_t x7 = in1[1]; + const uint32_t x5 = in1[0]; + const uint32_t x38 = in2[9]; + const uint32_t x39 = in2[8]; + const uint32_t x37 = in2[7]; + const uint32_t x35 = in2[6]; + const uint32_t x33 = in2[5]; + const uint32_t x31 = in2[4]; + const uint32_t x29 = in2[3]; + const uint32_t x27 = in2[2]; + const uint32_t x25 = in2[1]; + const uint32_t x23 = in2[0]; + out[0] = ((0x7ffffda + x5) - x23); + out[1] = ((0x3fffffe + x7) - x25); + out[2] = ((0x7fffffe + x9) - x27); + out[3] = ((0x3fffffe + x11) - x29); + out[4] = ((0x7fffffe + x13) - x31); + out[5] = ((0x3fffffe + x15) - x33); + out[6] = ((0x7fffffe + x17) - x35); + out[7] = ((0x3fffffe + x19) - x37); + out[8] = ((0x7fffffe + x21) - x39); + out[9] = ((0x3fffffe + x20) - x38); +} + +/* h = f - g + * Can overlap h with f or g. 
+ */ +static inline void fe_sub(fe_loose *h, const fe *f, const fe *g) +{ + fe_sub_impl(h->v, f->v, g->v); +} + +static void fe_mul_impl(uint32_t out[10], const uint32_t in1[10], const uint32_t in2[10]) +{ + const uint32_t x20 = in1[9]; + const uint32_t x21 = in1[8]; + const uint32_t x19 = in1[7]; + const uint32_t x17 = in1[6]; + const uint32_t x15 = in1[5]; + const uint32_t x13 = in1[4]; + const uint32_t x11 = in1[3]; + const uint32_t x9 = in1[2]; + const uint32_t x7 = in1[1]; + const uint32_t x5 = in1[0]; + const uint32_t x38 = in2[9]; + const uint32_t x39 = in2[8]; + const uint32_t x37 = in2[7]; + const uint32_t x35 = in2[6]; + const uint32_t x33 = in2[5]; + const uint32_t x31 = in2[4]; + const uint32_t x29 = in2[3]; + const uint32_t x27 = in2[2]; + const uint32_t x25 = in2[1]; + const uint32_t x23 = in2[0]; + uint64_t x40 = ((uint64_t)x23 * x5); + uint64_t x41 = (((uint64_t)x23 * x7) + ((uint64_t)x25 * x5)); + uint64_t x42 = ((((uint64_t)(0x2 * x25) * x7) + ((uint64_t)x23 * x9)) + ((uint64_t)x27 * x5)); + uint64_t x43 = (((((uint64_t)x25 * x9) + ((uint64_t)x27 * x7)) + ((uint64_t)x23 * x11)) + ((uint64_t)x29 * x5)); + uint64_t x44 = (((((uint64_t)x27 * x9) + (0x2 * (((uint64_t)x25 * x11) + ((uint64_t)x29 * x7)))) + ((uint64_t)x23 * x13)) + ((uint64_t)x31 * x5)); + uint64_t x45 = (((((((uint64_t)x27 * x11) + ((uint64_t)x29 * x9)) + ((uint64_t)x25 * x13)) + ((uint64_t)x31 * x7)) + ((uint64_t)x23 * x15)) + ((uint64_t)x33 * x5)); + uint64_t x46 = (((((0x2 * ((((uint64_t)x29 * x11) + ((uint64_t)x25 * x15)) + ((uint64_t)x33 * x7))) + ((uint64_t)x27 * x13)) + ((uint64_t)x31 * x9)) + ((uint64_t)x23 * x17)) + ((uint64_t)x35 * x5)); + uint64_t x47 = (((((((((uint64_t)x29 * x13) + ((uint64_t)x31 * x11)) + ((uint64_t)x27 * x15)) + ((uint64_t)x33 * x9)) + ((uint64_t)x25 * x17)) + ((uint64_t)x35 * x7)) + ((uint64_t)x23 * x19)) + ((uint64_t)x37 * x5)); + uint64_t x48 = (((((((uint64_t)x31 * x13) + (0x2 * (((((uint64_t)x29 * x15) + ((uint64_t)x33 * x11)) + ((uint64_t)x25 * x19)) + ((uint64_t)x37 * x7)))) + ((uint64_t)x27 * x17)) + ((uint64_t)x35 * x9)) + ((uint64_t)x23 * x21)) + ((uint64_t)x39 * x5)); + uint64_t x49 = (((((((((((uint64_t)x31 * x15) + ((uint64_t)x33 * x13)) + ((uint64_t)x29 * x17)) + ((uint64_t)x35 * x11)) + ((uint64_t)x27 * x19)) + ((uint64_t)x37 * x9)) + ((uint64_t)x25 * x21)) + ((uint64_t)x39 * x7)) + ((uint64_t)x23 * x20)) + ((uint64_t)x38 * x5)); + uint64_t x50 = (((((0x2 * ((((((uint64_t)x33 * x15) + ((uint64_t)x29 * x19)) + ((uint64_t)x37 * x11)) + ((uint64_t)x25 * x20)) + ((uint64_t)x38 * x7))) + ((uint64_t)x31 * x17)) + ((uint64_t)x35 * x13)) + ((uint64_t)x27 * x21)) + ((uint64_t)x39 * x9)); + uint64_t x51 = (((((((((uint64_t)x33 * x17) + ((uint64_t)x35 * x15)) + ((uint64_t)x31 * x19)) + ((uint64_t)x37 * x13)) + ((uint64_t)x29 * x21)) + ((uint64_t)x39 * x11)) + ((uint64_t)x27 * x20)) + ((uint64_t)x38 * x9)); + uint64_t x52 = (((((uint64_t)x35 * x17) + (0x2 * (((((uint64_t)x33 * x19) + ((uint64_t)x37 * x15)) + ((uint64_t)x29 * x20)) + ((uint64_t)x38 * x11)))) + ((uint64_t)x31 * x21)) + ((uint64_t)x39 * x13)); + uint64_t x53 = (((((((uint64_t)x35 * x19) + ((uint64_t)x37 * x17)) + ((uint64_t)x33 * x21)) + ((uint64_t)x39 * x15)) + ((uint64_t)x31 * x20)) + ((uint64_t)x38 * x13)); + uint64_t x54 = (((0x2 * ((((uint64_t)x37 * x19) + ((uint64_t)x33 * x20)) + ((uint64_t)x38 * x15))) + ((uint64_t)x35 * x21)) + ((uint64_t)x39 * x17)); + uint64_t x55 = (((((uint64_t)x37 * x21) + ((uint64_t)x39 * x19)) + ((uint64_t)x35 * x20)) + ((uint64_t)x38 * x17)); + uint64_t x56 = (((uint64_t)x39 * 
x21) + (0x2 * (((uint64_t)x37 * x20) + ((uint64_t)x38 * x19)))); + uint64_t x57 = (((uint64_t)x39 * x20) + ((uint64_t)x38 * x21)); + uint64_t x58 = ((uint64_t)(0x2 * x38) * x20); + uint64_t x59 = (x48 + (x58 << 0x4)); + uint64_t x60 = (x59 + (x58 << 0x1)); + uint64_t x61 = (x60 + x58); + uint64_t x62 = (x47 + (x57 << 0x4)); + uint64_t x63 = (x62 + (x57 << 0x1)); + uint64_t x64 = (x63 + x57); + uint64_t x65 = (x46 + (x56 << 0x4)); + uint64_t x66 = (x65 + (x56 << 0x1)); + uint64_t x67 = (x66 + x56); + uint64_t x68 = (x45 + (x55 << 0x4)); + uint64_t x69 = (x68 + (x55 << 0x1)); + uint64_t x70 = (x69 + x55); + uint64_t x71 = (x44 + (x54 << 0x4)); + uint64_t x72 = (x71 + (x54 << 0x1)); + uint64_t x73 = (x72 + x54); + uint64_t x74 = (x43 + (x53 << 0x4)); + uint64_t x75 = (x74 + (x53 << 0x1)); + uint64_t x76 = (x75 + x53); + uint64_t x77 = (x42 + (x52 << 0x4)); + uint64_t x78 = (x77 + (x52 << 0x1)); + uint64_t x79 = (x78 + x52); + uint64_t x80 = (x41 + (x51 << 0x4)); + uint64_t x81 = (x80 + (x51 << 0x1)); + uint64_t x82 = (x81 + x51); + uint64_t x83 = (x40 + (x50 << 0x4)); + uint64_t x84 = (x83 + (x50 << 0x1)); + uint64_t x85 = (x84 + x50); + uint64_t x86 = (x85 >> 0x1a); + uint32_t x87 = ((uint32_t)x85 & 0x3ffffff); + uint64_t x88 = (x86 + x82); + uint64_t x89 = (x88 >> 0x19); + uint32_t x90 = ((uint32_t)x88 & 0x1ffffff); + uint64_t x91 = (x89 + x79); + uint64_t x92 = (x91 >> 0x1a); + uint32_t x93 = ((uint32_t)x91 & 0x3ffffff); + uint64_t x94 = (x92 + x76); + uint64_t x95 = (x94 >> 0x19); + uint32_t x96 = ((uint32_t)x94 & 0x1ffffff); + uint64_t x97 = (x95 + x73); + uint64_t x98 = (x97 >> 0x1a); + uint32_t x99 = ((uint32_t)x97 & 0x3ffffff); + uint64_t x100 = (x98 + x70); + uint64_t x101 = (x100 >> 0x19); + uint32_t x102 = ((uint32_t)x100 & 0x1ffffff); + uint64_t x103 = (x101 + x67); + uint64_t x104 = (x103 >> 0x1a); + uint32_t x105 = ((uint32_t)x103 & 0x3ffffff); + uint64_t x106 = (x104 + x64); + uint64_t x107 = (x106 >> 0x19); + uint32_t x108 = ((uint32_t)x106 & 0x1ffffff); + uint64_t x109 = (x107 + x61); + uint64_t x110 = (x109 >> 0x1a); + uint32_t x111 = ((uint32_t)x109 & 0x3ffffff); + uint64_t x112 = (x110 + x49); + uint64_t x113 = (x112 >> 0x19); + uint32_t x114 = ((uint32_t)x112 & 0x1ffffff); + uint64_t x115 = (x87 + (0x13 * x113)); + uint32_t x116 = (uint32_t) (x115 >> 0x1a); + uint32_t x117 = ((uint32_t)x115 & 0x3ffffff); + uint32_t x118 = (x116 + x90); + uint32_t x119 = (x118 >> 0x19); + uint32_t x120 = (x118 & 0x1ffffff); + out[0] = x117; + out[1] = x120; + out[2] = (x119 + x93); + out[3] = x96; + out[4] = x99; + out[5] = x102; + out[6] = x105; + out[7] = x108; + out[8] = x111; + out[9] = x114; +} + +static inline void fe_mul_ttt(fe *h, const fe *f, const fe *g) +{ + fe_mul_impl(h->v, f->v, g->v); +} + +static inline void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) +{ + fe_mul_impl(h->v, f->v, g->v); +} + +static inline void +fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) +{ + fe_mul_impl(h->v, f->v, g->v); +} + +static void fe_sqr_impl(uint32_t out[10], const uint32_t in1[10]) +{ + const uint32_t x17 = in1[9]; + const uint32_t x18 = in1[8]; + const uint32_t x16 = in1[7]; + const uint32_t x14 = in1[6]; + const uint32_t x12 = in1[5]; + const uint32_t x10 = in1[4]; + const uint32_t x8 = in1[3]; + const uint32_t x6 = in1[2]; + const uint32_t x4 = in1[1]; + const uint32_t x2 = in1[0]; + uint64_t x19 = ((uint64_t)x2 * x2); + uint64_t x20 = ((uint64_t)(0x2 * x2) * x4); + uint64_t x21 = (0x2 * (((uint64_t)x4 * x4) + ((uint64_t)x2 * x6))); + uint64_t x22 = (0x2 * (((uint64_t)x4 * 
x6) + ((uint64_t)x2 * x8))); + uint64_t x23 = ((((uint64_t)x6 * x6) + ((uint64_t)(0x4 * x4) * x8)) + ((uint64_t)(0x2 * x2) * x10)); + uint64_t x24 = (0x2 * ((((uint64_t)x6 * x8) + ((uint64_t)x4 * x10)) + ((uint64_t)x2 * x12))); + uint64_t x25 = (0x2 * (((((uint64_t)x8 * x8) + ((uint64_t)x6 * x10)) + ((uint64_t)x2 * x14)) + ((uint64_t)(0x2 * x4) * x12))); + uint64_t x26 = (0x2 * (((((uint64_t)x8 * x10) + ((uint64_t)x6 * x12)) + ((uint64_t)x4 * x14)) + ((uint64_t)x2 * x16))); + uint64_t x27 = (((uint64_t)x10 * x10) + (0x2 * ((((uint64_t)x6 * x14) + ((uint64_t)x2 * x18)) + (0x2 * (((uint64_t)x4 * x16) + ((uint64_t)x8 * x12)))))); + uint64_t x28 = (0x2 * ((((((uint64_t)x10 * x12) + ((uint64_t)x8 * x14)) + ((uint64_t)x6 * x16)) + ((uint64_t)x4 * x18)) + ((uint64_t)x2 * x17))); + uint64_t x29 = (0x2 * (((((uint64_t)x12 * x12) + ((uint64_t)x10 * x14)) + ((uint64_t)x6 * x18)) + (0x2 * (((uint64_t)x8 * x16) + ((uint64_t)x4 * x17))))); + uint64_t x30 = (0x2 * (((((uint64_t)x12 * x14) + ((uint64_t)x10 * x16)) + ((uint64_t)x8 * x18)) + ((uint64_t)x6 * x17))); + uint64_t x31 = (((uint64_t)x14 * x14) + (0x2 * (((uint64_t)x10 * x18) + (0x2 * (((uint64_t)x12 * x16) + ((uint64_t)x8 * x17)))))); + uint64_t x32 = (0x2 * ((((uint64_t)x14 * x16) + ((uint64_t)x12 * x18)) + ((uint64_t)x10 * x17))); + uint64_t x33 = (0x2 * ((((uint64_t)x16 * x16) + ((uint64_t)x14 * x18)) + ((uint64_t)(0x2 * x12) * x17))); + uint64_t x34 = (0x2 * (((uint64_t)x16 * x18) + ((uint64_t)x14 * x17))); + uint64_t x35 = (((uint64_t)x18 * x18) + ((uint64_t)(0x4 * x16) * x17)); + uint64_t x36 = ((uint64_t)(0x2 * x18) * x17); + uint64_t x37 = ((uint64_t)(0x2 * x17) * x17); + uint64_t x38 = (x27 + (x37 << 0x4)); + uint64_t x39 = (x38 + (x37 << 0x1)); + uint64_t x40 = (x39 + x37); + uint64_t x41 = (x26 + (x36 << 0x4)); + uint64_t x42 = (x41 + (x36 << 0x1)); + uint64_t x43 = (x42 + x36); + uint64_t x44 = (x25 + (x35 << 0x4)); + uint64_t x45 = (x44 + (x35 << 0x1)); + uint64_t x46 = (x45 + x35); + uint64_t x47 = (x24 + (x34 << 0x4)); + uint64_t x48 = (x47 + (x34 << 0x1)); + uint64_t x49 = (x48 + x34); + uint64_t x50 = (x23 + (x33 << 0x4)); + uint64_t x51 = (x50 + (x33 << 0x1)); + uint64_t x52 = (x51 + x33); + uint64_t x53 = (x22 + (x32 << 0x4)); + uint64_t x54 = (x53 + (x32 << 0x1)); + uint64_t x55 = (x54 + x32); + uint64_t x56 = (x21 + (x31 << 0x4)); + uint64_t x57 = (x56 + (x31 << 0x1)); + uint64_t x58 = (x57 + x31); + uint64_t x59 = (x20 + (x30 << 0x4)); + uint64_t x60 = (x59 + (x30 << 0x1)); + uint64_t x61 = (x60 + x30); + uint64_t x62 = (x19 + (x29 << 0x4)); + uint64_t x63 = (x62 + (x29 << 0x1)); + uint64_t x64 = (x63 + x29); + uint64_t x65 = (x64 >> 0x1a); + uint32_t x66 = ((uint32_t)x64 & 0x3ffffff); + uint64_t x67 = (x65 + x61); + uint64_t x68 = (x67 >> 0x19); + uint32_t x69 = ((uint32_t)x67 & 0x1ffffff); + uint64_t x70 = (x68 + x58); + uint64_t x71 = (x70 >> 0x1a); + uint32_t x72 = ((uint32_t)x70 & 0x3ffffff); + uint64_t x73 = (x71 + x55); + uint64_t x74 = (x73 >> 0x19); + uint32_t x75 = ((uint32_t)x73 & 0x1ffffff); + uint64_t x76 = (x74 + x52); + uint64_t x77 = (x76 >> 0x1a); + uint32_t x78 = ((uint32_t)x76 & 0x3ffffff); + uint64_t x79 = (x77 + x49); + uint64_t x80 = (x79 >> 0x19); + uint32_t x81 = ((uint32_t)x79 & 0x1ffffff); + uint64_t x82 = (x80 + x46); + uint64_t x83 = (x82 >> 0x1a); + uint32_t x84 = ((uint32_t)x82 & 0x3ffffff); + uint64_t x85 = (x83 + x43); + uint64_t x86 = (x85 >> 0x19); + uint32_t x87 = ((uint32_t)x85 & 0x1ffffff); + uint64_t x88 = (x86 + x40); + uint64_t x89 = (x88 >> 0x1a); + uint32_t x90 = ((uint32_t)x88 & 
0x3ffffff); + uint64_t x91 = (x89 + x28); + uint64_t x92 = (x91 >> 0x19); + uint32_t x93 = ((uint32_t)x91 & 0x1ffffff); + uint64_t x94 = (x66 + (0x13 * x92)); + uint32_t x95 = (uint32_t) (x94 >> 0x1a); + uint32_t x96 = ((uint32_t)x94 & 0x3ffffff); + uint32_t x97 = (x95 + x69); + uint32_t x98 = (x97 >> 0x19); + uint32_t x99 = (x97 & 0x1ffffff); + out[0] = x96; + out[1] = x99; + out[2] = (x98 + x72); + out[3] = x75; + out[4] = x78; + out[5] = x81; + out[6] = x84; + out[7] = x87; + out[8] = x90; + out[9] = x93; +} + +static inline void fe_sq_tl(fe *h, const fe_loose *f) +{ + fe_sqr_impl(h->v, f->v); +} + +static inline void fe_sq_tt(fe *h, const fe *f) +{ + fe_sqr_impl(h->v, f->v); +} + +static inline void fe_loose_invert(fe *out, const fe_loose *z) +{ + fe t0; + fe t1; + fe t2; + fe t3; + int i; + + fe_sq_tl(&t0, z); + fe_sq_tt(&t1, &t0); + for (i = 1; i < 2; ++i) + fe_sq_tt(&t1, &t1); + fe_mul_tlt(&t1, z, &t1); + fe_mul_ttt(&t0, &t0, &t1); + fe_sq_tt(&t2, &t0); + fe_mul_ttt(&t1, &t1, &t2); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 5; ++i) + fe_sq_tt(&t2, &t2); + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 10; ++i) + fe_sq_tt(&t2, &t2); + fe_mul_ttt(&t2, &t2, &t1); + fe_sq_tt(&t3, &t2); + for (i = 1; i < 20; ++i) + fe_sq_tt(&t3, &t3); + fe_mul_ttt(&t2, &t3, &t2); + fe_sq_tt(&t2, &t2); + for (i = 1; i < 10; ++i) + fe_sq_tt(&t2, &t2); + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 50; ++i) + fe_sq_tt(&t2, &t2); + fe_mul_ttt(&t2, &t2, &t1); + fe_sq_tt(&t3, &t2); + for (i = 1; i < 100; ++i) + fe_sq_tt(&t3, &t3); + fe_mul_ttt(&t2, &t3, &t2); + fe_sq_tt(&t2, &t2); + for (i = 1; i < 50; ++i) + fe_sq_tt(&t2, &t2); + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t1, &t1); + for (i = 1; i < 5; ++i) + fe_sq_tt(&t1, &t1); + fe_mul_ttt(out, &t1, &t0); +} + +static inline void fe_invert(fe *out, const fe *z) +{ + fe_loose l; + fe_copy_lt(&l, z); + fe_loose_invert(out, &l); +} + +/* Replace (f,g) with (g,f) if b == 1; + * replace (f,g) with (f,g) if b == 0. 
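+ * The body below expands b to an all-zeros or all-ones mask and swaps
+ * with masked XORs, so the selection is branch-free and constant-time.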
+ * + * Preconditions: b in {0,1} + */ +static inline void fe_cswap(fe *f, fe *g, unsigned int b) +{ + unsigned i; + b = 0 - b; + for (i = 0; i < 10; i++) { + uint32_t x = f->v[i] ^ g->v[i]; + x &= b; + f->v[i] ^= x; + g->v[i] ^= x; + } +} + +/* NOTE: based on fiat-crypto fe_mul, edited for in2=121666, 0, 0.*/ +static inline void fe_mul_121666_impl(uint32_t out[10], const uint32_t in1[10]) +{ + const uint32_t x20 = in1[9]; + const uint32_t x21 = in1[8]; + const uint32_t x19 = in1[7]; + const uint32_t x17 = in1[6]; + const uint32_t x15 = in1[5]; + const uint32_t x13 = in1[4]; + const uint32_t x11 = in1[3]; + const uint32_t x9 = in1[2]; + const uint32_t x7 = in1[1]; + const uint32_t x5 = in1[0]; + const uint32_t x38 = 0; + const uint32_t x39 = 0; + const uint32_t x37 = 0; + const uint32_t x35 = 0; + const uint32_t x33 = 0; + const uint32_t x31 = 0; + const uint32_t x29 = 0; + const uint32_t x27 = 0; + const uint32_t x25 = 0; + const uint32_t x23 = 121666; + uint64_t x40 = ((uint64_t)x23 * x5); + uint64_t x41 = (((uint64_t)x23 * x7) + ((uint64_t)x25 * x5)); + uint64_t x42 = ((((uint64_t)(0x2 * x25) * x7) + ((uint64_t)x23 * x9)) + ((uint64_t)x27 * x5)); + uint64_t x43 = (((((uint64_t)x25 * x9) + ((uint64_t)x27 * x7)) + ((uint64_t)x23 * x11)) + ((uint64_t)x29 * x5)); + uint64_t x44 = (((((uint64_t)x27 * x9) + (0x2 * (((uint64_t)x25 * x11) + ((uint64_t)x29 * x7)))) + ((uint64_t)x23 * x13)) + ((uint64_t)x31 * x5)); + uint64_t x45 = (((((((uint64_t)x27 * x11) + ((uint64_t)x29 * x9)) + ((uint64_t)x25 * x13)) + ((uint64_t)x31 * x7)) + ((uint64_t)x23 * x15)) + ((uint64_t)x33 * x5)); + uint64_t x46 = (((((0x2 * ((((uint64_t)x29 * x11) + ((uint64_t)x25 * x15)) + ((uint64_t)x33 * x7))) + ((uint64_t)x27 * x13)) + ((uint64_t)x31 * x9)) + ((uint64_t)x23 * x17)) + ((uint64_t)x35 * x5)); + uint64_t x47 = (((((((((uint64_t)x29 * x13) + ((uint64_t)x31 * x11)) + ((uint64_t)x27 * x15)) + ((uint64_t)x33 * x9)) + ((uint64_t)x25 * x17)) + ((uint64_t)x35 * x7)) + ((uint64_t)x23 * x19)) + ((uint64_t)x37 * x5)); + uint64_t x48 = (((((((uint64_t)x31 * x13) + (0x2 * (((((uint64_t)x29 * x15) + ((uint64_t)x33 * x11)) + ((uint64_t)x25 * x19)) + ((uint64_t)x37 * x7)))) + ((uint64_t)x27 * x17)) + ((uint64_t)x35 * x9)) + ((uint64_t)x23 * x21)) + ((uint64_t)x39 * x5)); + uint64_t x49 = (((((((((((uint64_t)x31 * x15) + ((uint64_t)x33 * x13)) + ((uint64_t)x29 * x17)) + ((uint64_t)x35 * x11)) + ((uint64_t)x27 * x19)) + ((uint64_t)x37 * x9)) + ((uint64_t)x25 * x21)) + ((uint64_t)x39 * x7)) + ((uint64_t)x23 * x20)) + ((uint64_t)x38 * x5)); + uint64_t x50 = (((((0x2 * ((((((uint64_t)x33 * x15) + ((uint64_t)x29 * x19)) + ((uint64_t)x37 * x11)) + ((uint64_t)x25 * x20)) + ((uint64_t)x38 * x7))) + ((uint64_t)x31 * x17)) + ((uint64_t)x35 * x13)) + ((uint64_t)x27 * x21)) + ((uint64_t)x39 * x9)); + uint64_t x51 = (((((((((uint64_t)x33 * x17) + ((uint64_t)x35 * x15)) + ((uint64_t)x31 * x19)) + ((uint64_t)x37 * x13)) + ((uint64_t)x29 * x21)) + ((uint64_t)x39 * x11)) + ((uint64_t)x27 * x20)) + ((uint64_t)x38 * x9)); + uint64_t x52 = (((((uint64_t)x35 * x17) + (0x2 * (((((uint64_t)x33 * x19) + ((uint64_t)x37 * x15)) + ((uint64_t)x29 * x20)) + ((uint64_t)x38 * x11)))) + ((uint64_t)x31 * x21)) + ((uint64_t)x39 * x13)); + uint64_t x53 = (((((((uint64_t)x35 * x19) + ((uint64_t)x37 * x17)) + ((uint64_t)x33 * x21)) + ((uint64_t)x39 * x15)) + ((uint64_t)x31 * x20)) + ((uint64_t)x38 * x13)); + uint64_t x54 = (((0x2 * ((((uint64_t)x37 * x19) + ((uint64_t)x33 * x20)) + ((uint64_t)x38 * x15))) + ((uint64_t)x35 * x21)) + ((uint64_t)x39 * x17)); + uint64_t 
x55 = (((((uint64_t)x37 * x21) + ((uint64_t)x39 * x19)) + ((uint64_t)x35 * x20)) + ((uint64_t)x38 * x17)); + uint64_t x56 = (((uint64_t)x39 * x21) + (0x2 * (((uint64_t)x37 * x20) + ((uint64_t)x38 * x19)))); + uint64_t x57 = (((uint64_t)x39 * x20) + ((uint64_t)x38 * x21)); + uint64_t x58 = ((uint64_t)(0x2 * x38) * x20); + uint64_t x59 = (x48 + (x58 << 0x4)); + uint64_t x60 = (x59 + (x58 << 0x1)); + uint64_t x61 = (x60 + x58); + uint64_t x62 = (x47 + (x57 << 0x4)); + uint64_t x63 = (x62 + (x57 << 0x1)); + uint64_t x64 = (x63 + x57); + uint64_t x65 = (x46 + (x56 << 0x4)); + uint64_t x66 = (x65 + (x56 << 0x1)); + uint64_t x67 = (x66 + x56); + uint64_t x68 = (x45 + (x55 << 0x4)); + uint64_t x69 = (x68 + (x55 << 0x1)); + uint64_t x70 = (x69 + x55); + uint64_t x71 = (x44 + (x54 << 0x4)); + uint64_t x72 = (x71 + (x54 << 0x1)); + uint64_t x73 = (x72 + x54); + uint64_t x74 = (x43 + (x53 << 0x4)); + uint64_t x75 = (x74 + (x53 << 0x1)); + uint64_t x76 = (x75 + x53); + uint64_t x77 = (x42 + (x52 << 0x4)); + uint64_t x78 = (x77 + (x52 << 0x1)); + uint64_t x79 = (x78 + x52); + uint64_t x80 = (x41 + (x51 << 0x4)); + uint64_t x81 = (x80 + (x51 << 0x1)); + uint64_t x82 = (x81 + x51); + uint64_t x83 = (x40 + (x50 << 0x4)); + uint64_t x84 = (x83 + (x50 << 0x1)); + uint64_t x85 = (x84 + x50); + uint64_t x86 = (x85 >> 0x1a); + uint32_t x87 = ((uint32_t)x85 & 0x3ffffff); + uint64_t x88 = (x86 + x82); + uint64_t x89 = (x88 >> 0x19); + uint32_t x90 = ((uint32_t)x88 & 0x1ffffff); + uint64_t x91 = (x89 + x79); + uint64_t x92 = (x91 >> 0x1a); + uint32_t x93 = ((uint32_t)x91 & 0x3ffffff); + uint64_t x94 = (x92 + x76); + uint64_t x95 = (x94 >> 0x19); + uint32_t x96 = ((uint32_t)x94 & 0x1ffffff); + uint64_t x97 = (x95 + x73); + uint64_t x98 = (x97 >> 0x1a); + uint32_t x99 = ((uint32_t)x97 & 0x3ffffff); + uint64_t x100 = (x98 + x70); + uint64_t x101 = (x100 >> 0x19); + uint32_t x102 = ((uint32_t)x100 & 0x1ffffff); + uint64_t x103 = (x101 + x67); + uint64_t x104 = (x103 >> 0x1a); + uint32_t x105 = ((uint32_t)x103 & 0x3ffffff); + uint64_t x106 = (x104 + x64); + uint64_t x107 = (x106 >> 0x19); + uint32_t x108 = ((uint32_t)x106 & 0x1ffffff); + uint64_t x109 = (x107 + x61); + uint64_t x110 = (x109 >> 0x1a); + uint32_t x111 = ((uint32_t)x109 & 0x3ffffff); + uint64_t x112 = (x110 + x49); + uint64_t x113 = (x112 >> 0x19); + uint32_t x114 = ((uint32_t)x112 & 0x1ffffff); + uint64_t x115 = (x87 + (0x13 * x113)); + uint32_t x116 = (uint32_t) (x115 >> 0x1a); + uint32_t x117 = ((uint32_t)x115 & 0x3ffffff); + uint32_t x118 = (x116 + x90); + uint32_t x119 = (x118 >> 0x19); + uint32_t x120 = (x118 & 0x1ffffff); + out[0] = x117; + out[1] = x120; + out[2] = (x119 + x93); + out[3] = x96; + out[4] = x99; + out[5] = x102; + out[6] = x105; + out[7] = x108; + out[8] = x111; + out[9] = x114; +} + +static inline void fe_mul121666(fe *h, const fe_loose *f) +{ + fe_mul_121666_impl(h->v, f->v); +} + +static const uint8_t curve25519_null_point[CURVE25519_KEY_SIZE]; + +bool curve25519(uint8_t out[CURVE25519_KEY_SIZE], + const uint8_t scalar[CURVE25519_KEY_SIZE], + const uint8_t point[CURVE25519_KEY_SIZE]) +{ + fe x1, x2, z2, x3, z3; + fe_loose x2l, z2l, x3l; + unsigned swap = 0; + int pos; + uint8_t e[32]; + + memcpy(e, scalar, 32); + curve25519_clamp_secret(e); + + /* The following implementation was transcribed to Coq and proven to + * correspond to unary scalar multiplication in affine coordinates given + * that x1 != 0 is the x coordinate of some point on the curve. 
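+ * ("Ladderstep" here is the x-only Montgomery ladder step performed once
+ * per scalar bit in the loop below.)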
It was + * also checked in Coq that doing a ladderstep with x1 = x3 = 0 gives + * z2' = z3' = 0, and z2 = z3 = 0 gives z2' = z3' = 0. The statement was + * quantified over the underlying field, so it applies to Curve25519 + * itself and the quadratic twist of Curve25519. It was not proven in + * Coq that prime-field arithmetic correctly simulates extension-field + * arithmetic on prime-field values. The decoding of the byte array + * representation of e was not considered. + * + * Specification of Montgomery curves in affine coordinates: + * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27> + * + * Proof that these form a group that is isomorphic to a Weierstrass + * curve: + * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35> + * + * Coq transcription and correctness proof of the loop + * (where scalarbits=255): + * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118> + * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278> + * preconditions: 0 <= e < 2^255 (not necessarily e < order), + * fe_invert(0) = 0 + */ + fe_frombytes(&x1, point); + fe_1(&x2); + fe_0(&z2); + fe_copy(&x3, &x1); + fe_1(&z3); + + for (pos = 254; pos >= 0; --pos) { + fe tmp0, tmp1; + fe_loose tmp0l, tmp1l; + /* loop invariant as of right before the test, for the case + * where x1 != 0: + * pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 + * is nonzero + * let r := e >> (pos+1) in the following equalities of + * projective points: + * to_xz (r*P) === if swap then (x3, z3) else (x2, z2) + * to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) + * x1 is the nonzero x coordinate of the nonzero + * point (r*P-(r+1)*P) + */ + unsigned b = 1 & (e[pos / 8] >> (pos & 7)); + swap ^= b; + fe_cswap(&x2, &x3, swap); + fe_cswap(&z2, &z3, swap); + swap = b; + /* Coq transcription of ladderstep formula (called from + * transcribed loop): + * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89> + * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131> + * x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217> + * x1 = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147> + */ + fe_sub(&tmp0l, &x3, &z3); + fe_sub(&tmp1l, &x2, &z2); + fe_add(&x2l, &x2, &z2); + fe_add(&z2l, &x3, &z3); + fe_mul_tll(&z3, &tmp0l, &x2l); + fe_mul_tll(&z2, &z2l, &tmp1l); + fe_sq_tl(&tmp0, &tmp1l); + fe_sq_tl(&tmp1, &x2l); + fe_add(&x3l, &z3, &z2); + fe_sub(&z2l, &z3, &z2); + fe_mul_ttt(&x2, &tmp1, &tmp0); + fe_sub(&tmp1l, &tmp1, &tmp0); + fe_sq_tl(&z2, &z2l); + fe_mul121666(&z3, &tmp1l); + fe_sq_tl(&x3, &x3l); + fe_add(&tmp0l, &tmp0, &z3); + fe_mul_ttt(&z3, &x1, &z2); + fe_mul_tll(&z2, &tmp1l, &tmp0l); + } + /* here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) + * else (x2, z2) + */ + fe_cswap(&x2, &x3, swap); + fe_cswap(&z2, &z3, swap); + + fe_invert(&z2, &z2); + fe_mul_ttt(&x2, &x2, &z2); + fe_tobytes(out, &x2); + + explicit_bzero(&x1, sizeof(x1)); + explicit_bzero(&x2, sizeof(x2)); + explicit_bzero(&z2, sizeof(z2)); + explicit_bzero(&x3, sizeof(x3)); + 
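+	/*
+	 * Scrub the remaining ladder state and the clamped scalar from
+	 * the stack.  A caller would typically use the boolean result to
+	 * reject the all-zeros output point (a minimal sketch, not code
+	 * from this file):
+	 *
+	 *	uint8_t shared[CURVE25519_KEY_SIZE];
+	 *	if (!curve25519(shared, my_secret, their_public))
+	 *		return (EINVAL);
+	 */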
explicit_bzero(&z3, sizeof(z3)); + explicit_bzero(&x2l, sizeof(x2l)); + explicit_bzero(&z2l, sizeof(z2l)); + explicit_bzero(&x3l, sizeof(x3l)); + explicit_bzero(&e, sizeof(e)); + + return timingsafe_bcmp(out, curve25519_null_point, CURVE25519_KEY_SIZE) != 0; +} diff --git a/sys/dev/if_wg/crypto.h b/sys/dev/if_wg/crypto.h new file mode 100644 index 000000000000..6e045c2fe0bf --- /dev/null +++ b/sys/dev/if_wg/crypto.h @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2015-2021 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef _WG_CRYPTO +#define _WG_CRYPTO + +#include <sys/types.h> + +enum chacha20poly1305_lengths { + XCHACHA20POLY1305_NONCE_SIZE = 24, + CHACHA20POLY1305_KEY_SIZE = 32, + CHACHA20POLY1305_AUTHTAG_SIZE = 16 +}; + +void +chacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src, const size_t src_len, + const uint8_t *ad, const size_t ad_len, + const uint64_t nonce, + const uint8_t key[CHACHA20POLY1305_KEY_SIZE]); + +bool +chacha20poly1305_decrypt(uint8_t *dst, const uint8_t *src, const size_t src_len, + const uint8_t *ad, const size_t ad_len, + const uint64_t nonce, + const uint8_t key[CHACHA20POLY1305_KEY_SIZE]); + +void +xchacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src, + const size_t src_len, const uint8_t *ad, + const size_t ad_len, + const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE], + const uint8_t key[CHACHA20POLY1305_KEY_SIZE]); + +bool +xchacha20poly1305_decrypt(uint8_t *dst, const uint8_t *src, + const size_t src_len, const uint8_t *ad, + const size_t ad_len, + const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE], + const uint8_t key[CHACHA20POLY1305_KEY_SIZE]); + + +enum blake2s_lengths { + BLAKE2S_BLOCK_SIZE = 64, + BLAKE2S_HASH_SIZE = 32, + BLAKE2S_KEY_SIZE = 32 +}; + +struct blake2s_state { + uint32_t h[8]; + uint32_t t[2]; + uint32_t f[2]; + uint8_t buf[BLAKE2S_BLOCK_SIZE]; + unsigned int buflen; + unsigned int outlen; +}; + +void blake2s_init(struct blake2s_state *state, const size_t outlen); + +void blake2s_init_key(struct blake2s_state *state, const size_t outlen, + const uint8_t *key, const size_t keylen); + +void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen); + +void blake2s_final(struct blake2s_state *state, uint8_t *out); + +void blake2s(uint8_t *out, const uint8_t *in, const uint8_t *key, + const size_t outlen, const size_t inlen, const size_t keylen); + +void blake2s_hmac(uint8_t *out, const uint8_t *in, const uint8_t *key, + const size_t outlen, const size_t inlen, const size_t keylen); + +enum curve25519_lengths { + CURVE25519_KEY_SIZE = 32 +}; + +bool curve25519(uint8_t mypublic[static CURVE25519_KEY_SIZE], + const uint8_t secret[static CURVE25519_KEY_SIZE], + const uint8_t basepoint[static CURVE25519_KEY_SIZE]); + +static inline bool +curve25519_generate_public(uint8_t 
pub[static CURVE25519_KEY_SIZE], + const uint8_t secret[static CURVE25519_KEY_SIZE]) +{ + static const uint8_t basepoint[CURVE25519_KEY_SIZE] = { 9 }; + + return curve25519(pub, secret, basepoint); +} + +static inline void curve25519_clamp_secret(uint8_t secret[static CURVE25519_KEY_SIZE]) +{ + secret[0] &= 248; + secret[31] = (secret[31] & 127) | 64; +} + +static inline void curve25519_generate_secret(uint8_t secret[CURVE25519_KEY_SIZE]) +{ + arc4random_buf(secret, CURVE25519_KEY_SIZE); + curve25519_clamp_secret(secret); +} + +#endif diff --git a/sys/dev/if_wg/if_wg.c b/sys/dev/if_wg/if_wg.c new file mode 100644 index 000000000000..ba2eb3221fac --- /dev/null +++ b/sys/dev/if_wg/if_wg.c @@ -0,0 +1,3454 @@ +/* + * Copyright (C) 2015-2021 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * Copyright (C) 2019-2021 Matt Dunwoodie <ncon@noconroy.net> + * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate) + * Copyright (c) 2021 Kyle Evans <kevans@FreeBSD.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* TODO audit imports */ +#include "opt_inet.h" +#include "opt_inet6.h" + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <vm/uma.h> + +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/kernel.h> + +#include <sys/sockio.h> +#include <sys/socketvar.h> +#include <sys/errno.h> +#include <sys/jail.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/rwlock.h> +#include <sys/rmlock.h> +#include <sys/protosw.h> +#include <sys/module.h> +#include <sys/endian.h> +#include <sys/kdb.h> +#include <sys/sx.h> +#include <sys/sysctl.h> +#include <sys/gtaskqueue.h> +#include <sys/smp.h> +#include <sys/nv.h> + +#include <net/bpf.h> + +#include <sys/syslog.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_clone.h> +#include <net/if_types.h> +#include <net/ethernet.h> +#include <net/radix.h> + +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> +#include <netinet6/scope6_var.h> +#include <netinet/udp.h> +#include <netinet/ip_icmp.h> +#include <netinet/icmp6.h> +#include <netinet/in_pcb.h> +#include <netinet6/in6_pcb.h> +#include <netinet/udp_var.h> + +#include <machine/in_cksum.h> + +#include "support.h" +#include "wg_noise.h" +#include "wg_cookie.h" +#include "if_wg.h" + +/* It'd be nice to use IF_MAXMTU, but that means more complicated mbuf allocations, + * so instead just do the biggest mbuf we can easily allocate minus the usual maximum + * IPv6 overhead of 80 bytes. If somebody wants bigger frames, we can revisit this. 
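+ * With MJUM16BYTES = 16384, MAX_MTU works out to 16304 bytes.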
*/ +#define MAX_MTU (MJUM16BYTES - 80) + +#define DEFAULT_MTU 1420 + +#define MAX_STAGED_PKT 128 +#define MAX_QUEUED_PKT 1024 +#define MAX_QUEUED_PKT_MASK (MAX_QUEUED_PKT - 1) + +#define MAX_QUEUED_HANDSHAKES 4096 + +#define HASHTABLE_PEER_SIZE (1 << 11) +#define HASHTABLE_INDEX_SIZE (1 << 13) +#define MAX_PEERS_PER_IFACE (1 << 20) + +#define REKEY_TIMEOUT 5 +#define REKEY_TIMEOUT_JITTER 334 /* 1/3 sec, round for arc4random_uniform */ +#define KEEPALIVE_TIMEOUT 10 +#define MAX_TIMER_HANDSHAKES (90 / REKEY_TIMEOUT) +#define NEW_HANDSHAKE_TIMEOUT (REKEY_TIMEOUT + KEEPALIVE_TIMEOUT) +#define UNDERLOAD_TIMEOUT 1 + +#define DPRINTF(sc, ...) if (wireguard_debug) if_printf(sc->sc_ifp, ##__VA_ARGS__) + +/* First byte indicating packet type on the wire */ +#define WG_PKT_INITIATION htole32(1) +#define WG_PKT_RESPONSE htole32(2) +#define WG_PKT_COOKIE htole32(3) +#define WG_PKT_DATA htole32(4) + +#define WG_PKT_WITH_PADDING(n) (((n) + (16-1)) & (~(16-1))) +#define WG_KEY_SIZE 32 + +struct wg_pkt_initiation { + uint32_t t; + uint32_t s_idx; + uint8_t ue[NOISE_PUBLIC_KEY_LEN]; + uint8_t es[NOISE_PUBLIC_KEY_LEN + NOISE_AUTHTAG_LEN]; + uint8_t ets[NOISE_TIMESTAMP_LEN + NOISE_AUTHTAG_LEN]; + struct cookie_macs m; +}; + +struct wg_pkt_response { + uint32_t t; + uint32_t s_idx; + uint32_t r_idx; + uint8_t ue[NOISE_PUBLIC_KEY_LEN]; + uint8_t en[0 + NOISE_AUTHTAG_LEN]; + struct cookie_macs m; +}; + +struct wg_pkt_cookie { + uint32_t t; + uint32_t r_idx; + uint8_t nonce[COOKIE_NONCE_SIZE]; + uint8_t ec[COOKIE_ENCRYPTED_SIZE]; +}; + +struct wg_pkt_data { + uint32_t t; + uint32_t r_idx; + uint8_t nonce[sizeof(uint64_t)]; + uint8_t buf[]; +}; + +struct wg_endpoint { + union { + struct sockaddr r_sa; + struct sockaddr_in r_sin; +#ifdef INET6 + struct sockaddr_in6 r_sin6; +#endif + } e_remote; + union { + struct in_addr l_in; +#ifdef INET6 + struct in6_pktinfo l_pktinfo6; +#define l_in6 l_pktinfo6.ipi6_addr +#endif + } e_local; +}; + +struct wg_tag { + struct m_tag t_tag; + struct wg_endpoint t_endpoint; + struct wg_peer *t_peer; + struct mbuf *t_mbuf; + int t_done; + int t_mtu; +}; + +struct wg_index { + LIST_ENTRY(wg_index) i_entry; + SLIST_ENTRY(wg_index) i_unused_entry; + uint32_t i_key; + struct noise_remote *i_value; +}; + +struct wg_timers { + /* t_lock is for blocking wg_timers_event_* when setting t_disabled. 
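+ * Events read t_disabled under the lock; wg_timers_disable() sets it
+ * write-locked before stopping the callouts.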
*/ + struct rwlock t_lock; + + int t_disabled; + int t_need_another_keepalive; + uint16_t t_persistent_keepalive_interval; + struct callout t_new_handshake; + struct callout t_send_keepalive; + struct callout t_retry_handshake; + struct callout t_zero_key_material; + struct callout t_persistent_keepalive; + + struct mtx t_handshake_mtx; + struct timespec t_handshake_last_sent; + struct timespec t_handshake_complete; + volatile int t_handshake_retries; +}; + +struct wg_aip { + struct radix_node r_nodes[2]; + CK_LIST_ENTRY(wg_aip) r_entry; + struct sockaddr_storage r_addr; + struct sockaddr_storage r_mask; + struct wg_peer *r_peer; +}; + +struct wg_queue { + struct mtx q_mtx; + struct mbufq q; +}; + +struct wg_peer { + CK_LIST_ENTRY(wg_peer) p_hash_entry; + CK_LIST_ENTRY(wg_peer) p_entry; + uint64_t p_id; + struct wg_softc *p_sc; + + struct noise_remote p_remote; + struct cookie_maker p_cookie; + struct wg_timers p_timers; + + struct rwlock p_endpoint_lock; + struct wg_endpoint p_endpoint; + + SLIST_HEAD(,wg_index) p_unused_index; + struct wg_index p_index[3]; + + struct wg_queue p_stage_queue; + struct wg_queue p_encap_queue; + struct wg_queue p_decap_queue; + + struct grouptask p_clear_secrets; + struct grouptask p_send_initiation; + struct grouptask p_send_keepalive; + struct grouptask p_send; + struct grouptask p_recv; + + counter_u64_t p_tx_bytes; + counter_u64_t p_rx_bytes; + + CK_LIST_HEAD(, wg_aip) p_aips; + struct mtx p_lock; + struct epoch_context p_ctx; +}; + +enum route_direction { + /* TODO OpenBSD doesn't use IN/OUT, instead passes the address buffer + * directly to route_lookup. */ + IN, + OUT, +}; + +struct wg_aip_table { + size_t t_count; + struct radix_node_head *t_ip; + struct radix_node_head *t_ip6; +}; + +struct wg_allowedip { + uint16_t family; + union { + struct in_addr ip4; + struct in6_addr ip6; + }; + uint8_t cidr; +}; + +struct wg_hashtable { + struct mtx h_mtx; + SIPHASH_KEY h_secret; + CK_LIST_HEAD(, wg_peer) h_peers_list; + CK_LIST_HEAD(, wg_peer) *h_peers; + u_long h_peers_mask; + size_t h_num_peers; +}; + +struct wg_socket { + struct mtx so_mtx; + struct socket *so_so4; + struct socket *so_so6; + uint32_t so_user_cookie; + in_port_t so_port; +}; + +struct wg_softc { + LIST_ENTRY(wg_softc) sc_entry; + struct ifnet *sc_ifp; + int sc_flags; + + struct ucred *sc_ucred; + struct wg_socket sc_socket; + struct wg_hashtable sc_hashtable; + struct wg_aip_table sc_aips; + + struct mbufq sc_handshake_queue; + struct grouptask sc_handshake; + + struct noise_local sc_local; + struct cookie_checker sc_cookie; + + struct buf_ring *sc_encap_ring; + struct buf_ring *sc_decap_ring; + + struct grouptask *sc_encrypt; + struct grouptask *sc_decrypt; + + struct rwlock sc_index_lock; + LIST_HEAD(,wg_index) *sc_index; + u_long sc_index_mask; + + struct sx sc_lock; + volatile u_int sc_peer_count; +}; + +#define WGF_DYING 0x0001 + +/* TODO the following defines are freebsd specific, we should see what is + * necessary and cleanup from there (i suspect a lot can be junked). 
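+ * (ENOKEY, for instance, has no FreeBSD definition and is remapped to
+ * ENOTCAPABLE just below.)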
*/ + +#ifndef ENOKEY +#define ENOKEY ENOTCAPABLE +#endif + +#if __FreeBSD_version > 1300000 +typedef void timeout_t (void *); +#endif + +#define GROUPTASK_DRAIN(gtask) \ + gtaskqueue_drain((gtask)->gt_taskqueue, &(gtask)->gt_task) + +#define MTAG_WIREGUARD 0xBEAD +#define M_ENQUEUED M_PROTO1 + +static int clone_count; +static uma_zone_t ratelimit_zone; +static int wireguard_debug; +static volatile unsigned long peer_counter = 0; +static const char wgname[] = "wg"; +static unsigned wg_osd_jail_slot; + +static struct sx wg_sx; +SX_SYSINIT(wg_sx, &wg_sx, "wg_sx"); + +static LIST_HEAD(, wg_softc) wg_list = LIST_HEAD_INITIALIZER(wg_list); + +SYSCTL_NODE(_net, OID_AUTO, wg, CTLFLAG_RW, 0, "WireGuard"); +SYSCTL_INT(_net_wg, OID_AUTO, debug, CTLFLAG_RWTUN, &wireguard_debug, 0, + "enable debug logging"); + +TASKQGROUP_DECLARE(if_io_tqg); + +MALLOC_DEFINE(M_WG, "WG", "wireguard"); +VNET_DEFINE_STATIC(struct if_clone *, wg_cloner); + + +#define V_wg_cloner VNET(wg_cloner) +#define WG_CAPS IFCAP_LINKSTATE +#define ph_family PH_loc.eight[5] + +struct wg_timespec64 { + uint64_t tv_sec; + uint64_t tv_nsec; +}; + +struct wg_peer_export { + struct sockaddr_storage endpoint; + struct timespec last_handshake; + uint8_t public_key[WG_KEY_SIZE]; + uint8_t preshared_key[NOISE_SYMMETRIC_KEY_LEN]; + size_t endpoint_sz; + struct wg_allowedip *aip; + uint64_t rx_bytes; + uint64_t tx_bytes; + int aip_count; + uint16_t persistent_keepalive; +}; + +static struct wg_tag *wg_tag_get(struct mbuf *); +static struct wg_endpoint *wg_mbuf_endpoint_get(struct mbuf *); +static int wg_socket_init(struct wg_softc *, in_port_t); +static int wg_socket_bind(struct socket *, struct socket *, in_port_t *); +static void wg_socket_set(struct wg_softc *, struct socket *, struct socket *); +static void wg_socket_uninit(struct wg_softc *); +static void wg_socket_set_cookie(struct wg_softc *, uint32_t); +static int wg_send(struct wg_softc *, struct wg_endpoint *, struct mbuf *); +static void wg_timers_event_data_sent(struct wg_timers *); +static void wg_timers_event_data_received(struct wg_timers *); +static void wg_timers_event_any_authenticated_packet_sent(struct wg_timers *); +static void wg_timers_event_any_authenticated_packet_received(struct wg_timers *); +static void wg_timers_event_any_authenticated_packet_traversal(struct wg_timers *); +static void wg_timers_event_handshake_initiated(struct wg_timers *); +static void wg_timers_event_handshake_responded(struct wg_timers *); +static void wg_timers_event_handshake_complete(struct wg_timers *); +static void wg_timers_event_session_derived(struct wg_timers *); +static void wg_timers_event_want_initiation(struct wg_timers *); +static void wg_timers_event_reset_handshake_last_sent(struct wg_timers *); +static void wg_timers_run_send_initiation(struct wg_timers *, int); +static void wg_timers_run_retry_handshake(struct wg_timers *); +static void wg_timers_run_send_keepalive(struct wg_timers *); +static void wg_timers_run_new_handshake(struct wg_timers *); +static void wg_timers_run_zero_key_material(struct wg_timers *); +static void wg_timers_run_persistent_keepalive(struct wg_timers *); +static void wg_timers_init(struct wg_timers *); +static void wg_timers_enable(struct wg_timers *); +static void wg_timers_disable(struct wg_timers *); +static void wg_timers_set_persistent_keepalive(struct wg_timers *, uint16_t); +static void wg_timers_get_last_handshake(struct wg_timers *, struct timespec *); +static int wg_timers_expired_handshake_last_sent(struct wg_timers *); +static int 
wg_timers_check_handshake_last_sent(struct wg_timers *); +static void wg_queue_init(struct wg_queue *, const char *); +static void wg_queue_deinit(struct wg_queue *); +static void wg_queue_purge(struct wg_queue *); +static struct mbuf *wg_queue_dequeue(struct wg_queue *, struct wg_tag **); +static int wg_queue_len(struct wg_queue *); +static int wg_queue_in(struct wg_peer *, struct mbuf *); +static void wg_queue_out(struct wg_peer *); +static void wg_queue_stage(struct wg_peer *, struct mbuf *); +static int wg_aip_init(struct wg_aip_table *); +static void wg_aip_destroy(struct wg_aip_table *); +static void wg_aip_populate_aip4(struct wg_aip *, const struct in_addr *, uint8_t); +static void wg_aip_populate_aip6(struct wg_aip *, const struct in6_addr *, uint8_t); +static int wg_aip_add(struct wg_aip_table *, struct wg_peer *, const struct wg_allowedip *); +static int wg_peer_remove(struct radix_node *, void *); +static void wg_peer_remove_all(struct wg_softc *); +static int wg_aip_delete(struct wg_aip_table *, struct wg_peer *); +static struct wg_peer *wg_aip_lookup(struct wg_aip_table *, struct mbuf *, enum route_direction); +static void wg_hashtable_init(struct wg_hashtable *); +static void wg_hashtable_destroy(struct wg_hashtable *); +static void wg_hashtable_peer_insert(struct wg_hashtable *, struct wg_peer *); +static struct wg_peer *wg_peer_lookup(struct wg_softc *, const uint8_t [32]); +static void wg_hashtable_peer_remove(struct wg_hashtable *, struct wg_peer *); +static int wg_cookie_validate_packet(struct cookie_checker *, struct mbuf *, int); +static struct wg_peer *wg_peer_alloc(struct wg_softc *); +static void wg_peer_free_deferred(epoch_context_t); +static void wg_peer_destroy(struct wg_peer *); +static void wg_peer_send_buf(struct wg_peer *, uint8_t *, size_t); +static void wg_send_initiation(struct wg_peer *); +static void wg_send_response(struct wg_peer *); +static void wg_send_cookie(struct wg_softc *, struct cookie_macs *, uint32_t, struct mbuf *); +static void wg_peer_set_endpoint_from_tag(struct wg_peer *, struct wg_tag *); +static void wg_peer_clear_src(struct wg_peer *); +static void wg_peer_get_endpoint(struct wg_peer *, struct wg_endpoint *); +static void wg_deliver_out(struct wg_peer *); +static void wg_deliver_in(struct wg_peer *); +static void wg_send_buf(struct wg_softc *, struct wg_endpoint *, uint8_t *, size_t); +static void wg_send_keepalive(struct wg_peer *); +static void wg_handshake(struct wg_softc *, struct mbuf *); +static void wg_encap(struct wg_softc *, struct mbuf *); +static void wg_decap(struct wg_softc *, struct mbuf *); +static void wg_softc_handshake_receive(struct wg_softc *); +static void wg_softc_decrypt(struct wg_softc *); +static void wg_softc_encrypt(struct wg_softc *); +static struct noise_remote *wg_remote_get(struct wg_softc *, uint8_t [NOISE_PUBLIC_KEY_LEN]); +static uint32_t wg_index_set(struct wg_softc *, struct noise_remote *); +static struct noise_remote *wg_index_get(struct wg_softc *, uint32_t); +static void wg_index_drop(struct wg_softc *, uint32_t); +static int wg_update_endpoint_addrs(struct wg_endpoint *, const struct sockaddr *, struct ifnet *); +static void wg_input(struct mbuf *, int, struct inpcb *, const struct sockaddr *, void *); +static void wg_encrypt_dispatch(struct wg_softc *); +static void wg_decrypt_dispatch(struct wg_softc *); +static void crypto_taskq_setup(struct wg_softc *); +static void crypto_taskq_destroy(struct wg_softc *); +static int wg_clone_create(struct if_clone *, int, caddr_t); +static void 
wg_qflush(struct ifnet *); +static int wg_transmit(struct ifnet *, struct mbuf *); +static int wg_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); +static void wg_clone_destroy(struct ifnet *); +static int wg_peer_to_export(struct wg_peer *, struct wg_peer_export *); +static bool wgc_privileged(struct wg_softc *); +static int wgc_get(struct wg_softc *, struct wg_data_io *); +static int wgc_set(struct wg_softc *, struct wg_data_io *); +static int wg_up(struct wg_softc *); +static void wg_down(struct wg_softc *); +static void wg_reassign(struct ifnet *, struct vnet *, char *unused); +static void wg_init(void *); +static int wg_ioctl(struct ifnet *, u_long, caddr_t); +static void vnet_wg_init(const void *); +static void vnet_wg_uninit(const void *); +static void wg_module_init(void); +static void wg_module_deinit(void); + +/* TODO Peer */ +static struct wg_peer * +wg_peer_alloc(struct wg_softc *sc) +{ + struct wg_peer *peer; + + sx_assert(&sc->sc_lock, SX_XLOCKED); + + peer = malloc(sizeof(*peer), M_WG, M_WAITOK|M_ZERO); + peer->p_sc = sc; + peer->p_id = peer_counter++; + CK_LIST_INIT(&peer->p_aips); + + rw_init(&peer->p_endpoint_lock, "wg_peer_endpoint"); + wg_queue_init(&peer->p_stage_queue, "stageq"); + wg_queue_init(&peer->p_encap_queue, "txq"); + wg_queue_init(&peer->p_decap_queue, "rxq"); + + GROUPTASK_INIT(&peer->p_send_initiation, 0, (gtask_fn_t *)wg_send_initiation, peer); + taskqgroup_attach(qgroup_if_io_tqg, &peer->p_send_initiation, peer, NULL, NULL, "wg initiation"); + GROUPTASK_INIT(&peer->p_send_keepalive, 0, (gtask_fn_t *)wg_send_keepalive, peer); + taskqgroup_attach(qgroup_if_io_tqg, &peer->p_send_keepalive, peer, NULL, NULL, "wg keepalive"); + GROUPTASK_INIT(&peer->p_clear_secrets, 0, (gtask_fn_t *)noise_remote_clear, &peer->p_remote); + taskqgroup_attach(qgroup_if_io_tqg, &peer->p_clear_secrets, + &peer->p_remote, NULL, NULL, "wg clear secrets"); + + GROUPTASK_INIT(&peer->p_send, 0, (gtask_fn_t *)wg_deliver_out, peer); + taskqgroup_attach(qgroup_if_io_tqg, &peer->p_send, peer, NULL, NULL, "wg send"); + GROUPTASK_INIT(&peer->p_recv, 0, (gtask_fn_t *)wg_deliver_in, peer); + taskqgroup_attach(qgroup_if_io_tqg, &peer->p_recv, peer, NULL, NULL, "wg recv"); + + wg_timers_init(&peer->p_timers); + + peer->p_tx_bytes = counter_u64_alloc(M_WAITOK); + peer->p_rx_bytes = counter_u64_alloc(M_WAITOK); + + SLIST_INIT(&peer->p_unused_index); + SLIST_INSERT_HEAD(&peer->p_unused_index, &peer->p_index[0], + i_unused_entry); + SLIST_INSERT_HEAD(&peer->p_unused_index, &peer->p_index[1], + i_unused_entry); + SLIST_INSERT_HEAD(&peer->p_unused_index, &peer->p_index[2], + i_unused_entry); + + return (peer); +} + +#define WG_HASHTABLE_PEER_FOREACH(peer, i, ht) \ + for (i = 0; i < HASHTABLE_PEER_SIZE; i++) \ + LIST_FOREACH(peer, &(ht)->h_peers[i], p_hash_entry) +#define WG_HASHTABLE_PEER_FOREACH_SAFE(peer, i, ht, tpeer) \ + for (i = 0; i < HASHTABLE_PEER_SIZE; i++) \ + CK_LIST_FOREACH_SAFE(peer, &(ht)->h_peers[i], p_hash_entry, tpeer) +static void +wg_hashtable_init(struct wg_hashtable *ht) +{ + mtx_init(&ht->h_mtx, "hash lock", NULL, MTX_DEF); + arc4random_buf(&ht->h_secret, sizeof(ht->h_secret)); + ht->h_num_peers = 0; + ht->h_peers = hashinit(HASHTABLE_PEER_SIZE, M_DEVBUF, + &ht->h_peers_mask); +} + +static void +wg_hashtable_destroy(struct wg_hashtable *ht) +{ + MPASS(ht->h_num_peers == 0); + mtx_destroy(&ht->h_mtx); + hashdestroy(ht->h_peers, M_DEVBUF, ht->h_peers_mask); +} + +static void +wg_hashtable_peer_insert(struct wg_hashtable *ht, struct wg_peer *peer) +{ + 
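+	/* Bucket selection is SipHash of the peer's public key under h_secret. */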
uint64_t key; + + key = siphash24(&ht->h_secret, peer->p_remote.r_public, + sizeof(peer->p_remote.r_public)); + + mtx_lock(&ht->h_mtx); + ht->h_num_peers++; + CK_LIST_INSERT_HEAD(&ht->h_peers[key & ht->h_peers_mask], peer, p_hash_entry); + CK_LIST_INSERT_HEAD(&ht->h_peers_list, peer, p_entry); + mtx_unlock(&ht->h_mtx); +} + +static struct wg_peer * +wg_peer_lookup(struct wg_softc *sc, + const uint8_t pubkey[WG_KEY_SIZE]) +{ + struct wg_hashtable *ht = &sc->sc_hashtable; + uint64_t key; + struct wg_peer *i = NULL; + + key = siphash24(&ht->h_secret, pubkey, WG_KEY_SIZE); + + mtx_lock(&ht->h_mtx); + CK_LIST_FOREACH(i, &ht->h_peers[key & ht->h_peers_mask], p_hash_entry) { + if (timingsafe_bcmp(i->p_remote.r_public, pubkey, + WG_KEY_SIZE) == 0) + break; + } + mtx_unlock(&ht->h_mtx); + + return i; +} + +static void +wg_hashtable_peer_remove(struct wg_hashtable *ht, struct wg_peer *peer) +{ + mtx_lock(&ht->h_mtx); + ht->h_num_peers--; + CK_LIST_REMOVE(peer, p_hash_entry); + CK_LIST_REMOVE(peer, p_entry); + mtx_unlock(&ht->h_mtx); +} + +static void +wg_peer_free_deferred(epoch_context_t ctx) +{ + struct wg_peer *peer = __containerof(ctx, struct wg_peer, p_ctx); + counter_u64_free(peer->p_tx_bytes); + counter_u64_free(peer->p_rx_bytes); + rw_destroy(&peer->p_timers.t_lock); + rw_destroy(&peer->p_endpoint_lock); + free(peer, M_WG); +} + +static void +wg_peer_destroy(struct wg_peer *peer) +{ + /* Callers should already have called: + * wg_hashtable_peer_remove(&sc->sc_hashtable, peer); + */ + wg_aip_delete(&peer->p_sc->sc_aips, peer); + MPASS(CK_LIST_EMPTY(&peer->p_aips)); + + /* We disable all timers, so we can't call the following tasks. */ + wg_timers_disable(&peer->p_timers); + + /* Ensure the tasks have finished running */ + GROUPTASK_DRAIN(&peer->p_clear_secrets); + GROUPTASK_DRAIN(&peer->p_send_initiation); + GROUPTASK_DRAIN(&peer->p_send_keepalive); + GROUPTASK_DRAIN(&peer->p_recv); + GROUPTASK_DRAIN(&peer->p_send); + + taskqgroup_detach(qgroup_if_io_tqg, &peer->p_clear_secrets); + taskqgroup_detach(qgroup_if_io_tqg, &peer->p_send_initiation); + taskqgroup_detach(qgroup_if_io_tqg, &peer->p_send_keepalive); + taskqgroup_detach(qgroup_if_io_tqg, &peer->p_recv); + taskqgroup_detach(qgroup_if_io_tqg, &peer->p_send); + + wg_queue_deinit(&peer->p_decap_queue); + wg_queue_deinit(&peer->p_encap_queue); + wg_queue_deinit(&peer->p_stage_queue); + + /* Final cleanup */ + --peer->p_sc->sc_peer_count; + noise_remote_clear(&peer->p_remote); + DPRINTF(peer->p_sc, "Peer %llu destroyed\n", (unsigned long long)peer->p_id); + NET_EPOCH_CALL(wg_peer_free_deferred, &peer->p_ctx); +} + +static void +wg_peer_set_endpoint_from_tag(struct wg_peer *peer, struct wg_tag *t) +{ + struct wg_endpoint *e = &t->t_endpoint; + + MPASS(e->e_remote.r_sa.sa_family != 0); + if (memcmp(e, &peer->p_endpoint, sizeof(*e)) == 0) + return; + + peer->p_endpoint = *e; +} + +static void +wg_peer_clear_src(struct wg_peer *peer) +{ + rw_rlock(&peer->p_endpoint_lock); + bzero(&peer->p_endpoint.e_local, sizeof(peer->p_endpoint.e_local)); + rw_runlock(&peer->p_endpoint_lock); +} + +static void +wg_peer_get_endpoint(struct wg_peer *p, struct wg_endpoint *e) +{ + memcpy(e, &p->p_endpoint, sizeof(*e)); +} + +/* Allowed IP */ +static int +wg_aip_init(struct wg_aip_table *tbl) +{ + int rc; + + tbl->t_count = 0; + rc = rn_inithead((void **)&tbl->t_ip, + offsetof(struct sockaddr_in, sin_addr) * NBBY); + + if (rc == 0) + return (ENOMEM); + RADIX_NODE_HEAD_LOCK_INIT(tbl->t_ip); +#ifdef INET6 + rc = rn_inithead((void **)&tbl->t_ip6, + offsetof(struct 
sockaddr_in6, sin6_addr) * NBBY); + if (rc == 0) { + free(tbl->t_ip, M_RTABLE); + return (ENOMEM); + } + RADIX_NODE_HEAD_LOCK_INIT(tbl->t_ip6); +#endif + return (0); +} + +static void +wg_aip_destroy(struct wg_aip_table *tbl) +{ + RADIX_NODE_HEAD_DESTROY(tbl->t_ip); + free(tbl->t_ip, M_RTABLE); +#ifdef INET6 + RADIX_NODE_HEAD_DESTROY(tbl->t_ip6); + free(tbl->t_ip6, M_RTABLE); +#endif +} + +static void +wg_aip_populate_aip4(struct wg_aip *aip, const struct in_addr *addr, + uint8_t mask) +{ + struct sockaddr_in *raddr, *rmask; + uint8_t *p; + unsigned int i; + + raddr = (struct sockaddr_in *)&aip->r_addr; + rmask = (struct sockaddr_in *)&aip->r_mask; + + raddr->sin_len = sizeof(*raddr); + raddr->sin_family = AF_INET; + raddr->sin_addr = *addr; + + rmask->sin_len = sizeof(*rmask); + p = (uint8_t *)&rmask->sin_addr.s_addr; + for (i = 0; i < mask / NBBY; i++) + p[i] = 0xff; + if ((mask % NBBY) != 0) + p[i] = (0xff00 >> (mask % NBBY)) & 0xff; + raddr->sin_addr.s_addr &= rmask->sin_addr.s_addr; +} + +static void +wg_aip_populate_aip6(struct wg_aip *aip, const struct in6_addr *addr, + uint8_t mask) +{ + struct sockaddr_in6 *raddr, *rmask; + + raddr = (struct sockaddr_in6 *)&aip->r_addr; + rmask = (struct sockaddr_in6 *)&aip->r_mask; + + raddr->sin6_len = sizeof(*raddr); + raddr->sin6_family = AF_INET6; + raddr->sin6_addr = *addr; + + rmask->sin6_len = sizeof(*rmask); + in6_prefixlen2mask(&rmask->sin6_addr, mask); + for (int i = 0; i < 4; ++i) + raddr->sin6_addr.__u6_addr.__u6_addr32[i] &= rmask->sin6_addr.__u6_addr.__u6_addr32[i]; +} + +/* wg_aip_take assumes that the caller guarantees the allowed-ip exists. */ +static void +wg_aip_take(struct radix_node_head *root, struct wg_peer *peer, + struct wg_aip *route) +{ + struct radix_node *node; + struct wg_peer *ppeer; + + RADIX_NODE_HEAD_LOCK_ASSERT(root); + + node = root->rnh_lookup(&route->r_addr, &route->r_mask, + &root->rh); + MPASS(node != NULL); + + route = (struct wg_aip *)node; + ppeer = route->r_peer; + if (ppeer != peer) { + route->r_peer = peer; + + CK_LIST_REMOVE(route, r_entry); + CK_LIST_INSERT_HEAD(&peer->p_aips, route, r_entry); + } +} + +static int +wg_aip_add(struct wg_aip_table *tbl, struct wg_peer *peer, + const struct wg_allowedip *aip) +{ + struct radix_node *node; + struct radix_node_head *root; + struct wg_aip *route; + sa_family_t family; + bool needfree = false; + + family = aip->family; + if (family != AF_INET && family != AF_INET6) { + return (EINVAL); + } + + route = malloc(sizeof(*route), M_WG, M_WAITOK|M_ZERO); + switch (family) { + case AF_INET: + root = tbl->t_ip; + + wg_aip_populate_aip4(route, &aip->ip4, aip->cidr); + break; + case AF_INET6: + root = tbl->t_ip6; + + wg_aip_populate_aip6(route, &aip->ip6, aip->cidr); + break; + } + + route->r_peer = peer; + + RADIX_NODE_HEAD_LOCK(root); + node = root->rnh_addaddr(&route->r_addr, &route->r_mask, &root->rh, + route->r_nodes); + if (node == route->r_nodes) { + tbl->t_count++; + CK_LIST_INSERT_HEAD(&peer->p_aips, route, r_entry); + } else { + needfree = true; + wg_aip_take(root, peer, route); + } + RADIX_NODE_HEAD_UNLOCK(root); + if (needfree) { + free(route, M_WG); + } + return (0); +} + +static struct wg_peer * +wg_aip_lookup(struct wg_aip_table *tbl, struct mbuf *m, + enum route_direction dir) +{ + RADIX_NODE_HEAD_RLOCK_TRACKER; + struct ip *iphdr; + struct ip6_hdr *ip6hdr; + struct radix_node_head *root; + struct radix_node *node; + struct wg_peer *peer = NULL; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + void *addr; + int version; + + NET_EPOCH_ASSERT(); 
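+	/*
+	 * Pick the v4 or v6 radix head from the IP version nibble, then
+	 * match on the source address for inbound packets or the
+	 * destination address for outbound ones.
+	 */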
+ iphdr = mtod(m, struct ip *); + version = iphdr->ip_v; + + if (__predict_false(dir != IN && dir != OUT)) + return NULL; + + if (version == 4) { + root = tbl->t_ip; + memset(&sin, 0, sizeof(sin)); + sin.sin_len = sizeof(struct sockaddr_in); + if (dir == IN) + sin.sin_addr = iphdr->ip_src; + else + sin.sin_addr = iphdr->ip_dst; + addr = &sin; + } else if (version == 6) { + ip6hdr = mtod(m, struct ip6_hdr *); + memset(&sin6, 0, sizeof(sin6)); + sin6.sin6_len = sizeof(struct sockaddr_in6); + + root = tbl->t_ip6; + if (dir == IN) + addr = &ip6hdr->ip6_src; + else + addr = &ip6hdr->ip6_dst; + memcpy(&sin6.sin6_addr, addr, sizeof(sin6.sin6_addr)); + addr = &sin6; + } else { + return (NULL); + } + RADIX_NODE_HEAD_RLOCK(root); + if ((node = root->rnh_matchaddr(addr, &root->rh)) != NULL) { + peer = ((struct wg_aip *) node)->r_peer; + } + RADIX_NODE_HEAD_RUNLOCK(root); + return (peer); +} + +struct peer_del_arg { + struct radix_node_head * pda_head; + struct wg_peer *pda_peer; + struct wg_aip_table *pda_tbl; +}; + +static int +wg_peer_remove(struct radix_node *rn, void *arg) +{ + struct peer_del_arg *pda = arg; + struct wg_peer *peer = pda->pda_peer; + struct radix_node_head * rnh = pda->pda_head; + struct wg_aip_table *tbl = pda->pda_tbl; + struct wg_aip *route = (struct wg_aip *)rn; + struct radix_node *x; + + if (route->r_peer != peer) + return (0); + x = (struct radix_node *)rnh->rnh_deladdr(&route->r_addr, + &route->r_mask, &rnh->rh); + if (x != NULL) { + tbl->t_count--; + CK_LIST_REMOVE(route, r_entry); + free(route, M_WG); + } + return (0); +} + +static void +wg_peer_remove_all(struct wg_softc *sc) +{ + struct wg_peer *peer, *tpeer; + + sx_assert(&sc->sc_lock, SX_XLOCKED); + + CK_LIST_FOREACH_SAFE(peer, &sc->sc_hashtable.h_peers_list, + p_entry, tpeer) { + wg_hashtable_peer_remove(&sc->sc_hashtable, peer); + wg_peer_destroy(peer); + } +} + +static int +wg_aip_delete(struct wg_aip_table *tbl, struct wg_peer *peer) +{ + struct peer_del_arg pda; + + pda.pda_peer = peer; + pda.pda_tbl = tbl; + RADIX_NODE_HEAD_LOCK(tbl->t_ip); + pda.pda_head = tbl->t_ip; + rn_walktree(&tbl->t_ip->rh, wg_peer_remove, &pda); + RADIX_NODE_HEAD_UNLOCK(tbl->t_ip); + + RADIX_NODE_HEAD_LOCK(tbl->t_ip6); + pda.pda_head = tbl->t_ip6; + rn_walktree(&tbl->t_ip6->rh, wg_peer_remove, &pda); + RADIX_NODE_HEAD_UNLOCK(tbl->t_ip6); + return (0); +} + +static int +wg_socket_init(struct wg_softc *sc, in_port_t port) +{ + struct thread *td; + struct ucred *cred; + struct socket *so4, *so6; + int rc; + + sx_assert(&sc->sc_lock, SX_XLOCKED); + + td = curthread; + if (sc->sc_ucred == NULL) + return (EBUSY); + cred = crhold(sc->sc_ucred); + + /* + * For socket creation, we use the creds of the thread that created the + * tunnel rather than the current thread to maintain the semantics that + * WireGuard has on Linux with network namespaces -- that the sockets + * are created in their home vnet so that they can be configured and + * functionally attached to a foreign vnet as the jail's only interface + * to the network. + */ + rc = socreate(AF_INET, &so4, SOCK_DGRAM, IPPROTO_UDP, cred, td); + if (rc) + goto out; + + rc = udp_set_kernel_tunneling(so4, wg_input, NULL, sc); + /* + * udp_set_kernel_tunneling can only fail if there is already a tunneling function set. + * This should never happen with a new socket. 
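+ * (so4 was created immediately above, so the MPASS below should never
+ * trip.)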
+ */ + MPASS(rc == 0); + + rc = socreate(AF_INET6, &so6, SOCK_DGRAM, IPPROTO_UDP, cred, td); + if (rc) { + SOCK_LOCK(so4); + sofree(so4); + goto out; + } + rc = udp_set_kernel_tunneling(so6, wg_input, NULL, sc); + MPASS(rc == 0); + + so4->so_user_cookie = so6->so_user_cookie = sc->sc_socket.so_user_cookie; + + rc = wg_socket_bind(so4, so6, &port); + if (rc == 0) { + sc->sc_socket.so_port = port; + wg_socket_set(sc, so4, so6); + } +out: + crfree(cred); + return (rc); +} + +static void wg_socket_set_cookie(struct wg_softc *sc, uint32_t user_cookie) +{ + struct wg_socket *so = &sc->sc_socket; + + sx_assert(&sc->sc_lock, SX_XLOCKED); + + so->so_user_cookie = user_cookie; + if (so->so_so4) + so->so_so4->so_user_cookie = user_cookie; + if (so->so_so6) + so->so_so6->so_user_cookie = user_cookie; +} + +static void +wg_socket_uninit(struct wg_softc *sc) +{ + wg_socket_set(sc, NULL, NULL); +} + +static void +wg_socket_set(struct wg_softc *sc, struct socket *new_so4, struct socket *new_so6) +{ + struct wg_socket *so = &sc->sc_socket; + struct socket *so4, *so6; + + sx_assert(&sc->sc_lock, SX_XLOCKED); + + so4 = atomic_load_ptr(&so->so_so4); + so6 = atomic_load_ptr(&so->so_so6); + atomic_store_ptr(&so->so_so4, new_so4); + atomic_store_ptr(&so->so_so6, new_so6); + + if (!so4 && !so6) + return; + NET_EPOCH_WAIT(); + if (so4) + soclose(so4); + if (so6) + soclose(so6); +} + +union wg_sockaddr { + struct sockaddr sa; + struct sockaddr_in in4; + struct sockaddr_in6 in6; +}; + +static int +wg_socket_bind(struct socket *so4, struct socket *so6, in_port_t *requested_port) +{ + int rc; + struct thread *td; + union wg_sockaddr laddr; + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; + in_port_t port = *requested_port; + + td = curthread; + bzero(&laddr, sizeof(laddr)); + sin = &laddr.in4; + sin->sin_len = sizeof(laddr.in4); + sin->sin_family = AF_INET; + sin->sin_port = htons(port); + sin->sin_addr = (struct in_addr) { 0 }; + + if ((rc = sobind(so4, &laddr.sa, td)) != 0) + return (rc); + + if (port == 0) { + rc = sogetsockaddr(so4, (struct sockaddr **)&sin); + if (rc != 0) + return (rc); + port = ntohs(sin->sin_port); + free(sin, M_SONAME); + } + + sin6 = &laddr.in6; + sin6->sin6_len = sizeof(laddr.in6); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = htons(port); + sin6->sin6_addr = (struct in6_addr) { .s6_addr = { 0 } }; + rc = sobind(so6, &laddr.sa, td); + if (rc != 0) + return (rc); + *requested_port = port; + return (0); +} + +static int +wg_send(struct wg_softc *sc, struct wg_endpoint *e, struct mbuf *m) +{ + struct epoch_tracker et; + struct sockaddr *sa; + struct wg_socket *so = &sc->sc_socket; + struct socket *so4, *so6; + struct mbuf *control = NULL; + int ret = 0; + size_t len = m->m_pkthdr.len; + + /* Get local control address before locking */ + if (e->e_remote.r_sa.sa_family == AF_INET) { + if (e->e_local.l_in.s_addr != INADDR_ANY) + control = sbcreatecontrol((caddr_t)&e->e_local.l_in, + sizeof(struct in_addr), IP_SENDSRCADDR, + IPPROTO_IP); + } else if (e->e_remote.r_sa.sa_family == AF_INET6) { + if (!IN6_IS_ADDR_UNSPECIFIED(&e->e_local.l_in6)) + control = sbcreatecontrol((caddr_t)&e->e_local.l_pktinfo6, + sizeof(struct in6_pktinfo), IPV6_PKTINFO, + IPPROTO_IPV6); + } else { + m_freem(m); + return (EAFNOSUPPORT); + } + + /* Get remote address */ + sa = &e->e_remote.r_sa; + + NET_EPOCH_ENTER(et); + so4 = atomic_load_ptr(&so->so_so4); + so6 = atomic_load_ptr(&so->so_so6); + if (e->e_remote.r_sa.sa_family == AF_INET && so4 != NULL) + ret = sosend(so4, sa, NULL, m, control, 0, curthread); + 
else if (e->e_remote.r_sa.sa_family == AF_INET6 && so6 != NULL) + ret = sosend(so6, sa, NULL, m, control, 0, curthread); + else { + ret = ENOTCONN; + m_freem(control); + m_freem(m); + } + NET_EPOCH_EXIT(et); + if (ret == 0) { + if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1); + if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, len); + } + return (ret); +} + +static void +wg_send_buf(struct wg_softc *sc, struct wg_endpoint *e, uint8_t *buf, + size_t len) +{ + struct mbuf *m; + int ret = 0; + +retry: + m = m_gethdr(M_WAITOK, MT_DATA); + m->m_len = 0; + m_copyback(m, 0, len, buf); + + if (ret == 0) { + ret = wg_send(sc, e, m); + /* Retry if we couldn't bind to e->e_local */ + if (ret == EADDRNOTAVAIL) { + bzero(&e->e_local, sizeof(e->e_local)); + goto retry; + } + } else { + ret = wg_send(sc, e, m); + } + if (ret) + DPRINTF(sc, "Unable to send packet: %d\n", ret); +} + +/* TODO Tag */ +static struct wg_tag * +wg_tag_get(struct mbuf *m) +{ + struct m_tag *tag; + + tag = m_tag_find(m, MTAG_WIREGUARD, NULL); + if (tag == NULL) { + tag = m_tag_get(MTAG_WIREGUARD, sizeof(struct wg_tag), M_NOWAIT|M_ZERO); + m_tag_prepend(m, tag); + MPASS(!SLIST_EMPTY(&m->m_pkthdr.tags)); + MPASS(m_tag_locate(m, MTAG_ABI_COMPAT, MTAG_WIREGUARD, NULL) == tag); + } + return (struct wg_tag *)tag; +} + +static struct wg_endpoint * +wg_mbuf_endpoint_get(struct mbuf *m) +{ + struct wg_tag *hdr; + + if ((hdr = wg_tag_get(m)) == NULL) + return (NULL); + + return (&hdr->t_endpoint); +} + +/* Timers */ +static void +wg_timers_init(struct wg_timers *t) +{ + bzero(t, sizeof(*t)); + + t->t_disabled = 1; + rw_init(&t->t_lock, "wg peer timers"); + callout_init(&t->t_retry_handshake, true); + callout_init(&t->t_send_keepalive, true); + callout_init(&t->t_new_handshake, true); + callout_init(&t->t_zero_key_material, true); + callout_init(&t->t_persistent_keepalive, true); +} + +static void +wg_timers_enable(struct wg_timers *t) +{ + rw_wlock(&t->t_lock); + t->t_disabled = 0; + rw_wunlock(&t->t_lock); + wg_timers_run_persistent_keepalive(t); +} + +static void +wg_timers_disable(struct wg_timers *t) +{ + rw_wlock(&t->t_lock); + t->t_disabled = 1; + t->t_need_another_keepalive = 0; + rw_wunlock(&t->t_lock); + + callout_stop(&t->t_retry_handshake); + callout_stop(&t->t_send_keepalive); + callout_stop(&t->t_new_handshake); + callout_stop(&t->t_zero_key_material); + callout_stop(&t->t_persistent_keepalive); +} + +static void +wg_timers_set_persistent_keepalive(struct wg_timers *t, uint16_t interval) +{ + rw_rlock(&t->t_lock); + if (!t->t_disabled) { + t->t_persistent_keepalive_interval = interval; + wg_timers_run_persistent_keepalive(t); + } + rw_runlock(&t->t_lock); +} + +static void +wg_timers_get_last_handshake(struct wg_timers *t, struct timespec *time) +{ + rw_rlock(&t->t_lock); + time->tv_sec = t->t_handshake_complete.tv_sec; + time->tv_nsec = t->t_handshake_complete.tv_nsec; + rw_runlock(&t->t_lock); +} + +static int +wg_timers_expired_handshake_last_sent(struct wg_timers *t) +{ + struct timespec uptime; + struct timespec expire = { .tv_sec = REKEY_TIMEOUT, .tv_nsec = 0 }; + + getnanouptime(&uptime); + timespecadd(&t->t_handshake_last_sent, &expire, &expire); + return timespeccmp(&uptime, &expire, >) ? 
ETIMEDOUT : 0; +} + +static int +wg_timers_check_handshake_last_sent(struct wg_timers *t) +{ + int ret; + + rw_wlock(&t->t_lock); + if ((ret = wg_timers_expired_handshake_last_sent(t)) == ETIMEDOUT) + getnanouptime(&t->t_handshake_last_sent); + rw_wunlock(&t->t_lock); + return (ret); +} + +/* Should be called after an authenticated data packet is sent. */ +static void +wg_timers_event_data_sent(struct wg_timers *t) +{ + rw_rlock(&t->t_lock); + if (!t->t_disabled && !callout_pending(&t->t_new_handshake)) + callout_reset(&t->t_new_handshake, MSEC_2_TICKS( + NEW_HANDSHAKE_TIMEOUT * 1000 + + arc4random_uniform(REKEY_TIMEOUT_JITTER)), + (timeout_t *)wg_timers_run_new_handshake, t); + rw_runlock(&t->t_lock); +} + +/* Should be called after an authenticated data packet is received. */ +static void +wg_timers_event_data_received(struct wg_timers *t) +{ + rw_rlock(&t->t_lock); + if (!t->t_disabled) { + if (!callout_pending(&t->t_send_keepalive)) { + callout_reset(&t->t_send_keepalive, + MSEC_2_TICKS(KEEPALIVE_TIMEOUT * 1000), + (timeout_t *)wg_timers_run_send_keepalive, t); + } else { + t->t_need_another_keepalive = 1; + } + } + rw_runlock(&t->t_lock); +} + +/* + * Should be called after any type of authenticated packet is sent, whether + * keepalive, data, or handshake. + */ +static void +wg_timers_event_any_authenticated_packet_sent(struct wg_timers *t) +{ + callout_stop(&t->t_send_keepalive); +} + +/* + * Should be called after any type of authenticated packet is received, whether + * keepalive, data, or handshake. + */ +static void +wg_timers_event_any_authenticated_packet_received(struct wg_timers *t) +{ + callout_stop(&t->t_new_handshake); +} + +/* + * Should be called before a packet with authentication, whether + * keepalive, data, or handshake is sent, or after one is received. + */ +static void +wg_timers_event_any_authenticated_packet_traversal(struct wg_timers *t) +{ + rw_rlock(&t->t_lock); + if (!t->t_disabled && t->t_persistent_keepalive_interval > 0) + callout_reset(&t->t_persistent_keepalive, + MSEC_2_TICKS(t->t_persistent_keepalive_interval * 1000), + (timeout_t *)wg_timers_run_persistent_keepalive, t); + rw_runlock(&t->t_lock); +} + +/* Should be called after a handshake initiation message is sent. */ +static void +wg_timers_event_handshake_initiated(struct wg_timers *t) +{ + rw_rlock(&t->t_lock); + if (!t->t_disabled) + callout_reset(&t->t_retry_handshake, MSEC_2_TICKS( + REKEY_TIMEOUT * 1000 + + arc4random_uniform(REKEY_TIMEOUT_JITTER)), + (timeout_t *)wg_timers_run_retry_handshake, t); + rw_runlock(&t->t_lock); +} + +static void +wg_timers_event_handshake_responded(struct wg_timers *t) +{ + rw_wlock(&t->t_lock); + getnanouptime(&t->t_handshake_last_sent); + rw_wunlock(&t->t_lock); +} + +/* + * Should be called after a handshake response message is received and processed + * or when getting key confirmation via the first data message. + */ +static void +wg_timers_event_handshake_complete(struct wg_timers *t) +{ + rw_wlock(&t->t_lock); + if (!t->t_disabled) { + callout_stop(&t->t_retry_handshake); + t->t_handshake_retries = 0; + getnanotime(&t->t_handshake_complete); + wg_timers_run_send_keepalive(t); + } + rw_wunlock(&t->t_lock); +} + +/* + * Should be called after an ephemeral key is created, which is before sending a + * handshake response or after receiving a handshake response. 
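+ * Arming t_zero_key_material below bounds how long the derived keys
+ * can linger: if no fresh handshake completes within
+ * REJECT_AFTER_TIME * 3 seconds, the callout discards all of the
+ * peer's key material.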
+ */ +static void +wg_timers_event_session_derived(struct wg_timers *t) +{ + rw_rlock(&t->t_lock); + if (!t->t_disabled) { + callout_reset(&t->t_zero_key_material, + MSEC_2_TICKS(REJECT_AFTER_TIME * 3 * 1000), + (timeout_t *)wg_timers_run_zero_key_material, t); + } + rw_runlock(&t->t_lock); +} + +static void +wg_timers_event_want_initiation(struct wg_timers *t) +{ + rw_rlock(&t->t_lock); + if (!t->t_disabled) + wg_timers_run_send_initiation(t, 0); + rw_runlock(&t->t_lock); +} + +static void +wg_timers_event_reset_handshake_last_sent(struct wg_timers *t) +{ + rw_wlock(&t->t_lock); + t->t_handshake_last_sent.tv_sec -= (REKEY_TIMEOUT + 1); + rw_wunlock(&t->t_lock); +} + +static void +wg_timers_run_send_initiation(struct wg_timers *t, int is_retry) +{ + struct wg_peer *peer = __containerof(t, struct wg_peer, p_timers); + if (!is_retry) + t->t_handshake_retries = 0; + if (wg_timers_expired_handshake_last_sent(t) == ETIMEDOUT) + GROUPTASK_ENQUEUE(&peer->p_send_initiation); +} + +static void +wg_timers_run_retry_handshake(struct wg_timers *t) +{ + struct wg_peer *peer = __containerof(t, struct wg_peer, p_timers); + + rw_wlock(&t->t_lock); + if (t->t_handshake_retries <= MAX_TIMER_HANDSHAKES) { + t->t_handshake_retries++; + rw_wunlock(&t->t_lock); + + DPRINTF(peer->p_sc, "Handshake for peer %llu did not complete " + "after %d seconds, retrying (try %d)\n", + (unsigned long long)peer->p_id, + REKEY_TIMEOUT, t->t_handshake_retries + 1); + wg_peer_clear_src(peer); + wg_timers_run_send_initiation(t, 1); + } else { + rw_wunlock(&t->t_lock); + + DPRINTF(peer->p_sc, "Handshake for peer %llu did not complete " + "after %d retries, giving up\n", + (unsigned long long) peer->p_id, MAX_TIMER_HANDSHAKES + 2); + + callout_stop(&t->t_send_keepalive); + wg_queue_purge(&peer->p_stage_queue); + if (!callout_pending(&t->t_zero_key_material)) + callout_reset(&t->t_zero_key_material, + MSEC_2_TICKS(REJECT_AFTER_TIME * 3 * 1000), + (timeout_t *)wg_timers_run_zero_key_material, t); + } +} + +static void +wg_timers_run_send_keepalive(struct wg_timers *t) +{ + struct wg_peer *peer = __containerof(t, struct wg_peer, p_timers); + + GROUPTASK_ENQUEUE(&peer->p_send_keepalive); + if (t->t_need_another_keepalive) { + t->t_need_another_keepalive = 0; + callout_reset(&t->t_send_keepalive, + MSEC_2_TICKS(KEEPALIVE_TIMEOUT * 1000), + (timeout_t *)wg_timers_run_send_keepalive, t); + } +} + +static void +wg_timers_run_new_handshake(struct wg_timers *t) +{ + struct wg_peer *peer = __containerof(t, struct wg_peer, p_timers); + + DPRINTF(peer->p_sc, "Retrying handshake with peer %llu because we " + "stopped hearing back after %d seconds\n", + (unsigned long long)peer->p_id, NEW_HANDSHAKE_TIMEOUT); + wg_peer_clear_src(peer); + + wg_timers_run_send_initiation(t, 0); +} + +static void +wg_timers_run_zero_key_material(struct wg_timers *t) +{ + struct wg_peer *peer = __containerof(t, struct wg_peer, p_timers); + + DPRINTF(peer->p_sc, "Zeroing out all keys for peer %llu, since we " + "haven't received a new one in %d seconds\n", + (unsigned long long)peer->p_id, REJECT_AFTER_TIME * 3); + GROUPTASK_ENQUEUE(&peer->p_clear_secrets); +} + +static void +wg_timers_run_persistent_keepalive(struct wg_timers *t) +{ + struct wg_peer *peer = __containerof(t, struct wg_peer, p_timers); + + if (t->t_persistent_keepalive_interval != 0) + GROUPTASK_ENQUEUE(&peer->p_send_keepalive); +} + +/* TODO Handshake */ +static void +wg_peer_send_buf(struct wg_peer *peer, uint8_t *buf, size_t len) +{ + struct wg_endpoint endpoint; + + counter_u64_add(peer->p_tx_bytes, 
len); + wg_timers_event_any_authenticated_packet_traversal(&peer->p_timers); + wg_timers_event_any_authenticated_packet_sent(&peer->p_timers); + wg_peer_get_endpoint(peer, &endpoint); + wg_send_buf(peer->p_sc, &endpoint, buf, len); +} + +static void +wg_send_initiation(struct wg_peer *peer) +{ + struct wg_pkt_initiation pkt; + struct epoch_tracker et; + + if (wg_timers_check_handshake_last_sent(&peer->p_timers) != ETIMEDOUT) + return; + DPRINTF(peer->p_sc, "Sending handshake initiation to peer %llu\n", + (unsigned long long)peer->p_id); + + NET_EPOCH_ENTER(et); + if (noise_create_initiation(&peer->p_remote, &pkt.s_idx, pkt.ue, + pkt.es, pkt.ets) != 0) + goto out; + pkt.t = WG_PKT_INITIATION; + cookie_maker_mac(&peer->p_cookie, &pkt.m, &pkt, + sizeof(pkt)-sizeof(pkt.m)); + wg_peer_send_buf(peer, (uint8_t *)&pkt, sizeof(pkt)); + wg_timers_event_handshake_initiated(&peer->p_timers); +out: + NET_EPOCH_EXIT(et); +} + +static void +wg_send_response(struct wg_peer *peer) +{ + struct wg_pkt_response pkt; + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); + + DPRINTF(peer->p_sc, "Sending handshake response to peer %llu\n", + (unsigned long long)peer->p_id); + + if (noise_create_response(&peer->p_remote, &pkt.s_idx, &pkt.r_idx, + pkt.ue, pkt.en) != 0) + goto out; + if (noise_remote_begin_session(&peer->p_remote) != 0) + goto out; + + wg_timers_event_session_derived(&peer->p_timers); + pkt.t = WG_PKT_RESPONSE; + cookie_maker_mac(&peer->p_cookie, &pkt.m, &pkt, + sizeof(pkt)-sizeof(pkt.m)); + wg_timers_event_handshake_responded(&peer->p_timers); + wg_peer_send_buf(peer, (uint8_t*)&pkt, sizeof(pkt)); +out: + NET_EPOCH_EXIT(et); +} + +static void +wg_send_cookie(struct wg_softc *sc, struct cookie_macs *cm, uint32_t idx, + struct mbuf *m) +{ + struct wg_pkt_cookie pkt; + struct wg_endpoint *e; + + DPRINTF(sc, "Sending cookie response for denied handshake message\n"); + + pkt.t = WG_PKT_COOKIE; + pkt.r_idx = idx; + + e = wg_mbuf_endpoint_get(m); + cookie_checker_create_payload(&sc->sc_cookie, cm, pkt.nonce, + pkt.ec, &e->e_remote.r_sa); + wg_send_buf(sc, e, (uint8_t *)&pkt, sizeof(pkt)); +} + +static void +wg_send_keepalive(struct wg_peer *peer) +{ + struct mbuf *m = NULL; + struct wg_tag *t; + struct epoch_tracker et; + + if (wg_queue_len(&peer->p_stage_queue) != 0) { + NET_EPOCH_ENTER(et); + goto send; + } + if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) + return; + if ((t = wg_tag_get(m)) == NULL) { + m_freem(m); + return; + } + t->t_peer = peer; + t->t_mbuf = NULL; + t->t_done = 0; + t->t_mtu = 0; /* MTU == 0 OK for keepalive */ + + NET_EPOCH_ENTER(et); + wg_queue_stage(peer, m); +send: + wg_queue_out(peer); + NET_EPOCH_EXIT(et); +} + +static int +wg_cookie_validate_packet(struct cookie_checker *checker, struct mbuf *m, + int under_load) +{ + struct wg_pkt_initiation *init; + struct wg_pkt_response *resp; + struct cookie_macs *macs; + struct wg_endpoint *e; + int type, size; + void *data; + + type = *mtod(m, uint32_t *); + data = m->m_data; + e = wg_mbuf_endpoint_get(m); + if (type == WG_PKT_INITIATION) { + init = mtod(m, struct wg_pkt_initiation *); + macs = &init->m; + size = sizeof(*init) - sizeof(*macs); + } else if (type == WG_PKT_RESPONSE) { + resp = mtod(m, struct wg_pkt_response *); + macs = &resp->m; + size = sizeof(*resp) - sizeof(*macs); + } else + return 0; + + return (cookie_checker_validate_macs(checker, macs, data, size, + under_load, &e->e_remote.r_sa)); +} + + +static void +wg_handshake(struct wg_softc *sc, struct mbuf *m) +{ + struct wg_pkt_initiation *init; + struct wg_pkt_response 
*resp;
+ struct noise_remote *remote;
+ struct wg_pkt_cookie *cook;
+ struct wg_peer *peer;
+ struct wg_tag *t;
+
+ /* This is global, so that our load calculation applies to the whole
+ * system. We don't care about races with it at all.
+ */
+ static struct timeval wg_last_underload;
+ static const struct timeval underload_interval = { UNDERLOAD_TIMEOUT, 0 };
+ bool packet_needs_cookie = false;
+ int underload, res;
+
+ underload = mbufq_len(&sc->sc_handshake_queue) >=
+ MAX_QUEUED_HANDSHAKES / 8;
+ if (underload)
+ getmicrouptime(&wg_last_underload);
+ else if (wg_last_underload.tv_sec != 0) {
+ if (!ratecheck(&wg_last_underload, &underload_interval))
+ underload = 1;
+ else
+ bzero(&wg_last_underload, sizeof(wg_last_underload));
+ }
+
+ res = wg_cookie_validate_packet(&sc->sc_cookie, m, underload);
+
+ if (res == EINVAL) {
+ DPRINTF(sc, "Invalid initiation MAC\n");
+ goto free;
+ } else if (res == ECONNREFUSED) {
+ DPRINTF(sc, "Handshake ratelimited\n");
+ goto free;
+ } else if (res == EAGAIN) {
+ packet_needs_cookie = true;
+ } else if (res != 0) {
+ DPRINTF(sc, "Unexpected handshake ratelimit response: %d\n", res);
+ goto free;
+ }
+
+ t = wg_tag_get(m);
+ switch (*mtod(m, uint32_t *)) {
+ case WG_PKT_INITIATION:
+ init = mtod(m, struct wg_pkt_initiation *);
+
+ if (packet_needs_cookie) {
+ wg_send_cookie(sc, &init->m, init->s_idx, m);
+ goto free;
+ }
+ if (noise_consume_initiation(&sc->sc_local, &remote,
+ init->s_idx, init->ue, init->es, init->ets) != 0) {
+ DPRINTF(sc, "Invalid handshake initiation\n");
+ goto free;
+ }
+
+ peer = __containerof(remote, struct wg_peer, p_remote);
+ DPRINTF(sc, "Receiving handshake initiation from peer %llu\n",
+ (unsigned long long)peer->p_id);
+ counter_u64_add(peer->p_rx_bytes, sizeof(*init));
+ if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1);
+ if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, sizeof(*init));
+ wg_peer_set_endpoint_from_tag(peer, t);
+ wg_send_response(peer);
+ break;
+ case WG_PKT_RESPONSE:
+ resp = mtod(m, struct wg_pkt_response *);
+
+ if (packet_needs_cookie) {
+ wg_send_cookie(sc, &resp->m, resp->s_idx, m);
+ goto free;
+ }
+
+ if ((remote = wg_index_get(sc, resp->r_idx)) == NULL) {
+ DPRINTF(sc, "Unknown handshake response\n");
+ goto free;
+ }
+ peer = __containerof(remote, struct wg_peer, p_remote);
+ if (noise_consume_response(remote, resp->s_idx, resp->r_idx,
+ resp->ue, resp->en) != 0) {
+ DPRINTF(sc, "Invalid handshake response\n");
+ goto free;
+ }
+
+ DPRINTF(sc, "Receiving handshake response from peer %llu\n",
+ (unsigned long long)peer->p_id);
+ counter_u64_add(peer->p_rx_bytes, sizeof(*resp));
+ if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1);
+ if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, sizeof(*resp));
+ wg_peer_set_endpoint_from_tag(peer, t);
+ if (noise_remote_begin_session(&peer->p_remote) == 0) {
+ wg_timers_event_session_derived(&peer->p_timers);
+ wg_timers_event_handshake_complete(&peer->p_timers);
+ }
+ break;
+ case WG_PKT_COOKIE:
+ cook = mtod(m, struct wg_pkt_cookie *);
+
+ if ((remote = wg_index_get(sc, cook->r_idx)) == NULL) {
+ DPRINTF(sc, "Unknown cookie index\n");
+ goto free;
+ }
+
+ peer = __containerof(remote, struct wg_peer, p_remote);
+
+ if (cookie_maker_consume_payload(&peer->p_cookie,
+ cook->nonce, cook->ec) != 0) {
+ DPRINTF(sc, "Could not decrypt cookie response\n");
+ goto free;
+ }
+
+ DPRINTF(sc, "Receiving cookie response\n");
+ goto free;
+ default:
+ goto free;
+ }
+ MPASS(peer != NULL);
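+ /*
+ * Consumed initiations and responses fall through to here (cookie
+ * messages bail out above): any authenticated handshake traffic counts
+ * as hearing from the peer, so the passive keepalive and new-handshake
+ * callouts are refreshed just below.
+ */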
wg_timers_event_any_authenticated_packet_received(&peer->p_timers); + wg_timers_event_any_authenticated_packet_traversal(&peer->p_timers); + +free: + m_freem(m); +} + +static void +wg_softc_handshake_receive(struct wg_softc *sc) +{ + struct mbuf *m; + + while ((m = mbufq_dequeue(&sc->sc_handshake_queue)) != NULL) + wg_handshake(sc, m); +} + +/* TODO Encrypt */ +static void +wg_encap(struct wg_softc *sc, struct mbuf *m) +{ + struct wg_pkt_data *data; + size_t padding_len, plaintext_len, out_len; + struct mbuf *mc; + struct wg_peer *peer; + struct wg_tag *t; + uint64_t nonce; + int res, allocation_order; + + NET_EPOCH_ASSERT(); + t = wg_tag_get(m); + peer = t->t_peer; + + plaintext_len = MIN(WG_PKT_WITH_PADDING(m->m_pkthdr.len), t->t_mtu); + padding_len = plaintext_len - m->m_pkthdr.len; + out_len = sizeof(struct wg_pkt_data) + plaintext_len + NOISE_AUTHTAG_LEN; + + if (out_len <= MCLBYTES) + allocation_order = MCLBYTES; + else if (out_len <= MJUMPAGESIZE) + allocation_order = MJUMPAGESIZE; + else if (out_len <= MJUM9BYTES) + allocation_order = MJUM9BYTES; + else if (out_len <= MJUM16BYTES) + allocation_order = MJUM16BYTES; + else + goto error; + + if ((mc = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, allocation_order)) == NULL) + goto error; + + data = mtod(mc, struct wg_pkt_data *); + m_copydata(m, 0, m->m_pkthdr.len, data->buf); + bzero(data->buf + m->m_pkthdr.len, padding_len); + + data->t = WG_PKT_DATA; + + res = noise_remote_encrypt(&peer->p_remote, &data->r_idx, &nonce, + data->buf, plaintext_len); + nonce = htole64(nonce); /* Wire format is little endian. */ + memcpy(data->nonce, &nonce, sizeof(data->nonce)); + + if (__predict_false(res)) { + if (res == EINVAL) { + wg_timers_event_want_initiation(&peer->p_timers); + m_freem(mc); + goto error; + } else if (res == ESTALE) { + wg_timers_event_want_initiation(&peer->p_timers); + } else { + m_freem(mc); + goto error; + } + } + + /* A packet with length 0 is a keepalive packet */ + if (m->m_pkthdr.len == 0) + DPRINTF(sc, "Sending keepalive packet to peer %llu\n", + (unsigned long long)peer->p_id); + /* + * Set the correct output value here since it will be copied + * when we move the pkthdr in send. + */ + mc->m_len = mc->m_pkthdr.len = out_len; + mc->m_flags &= ~(M_MCAST | M_BCAST); + + t->t_mbuf = mc; + error: + /* XXX membar ? */ + t->t_done = 1; + GROUPTASK_ENQUEUE(&peer->p_send); +} + +static void +wg_decap(struct wg_softc *sc, struct mbuf *m) +{ + struct wg_pkt_data *data; + struct wg_peer *peer, *routed_peer; + struct wg_tag *t; + size_t plaintext_len; + uint8_t version; + uint64_t nonce; + int res; + + NET_EPOCH_ASSERT(); + data = mtod(m, struct wg_pkt_data *); + plaintext_len = m->m_pkthdr.len - sizeof(struct wg_pkt_data); + + t = wg_tag_get(m); + peer = t->t_peer; + + memcpy(&nonce, data->nonce, sizeof(nonce)); + nonce = le64toh(nonce); /* Wire format is little endian. 
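+ * The 64-bit counter doubles as the AEAD nonce and the replay window
+ * position. A rough sketch of the data message being parsed here,
+ * matching the wg_pkt_data fields (widths per the WireGuard paper):
+ *
+ *   uint32_t t;        message type (WG_PKT_DATA)
+ *   uint32_t r_idx;    receiver's session index
+ *   uint8_t  nonce[8]; little-endian send counter
+ *   uint8_t  buf[];    ciphertext ending in a NOISE_AUTHTAG_LEN tag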
*/ + + res = noise_remote_decrypt(&peer->p_remote, data->r_idx, nonce, + data->buf, plaintext_len); + + if (__predict_false(res)) { + if (res == EINVAL) { + goto error; + } else if (res == ECONNRESET) { + wg_timers_event_handshake_complete(&peer->p_timers); + } else if (res == ESTALE) { + wg_timers_event_want_initiation(&peer->p_timers); + } else { + panic("unexpected response: %d\n", res); + } + } + wg_peer_set_endpoint_from_tag(peer, t); + + /* Remove the data header, and crypto mac tail from the packet */ + m_adj(m, sizeof(struct wg_pkt_data)); + m_adj(m, -NOISE_AUTHTAG_LEN); + + /* A packet with length 0 is a keepalive packet */ + if (m->m_pkthdr.len == 0) { + DPRINTF(peer->p_sc, "Receiving keepalive packet from peer " + "%llu\n", (unsigned long long)peer->p_id); + goto done; + } + + version = mtod(m, struct ip *)->ip_v; + if (!((version == 4 && m->m_pkthdr.len >= sizeof(struct ip)) || + (version == 6 && m->m_pkthdr.len >= sizeof(struct ip6_hdr)))) { + DPRINTF(peer->p_sc, "Packet is neither ipv4 nor ipv6 from peer " + "%llu\n", (unsigned long long)peer->p_id); + goto error; + } + + routed_peer = wg_aip_lookup(&peer->p_sc->sc_aips, m, IN); + if (routed_peer != peer) { + DPRINTF(peer->p_sc, "Packet has unallowed src IP from peer " + "%llu\n", (unsigned long long)peer->p_id); + goto error; + } + +done: + t->t_mbuf = m; +error: + t->t_done = 1; + GROUPTASK_ENQUEUE(&peer->p_recv); +} + +static void +wg_softc_decrypt(struct wg_softc *sc) +{ + struct epoch_tracker et; + struct mbuf *m; + + NET_EPOCH_ENTER(et); + while ((m = buf_ring_dequeue_mc(sc->sc_decap_ring)) != NULL) + wg_decap(sc, m); + NET_EPOCH_EXIT(et); +} + +static void +wg_softc_encrypt(struct wg_softc *sc) +{ + struct mbuf *m; + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); + while ((m = buf_ring_dequeue_mc(sc->sc_encap_ring)) != NULL) + wg_encap(sc, m); + NET_EPOCH_EXIT(et); +} + +static void +wg_encrypt_dispatch(struct wg_softc *sc) +{ + for (int i = 0; i < mp_ncpus; i++) { + if (sc->sc_encrypt[i].gt_task.ta_flags & TASK_ENQUEUED) + continue; + GROUPTASK_ENQUEUE(&sc->sc_encrypt[i]); + } +} + +static void +wg_decrypt_dispatch(struct wg_softc *sc) +{ + for (int i = 0; i < mp_ncpus; i++) { + if (sc->sc_decrypt[i].gt_task.ta_flags & TASK_ENQUEUED) + continue; + GROUPTASK_ENQUEUE(&sc->sc_decrypt[i]); + } +} + +static void +wg_deliver_out(struct wg_peer *peer) +{ + struct epoch_tracker et; + struct wg_tag *t; + struct mbuf *m; + struct wg_endpoint endpoint; + size_t len; + int ret; + + NET_EPOCH_ENTER(et); + if (peer->p_sc->sc_ifp->if_link_state == LINK_STATE_DOWN) + goto done; + + wg_peer_get_endpoint(peer, &endpoint); + + while ((m = wg_queue_dequeue(&peer->p_encap_queue, &t)) != NULL) { + /* t_mbuf will contain the encrypted packet */ + if (t->t_mbuf == NULL) { + if_inc_counter(peer->p_sc->sc_ifp, IFCOUNTER_OERRORS, 1); + m_freem(m); + continue; + } + len = t->t_mbuf->m_pkthdr.len; + ret = wg_send(peer->p_sc, &endpoint, t->t_mbuf); + + if (ret == 0) { + wg_timers_event_any_authenticated_packet_traversal( + &peer->p_timers); + wg_timers_event_any_authenticated_packet_sent( + &peer->p_timers); + + if (m->m_pkthdr.len != 0) + wg_timers_event_data_sent(&peer->p_timers); + counter_u64_add(peer->p_tx_bytes, len); + } else if (ret == EADDRNOTAVAIL) { + wg_peer_clear_src(peer); + wg_peer_get_endpoint(peer, &endpoint); + } + m_freem(m); + } +done: + NET_EPOCH_EXIT(et); +} + +static void +wg_deliver_in(struct wg_peer *peer) +{ + struct mbuf *m; + struct ifnet *ifp; + struct wg_softc *sc; + struct epoch_tracker et; + struct wg_tag *t; + 
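+ /*
+ * af is the address family prepended for bpf(4) listeners: the
+ * interface attaches with DLT_NULL (see bpfattach() in
+ * wg_clone_create()), whose pseudo link header is just a 4-byte
+ * family word.
+ */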
uint32_t af; + int version; + + NET_EPOCH_ENTER(et); + sc = peer->p_sc; + ifp = sc->sc_ifp; + + while ((m = wg_queue_dequeue(&peer->p_decap_queue, &t)) != NULL) { + /* t_mbuf will contain the encrypted packet */ + if (t->t_mbuf == NULL) { + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); + m_freem(m); + continue; + } + MPASS(m == t->t_mbuf); + + wg_timers_event_any_authenticated_packet_received( + &peer->p_timers); + wg_timers_event_any_authenticated_packet_traversal( + &peer->p_timers); + + counter_u64_add(peer->p_rx_bytes, m->m_pkthdr.len + sizeof(struct wg_pkt_data) + NOISE_AUTHTAG_LEN); + if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1); + if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len + sizeof(struct wg_pkt_data) + NOISE_AUTHTAG_LEN); + + if (m->m_pkthdr.len == 0) { + m_freem(m); + continue; + } + + m->m_flags &= ~(M_MCAST | M_BCAST); + m->m_pkthdr.rcvif = ifp; + version = mtod(m, struct ip *)->ip_v; + if (version == IPVERSION) { + af = AF_INET; + BPF_MTAP2(ifp, &af, sizeof(af), m); + CURVNET_SET(ifp->if_vnet); + ip_input(m); + CURVNET_RESTORE(); + } else if (version == 6) { + af = AF_INET6; + BPF_MTAP2(ifp, &af, sizeof(af), m); + CURVNET_SET(ifp->if_vnet); + ip6_input(m); + CURVNET_RESTORE(); + } else + m_freem(m); + + wg_timers_event_data_received(&peer->p_timers); + } + NET_EPOCH_EXIT(et); +} + +static int +wg_queue_in(struct wg_peer *peer, struct mbuf *m) +{ + struct buf_ring *parallel = peer->p_sc->sc_decap_ring; + struct wg_queue *serial = &peer->p_decap_queue; + struct wg_tag *t; + int rc; + + MPASS(wg_tag_get(m) != NULL); + + mtx_lock(&serial->q_mtx); + if ((rc = mbufq_enqueue(&serial->q, m)) == ENOBUFS) { + m_freem(m); + if_inc_counter(peer->p_sc->sc_ifp, IFCOUNTER_OQDROPS, 1); + } else { + m->m_flags |= M_ENQUEUED; + rc = buf_ring_enqueue(parallel, m); + if (rc == ENOBUFS) { + t = wg_tag_get(m); + t->t_done = 1; + } + } + mtx_unlock(&serial->q_mtx); + return (rc); +} + +static void +wg_queue_stage(struct wg_peer *peer, struct mbuf *m) +{ + struct wg_queue *q = &peer->p_stage_queue; + mtx_lock(&q->q_mtx); + STAILQ_INSERT_TAIL(&q->q.mq_head, m, m_stailqpkt); + q->q.mq_len++; + while (mbufq_full(&q->q)) { + m = mbufq_dequeue(&q->q); + if (m) { + m_freem(m); + if_inc_counter(peer->p_sc->sc_ifp, IFCOUNTER_OQDROPS, 1); + } + } + mtx_unlock(&q->q_mtx); +} + +static void +wg_queue_out(struct wg_peer *peer) +{ + struct buf_ring *parallel = peer->p_sc->sc_encap_ring; + struct wg_queue *serial = &peer->p_encap_queue; + struct wg_tag *t; + struct mbufq staged; + struct mbuf *m; + + if (noise_remote_ready(&peer->p_remote) != 0) { + if (wg_queue_len(&peer->p_stage_queue)) + wg_timers_event_want_initiation(&peer->p_timers); + return; + } + + /* We first "steal" the staged queue to a local queue, so that we can do these + * remaining operations without having to hold the staged queue mutex. 
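+ * Holding q_mtx across the whole walk would serialise wg_queue_stage()
+ * callers against the per-packet work below, which also takes the
+ * serial queue mutex for every mbuf; the swap keeps the critical
+ * section down to a few pointer and length updates.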
*/ + STAILQ_INIT(&staged.mq_head); + mtx_lock(&peer->p_stage_queue.q_mtx); + STAILQ_SWAP(&staged.mq_head, &peer->p_stage_queue.q.mq_head, mbuf); + staged.mq_len = peer->p_stage_queue.q.mq_len; + peer->p_stage_queue.q.mq_len = 0; + staged.mq_maxlen = peer->p_stage_queue.q.mq_maxlen; + mtx_unlock(&peer->p_stage_queue.q_mtx); + + while ((m = mbufq_dequeue(&staged)) != NULL) { + if ((t = wg_tag_get(m)) == NULL) { + m_freem(m); + continue; + } + t->t_peer = peer; + mtx_lock(&serial->q_mtx); + if (mbufq_enqueue(&serial->q, m) != 0) { + m_freem(m); + if_inc_counter(peer->p_sc->sc_ifp, IFCOUNTER_OQDROPS, 1); + } else { + m->m_flags |= M_ENQUEUED; + if (buf_ring_enqueue(parallel, m)) { + t = wg_tag_get(m); + t->t_done = 1; + } + } + mtx_unlock(&serial->q_mtx); + } + wg_encrypt_dispatch(peer->p_sc); +} + +static struct mbuf * +wg_queue_dequeue(struct wg_queue *q, struct wg_tag **t) +{ + struct mbuf *m_, *m; + + m = NULL; + mtx_lock(&q->q_mtx); + m_ = mbufq_first(&q->q); + if (m_ != NULL && (*t = wg_tag_get(m_))->t_done) { + m = mbufq_dequeue(&q->q); + m->m_flags &= ~M_ENQUEUED; + } + mtx_unlock(&q->q_mtx); + return (m); +} + +static int +wg_queue_len(struct wg_queue *q) +{ + /* This access races. We might consider adding locking here. */ + return (mbufq_len(&q->q)); +} + +static void +wg_queue_init(struct wg_queue *q, const char *name) +{ + mtx_init(&q->q_mtx, name, NULL, MTX_DEF); + mbufq_init(&q->q, MAX_QUEUED_PKT); +} + +static void +wg_queue_deinit(struct wg_queue *q) +{ + wg_queue_purge(q); + mtx_destroy(&q->q_mtx); +} + +static void +wg_queue_purge(struct wg_queue *q) +{ + mtx_lock(&q->q_mtx); + mbufq_drain(&q->q); + mtx_unlock(&q->q_mtx); +} + +/* TODO Indexes */ +static struct noise_remote * +wg_remote_get(struct wg_softc *sc, uint8_t public[NOISE_PUBLIC_KEY_LEN]) +{ + struct wg_peer *peer; + + if ((peer = wg_peer_lookup(sc, public)) == NULL) + return (NULL); + return (&peer->p_remote); +} + +static uint32_t +wg_index_set(struct wg_softc *sc, struct noise_remote *remote) +{ + struct wg_index *index, *iter; + struct wg_peer *peer; + uint32_t key; + + /* We can modify this without a lock as wg_index_set, wg_index_drop are + * guaranteed to be serialised (per remote). */ + peer = __containerof(remote, struct wg_peer, p_remote); + index = SLIST_FIRST(&peer->p_unused_index); + MPASS(index != NULL); + SLIST_REMOVE_HEAD(&peer->p_unused_index, i_unused_entry); + + index->i_value = remote; + + rw_wlock(&sc->sc_index_lock); +assign_id: + key = index->i_key = arc4random(); + key &= sc->sc_index_mask; + LIST_FOREACH(iter, &sc->sc_index[key], i_entry) + if (iter->i_key == index->i_key) + goto assign_id; + + LIST_INSERT_HEAD(&sc->sc_index[key], index, i_entry); + + rw_wunlock(&sc->sc_index_lock); + + /* Likewise, no need to lock for index here. 
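+ * i_key was just published under sc_index_lock above and is only
+ * mutated by its owning remote, so reading it back unlocked is fine.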
*/ + return index->i_key; +} + +static struct noise_remote * +wg_index_get(struct wg_softc *sc, uint32_t key0) +{ + struct wg_index *iter; + struct noise_remote *remote = NULL; + uint32_t key = key0 & sc->sc_index_mask; + + rw_enter_read(&sc->sc_index_lock); + LIST_FOREACH(iter, &sc->sc_index[key], i_entry) + if (iter->i_key == key0) { + remote = iter->i_value; + break; + } + rw_exit_read(&sc->sc_index_lock); + return remote; +} + +static void +wg_index_drop(struct wg_softc *sc, uint32_t key0) +{ + struct wg_index *iter; + struct wg_peer *peer = NULL; + uint32_t key = key0 & sc->sc_index_mask; + + rw_enter_write(&sc->sc_index_lock); + LIST_FOREACH(iter, &sc->sc_index[key], i_entry) + if (iter->i_key == key0) { + LIST_REMOVE(iter, i_entry); + break; + } + rw_exit_write(&sc->sc_index_lock); + + if (iter == NULL) + return; + + /* We expect a peer */ + peer = __containerof(iter->i_value, struct wg_peer, p_remote); + MPASS(peer != NULL); + SLIST_INSERT_HEAD(&peer->p_unused_index, iter, i_unused_entry); +} + +static int +wg_update_endpoint_addrs(struct wg_endpoint *e, const struct sockaddr *srcsa, + struct ifnet *rcvif) +{ + const struct sockaddr_in *sa4; + const struct sockaddr_in6 *sa6; + int ret = 0; + + /* + * UDP passes a 2-element sockaddr array: first element is the + * source addr/port, second the destination addr/port. + */ + if (srcsa->sa_family == AF_INET) { + sa4 = (const struct sockaddr_in *)srcsa; + e->e_remote.r_sin = sa4[0]; + e->e_local.l_in = sa4[1].sin_addr; + } else if (srcsa->sa_family == AF_INET6) { + sa6 = (const struct sockaddr_in6 *)srcsa; + e->e_remote.r_sin6 = sa6[0]; + e->e_local.l_in6 = sa6[1].sin6_addr; + } else { + ret = EAFNOSUPPORT; + } + + return (ret); +} + +static void +wg_input(struct mbuf *m0, int offset, struct inpcb *inpcb, + const struct sockaddr *srcsa, void *_sc) +{ + struct wg_pkt_data *pkt_data; + struct wg_endpoint *e; + struct wg_softc *sc = _sc; + struct mbuf *m; + int pktlen, pkttype; + struct noise_remote *remote; + struct wg_tag *t; + void *data; + + /* Caller provided us with srcsa, no need for this header. */ + m_adj(m0, offset + sizeof(struct udphdr)); + + /* + * Ensure mbuf has at least enough contiguous data to peel off our + * headers at the beginning. 
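+ * The mtod() dereferences below (the message type, then entire
+ * handshake structs) rely on that contiguity. m_defrag() leaves the
+ * original chain untouched on failure, hence the explicit m_freem().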
+ */ + if ((m = m_defrag(m0, M_NOWAIT)) == NULL) { + m_freem(m0); + return; + } + data = mtod(m, void *); + pkttype = *(uint32_t*)data; + t = wg_tag_get(m); + if (t == NULL) { + goto free; + } + e = wg_mbuf_endpoint_get(m); + + if (wg_update_endpoint_addrs(e, srcsa, m->m_pkthdr.rcvif)) { + goto free; + } + + pktlen = m->m_pkthdr.len; + + if ((pktlen == sizeof(struct wg_pkt_initiation) && + pkttype == WG_PKT_INITIATION) || + (pktlen == sizeof(struct wg_pkt_response) && + pkttype == WG_PKT_RESPONSE) || + (pktlen == sizeof(struct wg_pkt_cookie) && + pkttype == WG_PKT_COOKIE)) { + if (mbufq_enqueue(&sc->sc_handshake_queue, m) == 0) { + GROUPTASK_ENQUEUE(&sc->sc_handshake); + } else { + DPRINTF(sc, "Dropping handshake packet\n"); + m_freem(m); + } + } else if (pktlen >= sizeof(struct wg_pkt_data) + NOISE_AUTHTAG_LEN + && pkttype == WG_PKT_DATA) { + + pkt_data = data; + remote = wg_index_get(sc, pkt_data->r_idx); + if (remote == NULL) { + if_inc_counter(sc->sc_ifp, IFCOUNTER_IERRORS, 1); + m_freem(m); + } else if (buf_ring_count(sc->sc_decap_ring) > MAX_QUEUED_PKT) { + if_inc_counter(sc->sc_ifp, IFCOUNTER_IQDROPS, 1); + m_freem(m); + } else { + t->t_peer = __containerof(remote, struct wg_peer, + p_remote); + t->t_mbuf = NULL; + t->t_done = 0; + + wg_queue_in(t->t_peer, m); + wg_decrypt_dispatch(sc); + } + } else { +free: + m_freem(m); + } +} + +static int +wg_transmit(struct ifnet *ifp, struct mbuf *m) +{ + struct wg_softc *sc; + sa_family_t family; + struct epoch_tracker et; + struct wg_peer *peer; + struct wg_tag *t; + uint32_t af; + int rc; + + /* + * Work around lifetime issue in the ipv6 mld code. + */ + if (__predict_false(ifp->if_flags & IFF_DYING)) + return (ENXIO); + + rc = 0; + sc = ifp->if_softc; + if ((t = wg_tag_get(m)) == NULL) { + rc = ENOBUFS; + goto early_out; + } + af = m->m_pkthdr.ph_family; + BPF_MTAP2(ifp, &af, sizeof(af), m); + + NET_EPOCH_ENTER(et); + peer = wg_aip_lookup(&sc->sc_aips, m, OUT); + if (__predict_false(peer == NULL)) { + rc = ENOKEY; + goto err; + } + + family = peer->p_endpoint.e_remote.r_sa.sa_family; + if (__predict_false(family != AF_INET && family != AF_INET6)) { + DPRINTF(sc, "No valid endpoint has been configured or " + "discovered for peer %llu\n", (unsigned long long)peer->p_id); + + rc = EHOSTUNREACH; + goto err; + } + t->t_peer = peer; + t->t_mbuf = NULL; + t->t_done = 0; + t->t_mtu = ifp->if_mtu; + + wg_queue_stage(peer, m); + wg_queue_out(peer); + NET_EPOCH_EXIT(et); + return (rc); +err: + NET_EPOCH_EXIT(et); +early_out: + if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); + /* TODO: send ICMP unreachable */ + m_free(m); + return (rc); +} + +static int +wg_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa, struct route *rt) +{ + m->m_pkthdr.ph_family = sa->sa_family; + return (wg_transmit(ifp, m)); +} + +static int +wg_peer_add(struct wg_softc *sc, const nvlist_t *nvl) +{ + uint8_t public[WG_KEY_SIZE]; + const void *pub_key; + const struct sockaddr *endpoint; + int err; + size_t size; + struct wg_peer *peer = NULL; + bool need_insert = false; + + sx_assert(&sc->sc_lock, SX_XLOCKED); + + if (!nvlist_exists_binary(nvl, "public-key")) { + return (EINVAL); + } + pub_key = nvlist_get_binary(nvl, "public-key", &size); + if (size != WG_KEY_SIZE) { + return (EINVAL); + } + if (noise_local_keys(&sc->sc_local, public, NULL) == 0 && + bcmp(public, pub_key, WG_KEY_SIZE) == 0) { + return (0); // Silently ignored; not actually a failure. 
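+ /*
+ * (Treating our own public key as a no-op peer keeps wholesale
+ * reapplication of a configuration dump idempotent, matching how
+ * WireGuard implementations behave elsewhere.)
+ */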
+ } + peer = wg_peer_lookup(sc, pub_key); + if (nvlist_exists_bool(nvl, "remove") && + nvlist_get_bool(nvl, "remove")) { + if (peer != NULL) { + wg_hashtable_peer_remove(&sc->sc_hashtable, peer); + wg_peer_destroy(peer); + } + return (0); + } + if (nvlist_exists_bool(nvl, "replace-allowedips") && + nvlist_get_bool(nvl, "replace-allowedips") && + peer != NULL) { + + wg_aip_delete(&peer->p_sc->sc_aips, peer); + } + if (peer == NULL) { + if (sc->sc_peer_count >= MAX_PEERS_PER_IFACE) + return (E2BIG); + sc->sc_peer_count++; + + need_insert = true; + peer = wg_peer_alloc(sc); + MPASS(peer != NULL); + noise_remote_init(&peer->p_remote, pub_key, &sc->sc_local); + cookie_maker_init(&peer->p_cookie, pub_key); + } + if (nvlist_exists_binary(nvl, "endpoint")) { + endpoint = nvlist_get_binary(nvl, "endpoint", &size); + if (size > sizeof(peer->p_endpoint.e_remote)) { + err = EINVAL; + goto out; + } + memcpy(&peer->p_endpoint.e_remote, endpoint, size); + } + if (nvlist_exists_binary(nvl, "preshared-key")) { + const void *key; + + key = nvlist_get_binary(nvl, "preshared-key", &size); + if (size != WG_KEY_SIZE) { + err = EINVAL; + goto out; + } + noise_remote_set_psk(&peer->p_remote, key); + } + if (nvlist_exists_number(nvl, "persistent-keepalive-interval")) { + uint64_t pki = nvlist_get_number(nvl, "persistent-keepalive-interval"); + if (pki > UINT16_MAX) { + err = EINVAL; + goto out; + } + wg_timers_set_persistent_keepalive(&peer->p_timers, pki); + } + if (nvlist_exists_nvlist_array(nvl, "allowed-ips")) { + const void *binary; + uint64_t cidr; + const nvlist_t * const * aipl; + struct wg_allowedip aip; + size_t allowedip_count; + + aipl = nvlist_get_nvlist_array(nvl, "allowed-ips", + &allowedip_count); + for (size_t idx = 0; idx < allowedip_count; idx++) { + if (!nvlist_exists_number(aipl[idx], "cidr")) + continue; + cidr = nvlist_get_number(aipl[idx], "cidr"); + if (nvlist_exists_binary(aipl[idx], "ipv4")) { + binary = nvlist_get_binary(aipl[idx], "ipv4", &size); + if (binary == NULL || cidr > 32 || size != sizeof(aip.ip4)) { + err = EINVAL; + goto out; + } + aip.family = AF_INET; + memcpy(&aip.ip4, binary, sizeof(aip.ip4)); + } else if (nvlist_exists_binary(aipl[idx], "ipv6")) { + binary = nvlist_get_binary(aipl[idx], "ipv6", &size); + if (binary == NULL || cidr > 128 || size != sizeof(aip.ip6)) { + err = EINVAL; + goto out; + } + aip.family = AF_INET6; + memcpy(&aip.ip6, binary, sizeof(aip.ip6)); + } else { + continue; + } + aip.cidr = cidr; + + if ((err = wg_aip_add(&sc->sc_aips, peer, &aip)) != 0) { + goto out; + } + } + } + if (need_insert) { + wg_hashtable_peer_insert(&sc->sc_hashtable, peer); + if (sc->sc_ifp->if_link_state == LINK_STATE_UP) + wg_timers_enable(&peer->p_timers); + } + return (0); + +out: + if (need_insert) /* If we fail, only destroy if it was new. 
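+ * A pre-existing peer keeps whatever fields were applied before
+ * the failure; only a newly allocated one is torn down here.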
*/ + wg_peer_destroy(peer); + return (err); +} + +static int +wgc_set(struct wg_softc *sc, struct wg_data_io *wgd) +{ + uint8_t public[WG_KEY_SIZE], private[WG_KEY_SIZE]; + struct ifnet *ifp; + void *nvlpacked; + nvlist_t *nvl; + ssize_t size; + int err; + + ifp = sc->sc_ifp; + if (wgd->wgd_size == 0 || wgd->wgd_data == NULL) + return (EFAULT); + + sx_xlock(&sc->sc_lock); + + nvlpacked = malloc(wgd->wgd_size, M_TEMP, M_WAITOK); + err = copyin(wgd->wgd_data, nvlpacked, wgd->wgd_size); + if (err) + goto out; + nvl = nvlist_unpack(nvlpacked, wgd->wgd_size, 0); + if (nvl == NULL) { + err = EBADMSG; + goto out; + } + if (nvlist_exists_bool(nvl, "replace-peers") && + nvlist_get_bool(nvl, "replace-peers")) + wg_peer_remove_all(sc); + if (nvlist_exists_number(nvl, "listen-port")) { + uint64_t new_port = nvlist_get_number(nvl, "listen-port"); + if (new_port > UINT16_MAX) { + err = EINVAL; + goto out; + } + if (new_port != sc->sc_socket.so_port) { + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) { + if ((err = wg_socket_init(sc, new_port)) != 0) + goto out; + } else + sc->sc_socket.so_port = new_port; + } + } + if (nvlist_exists_binary(nvl, "private-key")) { + const void *key = nvlist_get_binary(nvl, "private-key", &size); + if (size != WG_KEY_SIZE) { + err = EINVAL; + goto out; + } + + if (noise_local_keys(&sc->sc_local, NULL, private) != 0 || + timingsafe_bcmp(private, key, WG_KEY_SIZE) != 0) { + struct noise_local *local; + struct wg_peer *peer; + struct wg_hashtable *ht = &sc->sc_hashtable; + bool has_identity; + + if (curve25519_generate_public(public, key)) { + /* Peer conflict: remove conflicting peer. */ + if ((peer = wg_peer_lookup(sc, public)) != + NULL) { + wg_hashtable_peer_remove(ht, peer); + wg_peer_destroy(peer); + } + } + + /* + * Set the private key and invalidate all existing + * handshakes. + */ + local = &sc->sc_local; + noise_local_lock_identity(local); + /* Note: we might be removing the private key. */ + has_identity = noise_local_set_private(local, key) == 0; + mtx_lock(&ht->h_mtx); + CK_LIST_FOREACH(peer, &ht->h_peers_list, p_entry) { + noise_remote_precompute(&peer->p_remote); + wg_timers_event_reset_handshake_last_sent( + &peer->p_timers); + noise_remote_expire_current(&peer->p_remote); + } + mtx_unlock(&ht->h_mtx); + cookie_checker_update(&sc->sc_cookie, + has_identity ? public : NULL); + noise_local_unlock_identity(local); + } + } + if (nvlist_exists_number(nvl, "user-cookie")) { + uint64_t user_cookie = nvlist_get_number(nvl, "user-cookie"); + if (user_cookie > UINT32_MAX) { + err = EINVAL; + goto out; + } + wg_socket_set_cookie(sc, user_cookie); + } + if (nvlist_exists_nvlist_array(nvl, "peers")) { + size_t peercount; + const nvlist_t * const*nvl_peers; + + nvl_peers = nvlist_get_nvlist_array(nvl, "peers", &peercount); + for (int i = 0; i < peercount; i++) { + err = wg_peer_add(sc, nvl_peers[i]); + if (err != 0) + goto out; + } + } + + nvlist_destroy(nvl); +out: + free(nvlpacked, M_TEMP); + sx_xunlock(&sc->sc_lock); + return (err); +} + +static unsigned int +in_mask2len(struct in_addr *mask) +{ + unsigned int x, y; + uint8_t *p; + + p = (uint8_t *)mask; + for (x = 0; x < sizeof(*mask); x++) { + if (p[x] != 0xff) + break; + } + y = 0; + if (x < sizeof(*mask)) { + for (y = 0; y < NBBY; y++) { + if ((p[x] & (0x80 >> y)) == 0) + break; + } + } + return x * NBBY + y; +} + +static int +wg_peer_to_export(struct wg_peer *peer, struct wg_peer_export *exp) +{ + struct wg_endpoint *ep; + struct wg_aip *rt; + struct noise_remote *remote; + int i; + + /* Non-sleepable context. 
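+ * We are called inside NET_EPOCH via wg_marshal_peers(), so the
+ * allowed-ips array below is allocated M_NOWAIT and the list is walked
+ * twice: once to size it and once to fill it, with aip_count trimmed
+ * afterwards in case entries vanished in between.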
*/ + NET_EPOCH_ASSERT(); + + bzero(&exp->endpoint, sizeof(exp->endpoint)); + remote = &peer->p_remote; + ep = &peer->p_endpoint; + if (ep->e_remote.r_sa.sa_family != 0) { + exp->endpoint_sz = (ep->e_remote.r_sa.sa_family == AF_INET) ? + sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + + memcpy(&exp->endpoint, &ep->e_remote, exp->endpoint_sz); + } + + /* We always export it. */ + (void)noise_remote_keys(remote, exp->public_key, exp->preshared_key); + exp->persistent_keepalive = + peer->p_timers.t_persistent_keepalive_interval; + wg_timers_get_last_handshake(&peer->p_timers, &exp->last_handshake); + exp->rx_bytes = counter_u64_fetch(peer->p_rx_bytes); + exp->tx_bytes = counter_u64_fetch(peer->p_tx_bytes); + + exp->aip_count = 0; + CK_LIST_FOREACH(rt, &peer->p_aips, r_entry) { + exp->aip_count++; + } + + /* Early success; no allowed-ips to copy out. */ + if (exp->aip_count == 0) + return (0); + + exp->aip = malloc(exp->aip_count * sizeof(*exp->aip), M_TEMP, M_NOWAIT); + if (exp->aip == NULL) + return (ENOMEM); + + i = 0; + CK_LIST_FOREACH(rt, &peer->p_aips, r_entry) { + exp->aip[i].family = rt->r_addr.ss_family; + if (exp->aip[i].family == AF_INET) { + struct sockaddr_in *sin = + (struct sockaddr_in *)&rt->r_addr; + + exp->aip[i].ip4 = sin->sin_addr; + + sin = (struct sockaddr_in *)&rt->r_mask; + exp->aip[i].cidr = in_mask2len(&sin->sin_addr); + } else if (exp->aip[i].family == AF_INET6) { + struct sockaddr_in6 *sin6 = + (struct sockaddr_in6 *)&rt->r_addr; + + exp->aip[i].ip6 = sin6->sin6_addr; + + sin6 = (struct sockaddr_in6 *)&rt->r_mask; + exp->aip[i].cidr = in6_mask2len(&sin6->sin6_addr, NULL); + } + i++; + if (i == exp->aip_count) + break; + } + + /* Again, AllowedIPs might have shrank; update it. */ + exp->aip_count = i; + + return (0); +} + +static nvlist_t * +wg_peer_export_to_nvl(struct wg_softc *sc, struct wg_peer_export *exp) +{ + struct wg_timespec64 ts64; + nvlist_t *nvl, **nvl_aips; + size_t i; + uint16_t family; + + nvl_aips = NULL; + if ((nvl = nvlist_create(0)) == NULL) + return (NULL); + + nvlist_add_binary(nvl, "public-key", exp->public_key, + sizeof(exp->public_key)); + if (wgc_privileged(sc)) + nvlist_add_binary(nvl, "preshared-key", exp->preshared_key, + sizeof(exp->preshared_key)); + if (exp->endpoint_sz != 0) + nvlist_add_binary(nvl, "endpoint", &exp->endpoint, + exp->endpoint_sz); + + if (exp->aip_count != 0) { + nvl_aips = mallocarray(exp->aip_count, sizeof(*nvl_aips), + M_WG, M_WAITOK | M_ZERO); + } + + for (i = 0; i < exp->aip_count; i++) { + nvl_aips[i] = nvlist_create(0); + if (nvl_aips[i] == NULL) + goto err; + family = exp->aip[i].family; + nvlist_add_number(nvl_aips[i], "cidr", exp->aip[i].cidr); + if (family == AF_INET) + nvlist_add_binary(nvl_aips[i], "ipv4", + &exp->aip[i].ip4, sizeof(exp->aip[i].ip4)); + else if (family == AF_INET6) + nvlist_add_binary(nvl_aips[i], "ipv6", + &exp->aip[i].ip6, sizeof(exp->aip[i].ip6)); + } + + if (i != 0) { + nvlist_add_nvlist_array(nvl, "allowed-ips", + (const nvlist_t *const *)nvl_aips, i); + } + + for (i = 0; i < exp->aip_count; ++i) + nvlist_destroy(nvl_aips[i]); + + free(nvl_aips, M_WG); + nvl_aips = NULL; + + ts64.tv_sec = exp->last_handshake.tv_sec; + ts64.tv_nsec = exp->last_handshake.tv_nsec; + nvlist_add_binary(nvl, "last-handshake-time", &ts64, sizeof(ts64)); + + if (exp->persistent_keepalive != 0) + nvlist_add_number(nvl, "persistent-keepalive-interval", + exp->persistent_keepalive); + + if (exp->rx_bytes != 0) + nvlist_add_number(nvl, "rx-bytes", exp->rx_bytes); + if (exp->tx_bytes != 0) + 
nvlist_add_number(nvl, "tx-bytes", exp->tx_bytes); + + return (nvl); +err: + for (i = 0; i < exp->aip_count && nvl_aips[i] != NULL; i++) { + nvlist_destroy(nvl_aips[i]); + } + + free(nvl_aips, M_WG); + nvlist_destroy(nvl); + return (NULL); +} + +static int +wg_marshal_peers(struct wg_softc *sc, nvlist_t **nvlp, nvlist_t ***nvl_arrayp, int *peer_countp) +{ + struct wg_peer *peer; + int err, i, peer_count; + nvlist_t *nvl, **nvl_array; + struct epoch_tracker et; + struct wg_peer_export *wpe; + + nvl = NULL; + nvl_array = NULL; + if (nvl_arrayp) + *nvl_arrayp = NULL; + if (nvlp) + *nvlp = NULL; + if (peer_countp) + *peer_countp = 0; + peer_count = sc->sc_hashtable.h_num_peers; + if (peer_count == 0) { + return (ENOENT); + } + + if (nvlp && (nvl = nvlist_create(0)) == NULL) + return (ENOMEM); + + err = i = 0; + nvl_array = malloc(peer_count*sizeof(void*), M_TEMP, M_WAITOK | M_ZERO); + wpe = malloc(peer_count*sizeof(*wpe), M_TEMP, M_WAITOK | M_ZERO); + + NET_EPOCH_ENTER(et); + CK_LIST_FOREACH(peer, &sc->sc_hashtable.h_peers_list, p_entry) { + if ((err = wg_peer_to_export(peer, &wpe[i])) != 0) { + break; + } + + i++; + if (i == peer_count) + break; + } + NET_EPOCH_EXIT(et); + + if (err != 0) + goto out; + + /* Update the peer count, in case we found fewer entries. */ + *peer_countp = peer_count = i; + if (peer_count == 0) { + err = ENOENT; + goto out; + } + + for (i = 0; i < peer_count; i++) { + int idx; + + /* + * Peers are added to the list in reverse order, effectively, + * because it's simpler/quicker to add at the head every time. + * + * Export them in reverse order. No worries if we fail mid-way + * through, the cleanup below will DTRT. + */ + idx = peer_count - i - 1; + nvl_array[idx] = wg_peer_export_to_nvl(sc, &wpe[i]); + if (nvl_array[idx] == NULL) { + break; + } + } + + if (i < peer_count) { + /* Error! */ + *peer_countp = 0; + err = ENOMEM; + } else if (nvl) { + nvlist_add_nvlist_array(nvl, "peers", + (const nvlist_t * const *)nvl_array, peer_count); + if ((err = nvlist_error(nvl))) { + goto out; + } + *nvlp = nvl; + } + *nvl_arrayp = nvl_array; + out: + if (err != 0) { + /* Note that nvl_array is populated in reverse order. 
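+ * The array was zeroed at allocation and nvlist_destroy(NULL) is a
+ * no-op, so sweeping every slot below is safe even if the export
+ * stopped part way through.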
*/ + for (i = 0; i < peer_count; i++) { + nvlist_destroy(nvl_array[i]); + } + + free(nvl_array, M_TEMP); + if (nvl != NULL) + nvlist_destroy(nvl); + } + + for (i = 0; i < peer_count; i++) + free(wpe[i].aip, M_TEMP); + free(wpe, M_TEMP); + return (err); +} + +static int +wgc_get(struct wg_softc *sc, struct wg_data_io *wgd) +{ + nvlist_t *nvl, **nvl_array; + void *packed; + size_t size; + int peer_count, err; + + nvl = nvlist_create(0); + if (nvl == NULL) + return (ENOMEM); + + sx_slock(&sc->sc_lock); + + err = 0; + packed = NULL; + if (sc->sc_socket.so_port != 0) + nvlist_add_number(nvl, "listen-port", sc->sc_socket.so_port); + if (sc->sc_socket.so_user_cookie != 0) + nvlist_add_number(nvl, "user-cookie", sc->sc_socket.so_user_cookie); + if (sc->sc_local.l_has_identity) { + nvlist_add_binary(nvl, "public-key", sc->sc_local.l_public, WG_KEY_SIZE); + if (wgc_privileged(sc)) + nvlist_add_binary(nvl, "private-key", sc->sc_local.l_private, WG_KEY_SIZE); + } + if (sc->sc_hashtable.h_num_peers > 0) { + err = wg_marshal_peers(sc, NULL, &nvl_array, &peer_count); + if (err) + goto out_nvl; + nvlist_add_nvlist_array(nvl, "peers", + (const nvlist_t * const *)nvl_array, peer_count); + } + packed = nvlist_pack(nvl, &size); + if (packed == NULL) { + err = ENOMEM; + goto out_nvl; + } + if (wgd->wgd_size == 0) { + wgd->wgd_size = size; + goto out_packed; + } + if (wgd->wgd_size < size) { + err = ENOSPC; + goto out_packed; + } + if (wgd->wgd_data == NULL) { + err = EFAULT; + goto out_packed; + } + err = copyout(packed, wgd->wgd_data, size); + wgd->wgd_size = size; + +out_packed: + free(packed, M_NVLIST); +out_nvl: + nvlist_destroy(nvl); + sx_sunlock(&sc->sc_lock); + return (err); +} + +static int +wg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct wg_data_io *wgd = (struct wg_data_io *)data; + struct ifreq *ifr = (struct ifreq *)data; + struct wg_softc *sc = ifp->if_softc; + int ret = 0; + + switch (cmd) { + case SIOCSWG: + ret = priv_check(curthread, PRIV_NET_WG); + if (ret == 0) + ret = wgc_set(sc, wgd); + break; + case SIOCGWG: + ret = wgc_get(sc, wgd); + break; + /* Interface IOCTLs */ + case SIOCSIFADDR: + /* + * This differs from *BSD norms, but is more uniform with how + * WireGuard behaves elsewhere. + */ + break; + case SIOCSIFFLAGS: + if ((ifp->if_flags & IFF_UP) != 0) + ret = wg_up(sc); + else + wg_down(sc); + break; + case SIOCSIFMTU: + if (ifr->ifr_mtu <= 0 || ifr->ifr_mtu > MAX_MTU) + ret = EINVAL; + else + ifp->if_mtu = ifr->ifr_mtu; + break; + case SIOCADDMULTI: + case SIOCDELMULTI: + break; + default: + ret = ENOTTY; + } + + return ret; +} + +static int +wg_up(struct wg_softc *sc) +{ + struct wg_hashtable *ht = &sc->sc_hashtable; + struct ifnet *ifp = sc->sc_ifp; + struct wg_peer *peer; + int rc = EBUSY; + + sx_xlock(&sc->sc_lock); + /* Jail's being removed, no more wg_up(). */ + if ((sc->sc_flags & WGF_DYING) != 0) + goto out; + + /* Silent success if we're already running. 
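+ * This keeps SIOCSIFFLAGS idempotent: only the first transition to
+ * running binds the sockets and arms the per-peer timers.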
*/ + rc = 0; + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + goto out; + ifp->if_drv_flags |= IFF_DRV_RUNNING; + + rc = wg_socket_init(sc, sc->sc_socket.so_port); + if (rc == 0) { + mtx_lock(&ht->h_mtx); + CK_LIST_FOREACH(peer, &ht->h_peers_list, p_entry) { + wg_timers_enable(&peer->p_timers); + wg_queue_out(peer); + } + mtx_unlock(&ht->h_mtx); + + if_link_state_change(sc->sc_ifp, LINK_STATE_UP); + } else { + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + } +out: + sx_xunlock(&sc->sc_lock); + return (rc); +} + +static void +wg_down(struct wg_softc *sc) +{ + struct wg_hashtable *ht = &sc->sc_hashtable; + struct ifnet *ifp = sc->sc_ifp; + struct wg_peer *peer; + + sx_xlock(&sc->sc_lock); + if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { + sx_xunlock(&sc->sc_lock); + return; + } + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + + mtx_lock(&ht->h_mtx); + CK_LIST_FOREACH(peer, &ht->h_peers_list, p_entry) { + wg_queue_purge(&peer->p_stage_queue); + wg_timers_disable(&peer->p_timers); + } + mtx_unlock(&ht->h_mtx); + + mbufq_drain(&sc->sc_handshake_queue); + + mtx_lock(&ht->h_mtx); + CK_LIST_FOREACH(peer, &ht->h_peers_list, p_entry) { + noise_remote_clear(&peer->p_remote); + wg_timers_event_reset_handshake_last_sent(&peer->p_timers); + } + mtx_unlock(&ht->h_mtx); + + if_link_state_change(sc->sc_ifp, LINK_STATE_DOWN); + wg_socket_uninit(sc); + + sx_xunlock(&sc->sc_lock); +} + +static void +crypto_taskq_setup(struct wg_softc *sc) +{ + + sc->sc_encrypt = malloc(sizeof(struct grouptask)*mp_ncpus, M_WG, M_WAITOK); + sc->sc_decrypt = malloc(sizeof(struct grouptask)*mp_ncpus, M_WG, M_WAITOK); + + for (int i = 0; i < mp_ncpus; i++) { + GROUPTASK_INIT(&sc->sc_encrypt[i], 0, + (gtask_fn_t *)wg_softc_encrypt, sc); + taskqgroup_attach_cpu(qgroup_if_io_tqg, &sc->sc_encrypt[i], sc, i, NULL, NULL, "wg encrypt"); + GROUPTASK_INIT(&sc->sc_decrypt[i], 0, + (gtask_fn_t *)wg_softc_decrypt, sc); + taskqgroup_attach_cpu(qgroup_if_io_tqg, &sc->sc_decrypt[i], sc, i, NULL, NULL, "wg decrypt"); + } +} + +static void +crypto_taskq_destroy(struct wg_softc *sc) +{ + for (int i = 0; i < mp_ncpus; i++) { + taskqgroup_detach(qgroup_if_io_tqg, &sc->sc_encrypt[i]); + taskqgroup_detach(qgroup_if_io_tqg, &sc->sc_decrypt[i]); + } + free(sc->sc_encrypt, M_WG); + free(sc->sc_decrypt, M_WG); +} + +static int +wg_clone_create(struct if_clone *ifc, int unit, caddr_t params) +{ + struct wg_softc *sc; + struct ifnet *ifp; + struct noise_upcall noise_upcall; + + sc = malloc(sizeof(*sc), M_WG, M_WAITOK | M_ZERO); + sc->sc_ucred = crhold(curthread->td_ucred); + ifp = sc->sc_ifp = if_alloc(IFT_WIREGUARD); + ifp->if_softc = sc; + if_initname(ifp, wgname, unit); + + noise_upcall.u_arg = sc; + noise_upcall.u_remote_get = + (struct noise_remote *(*)(void *, uint8_t *))wg_remote_get; + noise_upcall.u_index_set = + (uint32_t (*)(void *, struct noise_remote *))wg_index_set; + noise_upcall.u_index_drop = + (void (*)(void *, uint32_t))wg_index_drop; + noise_local_init(&sc->sc_local, &noise_upcall); + cookie_checker_init(&sc->sc_cookie, ratelimit_zone); + + sc->sc_socket.so_port = 0; + + atomic_add_int(&clone_count, 1); + ifp->if_capabilities = ifp->if_capenable = WG_CAPS; + + mbufq_init(&sc->sc_handshake_queue, MAX_QUEUED_HANDSHAKES); + sx_init(&sc->sc_lock, "wg softc lock"); + rw_init(&sc->sc_index_lock, "wg index lock"); + sc->sc_peer_count = 0; + sc->sc_encap_ring = buf_ring_alloc(MAX_QUEUED_PKT, M_WG, M_WAITOK, NULL); + sc->sc_decap_ring = buf_ring_alloc(MAX_QUEUED_PKT, M_WG, M_WAITOK, NULL); + GROUPTASK_INIT(&sc->sc_handshake, 0, + (gtask_fn_t 
*)wg_softc_handshake_receive, sc); + taskqgroup_attach(qgroup_if_io_tqg, &sc->sc_handshake, sc, NULL, NULL, "wg tx initiation"); + crypto_taskq_setup(sc); + + wg_hashtable_init(&sc->sc_hashtable); + sc->sc_index = hashinit(HASHTABLE_INDEX_SIZE, M_DEVBUF, &sc->sc_index_mask); + wg_aip_init(&sc->sc_aips); + + if_setmtu(ifp, ETHERMTU - 80); + ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST | IFF_NOARP; + ifp->if_init = wg_init; + ifp->if_reassign = wg_reassign; + ifp->if_qflush = wg_qflush; + ifp->if_transmit = wg_transmit; + ifp->if_output = wg_output; + ifp->if_ioctl = wg_ioctl; + + if_attach(ifp); + bpfattach(ifp, DLT_NULL, sizeof(uint32_t)); + + sx_xlock(&wg_sx); + LIST_INSERT_HEAD(&wg_list, sc, sc_entry); + sx_xunlock(&wg_sx); + + return 0; +} + +static void +wg_clone_destroy(struct ifnet *ifp) +{ + struct wg_softc *sc = ifp->if_softc; + struct ucred *cred; + + sx_xlock(&wg_sx); + sx_xlock(&sc->sc_lock); + sc->sc_flags |= WGF_DYING; + cred = sc->sc_ucred; + sc->sc_ucred = NULL; + sx_xunlock(&sc->sc_lock); + LIST_REMOVE(sc, sc_entry); + sx_xunlock(&wg_sx); + + if_link_state_change(sc->sc_ifp, LINK_STATE_DOWN); + + sx_xlock(&sc->sc_lock); + wg_socket_uninit(sc); + sx_xunlock(&sc->sc_lock); + + /* + * No guarantees that all traffic have passed until the epoch has + * elapsed with the socket closed. + */ + NET_EPOCH_WAIT(); + + taskqgroup_drain_all(qgroup_if_io_tqg); + sx_xlock(&sc->sc_lock); + wg_peer_remove_all(sc); + epoch_drain_callbacks(net_epoch_preempt); + sx_xunlock(&sc->sc_lock); + sx_destroy(&sc->sc_lock); + rw_destroy(&sc->sc_index_lock); + taskqgroup_detach(qgroup_if_io_tqg, &sc->sc_handshake); + crypto_taskq_destroy(sc); + buf_ring_free(sc->sc_encap_ring, M_WG); + buf_ring_free(sc->sc_decap_ring, M_WG); + + wg_aip_destroy(&sc->sc_aips); + wg_hashtable_destroy(&sc->sc_hashtable); + + if (cred != NULL) + crfree(cred); + if_detach(sc->sc_ifp); + if_free(sc->sc_ifp); + /* Ensure any local/private keys are cleaned up */ + explicit_bzero(sc, sizeof(*sc)); + free(sc, M_WG); + + atomic_add_int(&clone_count, -1); +} + +static void +wg_qflush(struct ifnet *ifp __unused) +{ +} + +/* + * Privileged information (private-key, preshared-key) are only exported for + * root and jailed root by default. + */ +static bool +wgc_privileged(struct wg_softc *sc) +{ + struct thread *td; + + td = curthread; + return (priv_check(td, PRIV_NET_WG) == 0); +} + +static void +wg_reassign(struct ifnet *ifp, struct vnet *new_vnet __unused, + char *unused __unused) +{ + struct wg_softc *sc; + + sc = ifp->if_softc; + wg_down(sc); +} + +static void +wg_init(void *xsc) +{ + struct wg_softc *sc; + + sc = xsc; + wg_up(sc); +} + +static void +vnet_wg_init(const void *unused __unused) +{ + + V_wg_cloner = if_clone_simple(wgname, wg_clone_create, wg_clone_destroy, + 0); +} +VNET_SYSINIT(vnet_wg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + vnet_wg_init, NULL); + +static void +vnet_wg_uninit(const void *unused __unused) +{ + + if_clone_detach(V_wg_cloner); +} +VNET_SYSUNINIT(vnet_wg_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + vnet_wg_uninit, NULL); + +static int +wg_prison_remove(void *obj, void *data __unused) +{ + const struct prison *pr = obj; + struct wg_softc *sc; + struct ucred *cred; + bool dying; + + /* + * Do a pass through all if_wg interfaces and release creds on any from + * the jail that are supposed to be going away. This will, in turn, let + * the jail die so that we don't end up with Schrödinger's jail. 
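+ * The cached sc_ucred holds a reference on the prison, so an interface
+ * that kept it would pin the dying jail in memory indefinitely.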
+ */ + sx_slock(&wg_sx); + LIST_FOREACH(sc, &wg_list, sc_entry) { + cred = NULL; + + sx_xlock(&sc->sc_lock); + dying = (sc->sc_flags & WGF_DYING) != 0; + if (!dying && sc->sc_ucred != NULL && + sc->sc_ucred->cr_prison == pr) { + /* Home jail is going away. */ + cred = sc->sc_ucred; + sc->sc_ucred = NULL; + + sc->sc_flags |= WGF_DYING; + } + + /* + * If this is our foreign vnet going away, we'll also down the + * link and kill the socket because traffic needs to stop. Any + * address will be revoked in the rehoming process. + */ + if (cred != NULL || (!dying && + sc->sc_ifp->if_vnet == pr->pr_vnet)) { + if_link_state_change(sc->sc_ifp, LINK_STATE_DOWN); + /* Have to kill the sockets, as they also hold refs. */ + wg_socket_uninit(sc); + } + + sx_xunlock(&sc->sc_lock); + + if (cred != NULL) { + CURVNET_SET(sc->sc_ifp->if_vnet); + if_purgeaddrs(sc->sc_ifp); + CURVNET_RESTORE(); + crfree(cred); + } + } + sx_sunlock(&wg_sx); + + return (0); +} + +static void +wg_module_init(void) +{ + osd_method_t methods[PR_MAXMETHOD] = { + [PR_METHOD_REMOVE] = wg_prison_remove, + }; + + ratelimit_zone = uma_zcreate("wg ratelimit", sizeof(struct ratelimit), + NULL, NULL, NULL, NULL, 0, 0); + wg_osd_jail_slot = osd_jail_register(NULL, methods); +} + +static void +wg_module_deinit(void) +{ + + uma_zdestroy(ratelimit_zone); + osd_jail_deregister(wg_osd_jail_slot); + + MPASS(LIST_EMPTY(&wg_list)); +} + +static int +wg_module_event_handler(module_t mod, int what, void *arg) +{ + + switch (what) { + case MOD_LOAD: + wg_module_init(); + break; + case MOD_UNLOAD: + if (atomic_load_int(&clone_count) == 0) + wg_module_deinit(); + else + return (EBUSY); + break; + default: + return (EOPNOTSUPP); + } + return (0); +} + +static moduledata_t wg_moduledata = { + "wg", + wg_module_event_handler, + NULL +}; + +DECLARE_MODULE(wg, wg_moduledata, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(wg, 1); +MODULE_DEPEND(wg, crypto, 1, 1, 1); diff --git a/sys/dev/if_wg/if_wg.h b/sys/dev/if_wg/if_wg.h new file mode 100644 index 000000000000..2a100456d406 --- /dev/null +++ b/sys/dev/if_wg/if_wg.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2019 Matt Dunwoodie <ncon@noconroy.net> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ * + * $FreeBSD$ + */ + +#ifndef __IF_WG_H__ +#define __IF_WG_H__ + +#include <net/if.h> +#include <netinet/in.h> + +struct wg_data_io { + char wgd_name[IFNAMSIZ]; + void *wgd_data; + size_t wgd_size; +}; + +#define WG_KEY_SIZE 32 + +#define SIOCSWG _IOWR('i', 210, struct wg_data_io) +#define SIOCGWG _IOWR('i', 211, struct wg_data_io) + +#endif /* __IF_WG_H__ */ diff --git a/sys/dev/if_wg/include/crypto/blake2s.h b/sys/dev/if_wg/include/crypto/blake2s.h deleted file mode 100644 index 17e6447ebcd8..000000000000 --- a/sys/dev/if_wg/include/crypto/blake2s.h +++ /dev/null @@ -1,56 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - */ - -#include <sys/types.h> - -#ifndef _BLAKE2S_H_ -#define _BLAKE2S_H_ - - -enum blake2s_lengths { - BLAKE2S_BLOCK_SIZE = 64, - BLAKE2S_HASH_SIZE = 32, - BLAKE2S_KEY_SIZE = 32 -}; - -struct blake2s_state { - uint32_t h[8]; - uint32_t t[2]; - uint32_t f[2]; - uint8_t buf[BLAKE2S_BLOCK_SIZE]; - size_t buflen; - uint8_t last_node; -}; - -void blake2s_init(struct blake2s_state *state, const size_t outlen); -void blake2s_init_key(struct blake2s_state *state, const size_t outlen, - const void *key, const size_t keylen); -void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen); -void blake2s_final(struct blake2s_state *state, uint8_t *out, const size_t outlen); - -static inline void blake2s(uint8_t *out, const uint8_t *in, const uint8_t *key, - const size_t outlen, const size_t inlen, - const size_t keylen) -{ - struct blake2s_state state; -#ifdef __linux___ - WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen || - outlen > BLAKE2S_HASH_SIZE || keylen > BLAKE2S_KEY_SIZE || - (!key && keylen))); -#endif - - if (keylen) - blake2s_init_key(&state, outlen, key, keylen); - else - blake2s_init(&state, outlen); - - blake2s_update(&state, in, inlen); - blake2s_final(&state, out, outlen); -} - -void blake2s_hmac(uint8_t *out, const uint8_t *in, const uint8_t *key, - const size_t outlen, const size_t inlen, const size_t keylen); - -#endif /* _BLAKE2S_H_ */ diff --git a/sys/dev/if_wg/include/crypto/curve25519.h b/sys/dev/if_wg/include/crypto/curve25519.h deleted file mode 100644 index 3e90d1b270fe..000000000000 --- a/sys/dev/if_wg/include/crypto/curve25519.h +++ /dev/null @@ -1,74 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _CURVE25519_H_ -#define _CURVE25519_H_ - -#include <sys/systm.h> - -#define CURVE25519_KEY_SIZE 32 - -void curve25519_generic(u8 [CURVE25519_KEY_SIZE], - const u8 [CURVE25519_KEY_SIZE], - const u8 [CURVE25519_KEY_SIZE]); - -static inline void curve25519_clamp_secret(u8 secret[CURVE25519_KEY_SIZE]) -{ - secret[0] &= 248; - secret[31] = (secret[31] & 127) | 64; -} - -static const u8 null_point[CURVE25519_KEY_SIZE] = { 0 }; - -static inline int curve25519(u8 mypublic[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE], - const u8 basepoint[CURVE25519_KEY_SIZE]) -{ - curve25519_generic(mypublic, secret, basepoint); - return timingsafe_bcmp(mypublic, null_point, CURVE25519_KEY_SIZE); -} - -static inline int curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE]) -{ - static const u8 basepoint[CURVE25519_KEY_SIZE] __aligned(32) = { 9 }; - - if (timingsafe_bcmp(secret, null_point, CURVE25519_KEY_SIZE) == 0) - return 0; - - return curve25519(pub, secret, basepoint); -} - -static inline void curve25519_generate_secret(u8 secret[CURVE25519_KEY_SIZE]) -{ - arc4random_buf(secret, CURVE25519_KEY_SIZE); - curve25519_clamp_secret(secret); -} - -#endif /* _CURVE25519_H_ */ diff --git a/sys/dev/if_wg/include/crypto/zinc.h b/sys/dev/if_wg/include/crypto/zinc.h deleted file mode 100644 index 9aa1e8d59bf5..000000000000 --- a/sys/dev/if_wg/include/crypto/zinc.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - */ - -#ifndef _WG_ZINC_H -#define _WG_ZINC_H - -int chacha20_mod_init(void); -int poly1305_mod_init(void); -int chacha20poly1305_mod_init(void); -int blake2s_mod_init(void); -int curve25519_mod_init(void); - -#endif diff --git a/sys/dev/if_wg/include/sys/if_wg_session.h b/sys/dev/if_wg/include/sys/if_wg_session.h deleted file mode 100644 index 45399e534364..000000000000 --- a/sys/dev/if_wg/include/sys/if_wg_session.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2019 Matt Dunwoodie <ncon@noconroy.net> - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
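
The clamping rule in the removed curve25519.h above is the standard X25519 one (RFC 7748, section 5): the low three bits are cleared so the scalar is a multiple of 8, removing any small-subgroup component, while bit 255 is cleared and bit 254 is set so every scalar has the same fixed length. A small userland illustration of just the bit manipulation (not taken from the driver):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define CURVE25519_KEY_SIZE 32

    static void
    clamp_secret(uint8_t secret[CURVE25519_KEY_SIZE])
    {
    	secret[0] &= 248;			/* clear low 3 bits: multiple of 8 */
    	secret[31] = (secret[31] & 127) | 64;	/* clear bit 255, set bit 254 */
    }

    int
    main(void)
    {
    	uint8_t secret[CURVE25519_KEY_SIZE];

    	arc4random_buf(secret, sizeof(secret));	/* CSPRNG, as the comments above recommend */
    	clamp_secret(secret);
    	printf("secret[0] = %#x, secret[31] = %#x\n", secret[0], secret[31]);
    	return (0);
    }
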
- * - * $FreeBSD$ - */ - -#ifndef __IF_WG_H__ -#define __IF_WG_H__ - -#include <net/if.h> -#include <netinet/in.h> - -/* - * This is the public interface to the WireGuard network interface. - * - * It is designed to be used by tools such as ifconfig(8) and wg(4). - */ - -#define WG_KEY_SIZE 32 - -#define WG_DEVICE_HAS_PUBKEY (1 << 0) -#define WG_DEVICE_HAS_PRIVKEY (1 << 1) -#define WG_DEVICE_HAS_MASKED_PRIVKEY (1 << 2) -#define WG_DEVICE_HAS_PORT (1 << 3) -#define WG_DEVICE_HAS_RDOMAIN (1 << 4) -#define WG_DEVICE_REPLACE_PEERS (1 << 5) - -#define WG_PEER_HAS_PUBKEY (1 << 0) -#define WG_PEER_HAS_SHAREDKEY (1 << 1) -#define WG_PEER_HAS_MASKED_SHAREDKEY (1 << 2) -#define WG_PEER_HAS_ENDPOINT (1 << 3) -#define WG_PEER_HAS_PERSISTENTKEEPALIVE (1 << 4) -#define WG_PEER_REPLACE_CIDRS (1 << 5) -#define WG_PEER_REMOVE (1 << 6) - -#define SIOCSWG _IOWR('i', 200, struct wg_device_io) -#define SIOCGWG _IOWR('i', 201, struct wg_device_io) - -#define WG_PEERS_FOREACH(p, d) \ - for (p = (d)->d_peers; p < (d)->d_peers + (d)->d_num_peers; p++) -#define WG_CIDRS_FOREACH(c, p) \ - for (c = (p)->p_cidrs; c < (p)->p_cidrs + (p)->p_num_cidrs; c++) - -struct wg_allowedip { - struct sockaddr_storage a_addr; - struct sockaddr_storage a_mask; -}; - -enum { - WG_PEER_CTR_TX_BYTES, - WG_PEER_CTR_RX_BYTES, - WG_PEER_CTR_NUM, -}; - -struct wg_device_io { - char d_name[IFNAMSIZ]; - uint8_t d_flags; - in_port_t d_port; - int d_rdomain; - uint8_t d_pubkey[WG_KEY_SIZE]; - uint8_t d_privkey[WG_KEY_SIZE]; - size_t d_num_peers; - size_t d_num_cidrs; - struct wg_peer_io *d_peers; -}; - - -#ifndef ENOKEY -#define ENOKEY ENOTCAPABLE -#endif - -typedef enum { - WGC_GET = 0x5, - WGC_SET = 0x6, -} wg_cmd_t; - -#endif /* __IF_WG_H__ */ diff --git a/sys/dev/if_wg/include/sys/if_wg_session_vars.h b/sys/dev/if_wg/include/sys/if_wg_session_vars.h deleted file mode 100644 index 5fd85d3b7162..000000000000 --- a/sys/dev/if_wg/include/sys/if_wg_session_vars.h +++ /dev/null @@ -1,319 +0,0 @@ -/* - * Copyright (c) 2019 Matt Dunwoodie <ncon@noconroy.net> - * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate) - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * - * $FreeBSD$ - */ - -#ifndef _IF_WG_VARS_H_ -#define _IF_WG_VARS_H_ - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/time.h> - -#include <sys/lock.h> -#include <sys/mutex.h> -#include <crypto/siphash/siphash.h> - - -#include <net/if.h> -#include <net/if_var.h> -#include <net/if_types.h> -#include <net/ethernet.h> -#include <net/pfvar.h> -#include <net/iflib.h> - -#include <sys/wg_noise.h> -#include <sys/wg_cookie.h> -/* This is only needed for wg_keypair. 
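
Note the contrast with the new interface: the retired ABI above is a fixed struct with feature-flag bits and caller-allocated peer/cidr arrays, and it even used different ioctl numbers (200/201 versus the new 210/211). Configuring a device under the old scheme looked roughly like this sketch against the removed definitions, with s and privkey caller-supplied and error handling trimmed:

    struct wg_device_io dev = { 0 };

    strlcpy(dev.d_name, "wg0", sizeof(dev.d_name));
    dev.d_flags = WG_DEVICE_HAS_PRIVKEY | WG_DEVICE_HAS_PORT;
    dev.d_port = 51820;				/* listen port */
    memcpy(dev.d_privkey, privkey, WG_KEY_SIZE);	/* privkey: caller-supplied */
    if (ioctl(s, SIOCSWG, &dev) < 0)		/* old SIOCSWG: _IOWR('i', 200, ...) */
    	err(1, "SIOCSWG");

Every new capability needed another WG_DEVICE_HAS_* bit and a struct revision; that rigidity is exactly what the nvlist format in the new dev/if_wg/if_wg.h removes.
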
*/ -#include <sys/if_wg_session.h> - -#define UNIMPLEMENTED() panic("%s not implemented\n", __func__) - -#define WG_KEY_SIZE 32 -#define WG_MSG_PADDING_SIZE 16 - - -/* Constant for session */ -#define REKEY_TIMEOUT 5 -#define REKEY_TIMEOUT_JITTER 500 /* TODO ok? jason */ -#define REJECT_AFTER_TIME 180 -#define KEEPALIVE_TIMEOUT 10 -#define MAX_TIMER_HANDSHAKES (90 / REKEY_TIMEOUT) -#define NEW_HANDSHAKE_TIMEOUT (REKEY_TIMEOUT + KEEPALIVE_TIMEOUT) - -#define MAX_QUEUED_INCOMING_HANDSHAKES 4096 /* TODO: replace this with DQL */ -#define MAX_QUEUED_PACKETS 1024 /* TODO: replace this with DQL */ - -#define HASHTABLE_PEER_SIZE (1 << 6) //1 << 11 -#define HASHTABLE_INDEX_SIZE (HASHTABLE_PEER_SIZE * 3) //1 << 13 - -#define PEER_MAGIC1 0xCAFEBABEB00FDADDULL -#define PEER_MAGIC2 0xCAAFD0D0D00DBABEULL -#define PEER_MAGIC3 0xD00DBABEF00DFADEULL - - -enum message_type { - MESSAGE_INVALID = 0, - MESSAGE_HANDSHAKE_INITIATION = 1, - MESSAGE_HANDSHAKE_RESPONSE = 2, - MESSAGE_HANDSHAKE_COOKIE = 3, - MESSAGE_DATA = 4 -}; - -struct wg_softc; - -#if __FreeBSD_version > 1300000 -typedef void timeout_t (void *); -#endif - -/* Socket */ -struct wg_endpoint { - union wg_remote { - struct sockaddr r_sa; - struct sockaddr_in r_sin; - struct sockaddr_in6 r_sin6; - } e_remote; - union wg_source { - struct in_addr l_in; - struct in6_pktinfo l_pktinfo6; -#define l_in6 l_pktinfo6.ipi6_addr - } e_local; -}; - -struct wg_socket { - struct mtx so_mtx; - in_port_t so_port; - struct socket *so_so4; - struct socket *so_so6; -}; - -struct wg_queue { - struct mtx q_mtx; - struct mbufq q; -}; - -struct wg_index { - LIST_ENTRY(wg_index) i_entry; - SLIST_ENTRY(wg_index) i_unused_entry; - uint32_t i_key; - struct noise_remote *i_value; -}; - -struct wg_timers { - /* t_lock is for blocking wg_timers_event_* when setting t_disabled. 
*/ - struct rwlock t_lock; - - int t_disabled; - int t_need_another_keepalive; - uint16_t t_persistent_keepalive_interval; - struct callout t_new_handshake; - struct callout t_send_keepalive; - struct callout t_retry_handshake; - struct callout t_zero_key_material; - struct callout t_persistent_keepalive; - - struct mtx t_handshake_mtx; - struct timespec t_handshake_last_sent; - struct timespec t_handshake_complete; - volatile int t_handshake_retries; - -}; - -struct wg_peer { - uint64_t p_magic_1; - CK_LIST_ENTRY(wg_peer) p_hash_entry; - CK_LIST_ENTRY(wg_peer) p_entry; - uint64_t p_id; - struct wg_softc *p_sc; - - struct noise_remote p_remote; - struct cookie_maker p_cookie; - struct wg_timers p_timers; - - struct rwlock p_endpoint_lock; - struct wg_endpoint p_endpoint; - - uint64_t p_magic_2; - - SLIST_HEAD(,wg_index) p_unused_index; - struct wg_index p_index[3]; - - struct wg_queue p_encap_queue; - struct wg_queue p_decap_queue; - - struct grouptask p_clear_secrets; - struct grouptask p_send_initiation; - struct grouptask p_send_keepalive; - struct grouptask p_send; - struct grouptask p_recv; - - counter_u64_t p_tx_bytes; - counter_u64_t p_rx_bytes; - - CK_LIST_HEAD(, wg_route) p_routes; - uint64_t p_magic_3; - struct mtx p_lock; - struct epoch_context p_ctx; -}; - - - -/* Packet */ - -void wg_softc_decrypt(struct wg_softc *); -void wg_softc_encrypt(struct wg_softc *); - -/* Queue */ -void wg_queue_init(struct wg_queue *, const char *); -void wg_queue_deinit(struct wg_queue *); - -/* Counter */ - -/* Timers */ - -/* Route */ -enum route_direction { - IN, - OUT, -}; - -struct wg_route_table { - size_t t_count; - struct radix_node_head *t_ip; - struct radix_node_head *t_ip6; -}; -struct wg_peer; - -struct wg_route { - struct radix_node r_nodes[2]; - struct wg_allowedip r_cidr; - CK_LIST_ENTRY(wg_route) r_entry; - struct wg_peer *r_peer; -}; - - -int wg_route_add(struct wg_route_table *, struct wg_peer *, - const struct wg_allowedip *); -int wg_route_delete(struct wg_route_table *, struct wg_peer *); - -/* Noise */ - -/* - * Peer - * - * - * - */ - -struct wg_softc; - -struct wg_hashtable { - struct mtx h_mtx; - SIPHASH_KEY h_secret; - CK_LIST_HEAD(, wg_peer) h_peers_list; - CK_LIST_HEAD(, wg_peer) *h_peers; - u_long h_peers_mask; - size_t h_num_peers; - LIST_HEAD(, noise_keypair) *h_keys; - u_long h_keys_mask; - size_t h_num_keys; -}; - -/* Softc */ -struct wg_softc { - if_softc_ctx_t shared; - if_ctx_t wg_ctx; - struct ifnet *sc_ifp; - uint16_t sc_incoming_port; - uint32_t sc_user_cookie; - - struct wg_socket sc_socket; - struct wg_hashtable sc_hashtable; - struct wg_route_table sc_routes; - - struct mbufq sc_handshake_queue; - struct grouptask sc_handshake; - - struct noise_local sc_local; - struct cookie_checker sc_cookie; - - struct buf_ring *sc_encap_ring; - struct buf_ring *sc_decap_ring; - - struct grouptask *sc_encrypt; - struct grouptask *sc_decrypt; - - struct rwlock sc_index_lock; - LIST_HEAD(,wg_index) *sc_index; - u_long sc_index_mask; - - struct mtx sc_mtx; -}; - -struct wg_tag { - struct m_tag wt_tag; - struct wg_endpoint t_endpoint; - struct wg_peer *t_peer; - struct mbuf *t_mbuf; - sa_family_t t_family; - int t_done; - int t_mtu; -}; - -struct wg_peer *wg_route_lookup(struct wg_route_table *, struct mbuf *, - enum route_direction); - -void wg_peer_remove_all(struct wg_softc *); -struct wg_peer *wg_peer_alloc(struct wg_softc *); -void wg_peer_destroy(struct wg_peer *); - -void wg_hashtable_init(struct wg_hashtable *); -void wg_hashtable_destroy(struct wg_hashtable *); 
-void wg_hashtable_peer_insert(struct wg_hashtable *, struct wg_peer *); -struct wg_peer *wg_peer_lookup(struct wg_softc *, - const uint8_t [WG_KEY_SIZE]); -void wg_hashtable_peer_remove(struct wg_hashtable *, struct wg_peer *); - - -int wg_queue_out(struct wg_peer *peer, struct mbuf *m); - -int wg_route_init(struct wg_route_table *); -void wg_route_destroy(struct wg_route_table *); - -int wg_socket_init(struct wg_softc *sc); -void wg_socket_reinit(struct wg_softc *, struct socket *so4, - struct socket *so6); -int wg_socket_close(struct wg_socket *so); - -void wg_softc_handshake_receive(struct wg_softc *sc); - -int wg_timers_get_persistent_keepalive(struct wg_timers *, uint16_t *); -void wg_timers_set_persistent_keepalive(struct wg_timers *t, uint16_t); -void wg_timers_get_last_handshake(struct wg_timers *, struct timespec *); - - -struct noise_remote *wg_remote_get(struct wg_softc *, uint8_t [NOISE_KEY_SIZE]); -uint32_t wg_index_set(struct wg_softc *, struct noise_remote *); -struct noise_remote *wg_index_get(struct wg_softc *, uint32_t); -void wg_index_drop(struct wg_softc *, uint32_t); -void wg_encrypt_dispatch(struct wg_softc *); -void wg_decrypt_dispatch(struct wg_softc *); - -struct wg_tag *wg_tag_get(struct mbuf *m); - - -#endif /* _IF_WG_VARS_H_ */ diff --git a/sys/dev/if_wg/include/sys/simd-x86_64.h b/sys/dev/if_wg/include/sys/simd-x86_64.h deleted file mode 100644 index 1453083aa273..000000000000 --- a/sys/dev/if_wg/include/sys/simd-x86_64.h +++ /dev/null @@ -1,74 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
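
The wg_index_* prototypes above implement the receiver-side index table from the WireGuard protocol: the 32-bit sender index carried in handshake messages must be mapped back to a noise_remote. The intended usage pattern, sketched from the declarations rather than copied from driver code:

    /* Before sending an initiation: reserve an index for this remote. */
    uint32_t idx = wg_index_set(sc, remote);
    /* ... idx is placed in the outgoing handshake message ... */

    /* On receipt of a message carrying receiver index idx: */
    struct noise_remote *r = wg_index_get(sc, idx);
    if (r == NULL)
    	return;			/* unknown or stale index: drop */

    /* Once the handshake slot is retired: */
    wg_index_drop(sc, idx);
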
- * - * $FreeBSD$ - */ - -#ifndef _SIMD_X86_64_H_ -#define _SIMD_X86_64_H_ - - -#include <x86/x86_var.h> -#include <x86/specialreg.h> - -static inline uint64_t -xgetbv(uint32_t index) -{ - uint32_t eax, edx; - /* xgetbv - instruction byte code */ - __asm__ __volatile__(".byte 0x0f; .byte 0x01; .byte 0xd0" - : "=a" (eax), "=d" (edx) - : "c" (index)); - - return ((((uint64_t)edx)<<32) | (uint64_t)eax); -} - - -/* - * Detect register set support - */ -static inline boolean_t -__simd_state_enabled(const uint64_t state) -{ - boolean_t has_osxsave; - uint64_t xcr0; - - has_osxsave = !!(cpu_feature2 & CPUID2_OSXSAVE); - - if (!has_osxsave) - return (0); - - xcr0 = xgetbv(0); - return ((xcr0 & state) == state); -} - -#define _XSTATE_SSE_AVX (0x2 | 0x4) -#define _XSTATE_AVX512 (0xE0 | _XSTATE_SSE_AVX) - -#define __ymm_enabled() __simd_state_enabled(_XSTATE_SSE_AVX) -#define __zmm_enabled() __simd_state_enabled(_XSTATE_AVX512) -#endif - diff --git a/sys/dev/if_wg/include/sys/support.h b/sys/dev/if_wg/include/sys/support.h deleted file mode 100644 index 7874fd9b1524..000000000000 --- a/sys/dev/if_wg/include/sys/support.h +++ /dev/null @@ -1,342 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
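
The xgetbv()/XCR0 probe above answers a different question than CPUID alone: the CPU may implement AVX while the OS has not enabled saving of the extended register state, in which case touching those registers would corrupt another thread's context. The 0x2|0x4 and 0xE0 masks correspond to the SSE/AVX and AVX-512 (opmask plus upper-ZMM) state components in XCR0. The helpers existed to gate runtime dispatch, along these lines (the chacha20_* function names here are hypothetical stand-ins for per-ISA implementations):

    if (__zmm_enabled())
    	chacha20_avx512(dst, src, len);	/* AVX-512 state enabled by the OS */
    else if (__ymm_enabled())
    	chacha20_avx2(dst, src, len);	/* SSE+AVX (YMM) state enabled */
    else
    	chacha20_generic(dst, src, len);
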
- * - * $FreeBSD$ - */ - -#ifndef SYS_SUPPORT_H_ -#define SYS_SUPPORT_H_ -#ifdef __LOCORE -#include <machine/asm.h> -#define SYM_FUNC_START ENTRY -#define SYM_FUNC_END END - -#else -#include <sys/types.h> -#include <sys/limits.h> -#include <sys/endian.h> -#include <sys/libkern.h> -#include <sys/malloc.h> -#include <sys/proc.h> -#include <sys/lock.h> -#include <vm/uma.h> - -#if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) -#include <machine/fpu.h> -#endif -#include <crypto/siphash/siphash.h> - - -#define COMPAT_ZINC_IS_A_MODULE -MALLOC_DECLARE(M_WG); - -#define BUILD_BUG_ON(x) CTASSERT(!(x)) - -#define BIT(nr) (1UL << (nr)) -#define BIT_ULL(nr) (1ULL << (nr)) -#ifdef __LP64__ -#define BITS_PER_LONG 64 -#else -#define BITS_PER_LONG 32 -#endif - -#define rw_enter_write rw_wlock -#define rw_exit_write rw_wunlock -#define rw_enter_read rw_rlock -#define rw_exit_read rw_runlock -#define rw_exit rw_unlock - -#define ASSERT(x) MPASS(x) - -#define ___PASTE(a,b) a##b -#define __PASTE(a,b) ___PASTE(a,b) -#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) - -#define typeof(x) __typeof__(x) - - -#define min_t(t, a, b) ({ t __a = (a); t __b = (b); __a > __b ? __b : __a; }) - -typedef uint8_t u8; -typedef uint16_t u16; -typedef uint32_t u32; -typedef uint32_t __le32; -typedef uint64_t u64; -typedef uint64_t __le64; - -#define __must_check __attribute__((__warn_unused_result__)) -#define asmlinkage -#define __ro_after_init __read_mostly - -#define get_unaligned_le32(x) le32dec(x) -#define get_unaligned_le64(x) le64dec(x) - -#define cpu_to_le64(x) htole64(x) -#define cpu_to_le32(x) htole32(x) -#define letoh64(x) le64toh(x) - -#define need_resched() \ - ((curthread->td_flags & (TDF_NEEDRESCHED|TDF_ASTPENDING)) || \ - curthread->td_owepreempt) - - -#define CONTAINER_OF(a, b, c) __containerof((a), b, c) - -typedef struct { - uint64_t k0; - uint64_t k1; -} SIPHASH_KEY; - -static inline uint64_t -siphash24(const SIPHASH_KEY *key, const void *src, size_t len) -{ - SIPHASH_CTX ctx; - - return (SipHashX(&ctx, 2, 4, (const uint8_t *)key, src, len)); -} - -static inline void -put_unaligned_le32(u32 val, void *p) -{ - *((__le32 *)p) = cpu_to_le32(val); -} - - -#define rol32(i32, n) ((i32) << (n) | (i32) >> (32 - (n))) - -#define memzero_explicit(p, s) explicit_bzero(p, s) - -#define EXPORT_SYMBOL(x) - -#define U32_MAX ((u32)~0U) -#if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) -#define kfpu_begin(ctx) { \ - if (ctx->sc_fpu_ctx == NULL) { \ - ctx->sc_fpu_ctx = fpu_kern_alloc_ctx(0); \ - } \ - critical_enter(); \ - fpu_kern_enter(curthread, ctx->sc_fpu_ctx, FPU_KERN_NORMAL); \ -} - -#define kfpu_end(ctx) { \ - MPASS(ctx->sc_fpu_ctx != NULL); \ - fpu_kern_leave(curthread, ctx->sc_fpu_ctx); \ - critical_exit(); \ -} -#else -#define kfpu_begin(ctx) -#define kfpu_end(ctx) -#define fpu_kern_free_ctx(p) -#endif - -typedef enum { - HAVE_NO_SIMD = 1 << 0, - HAVE_FULL_SIMD = 1 << 1, - HAVE_SIMD_IN_USE = 1 << 31 -} simd_context_state_t; - -typedef struct { - simd_context_state_t sc_state; - struct fpu_kern_ctx *sc_fpu_ctx; -} simd_context_t; - - -#define DONT_USE_SIMD NULL - -static __must_check inline bool -may_use_simd(void) -{ -#if defined(__amd64__) - return true; -#else - return false; -#endif -} - -static inline void -simd_get(simd_context_t *ctx) -{ - ctx->sc_state = may_use_simd() ? 
HAVE_FULL_SIMD : HAVE_NO_SIMD; -} - -static inline void -simd_put(simd_context_t *ctx) -{ -#if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) - if (is_fpu_kern_thread(0)) - return; -#endif - if (ctx->sc_state & HAVE_SIMD_IN_USE) - kfpu_end(ctx); - ctx->sc_state = HAVE_NO_SIMD; -} - -static __must_check inline bool -simd_use(simd_context_t *ctx) -{ -#if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) - if (is_fpu_kern_thread(0)) - return true; -#else - return false; -#endif - if (ctx == NULL) - return false; - if (!(ctx->sc_state & HAVE_FULL_SIMD)) - return false; - if (ctx->sc_state & HAVE_SIMD_IN_USE) - return true; - kfpu_begin(ctx); - ctx->sc_state |= HAVE_SIMD_IN_USE; - return true; -} - -static inline bool -simd_relax(simd_context_t *ctx) -{ - if ((ctx->sc_state & HAVE_SIMD_IN_USE) && need_resched()) { - simd_put(ctx); - simd_get(ctx); - return simd_use(ctx); - } - return false; -} - -#define unlikely(x) __predict_false(x) -#define likely(x) __predict_true(x) -/* Generic path for arbitrary size */ - - -static inline unsigned long -__crypto_memneq_generic(const void *a, const void *b, size_t size) -{ - unsigned long neq = 0; - - while (size >= sizeof(unsigned long)) { - neq |= *(const unsigned long *)a ^ *(const unsigned long *)b; - __compiler_membar(); - a = ((const char *)a + sizeof(unsigned long)); - b = ((const char *)b + sizeof(unsigned long)); - size -= sizeof(unsigned long); - } - while (size > 0) { - neq |= *(const unsigned char *)a ^ *(const unsigned char *)b; - __compiler_membar(); - a = (const char *)a + 1; - b = (const char *)b + 1; - size -= 1; - } - return neq; -} - -#define crypto_memneq(a, b, c) __crypto_memneq_generic((a), (b), (c)) - -static inline void -__cpu_to_le32s(uint32_t *buf) -{ - *buf = htole32(*buf); -} - -static inline void cpu_to_le32_array(u32 *buf, unsigned int words) -{ - while (words--) { - __cpu_to_le32s(buf); - buf++; - } -} - -#define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 1 -void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int len); - -static inline void crypto_xor_cpy(u8 *dst, const u8 *src1, const u8 *src2, - unsigned int size) -{ - if (CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS && - __builtin_constant_p(size) && - (size % sizeof(unsigned long)) == 0) { - unsigned long *d = (unsigned long *)dst; - const unsigned long *s1 = (const unsigned long *)src1; - const unsigned long *s2 = (const unsigned long *)src2; - - while (size > 0) { - *d++ = *s1++ ^ *s2++; - size -= sizeof(unsigned long); - } - } else { - __crypto_xor(dst, src1, src2, size); - } -} -#include <sys/kernel.h> -#define module_init(fn) \ -static void \ -wrap_ ## fn(void *dummy __unused) \ -{ \ - fn(); \ -} \ -SYSINIT(if_wg_ ## fn, SI_SUB_LAST, SI_ORDER_FIRST, wrap_ ## fn, NULL) - - -#define module_exit(fn) \ -static void \ -wrap_ ## fn(void *dummy __unused) \ -{ \ - fn(); \ -} \ -SYSUNINIT(if_wg_ ## fn, SI_SUB_LAST, SI_ORDER_FIRST, wrap_ ## fn, NULL) - -#define module_param(a, b, c) -#define MODULE_LICENSE(x) -#define MODULE_DESCRIPTION(x) -#define MODULE_AUTHOR(x) - -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - -#define __initconst -#define __initdata -#define __init -#define __exit -#define BUG() panic("%s:%d bug hit!\n", __FILE__, __LINE__) - -#define WARN_ON(cond) ({ \ - bool __ret = (cond); \ - if (__ret) { \ - printf("WARNING %s failed at %s:%d\n", \ - __stringify(cond), __FILE__, __LINE__); \ - } \ - unlikely(__ret); \ -}) - -#define pr_err printf -#define pr_info printf -#define IS_ENABLED(x) 0 -#define ___stringify(...) 
#__VA_ARGS__ -#define __stringify(...) ___stringify(__VA_ARGS__) -#define kmalloc(size, flag) malloc((size), M_WG, M_WAITOK) -#define kfree(p) free(p, M_WG) -#define vzalloc(size) malloc((size), M_WG, M_WAITOK|M_ZERO) -#define vfree(p) free(p, M_WG) -#endif -#endif diff --git a/sys/dev/if_wg/include/sys/wg_module.h b/sys/dev/if_wg/include/sys/wg_module.h deleted file mode 100644 index cc662104d640..000000000000 --- a/sys/dev/if_wg/include/sys/wg_module.h +++ /dev/null @@ -1,121 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
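
One shim in the support.h above is worth calling out: crypto_memneq() walks every byte and accumulates differences with a compiler barrier between steps, whereas memcmp(3) is free to return at the first mismatch. For authenticator tags, that early exit is a timing oracle, so a tag check under this shim looks like the following sketch:

    uint8_t expected[16], received[16];

    /* ... compute the expected MAC over the message into 'expected' ... */
    if (crypto_memneq(expected, received, sizeof(expected)) != 0)
    	return (EINVAL);	/* constant-time reject: no early exit to time */
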
- * - * $FreeBSD$ - */ -#ifndef MODULE_H_ -#define MODULE_H_ - -#include <sys/mbuf.h> -#include <sys/socket.h> -#include <net/if.h> -#include <net/if_var.h> -#include <sys/support.h> - - -#include <sys/types.h> -#include <sys/epoch.h> -#include <sys/lock.h> -#include <sys/mutex.h> - - - -#include <crypto/curve25519.h> -#include <zinc/chacha20poly1305.h> -#include <crypto/blake2s.h> - - -enum noise_lengths { - NOISE_PUBLIC_KEY_LEN = CURVE25519_KEY_SIZE, - NOISE_SYMMETRIC_KEY_LEN = CHACHA20POLY1305_KEY_SIZE, - NOISE_TIMESTAMP_LEN = sizeof(uint64_t) + sizeof(uint32_t), - NOISE_AUTHTAG_LEN = CHACHA20POLY1305_AUTHTAG_SIZE, - NOISE_HASH_LEN = BLAKE2S_HASH_SIZE -}; - -#define noise_encrypted_len(plain_len) ((plain_len) + NOISE_AUTHTAG_LEN) - -enum cookie_values { - COOKIE_SECRET_MAX_AGE = 2 * 60, - COOKIE_SECRET_LATENCY = 5, - COOKIE_NONCE_LEN = XCHACHA20POLY1305_NONCE_SIZE, - COOKIE_LEN = 16 -}; - -enum limits { - REKEY_TIMEOUT = 5, - INITIATIONS_PER_SECOND = 50, - MAX_PEERS_PER_DEVICE = 1U << 20, - KEEPALIVE_TIMEOUT = 10, - MAX_TIMER_HANDSHAKES = 90 / REKEY_TIMEOUT, - MAX_QUEUED_INCOMING_HANDSHAKES = 4096, /* TODO: replace this with DQL */ - MAX_STAGED_PACKETS = 128, - MAX_QUEUED_PACKETS = 1024 /* TODO: replace this with DQL */ -}; - -#define zfree(addr, type) \ - do { \ - explicit_bzero(addr, sizeof(*addr)); \ - free(addr, type); \ - } while (0) - -struct crypt_queue { - union { - struct { - int last_cpu; - }; - }; -}; - -#define __ATOMIC_LOAD_SIZE \ - ({ \ - switch (size) { \ - case 1: *(uint8_t *)res = *(volatile uint8_t *)p; break; \ - case 2: *(uint16_t *)res = *(volatile uint16_t *)p; break; \ - case 4: *(uint32_t *)res = *(volatile uint32_t *)p; break; \ - case 8: *(uint64_t *)res = *(volatile uint64_t *)p; break; \ - } \ -}) - -static inline void -__atomic_load_acq_size(volatile void *p, void *res, int size) -{ - __ATOMIC_LOAD_SIZE; -} - -#define atomic_load_acq(x) \ - ({ \ - union { __typeof(x) __val; char __c[1]; } __u; \ - __atomic_load_acq_size(&(x), __u.__c, sizeof(x)); \ - __u.__val; \ -}) - - -int wg_ctx_init(void); -void wg_ctx_uninit(void); - - -#endif diff --git a/sys/dev/if_wg/include/sys/wg_noise.h b/sys/dev/if_wg/include/sys/wg_noise.h deleted file mode 100644 index 40bdab515bc7..000000000000 --- a/sys/dev/if_wg/include/sys/wg_noise.h +++ /dev/null @@ -1,286 +0,0 @@ -/* - * Copyright (C) 2015-2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - * Copyright (C) 2019-2020 Matt Dunwoodie <ncon@noconroy.net> - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * - * ======== wg_noise.h ======== - * - * This file provides a thread safe interface to the Noise protocol as used in - * WireGuard. The three user facing components are: - * - * * noise_local - * Stores the local state for a noise peer. - * * noise_remote - * Stores the remote state for a noise peer. 
- * * noise_upcall
- *	Stores callback routines for index and peers
- *
- * Additionally a noise_counter, which is invisible to the user, is used to
- * track message nonces, to prevent message replay.
- *
- * This module uses Curve25519 for asymmetric crypto, and ChaCha20Poly1305 for
- * symmetric crypto. The handshake uses ephemeral keys, which provide perfect
- * forward secrecy. Keys are NOISE_KEY_SIZE (32) bytes long and can be
- * generated with a CSRNG. While this module will clamp the key to form a valid
- * Curve25519 key, it is recommended that keys are stored in Curve25519 form to
- * preserve interoperability with other systems. Additionally, there is an
- * optional PresharedKey of length NOISE_PSK_SIZE (also 32 bytes), which when
- * used, will provide protection against known quantum attacks. Without it,
- * Curve25519 is broken by Shor's algorithm.
- *
- * -------- noise_local --------
- *
- * void noise_local_init(noise_local *, noise_upcall *)
- *   - Initialise noise_local, should only be called once and before use.
- *
- * int noise_local_set_private(noise_local *, uint8_t *private)
- *   - Set the local private key. This will also calculate the corresponding
- *     public key.
- *
- * int noise_local_keys(noise_local *, uint8_t *public, uint8_t *private)
- *   - Get the local keys. It will ensure that a key has been set and if
- *     not, will return ENXIO.
- *
- * -------- noise_remote --------
- *
- * void noise_remote_init(noise_remote *, uint8_t *public)
- *   - Initialise noise_remote, should only be called once and before use. Key
- *     must be provided and it cannot be changed once set.
- *
- * void noise_remote_set_psk(noise_remote *, uint8_t *psk)
- *   - Set the shared key. To remove the shared key, set a key of all 0x00.
- *
- * void noise_remote_keys(noise_remote *, uint8_t *public, uint8_t *psk)
- *   - Get the remote keys.
- *
- * -------- noise_upcall --------
- *
- * The noise_upcall struct is used to lookup incoming public keys, as well as
- * allocate and deallocate index for a remote. The allocation and deallocation
- * are serialised per noise_remote and guaranteed to only have 3 allocated
- * indexes at once.
- *
- * u_arg - passed to callback functions as void *
- * u_get_remote - lookup noise_remote based on public key.
- * u_set_index - allocate index for noise_remote. any further packets that
- *               arrive with this index should be passed to noise_* functions
- *               with the corresponding noise_remote.
- * u_drop_index - deallocate index passed to callback.
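
In code, hooking a consumer up to the upcall interface documented above looks roughly like this sketch. It uses the struct noise_upcall declared further down in this header, whose field names are u_remote_get/u_index_set/u_index_drop rather than the u_get_remote/u_set_index/u_drop_index used in this comment; sc is an assumed wg_softc, and the index wrappers are elided:

    static struct noise_remote *
    my_remote_get(void *arg, uint8_t pubkey[NOISE_KEY_SIZE])
    {
    	struct wg_softc *sc = arg;

    	return (wg_remote_get(sc, pubkey));	/* look the peer up by public key */
    }

    /* my_index_set()/my_index_drop() would similarly wrap wg_index_set()
     * and wg_index_drop(), adapting the void * argument. */

    struct noise_upcall up = {
    	.u_arg		= sc,
    	.u_remote_get	= my_remote_get,
    	.u_index_set	= my_index_set,
    	.u_index_drop	= my_index_drop,
    };
    noise_local_init(&sc->sc_local, &up);
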
- * - * -------- crypto -------- - * - * The following functions are used for the crypto side of things: - * - * int noise_create_initiation(noise_remote *, noise_initiation *) - * int noise_consume_initiation(noise_local *, noise_remote **, noise_initiation *) - * int noise_create_response(noise_remote *, noise_response *) - * int noise_consume_response(noise_remote *, noise_response *) - * - * int noise_remote_promote(noise_remote *) - * void noise_remote_clear(noise_remote *) - * void noise_remote_expire_current(noise_remote *) - * int noise_remote_encrypt(noise_remote *, noise_data *, size_t) - * int noise_remote_decrypt(noise_remote *, noise_data *, size_t) - * - * $FreeBSD$ - */ - -#ifndef __NOISE_H__ -#define __NOISE_H__ - -#include <sys/types.h> -#include <sys/time.h> -#include <sys/rwlock.h> -#include <sys/support.h> - -#include <crypto/blake2s.h> -#include <zinc/chacha20poly1305.h> -#include <crypto/curve25519.h> - -#define NOISE_KEY_SIZE CURVE25519_KEY_SIZE -#define NOISE_PSK_SIZE 32 -#define NOISE_MAC_SIZE CHACHA20POLY1305_AUTHTAG_SIZE -#define NOISE_HASH_SIZE BLAKE2S_HASH_SIZE -#define NOISE_SYMMETRIC_SIZE CHACHA20POLY1305_KEY_SIZE -#define NOISE_TIMESTAMP_SIZE 12 - -/* Protocol string constants */ -#define NOISE_HANDSHAKE_NAME "Noise_IKpsk2_25519_ChaChaPoly_BLAKE2s" -#define NOISE_IDENTIFIER_NAME "WireGuard v1 zx2c4 Jason@zx2c4.com" - -/* Constants for the counter */ -#define COUNTER_TYPE size_t -#define COUNTER_BITS_TOTAL 512 -#define COUNTER_TYPE_BITS (sizeof(COUNTER_TYPE) * 8) -#define COUNTER_TYPE_NUM (COUNTER_BITS_TOTAL / COUNTER_TYPE_BITS) -#define COUNTER_WINDOW_SIZE (COUNTER_BITS_TOTAL - COUNTER_TYPE_BITS) - -/* Constants for the keypair */ -#define REKEY_AFTER_MESSAGES (1ull << 60) -#define REJECT_AFTER_MESSAGES (UINT64_MAX - COUNTER_WINDOW_SIZE - 1) -#define REKEY_AFTER_TIME 120 -#define REKEY_AFTER_TIME_RECV 165 -#define REJECT_AFTER_TIME 180 -#define REJECT_INTERVAL (1000000000 / 50) /* fifty times per sec */ -/* 24 = floor(log2(REJECT_INTERVAL)) */ -#define REJECT_INTERVAL_MASK (~((1ull<<24)-1)) - -enum noise_state_hs { - HS_ZEROED = 0, - CREATED_INITIATION, - CONSUMED_INITIATION, - CREATED_RESPONSE, - CONSUMED_RESPONSE, -}; - -struct noise_handshake { - enum noise_state_hs hs_state; - uint32_t hs_local_index; - uint32_t hs_remote_index; - uint8_t hs_e[NOISE_KEY_SIZE]; - uint8_t hs_hash[NOISE_HASH_SIZE]; - uint8_t hs_ck[NOISE_HASH_SIZE]; -}; - -struct noise_counter { - struct rwlock c_lock; - uint64_t c_send; - uint64_t c_recv; - COUNTER_TYPE c_backtrack[COUNTER_TYPE_NUM]; -}; - -enum noise_state_kp { - KP_ZEROED = 0, - INITIATOR, - RESPONDER, -}; - -struct noise_keypair { - SLIST_ENTRY(noise_keypair) kp_entry; - int kp_valid; - int kp_is_initiator; - uint32_t kp_local_index; - uint32_t kp_remote_index; - uint8_t kp_send[NOISE_SYMMETRIC_SIZE]; - uint8_t kp_recv[NOISE_SYMMETRIC_SIZE]; - struct timespec kp_birthdate; /* nanouptime */ - struct noise_counter kp_ctr; -}; - -struct noise_remote { - uint8_t r_public[NOISE_KEY_SIZE]; - struct noise_local *r_local; - uint8_t r_ss[NOISE_KEY_SIZE]; - - struct rwlock r_handshake_lock; - struct noise_handshake r_handshake; - uint8_t r_psk[NOISE_PSK_SIZE]; - uint8_t r_timestamp[NOISE_TIMESTAMP_SIZE]; - struct timespec r_last_init; /* nanouptime */ - - struct rwlock r_keypair_lock; - SLIST_HEAD(,noise_keypair) r_unused_keypairs; - struct noise_keypair *r_next, *r_current, *r_previous; - struct noise_keypair r_keypair[3]; /* 3: next, current, previous. 
*/ - -}; - -struct noise_local { - struct rwlock l_identity_lock; - int l_has_identity; - uint8_t l_public[NOISE_KEY_SIZE]; - uint8_t l_private[NOISE_KEY_SIZE]; - - struct noise_upcall { - void *u_arg; - struct noise_remote * - (*u_remote_get)(void *, uint8_t[NOISE_KEY_SIZE]); - uint32_t - (*u_index_set)(void *, struct noise_remote *); - void (*u_index_drop)(void *, uint32_t); - } l_upcall; -}; - -struct noise_initiation { - uint32_t s_idx; - uint8_t ue[NOISE_KEY_SIZE]; - uint8_t es[NOISE_KEY_SIZE + NOISE_MAC_SIZE]; - uint8_t ets[NOISE_TIMESTAMP_SIZE + NOISE_MAC_SIZE]; -} __packed; - -struct noise_response { - uint32_t s_idx; - uint32_t r_idx; - uint8_t ue[NOISE_KEY_SIZE]; - uint8_t en[0 + NOISE_MAC_SIZE]; -} __packed; - -struct noise_data { - uint32_t r_idx; - uint64_t nonce; - uint8_t buf[]; -} __packed; - - -/* Set/Get noise parameters */ -void noise_local_init(struct noise_local *, struct noise_upcall *); -void noise_local_lock_identity(struct noise_local *); -void noise_local_unlock_identity(struct noise_local *); -int noise_local_set_private(struct noise_local *, uint8_t[NOISE_KEY_SIZE]); -int noise_local_keys(struct noise_local *, uint8_t[NOISE_KEY_SIZE], - uint8_t[NOISE_KEY_SIZE]); - -void noise_remote_init(struct noise_remote *, const uint8_t[NOISE_KEY_SIZE], - struct noise_local *); -int noise_remote_set_psk(struct noise_remote *, const uint8_t[NOISE_PSK_SIZE]); -int noise_remote_keys(struct noise_remote *, uint8_t[NOISE_KEY_SIZE], - uint8_t[NOISE_PSK_SIZE]); - -/* Should be called anytime noise_local_set_private is called */ -void noise_remote_precompute(struct noise_remote *); - -/* Cryptographic functions */ -int noise_create_initiation( - struct noise_remote *, - struct noise_initiation *); - -int noise_consume_initiation( - struct noise_local *, - struct noise_remote **, - struct noise_initiation *); - -int noise_create_response( - struct noise_remote *, - struct noise_response *); - -int noise_consume_response( - struct noise_remote *, - struct noise_response *); - - int noise_remote_begin_session(struct noise_remote *); -void noise_remote_clear(struct noise_remote *); -void noise_remote_expire_current(struct noise_remote *); - -int noise_remote_ready(struct noise_remote *); - -int noise_remote_encrypt( - struct noise_remote *, - struct noise_data *, - size_t); -int noise_remote_decrypt( - struct noise_remote *, - struct noise_data *, - size_t); - -#endif /* __NOISE_H__ */ diff --git a/sys/dev/if_wg/include/zinc/blake2s.h b/sys/dev/if_wg/include/zinc/blake2s.h deleted file mode 100644 index e87bfdbc9f6d..000000000000 --- a/sys/dev/if_wg/include/zinc/blake2s.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 
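
Taken together, the prototypes above support WireGuard's 1-RTT handshake. A condensed round-trip, as a sketch with local/remote initialized as described earlier and with locking, retransmission, and error paths omitted:

    struct noise_initiation init;
    struct noise_response resp;
    struct noise_remote *r;

    /* Initiator */
    noise_create_initiation(remote, &init);		/* -> transmit to peer */

    /* Responder, on receipt */
    noise_consume_initiation(local, &r, &init);	/* identifies the remote */
    noise_create_response(r, &resp);		/* -> transmit back */
    noise_remote_begin_session(r);			/* keypair now usable */

    /* Initiator, on receipt */
    noise_consume_response(remote, &resp);
    noise_remote_begin_session(remote);
    /* Both sides may now call noise_remote_encrypt()/noise_remote_decrypt(). */
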
- */ - -#ifndef _ZINC_BLAKE2S_H -#define _ZINC_BLAKE2S_H - -#include <sys/types.h> - -enum blake2s_lengths { - BLAKE2S_BLOCK_SIZE = 64, - BLAKE2S_HASH_SIZE = 32, - BLAKE2S_KEY_SIZE = 32 -}; - -struct blake2s_state { - uint32_t h[8]; - uint32_t t[2]; - uint32_t f[2]; - uint8_t buf[BLAKE2S_BLOCK_SIZE]; - unsigned int buflen; - unsigned int outlen; -}; - -void blake2s_init(struct blake2s_state *state, const size_t outlen); -void blake2s_init_key(struct blake2s_state *state, const size_t outlen, - const void *key, const size_t keylen); -void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen); -//void blake2s_final(struct blake2s_state *state, uint8_t *out); - -static inline void blake2s(uint8_t *out, const uint8_t *in, const uint8_t *key, - const size_t outlen, const size_t inlen, - const size_t keylen) -{ - struct blake2s_state state; - - if (keylen) - blake2s_init_key(&state, outlen, key, keylen); - else - blake2s_init(&state, outlen); - - blake2s_update(&state, in, inlen); - blake2s_final(&state, out); -} - -void blake2s_hmac(uint8_t *out, const uint8_t *in, const uint8_t *key, const size_t outlen, - const size_t inlen, const size_t keylen); - -#endif /* _ZINC_BLAKE2S_H */ diff --git a/sys/dev/if_wg/include/zinc/chacha20.h b/sys/dev/if_wg/include/zinc/chacha20.h deleted file mode 100644 index 1a9524bdfe85..000000000000 --- a/sys/dev/if_wg/include/zinc/chacha20.h +++ /dev/null @@ -1,68 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - */ - -#ifndef _ZINC_CHACHA20_H -#define _ZINC_CHACHA20_H - -#include <sys/param.h> -#include <sys/support.h> - -enum chacha20_lengths { - CHACHA20_NONCE_SIZE = 16, - CHACHA20_KEY_SIZE = 32, - CHACHA20_KEY_WORDS = CHACHA20_KEY_SIZE / sizeof(u32), - CHACHA20_BLOCK_SIZE = 64, - CHACHA20_BLOCK_WORDS = CHACHA20_BLOCK_SIZE / sizeof(u32), - HCHACHA20_NONCE_SIZE = CHACHA20_NONCE_SIZE, - HCHACHA20_KEY_SIZE = CHACHA20_KEY_SIZE -}; - -enum chacha20_constants { /* expand 32-byte k */ - CHACHA20_CONSTANT_EXPA = 0x61707865U, - CHACHA20_CONSTANT_ND_3 = 0x3320646eU, - CHACHA20_CONSTANT_2_BY = 0x79622d32U, - CHACHA20_CONSTANT_TE_K = 0x6b206574U -}; - -struct chacha20_ctx { - union { - u32 state[16]; - struct { - u32 constant[4]; - u32 key[8]; - u32 counter[4]; - }; - }; -}; - -static inline void chacha20_init(struct chacha20_ctx *ctx, - const u8 key[CHACHA20_KEY_SIZE], - const u64 nonce) -{ - ctx->constant[0] = CHACHA20_CONSTANT_EXPA; - ctx->constant[1] = CHACHA20_CONSTANT_ND_3; - ctx->constant[2] = CHACHA20_CONSTANT_2_BY; - ctx->constant[3] = CHACHA20_CONSTANT_TE_K; - ctx->key[0] = get_unaligned_le32(key + 0); - ctx->key[1] = get_unaligned_le32(key + 4); - ctx->key[2] = get_unaligned_le32(key + 8); - ctx->key[3] = get_unaligned_le32(key + 12); - ctx->key[4] = get_unaligned_le32(key + 16); - ctx->key[5] = get_unaligned_le32(key + 20); - ctx->key[6] = get_unaligned_le32(key + 24); - ctx->key[7] = get_unaligned_le32(key + 28); - ctx->counter[0] = 0; - ctx->counter[1] = 0; - ctx->counter[2] = nonce & U32_MAX; - ctx->counter[3] = nonce >> 32; -} -void chacha20(struct chacha20_ctx *ctx, u8 *dst, const u8 *src, u32 len, - simd_context_t *simd_context); - -void hchacha20(u32 derived_key[CHACHA20_KEY_WORDS], - const u8 nonce[HCHACHA20_NONCE_SIZE], - const u8 key[HCHACHA20_KEY_SIZE], simd_context_t *simd_context); - -#endif /* _ZINC_CHACHA20_H */ diff --git a/sys/dev/if_wg/include/zinc/chacha20poly1305.h b/sys/dev/if_wg/include/zinc/chacha20poly1305.h deleted file 
mode 100644 index 2d18b0fc3e82..000000000000 --- a/sys/dev/if_wg/include/zinc/chacha20poly1305.h +++ /dev/null @@ -1,48 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - */ - -#ifndef _ZINC_CHACHA20POLY1305_H -#define _ZINC_CHACHA20POLY1305_H - -#include <sys/types.h> - -struct scatterlist; - -enum chacha20poly1305_lengths { - XCHACHA20POLY1305_NONCE_SIZE = 24, - CHACHA20POLY1305_KEY_SIZE = 32, - CHACHA20POLY1305_AUTHTAG_SIZE = 16 -}; - -void chacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src, const size_t src_len, - const uint8_t *ad, const size_t ad_len, - const uint64_t nonce, - const uint8_t key[CHACHA20POLY1305_KEY_SIZE]); - -bool chacha20poly1305_encrypt_sg_inplace( - struct scatterlist *src, const size_t src_len, const uint8_t *ad, - const size_t ad_len, const uint64_t nonce, - const uint8_t key[CHACHA20POLY1305_KEY_SIZE], simd_context_t *simd_context); - -bool chacha20poly1305_decrypt(uint8_t *dst, const uint8_t *src, const size_t src_len, - const uint8_t *ad, const size_t ad_len, const uint64_t nonce, - const uint8_t key[CHACHA20POLY1305_KEY_SIZE]); - -bool chacha20poly1305_decrypt_sg_inplace( - struct scatterlist *src, size_t src_len, const uint8_t *ad, - const size_t ad_len, const uint64_t nonce, - const uint8_t key[CHACHA20POLY1305_KEY_SIZE], simd_context_t *simd_context); - -void xchacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src, const size_t src_len, - const uint8_t *ad, const size_t ad_len, - const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE], - const uint8_t key[CHACHA20POLY1305_KEY_SIZE]); - -bool xchacha20poly1305_decrypt( - uint8_t *dst, const uint8_t *src, const size_t src_len, const uint8_t *ad, - const size_t ad_len, const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE], - const uint8_t key[CHACHA20POLY1305_KEY_SIZE]); - -#endif /* _ZINC_CHACHA20POLY1305_H */ diff --git a/sys/dev/if_wg/include/zinc/curve25519.h b/sys/dev/if_wg/include/zinc/curve25519.h deleted file mode 100644 index aa32359462da..000000000000 --- a/sys/dev/if_wg/include/zinc/curve25519.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - */ - -#ifndef _ZINC_CURVE25519_H -#define _ZINC_CURVE25519_H - -#include <sys/types.h> - -enum curve25519_lengths { - CURVE25519_KEY_SIZE = 32 -}; - -bool curve25519(uint8_t mypublic[CURVE25519_KEY_SIZE], - const uint8_t secret[CURVE25519_KEY_SIZE], - const uint8_t basepoint[CURVE25519_KEY_SIZE]); -void curve25519_generate_secret(uint8_t secret[CURVE25519_KEY_SIZE]); -bool curve25519_generate_public( - uint8_t pub[CURVE25519_KEY_SIZE], const uint8_t secret[CURVE25519_KEY_SIZE]); - -static inline void curve25519_clamp_secret(uint8_t secret[CURVE25519_KEY_SIZE]) -{ - secret[0] &= 248; - secret[31] = (secret[31] & 127) | 64; -} - -#endif /* _ZINC_CURVE25519_H */ diff --git a/sys/dev/if_wg/include/zinc/poly1305.h b/sys/dev/if_wg/include/zinc/poly1305.h deleted file mode 100644 index ca4cc60b41b3..000000000000 --- a/sys/dev/if_wg/include/zinc/poly1305.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 
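
Sealing and opening one message with the chacha20poly1305 interface above, as a fragment sketch: the key would come from the handshake, and the nonce is a strictly increasing per-key counter that must never be reused.

    uint8_t key[CHACHA20POLY1305_KEY_SIZE];		/* from the Noise handshake */
    uint8_t plain[64];				/* plaintext payload (contents elided) */
    uint8_t sealed[sizeof(plain) + CHACHA20POLY1305_AUTHTAG_SIZE];
    uint64_t nonce = 1;

    chacha20poly1305_encrypt(sealed, plain, sizeof(plain),
        NULL, 0,			/* no additional authenticated data */
        nonce, key);

    /* Receiver: decrypt returns false if the tag or any byte was altered. */
    if (!chacha20poly1305_decrypt(plain, sealed, sizeof(sealed),
        NULL, 0, nonce, key))
    	return;			/* authentication failed: drop the packet */
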
- */ - -#ifndef _ZINC_POLY1305_H -#define _ZINC_POLY1305_H - - -enum poly1305_lengths { - POLY1305_BLOCK_SIZE = 16, - POLY1305_KEY_SIZE = 32, - POLY1305_MAC_SIZE = 16 -}; - -struct poly1305_ctx { - u8 opaque[24 * sizeof(u64)]; - u32 nonce[4]; - u8 data[POLY1305_BLOCK_SIZE]; - size_t num; -} __aligned(8); - -void poly1305_init(struct poly1305_ctx *ctx, const u8 key[POLY1305_KEY_SIZE]); -void poly1305_update(struct poly1305_ctx *ctx, const u8 *input, size_t len, - simd_context_t *simd_context); -void poly1305_final(struct poly1305_ctx *ctx, u8 mac[POLY1305_MAC_SIZE], - simd_context_t *simd_context); - -#endif /* _ZINC_POLY1305_H */ diff --git a/sys/dev/if_wg/module/blake2s.c b/sys/dev/if_wg/module/blake2s.c deleted file mode 100644 index a362a6b350f1..000000000000 --- a/sys/dev/if_wg/module/blake2s.c +++ /dev/null @@ -1,256 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2012 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - * - * This is an implementation of the BLAKE2s hash and PRF functions. - * - * Information: https://blake2.net/ - * - */ - -#include <sys/types.h> -#include <sys/systm.h> -#include <sys/endian.h> - -#include <crypto/blake2s.h> - -static inline uint32_t -ror32(uint32_t word, unsigned int shift) -{ - return (word >> shift) | (word << (32 - shift)); -} - -typedef union { - struct { - uint8_t digest_length; - uint8_t key_length; - uint8_t fanout; - uint8_t depth; - uint32_t leaf_length; - uint32_t node_offset; - uint16_t xof_length; - uint8_t node_depth; - uint8_t inner_length; - uint8_t salt[8]; - uint8_t personal[8]; - }; - uint32_t words[8]; -} __packed blake2s_param; - -static const uint32_t blake2s_iv[8] = { - 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, - 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL -}; - -static const uint8_t blake2s_sigma[10][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, -}; - -static inline void blake2s_set_lastblock(struct blake2s_state *state) -{ - if (state->last_node) - state->f[1] = -1; - state->f[0] = -1; -} - -static inline void blake2s_increment_counter(struct blake2s_state *state, - const uint32_t inc) -{ - state->t[0] += inc; - state->t[1] += (state->t[0] < inc); -} - -static inline void blake2s_init_param(struct blake2s_state *state, - const blake2s_param *param) -{ - int i; - - memset(state, 0, sizeof(*state)); - for (i = 0; i < 8; ++i) - state->h[i] = blake2s_iv[i] ^ le32toh(param->words[i]); -} - -void blake2s_init(struct blake2s_state *state, const size_t outlen) -{ - blake2s_param param __aligned(__alignof__(uint32_t)) = { - .digest_length = outlen, - .fanout = 1, - .depth = 1 - }; - - /*WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2S_HASH_SIZE));*/ - blake2s_init_param(state, ¶m); -} - -void blake2s_init_key(struct blake2s_state *state, const size_t outlen, - const void *key, const size_t keylen) -{ - blake2s_param param = { 
.digest_length = outlen, - .key_length = keylen, - .fanout = 1, - .depth = 1 }; - uint8_t block[BLAKE2S_BLOCK_SIZE] = { 0 }; - - /*WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2S_HASH_SIZE || - !key || !keylen || keylen > BLAKE2S_KEY_SIZE));*/ - blake2s_init_param(state, ¶m); - memcpy(block, key, keylen); - blake2s_update(state, block, BLAKE2S_BLOCK_SIZE); - explicit_bzero(block, BLAKE2S_BLOCK_SIZE); -} - -static inline void blake2s_compress(struct blake2s_state *state, - const uint8_t *block, size_t nblocks, - const uint32_t inc) -{ - uint32_t m[16]; - uint32_t v[16]; - int i; - - /*WARN_ON(IS_ENABLED(DEBUG) && - (nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));*/ - - while (nblocks > 0) { - blake2s_increment_counter(state, inc); - memcpy(m, block, BLAKE2S_BLOCK_SIZE); - for(i = 0; i < (sizeof(m)/sizeof(m[0])); i++) - (m[i]) = le32toh((m[i])); - memcpy(v, state->h, 32); - v[ 8] = blake2s_iv[0]; - v[ 9] = blake2s_iv[1]; - v[10] = blake2s_iv[2]; - v[11] = blake2s_iv[3]; - v[12] = blake2s_iv[4] ^ state->t[0]; - v[13] = blake2s_iv[5] ^ state->t[1]; - v[14] = blake2s_iv[6] ^ state->f[0]; - v[15] = blake2s_iv[7] ^ state->f[1]; - -#define G(r, i, a, b, c, d) do { \ - a += b + m[blake2s_sigma[r][2 * i + 0]]; \ - d = ror32(d ^ a, 16); \ - c += d; \ - b = ror32(b ^ c, 12); \ - a += b + m[blake2s_sigma[r][2 * i + 1]]; \ - d = ror32(d ^ a, 8); \ - c += d; \ - b = ror32(b ^ c, 7); \ -} while (0) - -#define ROUND(r) do { \ - G(r, 0, v[0], v[ 4], v[ 8], v[12]); \ - G(r, 1, v[1], v[ 5], v[ 9], v[13]); \ - G(r, 2, v[2], v[ 6], v[10], v[14]); \ - G(r, 3, v[3], v[ 7], v[11], v[15]); \ - G(r, 4, v[0], v[ 5], v[10], v[15]); \ - G(r, 5, v[1], v[ 6], v[11], v[12]); \ - G(r, 6, v[2], v[ 7], v[ 8], v[13]); \ - G(r, 7, v[3], v[ 4], v[ 9], v[14]); \ -} while (0) - ROUND(0); - ROUND(1); - ROUND(2); - ROUND(3); - ROUND(4); - ROUND(5); - ROUND(6); - ROUND(7); - ROUND(8); - ROUND(9); - -#undef G -#undef ROUND - - for (i = 0; i < 8; ++i) - state->h[i] ^= v[i] ^ v[i + 8]; - - block += BLAKE2S_BLOCK_SIZE; - --nblocks; - } -} - -void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen) -{ - const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; - - if (!inlen) - return; - if (inlen > fill) { - memcpy(state->buf + state->buflen, in, fill); - blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); - state->buflen = 0; - in += fill; - inlen -= fill; - } - if (inlen > BLAKE2S_BLOCK_SIZE) { - const size_t nblocks = - (inlen + BLAKE2S_BLOCK_SIZE - 1) / BLAKE2S_BLOCK_SIZE; - /* Hash one less (full) block than strictly possible */ - blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); - in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); - inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); - } - memcpy(state->buf + state->buflen, in, inlen); - state->buflen += inlen; -} - -void blake2s_final(struct blake2s_state *state, uint8_t *out, const size_t outlen) -{ - int i; - /*WARN_ON(IS_ENABLED(DEBUG) && - (!out || !outlen || outlen > BLAKE2S_HASH_SIZE));*/ - blake2s_set_lastblock(state); - memset(state->buf + state->buflen, 0, - BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ - blake2s_compress(state, state->buf, 1, state->buflen); - for(i = 0; i < (sizeof(state->h)/sizeof(state->h[0])); i++) - (state->h[i]) = htole32((state->h[i])); - - memcpy(out, state->h, outlen); - explicit_bzero(state, sizeof(*state)); -} - -void blake2s_hmac(uint8_t *out, const uint8_t *in, const uint8_t *key, const size_t outlen, - const size_t inlen, const size_t keylen) -{ - struct blake2s_state state; - uint8_t 
x_key[BLAKE2S_BLOCK_SIZE] __aligned(__alignof__(uint32_t)) = { 0 }; - uint8_t i_hash[BLAKE2S_HASH_SIZE] __aligned(__alignof__(uint32_t)); - int i; - - if (keylen > BLAKE2S_BLOCK_SIZE) { - blake2s_init(&state, BLAKE2S_HASH_SIZE); - blake2s_update(&state, key, keylen); - blake2s_final(&state, x_key, BLAKE2S_HASH_SIZE); - } else - memcpy(x_key, key, keylen); - - for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i) - x_key[i] ^= 0x36; - - blake2s_init(&state, BLAKE2S_HASH_SIZE); - blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE); - blake2s_update(&state, in, inlen); - blake2s_final(&state, i_hash, BLAKE2S_HASH_SIZE); - - for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i) - x_key[i] ^= 0x5c ^ 0x36; - - blake2s_init(&state, BLAKE2S_HASH_SIZE); - blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE); - blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE); - blake2s_final(&state, i_hash, BLAKE2S_HASH_SIZE); - - memcpy(out, i_hash, outlen); - explicit_bzero(x_key, BLAKE2S_BLOCK_SIZE); - explicit_bzero(i_hash, BLAKE2S_HASH_SIZE); -} diff --git a/sys/dev/if_wg/module/blake2s.h b/sys/dev/if_wg/module/blake2s.h deleted file mode 100644 index 865de953fb25..000000000000 --- a/sys/dev/if_wg/module/blake2s.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - */ - -#include <sys/types.h> - -#ifndef _BLAKE2S_H_ -#define _BLAKE2S_H_ - -/*#define WARN_ON(a) if(a) printf("%s failed at %s:%d\n", #a, __FILE__, __LINE__) -#define IS_ENABLED(...) true*/ - - -enum blake2s_lengths { - BLAKE2S_BLOCK_SIZE = 64, - BLAKE2S_HASH_SIZE = 32, - BLAKE2S_KEY_SIZE = 32 -}; - -struct blake2s_state { - uint32_t h[8]; - uint32_t t[2]; - uint32_t f[2]; - uint8_t buf[BLAKE2S_BLOCK_SIZE]; - size_t buflen; - uint8_t last_node; -}; - -void blake2s_init(struct blake2s_state *state, const size_t outlen); -void blake2s_init_key(struct blake2s_state *state, const size_t outlen, - const void *key, const size_t keylen); -void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen); -void blake2s_final(struct blake2s_state *state, uint8_t *out, const size_t outlen); - -static inline void blake2s(uint8_t *out, const uint8_t *in, const uint8_t *key, - const size_t outlen, const size_t inlen, - const size_t keylen) -{ - struct blake2s_state state; - - /*WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen || - outlen > BLAKE2S_HASH_SIZE || keylen > BLAKE2S_KEY_SIZE || - (!key && keylen)));*/ - - if (keylen) - blake2s_init_key(&state, outlen, key, keylen); - else - blake2s_init(&state, outlen); - - blake2s_update(&state, in, inlen); - blake2s_final(&state, out, outlen); -} - -void blake2s_hmac(uint8_t *out, const uint8_t *in, const uint8_t *key, - const size_t outlen, const size_t inlen, const size_t keylen); - -#endif /* _BLAKE2S_H_ */ diff --git a/sys/dev/if_wg/module/chacha20-x86_64.S b/sys/dev/if_wg/module/chacha20-x86_64.S deleted file mode 100644 index 0edb79483758..000000000000 --- a/sys/dev/if_wg/module/chacha20-x86_64.S +++ /dev/null @@ -1,2834 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause -// -// Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. -// Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. -// Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. 
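
The 0x36/0x5c constants in the blake2s_hmac() routine above are the classic HMAC inner/outer pads (RFC 2104) instantiated over BLAKE2s, and WireGuard's HKDF-style key derivation is layered on this primitive. Calling it is straightforward; a sketch:

    uint8_t mac[BLAKE2S_HASH_SIZE];
    static const uint8_t msg[] = "message";
    static const uint8_t key[] = "secret key";

    blake2s_hmac(mac, msg, key,
        BLAKE2S_HASH_SIZE,		/* outlen */
        sizeof(msg) - 1,		/* inlen, excluding the NUL */
        sizeof(key) - 1);		/* keylen */
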
-// -// This code is taken from the OpenSSL project but the author, Andy Polyakov, -// has relicensed it under the licenses specified in the SPDX header above. -// The original headers, including the original license headers, are -// included below for completeness. -// -// ==================================================================== -// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -// project. The module is, however, dual licensed under OpenSSL and -// CRYPTOGAMS licenses depending on where you obtain it. For further -// details see http://www.openssl.org/~appro/cryptogams/. -// ==================================================================== -// -// November 2014 -// -// ChaCha20 for x86_64. -// -// December 2016 -// -// Add AVX512F code path. -// -// December 2017 -// -// Add AVX512VL code path. -// -// Performance in cycles per byte out of large buffer. -// -// IALU/gcc 4.8(i) 1x/2xSSSE3(ii) 4xSSSE3 NxAVX(v) -// -// P4 9.48/+99% - - -// Core2 7.83/+55% 7.90/5.76 4.35 -// Westmere 7.19/+50% 5.60/4.50 3.00 -// Sandy Bridge 8.31/+42% 5.45/4.00 2.72 -// Ivy Bridge 6.71/+46% 5.40/? 2.41 -// Haswell 5.92/+43% 5.20/3.45 2.42 1.23 -// Skylake[-X] 5.87/+39% 4.70/3.22 2.31 1.19[0.80(vi)] -// Silvermont 12.0/+33% 7.75/6.90 7.03(iii) -// Knights L 11.7/- ? 9.60(iii) 0.80 -// Goldmont 10.6/+17% 5.10/3.52 3.28 -// Sledgehammer 7.28/+52% - - -// Bulldozer 9.66/+28% 9.85/5.35(iv) 3.06(iv) -// Ryzen 5.96/+50% 5.19/3.00 2.40 2.09 -// VIA Nano 10.5/+46% 6.72/6.88 6.05 -// -// (i) compared to older gcc 3.x one can observe >2x improvement on -// most platforms; -// (ii) 2xSSSE3 is code path optimized specifically for 128 bytes used -// by chacha20_poly1305_tls_cipher, results are EVP-free; -// (iii) this is not optimal result for Atom because of MSROM -// limitations, SSE2 can do better, but gain is considered too -// low to justify the [maintenance] effort; -// (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20 -// and 4.85 for 128-byte inputs; -// (v) 8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable; -// (vi) even though Skylake-X can execute AVX512F code and deliver 0.57 -// cpb in single thread, the corresponding capability is suppressed; - -//#include <linux/linkage.h> -.section .rodata.cst16.Lzero, "aM", @progbits, 16 -.align 16 -.Lzero: -.long 0,0,0,0 -.section .rodata.cst16.Lone, "aM", @progbits, 16 -.align 16 -.Lone: -.long 1,0,0,0 -.section .rodata.cst16.Linc, "aM", @progbits, 16 -.align 16 -.Linc: -.long 0,1,2,3 -.section .rodata.cst16.Lfour, "aM", @progbits, 16 -.align 16 -.Lfour: -.long 4,4,4,4 -.section .rodata.cst32.Lincy, "aM", @progbits, 32 -.align 32 -.Lincy: -.long 0,2,4,6,1,3,5,7 -.section .rodata.cst32.Leight, "aM", @progbits, 32 -.align 32 -.Leight: -.long 8,8,8,8,8,8,8,8 -.section .rodata.cst16.Lrot16, "aM", @progbits, 16 -.align 16 -.Lrot16: -.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd -.section .rodata.cst16.Lrot24, "aM", @progbits, 16 -.align 16 -.Lrot24: -.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe -.section .rodata.cst32.Ltwoy, "aM", @progbits, 32 -.align 32 -.Ltwoy: -.long 2,0,0,0, 2,0,0,0 -.section .rodata.cst64.Lzeroz, "aM", @progbits, 64 -.align 64 -.Lzeroz: -.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 -.section .rodata.cst64.Lfourz, "aM", @progbits, 64 -.align 64 -.Lfourz: -.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 -.section .rodata.cst64.Lincz, "aM", @progbits, 64 -.align 64 -.Lincz: -.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 -.section .rodata.cst64.Lsixteen, 
"aM", @progbits, 64 -.align 64 -.Lsixteen: -.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 -.section .rodata.cst16.Lsigma, "aM", @progbits, 16 -.align 16 -.Lsigma: -.ascii "expand 32-byte k" -.text -#ifdef CONFIG_AS_SSSE3 -.align 32 -SYM_FUNC_START(hchacha20_ssse3) -.Lhchacha20_ssse3: - movdqa .Lsigma(%rip),%xmm0 - movdqu (%rdx),%xmm1 - movdqu 16(%rdx),%xmm2 - movdqu (%rsi),%xmm3 - # This code is only used when targeting kernel. - # If targeting win64, xmm{6,7} preserving needs to be added. - movdqa .Lrot16(%rip),%xmm6 - movdqa .Lrot24(%rip),%xmm7 - mov $10,%r8 # reuse %r8 - jmp 1f -.align 32 -1: - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm6,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $20,%xmm1 - pslld $12,%xmm4 - por %xmm4,%xmm1 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm7,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $25,%xmm1 - pslld $7,%xmm4 - por %xmm4,%xmm1 - pshufd $147,%xmm0,%xmm0 - pshufd $78,%xmm3,%xmm3 - pshufd $57,%xmm2,%xmm2 - nop - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm6,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $20,%xmm1 - pslld $12,%xmm4 - por %xmm4,%xmm1 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm7,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $25,%xmm1 - pslld $7,%xmm4 - por %xmm4,%xmm1 - pshufd $57,%xmm0,%xmm0 - pshufd $78,%xmm3,%xmm3 - pshufd $147,%xmm2,%xmm2 - dec %r8 - jnz 1b - movdqu %xmm0, (%rdi) - movdqu %xmm3, 16(%rdi) - ret -SYM_FUNC_END(hchacha20_ssse3) -.align 32 -SYM_FUNC_START(chacha20_ssse3) -.Lchacha20_ssse3: - lea 8(%rsp),%r10 # frame pointer - cmp $128,%rdx # we might throw away some data, - je .Lchacha20_128 - ja .Lchacha20_4x # but overall it won't be slower - -.Ldo_ssse3_after_all: - sub $64+8,%rsp - and $-16,%rsp - movdqa .Lsigma(%rip),%xmm0 - movdqu (%rcx),%xmm1 - movdqu 16(%rcx),%xmm2 - movdqu (%r8),%xmm3 - movdqa .Lrot16(%rip),%xmm6 - movdqa .Lrot24(%rip),%xmm7 - - movdqa %xmm0,0x00(%rsp) - movdqa %xmm1,0x10(%rsp) - movdqa %xmm2,0x20(%rsp) - movdqa %xmm3,0x30(%rsp) - mov $10,%r8 # reuse %r8 - jmp .Loop_ssse3 - -.align 32 -.Loop_outer_ssse3: - movdqa .Lone(%rip),%xmm3 - movdqa 0x00(%rsp),%xmm0 - movdqa 0x10(%rsp),%xmm1 - movdqa 0x20(%rsp),%xmm2 - paddd 0x30(%rsp),%xmm3 - mov $10,%r8 - movdqa %xmm3,0x30(%rsp) - jmp .Loop_ssse3 - -.align 32 -.Loop_ssse3: - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm6,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $20,%xmm1 - pslld $12,%xmm4 - por %xmm4,%xmm1 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm7,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $25,%xmm1 - pslld $7,%xmm4 - por %xmm4,%xmm1 - pshufd $147,%xmm0,%xmm0 - pshufd $78,%xmm3,%xmm3 - pshufd $57,%xmm2,%xmm2 - nop - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm6,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $20,%xmm1 - pslld $12,%xmm4 - por %xmm4,%xmm1 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm7,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $25,%xmm1 - pslld $7,%xmm4 - por %xmm4,%xmm1 - pshufd $57,%xmm0,%xmm0 - pshufd $78,%xmm3,%xmm3 - pshufd $147,%xmm2,%xmm2 - dec %r8 - jnz .Loop_ssse3 - paddd 0x00(%rsp),%xmm0 - paddd 0x10(%rsp),%xmm1 - paddd 0x20(%rsp),%xmm2 - paddd 0x30(%rsp),%xmm3 - - cmp $64,%rdx - jb .Ltail_ssse3 - - movdqu 0x00(%rsi),%xmm4 - movdqu 0x10(%rsi),%xmm5 - pxor %xmm4,%xmm0 # xor with input - movdqu 0x20(%rsi),%xmm4 - pxor %xmm5,%xmm1 - movdqu 
0x30(%rsi),%xmm5 - lea 0x40(%rsi),%rsi # inp+=64 - pxor %xmm4,%xmm2 - pxor %xmm5,%xmm3 - - movdqu %xmm0,0x00(%rdi) # write output - movdqu %xmm1,0x10(%rdi) - movdqu %xmm2,0x20(%rdi) - movdqu %xmm3,0x30(%rdi) - lea 0x40(%rdi),%rdi # out+=64 - - sub $64,%rdx - jnz .Loop_outer_ssse3 - - jmp .Ldone_ssse3 - -.align 16 -.Ltail_ssse3: - movdqa %xmm0,0x00(%rsp) - movdqa %xmm1,0x10(%rsp) - movdqa %xmm2,0x20(%rsp) - movdqa %xmm3,0x30(%rsp) - xor %r8,%r8 - -.Loop_tail_ssse3: - movzb (%rsi,%r8),%eax - movzb (%rsp,%r8),%ecx - lea 1(%r8),%r8 - xor %ecx,%eax - mov %al,-1(%rdi,%r8) - dec %rdx - jnz .Loop_tail_ssse3 - -.Ldone_ssse3: - lea -8(%r10),%rsp -.Lssse3_epilogue: - ret -SYM_FUNC_END(chacha20_ssse3) -.type chacha20_128,@function -.align 32 -chacha20_128: -.Lchacha20_128: - lea 8(%rsp),%r10 # frame pointer - sub $64+8,%rsp - and $-16,%rsp - movdqa .Lsigma(%rip),%xmm8 - movdqu (%rcx),%xmm9 - movdqu 16(%rcx),%xmm2 - movdqu (%r8),%xmm3 - movdqa .Lone(%rip),%xmm1 - movdqa .Lrot16(%rip),%xmm6 - movdqa .Lrot24(%rip),%xmm7 - - movdqa %xmm8,%xmm10 - movdqa %xmm8,0x00(%rsp) - movdqa %xmm9,%xmm11 - movdqa %xmm9,0x10(%rsp) - movdqa %xmm2,%xmm0 - movdqa %xmm2,0x20(%rsp) - paddd %xmm3,%xmm1 - movdqa %xmm3,0x30(%rsp) - mov $10,%r8 # reuse %r8 - jmp .Loop_128 - -.align 32 -.Loop_128: - paddd %xmm9,%xmm8 - pxor %xmm8,%xmm3 - paddd %xmm11,%xmm10 - pxor %xmm10,%xmm1 - pshufb %xmm6,%xmm3 - pshufb %xmm6,%xmm1 - paddd %xmm3,%xmm2 - paddd %xmm1,%xmm0 - pxor %xmm2,%xmm9 - pxor %xmm0,%xmm11 - movdqa %xmm9,%xmm4 - psrld $20,%xmm9 - movdqa %xmm11,%xmm5 - pslld $12,%xmm4 - psrld $20,%xmm11 - por %xmm4,%xmm9 - pslld $12,%xmm5 - por %xmm5,%xmm11 - paddd %xmm9,%xmm8 - pxor %xmm8,%xmm3 - paddd %xmm11,%xmm10 - pxor %xmm10,%xmm1 - pshufb %xmm7,%xmm3 - pshufb %xmm7,%xmm1 - paddd %xmm3,%xmm2 - paddd %xmm1,%xmm0 - pxor %xmm2,%xmm9 - pxor %xmm0,%xmm11 - movdqa %xmm9,%xmm4 - psrld $25,%xmm9 - movdqa %xmm11,%xmm5 - pslld $7,%xmm4 - psrld $25,%xmm11 - por %xmm4,%xmm9 - pslld $7,%xmm5 - por %xmm5,%xmm11 - pshufd $147,%xmm8,%xmm8 - pshufd $78,%xmm3,%xmm3 - pshufd $57,%xmm2,%xmm2 - pshufd $147,%xmm10,%xmm10 - pshufd $78,%xmm1,%xmm1 - pshufd $57,%xmm0,%xmm0 - paddd %xmm9,%xmm8 - pxor %xmm8,%xmm3 - paddd %xmm11,%xmm10 - pxor %xmm10,%xmm1 - pshufb %xmm6,%xmm3 - pshufb %xmm6,%xmm1 - paddd %xmm3,%xmm2 - paddd %xmm1,%xmm0 - pxor %xmm2,%xmm9 - pxor %xmm0,%xmm11 - movdqa %xmm9,%xmm4 - psrld $20,%xmm9 - movdqa %xmm11,%xmm5 - pslld $12,%xmm4 - psrld $20,%xmm11 - por %xmm4,%xmm9 - pslld $12,%xmm5 - por %xmm5,%xmm11 - paddd %xmm9,%xmm8 - pxor %xmm8,%xmm3 - paddd %xmm11,%xmm10 - pxor %xmm10,%xmm1 - pshufb %xmm7,%xmm3 - pshufb %xmm7,%xmm1 - paddd %xmm3,%xmm2 - paddd %xmm1,%xmm0 - pxor %xmm2,%xmm9 - pxor %xmm0,%xmm11 - movdqa %xmm9,%xmm4 - psrld $25,%xmm9 - movdqa %xmm11,%xmm5 - pslld $7,%xmm4 - psrld $25,%xmm11 - por %xmm4,%xmm9 - pslld $7,%xmm5 - por %xmm5,%xmm11 - pshufd $57,%xmm8,%xmm8 - pshufd $78,%xmm3,%xmm3 - pshufd $147,%xmm2,%xmm2 - pshufd $57,%xmm10,%xmm10 - pshufd $78,%xmm1,%xmm1 - pshufd $147,%xmm0,%xmm0 - dec %r8 - jnz .Loop_128 - paddd 0x00(%rsp),%xmm8 - paddd 0x10(%rsp),%xmm9 - paddd 0x20(%rsp),%xmm2 - paddd 0x30(%rsp),%xmm3 - paddd .Lone(%rip),%xmm1 - paddd 0x00(%rsp),%xmm10 - paddd 0x10(%rsp),%xmm11 - paddd 0x20(%rsp),%xmm0 - paddd 0x30(%rsp),%xmm1 - - movdqu 0x00(%rsi),%xmm4 - movdqu 0x10(%rsi),%xmm5 - pxor %xmm4,%xmm8 # xor with input - movdqu 0x20(%rsi),%xmm4 - pxor %xmm5,%xmm9 - movdqu 0x30(%rsi),%xmm5 - pxor %xmm4,%xmm2 - movdqu 0x40(%rsi),%xmm4 - pxor %xmm5,%xmm3 - movdqu 0x50(%rsi),%xmm5 - pxor %xmm4,%xmm10 - movdqu 0x60(%rsi),%xmm4 
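Structurally, every code path in this deleted file does the same thing: generate 64-byte ChaCha20 keystream blocks, XOR them into the input, and fall back to a byte-at-a-time loop (the .Loop_tail_* labels) for a final partial block, XORing against a keystream copy spilled to the stack exactly as the movzb/xor/mov sequence above does. In portable C the driver loop is roughly the following sketch, where chacha20_block() is a hypothetical one-block keystream generator standing in for the assembly:

#include <stddef.h>
#include <stdint.h>

/* Stand-in for the one-block generators above: writes 64 bytes of
 * keystream and advances the block counter held in state[12]. */
void chacha20_block(uint8_t stream[64], uint32_t state[16]);

static void
chacha20_xor(uint8_t *out, const uint8_t *in, size_t len, uint32_t state[16])
{
	uint8_t stream[64];
	size_t i;

	/* Full 64-byte blocks: XOR keystream directly into the data. */
	while (len >= 64) {
		chacha20_block(stream, state);
		for (i = 0; i < 64; i++)
			out[i] = in[i] ^ stream[i];
		in += 64; out += 64; len -= 64;
	}
	/* Partial tail, the C analogue of the .Loop_tail_* byte loops. */
	if (len > 0) {
		chacha20_block(stream, state);
		for (i = 0; i < len; i++)
			out[i] = in[i] ^ stream[i];
	}
}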
- pxor %xmm5,%xmm11 - movdqu 0x70(%rsi),%xmm5 - pxor %xmm4,%xmm0 - pxor %xmm5,%xmm1 - - movdqu %xmm8,0x00(%rdi) # write output - movdqu %xmm9,0x10(%rdi) - movdqu %xmm2,0x20(%rdi) - movdqu %xmm3,0x30(%rdi) - movdqu %xmm10,0x40(%rdi) - movdqu %xmm11,0x50(%rdi) - movdqu %xmm0,0x60(%rdi) - movdqu %xmm1,0x70(%rdi) - lea -8(%r10),%rsp -.L128_epilogue: - ret -.size chacha20_128,.-chacha20_128 -.type chacha20_4x,@function -.align 32 -chacha20_4x: -.Lchacha20_4x: - lea 8(%rsp),%r10 # frame pointer - cmp $192,%rdx - ja .Lproceed4x -.Lproceed4x: - sub $0x140+8,%rsp - and $-16,%rsp - movdqa .Lsigma(%rip),%xmm11 # key[0] - movdqu (%rcx),%xmm15 # key[1] - movdqu 16(%rcx),%xmm7 # key[2] - movdqu (%r8),%xmm3 # key[3] - lea 0x100(%rsp),%rcx # size optimization - lea .Lrot16(%rip),%r9 - lea .Lrot24(%rip),%r11 - - pshufd $0x00,%xmm11,%xmm8 # smash key by lanes... - pshufd $0x55,%xmm11,%xmm9 - movdqa %xmm8,0x40(%rsp) # ... and offload - pshufd $0xaa,%xmm11,%xmm10 - movdqa %xmm9,0x50(%rsp) - pshufd $0xff,%xmm11,%xmm11 - movdqa %xmm10,0x60(%rsp) - movdqa %xmm11,0x70(%rsp) - - pshufd $0x00,%xmm15,%xmm12 - pshufd $0x55,%xmm15,%xmm13 - movdqa %xmm12,0x80-0x100(%rcx) - pshufd $0xaa,%xmm15,%xmm14 - movdqa %xmm13,0x90-0x100(%rcx) - pshufd $0xff,%xmm15,%xmm15 - movdqa %xmm14,0xa0-0x100(%rcx) - movdqa %xmm15,0xb0-0x100(%rcx) - - pshufd $0x00,%xmm7,%xmm4 # "" - pshufd $0x55,%xmm7,%xmm5 # "" - movdqa %xmm4,0xc0-0x100(%rcx) - pshufd $0xaa,%xmm7,%xmm6 # "" - movdqa %xmm5,0xd0-0x100(%rcx) - pshufd $0xff,%xmm7,%xmm7 # "" - movdqa %xmm6,0xe0-0x100(%rcx) - movdqa %xmm7,0xf0-0x100(%rcx) - - pshufd $0x00,%xmm3,%xmm0 - pshufd $0x55,%xmm3,%xmm1 - paddd .Linc(%rip),%xmm0 # don't save counters yet - pshufd $0xaa,%xmm3,%xmm2 - movdqa %xmm1,0x110-0x100(%rcx) - pshufd $0xff,%xmm3,%xmm3 - movdqa %xmm2,0x120-0x100(%rcx) - movdqa %xmm3,0x130-0x100(%rcx) - - jmp .Loop_enter4x - -.align 32 -.Loop_outer4x: - movdqa 0x40(%rsp),%xmm8 # re-load smashed key - movdqa 0x50(%rsp),%xmm9 - movdqa 0x60(%rsp),%xmm10 - movdqa 0x70(%rsp),%xmm11 - movdqa 0x80-0x100(%rcx),%xmm12 - movdqa 0x90-0x100(%rcx),%xmm13 - movdqa 0xa0-0x100(%rcx),%xmm14 - movdqa 0xb0-0x100(%rcx),%xmm15 - movdqa 0xc0-0x100(%rcx),%xmm4 # "" - movdqa 0xd0-0x100(%rcx),%xmm5 # "" - movdqa 0xe0-0x100(%rcx),%xmm6 # "" - movdqa 0xf0-0x100(%rcx),%xmm7 # "" - movdqa 0x100-0x100(%rcx),%xmm0 - movdqa 0x110-0x100(%rcx),%xmm1 - movdqa 0x120-0x100(%rcx),%xmm2 - movdqa 0x130-0x100(%rcx),%xmm3 - paddd .Lfour(%rip),%xmm0 # next SIMD counters - -.Loop_enter4x: - movdqa %xmm6,0x20(%rsp) # SIMD equivalent of "%nox" - movdqa %xmm7,0x30(%rsp) # SIMD equivalent of "%nox" - movdqa (%r9),%xmm7 # .Lrot16(%rip) - mov $10,%eax - movdqa %xmm0,0x100-0x100(%rcx) # save SIMD counters - jmp .Loop4x - -.align 32 -.Loop4x: - paddd %xmm12,%xmm8 - paddd %xmm13,%xmm9 - pxor %xmm8,%xmm0 - pxor %xmm9,%xmm1 - pshufb %xmm7,%xmm0 - pshufb %xmm7,%xmm1 - paddd %xmm0,%xmm4 - paddd %xmm1,%xmm5 - pxor %xmm4,%xmm12 - pxor %xmm5,%xmm13 - movdqa %xmm12,%xmm6 - pslld $12,%xmm12 - psrld $20,%xmm6 - movdqa %xmm13,%xmm7 - pslld $12,%xmm13 - por %xmm6,%xmm12 - psrld $20,%xmm7 - movdqa (%r11),%xmm6 - por %xmm7,%xmm13 - paddd %xmm12,%xmm8 - paddd %xmm13,%xmm9 - pxor %xmm8,%xmm0 - pxor %xmm9,%xmm1 - pshufb %xmm6,%xmm0 - pshufb %xmm6,%xmm1 - paddd %xmm0,%xmm4 - paddd %xmm1,%xmm5 - pxor %xmm4,%xmm12 - pxor %xmm5,%xmm13 - movdqa %xmm12,%xmm7 - pslld $7,%xmm12 - psrld $25,%xmm7 - movdqa %xmm13,%xmm6 - pslld $7,%xmm13 - por %xmm7,%xmm12 - psrld $25,%xmm6 - movdqa (%r9),%xmm7 - por %xmm6,%xmm13 - movdqa %xmm4,0(%rsp) - movdqa %xmm5,16(%rsp) - 
movdqa 32(%rsp),%xmm4 - movdqa 48(%rsp),%xmm5 - paddd %xmm14,%xmm10 - paddd %xmm15,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm3 - pshufb %xmm7,%xmm2 - pshufb %xmm7,%xmm3 - paddd %xmm2,%xmm4 - paddd %xmm3,%xmm5 - pxor %xmm4,%xmm14 - pxor %xmm5,%xmm15 - movdqa %xmm14,%xmm6 - pslld $12,%xmm14 - psrld $20,%xmm6 - movdqa %xmm15,%xmm7 - pslld $12,%xmm15 - por %xmm6,%xmm14 - psrld $20,%xmm7 - movdqa (%r11),%xmm6 - por %xmm7,%xmm15 - paddd %xmm14,%xmm10 - paddd %xmm15,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm3 - pshufb %xmm6,%xmm2 - pshufb %xmm6,%xmm3 - paddd %xmm2,%xmm4 - paddd %xmm3,%xmm5 - pxor %xmm4,%xmm14 - pxor %xmm5,%xmm15 - movdqa %xmm14,%xmm7 - pslld $7,%xmm14 - psrld $25,%xmm7 - movdqa %xmm15,%xmm6 - pslld $7,%xmm15 - por %xmm7,%xmm14 - psrld $25,%xmm6 - movdqa (%r9),%xmm7 - por %xmm6,%xmm15 - paddd %xmm13,%xmm8 - paddd %xmm14,%xmm9 - pxor %xmm8,%xmm3 - pxor %xmm9,%xmm0 - pshufb %xmm7,%xmm3 - pshufb %xmm7,%xmm0 - paddd %xmm3,%xmm4 - paddd %xmm0,%xmm5 - pxor %xmm4,%xmm13 - pxor %xmm5,%xmm14 - movdqa %xmm13,%xmm6 - pslld $12,%xmm13 - psrld $20,%xmm6 - movdqa %xmm14,%xmm7 - pslld $12,%xmm14 - por %xmm6,%xmm13 - psrld $20,%xmm7 - movdqa (%r11),%xmm6 - por %xmm7,%xmm14 - paddd %xmm13,%xmm8 - paddd %xmm14,%xmm9 - pxor %xmm8,%xmm3 - pxor %xmm9,%xmm0 - pshufb %xmm6,%xmm3 - pshufb %xmm6,%xmm0 - paddd %xmm3,%xmm4 - paddd %xmm0,%xmm5 - pxor %xmm4,%xmm13 - pxor %xmm5,%xmm14 - movdqa %xmm13,%xmm7 - pslld $7,%xmm13 - psrld $25,%xmm7 - movdqa %xmm14,%xmm6 - pslld $7,%xmm14 - por %xmm7,%xmm13 - psrld $25,%xmm6 - movdqa (%r9),%xmm7 - por %xmm6,%xmm14 - movdqa %xmm4,32(%rsp) - movdqa %xmm5,48(%rsp) - movdqa 0(%rsp),%xmm4 - movdqa 16(%rsp),%xmm5 - paddd %xmm15,%xmm10 - paddd %xmm12,%xmm11 - pxor %xmm10,%xmm1 - pxor %xmm11,%xmm2 - pshufb %xmm7,%xmm1 - pshufb %xmm7,%xmm2 - paddd %xmm1,%xmm4 - paddd %xmm2,%xmm5 - pxor %xmm4,%xmm15 - pxor %xmm5,%xmm12 - movdqa %xmm15,%xmm6 - pslld $12,%xmm15 - psrld $20,%xmm6 - movdqa %xmm12,%xmm7 - pslld $12,%xmm12 - por %xmm6,%xmm15 - psrld $20,%xmm7 - movdqa (%r11),%xmm6 - por %xmm7,%xmm12 - paddd %xmm15,%xmm10 - paddd %xmm12,%xmm11 - pxor %xmm10,%xmm1 - pxor %xmm11,%xmm2 - pshufb %xmm6,%xmm1 - pshufb %xmm6,%xmm2 - paddd %xmm1,%xmm4 - paddd %xmm2,%xmm5 - pxor %xmm4,%xmm15 - pxor %xmm5,%xmm12 - movdqa %xmm15,%xmm7 - pslld $7,%xmm15 - psrld $25,%xmm7 - movdqa %xmm12,%xmm6 - pslld $7,%xmm12 - por %xmm7,%xmm15 - psrld $25,%xmm6 - movdqa (%r9),%xmm7 - por %xmm6,%xmm12 - dec %eax - jnz .Loop4x - - paddd 0x40(%rsp),%xmm8 # accumulate key material - paddd 0x50(%rsp),%xmm9 - paddd 0x60(%rsp),%xmm10 - paddd 0x70(%rsp),%xmm11 - - movdqa %xmm8,%xmm6 # "de-interlace" data - punpckldq %xmm9,%xmm8 - movdqa %xmm10,%xmm7 - punpckldq %xmm11,%xmm10 - punpckhdq %xmm9,%xmm6 - punpckhdq %xmm11,%xmm7 - movdqa %xmm8,%xmm9 - punpcklqdq %xmm10,%xmm8 # "a0" - movdqa %xmm6,%xmm11 - punpcklqdq %xmm7,%xmm6 # "a2" - punpckhqdq %xmm10,%xmm9 # "a1" - punpckhqdq %xmm7,%xmm11 # "a3" - paddd 0x80-0x100(%rcx),%xmm12 - paddd 0x90-0x100(%rcx),%xmm13 - paddd 0xa0-0x100(%rcx),%xmm14 - paddd 0xb0-0x100(%rcx),%xmm15 - - movdqa %xmm8,0x00(%rsp) # offload - movdqa %xmm9,0x10(%rsp) - movdqa 0x20(%rsp),%xmm8 # "xc2" - movdqa 0x30(%rsp),%xmm9 # "xc3" - - movdqa %xmm12,%xmm10 - punpckldq %xmm13,%xmm12 - movdqa %xmm14,%xmm7 - punpckldq %xmm15,%xmm14 - punpckhdq %xmm13,%xmm10 - punpckhdq %xmm15,%xmm7 - movdqa %xmm12,%xmm13 - punpcklqdq %xmm14,%xmm12 # "b0" - movdqa %xmm10,%xmm15 - punpcklqdq %xmm7,%xmm10 # "b2" - punpckhqdq %xmm14,%xmm13 # "b1" - punpckhqdq %xmm7,%xmm15 # "b3" - paddd 0xc0-0x100(%rcx),%xmm4 - paddd 
0xd0-0x100(%rcx),%xmm5 - paddd 0xe0-0x100(%rcx),%xmm8 - paddd 0xf0-0x100(%rcx),%xmm9 - - movdqa %xmm6,0x20(%rsp) # keep offloading - movdqa %xmm11,0x30(%rsp) - - movdqa %xmm4,%xmm14 - punpckldq %xmm5,%xmm4 - movdqa %xmm8,%xmm7 - punpckldq %xmm9,%xmm8 - punpckhdq %xmm5,%xmm14 - punpckhdq %xmm9,%xmm7 - movdqa %xmm4,%xmm5 - punpcklqdq %xmm8,%xmm4 # "c0" - movdqa %xmm14,%xmm9 - punpcklqdq %xmm7,%xmm14 # "c2" - punpckhqdq %xmm8,%xmm5 # "c1" - punpckhqdq %xmm7,%xmm9 # "c3" - paddd 0x100-0x100(%rcx),%xmm0 - paddd 0x110-0x100(%rcx),%xmm1 - paddd 0x120-0x100(%rcx),%xmm2 - paddd 0x130-0x100(%rcx),%xmm3 - - movdqa %xmm0,%xmm8 - punpckldq %xmm1,%xmm0 - movdqa %xmm2,%xmm7 - punpckldq %xmm3,%xmm2 - punpckhdq %xmm1,%xmm8 - punpckhdq %xmm3,%xmm7 - movdqa %xmm0,%xmm1 - punpcklqdq %xmm2,%xmm0 # "d0" - movdqa %xmm8,%xmm3 - punpcklqdq %xmm7,%xmm8 # "d2" - punpckhqdq %xmm2,%xmm1 # "d1" - punpckhqdq %xmm7,%xmm3 # "d3" - cmp $64*4,%rdx - jb .Ltail4x - - movdqu 0x00(%rsi),%xmm6 # xor with input - movdqu 0x10(%rsi),%xmm11 - movdqu 0x20(%rsi),%xmm2 - movdqu 0x30(%rsi),%xmm7 - pxor 0x00(%rsp),%xmm6 # is offloaded, remember? - pxor %xmm12,%xmm11 - pxor %xmm4,%xmm2 - pxor %xmm0,%xmm7 - - movdqu %xmm6,0x00(%rdi) - movdqu 0x40(%rsi),%xmm6 - movdqu %xmm11,0x10(%rdi) - movdqu 0x50(%rsi),%xmm11 - movdqu %xmm2,0x20(%rdi) - movdqu 0x60(%rsi),%xmm2 - movdqu %xmm7,0x30(%rdi) - movdqu 0x70(%rsi),%xmm7 - lea 0x80(%rsi),%rsi # size optimization - pxor 0x10(%rsp),%xmm6 - pxor %xmm13,%xmm11 - pxor %xmm5,%xmm2 - pxor %xmm1,%xmm7 - - movdqu %xmm6,0x40(%rdi) - movdqu 0x00(%rsi),%xmm6 - movdqu %xmm11,0x50(%rdi) - movdqu 0x10(%rsi),%xmm11 - movdqu %xmm2,0x60(%rdi) - movdqu 0x20(%rsi),%xmm2 - movdqu %xmm7,0x70(%rdi) - lea 0x80(%rdi),%rdi # size optimization - movdqu 0x30(%rsi),%xmm7 - pxor 0x20(%rsp),%xmm6 - pxor %xmm10,%xmm11 - pxor %xmm14,%xmm2 - pxor %xmm8,%xmm7 - - movdqu %xmm6,0x00(%rdi) - movdqu 0x40(%rsi),%xmm6 - movdqu %xmm11,0x10(%rdi) - movdqu 0x50(%rsi),%xmm11 - movdqu %xmm2,0x20(%rdi) - movdqu 0x60(%rsi),%xmm2 - movdqu %xmm7,0x30(%rdi) - movdqu 0x70(%rsi),%xmm7 - lea 0x80(%rsi),%rsi # inp+=64*4 - pxor 0x30(%rsp),%xmm6 - pxor %xmm15,%xmm11 - pxor %xmm9,%xmm2 - pxor %xmm3,%xmm7 - movdqu %xmm6,0x40(%rdi) - movdqu %xmm11,0x50(%rdi) - movdqu %xmm2,0x60(%rdi) - movdqu %xmm7,0x70(%rdi) - lea 0x80(%rdi),%rdi # out+=64*4 - - sub $64*4,%rdx - jnz .Loop_outer4x - - jmp .Ldone4x - -.Ltail4x: - cmp $192,%rdx - jae .L192_or_more4x - cmp $128,%rdx - jae .L128_or_more4x - cmp $64,%rdx - jae .L64_or_more4x - - #movdqa 0x00(%rsp),%xmm6 # is offloaded, remember? - xor %r9,%r9 - #movdqa %xmm6,0x00(%rsp) - movdqa %xmm12,0x10(%rsp) - movdqa %xmm4,0x20(%rsp) - movdqa %xmm0,0x30(%rsp) - jmp .Loop_tail4x - -.align 32 -.L64_or_more4x: - movdqu 0x00(%rsi),%xmm6 # xor with input - movdqu 0x10(%rsi),%xmm11 - movdqu 0x20(%rsi),%xmm2 - movdqu 0x30(%rsi),%xmm7 - pxor 0x00(%rsp),%xmm6 # is offloaded, remember? - pxor %xmm12,%xmm11 - pxor %xmm4,%xmm2 - pxor %xmm0,%xmm7 - movdqu %xmm6,0x00(%rdi) - movdqu %xmm11,0x10(%rdi) - movdqu %xmm2,0x20(%rdi) - movdqu %xmm7,0x30(%rdi) - je .Ldone4x - - movdqa 0x10(%rsp),%xmm6 # is offloaded, remember? 
- lea 0x40(%rsi),%rsi # inp+=64*1 - xor %r9,%r9 - movdqa %xmm6,0x00(%rsp) - movdqa %xmm13,0x10(%rsp) - lea 0x40(%rdi),%rdi # out+=64*1 - movdqa %xmm5,0x20(%rsp) - sub $64,%rdx # len-=64*1 - movdqa %xmm1,0x30(%rsp) - jmp .Loop_tail4x - -.align 32 -.L128_or_more4x: - movdqu 0x00(%rsi),%xmm6 # xor with input - movdqu 0x10(%rsi),%xmm11 - movdqu 0x20(%rsi),%xmm2 - movdqu 0x30(%rsi),%xmm7 - pxor 0x00(%rsp),%xmm6 # is offloaded, remember? - pxor %xmm12,%xmm11 - pxor %xmm4,%xmm2 - pxor %xmm0,%xmm7 - - movdqu %xmm6,0x00(%rdi) - movdqu 0x40(%rsi),%xmm6 - movdqu %xmm11,0x10(%rdi) - movdqu 0x50(%rsi),%xmm11 - movdqu %xmm2,0x20(%rdi) - movdqu 0x60(%rsi),%xmm2 - movdqu %xmm7,0x30(%rdi) - movdqu 0x70(%rsi),%xmm7 - pxor 0x10(%rsp),%xmm6 - pxor %xmm13,%xmm11 - pxor %xmm5,%xmm2 - pxor %xmm1,%xmm7 - movdqu %xmm6,0x40(%rdi) - movdqu %xmm11,0x50(%rdi) - movdqu %xmm2,0x60(%rdi) - movdqu %xmm7,0x70(%rdi) - je .Ldone4x - - movdqa 0x20(%rsp),%xmm6 # is offloaded, remember? - lea 0x80(%rsi),%rsi # inp+=64*2 - xor %r9,%r9 - movdqa %xmm6,0x00(%rsp) - movdqa %xmm10,0x10(%rsp) - lea 0x80(%rdi),%rdi # out+=64*2 - movdqa %xmm14,0x20(%rsp) - sub $128,%rdx # len-=64*2 - movdqa %xmm8,0x30(%rsp) - jmp .Loop_tail4x - -.align 32 -.L192_or_more4x: - movdqu 0x00(%rsi),%xmm6 # xor with input - movdqu 0x10(%rsi),%xmm11 - movdqu 0x20(%rsi),%xmm2 - movdqu 0x30(%rsi),%xmm7 - pxor 0x00(%rsp),%xmm6 # is offloaded, remember? - pxor %xmm12,%xmm11 - pxor %xmm4,%xmm2 - pxor %xmm0,%xmm7 - - movdqu %xmm6,0x00(%rdi) - movdqu 0x40(%rsi),%xmm6 - movdqu %xmm11,0x10(%rdi) - movdqu 0x50(%rsi),%xmm11 - movdqu %xmm2,0x20(%rdi) - movdqu 0x60(%rsi),%xmm2 - movdqu %xmm7,0x30(%rdi) - movdqu 0x70(%rsi),%xmm7 - lea 0x80(%rsi),%rsi # size optimization - pxor 0x10(%rsp),%xmm6 - pxor %xmm13,%xmm11 - pxor %xmm5,%xmm2 - pxor %xmm1,%xmm7 - - movdqu %xmm6,0x40(%rdi) - movdqu 0x00(%rsi),%xmm6 - movdqu %xmm11,0x50(%rdi) - movdqu 0x10(%rsi),%xmm11 - movdqu %xmm2,0x60(%rdi) - movdqu 0x20(%rsi),%xmm2 - movdqu %xmm7,0x70(%rdi) - lea 0x80(%rdi),%rdi # size optimization - movdqu 0x30(%rsi),%xmm7 - pxor 0x20(%rsp),%xmm6 - pxor %xmm10,%xmm11 - pxor %xmm14,%xmm2 - pxor %xmm8,%xmm7 - movdqu %xmm6,0x00(%rdi) - movdqu %xmm11,0x10(%rdi) - movdqu %xmm2,0x20(%rdi) - movdqu %xmm7,0x30(%rdi) - je .Ldone4x - - movdqa 0x30(%rsp),%xmm6 # is offloaded, remember? - lea 0x40(%rsi),%rsi # inp+=64*3 - xor %r9,%r9 - movdqa %xmm6,0x00(%rsp) - movdqa %xmm15,0x10(%rsp) - lea 0x40(%rdi),%rdi # out+=64*3 - movdqa %xmm9,0x20(%rsp) - sub $192,%rdx # len-=64*3 - movdqa %xmm3,0x30(%rsp) - -.Loop_tail4x: - movzb (%rsi,%r9),%eax - movzb (%rsp,%r9),%ecx - lea 1(%r9),%r9 - xor %ecx,%eax - mov %al,-1(%rdi,%r9) - dec %rdx - jnz .Loop_tail4x - -.Ldone4x: - lea -8(%r10),%rsp -.L4x_epilogue: - ret -.size chacha20_4x,.-chacha20_4x -#endif -#ifdef CONFIG_AS_AVX2 -.align 32 -SYM_FUNC_START(chacha20_avx2) -.Lchacha20_avx2: -.Lchacha20_8x: - lea 8(%rsp),%r10 # frame register - sub $0x280+8,%rsp - and $-32,%rsp - vzeroupper - - ################ stack layout - # +0x00 SIMD equivalent of %r12d - # ... - # +0x80 constant copy of key[0-2] smashed by lanes - # ... - # +0x200 SIMD counters (with nonce smashed by lanes) - # ... - # +0x280 - - vbroadcasti128 .Lsigma(%rip),%ymm11 # key[0] - vbroadcasti128 (%rcx),%ymm3 # key[1] - vbroadcasti128 16(%rcx),%ymm15 # key[2] - vbroadcasti128 (%r8),%ymm7 # key[3] - lea 0x100(%rsp),%rcx # size optimization - lea 0x200(%rsp),%rax # size optimization - lea .Lrot16(%rip),%r9 - lea .Lrot24(%rip),%r11 - - vpshufd $0x00,%ymm11,%ymm8 # smash key by lanes... 
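The "smash key by lanes" comments mark the data layout that makes the 4x/8x/16x paths work: instead of keeping one ChaCha20 state per register, each SIMD register holds the same state word from four (or eight, or sixteen) independent blocks, with the per-block counters pre-staggered via the .Linc/.Lincy/.Lincz constants. A rough scalar sketch of that layout for a hypothetical 4-lane path:

#include <stdint.h>

/* Lane-interleaved state: x[w] packs word `w` of four consecutive
 * blocks, one block per SIMD lane. */
static void
smash_by_lanes(uint32_t x[16][4], const uint32_t state[16])
{
	for (int w = 0; w < 16; w++)
		for (int lane = 0; lane < 4; lane++)
			x[w][lane] = state[w];
	/* Word 12 is the block counter; stagger it per lane (.Linc). */
	for (int lane = 0; lane < 4; lane++)
		x[12][lane] += lane;
}

After the rounds, the punpck*/vperm2i128/vshufi32x4 "de-interlace" sequences in this file transpose the lanes back into contiguous 64-byte blocks before XORing with the input.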
- vpshufd $0x55,%ymm11,%ymm9 - vmovdqa %ymm8,0x80-0x100(%rcx) # ... and offload - vpshufd $0xaa,%ymm11,%ymm10 - vmovdqa %ymm9,0xa0-0x100(%rcx) - vpshufd $0xff,%ymm11,%ymm11 - vmovdqa %ymm10,0xc0-0x100(%rcx) - vmovdqa %ymm11,0xe0-0x100(%rcx) - - vpshufd $0x00,%ymm3,%ymm0 - vpshufd $0x55,%ymm3,%ymm1 - vmovdqa %ymm0,0x100-0x100(%rcx) - vpshufd $0xaa,%ymm3,%ymm2 - vmovdqa %ymm1,0x120-0x100(%rcx) - vpshufd $0xff,%ymm3,%ymm3 - vmovdqa %ymm2,0x140-0x100(%rcx) - vmovdqa %ymm3,0x160-0x100(%rcx) - - vpshufd $0x00,%ymm15,%ymm12 # "xc0" - vpshufd $0x55,%ymm15,%ymm13 # "xc1" - vmovdqa %ymm12,0x180-0x200(%rax) - vpshufd $0xaa,%ymm15,%ymm14 # "xc2" - vmovdqa %ymm13,0x1a0-0x200(%rax) - vpshufd $0xff,%ymm15,%ymm15 # "xc3" - vmovdqa %ymm14,0x1c0-0x200(%rax) - vmovdqa %ymm15,0x1e0-0x200(%rax) - - vpshufd $0x00,%ymm7,%ymm4 - vpshufd $0x55,%ymm7,%ymm5 - vpaddd .Lincy(%rip),%ymm4,%ymm4 # don't save counters yet - vpshufd $0xaa,%ymm7,%ymm6 - vmovdqa %ymm5,0x220-0x200(%rax) - vpshufd $0xff,%ymm7,%ymm7 - vmovdqa %ymm6,0x240-0x200(%rax) - vmovdqa %ymm7,0x260-0x200(%rax) - - jmp .Loop_enter8x - -.align 32 -.Loop_outer8x: - vmovdqa 0x80-0x100(%rcx),%ymm8 # re-load smashed key - vmovdqa 0xa0-0x100(%rcx),%ymm9 - vmovdqa 0xc0-0x100(%rcx),%ymm10 - vmovdqa 0xe0-0x100(%rcx),%ymm11 - vmovdqa 0x100-0x100(%rcx),%ymm0 - vmovdqa 0x120-0x100(%rcx),%ymm1 - vmovdqa 0x140-0x100(%rcx),%ymm2 - vmovdqa 0x160-0x100(%rcx),%ymm3 - vmovdqa 0x180-0x200(%rax),%ymm12 # "xc0" - vmovdqa 0x1a0-0x200(%rax),%ymm13 # "xc1" - vmovdqa 0x1c0-0x200(%rax),%ymm14 # "xc2" - vmovdqa 0x1e0-0x200(%rax),%ymm15 # "xc3" - vmovdqa 0x200-0x200(%rax),%ymm4 - vmovdqa 0x220-0x200(%rax),%ymm5 - vmovdqa 0x240-0x200(%rax),%ymm6 - vmovdqa 0x260-0x200(%rax),%ymm7 - vpaddd .Leight(%rip),%ymm4,%ymm4 # next SIMD counters - -.Loop_enter8x: - vmovdqa %ymm14,0x40(%rsp) # SIMD equivalent of "%nox" - vmovdqa %ymm15,0x60(%rsp) # SIMD equivalent of "%nox" - vbroadcasti128 (%r9),%ymm15 - vmovdqa %ymm4,0x200-0x200(%rax) # save SIMD counters - mov $10,%eax - jmp .Loop8x - -.align 32 -.Loop8x: - vpaddd %ymm0,%ymm8,%ymm8 - vpxor %ymm4,%ymm8,%ymm4 - vpshufb %ymm15,%ymm4,%ymm4 - vpaddd %ymm1,%ymm9,%ymm9 - vpxor %ymm5,%ymm9,%ymm5 - vpshufb %ymm15,%ymm5,%ymm5 - vpaddd %ymm4,%ymm12,%ymm12 - vpxor %ymm0,%ymm12,%ymm0 - vpslld $12,%ymm0,%ymm14 - vpsrld $20,%ymm0,%ymm0 - vpor %ymm0,%ymm14,%ymm0 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm5,%ymm13,%ymm13 - vpxor %ymm1,%ymm13,%ymm1 - vpslld $12,%ymm1,%ymm15 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm1,%ymm15,%ymm1 - vpaddd %ymm0,%ymm8,%ymm8 - vpxor %ymm4,%ymm8,%ymm4 - vpshufb %ymm14,%ymm4,%ymm4 - vpaddd %ymm1,%ymm9,%ymm9 - vpxor %ymm5,%ymm9,%ymm5 - vpshufb %ymm14,%ymm5,%ymm5 - vpaddd %ymm4,%ymm12,%ymm12 - vpxor %ymm0,%ymm12,%ymm0 - vpslld $7,%ymm0,%ymm15 - vpsrld $25,%ymm0,%ymm0 - vpor %ymm0,%ymm15,%ymm0 - vbroadcasti128 (%r9),%ymm15 - vpaddd %ymm5,%ymm13,%ymm13 - vpxor %ymm1,%ymm13,%ymm1 - vpslld $7,%ymm1,%ymm14 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm1,%ymm14,%ymm1 - vmovdqa %ymm12,0(%rsp) - vmovdqa %ymm13,32(%rsp) - vmovdqa 64(%rsp),%ymm12 - vmovdqa 96(%rsp),%ymm13 - vpaddd %ymm2,%ymm10,%ymm10 - vpxor %ymm6,%ymm10,%ymm6 - vpshufb %ymm15,%ymm6,%ymm6 - vpaddd %ymm3,%ymm11,%ymm11 - vpxor %ymm7,%ymm11,%ymm7 - vpshufb %ymm15,%ymm7,%ymm7 - vpaddd %ymm6,%ymm12,%ymm12 - vpxor %ymm2,%ymm12,%ymm2 - vpslld $12,%ymm2,%ymm14 - vpsrld $20,%ymm2,%ymm2 - vpor %ymm2,%ymm14,%ymm2 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm7,%ymm13,%ymm13 - vpxor %ymm3,%ymm13,%ymm3 - vpslld $12,%ymm3,%ymm15 - vpsrld $20,%ymm3,%ymm3 - vpor %ymm3,%ymm15,%ymm3 - vpaddd %ymm2,%ymm10,%ymm10 - 
vpxor %ymm6,%ymm10,%ymm6 - vpshufb %ymm14,%ymm6,%ymm6 - vpaddd %ymm3,%ymm11,%ymm11 - vpxor %ymm7,%ymm11,%ymm7 - vpshufb %ymm14,%ymm7,%ymm7 - vpaddd %ymm6,%ymm12,%ymm12 - vpxor %ymm2,%ymm12,%ymm2 - vpslld $7,%ymm2,%ymm15 - vpsrld $25,%ymm2,%ymm2 - vpor %ymm2,%ymm15,%ymm2 - vbroadcasti128 (%r9),%ymm15 - vpaddd %ymm7,%ymm13,%ymm13 - vpxor %ymm3,%ymm13,%ymm3 - vpslld $7,%ymm3,%ymm14 - vpsrld $25,%ymm3,%ymm3 - vpor %ymm3,%ymm14,%ymm3 - vpaddd %ymm1,%ymm8,%ymm8 - vpxor %ymm7,%ymm8,%ymm7 - vpshufb %ymm15,%ymm7,%ymm7 - vpaddd %ymm2,%ymm9,%ymm9 - vpxor %ymm4,%ymm9,%ymm4 - vpshufb %ymm15,%ymm4,%ymm4 - vpaddd %ymm7,%ymm12,%ymm12 - vpxor %ymm1,%ymm12,%ymm1 - vpslld $12,%ymm1,%ymm14 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm1,%ymm14,%ymm1 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm4,%ymm13,%ymm13 - vpxor %ymm2,%ymm13,%ymm2 - vpslld $12,%ymm2,%ymm15 - vpsrld $20,%ymm2,%ymm2 - vpor %ymm2,%ymm15,%ymm2 - vpaddd %ymm1,%ymm8,%ymm8 - vpxor %ymm7,%ymm8,%ymm7 - vpshufb %ymm14,%ymm7,%ymm7 - vpaddd %ymm2,%ymm9,%ymm9 - vpxor %ymm4,%ymm9,%ymm4 - vpshufb %ymm14,%ymm4,%ymm4 - vpaddd %ymm7,%ymm12,%ymm12 - vpxor %ymm1,%ymm12,%ymm1 - vpslld $7,%ymm1,%ymm15 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm1,%ymm15,%ymm1 - vbroadcasti128 (%r9),%ymm15 - vpaddd %ymm4,%ymm13,%ymm13 - vpxor %ymm2,%ymm13,%ymm2 - vpslld $7,%ymm2,%ymm14 - vpsrld $25,%ymm2,%ymm2 - vpor %ymm2,%ymm14,%ymm2 - vmovdqa %ymm12,64(%rsp) - vmovdqa %ymm13,96(%rsp) - vmovdqa 0(%rsp),%ymm12 - vmovdqa 32(%rsp),%ymm13 - vpaddd %ymm3,%ymm10,%ymm10 - vpxor %ymm5,%ymm10,%ymm5 - vpshufb %ymm15,%ymm5,%ymm5 - vpaddd %ymm0,%ymm11,%ymm11 - vpxor %ymm6,%ymm11,%ymm6 - vpshufb %ymm15,%ymm6,%ymm6 - vpaddd %ymm5,%ymm12,%ymm12 - vpxor %ymm3,%ymm12,%ymm3 - vpslld $12,%ymm3,%ymm14 - vpsrld $20,%ymm3,%ymm3 - vpor %ymm3,%ymm14,%ymm3 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm6,%ymm13,%ymm13 - vpxor %ymm0,%ymm13,%ymm0 - vpslld $12,%ymm0,%ymm15 - vpsrld $20,%ymm0,%ymm0 - vpor %ymm0,%ymm15,%ymm0 - vpaddd %ymm3,%ymm10,%ymm10 - vpxor %ymm5,%ymm10,%ymm5 - vpshufb %ymm14,%ymm5,%ymm5 - vpaddd %ymm0,%ymm11,%ymm11 - vpxor %ymm6,%ymm11,%ymm6 - vpshufb %ymm14,%ymm6,%ymm6 - vpaddd %ymm5,%ymm12,%ymm12 - vpxor %ymm3,%ymm12,%ymm3 - vpslld $7,%ymm3,%ymm15 - vpsrld $25,%ymm3,%ymm3 - vpor %ymm3,%ymm15,%ymm3 - vbroadcasti128 (%r9),%ymm15 - vpaddd %ymm6,%ymm13,%ymm13 - vpxor %ymm0,%ymm13,%ymm0 - vpslld $7,%ymm0,%ymm14 - vpsrld $25,%ymm0,%ymm0 - vpor %ymm0,%ymm14,%ymm0 - dec %eax - jnz .Loop8x - - lea 0x200(%rsp),%rax # size optimization - vpaddd 0x80-0x100(%rcx),%ymm8,%ymm8 # accumulate key - vpaddd 0xa0-0x100(%rcx),%ymm9,%ymm9 - vpaddd 0xc0-0x100(%rcx),%ymm10,%ymm10 - vpaddd 0xe0-0x100(%rcx),%ymm11,%ymm11 - - vpunpckldq %ymm9,%ymm8,%ymm14 # "de-interlace" data - vpunpckldq %ymm11,%ymm10,%ymm15 - vpunpckhdq %ymm9,%ymm8,%ymm8 - vpunpckhdq %ymm11,%ymm10,%ymm10 - vpunpcklqdq %ymm15,%ymm14,%ymm9 # "a0" - vpunpckhqdq %ymm15,%ymm14,%ymm14 # "a1" - vpunpcklqdq %ymm10,%ymm8,%ymm11 # "a2" - vpunpckhqdq %ymm10,%ymm8,%ymm8 # "a3" - vpaddd 0x100-0x100(%rcx),%ymm0,%ymm0 - vpaddd 0x120-0x100(%rcx),%ymm1,%ymm1 - vpaddd 0x140-0x100(%rcx),%ymm2,%ymm2 - vpaddd 0x160-0x100(%rcx),%ymm3,%ymm3 - - vpunpckldq %ymm1,%ymm0,%ymm10 - vpunpckldq %ymm3,%ymm2,%ymm15 - vpunpckhdq %ymm1,%ymm0,%ymm0 - vpunpckhdq %ymm3,%ymm2,%ymm2 - vpunpcklqdq %ymm15,%ymm10,%ymm1 # "b0" - vpunpckhqdq %ymm15,%ymm10,%ymm10 # "b1" - vpunpcklqdq %ymm2,%ymm0,%ymm3 # "b2" - vpunpckhqdq %ymm2,%ymm0,%ymm0 # "b3" - vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 # "de-interlace" further - vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 - vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 - vperm2i128 
$0x31,%ymm10,%ymm14,%ymm10 - vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 - vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 - vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 - vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 - vmovdqa %ymm15,0x00(%rsp) # offload - vmovdqa %ymm9,0x20(%rsp) - vmovdqa 0x40(%rsp),%ymm15 # %ymm15 - vmovdqa 0x60(%rsp),%ymm9 # %ymm9 - - vpaddd 0x180-0x200(%rax),%ymm12,%ymm12 - vpaddd 0x1a0-0x200(%rax),%ymm13,%ymm13 - vpaddd 0x1c0-0x200(%rax),%ymm15,%ymm15 - vpaddd 0x1e0-0x200(%rax),%ymm9,%ymm9 - - vpunpckldq %ymm13,%ymm12,%ymm2 - vpunpckldq %ymm9,%ymm15,%ymm8 - vpunpckhdq %ymm13,%ymm12,%ymm12 - vpunpckhdq %ymm9,%ymm15,%ymm15 - vpunpcklqdq %ymm8,%ymm2,%ymm13 # "c0" - vpunpckhqdq %ymm8,%ymm2,%ymm2 # "c1" - vpunpcklqdq %ymm15,%ymm12,%ymm9 # "c2" - vpunpckhqdq %ymm15,%ymm12,%ymm12 # "c3" - vpaddd 0x200-0x200(%rax),%ymm4,%ymm4 - vpaddd 0x220-0x200(%rax),%ymm5,%ymm5 - vpaddd 0x240-0x200(%rax),%ymm6,%ymm6 - vpaddd 0x260-0x200(%rax),%ymm7,%ymm7 - - vpunpckldq %ymm5,%ymm4,%ymm15 - vpunpckldq %ymm7,%ymm6,%ymm8 - vpunpckhdq %ymm5,%ymm4,%ymm4 - vpunpckhdq %ymm7,%ymm6,%ymm6 - vpunpcklqdq %ymm8,%ymm15,%ymm5 # "d0" - vpunpckhqdq %ymm8,%ymm15,%ymm15 # "d1" - vpunpcklqdq %ymm6,%ymm4,%ymm7 # "d2" - vpunpckhqdq %ymm6,%ymm4,%ymm4 # "d3" - vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 # "de-interlace" further - vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 - vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 - vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 - vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 - vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 - vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 - vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 - vmovdqa 0x00(%rsp),%ymm6 # was offloaded, remember? - vmovdqa 0x20(%rsp),%ymm12 - - cmp $64*8,%rdx - jb .Ltail8x - - vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input - vpxor 0x20(%rsi),%ymm8,%ymm8 - vpxor 0x40(%rsi),%ymm1,%ymm1 - vpxor 0x60(%rsi),%ymm5,%ymm5 - lea 0x80(%rsi),%rsi # size optimization - vmovdqu %ymm6,0x00(%rdi) - vmovdqu %ymm8,0x20(%rdi) - vmovdqu %ymm1,0x40(%rdi) - vmovdqu %ymm5,0x60(%rdi) - lea 0x80(%rdi),%rdi # size optimization - - vpxor 0x00(%rsi),%ymm12,%ymm12 - vpxor 0x20(%rsi),%ymm13,%ymm13 - vpxor 0x40(%rsi),%ymm10,%ymm10 - vpxor 0x60(%rsi),%ymm15,%ymm15 - lea 0x80(%rsi),%rsi # size optimization - vmovdqu %ymm12,0x00(%rdi) - vmovdqu %ymm13,0x20(%rdi) - vmovdqu %ymm10,0x40(%rdi) - vmovdqu %ymm15,0x60(%rdi) - lea 0x80(%rdi),%rdi # size optimization - - vpxor 0x00(%rsi),%ymm14,%ymm14 - vpxor 0x20(%rsi),%ymm2,%ymm2 - vpxor 0x40(%rsi),%ymm3,%ymm3 - vpxor 0x60(%rsi),%ymm7,%ymm7 - lea 0x80(%rsi),%rsi # size optimization - vmovdqu %ymm14,0x00(%rdi) - vmovdqu %ymm2,0x20(%rdi) - vmovdqu %ymm3,0x40(%rdi) - vmovdqu %ymm7,0x60(%rdi) - lea 0x80(%rdi),%rdi # size optimization - - vpxor 0x00(%rsi),%ymm11,%ymm11 - vpxor 0x20(%rsi),%ymm9,%ymm9 - vpxor 0x40(%rsi),%ymm0,%ymm0 - vpxor 0x60(%rsi),%ymm4,%ymm4 - lea 0x80(%rsi),%rsi # size optimization - vmovdqu %ymm11,0x00(%rdi) - vmovdqu %ymm9,0x20(%rdi) - vmovdqu %ymm0,0x40(%rdi) - vmovdqu %ymm4,0x60(%rdi) - lea 0x80(%rdi),%rdi # size optimization - - sub $64*8,%rdx - jnz .Loop_outer8x - - jmp .Ldone8x - -.Ltail8x: - cmp $448,%rdx - jae .L448_or_more8x - cmp $384,%rdx - jae .L384_or_more8x - cmp $320,%rdx - jae .L320_or_more8x - cmp $256,%rdx - jae .L256_or_more8x - cmp $192,%rdx - jae .L192_or_more8x - cmp $128,%rdx - jae .L128_or_more8x - cmp $64,%rdx - jae .L64_or_more8x - - xor %r9,%r9 - vmovdqa %ymm6,0x00(%rsp) - vmovdqa %ymm8,0x20(%rsp) - jmp .Loop_tail8x - -.align 32 -.L64_or_more8x: - vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input - vpxor 0x20(%rsi),%ymm8,%ymm8 - vmovdqu %ymm6,0x00(%rdi) - vmovdqu %ymm8,0x20(%rdi) 
- je .Ldone8x - - lea 0x40(%rsi),%rsi # inp+=64*1 - xor %r9,%r9 - vmovdqa %ymm1,0x00(%rsp) - lea 0x40(%rdi),%rdi # out+=64*1 - sub $64,%rdx # len-=64*1 - vmovdqa %ymm5,0x20(%rsp) - jmp .Loop_tail8x - -.align 32 -.L128_or_more8x: - vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input - vpxor 0x20(%rsi),%ymm8,%ymm8 - vpxor 0x40(%rsi),%ymm1,%ymm1 - vpxor 0x60(%rsi),%ymm5,%ymm5 - vmovdqu %ymm6,0x00(%rdi) - vmovdqu %ymm8,0x20(%rdi) - vmovdqu %ymm1,0x40(%rdi) - vmovdqu %ymm5,0x60(%rdi) - je .Ldone8x - - lea 0x80(%rsi),%rsi # inp+=64*2 - xor %r9,%r9 - vmovdqa %ymm12,0x00(%rsp) - lea 0x80(%rdi),%rdi # out+=64*2 - sub $128,%rdx # len-=64*2 - vmovdqa %ymm13,0x20(%rsp) - jmp .Loop_tail8x - -.align 32 -.L192_or_more8x: - vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input - vpxor 0x20(%rsi),%ymm8,%ymm8 - vpxor 0x40(%rsi),%ymm1,%ymm1 - vpxor 0x60(%rsi),%ymm5,%ymm5 - vpxor 0x80(%rsi),%ymm12,%ymm12 - vpxor 0xa0(%rsi),%ymm13,%ymm13 - vmovdqu %ymm6,0x00(%rdi) - vmovdqu %ymm8,0x20(%rdi) - vmovdqu %ymm1,0x40(%rdi) - vmovdqu %ymm5,0x60(%rdi) - vmovdqu %ymm12,0x80(%rdi) - vmovdqu %ymm13,0xa0(%rdi) - je .Ldone8x - - lea 0xc0(%rsi),%rsi # inp+=64*3 - xor %r9,%r9 - vmovdqa %ymm10,0x00(%rsp) - lea 0xc0(%rdi),%rdi # out+=64*3 - sub $192,%rdx # len-=64*3 - vmovdqa %ymm15,0x20(%rsp) - jmp .Loop_tail8x - -.align 32 -.L256_or_more8x: - vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input - vpxor 0x20(%rsi),%ymm8,%ymm8 - vpxor 0x40(%rsi),%ymm1,%ymm1 - vpxor 0x60(%rsi),%ymm5,%ymm5 - vpxor 0x80(%rsi),%ymm12,%ymm12 - vpxor 0xa0(%rsi),%ymm13,%ymm13 - vpxor 0xc0(%rsi),%ymm10,%ymm10 - vpxor 0xe0(%rsi),%ymm15,%ymm15 - vmovdqu %ymm6,0x00(%rdi) - vmovdqu %ymm8,0x20(%rdi) - vmovdqu %ymm1,0x40(%rdi) - vmovdqu %ymm5,0x60(%rdi) - vmovdqu %ymm12,0x80(%rdi) - vmovdqu %ymm13,0xa0(%rdi) - vmovdqu %ymm10,0xc0(%rdi) - vmovdqu %ymm15,0xe0(%rdi) - je .Ldone8x - - lea 0x100(%rsi),%rsi # inp+=64*4 - xor %r9,%r9 - vmovdqa %ymm14,0x00(%rsp) - lea 0x100(%rdi),%rdi # out+=64*4 - sub $256,%rdx # len-=64*4 - vmovdqa %ymm2,0x20(%rsp) - jmp .Loop_tail8x - -.align 32 -.L320_or_more8x: - vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input - vpxor 0x20(%rsi),%ymm8,%ymm8 - vpxor 0x40(%rsi),%ymm1,%ymm1 - vpxor 0x60(%rsi),%ymm5,%ymm5 - vpxor 0x80(%rsi),%ymm12,%ymm12 - vpxor 0xa0(%rsi),%ymm13,%ymm13 - vpxor 0xc0(%rsi),%ymm10,%ymm10 - vpxor 0xe0(%rsi),%ymm15,%ymm15 - vpxor 0x100(%rsi),%ymm14,%ymm14 - vpxor 0x120(%rsi),%ymm2,%ymm2 - vmovdqu %ymm6,0x00(%rdi) - vmovdqu %ymm8,0x20(%rdi) - vmovdqu %ymm1,0x40(%rdi) - vmovdqu %ymm5,0x60(%rdi) - vmovdqu %ymm12,0x80(%rdi) - vmovdqu %ymm13,0xa0(%rdi) - vmovdqu %ymm10,0xc0(%rdi) - vmovdqu %ymm15,0xe0(%rdi) - vmovdqu %ymm14,0x100(%rdi) - vmovdqu %ymm2,0x120(%rdi) - je .Ldone8x - - lea 0x140(%rsi),%rsi # inp+=64*5 - xor %r9,%r9 - vmovdqa %ymm3,0x00(%rsp) - lea 0x140(%rdi),%rdi # out+=64*5 - sub $320,%rdx # len-=64*5 - vmovdqa %ymm7,0x20(%rsp) - jmp .Loop_tail8x - -.align 32 -.L384_or_more8x: - vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input - vpxor 0x20(%rsi),%ymm8,%ymm8 - vpxor 0x40(%rsi),%ymm1,%ymm1 - vpxor 0x60(%rsi),%ymm5,%ymm5 - vpxor 0x80(%rsi),%ymm12,%ymm12 - vpxor 0xa0(%rsi),%ymm13,%ymm13 - vpxor 0xc0(%rsi),%ymm10,%ymm10 - vpxor 0xe0(%rsi),%ymm15,%ymm15 - vpxor 0x100(%rsi),%ymm14,%ymm14 - vpxor 0x120(%rsi),%ymm2,%ymm2 - vpxor 0x140(%rsi),%ymm3,%ymm3 - vpxor 0x160(%rsi),%ymm7,%ymm7 - vmovdqu %ymm6,0x00(%rdi) - vmovdqu %ymm8,0x20(%rdi) - vmovdqu %ymm1,0x40(%rdi) - vmovdqu %ymm5,0x60(%rdi) - vmovdqu %ymm12,0x80(%rdi) - vmovdqu %ymm13,0xa0(%rdi) - vmovdqu %ymm10,0xc0(%rdi) - vmovdqu %ymm15,0xe0(%rdi) - vmovdqu %ymm14,0x100(%rdi) - vmovdqu 
%ymm2,0x120(%rdi) - vmovdqu %ymm3,0x140(%rdi) - vmovdqu %ymm7,0x160(%rdi) - je .Ldone8x - - lea 0x180(%rsi),%rsi # inp+=64*6 - xor %r9,%r9 - vmovdqa %ymm11,0x00(%rsp) - lea 0x180(%rdi),%rdi # out+=64*6 - sub $384,%rdx # len-=64*6 - vmovdqa %ymm9,0x20(%rsp) - jmp .Loop_tail8x - -.align 32 -.L448_or_more8x: - vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input - vpxor 0x20(%rsi),%ymm8,%ymm8 - vpxor 0x40(%rsi),%ymm1,%ymm1 - vpxor 0x60(%rsi),%ymm5,%ymm5 - vpxor 0x80(%rsi),%ymm12,%ymm12 - vpxor 0xa0(%rsi),%ymm13,%ymm13 - vpxor 0xc0(%rsi),%ymm10,%ymm10 - vpxor 0xe0(%rsi),%ymm15,%ymm15 - vpxor 0x100(%rsi),%ymm14,%ymm14 - vpxor 0x120(%rsi),%ymm2,%ymm2 - vpxor 0x140(%rsi),%ymm3,%ymm3 - vpxor 0x160(%rsi),%ymm7,%ymm7 - vpxor 0x180(%rsi),%ymm11,%ymm11 - vpxor 0x1a0(%rsi),%ymm9,%ymm9 - vmovdqu %ymm6,0x00(%rdi) - vmovdqu %ymm8,0x20(%rdi) - vmovdqu %ymm1,0x40(%rdi) - vmovdqu %ymm5,0x60(%rdi) - vmovdqu %ymm12,0x80(%rdi) - vmovdqu %ymm13,0xa0(%rdi) - vmovdqu %ymm10,0xc0(%rdi) - vmovdqu %ymm15,0xe0(%rdi) - vmovdqu %ymm14,0x100(%rdi) - vmovdqu %ymm2,0x120(%rdi) - vmovdqu %ymm3,0x140(%rdi) - vmovdqu %ymm7,0x160(%rdi) - vmovdqu %ymm11,0x180(%rdi) - vmovdqu %ymm9,0x1a0(%rdi) - je .Ldone8x - - lea 0x1c0(%rsi),%rsi # inp+=64*7 - xor %r9,%r9 - vmovdqa %ymm0,0x00(%rsp) - lea 0x1c0(%rdi),%rdi # out+=64*7 - sub $448,%rdx # len-=64*7 - vmovdqa %ymm4,0x20(%rsp) - -.Loop_tail8x: - movzb (%rsi,%r9),%eax - movzb (%rsp,%r9),%ecx - lea 1(%r9),%r9 - xor %ecx,%eax - mov %al,-1(%rdi,%r9) - dec %rdx - jnz .Loop_tail8x - -.Ldone8x: - vzeroall - lea -8(%r10),%rsp -.L8x_epilogue: - ret -SYM_FUNC_END(chacha20_avx2) -#endif -#ifdef CONFIG_AS_AVX512 -.align 32 -SYM_FUNC_START(chacha20_avx512) -.Lchacha20_avx512: - lea 8(%rsp),%r10 # frame pointer - cmp $512,%rdx - ja .Lchacha20_16x - - sub $64+8,%rsp - and $-64,%rsp - vbroadcasti32x4 .Lsigma(%rip),%zmm0 - vbroadcasti32x4 (%rcx),%zmm1 - vbroadcasti32x4 16(%rcx),%zmm2 - vbroadcasti32x4 (%r8),%zmm3 - - vmovdqa32 %zmm0,%zmm16 - vmovdqa32 %zmm1,%zmm17 - vmovdqa32 %zmm2,%zmm18 - vpaddd .Lzeroz(%rip),%zmm3,%zmm3 - vmovdqa32 .Lfourz(%rip),%zmm20 - mov $10,%r8 # reuse %r8 - vmovdqa32 %zmm3,%zmm19 - jmp .Loop_avx512 - -.align 16 -.Loop_outer_avx512: - vmovdqa32 %zmm16,%zmm0 - vmovdqa32 %zmm17,%zmm1 - vmovdqa32 %zmm18,%zmm2 - vpaddd %zmm20,%zmm19,%zmm3 - mov $10,%r8 - vmovdqa32 %zmm3,%zmm19 - jmp .Loop_avx512 - -.align 32 -.Loop_avx512: - vpaddd %zmm1,%zmm0,%zmm0 - vpxord %zmm0,%zmm3,%zmm3 - vprold $16,%zmm3,%zmm3 - vpaddd %zmm3,%zmm2,%zmm2 - vpxord %zmm2,%zmm1,%zmm1 - vprold $12,%zmm1,%zmm1 - vpaddd %zmm1,%zmm0,%zmm0 - vpxord %zmm0,%zmm3,%zmm3 - vprold $8,%zmm3,%zmm3 - vpaddd %zmm3,%zmm2,%zmm2 - vpxord %zmm2,%zmm1,%zmm1 - vprold $7,%zmm1,%zmm1 - vpshufd $78,%zmm2,%zmm2 - vpshufd $57,%zmm1,%zmm1 - vpshufd $147,%zmm3,%zmm3 - vpaddd %zmm1,%zmm0,%zmm0 - vpxord %zmm0,%zmm3,%zmm3 - vprold $16,%zmm3,%zmm3 - vpaddd %zmm3,%zmm2,%zmm2 - vpxord %zmm2,%zmm1,%zmm1 - vprold $12,%zmm1,%zmm1 - vpaddd %zmm1,%zmm0,%zmm0 - vpxord %zmm0,%zmm3,%zmm3 - vprold $8,%zmm3,%zmm3 - vpaddd %zmm3,%zmm2,%zmm2 - vpxord %zmm2,%zmm1,%zmm1 - vprold $7,%zmm1,%zmm1 - vpshufd $78,%zmm2,%zmm2 - vpshufd $147,%zmm1,%zmm1 - vpshufd $57,%zmm3,%zmm3 - dec %r8 - jnz .Loop_avx512 - vpaddd %zmm16,%zmm0,%zmm0 - vpaddd %zmm17,%zmm1,%zmm1 - vpaddd %zmm18,%zmm2,%zmm2 - vpaddd %zmm19,%zmm3,%zmm3 - - sub $64,%rdx - jb .Ltail64_avx512 - - vpxor 0x00(%rsi),%xmm0,%xmm4 # xor with input - vpxor 0x10(%rsi),%xmm1,%xmm5 - vpxor 0x20(%rsi),%xmm2,%xmm6 - vpxor 0x30(%rsi),%xmm3,%xmm7 - lea 0x40(%rsi),%rsi # inp+=64 - - vmovdqu %xmm4,0x00(%rdi) # write output - 
vmovdqu %xmm5,0x10(%rdi) - vmovdqu %xmm6,0x20(%rdi) - vmovdqu %xmm7,0x30(%rdi) - lea 0x40(%rdi),%rdi # out+=64 - - jz .Ldone_avx512 - - vextracti32x4 $1,%zmm0,%xmm4 - vextracti32x4 $1,%zmm1,%xmm5 - vextracti32x4 $1,%zmm2,%xmm6 - vextracti32x4 $1,%zmm3,%xmm7 - - sub $64,%rdx - jb .Ltail_avx512 - - vpxor 0x00(%rsi),%xmm4,%xmm4 # xor with input - vpxor 0x10(%rsi),%xmm5,%xmm5 - vpxor 0x20(%rsi),%xmm6,%xmm6 - vpxor 0x30(%rsi),%xmm7,%xmm7 - lea 0x40(%rsi),%rsi # inp+=64 - - vmovdqu %xmm4,0x00(%rdi) # write output - vmovdqu %xmm5,0x10(%rdi) - vmovdqu %xmm6,0x20(%rdi) - vmovdqu %xmm7,0x30(%rdi) - lea 0x40(%rdi),%rdi # out+=64 - - jz .Ldone_avx512 - - vextracti32x4 $2,%zmm0,%xmm4 - vextracti32x4 $2,%zmm1,%xmm5 - vextracti32x4 $2,%zmm2,%xmm6 - vextracti32x4 $2,%zmm3,%xmm7 - - sub $64,%rdx - jb .Ltail_avx512 - - vpxor 0x00(%rsi),%xmm4,%xmm4 # xor with input - vpxor 0x10(%rsi),%xmm5,%xmm5 - vpxor 0x20(%rsi),%xmm6,%xmm6 - vpxor 0x30(%rsi),%xmm7,%xmm7 - lea 0x40(%rsi),%rsi # inp+=64 - - vmovdqu %xmm4,0x00(%rdi) # write output - vmovdqu %xmm5,0x10(%rdi) - vmovdqu %xmm6,0x20(%rdi) - vmovdqu %xmm7,0x30(%rdi) - lea 0x40(%rdi),%rdi # out+=64 - - jz .Ldone_avx512 - - vextracti32x4 $3,%zmm0,%xmm4 - vextracti32x4 $3,%zmm1,%xmm5 - vextracti32x4 $3,%zmm2,%xmm6 - vextracti32x4 $3,%zmm3,%xmm7 - - sub $64,%rdx - jb .Ltail_avx512 - - vpxor 0x00(%rsi),%xmm4,%xmm4 # xor with input - vpxor 0x10(%rsi),%xmm5,%xmm5 - vpxor 0x20(%rsi),%xmm6,%xmm6 - vpxor 0x30(%rsi),%xmm7,%xmm7 - lea 0x40(%rsi),%rsi # inp+=64 - - vmovdqu %xmm4,0x00(%rdi) # write output - vmovdqu %xmm5,0x10(%rdi) - vmovdqu %xmm6,0x20(%rdi) - vmovdqu %xmm7,0x30(%rdi) - lea 0x40(%rdi),%rdi # out+=64 - - jnz .Loop_outer_avx512 - - jmp .Ldone_avx512 - -.align 16 -.Ltail64_avx512: - vmovdqa %xmm0,0x00(%rsp) - vmovdqa %xmm1,0x10(%rsp) - vmovdqa %xmm2,0x20(%rsp) - vmovdqa %xmm3,0x30(%rsp) - add $64,%rdx - jmp .Loop_tail_avx512 - -.align 16 -.Ltail_avx512: - vmovdqa %xmm4,0x00(%rsp) - vmovdqa %xmm5,0x10(%rsp) - vmovdqa %xmm6,0x20(%rsp) - vmovdqa %xmm7,0x30(%rsp) - add $64,%rdx - -.Loop_tail_avx512: - movzb (%rsi,%r8),%eax - movzb (%rsp,%r8),%ecx - lea 1(%r8),%r8 - xor %ecx,%eax - mov %al,-1(%rdi,%r8) - dec %rdx - jnz .Loop_tail_avx512 - - vmovdqu32 %zmm16,0x00(%rsp) - -.Ldone_avx512: - vzeroall - lea -8(%r10),%rsp -.Lavx512_epilogue: - ret -SYM_FUNC_END(chacha20_avx512) -.align 32 -SYM_FUNC_START(chacha20_avx512vl) -.Lchacha20_avx512vl: - lea 8(%rsp),%r10 # frame pointer - cmp $128,%rdx - ja .Lchacha20_8xvl - - sub $64+8,%rsp - and $-32,%rsp - vbroadcasti128 .Lsigma(%rip),%ymm0 - vbroadcasti128 (%rcx),%ymm1 - vbroadcasti128 16(%rcx),%ymm2 - vbroadcasti128 (%r8),%ymm3 - - vmovdqa32 %ymm0,%ymm16 - vmovdqa32 %ymm1,%ymm17 - vmovdqa32 %ymm2,%ymm18 - vpaddd .Lzeroz(%rip),%ymm3,%ymm3 - vmovdqa32 .Ltwoy(%rip),%ymm20 - mov $10,%r8 # reuse %r8 - vmovdqa32 %ymm3,%ymm19 - jmp .Loop_avx512vl - -.align 16 -.Loop_outer_avx512vl: - vmovdqa32 %ymm18,%ymm2 - vpaddd %ymm20,%ymm19,%ymm3 - mov $10,%r8 - vmovdqa32 %ymm3,%ymm19 - jmp .Loop_avx512vl - -.align 32 -.Loop_avx512vl: - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - vpshufd $78,%ymm2,%ymm2 - vpshufd $57,%ymm1,%ymm1 - vpshufd $147,%ymm3,%ymm3 - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - 
vprold $12,%ymm1,%ymm1 - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - vpshufd $78,%ymm2,%ymm2 - vpshufd $147,%ymm1,%ymm1 - vpshufd $57,%ymm3,%ymm3 - dec %r8 - jnz .Loop_avx512vl - vpaddd %ymm16,%ymm0,%ymm0 - vpaddd %ymm17,%ymm1,%ymm1 - vpaddd %ymm18,%ymm2,%ymm2 - vpaddd %ymm19,%ymm3,%ymm3 - - sub $64,%rdx - jb .Ltail64_avx512vl - - vpxor 0x00(%rsi),%xmm0,%xmm4 # xor with input - vpxor 0x10(%rsi),%xmm1,%xmm5 - vpxor 0x20(%rsi),%xmm2,%xmm6 - vpxor 0x30(%rsi),%xmm3,%xmm7 - lea 0x40(%rsi),%rsi # inp+=64 - - vmovdqu %xmm4,0x00(%rdi) # write output - vmovdqu %xmm5,0x10(%rdi) - vmovdqu %xmm6,0x20(%rdi) - vmovdqu %xmm7,0x30(%rdi) - lea 0x40(%rdi),%rdi # out+=64 - - jz .Ldone_avx512vl - - vextracti128 $1,%ymm0,%xmm4 - vextracti128 $1,%ymm1,%xmm5 - vextracti128 $1,%ymm2,%xmm6 - vextracti128 $1,%ymm3,%xmm7 - - sub $64,%rdx - jb .Ltail_avx512vl - - vpxor 0x00(%rsi),%xmm4,%xmm4 # xor with input - vpxor 0x10(%rsi),%xmm5,%xmm5 - vpxor 0x20(%rsi),%xmm6,%xmm6 - vpxor 0x30(%rsi),%xmm7,%xmm7 - lea 0x40(%rsi),%rsi # inp+=64 - - vmovdqu %xmm4,0x00(%rdi) # write output - vmovdqu %xmm5,0x10(%rdi) - vmovdqu %xmm6,0x20(%rdi) - vmovdqu %xmm7,0x30(%rdi) - lea 0x40(%rdi),%rdi # out+=64 - - vmovdqa32 %ymm16,%ymm0 - vmovdqa32 %ymm17,%ymm1 - jnz .Loop_outer_avx512vl - - jmp .Ldone_avx512vl - -.align 16 -.Ltail64_avx512vl: - vmovdqa %xmm0,0x00(%rsp) - vmovdqa %xmm1,0x10(%rsp) - vmovdqa %xmm2,0x20(%rsp) - vmovdqa %xmm3,0x30(%rsp) - add $64,%rdx - jmp .Loop_tail_avx512vl - -.align 16 -.Ltail_avx512vl: - vmovdqa %xmm4,0x00(%rsp) - vmovdqa %xmm5,0x10(%rsp) - vmovdqa %xmm6,0x20(%rsp) - vmovdqa %xmm7,0x30(%rsp) - add $64,%rdx - -.Loop_tail_avx512vl: - movzb (%rsi,%r8),%eax - movzb (%rsp,%r8),%ecx - lea 1(%r8),%r8 - xor %ecx,%eax - mov %al,-1(%rdi,%r8) - dec %rdx - jnz .Loop_tail_avx512vl - - vmovdqu32 %ymm16,0x00(%rsp) - vmovdqu32 %ymm16,0x20(%rsp) - -.Ldone_avx512vl: - vzeroall - lea -8(%r10),%rsp -.Lavx512vl_epilogue: - ret -SYM_FUNC_END(chacha20_avx512vl) -.type chacha20_16x,@function -.align 32 -chacha20_16x: -.Lchacha20_16x: - lea 8(%rsp),%r10 # frame register - sub $64+8,%rsp - and $-64,%rsp - vzeroupper - - lea .Lsigma(%rip),%r9 - vbroadcasti32x4 (%r9),%zmm3 # key[0] - vbroadcasti32x4 (%rcx),%zmm7 # key[1] - vbroadcasti32x4 16(%rcx),%zmm11 # key[2] - vbroadcasti32x4 (%r8),%zmm15 # key[3] - - vpshufd $0x00,%zmm3,%zmm0 # smash key by lanes... 
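The AVX-512 paths can express the ChaCha20 quarter round directly because vprold provides the 16/12/8/7-bit rotates as single instructions, whereas the SSSE3/AVX2 paths above need pshufb through the .Lrot16/.Lrot24 tables (for the byte-aligned 16- and 8-bit rotates) or shift/or pairs (for 12 and 7). For reference, the scalar quarter round that all of these variants vectorize:

#include <stdint.h>

/* ChaCha20 quarter round; the vprold $16/$12/$8/$7 sequences above are
 * exactly this, applied to 4/8/16 blocks per instruction. */
#define ROTL32(v, n)	(((v) << (n)) | ((v) >> (32 - (n))))

static inline void
chacha20_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	*a += *b; *d ^= *a; *d = ROTL32(*d, 16);
	*c += *d; *b ^= *c; *b = ROTL32(*b, 12);
	*a += *b; *d ^= *a; *d = ROTL32(*d, 8);
	*c += *d; *b ^= *c; *b = ROTL32(*b, 7);
}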
- vpshufd $0x55,%zmm3,%zmm1 - vpshufd $0xaa,%zmm3,%zmm2 - vpshufd $0xff,%zmm3,%zmm3 - vmovdqa64 %zmm0,%zmm16 - vmovdqa64 %zmm1,%zmm17 - vmovdqa64 %zmm2,%zmm18 - vmovdqa64 %zmm3,%zmm19 - - vpshufd $0x00,%zmm7,%zmm4 - vpshufd $0x55,%zmm7,%zmm5 - vpshufd $0xaa,%zmm7,%zmm6 - vpshufd $0xff,%zmm7,%zmm7 - vmovdqa64 %zmm4,%zmm20 - vmovdqa64 %zmm5,%zmm21 - vmovdqa64 %zmm6,%zmm22 - vmovdqa64 %zmm7,%zmm23 - - vpshufd $0x00,%zmm11,%zmm8 - vpshufd $0x55,%zmm11,%zmm9 - vpshufd $0xaa,%zmm11,%zmm10 - vpshufd $0xff,%zmm11,%zmm11 - vmovdqa64 %zmm8,%zmm24 - vmovdqa64 %zmm9,%zmm25 - vmovdqa64 %zmm10,%zmm26 - vmovdqa64 %zmm11,%zmm27 - - vpshufd $0x00,%zmm15,%zmm12 - vpshufd $0x55,%zmm15,%zmm13 - vpshufd $0xaa,%zmm15,%zmm14 - vpshufd $0xff,%zmm15,%zmm15 - vpaddd .Lincz(%rip),%zmm12,%zmm12 # don't save counters yet - vmovdqa64 %zmm12,%zmm28 - vmovdqa64 %zmm13,%zmm29 - vmovdqa64 %zmm14,%zmm30 - vmovdqa64 %zmm15,%zmm31 - - mov $10,%eax - jmp .Loop16x - -.align 32 -.Loop_outer16x: - vpbroadcastd 0(%r9),%zmm0 # reload key - vpbroadcastd 4(%r9),%zmm1 - vpbroadcastd 8(%r9),%zmm2 - vpbroadcastd 12(%r9),%zmm3 - vpaddd .Lsixteen(%rip),%zmm28,%zmm28 # next SIMD counters - vmovdqa64 %zmm20,%zmm4 - vmovdqa64 %zmm21,%zmm5 - vmovdqa64 %zmm22,%zmm6 - vmovdqa64 %zmm23,%zmm7 - vmovdqa64 %zmm24,%zmm8 - vmovdqa64 %zmm25,%zmm9 - vmovdqa64 %zmm26,%zmm10 - vmovdqa64 %zmm27,%zmm11 - vmovdqa64 %zmm28,%zmm12 - vmovdqa64 %zmm29,%zmm13 - vmovdqa64 %zmm30,%zmm14 - vmovdqa64 %zmm31,%zmm15 - - vmovdqa64 %zmm0,%zmm16 - vmovdqa64 %zmm1,%zmm17 - vmovdqa64 %zmm2,%zmm18 - vmovdqa64 %zmm3,%zmm19 - - mov $10,%eax - jmp .Loop16x - -.align 32 -.Loop16x: - vpaddd %zmm4,%zmm0,%zmm0 - vpaddd %zmm5,%zmm1,%zmm1 - vpaddd %zmm6,%zmm2,%zmm2 - vpaddd %zmm7,%zmm3,%zmm3 - vpxord %zmm0,%zmm12,%zmm12 - vpxord %zmm1,%zmm13,%zmm13 - vpxord %zmm2,%zmm14,%zmm14 - vpxord %zmm3,%zmm15,%zmm15 - vprold $16,%zmm12,%zmm12 - vprold $16,%zmm13,%zmm13 - vprold $16,%zmm14,%zmm14 - vprold $16,%zmm15,%zmm15 - vpaddd %zmm12,%zmm8,%zmm8 - vpaddd %zmm13,%zmm9,%zmm9 - vpaddd %zmm14,%zmm10,%zmm10 - vpaddd %zmm15,%zmm11,%zmm11 - vpxord %zmm8,%zmm4,%zmm4 - vpxord %zmm9,%zmm5,%zmm5 - vpxord %zmm10,%zmm6,%zmm6 - vpxord %zmm11,%zmm7,%zmm7 - vprold $12,%zmm4,%zmm4 - vprold $12,%zmm5,%zmm5 - vprold $12,%zmm6,%zmm6 - vprold $12,%zmm7,%zmm7 - vpaddd %zmm4,%zmm0,%zmm0 - vpaddd %zmm5,%zmm1,%zmm1 - vpaddd %zmm6,%zmm2,%zmm2 - vpaddd %zmm7,%zmm3,%zmm3 - vpxord %zmm0,%zmm12,%zmm12 - vpxord %zmm1,%zmm13,%zmm13 - vpxord %zmm2,%zmm14,%zmm14 - vpxord %zmm3,%zmm15,%zmm15 - vprold $8,%zmm12,%zmm12 - vprold $8,%zmm13,%zmm13 - vprold $8,%zmm14,%zmm14 - vprold $8,%zmm15,%zmm15 - vpaddd %zmm12,%zmm8,%zmm8 - vpaddd %zmm13,%zmm9,%zmm9 - vpaddd %zmm14,%zmm10,%zmm10 - vpaddd %zmm15,%zmm11,%zmm11 - vpxord %zmm8,%zmm4,%zmm4 - vpxord %zmm9,%zmm5,%zmm5 - vpxord %zmm10,%zmm6,%zmm6 - vpxord %zmm11,%zmm7,%zmm7 - vprold $7,%zmm4,%zmm4 - vprold $7,%zmm5,%zmm5 - vprold $7,%zmm6,%zmm6 - vprold $7,%zmm7,%zmm7 - vpaddd %zmm5,%zmm0,%zmm0 - vpaddd %zmm6,%zmm1,%zmm1 - vpaddd %zmm7,%zmm2,%zmm2 - vpaddd %zmm4,%zmm3,%zmm3 - vpxord %zmm0,%zmm15,%zmm15 - vpxord %zmm1,%zmm12,%zmm12 - vpxord %zmm2,%zmm13,%zmm13 - vpxord %zmm3,%zmm14,%zmm14 - vprold $16,%zmm15,%zmm15 - vprold $16,%zmm12,%zmm12 - vprold $16,%zmm13,%zmm13 - vprold $16,%zmm14,%zmm14 - vpaddd %zmm15,%zmm10,%zmm10 - vpaddd %zmm12,%zmm11,%zmm11 - vpaddd %zmm13,%zmm8,%zmm8 - vpaddd %zmm14,%zmm9,%zmm9 - vpxord %zmm10,%zmm5,%zmm5 - vpxord %zmm11,%zmm6,%zmm6 - vpxord %zmm8,%zmm7,%zmm7 - vpxord %zmm9,%zmm4,%zmm4 - vprold $12,%zmm5,%zmm5 - vprold $12,%zmm6,%zmm6 - vprold 
$12,%zmm7,%zmm7 - vprold $12,%zmm4,%zmm4 - vpaddd %zmm5,%zmm0,%zmm0 - vpaddd %zmm6,%zmm1,%zmm1 - vpaddd %zmm7,%zmm2,%zmm2 - vpaddd %zmm4,%zmm3,%zmm3 - vpxord %zmm0,%zmm15,%zmm15 - vpxord %zmm1,%zmm12,%zmm12 - vpxord %zmm2,%zmm13,%zmm13 - vpxord %zmm3,%zmm14,%zmm14 - vprold $8,%zmm15,%zmm15 - vprold $8,%zmm12,%zmm12 - vprold $8,%zmm13,%zmm13 - vprold $8,%zmm14,%zmm14 - vpaddd %zmm15,%zmm10,%zmm10 - vpaddd %zmm12,%zmm11,%zmm11 - vpaddd %zmm13,%zmm8,%zmm8 - vpaddd %zmm14,%zmm9,%zmm9 - vpxord %zmm10,%zmm5,%zmm5 - vpxord %zmm11,%zmm6,%zmm6 - vpxord %zmm8,%zmm7,%zmm7 - vpxord %zmm9,%zmm4,%zmm4 - vprold $7,%zmm5,%zmm5 - vprold $7,%zmm6,%zmm6 - vprold $7,%zmm7,%zmm7 - vprold $7,%zmm4,%zmm4 - dec %eax - jnz .Loop16x - - vpaddd %zmm16,%zmm0,%zmm0 # accumulate key - vpaddd %zmm17,%zmm1,%zmm1 - vpaddd %zmm18,%zmm2,%zmm2 - vpaddd %zmm19,%zmm3,%zmm3 - - vpunpckldq %zmm1,%zmm0,%zmm18 # "de-interlace" data - vpunpckldq %zmm3,%zmm2,%zmm19 - vpunpckhdq %zmm1,%zmm0,%zmm0 - vpunpckhdq %zmm3,%zmm2,%zmm2 - vpunpcklqdq %zmm19,%zmm18,%zmm1 # "a0" - vpunpckhqdq %zmm19,%zmm18,%zmm18 # "a1" - vpunpcklqdq %zmm2,%zmm0,%zmm3 # "a2" - vpunpckhqdq %zmm2,%zmm0,%zmm0 # "a3" - vpaddd %zmm20,%zmm4,%zmm4 - vpaddd %zmm21,%zmm5,%zmm5 - vpaddd %zmm22,%zmm6,%zmm6 - vpaddd %zmm23,%zmm7,%zmm7 - - vpunpckldq %zmm5,%zmm4,%zmm2 - vpunpckldq %zmm7,%zmm6,%zmm19 - vpunpckhdq %zmm5,%zmm4,%zmm4 - vpunpckhdq %zmm7,%zmm6,%zmm6 - vpunpcklqdq %zmm19,%zmm2,%zmm5 # "b0" - vpunpckhqdq %zmm19,%zmm2,%zmm2 # "b1" - vpunpcklqdq %zmm6,%zmm4,%zmm7 # "b2" - vpunpckhqdq %zmm6,%zmm4,%zmm4 # "b3" - vshufi32x4 $0x44,%zmm5,%zmm1,%zmm19 # "de-interlace" further - vshufi32x4 $0xee,%zmm5,%zmm1,%zmm5 - vshufi32x4 $0x44,%zmm2,%zmm18,%zmm1 - vshufi32x4 $0xee,%zmm2,%zmm18,%zmm2 - vshufi32x4 $0x44,%zmm7,%zmm3,%zmm18 - vshufi32x4 $0xee,%zmm7,%zmm3,%zmm7 - vshufi32x4 $0x44,%zmm4,%zmm0,%zmm3 - vshufi32x4 $0xee,%zmm4,%zmm0,%zmm4 - vpaddd %zmm24,%zmm8,%zmm8 - vpaddd %zmm25,%zmm9,%zmm9 - vpaddd %zmm26,%zmm10,%zmm10 - vpaddd %zmm27,%zmm11,%zmm11 - - vpunpckldq %zmm9,%zmm8,%zmm6 - vpunpckldq %zmm11,%zmm10,%zmm0 - vpunpckhdq %zmm9,%zmm8,%zmm8 - vpunpckhdq %zmm11,%zmm10,%zmm10 - vpunpcklqdq %zmm0,%zmm6,%zmm9 # "c0" - vpunpckhqdq %zmm0,%zmm6,%zmm6 # "c1" - vpunpcklqdq %zmm10,%zmm8,%zmm11 # "c2" - vpunpckhqdq %zmm10,%zmm8,%zmm8 # "c3" - vpaddd %zmm28,%zmm12,%zmm12 - vpaddd %zmm29,%zmm13,%zmm13 - vpaddd %zmm30,%zmm14,%zmm14 - vpaddd %zmm31,%zmm15,%zmm15 - - vpunpckldq %zmm13,%zmm12,%zmm10 - vpunpckldq %zmm15,%zmm14,%zmm0 - vpunpckhdq %zmm13,%zmm12,%zmm12 - vpunpckhdq %zmm15,%zmm14,%zmm14 - vpunpcklqdq %zmm0,%zmm10,%zmm13 # "d0" - vpunpckhqdq %zmm0,%zmm10,%zmm10 # "d1" - vpunpcklqdq %zmm14,%zmm12,%zmm15 # "d2" - vpunpckhqdq %zmm14,%zmm12,%zmm12 # "d3" - vshufi32x4 $0x44,%zmm13,%zmm9,%zmm0 # "de-interlace" further - vshufi32x4 $0xee,%zmm13,%zmm9,%zmm13 - vshufi32x4 $0x44,%zmm10,%zmm6,%zmm9 - vshufi32x4 $0xee,%zmm10,%zmm6,%zmm10 - vshufi32x4 $0x44,%zmm15,%zmm11,%zmm6 - vshufi32x4 $0xee,%zmm15,%zmm11,%zmm15 - vshufi32x4 $0x44,%zmm12,%zmm8,%zmm11 - vshufi32x4 $0xee,%zmm12,%zmm8,%zmm12 - vshufi32x4 $0x88,%zmm0,%zmm19,%zmm16 # "de-interlace" further - vshufi32x4 $0xdd,%zmm0,%zmm19,%zmm19 - vshufi32x4 $0x88,%zmm13,%zmm5,%zmm0 - vshufi32x4 $0xdd,%zmm13,%zmm5,%zmm13 - vshufi32x4 $0x88,%zmm9,%zmm1,%zmm17 - vshufi32x4 $0xdd,%zmm9,%zmm1,%zmm1 - vshufi32x4 $0x88,%zmm10,%zmm2,%zmm9 - vshufi32x4 $0xdd,%zmm10,%zmm2,%zmm10 - vshufi32x4 $0x88,%zmm6,%zmm18,%zmm14 - vshufi32x4 $0xdd,%zmm6,%zmm18,%zmm18 - vshufi32x4 $0x88,%zmm15,%zmm7,%zmm6 - vshufi32x4 $0xdd,%zmm15,%zmm7,%zmm15 - vshufi32x4 
$0x88,%zmm11,%zmm3,%zmm8 - vshufi32x4 $0xdd,%zmm11,%zmm3,%zmm3 - vshufi32x4 $0x88,%zmm12,%zmm4,%zmm11 - vshufi32x4 $0xdd,%zmm12,%zmm4,%zmm12 - cmp $64*16,%rdx - jb .Ltail16x - - vpxord 0x00(%rsi),%zmm16,%zmm16 # xor with input - vpxord 0x40(%rsi),%zmm17,%zmm17 - vpxord 0x80(%rsi),%zmm14,%zmm14 - vpxord 0xc0(%rsi),%zmm8,%zmm8 - vmovdqu32 %zmm16,0x00(%rdi) - vmovdqu32 %zmm17,0x40(%rdi) - vmovdqu32 %zmm14,0x80(%rdi) - vmovdqu32 %zmm8,0xc0(%rdi) - - vpxord 0x100(%rsi),%zmm19,%zmm19 - vpxord 0x140(%rsi),%zmm1,%zmm1 - vpxord 0x180(%rsi),%zmm18,%zmm18 - vpxord 0x1c0(%rsi),%zmm3,%zmm3 - vmovdqu32 %zmm19,0x100(%rdi) - vmovdqu32 %zmm1,0x140(%rdi) - vmovdqu32 %zmm18,0x180(%rdi) - vmovdqu32 %zmm3,0x1c0(%rdi) - - vpxord 0x200(%rsi),%zmm0,%zmm0 - vpxord 0x240(%rsi),%zmm9,%zmm9 - vpxord 0x280(%rsi),%zmm6,%zmm6 - vpxord 0x2c0(%rsi),%zmm11,%zmm11 - vmovdqu32 %zmm0,0x200(%rdi) - vmovdqu32 %zmm9,0x240(%rdi) - vmovdqu32 %zmm6,0x280(%rdi) - vmovdqu32 %zmm11,0x2c0(%rdi) - - vpxord 0x300(%rsi),%zmm13,%zmm13 - vpxord 0x340(%rsi),%zmm10,%zmm10 - vpxord 0x380(%rsi),%zmm15,%zmm15 - vpxord 0x3c0(%rsi),%zmm12,%zmm12 - lea 0x400(%rsi),%rsi - vmovdqu32 %zmm13,0x300(%rdi) - vmovdqu32 %zmm10,0x340(%rdi) - vmovdqu32 %zmm15,0x380(%rdi) - vmovdqu32 %zmm12,0x3c0(%rdi) - lea 0x400(%rdi),%rdi - - sub $64*16,%rdx - jnz .Loop_outer16x - - jmp .Ldone16x - -.align 32 -.Ltail16x: - xor %r9,%r9 - sub %rsi,%rdi - cmp $64*1,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm16,%zmm16 # xor with input - vmovdqu32 %zmm16,(%rdi,%rsi) - je .Ldone16x - vmovdqa32 %zmm17,%zmm16 - lea 64(%rsi),%rsi - - cmp $64*2,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm17,%zmm17 - vmovdqu32 %zmm17,(%rdi,%rsi) - je .Ldone16x - vmovdqa32 %zmm14,%zmm16 - lea 64(%rsi),%rsi - - cmp $64*3,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm14,%zmm14 - vmovdqu32 %zmm14,(%rdi,%rsi) - je .Ldone16x - vmovdqa32 %zmm8,%zmm16 - lea 64(%rsi),%rsi - - cmp $64*4,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm8,%zmm8 - vmovdqu32 %zmm8,(%rdi,%rsi) - je .Ldone16x - vmovdqa32 %zmm19,%zmm16 - lea 64(%rsi),%rsi - - cmp $64*5,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm19,%zmm19 - vmovdqu32 %zmm19,(%rdi,%rsi) - je .Ldone16x - vmovdqa32 %zmm1,%zmm16 - lea 64(%rsi),%rsi - - cmp $64*6,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm1,%zmm1 - vmovdqu32 %zmm1,(%rdi,%rsi) - je .Ldone16x - vmovdqa32 %zmm18,%zmm16 - lea 64(%rsi),%rsi - - cmp $64*7,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm18,%zmm18 - vmovdqu32 %zmm18,(%rdi,%rsi) - je .Ldone16x - vmovdqa32 %zmm3,%zmm16 - lea 64(%rsi),%rsi - - cmp $64*8,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm3,%zmm3 - vmovdqu32 %zmm3,(%rdi,%rsi) - je .Ldone16x - vmovdqa32 %zmm0,%zmm16 - lea 64(%rsi),%rsi - - cmp $64*9,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm0,%zmm0 - vmovdqu32 %zmm0,(%rdi,%rsi) - je .Ldone16x - vmovdqa32 %zmm9,%zmm16 - lea 64(%rsi),%rsi - - cmp $64*10,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm9,%zmm9 - vmovdqu32 %zmm9,(%rdi,%rsi) - je .Ldone16x - vmovdqa32 %zmm6,%zmm16 - lea 64(%rsi),%rsi - - cmp $64*11,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm6,%zmm6 - vmovdqu32 %zmm6,(%rdi,%rsi) - je .Ldone16x - vmovdqa32 %zmm11,%zmm16 - lea 64(%rsi),%rsi - - cmp $64*12,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm11,%zmm11 - vmovdqu32 %zmm11,(%rdi,%rsi) - je .Ldone16x - vmovdqa32 %zmm13,%zmm16 - lea 64(%rsi),%rsi - - cmp $64*13,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm13,%zmm13 - vmovdqu32 %zmm13,(%rdi,%rsi) - je .Ldone16x - vmovdqa32 %zmm10,%zmm16 - lea 64(%rsi),%rsi - - cmp $64*14,%rdx - jb 
.Less_than_64_16x - vpxord (%rsi),%zmm10,%zmm10 - vmovdqu32 %zmm10,(%rdi,%rsi) - je .Ldone16x - vmovdqa32 %zmm15,%zmm16 - lea 64(%rsi),%rsi - - cmp $64*15,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm15,%zmm15 - vmovdqu32 %zmm15,(%rdi,%rsi) - je .Ldone16x - vmovdqa32 %zmm12,%zmm16 - lea 64(%rsi),%rsi - -.Less_than_64_16x: - vmovdqa32 %zmm16,0x00(%rsp) - lea (%rdi,%rsi),%rdi - and $63,%rdx - -.Loop_tail16x: - movzb (%rsi,%r9),%eax - movzb (%rsp,%r9),%ecx - lea 1(%r9),%r9 - xor %ecx,%eax - mov %al,-1(%rdi,%r9) - dec %rdx - jnz .Loop_tail16x - - vpxord %zmm16,%zmm16,%zmm16 - vmovdqa32 %zmm16,0(%rsp) - -.Ldone16x: - vzeroall - lea -8(%r10),%rsp -.L16x_epilogue: - ret -.size chacha20_16x,.-chacha20_16x -.type chacha20_8xvl,@function -.align 32 -chacha20_8xvl: -.Lchacha20_8xvl: - lea 8(%rsp),%r10 # frame register - sub $64+8,%rsp - and $-64,%rsp - vzeroupper - - lea .Lsigma(%rip),%r9 - vbroadcasti128 (%r9),%ymm3 # key[0] - vbroadcasti128 (%rcx),%ymm7 # key[1] - vbroadcasti128 16(%rcx),%ymm11 # key[2] - vbroadcasti128 (%r8),%ymm15 # key[3] - - vpshufd $0x00,%ymm3,%ymm0 # smash key by lanes... - vpshufd $0x55,%ymm3,%ymm1 - vpshufd $0xaa,%ymm3,%ymm2 - vpshufd $0xff,%ymm3,%ymm3 - vmovdqa64 %ymm0,%ymm16 - vmovdqa64 %ymm1,%ymm17 - vmovdqa64 %ymm2,%ymm18 - vmovdqa64 %ymm3,%ymm19 - - vpshufd $0x00,%ymm7,%ymm4 - vpshufd $0x55,%ymm7,%ymm5 - vpshufd $0xaa,%ymm7,%ymm6 - vpshufd $0xff,%ymm7,%ymm7 - vmovdqa64 %ymm4,%ymm20 - vmovdqa64 %ymm5,%ymm21 - vmovdqa64 %ymm6,%ymm22 - vmovdqa64 %ymm7,%ymm23 - - vpshufd $0x00,%ymm11,%ymm8 - vpshufd $0x55,%ymm11,%ymm9 - vpshufd $0xaa,%ymm11,%ymm10 - vpshufd $0xff,%ymm11,%ymm11 - vmovdqa64 %ymm8,%ymm24 - vmovdqa64 %ymm9,%ymm25 - vmovdqa64 %ymm10,%ymm26 - vmovdqa64 %ymm11,%ymm27 - - vpshufd $0x00,%ymm15,%ymm12 - vpshufd $0x55,%ymm15,%ymm13 - vpshufd $0xaa,%ymm15,%ymm14 - vpshufd $0xff,%ymm15,%ymm15 - vpaddd .Lincy(%rip),%ymm12,%ymm12 # don't save counters yet - vmovdqa64 %ymm12,%ymm28 - vmovdqa64 %ymm13,%ymm29 - vmovdqa64 %ymm14,%ymm30 - vmovdqa64 %ymm15,%ymm31 - - mov $10,%eax - jmp .Loop8xvl - -.align 32 -.Loop_outer8xvl: - #vpbroadcastd 0(%r9),%ymm0 # reload key - #vpbroadcastd 4(%r9),%ymm1 - vpbroadcastd 8(%r9),%ymm2 - vpbroadcastd 12(%r9),%ymm3 - vpaddd .Leight(%rip),%ymm28,%ymm28 # next SIMD counters - vmovdqa64 %ymm20,%ymm4 - vmovdqa64 %ymm21,%ymm5 - vmovdqa64 %ymm22,%ymm6 - vmovdqa64 %ymm23,%ymm7 - vmovdqa64 %ymm24,%ymm8 - vmovdqa64 %ymm25,%ymm9 - vmovdqa64 %ymm26,%ymm10 - vmovdqa64 %ymm27,%ymm11 - vmovdqa64 %ymm28,%ymm12 - vmovdqa64 %ymm29,%ymm13 - vmovdqa64 %ymm30,%ymm14 - vmovdqa64 %ymm31,%ymm15 - - vmovdqa64 %ymm0,%ymm16 - vmovdqa64 %ymm1,%ymm17 - vmovdqa64 %ymm2,%ymm18 - vmovdqa64 %ymm3,%ymm19 - - mov $10,%eax - jmp .Loop8xvl - -.align 32 -.Loop8xvl: - vpaddd %ymm4,%ymm0,%ymm0 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm7,%ymm3,%ymm3 - vpxor %ymm0,%ymm12,%ymm12 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm3,%ymm15,%ymm15 - vprold $16,%ymm12,%ymm12 - vprold $16,%ymm13,%ymm13 - vprold $16,%ymm14,%ymm14 - vprold $16,%ymm15,%ymm15 - vpaddd %ymm12,%ymm8,%ymm8 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm8,%ymm4,%ymm4 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm11,%ymm7,%ymm7 - vprold $12,%ymm4,%ymm4 - vprold $12,%ymm5,%ymm5 - vprold $12,%ymm6,%ymm6 - vprold $12,%ymm7,%ymm7 - vpaddd %ymm4,%ymm0,%ymm0 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm7,%ymm3,%ymm3 - vpxor %ymm0,%ymm12,%ymm12 - vpxor %ymm1,%ymm13,%ymm13 - vpxor 
%ymm2,%ymm14,%ymm14 - vpxor %ymm3,%ymm15,%ymm15 - vprold $8,%ymm12,%ymm12 - vprold $8,%ymm13,%ymm13 - vprold $8,%ymm14,%ymm14 - vprold $8,%ymm15,%ymm15 - vpaddd %ymm12,%ymm8,%ymm8 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm8,%ymm4,%ymm4 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm11,%ymm7,%ymm7 - vprold $7,%ymm4,%ymm4 - vprold $7,%ymm5,%ymm5 - vprold $7,%ymm6,%ymm6 - vprold $7,%ymm7,%ymm7 - vpaddd %ymm5,%ymm0,%ymm0 - vpaddd %ymm6,%ymm1,%ymm1 - vpaddd %ymm7,%ymm2,%ymm2 - vpaddd %ymm4,%ymm3,%ymm3 - vpxor %ymm0,%ymm15,%ymm15 - vpxor %ymm1,%ymm12,%ymm12 - vpxor %ymm2,%ymm13,%ymm13 - vpxor %ymm3,%ymm14,%ymm14 - vprold $16,%ymm15,%ymm15 - vprold $16,%ymm12,%ymm12 - vprold $16,%ymm13,%ymm13 - vprold $16,%ymm14,%ymm14 - vpaddd %ymm15,%ymm10,%ymm10 - vpaddd %ymm12,%ymm11,%ymm11 - vpaddd %ymm13,%ymm8,%ymm8 - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm10,%ymm5,%ymm5 - vpxor %ymm11,%ymm6,%ymm6 - vpxor %ymm8,%ymm7,%ymm7 - vpxor %ymm9,%ymm4,%ymm4 - vprold $12,%ymm5,%ymm5 - vprold $12,%ymm6,%ymm6 - vprold $12,%ymm7,%ymm7 - vprold $12,%ymm4,%ymm4 - vpaddd %ymm5,%ymm0,%ymm0 - vpaddd %ymm6,%ymm1,%ymm1 - vpaddd %ymm7,%ymm2,%ymm2 - vpaddd %ymm4,%ymm3,%ymm3 - vpxor %ymm0,%ymm15,%ymm15 - vpxor %ymm1,%ymm12,%ymm12 - vpxor %ymm2,%ymm13,%ymm13 - vpxor %ymm3,%ymm14,%ymm14 - vprold $8,%ymm15,%ymm15 - vprold $8,%ymm12,%ymm12 - vprold $8,%ymm13,%ymm13 - vprold $8,%ymm14,%ymm14 - vpaddd %ymm15,%ymm10,%ymm10 - vpaddd %ymm12,%ymm11,%ymm11 - vpaddd %ymm13,%ymm8,%ymm8 - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm10,%ymm5,%ymm5 - vpxor %ymm11,%ymm6,%ymm6 - vpxor %ymm8,%ymm7,%ymm7 - vpxor %ymm9,%ymm4,%ymm4 - vprold $7,%ymm5,%ymm5 - vprold $7,%ymm6,%ymm6 - vprold $7,%ymm7,%ymm7 - vprold $7,%ymm4,%ymm4 - dec %eax - jnz .Loop8xvl - - vpaddd %ymm16,%ymm0,%ymm0 # accumulate key - vpaddd %ymm17,%ymm1,%ymm1 - vpaddd %ymm18,%ymm2,%ymm2 - vpaddd %ymm19,%ymm3,%ymm3 - - vpunpckldq %ymm1,%ymm0,%ymm18 # "de-interlace" data - vpunpckldq %ymm3,%ymm2,%ymm19 - vpunpckhdq %ymm1,%ymm0,%ymm0 - vpunpckhdq %ymm3,%ymm2,%ymm2 - vpunpcklqdq %ymm19,%ymm18,%ymm1 # "a0" - vpunpckhqdq %ymm19,%ymm18,%ymm18 # "a1" - vpunpcklqdq %ymm2,%ymm0,%ymm3 # "a2" - vpunpckhqdq %ymm2,%ymm0,%ymm0 # "a3" - vpaddd %ymm20,%ymm4,%ymm4 - vpaddd %ymm21,%ymm5,%ymm5 - vpaddd %ymm22,%ymm6,%ymm6 - vpaddd %ymm23,%ymm7,%ymm7 - - vpunpckldq %ymm5,%ymm4,%ymm2 - vpunpckldq %ymm7,%ymm6,%ymm19 - vpunpckhdq %ymm5,%ymm4,%ymm4 - vpunpckhdq %ymm7,%ymm6,%ymm6 - vpunpcklqdq %ymm19,%ymm2,%ymm5 # "b0" - vpunpckhqdq %ymm19,%ymm2,%ymm2 # "b1" - vpunpcklqdq %ymm6,%ymm4,%ymm7 # "b2" - vpunpckhqdq %ymm6,%ymm4,%ymm4 # "b3" - vshufi32x4 $0,%ymm5,%ymm1,%ymm19 # "de-interlace" further - vshufi32x4 $3,%ymm5,%ymm1,%ymm5 - vshufi32x4 $0,%ymm2,%ymm18,%ymm1 - vshufi32x4 $3,%ymm2,%ymm18,%ymm2 - vshufi32x4 $0,%ymm7,%ymm3,%ymm18 - vshufi32x4 $3,%ymm7,%ymm3,%ymm7 - vshufi32x4 $0,%ymm4,%ymm0,%ymm3 - vshufi32x4 $3,%ymm4,%ymm0,%ymm4 - vpaddd %ymm24,%ymm8,%ymm8 - vpaddd %ymm25,%ymm9,%ymm9 - vpaddd %ymm26,%ymm10,%ymm10 - vpaddd %ymm27,%ymm11,%ymm11 - - vpunpckldq %ymm9,%ymm8,%ymm6 - vpunpckldq %ymm11,%ymm10,%ymm0 - vpunpckhdq %ymm9,%ymm8,%ymm8 - vpunpckhdq %ymm11,%ymm10,%ymm10 - vpunpcklqdq %ymm0,%ymm6,%ymm9 # "c0" - vpunpckhqdq %ymm0,%ymm6,%ymm6 # "c1" - vpunpcklqdq %ymm10,%ymm8,%ymm11 # "c2" - vpunpckhqdq %ymm10,%ymm8,%ymm8 # "c3" - vpaddd %ymm28,%ymm12,%ymm12 - vpaddd %ymm29,%ymm13,%ymm13 - vpaddd %ymm30,%ymm14,%ymm14 - vpaddd %ymm31,%ymm15,%ymm15 - - vpunpckldq %ymm13,%ymm12,%ymm10 - vpunpckldq %ymm15,%ymm14,%ymm0 - vpunpckhdq 
%ymm13,%ymm12,%ymm12 - vpunpckhdq %ymm15,%ymm14,%ymm14 - vpunpcklqdq %ymm0,%ymm10,%ymm13 # "d0" - vpunpckhqdq %ymm0,%ymm10,%ymm10 # "d1" - vpunpcklqdq %ymm14,%ymm12,%ymm15 # "d2" - vpunpckhqdq %ymm14,%ymm12,%ymm12 # "d3" - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 # "de-interlace" further - vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 - vperm2i128 $0x20,%ymm10,%ymm6,%ymm9 - vperm2i128 $0x31,%ymm10,%ymm6,%ymm10 - vperm2i128 $0x20,%ymm15,%ymm11,%ymm6 - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - vperm2i128 $0x20,%ymm12,%ymm8,%ymm11 - vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - cmp $64*8,%rdx - jb .Ltail8xvl - - mov $0x80,%eax # size optimization - vpxord 0x00(%rsi),%ymm19,%ymm19 # xor with input - vpxor 0x20(%rsi),%ymm0,%ymm0 - vpxor 0x40(%rsi),%ymm5,%ymm5 - vpxor 0x60(%rsi),%ymm13,%ymm13 - lea (%rsi,%rax),%rsi # size optimization - vmovdqu32 %ymm19,0x00(%rdi) - vmovdqu %ymm0,0x20(%rdi) - vmovdqu %ymm5,0x40(%rdi) - vmovdqu %ymm13,0x60(%rdi) - lea (%rdi,%rax),%rdi # size optimization - - vpxor 0x00(%rsi),%ymm1,%ymm1 - vpxor 0x20(%rsi),%ymm9,%ymm9 - vpxor 0x40(%rsi),%ymm2,%ymm2 - vpxor 0x60(%rsi),%ymm10,%ymm10 - lea (%rsi,%rax),%rsi # size optimization - vmovdqu %ymm1,0x00(%rdi) - vmovdqu %ymm9,0x20(%rdi) - vmovdqu %ymm2,0x40(%rdi) - vmovdqu %ymm10,0x60(%rdi) - lea (%rdi,%rax),%rdi # size optimization - - vpxord 0x00(%rsi),%ymm18,%ymm18 - vpxor 0x20(%rsi),%ymm6,%ymm6 - vpxor 0x40(%rsi),%ymm7,%ymm7 - vpxor 0x60(%rsi),%ymm15,%ymm15 - lea (%rsi,%rax),%rsi # size optimization - vmovdqu32 %ymm18,0x00(%rdi) - vmovdqu %ymm6,0x20(%rdi) - vmovdqu %ymm7,0x40(%rdi) - vmovdqu %ymm15,0x60(%rdi) - lea (%rdi,%rax),%rdi # size optimization - - vpxor 0x00(%rsi),%ymm3,%ymm3 - vpxor 0x20(%rsi),%ymm11,%ymm11 - vpxor 0x40(%rsi),%ymm4,%ymm4 - vpxor 0x60(%rsi),%ymm12,%ymm12 - lea (%rsi,%rax),%rsi # size optimization - vmovdqu %ymm3,0x00(%rdi) - vmovdqu %ymm11,0x20(%rdi) - vmovdqu %ymm4,0x40(%rdi) - vmovdqu %ymm12,0x60(%rdi) - lea (%rdi,%rax),%rdi # size optimization - - vpbroadcastd 0(%r9),%ymm0 # reload key - vpbroadcastd 4(%r9),%ymm1 - - sub $64*8,%rdx - jnz .Loop_outer8xvl - - jmp .Ldone8xvl - -.align 32 -.Ltail8xvl: - vmovdqa64 %ymm19,%ymm8 # size optimization - xor %r9,%r9 - sub %rsi,%rdi - cmp $64*1,%rdx - jb .Less_than_64_8xvl - vpxor 0x00(%rsi),%ymm8,%ymm8 # xor with input - vpxor 0x20(%rsi),%ymm0,%ymm0 - vmovdqu %ymm8,0x00(%rdi,%rsi) - vmovdqu %ymm0,0x20(%rdi,%rsi) - je .Ldone8xvl - vmovdqa %ymm5,%ymm8 - vmovdqa %ymm13,%ymm0 - lea 64(%rsi),%rsi - - cmp $64*2,%rdx - jb .Less_than_64_8xvl - vpxor 0x00(%rsi),%ymm5,%ymm5 - vpxor 0x20(%rsi),%ymm13,%ymm13 - vmovdqu %ymm5,0x00(%rdi,%rsi) - vmovdqu %ymm13,0x20(%rdi,%rsi) - je .Ldone8xvl - vmovdqa %ymm1,%ymm8 - vmovdqa %ymm9,%ymm0 - lea 64(%rsi),%rsi - - cmp $64*3,%rdx - jb .Less_than_64_8xvl - vpxor 0x00(%rsi),%ymm1,%ymm1 - vpxor 0x20(%rsi),%ymm9,%ymm9 - vmovdqu %ymm1,0x00(%rdi,%rsi) - vmovdqu %ymm9,0x20(%rdi,%rsi) - je .Ldone8xvl - vmovdqa %ymm2,%ymm8 - vmovdqa %ymm10,%ymm0 - lea 64(%rsi),%rsi - - cmp $64*4,%rdx - jb .Less_than_64_8xvl - vpxor 0x00(%rsi),%ymm2,%ymm2 - vpxor 0x20(%rsi),%ymm10,%ymm10 - vmovdqu %ymm2,0x00(%rdi,%rsi) - vmovdqu %ymm10,0x20(%rdi,%rsi) - je .Ldone8xvl - vmovdqa32 %ymm18,%ymm8 - vmovdqa %ymm6,%ymm0 - lea 64(%rsi),%rsi - - cmp $64*5,%rdx - jb .Less_than_64_8xvl - vpxord 0x00(%rsi),%ymm18,%ymm18 - vpxor 0x20(%rsi),%ymm6,%ymm6 - vmovdqu32 %ymm18,0x00(%rdi,%rsi) - vmovdqu %ymm6,0x20(%rdi,%rsi) - je .Ldone8xvl - vmovdqa %ymm7,%ymm8 - vmovdqa %ymm15,%ymm0 - lea 64(%rsi),%rsi - - cmp $64*6,%rdx - jb .Less_than_64_8xvl - vpxor 0x00(%rsi),%ymm7,%ymm7 - vpxor 
0x20(%rsi),%ymm15,%ymm15 - vmovdqu %ymm7,0x00(%rdi,%rsi) - vmovdqu %ymm15,0x20(%rdi,%rsi) - je .Ldone8xvl - vmovdqa %ymm3,%ymm8 - vmovdqa %ymm11,%ymm0 - lea 64(%rsi),%rsi - - cmp $64*7,%rdx - jb .Less_than_64_8xvl - vpxor 0x00(%rsi),%ymm3,%ymm3 - vpxor 0x20(%rsi),%ymm11,%ymm11 - vmovdqu %ymm3,0x00(%rdi,%rsi) - vmovdqu %ymm11,0x20(%rdi,%rsi) - je .Ldone8xvl - vmovdqa %ymm4,%ymm8 - vmovdqa %ymm12,%ymm0 - lea 64(%rsi),%rsi - -.Less_than_64_8xvl: - vmovdqa %ymm8,0x00(%rsp) - vmovdqa %ymm0,0x20(%rsp) - lea (%rdi,%rsi),%rdi - and $63,%rdx - -.Loop_tail8xvl: - movzb (%rsi,%r9),%eax - movzb (%rsp,%r9),%ecx - lea 1(%r9),%r9 - xor %ecx,%eax - mov %al,-1(%rdi,%r9) - dec %rdx - jnz .Loop_tail8xvl - - vpxor %ymm8,%ymm8,%ymm8 - vmovdqa %ymm8,0x00(%rsp) - vmovdqa %ymm8,0x20(%rsp) - -.Ldone8xvl: - vzeroall - lea -8(%r10),%rsp -.L8xvl_epilogue: - ret -.size chacha20_8xvl,.-chacha20_8xvl -#endif diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm-glue.c b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm-glue.c deleted file mode 100644 index 41e2e79abb2b..000000000000 --- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm-glue.c +++ /dev/null @@ -1,98 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - */ - -#include <asm/hwcap.h> -#include <asm/neon.h> -#if defined(CONFIG_ZINC_ARCH_ARM) -#include <asm/system_info.h> -#include <asm/cputype.h> -#endif - -asmlinkage void chacha20_arm(u8 *out, const u8 *in, const size_t len, - const u32 key[8], const u32 counter[4]); -asmlinkage void hchacha20_arm(const u32 state[16], u32 out[8]); -asmlinkage void chacha20_neon(u8 *out, const u8 *in, const size_t len, - const u32 key[8], const u32 counter[4]); - -static bool chacha20_use_neon __ro_after_init; -static bool *const chacha20_nobs[] __initconst = { &chacha20_use_neon }; -static void __init chacha20_fpu_init(void) -{ -#if defined(CONFIG_ZINC_ARCH_ARM64) - chacha20_use_neon = cpu_have_named_feature(ASIMD); -#elif defined(CONFIG_ZINC_ARCH_ARM) - switch (read_cpuid_part()) { - case ARM_CPU_PART_CORTEX_A7: - case ARM_CPU_PART_CORTEX_A5: - /* The Cortex-A7 and Cortex-A5 do not perform well with the NEON - * implementation but do incredibly with the scalar one and use - * less power. - */ - break; - default: - chacha20_use_neon = elf_hwcap & HWCAP_NEON; - } -#endif -} - -static inline bool chacha20_arch(struct chacha20_ctx *ctx, u8 *dst, - const u8 *src, size_t len, - simd_context_t *simd_context) -{ - /* SIMD disables preemption, so relax after processing each page. 
*/ - BUILD_BUG_ON(PAGE_SIZE < CHACHA20_BLOCK_SIZE || - PAGE_SIZE % CHACHA20_BLOCK_SIZE); - - for (;;) { - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && chacha20_use_neon && - len >= CHACHA20_BLOCK_SIZE * 3 && simd_use(simd_context)) { - const size_t bytes = min_t(size_t, len, PAGE_SIZE); - - chacha20_neon(dst, src, bytes, ctx->key, ctx->counter); - ctx->counter[0] += (bytes + 63) / 64; - len -= bytes; - if (!len) - break; - dst += bytes; - src += bytes; - simd_relax(simd_context); - } else { - chacha20_arm(dst, src, len, ctx->key, ctx->counter); - ctx->counter[0] += (len + 63) / 64; - break; - } - } - - return true; -} - -static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS], - const u8 nonce[HCHACHA20_NONCE_SIZE], - const u8 key[HCHACHA20_KEY_SIZE], - simd_context_t *simd_context) -{ - if (IS_ENABLED(CONFIG_ZINC_ARCH_ARM)) { - u32 x[] = { CHACHA20_CONSTANT_EXPA, - CHACHA20_CONSTANT_ND_3, - CHACHA20_CONSTANT_2_BY, - CHACHA20_CONSTANT_TE_K, - get_unaligned_le32(key + 0), - get_unaligned_le32(key + 4), - get_unaligned_le32(key + 8), - get_unaligned_le32(key + 12), - get_unaligned_le32(key + 16), - get_unaligned_le32(key + 20), - get_unaligned_le32(key + 24), - get_unaligned_le32(key + 28), - get_unaligned_le32(nonce + 0), - get_unaligned_le32(nonce + 4), - get_unaligned_le32(nonce + 8), - get_unaligned_le32(nonce + 12) - }; - hchacha20_arm(x, derived_key); - return true; - } - return false; -} diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm.pl b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm.pl deleted file mode 100755 index 6785383ab7bb..000000000000 --- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm.pl +++ /dev/null @@ -1,1227 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause -# -# This code is taken from the OpenSSL project but the author, Andy Polyakov, -# has relicensed it under the licenses specified in the SPDX header above. -# The original headers, including the original license headers, are -# included below for completeness. -# -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# December 2014 -# -# ChaCha20 for ARMv4. -# -# September 2018 -# -# Improve scalar performance per Eric Biggers' suggestion to eliminate -# separate rotates. This requires b[0..3] and d[0..3] to be maintained -# pre-rotated, hence odd twists prior inner loop and when accumulating -# key material. Since amount of instructions is reduced as result, even -# NEON performance is improved somewhat, most notably by ~9% on low-end -# Cortex-A5/A7. Full unroll was shown to provide even better scalar -# performance on Cortex-A5/A7, naturally at the cost of manyfold size -# increase. We let it be. Oversized code works in benchmarks, but is not -# necessarily optimal in real life, when it's likely to be out-of-cache -# upon entry and evict significant part of cache upon completion. -# -# Performance in cycles per byte out of large buffer. 
-# -# IALU/gcc-4.4 1xNEON 3xNEON+1xIALU -# -# Cortex-A5 14.2(*)/+160% 21.8 12.9(**) -# Cortex-A8 10.2(*)/+190% 13.9 6.10 -# Cortex-A9 10.8(*)/+150% 14.3 6.50 -# Cortex-A15 11.0/+40% 16.0 4.90 -# Snapdragon S4 13.9(***)/+90% 13.6 4.90 -# -# (*) most "favourable" result for aligned data on little-endian -# processor, result for misaligned data is 10-15% lower; -# (**) pure 4xNEON [with "vertical" layout] was shown to provide ~8% -# better performance on Cortex-A5/A7, but not on others; -# (***) it's 17% slower than original, trade-off is considered -# acceptable, because of improvement on others, specifically -# +36% on Cortex-A5/A7 and +20% on Cortex-A9; - -$flavour = shift; -if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } -else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -sub AUTOLOAD() # thunk [simplified] x86-style perlasm -{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; - my $arg = pop; - $arg = "#$arg" if ($arg*1 eq $arg); - $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; -} - -my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x")); -my @t=map("r$_",(8..11)); - -sub ROUND { -my ($a0,$b0,$c0,$d0)=@_; -my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); -my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); -my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); -my $odd = $d0&1; -my ($xc,$xc_) = (@t[0..1]); -my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]); -my @ret; - - # Consider order in which variables are addressed by their - # index: - # - # a b c d - # - # 0 4 8 12 < even round - # 1 5 9 13 - # 2 6 10 14 - # 3 7 11 15 - # 0 5 10 15 < odd round - # 1 6 11 12 - # 2 7 8 13 - # 3 4 9 14 - # - # 'a', 'b' are permanently allocated in registers, @x[0..7], - # while 'c's and pair of 'd's are maintained in memory. If - # you observe 'c' column, you'll notice that pair of 'c's is - # invariant between rounds. This means that we have to reload - # them once per round, in the middle. This is why you'll see - # bunch of 'c' stores and loads in the middle, but none in - # the beginning or end. If you observe 'd' column, you'll - # notice that 15 and 13 are reused in next pair of rounds. - # This is why these two are chosen for offloading to memory, - # to make loads count more. 
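Reviewer note: for anyone tracing the ROUND generator that follows, the even/odd addressing table above is just the standard ChaCha20 column/diagonal round schedule. A minimal C reference sketch of that schedule follows; it is illustrative only and not part of this driver, and it omits the pre-rotated b/d register trick that the generated ARM code uses.

#include <stdint.h>

#define ROTL32(v, n)	(((v) << (n)) | ((v) >> (32 - (n))))

/* One ChaCha20 quarter-round: the add/xor/rotate sequence that the
 * generated assembly applies with pre-rotated b/d registers. */
#define QR(a, b, c, d)						\
	do {							\
		a += b; d ^= a; d = ROTL32(d, 16);		\
		c += d; b ^= c; b = ROTL32(b, 12);		\
		a += b; d ^= a; d = ROTL32(d, 8);		\
		c += d; b ^= c; b = ROTL32(b, 7);		\
	} while (0)

/* One double round over the 4x4 state: columns (even round), then
 * diagonals (odd round), matching &ROUND(0,4,8,12) / &ROUND(0,5,10,15)
 * in the generator below. */
static void
chacha_double_round(uint32_t x[16])
{
	QR(x[0], x[4], x[8],  x[12]);
	QR(x[1], x[5], x[9],  x[13]);
	QR(x[2], x[6], x[10], x[14]);
	QR(x[3], x[7], x[11], x[15]);
	QR(x[0], x[5], x[10], x[15]);
	QR(x[1], x[6], x[11], x[12]);
	QR(x[2], x[7], x[8],  x[13]);
	QR(x[3], x[4], x[9],  x[14]);
}

Ten iterations of chacha_double_round() give the 20 rounds that the "mov @t[3],#10" loop counters in these listings implement, two rounds per iteration.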
- push @ret,( - "&add (@x[$a0],@x[$a0],@x[$b0],'ror#13')", - "&add (@x[$a1],@x[$a1],@x[$b1],'ror#13')", - "&eor ($xd,@x[$a0],$xd,'ror#24')", - "&eor ($xd_,@x[$a1],$xd_,'ror#24')", - - "&add ($xc,$xc,$xd,'ror#16')", - "&add ($xc_,$xc_,$xd_,'ror#16')", - "&eor (@x[$b0],$xc, @x[$b0],'ror#13')", - "&eor (@x[$b1],$xc_,@x[$b1],'ror#13')", - - "&add (@x[$a0],@x[$a0],@x[$b0],'ror#20')", - "&add (@x[$a1],@x[$a1],@x[$b1],'ror#20')", - "&eor ($xd,@x[$a0],$xd,'ror#16')", - "&eor ($xd_,@x[$a1],$xd_,'ror#16')" ); - push @ret,( - "&str ($xd,'[sp,#4*(16+$d0)]')" ) if ($odd); - push @ret,( - "&add ($xc,$xc,$xd,'ror#24')" ); - push @ret,( - "&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd); - push @ret,( - "&str ($xd_,'[sp,#4*(16+$d1)]')" ) if (!$odd); - push @ret,( - "&add ($xc_,$xc_,$xd_,'ror#24')" ); - push @ret,( - "&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd); - push @ret,( - "&str ($xc,'[sp,#4*(16+$c0)]')", - "&eor (@x[$b0],@x[$b0],$xc,'ror#12')", - "&str ($xc_,'[sp,#4*(16+$c1)]')", - "&eor (@x[$b1],@x[$b1],$xc_,'ror#12')" ); - - $xd=@x[$d2] if (!$odd); - $xd_=@x[$d3] if ($odd); - push @ret,( - "&ldr ($xc,'[sp,#4*(16+$c2)]')", - "&add (@x[$a2],@x[$a2],@x[$b2],'ror#13')", - "&ldr ($xc_,'[sp,#4*(16+$c3)]')", - "&add (@x[$a3],@x[$a3],@x[$b3],'ror#13')", - "&eor ($xd,@x[$a2],$xd,'ror#24')", - "&eor ($xd_,@x[$a3],$xd_,'ror#24')", - - "&add ($xc,$xc,$xd,'ror#16')", - "&add ($xc_,$xc_,$xd_,'ror#16')", - "&eor (@x[$b2],$xc, @x[$b2],'ror#13')", - "&eor (@x[$b3],$xc_,@x[$b3],'ror#13')", - - "&add (@x[$a2],@x[$a2],@x[$b2],'ror#20')", - "&add (@x[$a3],@x[$a3],@x[$b3],'ror#20')", - "&eor ($xd,@x[$a2],$xd,'ror#16')", - "&eor ($xd_,@x[$a3],$xd_,'ror#16')", - - "&add ($xc,$xc,$xd,'ror#24')", - "&add ($xc_,$xc_,$xd_,'ror#24')", - "&eor (@x[$b2],@x[$b2],$xc,'ror#12')", - "&eor (@x[$b3],@x[$b3],$xc_,'ror#12')" ); - - @ret; -} - -$code.=<<___; -#ifndef __KERNEL__ -# include "arm_arch.h" -#else -# define __ARM_ARCH__ __LINUX_ARM_ARCH__ -# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ -# define ChaCha20_ctr32 chacha20_arm_cryptogams -# define ChaCha20_neon chacha20_neon -#endif - -.text -#if defined(__thumb2__) || defined(__clang__) -.syntax unified -# define ldrhsb ldrbhs -#endif -#if defined(__thumb2__) -.thumb -#else -.code 32 -#endif - -.align 5 -.Lsigma: -.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral -.Lone: -.long 1,0,0,0 -.Lrot8: -.long 0x02010003,0x06050407 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-.LChaCha20_ctr32 -#else -.word -1 -#endif - -.globl ChaCha20_ctr32 -.type ChaCha20_ctr32,%function -.align 5 -ChaCha20_ctr32: -.LChaCha20_ctr32: - ldr r12,[sp,#0] @ pull pointer to counter and nonce - stmdb sp!,{r0-r2,r4-r11,lr} -#if __ARM_ARCH__<7 && !defined(__thumb2__) - sub r14,pc,#16 @ ChaCha20_ctr32 -#else - adr r14,.LChaCha20_ctr32 -#endif - cmp r2,#0 @ len==0? 
-#ifdef __thumb2__ - itt eq -#endif - addeq sp,sp,#4*3 - beq .Lno_data -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - cmp r2,#192 @ test len - bls .Lshort - ldr r4,[r14,#-24] - ldr r4,[r14,r4] -# ifdef __APPLE__ - ldr r4,[r4] -# endif - tst r4,#ARMV7_NEON - bne .LChaCha20_neon -.Lshort: -#endif - ldmia r12,{r4-r7} @ load counter and nonce - sub sp,sp,#4*(16) @ off-load area - sub r14,r14,#64 @ .Lsigma - stmdb sp!,{r4-r7} @ copy counter and nonce - ldmia r3,{r4-r11} @ load key - ldmia r14,{r0-r3} @ load sigma - stmdb sp!,{r4-r11} @ copy key - stmdb sp!,{r0-r3} @ copy sigma - str r10,[sp,#4*(16+10)] @ off-load "@x[10]" - str r11,[sp,#4*(16+11)] @ off-load "@x[11]" - b .Loop_outer_enter - -.align 4 -.Loop_outer: - ldmia sp,{r0-r9} @ load key material - str @t[3],[sp,#4*(32+2)] @ save len - str r12, [sp,#4*(32+1)] @ save inp - str r14, [sp,#4*(32+0)] @ save out -.Loop_outer_enter: - ldr @t[3], [sp,#4*(15)] - mov @x[4],@x[4],ror#19 @ twist b[0..3] - ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load - mov @x[5],@x[5],ror#19 - ldr @t[2], [sp,#4*(13)] - mov @x[6],@x[6],ror#19 - ldr @x[14],[sp,#4*(14)] - mov @x[7],@x[7],ror#19 - mov @t[3],@t[3],ror#8 @ twist d[0..3] - mov @x[12],@x[12],ror#8 - mov @t[2],@t[2],ror#8 - mov @x[14],@x[14],ror#8 - str @t[3], [sp,#4*(16+15)] - mov @t[3],#10 - b .Loop - -.align 4 -.Loop: - subs @t[3],@t[3],#1 -___ - foreach (&ROUND(0, 4, 8,12)) { eval; } - foreach (&ROUND(0, 5,10,15)) { eval; } -$code.=<<___; - bne .Loop - - ldr @t[3],[sp,#4*(32+2)] @ load len - - str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store - str @t[1], [sp,#4*(16+9)] - str @x[12],[sp,#4*(16+12)] - str @t[2], [sp,#4*(16+13)] - str @x[14],[sp,#4*(16+14)] - - @ at this point we have first half of 512-bit result in - @ @x[0-7] and second half at sp+4*(16+8) - - cmp @t[3],#64 @ done yet? -#ifdef __thumb2__ - itete lo -#endif - addlo r12,sp,#4*(0) @ shortcut or ... - ldrhs r12,[sp,#4*(32+1)] @ ... load inp - addlo r14,sp,#4*(0) @ shortcut or ... - ldrhs r14,[sp,#4*(32+0)] @ ... load out - - ldr @t[0],[sp,#4*(0)] @ load key material - ldr @t[1],[sp,#4*(1)] - -#if __ARM_ARCH__>=6 || !defined(__ARMEB__) -# if __ARM_ARCH__<7 - orr @t[2],r12,r14 - tst @t[2],#3 @ are input and output aligned? 
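Reviewer note: the orr/tst pair above gates the fast word-at-a-time path on both pointers being word-aligned; the bne that follows diverts to the byte-oriented .Lunaligned path otherwise. In C terms the test amounts to the following sketch (names hypothetical):

#include <stdint.h>

/* Sketch of the alignment gate: take the word-at-a-time path only
 * when both input and output pointers are 4-byte aligned. */
static inline int
io_word_aligned(const void *src, const void *dst)
{
	return ((((uintptr_t)src | (uintptr_t)dst) & 3) == 0);
}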
- ldr @t[2],[sp,#4*(2)] - bne .Lunaligned - cmp @t[3],#64 @ restore flags -# else - ldr @t[2],[sp,#4*(2)] -# endif - ldr @t[3],[sp,#4*(3)] - - add @x[0],@x[0],@t[0] @ accumulate key material - add @x[1],@x[1],@t[1] -# ifdef __thumb2__ - itt hs -# endif - ldrhs @t[0],[r12],#16 @ load input - ldrhs @t[1],[r12,#-12] - - add @x[2],@x[2],@t[2] - add @x[3],@x[3],@t[3] -# ifdef __thumb2__ - itt hs -# endif - ldrhs @t[2],[r12,#-8] - ldrhs @t[3],[r12,#-4] -# if __ARM_ARCH__>=6 && defined(__ARMEB__) - rev @x[0],@x[0] - rev @x[1],@x[1] - rev @x[2],@x[2] - rev @x[3],@x[3] -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs @x[0],@x[0],@t[0] @ xor with input - eorhs @x[1],@x[1],@t[1] - add @t[0],sp,#4*(4) - str @x[0],[r14],#16 @ store output -# ifdef __thumb2__ - itt hs -# endif - eorhs @x[2],@x[2],@t[2] - eorhs @x[3],@x[3],@t[3] - ldmia @t[0],{@t[0]-@t[3]} @ load key material - str @x[1],[r14,#-12] - str @x[2],[r14,#-8] - str @x[3],[r14,#-4] - - add @x[4],@t[0],@x[4],ror#13 @ accumulate key material - add @x[5],@t[1],@x[5],ror#13 -# ifdef __thumb2__ - itt hs -# endif - ldrhs @t[0],[r12],#16 @ load input - ldrhs @t[1],[r12,#-12] - add @x[6],@t[2],@x[6],ror#13 - add @x[7],@t[3],@x[7],ror#13 -# ifdef __thumb2__ - itt hs -# endif - ldrhs @t[2],[r12,#-8] - ldrhs @t[3],[r12,#-4] -# if __ARM_ARCH__>=6 && defined(__ARMEB__) - rev @x[4],@x[4] - rev @x[5],@x[5] - rev @x[6],@x[6] - rev @x[7],@x[7] -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs @x[4],@x[4],@t[0] - eorhs @x[5],@x[5],@t[1] - add @t[0],sp,#4*(8) - str @x[4],[r14],#16 @ store output -# ifdef __thumb2__ - itt hs -# endif - eorhs @x[6],@x[6],@t[2] - eorhs @x[7],@x[7],@t[3] - str @x[5],[r14,#-12] - ldmia @t[0],{@t[0]-@t[3]} @ load key material - str @x[6],[r14,#-8] - add @x[0],sp,#4*(16+8) - str @x[7],[r14,#-4] - - ldmia @x[0],{@x[0]-@x[7]} @ load second half - - add @x[0],@x[0],@t[0] @ accumulate key material - add @x[1],@x[1],@t[1] -# ifdef __thumb2__ - itt hs -# endif - ldrhs @t[0],[r12],#16 @ load input - ldrhs @t[1],[r12,#-12] -# ifdef __thumb2__ - itt hi -# endif - strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it - strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it - add @x[2],@x[2],@t[2] - add @x[3],@x[3],@t[3] -# ifdef __thumb2__ - itt hs -# endif - ldrhs @t[2],[r12,#-8] - ldrhs @t[3],[r12,#-4] -# if __ARM_ARCH__>=6 && defined(__ARMEB__) - rev @x[0],@x[0] - rev @x[1],@x[1] - rev @x[2],@x[2] - rev @x[3],@x[3] -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs @x[0],@x[0],@t[0] - eorhs @x[1],@x[1],@t[1] - add @t[0],sp,#4*(12) - str @x[0],[r14],#16 @ store output -# ifdef __thumb2__ - itt hs -# endif - eorhs @x[2],@x[2],@t[2] - eorhs @x[3],@x[3],@t[3] - str @x[1],[r14,#-12] - ldmia @t[0],{@t[0]-@t[3]} @ load key material - str @x[2],[r14,#-8] - str @x[3],[r14,#-4] - - add @x[4],@t[0],@x[4],ror#24 @ accumulate key material - add @x[5],@t[1],@x[5],ror#24 -# ifdef __thumb2__ - itt hi -# endif - addhi @t[0],@t[0],#1 @ next counter value - strhi @t[0],[sp,#4*(12)] @ save next counter value -# ifdef __thumb2__ - itt hs -# endif - ldrhs @t[0],[r12],#16 @ load input - ldrhs @t[1],[r12,#-12] - add @x[6],@t[2],@x[6],ror#24 - add @x[7],@t[3],@x[7],ror#24 -# ifdef __thumb2__ - itt hs -# endif - ldrhs @t[2],[r12,#-8] - ldrhs @t[3],[r12,#-4] -# if __ARM_ARCH__>=6 && defined(__ARMEB__) - rev @x[4],@x[4] - rev @x[5],@x[5] - rev @x[6],@x[6] - rev @x[7],@x[7] -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs @x[4],@x[4],@t[0] - eorhs @x[5],@x[5],@t[1] -# ifdef __thumb2__ - it ne -# endif - ldrne @t[0],[sp,#4*(32+2)] @ re-load len -# 
ifdef __thumb2__ - itt hs -# endif - eorhs @x[6],@x[6],@t[2] - eorhs @x[7],@x[7],@t[3] - str @x[4],[r14],#16 @ store output - str @x[5],[r14,#-12] -# ifdef __thumb2__ - it hs -# endif - subhs @t[3],@t[0],#64 @ len-=64 - str @x[6],[r14,#-8] - str @x[7],[r14,#-4] - bhi .Loop_outer - - beq .Ldone -# if __ARM_ARCH__<7 - b .Ltail - -.align 4 -.Lunaligned: @ unaligned endian-neutral path - cmp @t[3],#64 @ restore flags -# endif -#endif -#if __ARM_ARCH__<7 - ldr @t[3],[sp,#4*(3)] -___ -for ($i=0;$i<16;$i+=4) { -my $j=$i&0x7; -my $twist=""; -if ($i==4) { $twist = ",ror#13"; } -elsif ($i==12) { $twist = ",ror#24"; } - -$code.=<<___ if ($i==4); - add @x[0],sp,#4*(16+8) -___ -$code.=<<___ if ($i==8); - ldmia @x[0],{@x[0]-@x[7]} @ load second half -# ifdef __thumb2__ - itt hi -# endif - strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" - strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" -___ -$code.=<<___; - add @x[$j+0],@t[0],@x[$j+0]$twist @ accumulate key material -___ -$code.=<<___ if ($i==12); -# ifdef __thumb2__ - itt hi -# endif - addhi @t[0],@t[0],#1 @ next counter value - strhi @t[0],[sp,#4*(12)] @ save next counter value -___ -$code.=<<___; - add @x[$j+1],@t[1],@x[$j+1]$twist - add @x[$j+2],@t[2],@x[$j+2]$twist -# ifdef __thumb2__ - itete lo -# endif - eorlo @t[0],@t[0],@t[0] @ zero or ... - ldrhsb @t[0],[r12],#16 @ ... load input - eorlo @t[1],@t[1],@t[1] - ldrhsb @t[1],[r12,#-12] - - add @x[$j+3],@t[3],@x[$j+3]$twist -# ifdef __thumb2__ - itete lo -# endif - eorlo @t[2],@t[2],@t[2] - ldrhsb @t[2],[r12,#-8] - eorlo @t[3],@t[3],@t[3] - ldrhsb @t[3],[r12,#-4] - - eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero) - eor @x[$j+1],@t[1],@x[$j+1] -# ifdef __thumb2__ - itt hs -# endif - ldrhsb @t[0],[r12,#-15] @ load more input - ldrhsb @t[1],[r12,#-11] - eor @x[$j+2],@t[2],@x[$j+2] - strb @x[$j+0],[r14],#16 @ store output - eor @x[$j+3],@t[3],@x[$j+3] -# ifdef __thumb2__ - itt hs -# endif - ldrhsb @t[2],[r12,#-7] - ldrhsb @t[3],[r12,#-3] - strb @x[$j+1],[r14,#-12] - eor @x[$j+0],@t[0],@x[$j+0],lsr#8 - strb @x[$j+2],[r14,#-8] - eor @x[$j+1],@t[1],@x[$j+1],lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb @t[0],[r12,#-14] @ load more input - ldrhsb @t[1],[r12,#-10] - strb @x[$j+3],[r14,#-4] - eor @x[$j+2],@t[2],@x[$j+2],lsr#8 - strb @x[$j+0],[r14,#-15] - eor @x[$j+3],@t[3],@x[$j+3],lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb @t[2],[r12,#-6] - ldrhsb @t[3],[r12,#-2] - strb @x[$j+1],[r14,#-11] - eor @x[$j+0],@t[0],@x[$j+0],lsr#8 - strb @x[$j+2],[r14,#-7] - eor @x[$j+1],@t[1],@x[$j+1],lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb @t[0],[r12,#-13] @ load more input - ldrhsb @t[1],[r12,#-9] - strb @x[$j+3],[r14,#-3] - eor @x[$j+2],@t[2],@x[$j+2],lsr#8 - strb @x[$j+0],[r14,#-14] - eor @x[$j+3],@t[3],@x[$j+3],lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb @t[2],[r12,#-5] - ldrhsb @t[3],[r12,#-1] - strb @x[$j+1],[r14,#-10] - strb @x[$j+2],[r14,#-6] - eor @x[$j+0],@t[0],@x[$j+0],lsr#8 - strb @x[$j+3],[r14,#-2] - eor @x[$j+1],@t[1],@x[$j+1],lsr#8 - strb @x[$j+0],[r14,#-13] - eor @x[$j+2],@t[2],@x[$j+2],lsr#8 - strb @x[$j+1],[r14,#-9] - eor @x[$j+3],@t[3],@x[$j+3],lsr#8 - strb @x[$j+2],[r14,#-5] - strb @x[$j+3],[r14,#-1] -___ -$code.=<<___ if ($i<12); - add @t[0],sp,#4*(4+$i) - ldmia @t[0],{@t[0]-@t[3]} @ load key material -___ -} -$code.=<<___; -# ifdef __thumb2__ - it ne -# endif - ldrne @t[0],[sp,#4*(32+2)] @ re-load len -# ifdef __thumb2__ - it hs -# endif - subhs @t[3],@t[0],#64 @ len-=64 - bhi .Loop_outer - - beq .Ldone -#endif - -.Ltail: - ldr r12,[sp,#4*(32+1)] @ load inp - add 
@t[1],sp,#4*(0) - ldr r14,[sp,#4*(32+0)] @ load out - -.Loop_tail: - ldrb @t[2],[@t[1]],#1 @ read buffer on stack - ldrb @t[3],[r12],#1 @ read input - subs @t[0],@t[0],#1 - eor @t[3],@t[3],@t[2] - strb @t[3],[r14],#1 @ store output - bne .Loop_tail - -.Ldone: - add sp,sp,#4*(32+3) -.Lno_data: -#if __ARM_ARCH__>=5 - ldmia sp!,{r4-r11,pc} -#else - ldmia sp!,{r4-r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .long 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -.size ChaCha20_ctr32,.-ChaCha20_ctr32 -___ - -{{{ -my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) = - map("q$_",(0..15)); - -# This can replace vshr-by-24+vsli-by-8. It gives ~3% improvement on -# Cortex-A5/A7, but hurts Cortex-A9 by 5% and Snapdragon S4 by 14%! -sub vperm() -{ my ($dst,$src,$tbl) = @_; - $code .= " vtbl.8 $dst#lo,{$src#lo},$tbl#lo\n"; - $code .= " vtbl.8 $dst#hi,{$src#hi},$tbl#lo\n"; -} - -sub NEONROUND { -my $odd = pop; -my ($a,$b,$c,$d,$t)=@_; - - ( - "&vadd_i32 ($a,$a,$b)", - "&veor ($d,$d,$a)", - "&vrev32_16 ($d,$d)", # vrot ($d,16) - - "&vadd_i32 ($c,$c,$d)", - "&veor ($t,$b,$c)", - "&vshr_u32 ($b,$t,20)", - "&vsli_32 ($b,$t,12)", - - "&vadd_i32 ($a,$a,$b)", - "&veor ($t,$d,$a)", - "&vshr_u32 ($d,$t,24)", - "&vsli_32 ($d,$t,8)", - #"&vperm ($d,$t,$t3)", - - "&vadd_i32 ($c,$c,$d)", - "&veor ($t,$b,$c)", - "&vshr_u32 ($b,$t,25)", - "&vsli_32 ($b,$t,7)", - - "&vext_8 ($a,$a,$a,$odd?4:12)", - "&vext_8 ($d,$d,$d,8)", - "&vext_8 ($c,$c,$c,$odd?12:4)" - ); -} - -$code.=<<___; -#if (defined(__KERNEL__) && defined(CONFIG_KERNEL_MODE_NEON)) || (!defined(__KERNEL__) && __ARM_MAX_ARCH__>=7) -.arch armv7-a -.fpu neon - -# ifdef __KERNEL__ -.globl ChaCha20_neon -@ For optimal performance it's appropriate for caller to enforce -@ minimum input length, 193 bytes is suggested. -# endif -.type ChaCha20_neon,%function -.align 5 -ChaCha20_neon: - ldr r12,[sp,#0] @ pull pointer to counter and nonce - stmdb sp!,{r0-r2,r4-r11,lr} -.LChaCha20_neon: - adr r14,.Lsigma - vstmdb sp!,{d8-d15} @ ABI spec says so - stmdb sp!,{r0-r3} - - vld1.32 {$b0-$c0},[r3] @ load key - ldmia r3,{r4-r11} @ load key - - sub sp,sp,#4*(16+16) - vld1.32 {$d0},[r12] @ load counter and nonce - add r12,sp,#4*8 - ldmia r14,{r0-r3} @ load sigma - vld1.32 {$a0},[r14]! @ load sigma - vld1.32 {$t0},[r14]! 
@ one - @ vld1.32 {$t3#lo},[r14] @ rot8 - vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce - vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key - - str r10,[sp,#4*(16+10)] @ off-load "@x[10]" - str r11,[sp,#4*(16+11)] @ off-load "@x[11]" - vshl.i32 $t1#lo,$t0#lo,#1 @ two - vstr $t0#lo,[sp,#4*(16+0)] - vshl.i32 $t2#lo,$t0#lo,#2 @ four - vstr $t1#lo,[sp,#4*(16+2)] - vmov $a1,$a0 - vstr $t2#lo,[sp,#4*(16+4)] - vmov $a2,$a0 - @ vstr $t3#lo,[sp,#4*(16+6)] - vmov $b1,$b0 - vmov $b2,$b0 - b .Loop_neon_enter - -.align 4 -.Loop_neon_outer: - ldmia sp,{r0-r9} @ load key material - cmp @t[3],#64*2 @ if len<=64*2 - bls .Lbreak_neon @ switch to integer-only - @ vldr $t3#lo,[sp,#4*(16+6)] @ rot8 - vmov $a1,$a0 - str @t[3],[sp,#4*(32+2)] @ save len - vmov $a2,$a0 - str r12, [sp,#4*(32+1)] @ save inp - vmov $b1,$b0 - str r14, [sp,#4*(32+0)] @ save out - vmov $b2,$b0 -.Loop_neon_enter: - ldr @t[3], [sp,#4*(15)] - mov @x[4],@x[4],ror#19 @ twist b[0..3] - vadd.i32 $d1,$d0,$t0 @ counter+1 - ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load - mov @x[5],@x[5],ror#19 - vmov $c1,$c0 - ldr @t[2], [sp,#4*(13)] - mov @x[6],@x[6],ror#19 - vmov $c2,$c0 - ldr @x[14],[sp,#4*(14)] - mov @x[7],@x[7],ror#19 - vadd.i32 $d2,$d1,$t0 @ counter+2 - add @x[12],@x[12],#3 @ counter+3 - mov @t[3],@t[3],ror#8 @ twist d[0..3] - mov @x[12],@x[12],ror#8 - mov @t[2],@t[2],ror#8 - mov @x[14],@x[14],ror#8 - str @t[3], [sp,#4*(16+15)] - mov @t[3],#10 - b .Loop_neon - -.align 4 -.Loop_neon: - subs @t[3],@t[3],#1 -___ - my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0); - my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0); - my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0); - my @thread3=&ROUND(0,4,8,12); - - foreach (@thread0) { - eval; eval(shift(@thread3)); - eval(shift(@thread1)); eval(shift(@thread3)); - eval(shift(@thread2)); eval(shift(@thread3)); - } - - @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1); - @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1); - @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1); - @thread3=&ROUND(0,5,10,15); - - foreach (@thread0) { - eval; eval(shift(@thread3)); - eval(shift(@thread1)); eval(shift(@thread3)); - eval(shift(@thread2)); eval(shift(@thread3)); - } -$code.=<<___; - bne .Loop_neon - - add @t[3],sp,#32 - vld1.32 {$t0-$t1},[sp] @ load key material - vld1.32 {$t2-$t3},[@t[3]] - - ldr @t[3],[sp,#4*(32+2)] @ load len - - str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store - str @t[1], [sp,#4*(16+9)] - str @x[12],[sp,#4*(16+12)] - str @t[2], [sp,#4*(16+13)] - str @x[14],[sp,#4*(16+14)] - - @ at this point we have first half of 512-bit result in - @ @x[0-7] and second half at sp+4*(16+8) - - ldr r12,[sp,#4*(32+1)] @ load inp - ldr r14,[sp,#4*(32+0)] @ load out - - vadd.i32 $a0,$a0,$t0 @ accumulate key material - vadd.i32 $a1,$a1,$t0 - vadd.i32 $a2,$a2,$t0 - vldr $t0#lo,[sp,#4*(16+0)] @ one - - vadd.i32 $b0,$b0,$t1 - vadd.i32 $b1,$b1,$t1 - vadd.i32 $b2,$b2,$t1 - vldr $t1#lo,[sp,#4*(16+2)] @ two - - vadd.i32 $c0,$c0,$t2 - vadd.i32 $c1,$c1,$t2 - vadd.i32 $c2,$c2,$t2 - vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1 - vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2 - - vadd.i32 $d0,$d0,$t3 - vadd.i32 $d1,$d1,$t3 - vadd.i32 $d2,$d2,$t3 - - cmp @t[3],#64*4 - blo .Ltail_neon - - vld1.8 {$t0-$t1},[r12]! @ load input - mov @t[3],sp - vld1.8 {$t2-$t3},[r12]! - veor $a0,$a0,$t0 @ xor with input - veor $b0,$b0,$t1 - vld1.8 {$t0-$t1},[r12]! - veor $c0,$c0,$t2 - veor $d0,$d0,$t3 - vld1.8 {$t2-$t3},[r12]! - - veor $a1,$a1,$t0 - vst1.8 {$a0-$b0},[r14]! @ store output - veor $b1,$b1,$t1 - vld1.8 {$t0-$t1},[r12]! - veor $c1,$c1,$t2 - vst1.8 {$c0-$d0},[r14]! 
- veor $d1,$d1,$t3 - vld1.8 {$t2-$t3},[r12]! - - veor $a2,$a2,$t0 - vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration - veor $t0#hi,$t0#hi,$t0#hi - vldr $t0#lo,[sp,#4*(16+4)] @ four - veor $b2,$b2,$t1 - vld1.32 {$c0-$d0},[@t[3]] - veor $c2,$c2,$t2 - vst1.8 {$a1-$b1},[r14]! - veor $d2,$d2,$t3 - vst1.8 {$c1-$d1},[r14]! - - vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value - vldr $t0#lo,[sp,#4*(16+0)] @ one - - ldmia sp,{@t[0]-@t[3]} @ load key material - add @x[0],@x[0],@t[0] @ accumulate key material - ldr @t[0],[r12],#16 @ load input - vst1.8 {$a2-$b2},[r14]! - add @x[1],@x[1],@t[1] - ldr @t[1],[r12,#-12] - vst1.8 {$c2-$d2},[r14]! - add @x[2],@x[2],@t[2] - ldr @t[2],[r12,#-8] - add @x[3],@x[3],@t[3] - ldr @t[3],[r12,#-4] -# ifdef __ARMEB__ - rev @x[0],@x[0] - rev @x[1],@x[1] - rev @x[2],@x[2] - rev @x[3],@x[3] -# endif - eor @x[0],@x[0],@t[0] @ xor with input - add @t[0],sp,#4*(4) - eor @x[1],@x[1],@t[1] - str @x[0],[r14],#16 @ store output - eor @x[2],@x[2],@t[2] - str @x[1],[r14,#-12] - eor @x[3],@x[3],@t[3] - ldmia @t[0],{@t[0]-@t[3]} @ load key material - str @x[2],[r14,#-8] - str @x[3],[r14,#-4] - - add @x[4],@t[0],@x[4],ror#13 @ accumulate key material - ldr @t[0],[r12],#16 @ load input - add @x[5],@t[1],@x[5],ror#13 - ldr @t[1],[r12,#-12] - add @x[6],@t[2],@x[6],ror#13 - ldr @t[2],[r12,#-8] - add @x[7],@t[3],@x[7],ror#13 - ldr @t[3],[r12,#-4] -# ifdef __ARMEB__ - rev @x[4],@x[4] - rev @x[5],@x[5] - rev @x[6],@x[6] - rev @x[7],@x[7] -# endif - eor @x[4],@x[4],@t[0] - add @t[0],sp,#4*(8) - eor @x[5],@x[5],@t[1] - str @x[4],[r14],#16 @ store output - eor @x[6],@x[6],@t[2] - str @x[5],[r14,#-12] - eor @x[7],@x[7],@t[3] - ldmia @t[0],{@t[0]-@t[3]} @ load key material - str @x[6],[r14,#-8] - add @x[0],sp,#4*(16+8) - str @x[7],[r14,#-4] - - ldmia @x[0],{@x[0]-@x[7]} @ load second half - - add @x[0],@x[0],@t[0] @ accumulate key material - ldr @t[0],[r12],#16 @ load input - add @x[1],@x[1],@t[1] - ldr @t[1],[r12,#-12] -# ifdef __thumb2__ - it hi -# endif - strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it - add @x[2],@x[2],@t[2] - ldr @t[2],[r12,#-8] -# ifdef __thumb2__ - it hi -# endif - strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it - add @x[3],@x[3],@t[3] - ldr @t[3],[r12,#-4] -# ifdef __ARMEB__ - rev @x[0],@x[0] - rev @x[1],@x[1] - rev @x[2],@x[2] - rev @x[3],@x[3] -# endif - eor @x[0],@x[0],@t[0] - add @t[0],sp,#4*(12) - eor @x[1],@x[1],@t[1] - str @x[0],[r14],#16 @ store output - eor @x[2],@x[2],@t[2] - str @x[1],[r14,#-12] - eor @x[3],@x[3],@t[3] - ldmia @t[0],{@t[0]-@t[3]} @ load key material - str @x[2],[r14,#-8] - str @x[3],[r14,#-4] - - add @x[4],@t[0],@x[4],ror#24 @ accumulate key material - add @t[0],@t[0],#4 @ next counter value - add @x[5],@t[1],@x[5],ror#24 - str @t[0],[sp,#4*(12)] @ save next counter value - ldr @t[0],[r12],#16 @ load input - add @x[6],@t[2],@x[6],ror#24 - add @x[4],@x[4],#3 @ counter+3 - ldr @t[1],[r12,#-12] - add @x[7],@t[3],@x[7],ror#24 - ldr @t[2],[r12,#-8] - ldr @t[3],[r12,#-4] -# ifdef __ARMEB__ - rev @x[4],@x[4] - rev @x[5],@x[5] - rev @x[6],@x[6] - rev @x[7],@x[7] -# endif - eor @x[4],@x[4],@t[0] -# ifdef __thumb2__ - it hi -# endif - ldrhi @t[0],[sp,#4*(32+2)] @ re-load len - eor @x[5],@x[5],@t[1] - eor @x[6],@x[6],@t[2] - str @x[4],[r14],#16 @ store output - eor @x[7],@x[7],@t[3] - str @x[5],[r14,#-12] - sub @t[3],@t[0],#64*4 @ len-=64*4 - str @x[6],[r14,#-8] - str @x[7],[r14,#-4] - bhi .Loop_neon_outer - - b .Ldone_neon - -.align 4 -.Lbreak_neon: - @ harmonize NEON and integer-only stack frames: load data - @ from NEON frame, 
but save to integer-only one; distance - @ between the two is 4*(32+4+16-32)=4*(20). - - str @t[3], [sp,#4*(20+32+2)] @ save len - add @t[3],sp,#4*(32+4) - str r12, [sp,#4*(20+32+1)] @ save inp - str r14, [sp,#4*(20+32+0)] @ save out - - ldr @x[12],[sp,#4*(16+10)] - ldr @x[14],[sp,#4*(16+11)] - vldmia @t[3],{d8-d15} @ fulfill ABI requirement - str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]" - str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]" - - ldr @t[3], [sp,#4*(15)] - mov @x[4],@x[4],ror#19 @ twist b[0..3] - ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load - mov @x[5],@x[5],ror#19 - ldr @t[2], [sp,#4*(13)] - mov @x[6],@x[6],ror#19 - ldr @x[14],[sp,#4*(14)] - mov @x[7],@x[7],ror#19 - mov @t[3],@t[3],ror#8 @ twist d[0..3] - mov @x[12],@x[12],ror#8 - mov @t[2],@t[2],ror#8 - mov @x[14],@x[14],ror#8 - str @t[3], [sp,#4*(20+16+15)] - add @t[3],sp,#4*(20) - vst1.32 {$a0-$b0},[@t[3]]! @ copy key - add sp,sp,#4*(20) @ switch frame - vst1.32 {$c0-$d0},[@t[3]] - mov @t[3],#10 - b .Loop @ go integer-only - -.align 4 -.Ltail_neon: - cmp @t[3],#64*3 - bhs .L192_or_more_neon - cmp @t[3],#64*2 - bhs .L128_or_more_neon - cmp @t[3],#64*1 - bhs .L64_or_more_neon - - add @t[0],sp,#4*(8) - vst1.8 {$a0-$b0},[sp] - add @t[2],sp,#4*(0) - vst1.8 {$c0-$d0},[@t[0]] - b .Loop_tail_neon - -.align 4 -.L64_or_more_neon: - vld1.8 {$t0-$t1},[r12]! - vld1.8 {$t2-$t3},[r12]! - veor $a0,$a0,$t0 - veor $b0,$b0,$t1 - veor $c0,$c0,$t2 - veor $d0,$d0,$t3 - vst1.8 {$a0-$b0},[r14]! - vst1.8 {$c0-$d0},[r14]! - - beq .Ldone_neon - - add @t[0],sp,#4*(8) - vst1.8 {$a1-$b1},[sp] - add @t[2],sp,#4*(0) - vst1.8 {$c1-$d1},[@t[0]] - sub @t[3],@t[3],#64*1 @ len-=64*1 - b .Loop_tail_neon - -.align 4 -.L128_or_more_neon: - vld1.8 {$t0-$t1},[r12]! - vld1.8 {$t2-$t3},[r12]! - veor $a0,$a0,$t0 - veor $b0,$b0,$t1 - vld1.8 {$t0-$t1},[r12]! - veor $c0,$c0,$t2 - veor $d0,$d0,$t3 - vld1.8 {$t2-$t3},[r12]! - - veor $a1,$a1,$t0 - veor $b1,$b1,$t1 - vst1.8 {$a0-$b0},[r14]! - veor $c1,$c1,$t2 - vst1.8 {$c0-$d0},[r14]! - veor $d1,$d1,$t3 - vst1.8 {$a1-$b1},[r14]! - vst1.8 {$c1-$d1},[r14]! - - beq .Ldone_neon - - add @t[0],sp,#4*(8) - vst1.8 {$a2-$b2},[sp] - add @t[2],sp,#4*(0) - vst1.8 {$c2-$d2},[@t[0]] - sub @t[3],@t[3],#64*2 @ len-=64*2 - b .Loop_tail_neon - -.align 4 -.L192_or_more_neon: - vld1.8 {$t0-$t1},[r12]! - vld1.8 {$t2-$t3},[r12]! - veor $a0,$a0,$t0 - veor $b0,$b0,$t1 - vld1.8 {$t0-$t1},[r12]! - veor $c0,$c0,$t2 - veor $d0,$d0,$t3 - vld1.8 {$t2-$t3},[r12]! - - veor $a1,$a1,$t0 - veor $b1,$b1,$t1 - vld1.8 {$t0-$t1},[r12]! - veor $c1,$c1,$t2 - vst1.8 {$a0-$b0},[r14]! - veor $d1,$d1,$t3 - vld1.8 {$t2-$t3},[r12]! - - veor $a2,$a2,$t0 - vst1.8 {$c0-$d0},[r14]! - veor $b2,$b2,$t1 - vst1.8 {$a1-$b1},[r14]! - veor $c2,$c2,$t2 - vst1.8 {$c1-$d1},[r14]! - veor $d2,$d2,$t3 - vst1.8 {$a2-$b2},[r14]! - vst1.8 {$c2-$d2},[r14]! 
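Reviewer note: the .Ltail_neon handling in this listing, which xors whole 64-byte blocks in place, spills the next unused keystream block to the stack for the final partial block, and then wipes that stack copy, reduces to the following C sketch. chacha_block() is a hypothetical one-block keystream helper (assumed to advance the block counter), not a symbol in this driver.

#include <stddef.h>
#include <stdint.h>
#include <strings.h>	/* explicit_bzero(3) */

void chacha_block(uint32_t state[16], uint8_t out[64]);	/* hypothetical */

/* Minimal sketch of the tail handling above: xor whole 64-byte blocks
 * directly, bounce the final partial block through a stack buffer, and
 * scrub the buffer afterwards, mirroring the .Loop_tail_neon byte loop
 * and the stp xzr,xzr stack wipe. */
static void
chacha_xor(uint32_t state[16], uint8_t *dst, const uint8_t *src, size_t len)
{
	uint8_t block[64];
	size_t i;

	while (len >= sizeof(block)) {
		chacha_block(state, block);
		for (i = 0; i < sizeof(block); i++)
			dst[i] = src[i] ^ block[i];
		dst += sizeof(block);
		src += sizeof(block);
		len -= sizeof(block);
	}
	if (len > 0) {
		chacha_block(state, block);
		for (i = 0; i < len; i++)
			dst[i] = src[i] ^ block[i];
	}
	explicit_bzero(block, sizeof(block));	/* never leak keystream */
}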
- - beq .Ldone_neon - - ldmia sp,{@t[0]-@t[3]} @ load key material - add @x[0],@x[0],@t[0] @ accumulate key material - add @t[0],sp,#4*(4) - add @x[1],@x[1],@t[1] - add @x[2],@x[2],@t[2] - add @x[3],@x[3],@t[3] - ldmia @t[0],{@t[0]-@t[3]} @ load key material - - add @x[4],@t[0],@x[4],ror#13 @ accumulate key material - add @t[0],sp,#4*(8) - add @x[5],@t[1],@x[5],ror#13 - add @x[6],@t[2],@x[6],ror#13 - add @x[7],@t[3],@x[7],ror#13 - ldmia @t[0],{@t[0]-@t[3]} @ load key material -# ifdef __ARMEB__ - rev @x[0],@x[0] - rev @x[1],@x[1] - rev @x[2],@x[2] - rev @x[3],@x[3] - rev @x[4],@x[4] - rev @x[5],@x[5] - rev @x[6],@x[6] - rev @x[7],@x[7] -# endif - stmia sp,{@x[0]-@x[7]} - add @x[0],sp,#4*(16+8) - - ldmia @x[0],{@x[0]-@x[7]} @ load second half - - add @x[0],@x[0],@t[0] @ accumulate key material - add @t[0],sp,#4*(12) - add @x[1],@x[1],@t[1] - add @x[2],@x[2],@t[2] - add @x[3],@x[3],@t[3] - ldmia @t[0],{@t[0]-@t[3]} @ load key material - - add @x[4],@t[0],@x[4],ror#24 @ accumulate key material - add @t[0],sp,#4*(8) - add @x[5],@t[1],@x[5],ror#24 - add @x[4],@x[4],#3 @ counter+3 - add @x[6],@t[2],@x[6],ror#24 - add @x[7],@t[3],@x[7],ror#24 - ldr @t[3],[sp,#4*(32+2)] @ re-load len -# ifdef __ARMEB__ - rev @x[0],@x[0] - rev @x[1],@x[1] - rev @x[2],@x[2] - rev @x[3],@x[3] - rev @x[4],@x[4] - rev @x[5],@x[5] - rev @x[6],@x[6] - rev @x[7],@x[7] -# endif - stmia @t[0],{@x[0]-@x[7]} - add @t[2],sp,#4*(0) - sub @t[3],@t[3],#64*3 @ len-=64*3 - -.Loop_tail_neon: - ldrb @t[0],[@t[2]],#1 @ read buffer on stack - ldrb @t[1],[r12],#1 @ read input - subs @t[3],@t[3],#1 - eor @t[0],@t[0],@t[1] - strb @t[0],[r14],#1 @ store output - bne .Loop_tail_neon - -.Ldone_neon: - add sp,sp,#4*(32+4) - vldmia sp,{d8-d15} - add sp,sp,#4*(16+3) - ldmia sp!,{r4-r11,pc} -.size ChaCha20_neon,.-ChaCha20_neon -# ifndef __KERNEL__ -.comm OPENSSL_armcap_P,4,4 -# endif -#endif -___ -}}} - -open SELF,$0; -while(<SELF>) { - next if (/^#!/); - last if (!s/^#/@/ and !/^$/); - print; -} -close SELF; - -foreach (split("\n",$code)) { - s/\`([^\`]*)\`/eval $1/geo; - - s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; - - print $_,"\n"; -} -close STDOUT; diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm64.pl b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm64.pl deleted file mode 100755 index ac14a9924165..000000000000 --- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm64.pl +++ /dev/null @@ -1,1163 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause -# -# This code is taken from the OpenSSL project but the author, Andy Polyakov, -# has relicensed it under the licenses specified in the SPDX header above. -# The original headers, including the original license headers, are -# included below for completeness. -# -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# June 2015 -# -# ChaCha20 for ARMv8. -# -# Performance in cycles per byte out of large buffer. 
-# -# IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU(*) -# -# Apple A7 5.50/+49% 3.33 1.70 -# Cortex-A53 8.40/+80% 4.72 4.72(**) -# Cortex-A57 8.06/+43% 4.90 4.43(***) -# Denver 4.50/+82% 2.63 2.67(**) -# X-Gene 9.50/+46% 8.82 8.89(**) -# Mongoose 8.00/+44% 3.64 3.25(***) -# Kryo 8.17/+50% 4.83 4.65(***) -# -# (*) since no non-Apple processor exhibits significantly better -# performance, the code path is #ifdef __APPLE__-ed; -# (**) it's expected that doubling interleave factor doesn't help -# all processors, only those with higher NEON latency and -# higher instruction issue rate; -# (***) expected improvement was actually higher; - -$flavour=shift; -if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } -else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -sub AUTOLOAD() # thunk [simplified] x86-style perlasm -{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; - my $arg = pop; - $arg = "#$arg" if ($arg*1 eq $arg); - $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; -} - -my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4)); - -my @x=map("x$_",(5..17,19..21)); -my @d=map("x$_",(22..28,30)); - -sub ROUND { -my ($a0,$b0,$c0,$d0)=@_; -my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); -my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); -my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); - - ( - "&add_32 (@x[$a0],@x[$a0],@x[$b0])", - "&add_32 (@x[$a1],@x[$a1],@x[$b1])", - "&add_32 (@x[$a2],@x[$a2],@x[$b2])", - "&add_32 (@x[$a3],@x[$a3],@x[$b3])", - "&eor_32 (@x[$d0],@x[$d0],@x[$a0])", - "&eor_32 (@x[$d1],@x[$d1],@x[$a1])", - "&eor_32 (@x[$d2],@x[$d2],@x[$a2])", - "&eor_32 (@x[$d3],@x[$d3],@x[$a3])", - "&ror_32 (@x[$d0],@x[$d0],16)", - "&ror_32 (@x[$d1],@x[$d1],16)", - "&ror_32 (@x[$d2],@x[$d2],16)", - "&ror_32 (@x[$d3],@x[$d3],16)", - - "&add_32 (@x[$c0],@x[$c0],@x[$d0])", - "&add_32 (@x[$c1],@x[$c1],@x[$d1])", - "&add_32 (@x[$c2],@x[$c2],@x[$d2])", - "&add_32 (@x[$c3],@x[$c3],@x[$d3])", - "&eor_32 (@x[$b0],@x[$b0],@x[$c0])", - "&eor_32 (@x[$b1],@x[$b1],@x[$c1])", - "&eor_32 (@x[$b2],@x[$b2],@x[$c2])", - "&eor_32 (@x[$b3],@x[$b3],@x[$c3])", - "&ror_32 (@x[$b0],@x[$b0],20)", - "&ror_32 (@x[$b1],@x[$b1],20)", - "&ror_32 (@x[$b2],@x[$b2],20)", - "&ror_32 (@x[$b3],@x[$b3],20)", - - "&add_32 (@x[$a0],@x[$a0],@x[$b0])", - "&add_32 (@x[$a1],@x[$a1],@x[$b1])", - "&add_32 (@x[$a2],@x[$a2],@x[$b2])", - "&add_32 (@x[$a3],@x[$a3],@x[$b3])", - "&eor_32 (@x[$d0],@x[$d0],@x[$a0])", - "&eor_32 (@x[$d1],@x[$d1],@x[$a1])", - "&eor_32 (@x[$d2],@x[$d2],@x[$a2])", - "&eor_32 (@x[$d3],@x[$d3],@x[$a3])", - "&ror_32 (@x[$d0],@x[$d0],24)", - "&ror_32 (@x[$d1],@x[$d1],24)", - "&ror_32 (@x[$d2],@x[$d2],24)", - "&ror_32 (@x[$d3],@x[$d3],24)", - - "&add_32 (@x[$c0],@x[$c0],@x[$d0])", - "&add_32 (@x[$c1],@x[$c1],@x[$d1])", - "&add_32 (@x[$c2],@x[$c2],@x[$d2])", - "&add_32 (@x[$c3],@x[$c3],@x[$d3])", - "&eor_32 (@x[$b0],@x[$b0],@x[$c0])", - "&eor_32 (@x[$b1],@x[$b1],@x[$c1])", - "&eor_32 (@x[$b2],@x[$b2],@x[$c2])", - "&eor_32 (@x[$b3],@x[$b3],@x[$c3])", - "&ror_32 (@x[$b0],@x[$b0],25)", - "&ror_32 (@x[$b1],@x[$b1],25)", - "&ror_32 (@x[$b2],@x[$b2],25)", - "&ror_32 (@x[$b3],@x[$b3],25)" - ); -} - -$code.=<<___; -#ifndef __KERNEL__ 
-# include "arm_arch.h" -.extern OPENSSL_armcap_P -#else -# define ChaCha20_ctr32 chacha20_arm -# define ChaCha20_neon chacha20_neon -#endif - -.text - -.align 5 -.Lsigma: -.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral -.Lone: -.long 1,0,0,0 -#ifndef __KERNEL__ -.LOPENSSL_armcap_P: -# ifdef __ILP32__ -.long OPENSSL_armcap_P-. -# else -.quad OPENSSL_armcap_P-. -# endif -#endif - -.globl ChaCha20_ctr32 -.type ChaCha20_ctr32,%function -.align 5 -ChaCha20_ctr32: - cbz $len,.Labort -#ifndef __KERNEL__ - adr @x[0],.LOPENSSL_armcap_P - cmp $len,#192 - b.lo .Lshort -# ifdef __ILP32__ - ldrsw @x[1],[@x[0]] -# else - ldr @x[1],[@x[0]] -# endif - ldr w17,[@x[1],@x[0]] - tst w17,#ARMV7_NEON - b.ne ChaCha20_neon - -.Lshort: -#endif - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - - adr @x[0],.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#64 - - ldp @d[0],@d[1],[@x[0]] // load sigma - ldp @d[2],@d[3],[$key] // load key - ldp @d[4],@d[5],[$key,#16] - ldp @d[6],@d[7],[$ctr] // load counter -#ifdef __AARCH64EB__ - ror @d[2],@d[2],#32 - ror @d[3],@d[3],#32 - ror @d[4],@d[4],#32 - ror @d[5],@d[5],#32 - ror @d[6],@d[6],#32 - ror @d[7],@d[7],#32 -#endif - -.Loop_outer: - mov.32 @x[0],@d[0] // unpack key block - lsr @x[1],@d[0],#32 - mov.32 @x[2],@d[1] - lsr @x[3],@d[1],#32 - mov.32 @x[4],@d[2] - lsr @x[5],@d[2],#32 - mov.32 @x[6],@d[3] - lsr @x[7],@d[3],#32 - mov.32 @x[8],@d[4] - lsr @x[9],@d[4],#32 - mov.32 @x[10],@d[5] - lsr @x[11],@d[5],#32 - mov.32 @x[12],@d[6] - lsr @x[13],@d[6],#32 - mov.32 @x[14],@d[7] - lsr @x[15],@d[7],#32 - - mov $ctr,#10 - subs $len,$len,#64 -.Loop: - sub $ctr,$ctr,#1 -___ - foreach (&ROUND(0, 4, 8,12)) { eval; } - foreach (&ROUND(0, 5,10,15)) { eval; } -$code.=<<___; - cbnz $ctr,.Loop - - add.32 @x[0],@x[0],@d[0] // accumulate key block - add @x[1],@x[1],@d[0],lsr#32 - add.32 @x[2],@x[2],@d[1] - add @x[3],@x[3],@d[1],lsr#32 - add.32 @x[4],@x[4],@d[2] - add @x[5],@x[5],@d[2],lsr#32 - add.32 @x[6],@x[6],@d[3] - add @x[7],@x[7],@d[3],lsr#32 - add.32 @x[8],@x[8],@d[4] - add @x[9],@x[9],@d[4],lsr#32 - add.32 @x[10],@x[10],@d[5] - add @x[11],@x[11],@d[5],lsr#32 - add.32 @x[12],@x[12],@d[6] - add @x[13],@x[13],@d[6],lsr#32 - add.32 @x[14],@x[14],@d[7] - add @x[15],@x[15],@d[7],lsr#32 - - b.lo .Ltail - - add @x[0],@x[0],@x[1],lsl#32 // pack - add @x[2],@x[2],@x[3],lsl#32 - ldp @x[1],@x[3],[$inp,#0] // load input - add @x[4],@x[4],@x[5],lsl#32 - add @x[6],@x[6],@x[7],lsl#32 - ldp @x[5],@x[7],[$inp,#16] - add @x[8],@x[8],@x[9],lsl#32 - add @x[10],@x[10],@x[11],lsl#32 - ldp @x[9],@x[11],[$inp,#32] - add @x[12],@x[12],@x[13],lsl#32 - add @x[14],@x[14],@x[15],lsl#32 - ldp @x[13],@x[15],[$inp,#48] - add $inp,$inp,#64 -#ifdef __AARCH64EB__ - rev @x[0],@x[0] - rev @x[2],@x[2] - rev @x[4],@x[4] - rev @x[6],@x[6] - rev @x[8],@x[8] - rev @x[10],@x[10] - rev @x[12],@x[12] - rev @x[14],@x[14] -#endif - eor @x[0],@x[0],@x[1] - eor @x[2],@x[2],@x[3] - eor @x[4],@x[4],@x[5] - eor @x[6],@x[6],@x[7] - eor @x[8],@x[8],@x[9] - eor @x[10],@x[10],@x[11] - eor @x[12],@x[12],@x[13] - eor @x[14],@x[14],@x[15] - - stp @x[0],@x[2],[$out,#0] // store output - add @d[6],@d[6],#1 // increment counter - stp @x[4],@x[6],[$out,#16] - stp @x[8],@x[10],[$out,#32] - stp @x[12],@x[14],[$out,#48] - add $out,$out,#64 - - b.hi .Loop_outer - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 -.Labort: - ret - -.align 4 
-.Ltail: - add $len,$len,#64 -.Less_than_64: - sub $out,$out,#1 - add $inp,$inp,$len - add $out,$out,$len - add $ctr,sp,$len - neg $len,$len - - add @x[0],@x[0],@x[1],lsl#32 // pack - add @x[2],@x[2],@x[3],lsl#32 - add @x[4],@x[4],@x[5],lsl#32 - add @x[6],@x[6],@x[7],lsl#32 - add @x[8],@x[8],@x[9],lsl#32 - add @x[10],@x[10],@x[11],lsl#32 - add @x[12],@x[12],@x[13],lsl#32 - add @x[14],@x[14],@x[15],lsl#32 -#ifdef __AARCH64EB__ - rev @x[0],@x[0] - rev @x[2],@x[2] - rev @x[4],@x[4] - rev @x[6],@x[6] - rev @x[8],@x[8] - rev @x[10],@x[10] - rev @x[12],@x[12] - rev @x[14],@x[14] -#endif - stp @x[0],@x[2],[sp,#0] - stp @x[4],@x[6],[sp,#16] - stp @x[8],@x[10],[sp,#32] - stp @x[12],@x[14],[sp,#48] - -.Loop_tail: - ldrb w10,[$inp,$len] - ldrb w11,[$ctr,$len] - add $len,$len,#1 - eor w10,w10,w11 - strb w10,[$out,$len] - cbnz $len,.Loop_tail - - stp xzr,xzr,[sp,#0] - stp xzr,xzr,[sp,#16] - stp xzr,xzr,[sp,#32] - stp xzr,xzr,[sp,#48] - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - ret -.size ChaCha20_ctr32,.-ChaCha20_ctr32 -___ - -{{{ -my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) = - map("v$_.4s",(0..7,16..23)); -my (@K)=map("v$_.4s",(24..30)); -my $ONE="v31.4s"; - -sub NEONROUND { -my $odd = pop; -my ($a,$b,$c,$d,$t)=@_; - - ( - "&add ('$a','$a','$b')", - "&eor ('$d','$d','$a')", - "&rev32_16 ('$d','$d')", # vrot ($d,16) - - "&add ('$c','$c','$d')", - "&eor ('$t','$b','$c')", - "&ushr ('$b','$t',20)", - "&sli ('$b','$t',12)", - - "&add ('$a','$a','$b')", - "&eor ('$t','$d','$a')", - "&ushr ('$d','$t',24)", - "&sli ('$d','$t',8)", - - "&add ('$c','$c','$d')", - "&eor ('$t','$b','$c')", - "&ushr ('$b','$t',25)", - "&sli ('$b','$t',7)", - - "&ext ('$a','$a','$a',$odd?4:12)", - "&ext ('$d','$d','$d',8)", - "&ext ('$c','$c','$c',$odd?12:4)" - ); -} - -$code.=<<___; -#if !defined(__KERNEL__) || defined(CONFIG_KERNEL_MODE_NEON) -#ifdef __KERNEL__ -.globl ChaCha20_neon -.type ChaCha20_neon,%function -#endif -.type ChaCha20_neon,%function -.align 5 -ChaCha20_neon: - stp x29,x30,[sp,#-96]! 
- add x29,sp,#0 - - adr @x[0],.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] -#ifdef __APPLE__ - cmp $len,#512 - b.hs .L512_or_more_neon -#endif - - sub sp,sp,#64 - - ldp @d[0],@d[1],[@x[0]] // load sigma - ld1 {@K[0]},[@x[0]],#16 - ldp @d[2],@d[3],[$key] // load key - ldp @d[4],@d[5],[$key,#16] - ld1 {@K[1],@K[2]},[$key] - ldp @d[6],@d[7],[$ctr] // load counter - ld1 {@K[3]},[$ctr] - ld1 {$ONE},[@x[0]] -#ifdef __AARCH64EB__ - rev64 @K[0],@K[0] - ror @d[2],@d[2],#32 - ror @d[3],@d[3],#32 - ror @d[4],@d[4],#32 - ror @d[5],@d[5],#32 - ror @d[6],@d[6],#32 - ror @d[7],@d[7],#32 -#endif - add @K[3],@K[3],$ONE // += 1 - add @K[4],@K[3],$ONE - add @K[5],@K[4],$ONE - shl $ONE,$ONE,#2 // 1 -> 4 - -.Loop_outer_neon: - mov.32 @x[0],@d[0] // unpack key block - lsr @x[1],@d[0],#32 - mov $A0,@K[0] - mov.32 @x[2],@d[1] - lsr @x[3],@d[1],#32 - mov $A1,@K[0] - mov.32 @x[4],@d[2] - lsr @x[5],@d[2],#32 - mov $A2,@K[0] - mov.32 @x[6],@d[3] - mov $B0,@K[1] - lsr @x[7],@d[3],#32 - mov $B1,@K[1] - mov.32 @x[8],@d[4] - mov $B2,@K[1] - lsr @x[9],@d[4],#32 - mov $D0,@K[3] - mov.32 @x[10],@d[5] - mov $D1,@K[4] - lsr @x[11],@d[5],#32 - mov $D2,@K[5] - mov.32 @x[12],@d[6] - mov $C0,@K[2] - lsr @x[13],@d[6],#32 - mov $C1,@K[2] - mov.32 @x[14],@d[7] - mov $C2,@K[2] - lsr @x[15],@d[7],#32 - - mov $ctr,#10 - subs $len,$len,#256 -.Loop_neon: - sub $ctr,$ctr,#1 -___ - my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); - my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); - my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); - my @thread3=&ROUND(0,4,8,12); - - foreach (@thread0) { - eval; eval(shift(@thread3)); - eval(shift(@thread1)); eval(shift(@thread3)); - eval(shift(@thread2)); eval(shift(@thread3)); - } - - @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); - @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); - @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); - @thread3=&ROUND(0,5,10,15); - - foreach (@thread0) { - eval; eval(shift(@thread3)); - eval(shift(@thread1)); eval(shift(@thread3)); - eval(shift(@thread2)); eval(shift(@thread3)); - } -$code.=<<___; - cbnz $ctr,.Loop_neon - - add.32 @x[0],@x[0],@d[0] // accumulate key block - add $A0,$A0,@K[0] - add @x[1],@x[1],@d[0],lsr#32 - add $A1,$A1,@K[0] - add.32 @x[2],@x[2],@d[1] - add $A2,$A2,@K[0] - add @x[3],@x[3],@d[1],lsr#32 - add $C0,$C0,@K[2] - add.32 @x[4],@x[4],@d[2] - add $C1,$C1,@K[2] - add @x[5],@x[5],@d[2],lsr#32 - add $C2,$C2,@K[2] - add.32 @x[6],@x[6],@d[3] - add $D0,$D0,@K[3] - add @x[7],@x[7],@d[3],lsr#32 - add.32 @x[8],@x[8],@d[4] - add $D1,$D1,@K[4] - add @x[9],@x[9],@d[4],lsr#32 - add.32 @x[10],@x[10],@d[5] - add $D2,$D2,@K[5] - add @x[11],@x[11],@d[5],lsr#32 - add.32 @x[12],@x[12],@d[6] - add $B0,$B0,@K[1] - add @x[13],@x[13],@d[6],lsr#32 - add.32 @x[14],@x[14],@d[7] - add $B1,$B1,@K[1] - add @x[15],@x[15],@d[7],lsr#32 - add $B2,$B2,@K[1] - - b.lo .Ltail_neon - - add @x[0],@x[0],@x[1],lsl#32 // pack - add @x[2],@x[2],@x[3],lsl#32 - ldp @x[1],@x[3],[$inp,#0] // load input - add @x[4],@x[4],@x[5],lsl#32 - add @x[6],@x[6],@x[7],lsl#32 - ldp @x[5],@x[7],[$inp,#16] - add @x[8],@x[8],@x[9],lsl#32 - add @x[10],@x[10],@x[11],lsl#32 - ldp @x[9],@x[11],[$inp,#32] - add @x[12],@x[12],@x[13],lsl#32 - add @x[14],@x[14],@x[15],lsl#32 - ldp @x[13],@x[15],[$inp,#48] - add $inp,$inp,#64 -#ifdef __AARCH64EB__ - rev @x[0],@x[0] - rev @x[2],@x[2] - rev @x[4],@x[4] - rev @x[6],@x[6] - rev @x[8],@x[8] - rev @x[10],@x[10] - rev @x[12],@x[12] - rev @x[14],@x[14] -#endif - ld1.8 {$T0-$T3},[$inp],#64 - eor @x[0],@x[0],@x[1] - eor 
@x[2],@x[2],@x[3] - eor @x[4],@x[4],@x[5] - eor @x[6],@x[6],@x[7] - eor @x[8],@x[8],@x[9] - eor $A0,$A0,$T0 - eor @x[10],@x[10],@x[11] - eor $B0,$B0,$T1 - eor @x[12],@x[12],@x[13] - eor $C0,$C0,$T2 - eor @x[14],@x[14],@x[15] - eor $D0,$D0,$T3 - ld1.8 {$T0-$T3},[$inp],#64 - - stp @x[0],@x[2],[$out,#0] // store output - add @d[6],@d[6],#4 // increment counter - stp @x[4],@x[6],[$out,#16] - add @K[3],@K[3],$ONE // += 4 - stp @x[8],@x[10],[$out,#32] - add @K[4],@K[4],$ONE - stp @x[12],@x[14],[$out,#48] - add @K[5],@K[5],$ONE - add $out,$out,#64 - - st1.8 {$A0-$D0},[$out],#64 - ld1.8 {$A0-$D0},[$inp],#64 - - eor $A1,$A1,$T0 - eor $B1,$B1,$T1 - eor $C1,$C1,$T2 - eor $D1,$D1,$T3 - st1.8 {$A1-$D1},[$out],#64 - - eor $A2,$A2,$A0 - eor $B2,$B2,$B0 - eor $C2,$C2,$C0 - eor $D2,$D2,$D0 - st1.8 {$A2-$D2},[$out],#64 - - b.hi .Loop_outer_neon - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - ret - -.Ltail_neon: - add $len,$len,#256 - cmp $len,#64 - b.lo .Less_than_64 - - add @x[0],@x[0],@x[1],lsl#32 // pack - add @x[2],@x[2],@x[3],lsl#32 - ldp @x[1],@x[3],[$inp,#0] // load input - add @x[4],@x[4],@x[5],lsl#32 - add @x[6],@x[6],@x[7],lsl#32 - ldp @x[5],@x[7],[$inp,#16] - add @x[8],@x[8],@x[9],lsl#32 - add @x[10],@x[10],@x[11],lsl#32 - ldp @x[9],@x[11],[$inp,#32] - add @x[12],@x[12],@x[13],lsl#32 - add @x[14],@x[14],@x[15],lsl#32 - ldp @x[13],@x[15],[$inp,#48] - add $inp,$inp,#64 -#ifdef __AARCH64EB__ - rev @x[0],@x[0] - rev @x[2],@x[2] - rev @x[4],@x[4] - rev @x[6],@x[6] - rev @x[8],@x[8] - rev @x[10],@x[10] - rev @x[12],@x[12] - rev @x[14],@x[14] -#endif - eor @x[0],@x[0],@x[1] - eor @x[2],@x[2],@x[3] - eor @x[4],@x[4],@x[5] - eor @x[6],@x[6],@x[7] - eor @x[8],@x[8],@x[9] - eor @x[10],@x[10],@x[11] - eor @x[12],@x[12],@x[13] - eor @x[14],@x[14],@x[15] - - stp @x[0],@x[2],[$out,#0] // store output - add @d[6],@d[6],#4 // increment counter - stp @x[4],@x[6],[$out,#16] - stp @x[8],@x[10],[$out,#32] - stp @x[12],@x[14],[$out,#48] - add $out,$out,#64 - b.eq .Ldone_neon - sub $len,$len,#64 - cmp $len,#64 - b.lo .Less_than_128 - - ld1.8 {$T0-$T3},[$inp],#64 - eor $A0,$A0,$T0 - eor $B0,$B0,$T1 - eor $C0,$C0,$T2 - eor $D0,$D0,$T3 - st1.8 {$A0-$D0},[$out],#64 - b.eq .Ldone_neon - sub $len,$len,#64 - cmp $len,#64 - b.lo .Less_than_192 - - ld1.8 {$T0-$T3},[$inp],#64 - eor $A1,$A1,$T0 - eor $B1,$B1,$T1 - eor $C1,$C1,$T2 - eor $D1,$D1,$T3 - st1.8 {$A1-$D1},[$out],#64 - b.eq .Ldone_neon - sub $len,$len,#64 - - st1.8 {$A2-$D2},[sp] - b .Last_neon - -.Less_than_128: - st1.8 {$A0-$D0},[sp] - b .Last_neon -.Less_than_192: - st1.8 {$A1-$D1},[sp] - b .Last_neon - -.align 4 -.Last_neon: - sub $out,$out,#1 - add $inp,$inp,$len - add $out,$out,$len - add $ctr,sp,$len - neg $len,$len - -.Loop_tail_neon: - ldrb w10,[$inp,$len] - ldrb w11,[$ctr,$len] - add $len,$len,#1 - eor w10,w10,w11 - strb w10,[$out,$len] - cbnz $len,.Loop_tail_neon - - stp xzr,xzr,[sp,#0] - stp xzr,xzr,[sp,#16] - stp xzr,xzr,[sp,#32] - stp xzr,xzr,[sp,#48] - -.Ldone_neon: - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - ret -.size ChaCha20_neon,.-ChaCha20_neon -___ -{ -my ($T0,$T1,$T2,$T3,$T4,$T5)=@K; -my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2, - $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23)); - -$code.=<<___; -#ifdef __APPLE__ -.type ChaCha20_512_neon,%function -.align 5 -ChaCha20_512_neon: - stp 
x29,x30,[sp,#-96]! - add x29,sp,#0 - - adr @x[0],.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - -.L512_or_more_neon: - sub sp,sp,#128+64 - - ldp @d[0],@d[1],[@x[0]] // load sigma - ld1 {@K[0]},[@x[0]],#16 - ldp @d[2],@d[3],[$key] // load key - ldp @d[4],@d[5],[$key,#16] - ld1 {@K[1],@K[2]},[$key] - ldp @d[6],@d[7],[$ctr] // load counter - ld1 {@K[3]},[$ctr] - ld1 {$ONE},[@x[0]] -# ifdef __AARCH64EB__ - rev64 @K[0],@K[0] - ror @d[2],@d[2],#32 - ror @d[3],@d[3],#32 - ror @d[4],@d[4],#32 - ror @d[5],@d[5],#32 - ror @d[6],@d[6],#32 - ror @d[7],@d[7],#32 -# endif - add @K[3],@K[3],$ONE // += 1 - stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part - add @K[3],@K[3],$ONE // not typo - str @K[2],[sp,#32] - add @K[4],@K[3],$ONE - add @K[5],@K[4],$ONE - add @K[6],@K[5],$ONE - shl $ONE,$ONE,#2 // 1 -> 4 - - stp d8,d9,[sp,#128+0] // meet ABI requirements - stp d10,d11,[sp,#128+16] - stp d12,d13,[sp,#128+32] - stp d14,d15,[sp,#128+48] - - sub $len,$len,#512 // not typo - -.Loop_outer_512_neon: - mov $A0,@K[0] - mov $A1,@K[0] - mov $A2,@K[0] - mov $A3,@K[0] - mov $A4,@K[0] - mov $A5,@K[0] - mov $B0,@K[1] - mov.32 @x[0],@d[0] // unpack key block - mov $B1,@K[1] - lsr @x[1],@d[0],#32 - mov $B2,@K[1] - mov.32 @x[2],@d[1] - mov $B3,@K[1] - lsr @x[3],@d[1],#32 - mov $B4,@K[1] - mov.32 @x[4],@d[2] - mov $B5,@K[1] - lsr @x[5],@d[2],#32 - mov $D0,@K[3] - mov.32 @x[6],@d[3] - mov $D1,@K[4] - lsr @x[7],@d[3],#32 - mov $D2,@K[5] - mov.32 @x[8],@d[4] - mov $D3,@K[6] - lsr @x[9],@d[4],#32 - mov $C0,@K[2] - mov.32 @x[10],@d[5] - mov $C1,@K[2] - lsr @x[11],@d[5],#32 - add $D4,$D0,$ONE // +4 - mov.32 @x[12],@d[6] - add $D5,$D1,$ONE // +4 - lsr @x[13],@d[6],#32 - mov $C2,@K[2] - mov.32 @x[14],@d[7] - mov $C3,@K[2] - lsr @x[15],@d[7],#32 - mov $C4,@K[2] - stp @K[3],@K[4],[sp,#48] // off-load key block, variable part - mov $C5,@K[2] - str @K[5],[sp,#80] - - mov $ctr,#5 - subs $len,$len,#512 -.Loop_upper_neon: - sub $ctr,$ctr,#1 -___ - my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); - my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); - my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); - my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); - my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); - my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); - my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); - my $diff = ($#thread0+1)*6 - $#thread67 - 1; - my $i = 0; - - foreach (@thread0) { - eval; eval(shift(@thread67)); - eval(shift(@thread1)); eval(shift(@thread67)); - eval(shift(@thread2)); eval(shift(@thread67)); - eval(shift(@thread3)); eval(shift(@thread67)); - eval(shift(@thread4)); eval(shift(@thread67)); - eval(shift(@thread5)); eval(shift(@thread67)); - } - - @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); - @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); - @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); - @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); - @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); - @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); - @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); - - foreach (@thread0) { - eval; eval(shift(@thread67)); - eval(shift(@thread1)); eval(shift(@thread67)); - eval(shift(@thread2)); eval(shift(@thread67)); - eval(shift(@thread3)); eval(shift(@thread67)); - eval(shift(@thread4)); eval(shift(@thread67)); - eval(shift(@thread5)); eval(shift(@thread67)); - } -$code.=<<___; - cbnz $ctr,.Loop_upper_neon - - add.32 @x[0],@x[0],@d[0] // accumulate key block - add @x[1],@x[1],@d[0],lsr#32 - add.32 @x[2],@x[2],@d[1] - add @x[3],@x[3],@d[1],lsr#32 
- add.32 @x[4],@x[4],@d[2] - add @x[5],@x[5],@d[2],lsr#32 - add.32 @x[6],@x[6],@d[3] - add @x[7],@x[7],@d[3],lsr#32 - add.32 @x[8],@x[8],@d[4] - add @x[9],@x[9],@d[4],lsr#32 - add.32 @x[10],@x[10],@d[5] - add @x[11],@x[11],@d[5],lsr#32 - add.32 @x[12],@x[12],@d[6] - add @x[13],@x[13],@d[6],lsr#32 - add.32 @x[14],@x[14],@d[7] - add @x[15],@x[15],@d[7],lsr#32 - - add @x[0],@x[0],@x[1],lsl#32 // pack - add @x[2],@x[2],@x[3],lsl#32 - ldp @x[1],@x[3],[$inp,#0] // load input - add @x[4],@x[4],@x[5],lsl#32 - add @x[6],@x[6],@x[7],lsl#32 - ldp @x[5],@x[7],[$inp,#16] - add @x[8],@x[8],@x[9],lsl#32 - add @x[10],@x[10],@x[11],lsl#32 - ldp @x[9],@x[11],[$inp,#32] - add @x[12],@x[12],@x[13],lsl#32 - add @x[14],@x[14],@x[15],lsl#32 - ldp @x[13],@x[15],[$inp,#48] - add $inp,$inp,#64 -# ifdef __AARCH64EB__ - rev @x[0],@x[0] - rev @x[2],@x[2] - rev @x[4],@x[4] - rev @x[6],@x[6] - rev @x[8],@x[8] - rev @x[10],@x[10] - rev @x[12],@x[12] - rev @x[14],@x[14] -# endif - eor @x[0],@x[0],@x[1] - eor @x[2],@x[2],@x[3] - eor @x[4],@x[4],@x[5] - eor @x[6],@x[6],@x[7] - eor @x[8],@x[8],@x[9] - eor @x[10],@x[10],@x[11] - eor @x[12],@x[12],@x[13] - eor @x[14],@x[14],@x[15] - - stp @x[0],@x[2],[$out,#0] // store output - add @d[6],@d[6],#1 // increment counter - mov.32 @x[0],@d[0] // unpack key block - lsr @x[1],@d[0],#32 - stp @x[4],@x[6],[$out,#16] - mov.32 @x[2],@d[1] - lsr @x[3],@d[1],#32 - stp @x[8],@x[10],[$out,#32] - mov.32 @x[4],@d[2] - lsr @x[5],@d[2],#32 - stp @x[12],@x[14],[$out,#48] - add $out,$out,#64 - mov.32 @x[6],@d[3] - lsr @x[7],@d[3],#32 - mov.32 @x[8],@d[4] - lsr @x[9],@d[4],#32 - mov.32 @x[10],@d[5] - lsr @x[11],@d[5],#32 - mov.32 @x[12],@d[6] - lsr @x[13],@d[6],#32 - mov.32 @x[14],@d[7] - lsr @x[15],@d[7],#32 - - mov $ctr,#5 -.Loop_lower_neon: - sub $ctr,$ctr,#1 -___ - @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); - @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); - @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); - @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); - @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); - @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); - @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); - - foreach (@thread0) { - eval; eval(shift(@thread67)); - eval(shift(@thread1)); eval(shift(@thread67)); - eval(shift(@thread2)); eval(shift(@thread67)); - eval(shift(@thread3)); eval(shift(@thread67)); - eval(shift(@thread4)); eval(shift(@thread67)); - eval(shift(@thread5)); eval(shift(@thread67)); - } - - @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); - @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); - @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); - @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); - @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); - @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); - @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); - - foreach (@thread0) { - eval; eval(shift(@thread67)); - eval(shift(@thread1)); eval(shift(@thread67)); - eval(shift(@thread2)); eval(shift(@thread67)); - eval(shift(@thread3)); eval(shift(@thread67)); - eval(shift(@thread4)); eval(shift(@thread67)); - eval(shift(@thread5)); eval(shift(@thread67)); - } -$code.=<<___; - cbnz $ctr,.Loop_lower_neon - - add.32 @x[0],@x[0],@d[0] // accumulate key block - ldp @K[0],@K[1],[sp,#0] - add @x[1],@x[1],@d[0],lsr#32 - ldp @K[2],@K[3],[sp,#32] - add.32 @x[2],@x[2],@d[1] - ldp @K[4],@K[5],[sp,#64] - add @x[3],@x[3],@d[1],lsr#32 - add $A0,$A0,@K[0] - add.32 @x[4],@x[4],@d[2] - add $A1,$A1,@K[0] - add @x[5],@x[5],@d[2],lsr#32 - add $A2,$A2,@K[0] - add.32 @x[6],@x[6],@d[3] - add $A3,$A3,@K[0] - add @x[7],@x[7],@d[3],lsr#32 - add $A4,$A4,@K[0] - add.32 
@x[8],@x[8],@d[4] - add $A5,$A5,@K[0] - add @x[9],@x[9],@d[4],lsr#32 - add $C0,$C0,@K[2] - add.32 @x[10],@x[10],@d[5] - add $C1,$C1,@K[2] - add @x[11],@x[11],@d[5],lsr#32 - add $C2,$C2,@K[2] - add.32 @x[12],@x[12],@d[6] - add $C3,$C3,@K[2] - add @x[13],@x[13],@d[6],lsr#32 - add $C4,$C4,@K[2] - add.32 @x[14],@x[14],@d[7] - add $C5,$C5,@K[2] - add @x[15],@x[15],@d[7],lsr#32 - add $D4,$D4,$ONE // +4 - add @x[0],@x[0],@x[1],lsl#32 // pack - add $D5,$D5,$ONE // +4 - add @x[2],@x[2],@x[3],lsl#32 - add $D0,$D0,@K[3] - ldp @x[1],@x[3],[$inp,#0] // load input - add $D1,$D1,@K[4] - add @x[4],@x[4],@x[5],lsl#32 - add $D2,$D2,@K[5] - add @x[6],@x[6],@x[7],lsl#32 - add $D3,$D3,@K[6] - ldp @x[5],@x[7],[$inp,#16] - add $D4,$D4,@K[3] - add @x[8],@x[8],@x[9],lsl#32 - add $D5,$D5,@K[4] - add @x[10],@x[10],@x[11],lsl#32 - add $B0,$B0,@K[1] - ldp @x[9],@x[11],[$inp,#32] - add $B1,$B1,@K[1] - add @x[12],@x[12],@x[13],lsl#32 - add $B2,$B2,@K[1] - add @x[14],@x[14],@x[15],lsl#32 - add $B3,$B3,@K[1] - ldp @x[13],@x[15],[$inp,#48] - add $B4,$B4,@K[1] - add $inp,$inp,#64 - add $B5,$B5,@K[1] - -# ifdef __AARCH64EB__ - rev @x[0],@x[0] - rev @x[2],@x[2] - rev @x[4],@x[4] - rev @x[6],@x[6] - rev @x[8],@x[8] - rev @x[10],@x[10] - rev @x[12],@x[12] - rev @x[14],@x[14] -# endif - ld1.8 {$T0-$T3},[$inp],#64 - eor @x[0],@x[0],@x[1] - eor @x[2],@x[2],@x[3] - eor @x[4],@x[4],@x[5] - eor @x[6],@x[6],@x[7] - eor @x[8],@x[8],@x[9] - eor $A0,$A0,$T0 - eor @x[10],@x[10],@x[11] - eor $B0,$B0,$T1 - eor @x[12],@x[12],@x[13] - eor $C0,$C0,$T2 - eor @x[14],@x[14],@x[15] - eor $D0,$D0,$T3 - ld1.8 {$T0-$T3},[$inp],#64 - - stp @x[0],@x[2],[$out,#0] // store output - add @d[6],@d[6],#7 // increment counter - stp @x[4],@x[6],[$out,#16] - stp @x[8],@x[10],[$out,#32] - stp @x[12],@x[14],[$out,#48] - add $out,$out,#64 - st1.8 {$A0-$D0},[$out],#64 - - ld1.8 {$A0-$D0},[$inp],#64 - eor $A1,$A1,$T0 - eor $B1,$B1,$T1 - eor $C1,$C1,$T2 - eor $D1,$D1,$T3 - st1.8 {$A1-$D1},[$out],#64 - - ld1.8 {$A1-$D1},[$inp],#64 - eor $A2,$A2,$A0 - ldp @K[0],@K[1],[sp,#0] - eor $B2,$B2,$B0 - ldp @K[2],@K[3],[sp,#32] - eor $C2,$C2,$C0 - eor $D2,$D2,$D0 - st1.8 {$A2-$D2},[$out],#64 - - ld1.8 {$A2-$D2},[$inp],#64 - eor $A3,$A3,$A1 - eor $B3,$B3,$B1 - eor $C3,$C3,$C1 - eor $D3,$D3,$D1 - st1.8 {$A3-$D3},[$out],#64 - - ld1.8 {$A3-$D3},[$inp],#64 - eor $A4,$A4,$A2 - eor $B4,$B4,$B2 - eor $C4,$C4,$C2 - eor $D4,$D4,$D2 - st1.8 {$A4-$D4},[$out],#64 - - shl $A0,$ONE,#1 // 4 -> 8 - eor $A5,$A5,$A3 - eor $B5,$B5,$B3 - eor $C5,$C5,$C3 - eor $D5,$D5,$D3 - st1.8 {$A5-$D5},[$out],#64 - - add @K[3],@K[3],$A0 // += 8 - add @K[4],@K[4],$A0 - add @K[5],@K[5],$A0 - add @K[6],@K[6],$A0 - - b.hs .Loop_outer_512_neon - - adds $len,$len,#512 - ushr $A0,$ONE,#2 // 4 -> 1 - - ldp d8,d9,[sp,#128+0] // meet ABI requirements - ldp d10,d11,[sp,#128+16] - ldp d12,d13,[sp,#128+32] - ldp d14,d15,[sp,#128+48] - - stp @K[0],$ONE,[sp,#0] // wipe off-load area - stp @K[0],$ONE,[sp,#32] - stp @K[0],$ONE,[sp,#64] - - b.eq .Ldone_512_neon - - cmp $len,#192 - sub @K[3],@K[3],$A0 // -= 1 - sub @K[4],@K[4],$A0 - sub @K[5],@K[5],$A0 - add sp,sp,#128 - b.hs .Loop_outer_neon - - eor @K[1],@K[1],@K[1] - eor @K[2],@K[2],@K[2] - eor @K[3],@K[3],@K[3] - eor @K[4],@K[4],@K[4] - eor @K[5],@K[5],@K[5] - eor @K[6],@K[6],@K[6] - b .Loop_outer - -.Ldone_512_neon: - ldp x19,x20,[x29,#16] - add sp,sp,#128+64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - ret -.size ChaCha20_512_neon,.-ChaCha20_512_neon -#endif -#endif -___ -} -}}} - -open SELF,$0; 
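Aside for readers of the deleted ARM64 generator above: every code path in these removed files (scalar, NEON 3x, NEON 6x) emits interleavings of one and the same primitive, the RFC 8439 ChaCha20 quarter round with rotation constants 16/12/8/7. A minimal standalone C rendering of that primitive, written for this note rather than taken from the removed sources:

	#include <stdint.h>

	static inline uint32_t
	rotl32(uint32_t v, int c)
	{
		return ((v << c) | (v >> (32 - c)));
	}

	/* One ChaCha20 quarter round over state words a, b, c, d (RFC 8439). */
	static inline void
	chacha20_quarter_round(uint32_t x[16], int a, int b, int c, int d)
	{
		x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
		x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
		x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
		x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
	}

The NEONROUND()/ROUND() "threads" above are just several evaluations of this function scheduled in parallel so scalar and vector pipes stay busy at the same time.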
-while(<SELF>) { - next if (/^#!/); - last if (!s/^#/\/\// and !/^$/); - print; -} -close SELF; - -foreach (split("\n",$code)) { - s/\`([^\`]*)\`/eval $1/geo; - - (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or - (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or - (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or - (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or - (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1)); - - print $_,"\n"; -} -close STDOUT; # flush diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips-glue.c b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips-glue.c deleted file mode 100644 index 96ce01e2c133..000000000000 --- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips-glue.c +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - */ - -asmlinkage void chacha20_mips(u32 state[16], u8 *out, const u8 *in, - const size_t len); -static bool *const chacha20_nobs[] __initconst = { }; -static void __init chacha20_fpu_init(void) -{ -} - -static inline bool chacha20_arch(struct chacha20_ctx *ctx, u8 *dst, - const u8 *src, size_t len, - simd_context_t *simd_context) -{ - chacha20_mips(ctx->state, dst, src, len); - return true; -} - -static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS], - const u8 nonce[HCHACHA20_NONCE_SIZE], - const u8 key[HCHACHA20_KEY_SIZE], - simd_context_t *simd_context) -{ - return false; -} diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips.S b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips.S deleted file mode 100644 index a81e02db95e7..000000000000 --- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips.S +++ /dev/null @@ -1,424 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved. - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - */ - -#define MASK_U32 0x3c -#define CHACHA20_BLOCK_SIZE 64 -#define STACK_SIZE 32 - -#define X0 $t0 -#define X1 $t1 -#define X2 $t2 -#define X3 $t3 -#define X4 $t4 -#define X5 $t5 -#define X6 $t6 -#define X7 $t7 -#define X8 $t8 -#define X9 $t9 -#define X10 $v1 -#define X11 $s6 -#define X12 $s5 -#define X13 $s4 -#define X14 $s3 -#define X15 $s2 -/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */ -#define T0 $s1 -#define T1 $s0 -#define T(n) T ## n -#define X(n) X ## n - -/* Input arguments */ -#define STATE $a0 -#define OUT $a1 -#define IN $a2 -#define BYTES $a3 - -/* Output argument */ -/* NONCE[0] is kept in a register and not in memory. - * We don't want to touch original value in memory. - * Must be incremented every loop iteration. - */ -#define NONCE_0 $v0 - -/* SAVED_X and SAVED_CA are set in the jump table. - * Use regs which are overwritten on exit else we don't leak clear data. - * They are used to handling the last bytes which are not multiple of 4. 
- */ -#define SAVED_X X15 -#define SAVED_CA $s7 - -#define IS_UNALIGNED $s7 - -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -#define MSB 0 -#define LSB 3 -#define ROTx rotl -#define ROTR(n) rotr n, 24 -#define CPU_TO_LE32(n) \ - wsbh n; \ - rotr n, 16; -#else -#define MSB 3 -#define LSB 0 -#define ROTx rotr -#define CPU_TO_LE32(n) -#define ROTR(n) -#endif - -#define FOR_EACH_WORD(x) \ - x( 0); \ - x( 1); \ - x( 2); \ - x( 3); \ - x( 4); \ - x( 5); \ - x( 6); \ - x( 7); \ - x( 8); \ - x( 9); \ - x(10); \ - x(11); \ - x(12); \ - x(13); \ - x(14); \ - x(15); - -#define FOR_EACH_WORD_REV(x) \ - x(15); \ - x(14); \ - x(13); \ - x(12); \ - x(11); \ - x(10); \ - x( 9); \ - x( 8); \ - x( 7); \ - x( 6); \ - x( 5); \ - x( 4); \ - x( 3); \ - x( 2); \ - x( 1); \ - x( 0); - -#define PLUS_ONE_0 1 -#define PLUS_ONE_1 2 -#define PLUS_ONE_2 3 -#define PLUS_ONE_3 4 -#define PLUS_ONE_4 5 -#define PLUS_ONE_5 6 -#define PLUS_ONE_6 7 -#define PLUS_ONE_7 8 -#define PLUS_ONE_8 9 -#define PLUS_ONE_9 10 -#define PLUS_ONE_10 11 -#define PLUS_ONE_11 12 -#define PLUS_ONE_12 13 -#define PLUS_ONE_13 14 -#define PLUS_ONE_14 15 -#define PLUS_ONE_15 16 -#define PLUS_ONE(x) PLUS_ONE_ ## x -#define _CONCAT3(a,b,c) a ## b ## c -#define CONCAT3(a,b,c) _CONCAT3(a,b,c) - -#define STORE_UNALIGNED(x) \ -CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \ - .if (x != 12); \ - lw T0, (x*4)(STATE); \ - .endif; \ - lwl T1, (x*4)+MSB ## (IN); \ - lwr T1, (x*4)+LSB ## (IN); \ - .if (x == 12); \ - addu X ## x, NONCE_0; \ - .else; \ - addu X ## x, T0; \ - .endif; \ - CPU_TO_LE32(X ## x); \ - xor X ## x, T1; \ - swl X ## x, (x*4)+MSB ## (OUT); \ - swr X ## x, (x*4)+LSB ## (OUT); - -#define STORE_ALIGNED(x) \ -CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \ - .if (x != 12); \ - lw T0, (x*4)(STATE); \ - .endif; \ - lw T1, (x*4) ## (IN); \ - .if (x == 12); \ - addu X ## x, NONCE_0; \ - .else; \ - addu X ## x, T0; \ - .endif; \ - CPU_TO_LE32(X ## x); \ - xor X ## x, T1; \ - sw X ## x, (x*4) ## (OUT); - -/* Jump table macro. - * Used for setup and handling the last bytes, which are not multiple of 4. - * X15 is free to store Xn - * Every jumptable entry must be equal in size. - */ -#define JMPTBL_ALIGNED(x) \ -.Lchacha20_mips_jmptbl_aligned_ ## x: ; \ - .set noreorder; \ - b .Lchacha20_mips_xor_aligned_ ## x ## _b; \ - .if (x == 12); \ - addu SAVED_X, X ## x, NONCE_0; \ - .else; \ - addu SAVED_X, X ## x, SAVED_CA; \ - .endif; \ - .set reorder - -#define JMPTBL_UNALIGNED(x) \ -.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \ - .set noreorder; \ - b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \ - .if (x == 12); \ - addu SAVED_X, X ## x, NONCE_0; \ - .else; \ - addu SAVED_X, X ## x, SAVED_CA; \ - .endif; \ - .set reorder - -#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \ - addu X(A), X(K); \ - addu X(B), X(L); \ - addu X(C), X(M); \ - addu X(D), X(N); \ - xor X(V), X(A); \ - xor X(W), X(B); \ - xor X(Y), X(C); \ - xor X(Z), X(D); \ - rotl X(V), S; \ - rotl X(W), S; \ - rotl X(Y), S; \ - rotl X(Z), S; - -.text -.set reorder -.set noat -.globl chacha20_mips -.ent chacha20_mips -chacha20_mips: - .frame $sp, STACK_SIZE, $ra - - addiu $sp, -STACK_SIZE - - /* Return bytes = 0. */ - beqz BYTES, .Lchacha20_mips_end - - lw NONCE_0, 48(STATE) - - /* Save s0-s7 */ - sw $s0, 0($sp) - sw $s1, 4($sp) - sw $s2, 8($sp) - sw $s3, 12($sp) - sw $s4, 16($sp) - sw $s5, 20($sp) - sw $s6, 24($sp) - sw $s7, 28($sp) - - /* Test IN or OUT is unaligned. 
- * IS_UNALIGNED = ( IN | OUT ) & 0x00000003 - */ - or IS_UNALIGNED, IN, OUT - andi IS_UNALIGNED, 0x3 - - /* Set number of rounds */ - li $at, 20 - - b .Lchacha20_rounds_start - -.align 4 -.Loop_chacha20_rounds: - addiu IN, CHACHA20_BLOCK_SIZE - addiu OUT, CHACHA20_BLOCK_SIZE - addiu NONCE_0, 1 - -.Lchacha20_rounds_start: - lw X0, 0(STATE) - lw X1, 4(STATE) - lw X2, 8(STATE) - lw X3, 12(STATE) - - lw X4, 16(STATE) - lw X5, 20(STATE) - lw X6, 24(STATE) - lw X7, 28(STATE) - lw X8, 32(STATE) - lw X9, 36(STATE) - lw X10, 40(STATE) - lw X11, 44(STATE) - - move X12, NONCE_0 - lw X13, 52(STATE) - lw X14, 56(STATE) - lw X15, 60(STATE) - -.Loop_chacha20_xor_rounds: - addiu $at, -2 - AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); - AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); - AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); - AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); - AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); - AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); - AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); - AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); - bnez $at, .Loop_chacha20_xor_rounds - - addiu BYTES, -(CHACHA20_BLOCK_SIZE) - - /* Is data src/dst unaligned? Jump */ - bnez IS_UNALIGNED, .Loop_chacha20_unaligned - - /* Set number rounds here to fill delayslot. */ - li $at, 20 - - /* BYTES < 0, it has no full block. */ - bltz BYTES, .Lchacha20_mips_no_full_block_aligned - - FOR_EACH_WORD_REV(STORE_ALIGNED) - - /* BYTES > 0? Loop again. */ - bgtz BYTES, .Loop_chacha20_rounds - - /* Place this here to fill delay slot */ - addiu NONCE_0, 1 - - /* BYTES < 0? Handle last bytes */ - bltz BYTES, .Lchacha20_mips_xor_bytes - -.Lchacha20_mips_xor_done: - /* Restore used registers */ - lw $s0, 0($sp) - lw $s1, 4($sp) - lw $s2, 8($sp) - lw $s3, 12($sp) - lw $s4, 16($sp) - lw $s5, 20($sp) - lw $s6, 24($sp) - lw $s7, 28($sp) - - /* Write NONCE_0 back to right location in state */ - sw NONCE_0, 48(STATE) - -.Lchacha20_mips_end: - addiu $sp, STACK_SIZE - jr $ra - -.Lchacha20_mips_no_full_block_aligned: - /* Restore the offset on BYTES */ - addiu BYTES, CHACHA20_BLOCK_SIZE - - /* Get number of full WORDS */ - andi $at, BYTES, MASK_U32 - - /* Load upper half of jump table addr */ - lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0) - - /* Calculate lower half jump table offset */ - ins T0, $at, 1, 6 - - /* Add offset to STATE */ - addu T1, STATE, $at - - /* Add lower half jump table addr */ - addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0) - - /* Read value from STATE */ - lw SAVED_CA, 0(T1) - - /* Store remaining bytecounter as negative value */ - subu BYTES, $at, BYTES - - jr T0 - - /* Jump table */ - FOR_EACH_WORD(JMPTBL_ALIGNED) - - -.Loop_chacha20_unaligned: - /* Set number rounds here to fill delayslot. */ - li $at, 20 - - /* BYTES > 0, it has no full block. */ - bltz BYTES, .Lchacha20_mips_no_full_block_unaligned - - FOR_EACH_WORD_REV(STORE_UNALIGNED) - - /* BYTES > 0? Loop again. 
*/ - bgtz BYTES, .Loop_chacha20_rounds - - /* Write NONCE_0 back to right location in state */ - sw NONCE_0, 48(STATE) - - .set noreorder - /* Fall through to byte handling */ - bgez BYTES, .Lchacha20_mips_xor_done -.Lchacha20_mips_xor_unaligned_0_b: -.Lchacha20_mips_xor_aligned_0_b: - /* Place this here to fill delay slot */ - addiu NONCE_0, 1 - .set reorder - -.Lchacha20_mips_xor_bytes: - addu IN, $at - addu OUT, $at - /* First byte */ - lbu T1, 0(IN) - addiu $at, BYTES, 1 - CPU_TO_LE32(SAVED_X) - ROTR(SAVED_X) - xor T1, SAVED_X - sb T1, 0(OUT) - beqz $at, .Lchacha20_mips_xor_done - /* Second byte */ - lbu T1, 1(IN) - addiu $at, BYTES, 2 - ROTx SAVED_X, 8 - xor T1, SAVED_X - sb T1, 1(OUT) - beqz $at, .Lchacha20_mips_xor_done - /* Third byte */ - lbu T1, 2(IN) - ROTx SAVED_X, 8 - xor T1, SAVED_X - sb T1, 2(OUT) - b .Lchacha20_mips_xor_done - -.Lchacha20_mips_no_full_block_unaligned: - /* Restore the offset on BYTES */ - addiu BYTES, CHACHA20_BLOCK_SIZE - - /* Get number of full WORDS */ - andi $at, BYTES, MASK_U32 - - /* Load upper half of jump table addr */ - lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0) - - /* Calculate lower half jump table offset */ - ins T0, $at, 1, 6 - - /* Add offset to STATE */ - addu T1, STATE, $at - - /* Add lower half jump table addr */ - addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0) - - /* Read value from STATE */ - lw SAVED_CA, 0(T1) - - /* Store remaining bytecounter as negative value */ - subu BYTES, $at, BYTES - - jr T0 - - /* Jump table */ - FOR_EACH_WORD(JMPTBL_UNALIGNED) -.end chacha20_mips -.set at diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64-glue.c b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64-glue.c deleted file mode 100644 index 1bccec70845c..000000000000 --- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64-glue.c +++ /dev/null @@ -1,132 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 
- */ -#ifdef __linux__ -#include <asm/fpu/api.h> -#include <asm/cpufeature.h> -#include <asm/processor.h> -#include <asm/intel-family.h> -#else -#include <sys/simd-x86_64.h> -#endif - -asmlinkage void hchacha20_ssse3(u32 *derived_key, const u8 *nonce, - const u8 *key); -asmlinkage void chacha20_ssse3(u8 *out, const u8 *in, const size_t len, - const u32 key[8], const u32 counter[4]); -asmlinkage void chacha20_avx2(u8 *out, const u8 *in, const size_t len, - const u32 key[8], const u32 counter[4]); -asmlinkage void chacha20_avx512(u8 *out, const u8 *in, const size_t len, - const u32 key[8], const u32 counter[4]); -asmlinkage void chacha20_avx512vl(u8 *out, const u8 *in, const size_t len, - const u32 key[8], const u32 counter[4]); - -static bool chacha20_use_ssse3 __ro_after_init; -static bool chacha20_use_avx2 __ro_after_init; -static bool chacha20_use_avx512 __ro_after_init; -static bool chacha20_use_avx512vl __ro_after_init; -static bool *const chacha20_nobs[] __initconst = { - &chacha20_use_ssse3, &chacha20_use_avx2, &chacha20_use_avx512, - &chacha20_use_avx512vl }; - -static void __init chacha20_fpu_init(void) -{ -#ifdef __linux__ - chacha20_use_ssse3 = boot_cpu_has(X86_FEATURE_SSSE3); - chacha20_use_avx2 = - boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); -#ifndef COMPAT_CANNOT_USE_AVX512 - chacha20_use_avx512 = - boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_AVX512F) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | - XFEATURE_MASK_AVX512, NULL) && - /* Skylake downclocks unacceptably much when using zmm. */ - boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X; - chacha20_use_avx512vl = - boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_AVX512F) && - boot_cpu_has(X86_FEATURE_AVX512VL) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | - XFEATURE_MASK_AVX512, NULL); -#endif -#else - chacha20_use_ssse3 = !!(cpu_feature2 & CPUID2_SSSE3); - chacha20_use_avx2 = !!(cpu_feature2 & CPUID2_AVX) && - !!(cpu_stdext_feature & CPUID_STDEXT_AVX2) && - __ymm_enabled(); - chacha20_use_avx512 = chacha20_use_avx2 && - !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && - __zmm_enabled(); - chacha20_use_avx512vl = chacha20_use_avx512 && - !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && - !!(cpu_stdext_feature & CPUID_STDEXT_AVX512VL); -#endif - if (bootverbose) - printf("ssse3: %d avx2: %d avx512: %d avx512vl: %d\n", - chacha20_use_ssse3, - chacha20_use_avx2, - chacha20_use_avx512, - chacha20_use_avx512vl); -} - -static inline bool chacha20_arch(struct chacha20_ctx *ctx, u8 *dst, - const u8 *src, size_t len, - simd_context_t *simd_context) -{ - /* SIMD disables preemption, so relax after processing each page. 
*/ - BUILD_BUG_ON(PAGE_SIZE < CHACHA20_BLOCK_SIZE || - PAGE_SIZE % CHACHA20_BLOCK_SIZE); - - if (!chacha20_use_ssse3) { - return false; - } - if (len <= CHACHA20_BLOCK_SIZE) { - return false; - } - if (!simd_use(simd_context)) { - return false; - } - for (;;) { - const size_t bytes = min_t(size_t, len, PAGE_SIZE); - - if (chacha20_use_avx512 && - len >= CHACHA20_BLOCK_SIZE * 8) - chacha20_avx512(dst, src, bytes, ctx->key, ctx->counter); - else if (chacha20_use_avx512vl && - len >= CHACHA20_BLOCK_SIZE * 4) - chacha20_avx512vl(dst, src, bytes, ctx->key, ctx->counter); - else if (chacha20_use_avx2 && - len >= CHACHA20_BLOCK_SIZE * 4) - chacha20_avx2(dst, src, bytes, ctx->key, ctx->counter); - else - chacha20_ssse3(dst, src, bytes, ctx->key, ctx->counter); - ctx->counter[0] += (bytes + 63) / 64; - len -= bytes; - if (!len) - break; - dst += bytes; - src += bytes; - simd_relax(simd_context); - } - - return true; -} - -static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS], - const u8 nonce[HCHACHA20_NONCE_SIZE], - const u8 key[HCHACHA20_KEY_SIZE], - simd_context_t *simd_context) -{ - if (IS_ENABLED(CONFIG_AS_SSSE3) && chacha20_use_ssse3 && - simd_use(simd_context)) { - hchacha20_ssse3(derived_key, nonce, key); - return true; - } - return false; -} diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64.pl b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64.pl deleted file mode 100755 index 29906a66b8b7..000000000000 --- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64.pl +++ /dev/null @@ -1,4106 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause -# -# Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. -# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. -# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. -# -# This code is taken from the OpenSSL project but the author, Andy Polyakov, -# has relicensed it under the licenses specified in the SPDX header above. -# The original headers, including the original license headers, are -# included below for completeness. -# -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# November 2014 -# -# ChaCha20 for x86_64. -# -# December 2016 -# -# Add AVX512F code path. -# -# December 2017 -# -# Add AVX512VL code path. -# -# Performance in cycles per byte out of large buffer. -# -# IALU/gcc 4.8(i) 1x/2xSSSE3(ii) 4xSSSE3 NxAVX(v) -# -# P4 9.48/+99% - - -# Core2 7.83/+55% 7.90/5.76 4.35 -# Westmere 7.19/+50% 5.60/4.50 3.00 -# Sandy Bridge 8.31/+42% 5.45/4.00 2.72 -# Ivy Bridge 6.71/+46% 5.40/? 2.41 -# Haswell 5.92/+43% 5.20/3.45 2.42 1.23 -# Skylake[-X] 5.87/+39% 4.70/3.22 2.31 1.19[0.80(vi)] -# Silvermont 12.0/+33% 7.75/6.90 7.03(iii) -# Knights L 11.7/- ? 
9.60(iii) 0.80 -# Goldmont 10.6/+17% 5.10/3.52 3.28 -# Sledgehammer 7.28/+52% - - -# Bulldozer 9.66/+28% 9.85/5.35(iv) 3.06(iv) -# Ryzen 5.96/+50% 5.19/3.00 2.40 2.09 -# VIA Nano 10.5/+46% 6.72/6.88 6.05 -# -# (i) compared to older gcc 3.x one can observe >2x improvement on -# most platforms; -# (ii) 2xSSSE3 is code path optimized specifically for 128 bytes used -# by chacha20_poly1305_tls_cipher, results are EVP-free; -# (iii) this is not optimal result for Atom because of MSROM -# limitations, SSE2 can do better, but gain is considered too -# low to justify the [maintenance] effort; -# (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20 -# and 4.85 for 128-byte inputs; -# (v) 8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable; -# (vi) even though Skylake-X can execute AVX512F code and deliver 0.57 -# cpb in single thread, the corresponding capability is suppressed; - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); -$kernel=0; $kernel=1 if (!$flavour && !$output); - -if (!$kernel) { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or - die "can't locate x86_64-xlate.pl"; - - open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; - *STDOUT=*OUT; - - if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); - } - - if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { - $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); - $avx += 1 if ($1==2.11 && $2>=8); - } - - if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && - `ml64 2>&1` =~ /Version ([0-9]+)\./) { - $avx = ($1>=10) + ($1>=11); - } - - if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { - $avx = ($2>=3.0) + ($2>3.0); - } -} else { - $avx = 4; # The kernel uses ifdefs for this. 
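The performance table in the header above is in cycles per processed byte, so single-core throughput is simply clock frequency divided by cpb. A quick back-of-the-envelope helper; the 3.0 GHz clock is an illustrative assumption, not a figure from the original comment:

	#include <stdio.h>

	/* bytes/s = clock (Hz) / cycles-per-byte. */
	static double
	cpb_to_bytes_per_sec(double clock_hz, double cycles_per_byte)
	{
		return (clock_hz / cycles_per_byte);
	}

	int
	main(void)
	{
		/* e.g. Skylake at 1.19 cpb and an assumed 3.0 GHz: ~2.5 GB/s. */
		printf("%.2f GB/s\n", cpb_to_bytes_per_sec(3.0e9, 1.19) / 1e9);
		return (0);
	}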
-} - -# input parameter block -($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8"); - -$code.=<<___ if $kernel; -#include <linux/linkage.h> -___ - -sub declare_variable() { - my ($name, $size, $type, $payload) = @_; - if($kernel) { - $code.=".section .rodata.cst$size.L$name, \"aM\", \@progbits, $size\n"; - $code.=".align $size\n"; - $code.=".L$name:\n"; - $code.=".$type $payload\n"; - } else { - $code.=".L$name:\n"; - $code.=".$type $payload\n"; - } -} - -sub declare_function() { - my ($name, $align, $nargs) = @_; - if($kernel) { - $code .= ".align $align\n"; - $code .= "SYM_FUNC_START($name)\n"; - $code .= ".L$name:\n"; - } else { - $code .= ".globl $name\n"; - $code .= ".type $name,\@function,$nargs\n"; - $code .= ".align $align\n"; - $code .= "$name:\n"; - } -} - -sub end_function() { - my ($name) = @_; - if($kernel) { - $code .= "SYM_FUNC_END($name)\n"; - } else { - $code .= ".size $name,.-$name\n"; - } -} - -if(!$kernel) { - $code .= ".text\n"; -} -&declare_variable('zero', 16, 'long', '0,0,0,0'); -&declare_variable('one', 16, 'long', '1,0,0,0'); -&declare_variable('inc', 16, 'long', '0,1,2,3'); -&declare_variable('four', 16, 'long', '4,4,4,4'); -&declare_variable('incy', 32, 'long', '0,2,4,6,1,3,5,7'); -&declare_variable('eight', 32, 'long', '8,8,8,8,8,8,8,8'); -&declare_variable('rot16', 16, 'byte', '0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd'); -&declare_variable('rot24', 16, 'byte', '0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe'); -&declare_variable('twoy', 32, 'long', '2,0,0,0, 2,0,0,0'); -&declare_variable('zeroz', 64, 'long', '0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0'); -&declare_variable('fourz', 64, 'long', '4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0'); -&declare_variable('incz', 64, 'long', '0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15'); -&declare_variable('sixteen', 64, 'long', '16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16'); -&declare_variable('sigma', 16, 'ascii', '"expand 32-byte k"'); - -$code.=<<___ if !$kernel; -.asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" -___ -$code.=".text\n"; - -sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm -{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; - my $arg = pop; - $arg = "\$$arg" if ($arg*1 eq $arg); - $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; -} - -@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)), - "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15))); -@t=("%esi","%edi"); - -sub ROUND { # critical path is 24 cycles per round -my ($a0,$b0,$c0,$d0)=@_; -my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); -my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); -my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); -my ($xc,$xc_)=map("\"$_\"",@t); -my @x=map("\"$_\"",@x); - - # Consider order in which variables are addressed by their - # index: - # - # a b c d - # - # 0 4 8 12 < even round - # 1 5 9 13 - # 2 6 10 14 - # 3 7 11 15 - # 0 5 10 15 < odd round - # 1 6 11 12 - # 2 7 8 13 - # 3 4 9 14 - # - # 'a', 'b' and 'd's are permanently allocated in registers, - # @x[0..7,12..15], while 'c's are maintained in memory. If - # you observe 'c' column, you'll notice that pair of 'c's is - # invariant between rounds. This means that we have to reload - # them once per round, in the middle. This is why you'll see - # bunch of 'c' stores and loads in the middle, but none in - # the beginning or end. - - # Normally instructions would be interleaved to favour in-order - # execution. 
Generally out-of-order cores manage it gracefully, - # but not this time for some reason. As in-order execution - # cores are dying breed, old Atom is the only one around, - # instructions are left uninterleaved. Besides, Atom is better - # off executing 1xSSSE3 code anyway... - - ( - "&add (@x[$a0],@x[$b0])", # Q1 - "&xor (@x[$d0],@x[$a0])", - "&rol (@x[$d0],16)", - "&add (@x[$a1],@x[$b1])", # Q2 - "&xor (@x[$d1],@x[$a1])", - "&rol (@x[$d1],16)", - - "&add ($xc,@x[$d0])", - "&xor (@x[$b0],$xc)", - "&rol (@x[$b0],12)", - "&add ($xc_,@x[$d1])", - "&xor (@x[$b1],$xc_)", - "&rol (@x[$b1],12)", - - "&add (@x[$a0],@x[$b0])", - "&xor (@x[$d0],@x[$a0])", - "&rol (@x[$d0],8)", - "&add (@x[$a1],@x[$b1])", - "&xor (@x[$d1],@x[$a1])", - "&rol (@x[$d1],8)", - - "&add ($xc,@x[$d0])", - "&xor (@x[$b0],$xc)", - "&rol (@x[$b0],7)", - "&add ($xc_,@x[$d1])", - "&xor (@x[$b1],$xc_)", - "&rol (@x[$b1],7)", - - "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's - "&mov (\"4*$c1(%rsp)\",$xc_)", - "&mov ($xc,\"4*$c2(%rsp)\")", - "&mov ($xc_,\"4*$c3(%rsp)\")", - - "&add (@x[$a2],@x[$b2])", # Q3 - "&xor (@x[$d2],@x[$a2])", - "&rol (@x[$d2],16)", - "&add (@x[$a3],@x[$b3])", # Q4 - "&xor (@x[$d3],@x[$a3])", - "&rol (@x[$d3],16)", - - "&add ($xc,@x[$d2])", - "&xor (@x[$b2],$xc)", - "&rol (@x[$b2],12)", - "&add ($xc_,@x[$d3])", - "&xor (@x[$b3],$xc_)", - "&rol (@x[$b3],12)", - - "&add (@x[$a2],@x[$b2])", - "&xor (@x[$d2],@x[$a2])", - "&rol (@x[$d2],8)", - "&add (@x[$a3],@x[$b3])", - "&xor (@x[$d3],@x[$a3])", - "&rol (@x[$d3],8)", - - "&add ($xc,@x[$d2])", - "&xor (@x[$b2],$xc)", - "&rol (@x[$b2],7)", - "&add ($xc_,@x[$d3])", - "&xor (@x[$b3],$xc_)", - "&rol (@x[$b3],7)" - ); -} - -######################################################################## -# Generic code path that handles all lengths on pre-SSSE3 processors. 
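The even/odd index table in the ROUND() comment above, written out as plain C for reference (reusing the chacha20_quarter_round() sketch shown earlier); ten of these double rounds make up ChaCha20's twenty rounds:

	/*
	 * Even round hits the columns of the 4x4 state, odd round the
	 * diagonals -- exactly the index pairs the comment tabulates.
	 */
	static void
	chacha20_double_round(uint32_t x[16])
	{
		/* even round: columns */
		chacha20_quarter_round(x, 0, 4,  8, 12);
		chacha20_quarter_round(x, 1, 5,  9, 13);
		chacha20_quarter_round(x, 2, 6, 10, 14);
		chacha20_quarter_round(x, 3, 7, 11, 15);
		/* odd round: diagonals */
		chacha20_quarter_round(x, 0, 5, 10, 15);
		chacha20_quarter_round(x, 1, 6, 11, 12);
		chacha20_quarter_round(x, 2, 7,  8, 13);
		chacha20_quarter_round(x, 3, 4,  9, 14);
	}

The register-allocation note above then amounts to: 'a', 'b', 'd' live in registers throughout, while the 'c' column is spilled to the stack and reloaded once per round.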
-if(!$kernel) { -&declare_function("chacha20_ctr32", 64, 5); -$code.=<<___; -.cfi_startproc - cmp \$0,$len - je .Lno_data - mov OPENSSL_ia32cap_P+4(%rip),%r9 -___ -$code.=<<___ if ($avx>2); - bt \$48,%r9 # check for AVX512F - jc .Lchacha20_avx512 - test %r9,%r9 # check for AVX512VL - js .Lchacha20_avx512vl -___ -$code.=<<___; - test \$`1<<(41-32)`,%r9d - jnz .Lchacha20_ssse3 -___ -$code.=<<___; - push %rbx -.cfi_push %rbx - push %rbp -.cfi_push %rbp - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$64+24,%rsp -.cfi_adjust_cfa_offset 64+24 -.Lctr32_body: - - #movdqa .Lsigma(%rip),%xmm0 - movdqu ($key),%xmm1 - movdqu 16($key),%xmm2 - movdqu ($counter),%xmm3 - movdqa .Lone(%rip),%xmm4 - - #movdqa %xmm0,4*0(%rsp) # key[0] - movdqa %xmm1,4*4(%rsp) # key[1] - movdqa %xmm2,4*8(%rsp) # key[2] - movdqa %xmm3,4*12(%rsp) # key[3] - mov $len,%rbp # reassign $len - jmp .Loop_outer - -.align 32 -.Loop_outer: - mov \$0x61707865,@x[0] # 'expa' - mov \$0x3320646e,@x[1] # 'nd 3' - mov \$0x79622d32,@x[2] # '2-by' - mov \$0x6b206574,@x[3] # 'te k' - mov 4*4(%rsp),@x[4] - mov 4*5(%rsp),@x[5] - mov 4*6(%rsp),@x[6] - mov 4*7(%rsp),@x[7] - movd %xmm3,@x[12] - mov 4*13(%rsp),@x[13] - mov 4*14(%rsp),@x[14] - mov 4*15(%rsp),@x[15] - - mov %rbp,64+0(%rsp) # save len - mov \$10,%ebp - mov $inp,64+8(%rsp) # save inp - movq %xmm2,%rsi # "@x[8]" - mov $out,64+16(%rsp) # save out - mov %rsi,%rdi - shr \$32,%rdi # "@x[9]" - jmp .Loop - -.align 32 -.Loop: -___ - foreach (&ROUND (0, 4, 8,12)) { eval; } - foreach (&ROUND (0, 5,10,15)) { eval; } - &dec ("%ebp"); - &jnz (".Loop"); - -$code.=<<___; - mov @t[1],4*9(%rsp) # modulo-scheduled - mov @t[0],4*8(%rsp) - mov 64(%rsp),%rbp # load len - movdqa %xmm2,%xmm1 - mov 64+8(%rsp),$inp # load inp - paddd %xmm4,%xmm3 # increment counter - mov 64+16(%rsp),$out # load out - - add \$0x61707865,@x[0] # 'expa' - add \$0x3320646e,@x[1] # 'nd 3' - add \$0x79622d32,@x[2] # '2-by' - add \$0x6b206574,@x[3] # 'te k' - add 4*4(%rsp),@x[4] - add 4*5(%rsp),@x[5] - add 4*6(%rsp),@x[6] - add 4*7(%rsp),@x[7] - add 4*12(%rsp),@x[12] - add 4*13(%rsp),@x[13] - add 4*14(%rsp),@x[14] - add 4*15(%rsp),@x[15] - paddd 4*8(%rsp),%xmm1 - - cmp \$64,%rbp - jb .Ltail - - xor 4*0($inp),@x[0] # xor with input - xor 4*1($inp),@x[1] - xor 4*2($inp),@x[2] - xor 4*3($inp),@x[3] - xor 4*4($inp),@x[4] - xor 4*5($inp),@x[5] - xor 4*6($inp),@x[6] - xor 4*7($inp),@x[7] - movdqu 4*8($inp),%xmm0 - xor 4*12($inp),@x[12] - xor 4*13($inp),@x[13] - xor 4*14($inp),@x[14] - xor 4*15($inp),@x[15] - lea 4*16($inp),$inp # inp+=64 - pxor %xmm1,%xmm0 - - movdqa %xmm2,4*8(%rsp) - movd %xmm3,4*12(%rsp) - - mov @x[0],4*0($out) # write output - mov @x[1],4*1($out) - mov @x[2],4*2($out) - mov @x[3],4*3($out) - mov @x[4],4*4($out) - mov @x[5],4*5($out) - mov @x[6],4*6($out) - mov @x[7],4*7($out) - movdqu %xmm0,4*8($out) - mov @x[12],4*12($out) - mov @x[13],4*13($out) - mov @x[14],4*14($out) - mov @x[15],4*15($out) - lea 4*16($out),$out # out+=64 - - sub \$64,%rbp - jnz .Loop_outer - - jmp .Ldone - -.align 16 -.Ltail: - mov @x[0],4*0(%rsp) - mov @x[1],4*1(%rsp) - xor %rbx,%rbx - mov @x[2],4*2(%rsp) - mov @x[3],4*3(%rsp) - mov @x[4],4*4(%rsp) - mov @x[5],4*5(%rsp) - mov @x[6],4*6(%rsp) - mov @x[7],4*7(%rsp) - movdqa %xmm1,4*8(%rsp) - mov @x[12],4*12(%rsp) - mov @x[13],4*13(%rsp) - mov @x[14],4*14(%rsp) - mov @x[15],4*15(%rsp) - -.Loop_tail: - movzb ($inp,%rbx),%eax - movzb (%rsp,%rbx),%edx - lea 1(%rbx),%rbx - xor %edx,%eax - mov %al,-1($out,%rbx) - dec %rbp - jnz 
.Loop_tail - -.Ldone: - add \$64+24,%rsp -.cfi_adjust_cfa_offset -64-24 - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbp -.cfi_restore %rbp - pop %rbx -.cfi_restore %rbx -.Lno_data: - ret -.cfi_endproc -___ -&end_function("chacha20_ctr32"); -} - -######################################################################## -# SSSE3 code path that handles shorter lengths -{ -my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7)); - -sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round - &paddd ($a,$b); - &pxor ($d,$a); - &pshufb ($d,$rot16); - - &paddd ($c,$d); - &pxor ($b,$c); - &movdqa ($t,$b); - &psrld ($b,20); - &pslld ($t,12); - &por ($b,$t); - - &paddd ($a,$b); - &pxor ($d,$a); - &pshufb ($d,$rot24); - - &paddd ($c,$d); - &pxor ($b,$c); - &movdqa ($t,$b); - &psrld ($b,25); - &pslld ($t,7); - &por ($b,$t); -} - -my $xframe = $win64 ? 32+8 : 8; - -if($kernel) { - $code .= "#ifdef CONFIG_AS_SSSE3\n"; -} - -if($kernel) { -&declare_function("hchacha20_ssse3", 32, 5); -$code.=<<___; - movdqa .Lsigma(%rip),$a - movdqu ($len),$b - movdqu 16($len),$c - movdqu ($inp),$d - # This code is only used when targeting kernel. - # If targeting win64, xmm{6,7} preserving needs to be added. - movdqa .Lrot16(%rip),$rot16 - movdqa .Lrot24(%rip),$rot24 - mov \$10,$counter # reuse $counter - jmp 1f -.align 32 -1: -___ - &SSSE3ROUND(); - &pshufd ($a,$a,0b10010011); - &pshufd ($d,$d,0b01001110); - &pshufd ($c,$c,0b00111001); - &nop (); - - &SSSE3ROUND(); - &pshufd ($a,$a,0b00111001); - &pshufd ($d,$d,0b01001110); - &pshufd ($c,$c,0b10010011); - - &dec ($counter); - &jnz ("1b"); - -$code.=<<___; - movdqu $a, ($out) - movdqu $d, 16($out) - ret -___ -&end_function("hchacha20_ssse3"); -} - -&declare_function("chacha20_ssse3", 32, 5); -$code.=<<___; -.cfi_startproc - lea 8(%rsp),%r10 # frame pointer -.cfi_def_cfa_register %r10 -___ -$code.=<<___ if ($avx && !$kernel); - test \$`1<<(43-32)`,%r10d - jnz .Lchacha20_4xop # XOP is fastest even if we use 1/4 -___ -$code.=<<___; - cmp \$128,$len # we might throw away some data, - je .Lchacha20_128 - ja .Lchacha20_4x # but overall it won't be slower - -.Ldo_ssse3_after_all: - sub \$64+$xframe,%rsp - and \$-16,%rsp -___ -$code.=<<___ if ($win64); - movaps %xmm6,-0x30(%r10) - movaps %xmm7,-0x20(%r10) -.Lssse3_body: -___ -$code.=<<___; - movdqa .Lsigma(%rip),$a - movdqu ($key),$b - movdqu 16($key),$c - movdqu ($counter),$d - movdqa .Lrot16(%rip),$rot16 - movdqa .Lrot24(%rip),$rot24 - - movdqa $a,0x00(%rsp) - movdqa $b,0x10(%rsp) - movdqa $c,0x20(%rsp) - movdqa $d,0x30(%rsp) - mov \$10,$counter # reuse $counter - jmp .Loop_ssse3 - -.align 32 -.Loop_outer_ssse3: - movdqa .Lone(%rip),$d - movdqa 0x00(%rsp),$a - movdqa 0x10(%rsp),$b - movdqa 0x20(%rsp),$c - paddd 0x30(%rsp),$d - mov \$10,$counter - movdqa $d,0x30(%rsp) - jmp .Loop_ssse3 - -.align 32 -.Loop_ssse3: -___ - &SSSE3ROUND(); - &pshufd ($a,$a,0b10010011); - &pshufd ($d,$d,0b01001110); - &pshufd ($c,$c,0b00111001); - &nop (); - - &SSSE3ROUND(); - &pshufd ($a,$a,0b00111001); - &pshufd ($d,$d,0b01001110); - &pshufd ($c,$c,0b10010011); - - &dec ($counter); - &jnz (".Loop_ssse3"); - -$code.=<<___; - paddd 0x00(%rsp),$a - paddd 0x10(%rsp),$b - paddd 0x20(%rsp),$c - paddd 0x30(%rsp),$d - - cmp \$64,$len - jb .Ltail_ssse3 - - movdqu 0x00($inp),$t - movdqu 0x10($inp),$t1 - pxor $t,$a # xor with input - movdqu 0x20($inp),$t - pxor $t1,$b - movdqu 0x30($inp),$t1 - lea 0x40($inp),$inp # inp+=64 - pxor $t,$c - pxor $t1,$d - - movdqu 
$a,0x00($out) # write output - movdqu $b,0x10($out) - movdqu $c,0x20($out) - movdqu $d,0x30($out) - lea 0x40($out),$out # out+=64 - - sub \$64,$len - jnz .Loop_outer_ssse3 - - jmp .Ldone_ssse3 - -.align 16 -.Ltail_ssse3: - movdqa $a,0x00(%rsp) - movdqa $b,0x10(%rsp) - movdqa $c,0x20(%rsp) - movdqa $d,0x30(%rsp) - xor $counter,$counter - -.Loop_tail_ssse3: - movzb ($inp,$counter),%eax - movzb (%rsp,$counter),%ecx - lea 1($counter),$counter - xor %ecx,%eax - mov %al,-1($out,$counter) - dec $len - jnz .Loop_tail_ssse3 - -.Ldone_ssse3: -___ -$code.=<<___ if ($win64); - movaps -0x30(%r10),%xmm6 - movaps -0x20(%r10),%xmm7 -___ -$code.=<<___; - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -.Lssse3_epilogue: - ret -.cfi_endproc -___ -} -&end_function("chacha20_ssse3"); - -######################################################################## -# SSSE3 code path that handles 128-byte inputs -{ -my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7)); -my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1)); - -sub SSSE3ROUND_2x { - &paddd ($a,$b); - &pxor ($d,$a); - &paddd ($a1,$b1); - &pxor ($d1,$a1); - &pshufb ($d,$rot16); - &pshufb($d1,$rot16); - - &paddd ($c,$d); - &paddd ($c1,$d1); - &pxor ($b,$c); - &pxor ($b1,$c1); - &movdqa ($t,$b); - &psrld ($b,20); - &movdqa($t1,$b1); - &pslld ($t,12); - &psrld ($b1,20); - &por ($b,$t); - &pslld ($t1,12); - &por ($b1,$t1); - - &paddd ($a,$b); - &pxor ($d,$a); - &paddd ($a1,$b1); - &pxor ($d1,$a1); - &pshufb ($d,$rot24); - &pshufb($d1,$rot24); - - &paddd ($c,$d); - &paddd ($c1,$d1); - &pxor ($b,$c); - &pxor ($b1,$c1); - &movdqa ($t,$b); - &psrld ($b,25); - &movdqa($t1,$b1); - &pslld ($t,7); - &psrld ($b1,25); - &por ($b,$t); - &pslld ($t1,7); - &por ($b1,$t1); -} - -my $xframe = $win64 ? 0x68 : 8; - -$code.=<<___; -.type chacha20_128,\@function,5 -.align 32 -chacha20_128: -.cfi_startproc -.Lchacha20_128: - lea 8(%rsp),%r10 # frame pointer -.cfi_def_cfa_register %r10 - sub \$64+$xframe,%rsp - and \$-16,%rsp -___ -$code.=<<___ if ($win64); - movaps %xmm6,-0x70(%r10) - movaps %xmm7,-0x60(%r10) - movaps %xmm8,-0x50(%r10) - movaps %xmm9,-0x40(%r10) - movaps %xmm10,-0x30(%r10) - movaps %xmm11,-0x20(%r10) -.L128_body: -___ -$code.=<<___; - movdqa .Lsigma(%rip),$a - movdqu ($key),$b - movdqu 16($key),$c - movdqu ($counter),$d - movdqa .Lone(%rip),$d1 - movdqa .Lrot16(%rip),$rot16 - movdqa .Lrot24(%rip),$rot24 - - movdqa $a,$a1 - movdqa $a,0x00(%rsp) - movdqa $b,$b1 - movdqa $b,0x10(%rsp) - movdqa $c,$c1 - movdqa $c,0x20(%rsp) - paddd $d,$d1 - movdqa $d,0x30(%rsp) - mov \$10,$counter # reuse $counter - jmp .Loop_128 - -.align 32 -.Loop_128: -___ - &SSSE3ROUND_2x(); - &pshufd ($a,$a,0b10010011); - &pshufd ($d,$d,0b01001110); - &pshufd ($c,$c,0b00111001); - &pshufd ($a1,$a1,0b10010011); - &pshufd ($d1,$d1,0b01001110); - &pshufd ($c1,$c1,0b00111001); - - &SSSE3ROUND_2x(); - &pshufd ($a,$a,0b00111001); - &pshufd ($d,$d,0b01001110); - &pshufd ($c,$c,0b10010011); - &pshufd ($a1,$a1,0b00111001); - &pshufd ($d1,$d1,0b01001110); - &pshufd ($c1,$c1,0b10010011); - - &dec ($counter); - &jnz (".Loop_128"); - -$code.=<<___; - paddd 0x00(%rsp),$a - paddd 0x10(%rsp),$b - paddd 0x20(%rsp),$c - paddd 0x30(%rsp),$d - paddd .Lone(%rip),$d1 - paddd 0x00(%rsp),$a1 - paddd 0x10(%rsp),$b1 - paddd 0x20(%rsp),$c1 - paddd 0x30(%rsp),$d1 - - movdqu 0x00($inp),$t - movdqu 0x10($inp),$t1 - pxor $t,$a # xor with input - movdqu 0x20($inp),$t - pxor $t1,$b - movdqu 0x30($inp),$t1 - pxor $t,$c - movdqu 0x40($inp),$t - pxor $t1,$d - movdqu 0x50($inp),$t1 - pxor $t,$a1 - movdqu 0x60($inp),$t - pxor 
$t1,$b1 - movdqu 0x70($inp),$t1 - pxor $t,$c1 - pxor $t1,$d1 - - movdqu $a,0x00($out) # write output - movdqu $b,0x10($out) - movdqu $c,0x20($out) - movdqu $d,0x30($out) - movdqu $a1,0x40($out) - movdqu $b1,0x50($out) - movdqu $c1,0x60($out) - movdqu $d1,0x70($out) -___ -$code.=<<___ if ($win64); - movaps -0x70(%r10),%xmm6 - movaps -0x60(%r10),%xmm7 - movaps -0x50(%r10),%xmm8 - movaps -0x40(%r10),%xmm9 - movaps -0x30(%r10),%xmm10 - movaps -0x20(%r10),%xmm11 -___ -$code.=<<___; - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -.L128_epilogue: - ret -.cfi_endproc -.size chacha20_128,.-chacha20_128 -___ -} - -######################################################################## -# SSSE3 code path that handles longer messages. -{ -# assign variables to favor Atom front-end -my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3, - $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15)); -my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, - "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); - -sub SSSE3_lane_ROUND { -my ($a0,$b0,$c0,$d0)=@_; -my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); -my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); -my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); -my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); -my @x=map("\"$_\"",@xx); - - # Consider order in which variables are addressed by their - # index: - # - # a b c d - # - # 0 4 8 12 < even round - # 1 5 9 13 - # 2 6 10 14 - # 3 7 11 15 - # 0 5 10 15 < odd round - # 1 6 11 12 - # 2 7 8 13 - # 3 4 9 14 - # - # 'a', 'b' and 'd's are permanently allocated in registers, - # @x[0..7,12..15], while 'c's are maintained in memory. If - # you observe 'c' column, you'll notice that pair of 'c's is - # invariant between rounds. This means that we have to reload - # them once per round, in the middle. This is why you'll see - # bunch of 'c' stores and loads in the middle, but none in - # the beginning or end. 
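For orientation in the four-way path that follows: chacha20_4x keeps four independent blocks in flight, and the "smash key by lanes" pshufd broadcasts below set up a structure-of-arrays view in which SIMD slot i of a register holds state word i of all four blocks. A hypothetical C illustration of that layout (names invented for this sketch):

	#include <stdint.h>

	/*
	 * lane[w][b] = word w of block b.  Word 12 (the block counter) is
	 * the only lane-varying word; .Linc adds 0,1,2,3 across the lanes.
	 */
	static void
	smash_by_lanes(const uint32_t state[16], uint32_t lane[16][4])
	{
		for (int w = 0; w < 16; w++)
			for (int b = 0; b < 4; b++)
				lane[w][b] = state[w];	/* pshufd broadcast */
		for (int b = 0; b < 4; b++)
			lane[12][b] += (uint32_t)b;	/* paddd .Linc(%rip) */
	}

With this transpose, one paddd/pxor/pshufb advances the same quarter-round step of four blocks at once, which is why the lane ROUND above mirrors the scalar index table instruction for instruction.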
- - ( - "&paddd (@x[$a0],@x[$b0])", # Q1 - "&paddd (@x[$a1],@x[$b1])", # Q2 - "&pxor (@x[$d0],@x[$a0])", - "&pxor (@x[$d1],@x[$a1])", - "&pshufb (@x[$d0],$t1)", - "&pshufb (@x[$d1],$t1)", - - "&paddd ($xc,@x[$d0])", - "&paddd ($xc_,@x[$d1])", - "&pxor (@x[$b0],$xc)", - "&pxor (@x[$b1],$xc_)", - "&movdqa ($t0,@x[$b0])", - "&pslld (@x[$b0],12)", - "&psrld ($t0,20)", - "&movdqa ($t1,@x[$b1])", - "&pslld (@x[$b1],12)", - "&por (@x[$b0],$t0)", - "&psrld ($t1,20)", - "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) - "&por (@x[$b1],$t1)", - - "&paddd (@x[$a0],@x[$b0])", - "&paddd (@x[$a1],@x[$b1])", - "&pxor (@x[$d0],@x[$a0])", - "&pxor (@x[$d1],@x[$a1])", - "&pshufb (@x[$d0],$t0)", - "&pshufb (@x[$d1],$t0)", - - "&paddd ($xc,@x[$d0])", - "&paddd ($xc_,@x[$d1])", - "&pxor (@x[$b0],$xc)", - "&pxor (@x[$b1],$xc_)", - "&movdqa ($t1,@x[$b0])", - "&pslld (@x[$b0],7)", - "&psrld ($t1,25)", - "&movdqa ($t0,@x[$b1])", - "&pslld (@x[$b1],7)", - "&por (@x[$b0],$t1)", - "&psrld ($t0,25)", - "&movdqa ($t1,'(%r9)')", # .Lrot16(%rip) - "&por (@x[$b1],$t0)", - - "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's - "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)", - "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")", - "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")", - - "&paddd (@x[$a2],@x[$b2])", # Q3 - "&paddd (@x[$a3],@x[$b3])", # Q4 - "&pxor (@x[$d2],@x[$a2])", - "&pxor (@x[$d3],@x[$a3])", - "&pshufb (@x[$d2],$t1)", - "&pshufb (@x[$d3],$t1)", - - "&paddd ($xc,@x[$d2])", - "&paddd ($xc_,@x[$d3])", - "&pxor (@x[$b2],$xc)", - "&pxor (@x[$b3],$xc_)", - "&movdqa ($t0,@x[$b2])", - "&pslld (@x[$b2],12)", - "&psrld ($t0,20)", - "&movdqa ($t1,@x[$b3])", - "&pslld (@x[$b3],12)", - "&por (@x[$b2],$t0)", - "&psrld ($t1,20)", - "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) - "&por (@x[$b3],$t1)", - - "&paddd (@x[$a2],@x[$b2])", - "&paddd (@x[$a3],@x[$b3])", - "&pxor (@x[$d2],@x[$a2])", - "&pxor (@x[$d3],@x[$a3])", - "&pshufb (@x[$d2],$t0)", - "&pshufb (@x[$d3],$t0)", - - "&paddd ($xc,@x[$d2])", - "&paddd ($xc_,@x[$d3])", - "&pxor (@x[$b2],$xc)", - "&pxor (@x[$b3],$xc_)", - "&movdqa ($t1,@x[$b2])", - "&pslld (@x[$b2],7)", - "&psrld ($t1,25)", - "&movdqa ($t0,@x[$b3])", - "&pslld (@x[$b3],7)", - "&por (@x[$b2],$t1)", - "&psrld ($t0,25)", - "&movdqa ($t1,'(%r9)')", # .Lrot16(%rip) - "&por (@x[$b3],$t0)" - ); -} - -my $xframe = $win64 ? 0xa8 : 8; - -$code.=<<___; -.type chacha20_4x,\@function,5 -.align 32 -chacha20_4x: -.cfi_startproc -.Lchacha20_4x: - lea 8(%rsp),%r10 # frame pointer -.cfi_def_cfa_register %r10 -___ -$code.=<<___ if (!$kernel); - mov %r9,%r11 -___ -$code.=<<___ if ($avx>1 && !$kernel); - shr \$32,%r9 # OPENSSL_ia32cap_P+8 - test \$`1<<5`,%r9 # test AVX2 - jnz .Lchacha20_8x -___ -$code.=<<___; - cmp \$192,$len - ja .Lproceed4x -___ -$code.=<<___ if (!$kernel); - and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE - cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE - je .Ldo_ssse3_after_all # to detect Atom -___ -$code.=<<___; -.Lproceed4x: - sub \$0x140+$xframe,%rsp - and \$-16,%rsp -___ - ################ stack layout - # +0x00 SIMD equivalent of @x[8-12] - # ... - # +0x40 constant copy of key[0-2] smashed by lanes - # ... - # +0x100 SIMD counters (with nonce smashed by lanes) - # ... 
- # +0x140 -$code.=<<___ if ($win64); - movaps %xmm6,-0xb0(%r10) - movaps %xmm7,-0xa0(%r10) - movaps %xmm8,-0x90(%r10) - movaps %xmm9,-0x80(%r10) - movaps %xmm10,-0x70(%r10) - movaps %xmm11,-0x60(%r10) - movaps %xmm12,-0x50(%r10) - movaps %xmm13,-0x40(%r10) - movaps %xmm14,-0x30(%r10) - movaps %xmm15,-0x20(%r10) -.L4x_body: -___ -$code.=<<___; - movdqa .Lsigma(%rip),$xa3 # key[0] - movdqu ($key),$xb3 # key[1] - movdqu 16($key),$xt3 # key[2] - movdqu ($counter),$xd3 # key[3] - lea 0x100(%rsp),%rcx # size optimization - lea .Lrot16(%rip),%r9 - lea .Lrot24(%rip),%r11 - - pshufd \$0x00,$xa3,$xa0 # smash key by lanes... - pshufd \$0x55,$xa3,$xa1 - movdqa $xa0,0x40(%rsp) # ... and offload - pshufd \$0xaa,$xa3,$xa2 - movdqa $xa1,0x50(%rsp) - pshufd \$0xff,$xa3,$xa3 - movdqa $xa2,0x60(%rsp) - movdqa $xa3,0x70(%rsp) - - pshufd \$0x00,$xb3,$xb0 - pshufd \$0x55,$xb3,$xb1 - movdqa $xb0,0x80-0x100(%rcx) - pshufd \$0xaa,$xb3,$xb2 - movdqa $xb1,0x90-0x100(%rcx) - pshufd \$0xff,$xb3,$xb3 - movdqa $xb2,0xa0-0x100(%rcx) - movdqa $xb3,0xb0-0x100(%rcx) - - pshufd \$0x00,$xt3,$xt0 # "$xc0" - pshufd \$0x55,$xt3,$xt1 # "$xc1" - movdqa $xt0,0xc0-0x100(%rcx) - pshufd \$0xaa,$xt3,$xt2 # "$xc2" - movdqa $xt1,0xd0-0x100(%rcx) - pshufd \$0xff,$xt3,$xt3 # "$xc3" - movdqa $xt2,0xe0-0x100(%rcx) - movdqa $xt3,0xf0-0x100(%rcx) - - pshufd \$0x00,$xd3,$xd0 - pshufd \$0x55,$xd3,$xd1 - paddd .Linc(%rip),$xd0 # don't save counters yet - pshufd \$0xaa,$xd3,$xd2 - movdqa $xd1,0x110-0x100(%rcx) - pshufd \$0xff,$xd3,$xd3 - movdqa $xd2,0x120-0x100(%rcx) - movdqa $xd3,0x130-0x100(%rcx) - - jmp .Loop_enter4x - -.align 32 -.Loop_outer4x: - movdqa 0x40(%rsp),$xa0 # re-load smashed key - movdqa 0x50(%rsp),$xa1 - movdqa 0x60(%rsp),$xa2 - movdqa 0x70(%rsp),$xa3 - movdqa 0x80-0x100(%rcx),$xb0 - movdqa 0x90-0x100(%rcx),$xb1 - movdqa 0xa0-0x100(%rcx),$xb2 - movdqa 0xb0-0x100(%rcx),$xb3 - movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" - movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" - movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" - movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" - movdqa 0x100-0x100(%rcx),$xd0 - movdqa 0x110-0x100(%rcx),$xd1 - movdqa 0x120-0x100(%rcx),$xd2 - movdqa 0x130-0x100(%rcx),$xd3 - paddd .Lfour(%rip),$xd0 # next SIMD counters - -.Loop_enter4x: - movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]" - movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]" - movdqa (%r9),$xt3 # .Lrot16(%rip) - mov \$10,%eax - movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters - jmp .Loop4x - -.align 32 -.Loop4x: -___ - foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; } - foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; } -$code.=<<___; - dec %eax - jnz .Loop4x - - paddd 0x40(%rsp),$xa0 # accumulate key material - paddd 0x50(%rsp),$xa1 - paddd 0x60(%rsp),$xa2 - paddd 0x70(%rsp),$xa3 - - movdqa $xa0,$xt2 # "de-interlace" data - punpckldq $xa1,$xa0 - movdqa $xa2,$xt3 - punpckldq $xa3,$xa2 - punpckhdq $xa1,$xt2 - punpckhdq $xa3,$xt3 - movdqa $xa0,$xa1 - punpcklqdq $xa2,$xa0 # "a0" - movdqa $xt2,$xa3 - punpcklqdq $xt3,$xt2 # "a2" - punpckhqdq $xa2,$xa1 # "a1" - punpckhqdq $xt3,$xa3 # "a3" -___ - ($xa2,$xt2)=($xt2,$xa2); -$code.=<<___; - paddd 0x80-0x100(%rcx),$xb0 - paddd 0x90-0x100(%rcx),$xb1 - paddd 0xa0-0x100(%rcx),$xb2 - paddd 0xb0-0x100(%rcx),$xb3 - - movdqa $xa0,0x00(%rsp) # offload $xaN - movdqa $xa1,0x10(%rsp) - movdqa 0x20(%rsp),$xa0 # "xc2" - movdqa 0x30(%rsp),$xa1 # "xc3" - - movdqa $xb0,$xt2 - punpckldq $xb1,$xb0 - movdqa $xb2,$xt3 - punpckldq $xb3,$xb2 - punpckhdq $xb1,$xt2 - punpckhdq $xb3,$xt3 - movdqa $xb0,$xb1 - punpcklqdq $xb2,$xb0 # "b0" - movdqa $xt2,$xb3 - 
punpcklqdq $xt3,$xt2 # "b2" - punpckhqdq $xb2,$xb1 # "b1" - punpckhqdq $xt3,$xb3 # "b3" -___ - ($xb2,$xt2)=($xt2,$xb2); - my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); -$code.=<<___; - paddd 0xc0-0x100(%rcx),$xc0 - paddd 0xd0-0x100(%rcx),$xc1 - paddd 0xe0-0x100(%rcx),$xc2 - paddd 0xf0-0x100(%rcx),$xc3 - - movdqa $xa2,0x20(%rsp) # keep offloading $xaN - movdqa $xa3,0x30(%rsp) - - movdqa $xc0,$xt2 - punpckldq $xc1,$xc0 - movdqa $xc2,$xt3 - punpckldq $xc3,$xc2 - punpckhdq $xc1,$xt2 - punpckhdq $xc3,$xt3 - movdqa $xc0,$xc1 - punpcklqdq $xc2,$xc0 # "c0" - movdqa $xt2,$xc3 - punpcklqdq $xt3,$xt2 # "c2" - punpckhqdq $xc2,$xc1 # "c1" - punpckhqdq $xt3,$xc3 # "c3" -___ - ($xc2,$xt2)=($xt2,$xc2); - ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary -$code.=<<___; - paddd 0x100-0x100(%rcx),$xd0 - paddd 0x110-0x100(%rcx),$xd1 - paddd 0x120-0x100(%rcx),$xd2 - paddd 0x130-0x100(%rcx),$xd3 - - movdqa $xd0,$xt2 - punpckldq $xd1,$xd0 - movdqa $xd2,$xt3 - punpckldq $xd3,$xd2 - punpckhdq $xd1,$xt2 - punpckhdq $xd3,$xt3 - movdqa $xd0,$xd1 - punpcklqdq $xd2,$xd0 # "d0" - movdqa $xt2,$xd3 - punpcklqdq $xt3,$xt2 # "d2" - punpckhqdq $xd2,$xd1 # "d1" - punpckhqdq $xt3,$xd3 # "d3" -___ - ($xd2,$xt2)=($xt2,$xd2); -$code.=<<___; - cmp \$64*4,$len - jb .Ltail4x - - movdqu 0x00($inp),$xt0 # xor with input - movdqu 0x10($inp),$xt1 - movdqu 0x20($inp),$xt2 - movdqu 0x30($inp),$xt3 - pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? - pxor $xb0,$xt1 - pxor $xc0,$xt2 - pxor $xd0,$xt3 - - movdqu $xt0,0x00($out) - movdqu 0x40($inp),$xt0 - movdqu $xt1,0x10($out) - movdqu 0x50($inp),$xt1 - movdqu $xt2,0x20($out) - movdqu 0x60($inp),$xt2 - movdqu $xt3,0x30($out) - movdqu 0x70($inp),$xt3 - lea 0x80($inp),$inp # size optimization - pxor 0x10(%rsp),$xt0 - pxor $xb1,$xt1 - pxor $xc1,$xt2 - pxor $xd1,$xt3 - - movdqu $xt0,0x40($out) - movdqu 0x00($inp),$xt0 - movdqu $xt1,0x50($out) - movdqu 0x10($inp),$xt1 - movdqu $xt2,0x60($out) - movdqu 0x20($inp),$xt2 - movdqu $xt3,0x70($out) - lea 0x80($out),$out # size optimization - movdqu 0x30($inp),$xt3 - pxor 0x20(%rsp),$xt0 - pxor $xb2,$xt1 - pxor $xc2,$xt2 - pxor $xd2,$xt3 - - movdqu $xt0,0x00($out) - movdqu 0x40($inp),$xt0 - movdqu $xt1,0x10($out) - movdqu 0x50($inp),$xt1 - movdqu $xt2,0x20($out) - movdqu 0x60($inp),$xt2 - movdqu $xt3,0x30($out) - movdqu 0x70($inp),$xt3 - lea 0x80($inp),$inp # inp+=64*4 - pxor 0x30(%rsp),$xt0 - pxor $xb3,$xt1 - pxor $xc3,$xt2 - pxor $xd3,$xt3 - movdqu $xt0,0x40($out) - movdqu $xt1,0x50($out) - movdqu $xt2,0x60($out) - movdqu $xt3,0x70($out) - lea 0x80($out),$out # out+=64*4 - - sub \$64*4,$len - jnz .Loop_outer4x - - jmp .Ldone4x - -.Ltail4x: - cmp \$192,$len - jae .L192_or_more4x - cmp \$128,$len - jae .L128_or_more4x - cmp \$64,$len - jae .L64_or_more4x - - #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember? - xor %r9,%r9 - #movdqa $xt0,0x00(%rsp) - movdqa $xb0,0x10(%rsp) - movdqa $xc0,0x20(%rsp) - movdqa $xd0,0x30(%rsp) - jmp .Loop_tail4x - -.align 32 -.L64_or_more4x: - movdqu 0x00($inp),$xt0 # xor with input - movdqu 0x10($inp),$xt1 - movdqu 0x20($inp),$xt2 - movdqu 0x30($inp),$xt3 - pxor 0x00(%rsp),$xt0 # $xaxN is offloaded, remember? - pxor $xb0,$xt1 - pxor $xc0,$xt2 - pxor $xd0,$xt3 - movdqu $xt0,0x00($out) - movdqu $xt1,0x10($out) - movdqu $xt2,0x20($out) - movdqu $xt3,0x30($out) - je .Ldone4x - - movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember? 
- lea 0x40($inp),$inp # inp+=64*1 - xor %r9,%r9 - movdqa $xt0,0x00(%rsp) - movdqa $xb1,0x10(%rsp) - lea 0x40($out),$out # out+=64*1 - movdqa $xc1,0x20(%rsp) - sub \$64,$len # len-=64*1 - movdqa $xd1,0x30(%rsp) - jmp .Loop_tail4x - -.align 32 -.L128_or_more4x: - movdqu 0x00($inp),$xt0 # xor with input - movdqu 0x10($inp),$xt1 - movdqu 0x20($inp),$xt2 - movdqu 0x30($inp),$xt3 - pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? - pxor $xb0,$xt1 - pxor $xc0,$xt2 - pxor $xd0,$xt3 - - movdqu $xt0,0x00($out) - movdqu 0x40($inp),$xt0 - movdqu $xt1,0x10($out) - movdqu 0x50($inp),$xt1 - movdqu $xt2,0x20($out) - movdqu 0x60($inp),$xt2 - movdqu $xt3,0x30($out) - movdqu 0x70($inp),$xt3 - pxor 0x10(%rsp),$xt0 - pxor $xb1,$xt1 - pxor $xc1,$xt2 - pxor $xd1,$xt3 - movdqu $xt0,0x40($out) - movdqu $xt1,0x50($out) - movdqu $xt2,0x60($out) - movdqu $xt3,0x70($out) - je .Ldone4x - - movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember? - lea 0x80($inp),$inp # inp+=64*2 - xor %r9,%r9 - movdqa $xt0,0x00(%rsp) - movdqa $xb2,0x10(%rsp) - lea 0x80($out),$out # out+=64*2 - movdqa $xc2,0x20(%rsp) - sub \$128,$len # len-=64*2 - movdqa $xd2,0x30(%rsp) - jmp .Loop_tail4x - -.align 32 -.L192_or_more4x: - movdqu 0x00($inp),$xt0 # xor with input - movdqu 0x10($inp),$xt1 - movdqu 0x20($inp),$xt2 - movdqu 0x30($inp),$xt3 - pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? - pxor $xb0,$xt1 - pxor $xc0,$xt2 - pxor $xd0,$xt3 - - movdqu $xt0,0x00($out) - movdqu 0x40($inp),$xt0 - movdqu $xt1,0x10($out) - movdqu 0x50($inp),$xt1 - movdqu $xt2,0x20($out) - movdqu 0x60($inp),$xt2 - movdqu $xt3,0x30($out) - movdqu 0x70($inp),$xt3 - lea 0x80($inp),$inp # size optimization - pxor 0x10(%rsp),$xt0 - pxor $xb1,$xt1 - pxor $xc1,$xt2 - pxor $xd1,$xt3 - - movdqu $xt0,0x40($out) - movdqu 0x00($inp),$xt0 - movdqu $xt1,0x50($out) - movdqu 0x10($inp),$xt1 - movdqu $xt2,0x60($out) - movdqu 0x20($inp),$xt2 - movdqu $xt3,0x70($out) - lea 0x80($out),$out # size optimization - movdqu 0x30($inp),$xt3 - pxor 0x20(%rsp),$xt0 - pxor $xb2,$xt1 - pxor $xc2,$xt2 - pxor $xd2,$xt3 - movdqu $xt0,0x00($out) - movdqu $xt1,0x10($out) - movdqu $xt2,0x20($out) - movdqu $xt3,0x30($out) - je .Ldone4x - - movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember? - lea 0x40($inp),$inp # inp+=64*3 - xor %r9,%r9 - movdqa $xt0,0x00(%rsp) - movdqa $xb3,0x10(%rsp) - lea 0x40($out),$out # out+=64*3 - movdqa $xc3,0x20(%rsp) - sub \$192,$len # len-=64*3 - movdqa $xd3,0x30(%rsp) - -.Loop_tail4x: - movzb ($inp,%r9),%eax - movzb (%rsp,%r9),%ecx - lea 1(%r9),%r9 - xor %ecx,%eax - mov %al,-1($out,%r9) - dec $len - jnz .Loop_tail4x - -.Ldone4x: -___ -$code.=<<___ if ($win64); - movaps -0xb0(%r10),%xmm6 - movaps -0xa0(%r10),%xmm7 - movaps -0x90(%r10),%xmm8 - movaps -0x80(%r10),%xmm9 - movaps -0x70(%r10),%xmm10 - movaps -0x60(%r10),%xmm11 - movaps -0x50(%r10),%xmm12 - movaps -0x40(%r10),%xmm13 - movaps -0x30(%r10),%xmm14 - movaps -0x20(%r10),%xmm15 -___ -$code.=<<___; - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -.L4x_epilogue: - ret -.cfi_endproc -.size chacha20_4x,.-chacha20_4x -___ -} -if($kernel) { - $code .= "#endif\n"; -} - -######################################################################## -# XOP code path that handles all lengths. -if ($avx && !$kernel) { -# There is some "anomaly" observed depending on instructions' size or -# alignment. If you look closely at below code you'll notice that -# sometimes argument order varies. The order affects instruction -# encoding by making it larger, and such fiddling gives 5% performance -# improvement. This is on FX-4100... 
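
For context, the XOP_lane_ROUND generator defined below emits four interleaved copies of the standard ChaCha quarter-round, one per SIMD lane, with vprotd performing each 32-bit rotate in a single instruction. A minimal scalar C sketch of what one lane computes (illustrative code, not code from this tree; the rotate counts 16/12/8/7 match the vprotd operands below):

	#include <stdint.h>

	static inline uint32_t
	rotl32(uint32_t v, int n)
	{
		return ((v << n) | (v >> (32 - n)));
	}

	/* One ChaCha quarter-round over 32-bit words a, b, c, d. */
	static inline void
	chacha_quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
	{
		*a += *b; *d ^= *a; *d = rotl32(*d, 16);
		*c += *d; *b ^= *c; *b = rotl32(*b, 12);
		*a += *b; *d ^= *a; *d = rotl32(*d, 8);
		*c += *d; *b ^= *c; *b = rotl32(*b, 7);
	}

The XOP path benefits because vprotd replaces the shift/shift/or sequence that the SSSE3 and AVX2 paths need for each rotate.
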
- -my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, - $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15)); -my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, - $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3); - -sub XOP_lane_ROUND { -my ($a0,$b0,$c0,$d0)=@_; -my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); -my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); -my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); -my @x=map("\"$_\"",@xx); - - ( - "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 - "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 - "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 - "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 - "&vpxor (@x[$d0],@x[$a0],@x[$d0])", - "&vpxor (@x[$d1],@x[$a1],@x[$d1])", - "&vpxor (@x[$d2],@x[$a2],@x[$d2])", - "&vpxor (@x[$d3],@x[$a3],@x[$d3])", - "&vprotd (@x[$d0],@x[$d0],16)", - "&vprotd (@x[$d1],@x[$d1],16)", - "&vprotd (@x[$d2],@x[$d2],16)", - "&vprotd (@x[$d3],@x[$d3],16)", - - "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", - "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", - "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", - "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", - "&vpxor (@x[$b0],@x[$c0],@x[$b0])", - "&vpxor (@x[$b1],@x[$c1],@x[$b1])", - "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip - "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip - "&vprotd (@x[$b0],@x[$b0],12)", - "&vprotd (@x[$b1],@x[$b1],12)", - "&vprotd (@x[$b2],@x[$b2],12)", - "&vprotd (@x[$b3],@x[$b3],12)", - - "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip - "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip - "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", - "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", - "&vpxor (@x[$d0],@x[$a0],@x[$d0])", - "&vpxor (@x[$d1],@x[$a1],@x[$d1])", - "&vpxor (@x[$d2],@x[$a2],@x[$d2])", - "&vpxor (@x[$d3],@x[$a3],@x[$d3])", - "&vprotd (@x[$d0],@x[$d0],8)", - "&vprotd (@x[$d1],@x[$d1],8)", - "&vprotd (@x[$d2],@x[$d2],8)", - "&vprotd (@x[$d3],@x[$d3],8)", - - "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", - "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", - "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", - "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", - "&vpxor (@x[$b0],@x[$c0],@x[$b0])", - "&vpxor (@x[$b1],@x[$c1],@x[$b1])", - "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip - "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip - "&vprotd (@x[$b0],@x[$b0],7)", - "&vprotd (@x[$b1],@x[$b1],7)", - "&vprotd (@x[$b2],@x[$b2],7)", - "&vprotd (@x[$b3],@x[$b3],7)" - ); -} - -my $xframe = $win64 ? 0xa8 : 8; - -&declare_function("chacha20_xop", 32, 5); -$code.=<<___; -.cfi_startproc -.Lchacha20_4xop: - lea 8(%rsp),%r10 # frame pointer -.cfi_def_cfa_register %r10 - sub \$0x140+$xframe,%rsp - and \$-16,%rsp -___ - ################ stack layout - # +0x00 SIMD equivalent of @x[8-12] - # ... - # +0x40 constant copy of key[0-2] smashed by lanes - # ... - # +0x100 SIMD counters (with nonce smashed by lanes) - # ... - # +0x140 -$code.=<<___ if ($win64); - movaps %xmm6,-0xb0(%r10) - movaps %xmm7,-0xa0(%r10) - movaps %xmm8,-0x90(%r10) - movaps %xmm9,-0x80(%r10) - movaps %xmm10,-0x70(%r10) - movaps %xmm11,-0x60(%r10) - movaps %xmm12,-0x50(%r10) - movaps %xmm13,-0x40(%r10) - movaps %xmm14,-0x30(%r10) - movaps %xmm15,-0x20(%r10) -.L4xop_body: -___ -$code.=<<___; - vzeroupper - - vmovdqa .Lsigma(%rip),$xa3 # key[0] - vmovdqu ($key),$xb3 # key[1] - vmovdqu 16($key),$xt3 # key[2] - vmovdqu ($counter),$xd3 # key[3] - lea 0x100(%rsp),%rcx # size optimization - - vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... - vpshufd \$0x55,$xa3,$xa1 - vmovdqa $xa0,0x40(%rsp) # ... 
and offload - vpshufd \$0xaa,$xa3,$xa2 - vmovdqa $xa1,0x50(%rsp) - vpshufd \$0xff,$xa3,$xa3 - vmovdqa $xa2,0x60(%rsp) - vmovdqa $xa3,0x70(%rsp) - - vpshufd \$0x00,$xb3,$xb0 - vpshufd \$0x55,$xb3,$xb1 - vmovdqa $xb0,0x80-0x100(%rcx) - vpshufd \$0xaa,$xb3,$xb2 - vmovdqa $xb1,0x90-0x100(%rcx) - vpshufd \$0xff,$xb3,$xb3 - vmovdqa $xb2,0xa0-0x100(%rcx) - vmovdqa $xb3,0xb0-0x100(%rcx) - - vpshufd \$0x00,$xt3,$xt0 # "$xc0" - vpshufd \$0x55,$xt3,$xt1 # "$xc1" - vmovdqa $xt0,0xc0-0x100(%rcx) - vpshufd \$0xaa,$xt3,$xt2 # "$xc2" - vmovdqa $xt1,0xd0-0x100(%rcx) - vpshufd \$0xff,$xt3,$xt3 # "$xc3" - vmovdqa $xt2,0xe0-0x100(%rcx) - vmovdqa $xt3,0xf0-0x100(%rcx) - - vpshufd \$0x00,$xd3,$xd0 - vpshufd \$0x55,$xd3,$xd1 - vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet - vpshufd \$0xaa,$xd3,$xd2 - vmovdqa $xd1,0x110-0x100(%rcx) - vpshufd \$0xff,$xd3,$xd3 - vmovdqa $xd2,0x120-0x100(%rcx) - vmovdqa $xd3,0x130-0x100(%rcx) - - jmp .Loop_enter4xop - -.align 32 -.Loop_outer4xop: - vmovdqa 0x40(%rsp),$xa0 # re-load smashed key - vmovdqa 0x50(%rsp),$xa1 - vmovdqa 0x60(%rsp),$xa2 - vmovdqa 0x70(%rsp),$xa3 - vmovdqa 0x80-0x100(%rcx),$xb0 - vmovdqa 0x90-0x100(%rcx),$xb1 - vmovdqa 0xa0-0x100(%rcx),$xb2 - vmovdqa 0xb0-0x100(%rcx),$xb3 - vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" - vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" - vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" - vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" - vmovdqa 0x100-0x100(%rcx),$xd0 - vmovdqa 0x110-0x100(%rcx),$xd1 - vmovdqa 0x120-0x100(%rcx),$xd2 - vmovdqa 0x130-0x100(%rcx),$xd3 - vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters - -.Loop_enter4xop: - mov \$10,%eax - vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters - jmp .Loop4xop - -.align 32 -.Loop4xop: -___ - foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; } - foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; } -$code.=<<___; - dec %eax - jnz .Loop4xop - - vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material - vpaddd 0x50(%rsp),$xa1,$xa1 - vpaddd 0x60(%rsp),$xa2,$xa2 - vpaddd 0x70(%rsp),$xa3,$xa3 - - vmovdqa $xt2,0x20(%rsp) # offload $xc2,3 - vmovdqa $xt3,0x30(%rsp) - - vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data - vpunpckldq $xa3,$xa2,$xt3 - vpunpckhdq $xa1,$xa0,$xa0 - vpunpckhdq $xa3,$xa2,$xa2 - vpunpcklqdq $xt3,$xt2,$xa1 # "a0" - vpunpckhqdq $xt3,$xt2,$xt2 # "a1" - vpunpcklqdq $xa2,$xa0,$xa3 # "a2" - vpunpckhqdq $xa2,$xa0,$xa0 # "a3" -___ - ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); -$code.=<<___; - vpaddd 0x80-0x100(%rcx),$xb0,$xb0 - vpaddd 0x90-0x100(%rcx),$xb1,$xb1 - vpaddd 0xa0-0x100(%rcx),$xb2,$xb2 - vpaddd 0xb0-0x100(%rcx),$xb3,$xb3 - - vmovdqa $xa0,0x00(%rsp) # offload $xa0,1 - vmovdqa $xa1,0x10(%rsp) - vmovdqa 0x20(%rsp),$xa0 # "xc2" - vmovdqa 0x30(%rsp),$xa1 # "xc3" - - vpunpckldq $xb1,$xb0,$xt2 - vpunpckldq $xb3,$xb2,$xt3 - vpunpckhdq $xb1,$xb0,$xb0 - vpunpckhdq $xb3,$xb2,$xb2 - vpunpcklqdq $xt3,$xt2,$xb1 # "b0" - vpunpckhqdq $xt3,$xt2,$xt2 # "b1" - vpunpcklqdq $xb2,$xb0,$xb3 # "b2" - vpunpckhqdq $xb2,$xb0,$xb0 # "b3" -___ - ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); - my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); -$code.=<<___; - vpaddd 0xc0-0x100(%rcx),$xc0,$xc0 - vpaddd 0xd0-0x100(%rcx),$xc1,$xc1 - vpaddd 0xe0-0x100(%rcx),$xc2,$xc2 - vpaddd 0xf0-0x100(%rcx),$xc3,$xc3 - - vpunpckldq $xc1,$xc0,$xt2 - vpunpckldq $xc3,$xc2,$xt3 - vpunpckhdq $xc1,$xc0,$xc0 - vpunpckhdq $xc3,$xc2,$xc2 - vpunpcklqdq $xt3,$xt2,$xc1 # "c0" - vpunpckhqdq $xt3,$xt2,$xt2 # "c1" - vpunpcklqdq $xc2,$xc0,$xc3 # "c2" - vpunpckhqdq $xc2,$xc0,$xc0 # "c3" -___ - 
($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); -$code.=<<___; - vpaddd 0x100-0x100(%rcx),$xd0,$xd0 - vpaddd 0x110-0x100(%rcx),$xd1,$xd1 - vpaddd 0x120-0x100(%rcx),$xd2,$xd2 - vpaddd 0x130-0x100(%rcx),$xd3,$xd3 - - vpunpckldq $xd1,$xd0,$xt2 - vpunpckldq $xd3,$xd2,$xt3 - vpunpckhdq $xd1,$xd0,$xd0 - vpunpckhdq $xd3,$xd2,$xd2 - vpunpcklqdq $xt3,$xt2,$xd1 # "d0" - vpunpckhqdq $xt3,$xt2,$xt2 # "d1" - vpunpcklqdq $xd2,$xd0,$xd3 # "d2" - vpunpckhqdq $xd2,$xd0,$xd0 # "d3" -___ - ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); - ($xa0,$xa1)=($xt2,$xt3); -$code.=<<___; - vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1 - vmovdqa 0x10(%rsp),$xa1 - - cmp \$64*4,$len - jb .Ltail4xop - - vpxor 0x00($inp),$xa0,$xa0 # xor with input - vpxor 0x10($inp),$xb0,$xb0 - vpxor 0x20($inp),$xc0,$xc0 - vpxor 0x30($inp),$xd0,$xd0 - vpxor 0x40($inp),$xa1,$xa1 - vpxor 0x50($inp),$xb1,$xb1 - vpxor 0x60($inp),$xc1,$xc1 - vpxor 0x70($inp),$xd1,$xd1 - lea 0x80($inp),$inp # size optimization - vpxor 0x00($inp),$xa2,$xa2 - vpxor 0x10($inp),$xb2,$xb2 - vpxor 0x20($inp),$xc2,$xc2 - vpxor 0x30($inp),$xd2,$xd2 - vpxor 0x40($inp),$xa3,$xa3 - vpxor 0x50($inp),$xb3,$xb3 - vpxor 0x60($inp),$xc3,$xc3 - vpxor 0x70($inp),$xd3,$xd3 - lea 0x80($inp),$inp # inp+=64*4 - - vmovdqu $xa0,0x00($out) - vmovdqu $xb0,0x10($out) - vmovdqu $xc0,0x20($out) - vmovdqu $xd0,0x30($out) - vmovdqu $xa1,0x40($out) - vmovdqu $xb1,0x50($out) - vmovdqu $xc1,0x60($out) - vmovdqu $xd1,0x70($out) - lea 0x80($out),$out # size optimization - vmovdqu $xa2,0x00($out) - vmovdqu $xb2,0x10($out) - vmovdqu $xc2,0x20($out) - vmovdqu $xd2,0x30($out) - vmovdqu $xa3,0x40($out) - vmovdqu $xb3,0x50($out) - vmovdqu $xc3,0x60($out) - vmovdqu $xd3,0x70($out) - lea 0x80($out),$out # out+=64*4 - - sub \$64*4,$len - jnz .Loop_outer4xop - - jmp .Ldone4xop - -.align 32 -.Ltail4xop: - cmp \$192,$len - jae .L192_or_more4xop - cmp \$128,$len - jae .L128_or_more4xop - cmp \$64,$len - jae .L64_or_more4xop - - xor %r9,%r9 - vmovdqa $xa0,0x00(%rsp) - vmovdqa $xb0,0x10(%rsp) - vmovdqa $xc0,0x20(%rsp) - vmovdqa $xd0,0x30(%rsp) - jmp .Loop_tail4xop - -.align 32 -.L64_or_more4xop: - vpxor 0x00($inp),$xa0,$xa0 # xor with input - vpxor 0x10($inp),$xb0,$xb0 - vpxor 0x20($inp),$xc0,$xc0 - vpxor 0x30($inp),$xd0,$xd0 - vmovdqu $xa0,0x00($out) - vmovdqu $xb0,0x10($out) - vmovdqu $xc0,0x20($out) - vmovdqu $xd0,0x30($out) - je .Ldone4xop - - lea 0x40($inp),$inp # inp+=64*1 - vmovdqa $xa1,0x00(%rsp) - xor %r9,%r9 - vmovdqa $xb1,0x10(%rsp) - lea 0x40($out),$out # out+=64*1 - vmovdqa $xc1,0x20(%rsp) - sub \$64,$len # len-=64*1 - vmovdqa $xd1,0x30(%rsp) - jmp .Loop_tail4xop - -.align 32 -.L128_or_more4xop: - vpxor 0x00($inp),$xa0,$xa0 # xor with input - vpxor 0x10($inp),$xb0,$xb0 - vpxor 0x20($inp),$xc0,$xc0 - vpxor 0x30($inp),$xd0,$xd0 - vpxor 0x40($inp),$xa1,$xa1 - vpxor 0x50($inp),$xb1,$xb1 - vpxor 0x60($inp),$xc1,$xc1 - vpxor 0x70($inp),$xd1,$xd1 - - vmovdqu $xa0,0x00($out) - vmovdqu $xb0,0x10($out) - vmovdqu $xc0,0x20($out) - vmovdqu $xd0,0x30($out) - vmovdqu $xa1,0x40($out) - vmovdqu $xb1,0x50($out) - vmovdqu $xc1,0x60($out) - vmovdqu $xd1,0x70($out) - je .Ldone4xop - - lea 0x80($inp),$inp # inp+=64*2 - vmovdqa $xa2,0x00(%rsp) - xor %r9,%r9 - vmovdqa $xb2,0x10(%rsp) - lea 0x80($out),$out # out+=64*2 - vmovdqa $xc2,0x20(%rsp) - sub \$128,$len # len-=64*2 - vmovdqa $xd2,0x30(%rsp) - jmp .Loop_tail4xop - -.align 32 -.L192_or_more4xop: - vpxor 0x00($inp),$xa0,$xa0 # xor with input - vpxor 0x10($inp),$xb0,$xb0 - vpxor 0x20($inp),$xc0,$xc0 - vpxor 0x30($inp),$xd0,$xd0 - vpxor 0x40($inp),$xa1,$xa1 - 
vpxor 0x50($inp),$xb1,$xb1 - vpxor 0x60($inp),$xc1,$xc1 - vpxor 0x70($inp),$xd1,$xd1 - lea 0x80($inp),$inp # size optimization - vpxor 0x00($inp),$xa2,$xa2 - vpxor 0x10($inp),$xb2,$xb2 - vpxor 0x20($inp),$xc2,$xc2 - vpxor 0x30($inp),$xd2,$xd2 - - vmovdqu $xa0,0x00($out) - vmovdqu $xb0,0x10($out) - vmovdqu $xc0,0x20($out) - vmovdqu $xd0,0x30($out) - vmovdqu $xa1,0x40($out) - vmovdqu $xb1,0x50($out) - vmovdqu $xc1,0x60($out) - vmovdqu $xd1,0x70($out) - lea 0x80($out),$out # size optimization - vmovdqu $xa2,0x00($out) - vmovdqu $xb2,0x10($out) - vmovdqu $xc2,0x20($out) - vmovdqu $xd2,0x30($out) - je .Ldone4xop - - lea 0x40($inp),$inp # inp+=64*3 - vmovdqa $xa3,0x00(%rsp) - xor %r9,%r9 - vmovdqa $xb3,0x10(%rsp) - lea 0x40($out),$out # out+=64*3 - vmovdqa $xc3,0x20(%rsp) - sub \$192,$len # len-=64*3 - vmovdqa $xd3,0x30(%rsp) - -.Loop_tail4xop: - movzb ($inp,%r9),%eax - movzb (%rsp,%r9),%ecx - lea 1(%r9),%r9 - xor %ecx,%eax - mov %al,-1($out,%r9) - dec $len - jnz .Loop_tail4xop - -.Ldone4xop: - vzeroupper -___ -$code.=<<___ if ($win64); - movaps -0xb0(%r10),%xmm6 - movaps -0xa0(%r10),%xmm7 - movaps -0x90(%r10),%xmm8 - movaps -0x80(%r10),%xmm9 - movaps -0x70(%r10),%xmm10 - movaps -0x60(%r10),%xmm11 - movaps -0x50(%r10),%xmm12 - movaps -0x40(%r10),%xmm13 - movaps -0x30(%r10),%xmm14 - movaps -0x20(%r10),%xmm15 -___ -$code.=<<___; - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -.L4xop_epilogue: - ret -.cfi_endproc -___ -&end_function("chacha20_xop"); -} - -######################################################################## -# AVX2 code path -if ($avx>1) { - -if($kernel) { - $code .= "#ifdef CONFIG_AS_AVX2\n"; -} - -my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, - $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15)); -my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, - "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); - -sub AVX2_lane_ROUND { -my ($a0,$b0,$c0,$d0)=@_; -my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); -my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); -my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); -my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); -my @x=map("\"$_\"",@xx); - - # Consider order in which variables are addressed by their - # index: - # - # a b c d - # - # 0 4 8 12 < even round - # 1 5 9 13 - # 2 6 10 14 - # 3 7 11 15 - # 0 5 10 15 < odd round - # 1 6 11 12 - # 2 7 8 13 - # 3 4 9 14 - # - # 'a', 'b' and 'd's are permanently allocated in registers, - # @x[0..7,12..15], while 'c's are maintained in memory. If - # you observe 'c' column, you'll notice that pair of 'c's is - # invariant between rounds. This means that we have to reload - # them once per round, in the middle. This is why you'll see - # bunch of 'c' stores and loads in the middle, but none in - # the beginning or end. 
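
The index table in the comment above is the usual ChaCha double round: a column round followed by a diagonal round over the 4x4 word state. A scalar sketch using the chacha_quarter_round() helper from the earlier example (illustrative only; the AVX2 code below runs each quarter-round across eight blocks at once and keeps the 'c' rows spilled to the stack, as the comment describes):

	/* One double round: columns (even round), then diagonals (odd round). */
	static void
	chacha_double_round(uint32_t x[16])
	{
		chacha_quarter_round(&x[0], &x[4], &x[8],  &x[12]);
		chacha_quarter_round(&x[1], &x[5], &x[9],  &x[13]);
		chacha_quarter_round(&x[2], &x[6], &x[10], &x[14]);
		chacha_quarter_round(&x[3], &x[7], &x[11], &x[15]);
		chacha_quarter_round(&x[0], &x[5], &x[10], &x[15]);
		chacha_quarter_round(&x[1], &x[6], &x[11], &x[12]);
		chacha_quarter_round(&x[2], &x[7], &x[8],  &x[13]);
		chacha_quarter_round(&x[3], &x[4], &x[9],  &x[14]);
	}

This is why the two foreach/eval lines below invoke the lane round once with (0, 4, 8, 12) and once with (0, 5, 10, 15): the remaining three quarter-rounds of each half are derived from those indices inside the generator.
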
- - ( - "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 - "&vpxor (@x[$d0],@x[$a0],@x[$d0])", - "&vpshufb (@x[$d0],@x[$d0],$t1)", - "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 - "&vpxor (@x[$d1],@x[$a1],@x[$d1])", - "&vpshufb (@x[$d1],@x[$d1],$t1)", - - "&vpaddd ($xc,$xc,@x[$d0])", - "&vpxor (@x[$b0],$xc,@x[$b0])", - "&vpslld ($t0,@x[$b0],12)", - "&vpsrld (@x[$b0],@x[$b0],20)", - "&vpor (@x[$b0],$t0,@x[$b0])", - "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) - "&vpaddd ($xc_,$xc_,@x[$d1])", - "&vpxor (@x[$b1],$xc_,@x[$b1])", - "&vpslld ($t1,@x[$b1],12)", - "&vpsrld (@x[$b1],@x[$b1],20)", - "&vpor (@x[$b1],$t1,@x[$b1])", - - "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", - "&vpxor (@x[$d0],@x[$a0],@x[$d0])", - "&vpshufb (@x[$d0],@x[$d0],$t0)", - "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", - "&vpxor (@x[$d1],@x[$a1],@x[$d1])", - "&vpshufb (@x[$d1],@x[$d1],$t0)", - - "&vpaddd ($xc,$xc,@x[$d0])", - "&vpxor (@x[$b0],$xc,@x[$b0])", - "&vpslld ($t1,@x[$b0],7)", - "&vpsrld (@x[$b0],@x[$b0],25)", - "&vpor (@x[$b0],$t1,@x[$b0])", - "&vbroadcasti128($t1,'(%r9)')", # .Lrot16(%rip) - "&vpaddd ($xc_,$xc_,@x[$d1])", - "&vpxor (@x[$b1],$xc_,@x[$b1])", - "&vpslld ($t0,@x[$b1],7)", - "&vpsrld (@x[$b1],@x[$b1],25)", - "&vpor (@x[$b1],$t0,@x[$b1])", - - "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's - "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)", - "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")", - "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")", - - "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 - "&vpxor (@x[$d2],@x[$a2],@x[$d2])", - "&vpshufb (@x[$d2],@x[$d2],$t1)", - "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 - "&vpxor (@x[$d3],@x[$a3],@x[$d3])", - "&vpshufb (@x[$d3],@x[$d3],$t1)", - - "&vpaddd ($xc,$xc,@x[$d2])", - "&vpxor (@x[$b2],$xc,@x[$b2])", - "&vpslld ($t0,@x[$b2],12)", - "&vpsrld (@x[$b2],@x[$b2],20)", - "&vpor (@x[$b2],$t0,@x[$b2])", - "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) - "&vpaddd ($xc_,$xc_,@x[$d3])", - "&vpxor (@x[$b3],$xc_,@x[$b3])", - "&vpslld ($t1,@x[$b3],12)", - "&vpsrld (@x[$b3],@x[$b3],20)", - "&vpor (@x[$b3],$t1,@x[$b3])", - - "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", - "&vpxor (@x[$d2],@x[$a2],@x[$d2])", - "&vpshufb (@x[$d2],@x[$d2],$t0)", - "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", - "&vpxor (@x[$d3],@x[$a3],@x[$d3])", - "&vpshufb (@x[$d3],@x[$d3],$t0)", - - "&vpaddd ($xc,$xc,@x[$d2])", - "&vpxor (@x[$b2],$xc,@x[$b2])", - "&vpslld ($t1,@x[$b2],7)", - "&vpsrld (@x[$b2],@x[$b2],25)", - "&vpor (@x[$b2],$t1,@x[$b2])", - "&vbroadcasti128($t1,'(%r9)')", # .Lrot16(%rip) - "&vpaddd ($xc_,$xc_,@x[$d3])", - "&vpxor (@x[$b3],$xc_,@x[$b3])", - "&vpslld ($t0,@x[$b3],7)", - "&vpsrld (@x[$b3],@x[$b3],25)", - "&vpor (@x[$b3],$t0,@x[$b3])" - ); -} - -my $xframe = $win64 ? 0xa8 : 8; - -&declare_function("chacha20_avx2", 32, 5); -$code.=<<___; -.cfi_startproc -.Lchacha20_8x: - lea 8(%rsp),%r10 # frame register -.cfi_def_cfa_register %r10 - sub \$0x280+$xframe,%rsp - and \$-32,%rsp -___ -$code.=<<___ if ($win64); - movaps %xmm6,-0xb0(%r10) - movaps %xmm7,-0xa0(%r10) - movaps %xmm8,-0x90(%r10) - movaps %xmm9,-0x80(%r10) - movaps %xmm10,-0x70(%r10) - movaps %xmm11,-0x60(%r10) - movaps %xmm12,-0x50(%r10) - movaps %xmm13,-0x40(%r10) - movaps %xmm14,-0x30(%r10) - movaps %xmm15,-0x20(%r10) -.L8x_body: -___ -$code.=<<___; - vzeroupper - - ################ stack layout - # +0x00 SIMD equivalent of @x[8-12] - # ... - # +0x80 constant copy of key[0-2] smashed by lanes - # ... - # +0x200 SIMD counters (with nonce smashed by lanes) - # ... 
- # +0x280 - - vbroadcasti128 .Lsigma(%rip),$xa3 # key[0] - vbroadcasti128 ($key),$xb3 # key[1] - vbroadcasti128 16($key),$xt3 # key[2] - vbroadcasti128 ($counter),$xd3 # key[3] - lea 0x100(%rsp),%rcx # size optimization - lea 0x200(%rsp),%rax # size optimization - lea .Lrot16(%rip),%r9 - lea .Lrot24(%rip),%r11 - - vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... - vpshufd \$0x55,$xa3,$xa1 - vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload - vpshufd \$0xaa,$xa3,$xa2 - vmovdqa $xa1,0xa0-0x100(%rcx) - vpshufd \$0xff,$xa3,$xa3 - vmovdqa $xa2,0xc0-0x100(%rcx) - vmovdqa $xa3,0xe0-0x100(%rcx) - - vpshufd \$0x00,$xb3,$xb0 - vpshufd \$0x55,$xb3,$xb1 - vmovdqa $xb0,0x100-0x100(%rcx) - vpshufd \$0xaa,$xb3,$xb2 - vmovdqa $xb1,0x120-0x100(%rcx) - vpshufd \$0xff,$xb3,$xb3 - vmovdqa $xb2,0x140-0x100(%rcx) - vmovdqa $xb3,0x160-0x100(%rcx) - - vpshufd \$0x00,$xt3,$xt0 # "xc0" - vpshufd \$0x55,$xt3,$xt1 # "xc1" - vmovdqa $xt0,0x180-0x200(%rax) - vpshufd \$0xaa,$xt3,$xt2 # "xc2" - vmovdqa $xt1,0x1a0-0x200(%rax) - vpshufd \$0xff,$xt3,$xt3 # "xc3" - vmovdqa $xt2,0x1c0-0x200(%rax) - vmovdqa $xt3,0x1e0-0x200(%rax) - - vpshufd \$0x00,$xd3,$xd0 - vpshufd \$0x55,$xd3,$xd1 - vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet - vpshufd \$0xaa,$xd3,$xd2 - vmovdqa $xd1,0x220-0x200(%rax) - vpshufd \$0xff,$xd3,$xd3 - vmovdqa $xd2,0x240-0x200(%rax) - vmovdqa $xd3,0x260-0x200(%rax) - - jmp .Loop_enter8x - -.align 32 -.Loop_outer8x: - vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key - vmovdqa 0xa0-0x100(%rcx),$xa1 - vmovdqa 0xc0-0x100(%rcx),$xa2 - vmovdqa 0xe0-0x100(%rcx),$xa3 - vmovdqa 0x100-0x100(%rcx),$xb0 - vmovdqa 0x120-0x100(%rcx),$xb1 - vmovdqa 0x140-0x100(%rcx),$xb2 - vmovdqa 0x160-0x100(%rcx),$xb3 - vmovdqa 0x180-0x200(%rax),$xt0 # "xc0" - vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1" - vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2" - vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3" - vmovdqa 0x200-0x200(%rax),$xd0 - vmovdqa 0x220-0x200(%rax),$xd1 - vmovdqa 0x240-0x200(%rax),$xd2 - vmovdqa 0x260-0x200(%rax),$xd3 - vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters - -.Loop_enter8x: - vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]" - vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]" - vbroadcasti128 (%r9),$xt3 - vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters - mov \$10,%eax - jmp .Loop8x - -.align 32 -.Loop8x: -___ - foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; } - foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; } -$code.=<<___; - dec %eax - jnz .Loop8x - - lea 0x200(%rsp),%rax # size optimization - vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key - vpaddd 0xa0-0x100(%rcx),$xa1,$xa1 - vpaddd 0xc0-0x100(%rcx),$xa2,$xa2 - vpaddd 0xe0-0x100(%rcx),$xa3,$xa3 - - vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data - vpunpckldq $xa3,$xa2,$xt3 - vpunpckhdq $xa1,$xa0,$xa0 - vpunpckhdq $xa3,$xa2,$xa2 - vpunpcklqdq $xt3,$xt2,$xa1 # "a0" - vpunpckhqdq $xt3,$xt2,$xt2 # "a1" - vpunpcklqdq $xa2,$xa0,$xa3 # "a2" - vpunpckhqdq $xa2,$xa0,$xa0 # "a3" -___ - ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); -$code.=<<___; - vpaddd 0x100-0x100(%rcx),$xb0,$xb0 - vpaddd 0x120-0x100(%rcx),$xb1,$xb1 - vpaddd 0x140-0x100(%rcx),$xb2,$xb2 - vpaddd 0x160-0x100(%rcx),$xb3,$xb3 - - vpunpckldq $xb1,$xb0,$xt2 - vpunpckldq $xb3,$xb2,$xt3 - vpunpckhdq $xb1,$xb0,$xb0 - vpunpckhdq $xb3,$xb2,$xb2 - vpunpcklqdq $xt3,$xt2,$xb1 # "b0" - vpunpckhqdq $xt3,$xt2,$xt2 # "b1" - vpunpcklqdq $xb2,$xb0,$xb3 # "b2" - vpunpckhqdq $xb2,$xb0,$xb0 # "b3" -___ - ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); -$code.=<<___; - vperm2i128 \$0x20,$xb0,$xa0,$xt3 # 
"de-interlace" further - vperm2i128 \$0x31,$xb0,$xa0,$xb0 - vperm2i128 \$0x20,$xb1,$xa1,$xa0 - vperm2i128 \$0x31,$xb1,$xa1,$xb1 - vperm2i128 \$0x20,$xb2,$xa2,$xa1 - vperm2i128 \$0x31,$xb2,$xa2,$xb2 - vperm2i128 \$0x20,$xb3,$xa3,$xa2 - vperm2i128 \$0x31,$xb3,$xa3,$xb3 -___ - ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); - my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); -$code.=<<___; - vmovdqa $xa0,0x00(%rsp) # offload $xaN - vmovdqa $xa1,0x20(%rsp) - vmovdqa 0x40(%rsp),$xc2 # $xa0 - vmovdqa 0x60(%rsp),$xc3 # $xa1 - - vpaddd 0x180-0x200(%rax),$xc0,$xc0 - vpaddd 0x1a0-0x200(%rax),$xc1,$xc1 - vpaddd 0x1c0-0x200(%rax),$xc2,$xc2 - vpaddd 0x1e0-0x200(%rax),$xc3,$xc3 - - vpunpckldq $xc1,$xc0,$xt2 - vpunpckldq $xc3,$xc2,$xt3 - vpunpckhdq $xc1,$xc0,$xc0 - vpunpckhdq $xc3,$xc2,$xc2 - vpunpcklqdq $xt3,$xt2,$xc1 # "c0" - vpunpckhqdq $xt3,$xt2,$xt2 # "c1" - vpunpcklqdq $xc2,$xc0,$xc3 # "c2" - vpunpckhqdq $xc2,$xc0,$xc0 # "c3" -___ - ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); -$code.=<<___; - vpaddd 0x200-0x200(%rax),$xd0,$xd0 - vpaddd 0x220-0x200(%rax),$xd1,$xd1 - vpaddd 0x240-0x200(%rax),$xd2,$xd2 - vpaddd 0x260-0x200(%rax),$xd3,$xd3 - - vpunpckldq $xd1,$xd0,$xt2 - vpunpckldq $xd3,$xd2,$xt3 - vpunpckhdq $xd1,$xd0,$xd0 - vpunpckhdq $xd3,$xd2,$xd2 - vpunpcklqdq $xt3,$xt2,$xd1 # "d0" - vpunpckhqdq $xt3,$xt2,$xt2 # "d1" - vpunpcklqdq $xd2,$xd0,$xd3 # "d2" - vpunpckhqdq $xd2,$xd0,$xd0 # "d3" -___ - ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); -$code.=<<___; - vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further - vperm2i128 \$0x31,$xd0,$xc0,$xd0 - vperm2i128 \$0x20,$xd1,$xc1,$xc0 - vperm2i128 \$0x31,$xd1,$xc1,$xd1 - vperm2i128 \$0x20,$xd2,$xc2,$xc1 - vperm2i128 \$0x31,$xd2,$xc2,$xd2 - vperm2i128 \$0x20,$xd3,$xc3,$xc2 - vperm2i128 \$0x31,$xd3,$xc3,$xd3 -___ - ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); - ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= - ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); - ($xa0,$xa1)=($xt2,$xt3); -$code.=<<___; - vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember? 
- vmovdqa 0x20(%rsp),$xa1 - - cmp \$64*8,$len - jb .Ltail8x - - vpxor 0x00($inp),$xa0,$xa0 # xor with input - vpxor 0x20($inp),$xb0,$xb0 - vpxor 0x40($inp),$xc0,$xc0 - vpxor 0x60($inp),$xd0,$xd0 - lea 0x80($inp),$inp # size optimization - vmovdqu $xa0,0x00($out) - vmovdqu $xb0,0x20($out) - vmovdqu $xc0,0x40($out) - vmovdqu $xd0,0x60($out) - lea 0x80($out),$out # size optimization - - vpxor 0x00($inp),$xa1,$xa1 - vpxor 0x20($inp),$xb1,$xb1 - vpxor 0x40($inp),$xc1,$xc1 - vpxor 0x60($inp),$xd1,$xd1 - lea 0x80($inp),$inp # size optimization - vmovdqu $xa1,0x00($out) - vmovdqu $xb1,0x20($out) - vmovdqu $xc1,0x40($out) - vmovdqu $xd1,0x60($out) - lea 0x80($out),$out # size optimization - - vpxor 0x00($inp),$xa2,$xa2 - vpxor 0x20($inp),$xb2,$xb2 - vpxor 0x40($inp),$xc2,$xc2 - vpxor 0x60($inp),$xd2,$xd2 - lea 0x80($inp),$inp # size optimization - vmovdqu $xa2,0x00($out) - vmovdqu $xb2,0x20($out) - vmovdqu $xc2,0x40($out) - vmovdqu $xd2,0x60($out) - lea 0x80($out),$out # size optimization - - vpxor 0x00($inp),$xa3,$xa3 - vpxor 0x20($inp),$xb3,$xb3 - vpxor 0x40($inp),$xc3,$xc3 - vpxor 0x60($inp),$xd3,$xd3 - lea 0x80($inp),$inp # size optimization - vmovdqu $xa3,0x00($out) - vmovdqu $xb3,0x20($out) - vmovdqu $xc3,0x40($out) - vmovdqu $xd3,0x60($out) - lea 0x80($out),$out # size optimization - - sub \$64*8,$len - jnz .Loop_outer8x - - jmp .Ldone8x - -.Ltail8x: - cmp \$448,$len - jae .L448_or_more8x - cmp \$384,$len - jae .L384_or_more8x - cmp \$320,$len - jae .L320_or_more8x - cmp \$256,$len - jae .L256_or_more8x - cmp \$192,$len - jae .L192_or_more8x - cmp \$128,$len - jae .L128_or_more8x - cmp \$64,$len - jae .L64_or_more8x - - xor %r9,%r9 - vmovdqa $xa0,0x00(%rsp) - vmovdqa $xb0,0x20(%rsp) - jmp .Loop_tail8x - -.align 32 -.L64_or_more8x: - vpxor 0x00($inp),$xa0,$xa0 # xor with input - vpxor 0x20($inp),$xb0,$xb0 - vmovdqu $xa0,0x00($out) - vmovdqu $xb0,0x20($out) - je .Ldone8x - - lea 0x40($inp),$inp # inp+=64*1 - xor %r9,%r9 - vmovdqa $xc0,0x00(%rsp) - lea 0x40($out),$out # out+=64*1 - sub \$64,$len # len-=64*1 - vmovdqa $xd0,0x20(%rsp) - jmp .Loop_tail8x - -.align 32 -.L128_or_more8x: - vpxor 0x00($inp),$xa0,$xa0 # xor with input - vpxor 0x20($inp),$xb0,$xb0 - vpxor 0x40($inp),$xc0,$xc0 - vpxor 0x60($inp),$xd0,$xd0 - vmovdqu $xa0,0x00($out) - vmovdqu $xb0,0x20($out) - vmovdqu $xc0,0x40($out) - vmovdqu $xd0,0x60($out) - je .Ldone8x - - lea 0x80($inp),$inp # inp+=64*2 - xor %r9,%r9 - vmovdqa $xa1,0x00(%rsp) - lea 0x80($out),$out # out+=64*2 - sub \$128,$len # len-=64*2 - vmovdqa $xb1,0x20(%rsp) - jmp .Loop_tail8x - -.align 32 -.L192_or_more8x: - vpxor 0x00($inp),$xa0,$xa0 # xor with input - vpxor 0x20($inp),$xb0,$xb0 - vpxor 0x40($inp),$xc0,$xc0 - vpxor 0x60($inp),$xd0,$xd0 - vpxor 0x80($inp),$xa1,$xa1 - vpxor 0xa0($inp),$xb1,$xb1 - vmovdqu $xa0,0x00($out) - vmovdqu $xb0,0x20($out) - vmovdqu $xc0,0x40($out) - vmovdqu $xd0,0x60($out) - vmovdqu $xa1,0x80($out) - vmovdqu $xb1,0xa0($out) - je .Ldone8x - - lea 0xc0($inp),$inp # inp+=64*3 - xor %r9,%r9 - vmovdqa $xc1,0x00(%rsp) - lea 0xc0($out),$out # out+=64*3 - sub \$192,$len # len-=64*3 - vmovdqa $xd1,0x20(%rsp) - jmp .Loop_tail8x - -.align 32 -.L256_or_more8x: - vpxor 0x00($inp),$xa0,$xa0 # xor with input - vpxor 0x20($inp),$xb0,$xb0 - vpxor 0x40($inp),$xc0,$xc0 - vpxor 0x60($inp),$xd0,$xd0 - vpxor 0x80($inp),$xa1,$xa1 - vpxor 0xa0($inp),$xb1,$xb1 - vpxor 0xc0($inp),$xc1,$xc1 - vpxor 0xe0($inp),$xd1,$xd1 - vmovdqu $xa0,0x00($out) - vmovdqu $xb0,0x20($out) - vmovdqu $xc0,0x40($out) - vmovdqu $xd0,0x60($out) - vmovdqu $xa1,0x80($out) - vmovdqu 
$xb1,0xa0($out) - vmovdqu $xc1,0xc0($out) - vmovdqu $xd1,0xe0($out) - je .Ldone8x - - lea 0x100($inp),$inp # inp+=64*4 - xor %r9,%r9 - vmovdqa $xa2,0x00(%rsp) - lea 0x100($out),$out # out+=64*4 - sub \$256,$len # len-=64*4 - vmovdqa $xb2,0x20(%rsp) - jmp .Loop_tail8x - -.align 32 -.L320_or_more8x: - vpxor 0x00($inp),$xa0,$xa0 # xor with input - vpxor 0x20($inp),$xb0,$xb0 - vpxor 0x40($inp),$xc0,$xc0 - vpxor 0x60($inp),$xd0,$xd0 - vpxor 0x80($inp),$xa1,$xa1 - vpxor 0xa0($inp),$xb1,$xb1 - vpxor 0xc0($inp),$xc1,$xc1 - vpxor 0xe0($inp),$xd1,$xd1 - vpxor 0x100($inp),$xa2,$xa2 - vpxor 0x120($inp),$xb2,$xb2 - vmovdqu $xa0,0x00($out) - vmovdqu $xb0,0x20($out) - vmovdqu $xc0,0x40($out) - vmovdqu $xd0,0x60($out) - vmovdqu $xa1,0x80($out) - vmovdqu $xb1,0xa0($out) - vmovdqu $xc1,0xc0($out) - vmovdqu $xd1,0xe0($out) - vmovdqu $xa2,0x100($out) - vmovdqu $xb2,0x120($out) - je .Ldone8x - - lea 0x140($inp),$inp # inp+=64*5 - xor %r9,%r9 - vmovdqa $xc2,0x00(%rsp) - lea 0x140($out),$out # out+=64*5 - sub \$320,$len # len-=64*5 - vmovdqa $xd2,0x20(%rsp) - jmp .Loop_tail8x - -.align 32 -.L384_or_more8x: - vpxor 0x00($inp),$xa0,$xa0 # xor with input - vpxor 0x20($inp),$xb0,$xb0 - vpxor 0x40($inp),$xc0,$xc0 - vpxor 0x60($inp),$xd0,$xd0 - vpxor 0x80($inp),$xa1,$xa1 - vpxor 0xa0($inp),$xb1,$xb1 - vpxor 0xc0($inp),$xc1,$xc1 - vpxor 0xe0($inp),$xd1,$xd1 - vpxor 0x100($inp),$xa2,$xa2 - vpxor 0x120($inp),$xb2,$xb2 - vpxor 0x140($inp),$xc2,$xc2 - vpxor 0x160($inp),$xd2,$xd2 - vmovdqu $xa0,0x00($out) - vmovdqu $xb0,0x20($out) - vmovdqu $xc0,0x40($out) - vmovdqu $xd0,0x60($out) - vmovdqu $xa1,0x80($out) - vmovdqu $xb1,0xa0($out) - vmovdqu $xc1,0xc0($out) - vmovdqu $xd1,0xe0($out) - vmovdqu $xa2,0x100($out) - vmovdqu $xb2,0x120($out) - vmovdqu $xc2,0x140($out) - vmovdqu $xd2,0x160($out) - je .Ldone8x - - lea 0x180($inp),$inp # inp+=64*6 - xor %r9,%r9 - vmovdqa $xa3,0x00(%rsp) - lea 0x180($out),$out # out+=64*6 - sub \$384,$len # len-=64*6 - vmovdqa $xb3,0x20(%rsp) - jmp .Loop_tail8x - -.align 32 -.L448_or_more8x: - vpxor 0x00($inp),$xa0,$xa0 # xor with input - vpxor 0x20($inp),$xb0,$xb0 - vpxor 0x40($inp),$xc0,$xc0 - vpxor 0x60($inp),$xd0,$xd0 - vpxor 0x80($inp),$xa1,$xa1 - vpxor 0xa0($inp),$xb1,$xb1 - vpxor 0xc0($inp),$xc1,$xc1 - vpxor 0xe0($inp),$xd1,$xd1 - vpxor 0x100($inp),$xa2,$xa2 - vpxor 0x120($inp),$xb2,$xb2 - vpxor 0x140($inp),$xc2,$xc2 - vpxor 0x160($inp),$xd2,$xd2 - vpxor 0x180($inp),$xa3,$xa3 - vpxor 0x1a0($inp),$xb3,$xb3 - vmovdqu $xa0,0x00($out) - vmovdqu $xb0,0x20($out) - vmovdqu $xc0,0x40($out) - vmovdqu $xd0,0x60($out) - vmovdqu $xa1,0x80($out) - vmovdqu $xb1,0xa0($out) - vmovdqu $xc1,0xc0($out) - vmovdqu $xd1,0xe0($out) - vmovdqu $xa2,0x100($out) - vmovdqu $xb2,0x120($out) - vmovdqu $xc2,0x140($out) - vmovdqu $xd2,0x160($out) - vmovdqu $xa3,0x180($out) - vmovdqu $xb3,0x1a0($out) - je .Ldone8x - - lea 0x1c0($inp),$inp # inp+=64*7 - xor %r9,%r9 - vmovdqa $xc3,0x00(%rsp) - lea 0x1c0($out),$out # out+=64*7 - sub \$448,$len # len-=64*7 - vmovdqa $xd3,0x20(%rsp) - -.Loop_tail8x: - movzb ($inp,%r9),%eax - movzb (%rsp,%r9),%ecx - lea 1(%r9),%r9 - xor %ecx,%eax - mov %al,-1($out,%r9) - dec $len - jnz .Loop_tail8x - -.Ldone8x: - vzeroall -___ -$code.=<<___ if ($win64); - movaps -0xb0(%r10),%xmm6 - movaps -0xa0(%r10),%xmm7 - movaps -0x90(%r10),%xmm8 - movaps -0x80(%r10),%xmm9 - movaps -0x70(%r10),%xmm10 - movaps -0x60(%r10),%xmm11 - movaps -0x50(%r10),%xmm12 - movaps -0x40(%r10),%xmm13 - movaps -0x30(%r10),%xmm14 - movaps -0x20(%r10),%xmm15 -___ -$code.=<<___; - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp 
-.L8x_epilogue: - ret -.cfi_endproc -___ -&end_function("chacha20_avx2"); -if($kernel) { - $code .= "#endif\n"; -} -} - -######################################################################## -# AVX512 code paths -if ($avx>2) { -# This one handles shorter inputs... -if($kernel) { - $code .= "#ifdef CONFIG_AS_AVX512\n"; -} - -my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20)); -my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); - -sub vpxord() # size optimization -{ my $opcode = "vpxor"; # adhere to vpxor when possible - - foreach (@_) { - if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) { - $opcode = "vpxord"; - last; - } - } - - $code .= "\t$opcode\t".join(',',reverse @_)."\n"; -} - -sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round - &vpaddd ($a,$a,$b); - &vpxord ($d,$d,$a); - &vprold ($d,$d,16); - - &vpaddd ($c,$c,$d); - &vpxord ($b,$b,$c); - &vprold ($b,$b,12); - - &vpaddd ($a,$a,$b); - &vpxord ($d,$d,$a); - &vprold ($d,$d,8); - - &vpaddd ($c,$c,$d); - &vpxord ($b,$b,$c); - &vprold ($b,$b,7); -} - -my $xframe = $win64 ? 32+8 : 8; - -&declare_function("chacha20_avx512", 32, 5); -$code.=<<___; -.cfi_startproc -.Lchacha20_avx512: - lea 8(%rsp),%r10 # frame pointer -.cfi_def_cfa_register %r10 - cmp \$512,$len - ja .Lchacha20_16x - - sub \$64+$xframe,%rsp - and \$-64,%rsp -___ -$code.=<<___ if ($win64); - movaps %xmm6,-0x30(%r10) - movaps %xmm7,-0x20(%r10) -.Lavx512_body: -___ -$code.=<<___; - vbroadcasti32x4 .Lsigma(%rip),$a - vbroadcasti32x4 ($key),$b - vbroadcasti32x4 16($key),$c - vbroadcasti32x4 ($counter),$d - - vmovdqa32 $a,$a_ - vmovdqa32 $b,$b_ - vmovdqa32 $c,$c_ - vpaddd .Lzeroz(%rip),$d,$d - vmovdqa32 .Lfourz(%rip),$fourz - mov \$10,$counter # reuse $counter - vmovdqa32 $d,$d_ - jmp .Loop_avx512 - -.align 16 -.Loop_outer_avx512: - vmovdqa32 $a_,$a - vmovdqa32 $b_,$b - vmovdqa32 $c_,$c - vpaddd $fourz,$d_,$d - mov \$10,$counter - vmovdqa32 $d,$d_ - jmp .Loop_avx512 - -.align 32 -.Loop_avx512: -___ - &AVX512ROUND(); - &vpshufd ($c,$c,0b01001110); - &vpshufd ($b,$b,0b00111001); - &vpshufd ($d,$d,0b10010011); - - &AVX512ROUND(); - &vpshufd ($c,$c,0b01001110); - &vpshufd ($b,$b,0b10010011); - &vpshufd ($d,$d,0b00111001); - - &dec ($counter); - &jnz (".Loop_avx512"); - -$code.=<<___; - vpaddd $a_,$a,$a - vpaddd $b_,$b,$b - vpaddd $c_,$c,$c - vpaddd $d_,$d,$d - - sub \$64,$len - jb .Ltail64_avx512 - - vpxor 0x00($inp),%x#$a,$t0 # xor with input - vpxor 0x10($inp),%x#$b,$t1 - vpxor 0x20($inp),%x#$c,$t2 - vpxor 0x30($inp),%x#$d,$t3 - lea 0x40($inp),$inp # inp+=64 - - vmovdqu $t0,0x00($out) # write output - vmovdqu $t1,0x10($out) - vmovdqu $t2,0x20($out) - vmovdqu $t3,0x30($out) - lea 0x40($out),$out # out+=64 - - jz .Ldone_avx512 - - vextracti32x4 \$1,$a,$t0 - vextracti32x4 \$1,$b,$t1 - vextracti32x4 \$1,$c,$t2 - vextracti32x4 \$1,$d,$t3 - - sub \$64,$len - jb .Ltail_avx512 - - vpxor 0x00($inp),$t0,$t0 # xor with input - vpxor 0x10($inp),$t1,$t1 - vpxor 0x20($inp),$t2,$t2 - vpxor 0x30($inp),$t3,$t3 - lea 0x40($inp),$inp # inp+=64 - - vmovdqu $t0,0x00($out) # write output - vmovdqu $t1,0x10($out) - vmovdqu $t2,0x20($out) - vmovdqu $t3,0x30($out) - lea 0x40($out),$out # out+=64 - - jz .Ldone_avx512 - - vextracti32x4 \$2,$a,$t0 - vextracti32x4 \$2,$b,$t1 - vextracti32x4 \$2,$c,$t2 - vextracti32x4 \$2,$d,$t3 - - sub \$64,$len - jb .Ltail_avx512 - - vpxor 0x00($inp),$t0,$t0 # xor with input - vpxor 0x10($inp),$t1,$t1 - vpxor 0x20($inp),$t2,$t2 - vpxor 0x30($inp),$t3,$t3 - lea 0x40($inp),$inp # inp+=64 - - vmovdqu $t0,0x00($out) # write output - vmovdqu 
$t1,0x10($out) - vmovdqu $t2,0x20($out) - vmovdqu $t3,0x30($out) - lea 0x40($out),$out # out+=64 - - jz .Ldone_avx512 - - vextracti32x4 \$3,$a,$t0 - vextracti32x4 \$3,$b,$t1 - vextracti32x4 \$3,$c,$t2 - vextracti32x4 \$3,$d,$t3 - - sub \$64,$len - jb .Ltail_avx512 - - vpxor 0x00($inp),$t0,$t0 # xor with input - vpxor 0x10($inp),$t1,$t1 - vpxor 0x20($inp),$t2,$t2 - vpxor 0x30($inp),$t3,$t3 - lea 0x40($inp),$inp # inp+=64 - - vmovdqu $t0,0x00($out) # write output - vmovdqu $t1,0x10($out) - vmovdqu $t2,0x20($out) - vmovdqu $t3,0x30($out) - lea 0x40($out),$out # out+=64 - - jnz .Loop_outer_avx512 - - jmp .Ldone_avx512 - -.align 16 -.Ltail64_avx512: - vmovdqa %x#$a,0x00(%rsp) - vmovdqa %x#$b,0x10(%rsp) - vmovdqa %x#$c,0x20(%rsp) - vmovdqa %x#$d,0x30(%rsp) - add \$64,$len - jmp .Loop_tail_avx512 - -.align 16 -.Ltail_avx512: - vmovdqa $t0,0x00(%rsp) - vmovdqa $t1,0x10(%rsp) - vmovdqa $t2,0x20(%rsp) - vmovdqa $t3,0x30(%rsp) - add \$64,$len - -.Loop_tail_avx512: - movzb ($inp,$counter),%eax - movzb (%rsp,$counter),%ecx - lea 1($counter),$counter - xor %ecx,%eax - mov %al,-1($out,$counter) - dec $len - jnz .Loop_tail_avx512 - - vmovdqu32 $a_,0x00(%rsp) - -.Ldone_avx512: - vzeroall -___ -$code.=<<___ if ($win64); - movaps -0x30(%r10),%xmm6 - movaps -0x20(%r10),%xmm7 -___ -$code.=<<___; - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -.Lavx512_epilogue: - ret -.cfi_endproc -___ -&end_function("chacha20_avx512"); - -map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz); - -&declare_function("chacha20_avx512vl", 32, 5); -$code.=<<___; -.cfi_startproc -.Lchacha20_avx512vl: - lea 8(%rsp),%r10 # frame pointer -.cfi_def_cfa_register %r10 - cmp \$128,$len - ja .Lchacha20_8xvl - - sub \$64+$xframe,%rsp - and \$-32,%rsp -___ -$code.=<<___ if ($win64); - movaps %xmm6,-0x30(%r10) - movaps %xmm7,-0x20(%r10) -.Lavx512vl_body: -___ -$code.=<<___; - vbroadcasti128 .Lsigma(%rip),$a - vbroadcasti128 ($key),$b - vbroadcasti128 16($key),$c - vbroadcasti128 ($counter),$d - - vmovdqa32 $a,$a_ - vmovdqa32 $b,$b_ - vmovdqa32 $c,$c_ - vpaddd .Lzeroz(%rip),$d,$d - vmovdqa32 .Ltwoy(%rip),$fourz - mov \$10,$counter # reuse $counter - vmovdqa32 $d,$d_ - jmp .Loop_avx512vl - -.align 16 -.Loop_outer_avx512vl: - vmovdqa32 $c_,$c - vpaddd $fourz,$d_,$d - mov \$10,$counter - vmovdqa32 $d,$d_ - jmp .Loop_avx512vl - -.align 32 -.Loop_avx512vl: -___ - &AVX512ROUND(); - &vpshufd ($c,$c,0b01001110); - &vpshufd ($b,$b,0b00111001); - &vpshufd ($d,$d,0b10010011); - - &AVX512ROUND(); - &vpshufd ($c,$c,0b01001110); - &vpshufd ($b,$b,0b10010011); - &vpshufd ($d,$d,0b00111001); - - &dec ($counter); - &jnz (".Loop_avx512vl"); - -$code.=<<___; - vpaddd $a_,$a,$a - vpaddd $b_,$b,$b - vpaddd $c_,$c,$c - vpaddd $d_,$d,$d - - sub \$64,$len - jb .Ltail64_avx512vl - - vpxor 0x00($inp),%x#$a,$t0 # xor with input - vpxor 0x10($inp),%x#$b,$t1 - vpxor 0x20($inp),%x#$c,$t2 - vpxor 0x30($inp),%x#$d,$t3 - lea 0x40($inp),$inp # inp+=64 - - vmovdqu $t0,0x00($out) # write output - vmovdqu $t1,0x10($out) - vmovdqu $t2,0x20($out) - vmovdqu $t3,0x30($out) - lea 0x40($out),$out # out+=64 - - jz .Ldone_avx512vl - - vextracti128 \$1,$a,$t0 - vextracti128 \$1,$b,$t1 - vextracti128 \$1,$c,$t2 - vextracti128 \$1,$d,$t3 - - sub \$64,$len - jb .Ltail_avx512vl - - vpxor 0x00($inp),$t0,$t0 # xor with input - vpxor 0x10($inp),$t1,$t1 - vpxor 0x20($inp),$t2,$t2 - vpxor 0x30($inp),$t3,$t3 - lea 0x40($inp),$inp # inp+=64 - - vmovdqu $t0,0x00($out) # write output - vmovdqu $t1,0x10($out) - vmovdqu $t2,0x20($out) - vmovdqu $t3,0x30($out) - lea 0x40($out),$out # out+=64 - - vmovdqa32 
$a_,$a - vmovdqa32 $b_,$b - jnz .Loop_outer_avx512vl - - jmp .Ldone_avx512vl - -.align 16 -.Ltail64_avx512vl: - vmovdqa %x#$a,0x00(%rsp) - vmovdqa %x#$b,0x10(%rsp) - vmovdqa %x#$c,0x20(%rsp) - vmovdqa %x#$d,0x30(%rsp) - add \$64,$len - jmp .Loop_tail_avx512vl - -.align 16 -.Ltail_avx512vl: - vmovdqa $t0,0x00(%rsp) - vmovdqa $t1,0x10(%rsp) - vmovdqa $t2,0x20(%rsp) - vmovdqa $t3,0x30(%rsp) - add \$64,$len - -.Loop_tail_avx512vl: - movzb ($inp,$counter),%eax - movzb (%rsp,$counter),%ecx - lea 1($counter),$counter - xor %ecx,%eax - mov %al,-1($out,$counter) - dec $len - jnz .Loop_tail_avx512vl - - vmovdqu32 $a_,0x00(%rsp) - vmovdqu32 $a_,0x20(%rsp) - -.Ldone_avx512vl: - vzeroall -___ -$code.=<<___ if ($win64); - movaps -0x30(%r10),%xmm6 - movaps -0x20(%r10),%xmm7 -___ -$code.=<<___; - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -.Lavx512vl_epilogue: - ret -.cfi_endproc -___ -&end_function("chacha20_avx512vl"); - -# This one handles longer inputs... - -my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, - $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15)); -my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, - $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); -my @key=map("%zmm$_",(16..31)); -my ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; - -sub AVX512_lane_ROUND { -my ($a0,$b0,$c0,$d0)=@_; -my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); -my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); -my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); -my @x=map("\"$_\"",@xx); - - ( - "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 - "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 - "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 - "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 - "&vpxord (@x[$d0],@x[$d0],@x[$a0])", - "&vpxord (@x[$d1],@x[$d1],@x[$a1])", - "&vpxord (@x[$d2],@x[$d2],@x[$a2])", - "&vpxord (@x[$d3],@x[$d3],@x[$a3])", - "&vprold (@x[$d0],@x[$d0],16)", - "&vprold (@x[$d1],@x[$d1],16)", - "&vprold (@x[$d2],@x[$d2],16)", - "&vprold (@x[$d3],@x[$d3],16)", - - "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", - "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", - "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", - "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", - "&vpxord (@x[$b0],@x[$b0],@x[$c0])", - "&vpxord (@x[$b1],@x[$b1],@x[$c1])", - "&vpxord (@x[$b2],@x[$b2],@x[$c2])", - "&vpxord (@x[$b3],@x[$b3],@x[$c3])", - "&vprold (@x[$b0],@x[$b0],12)", - "&vprold (@x[$b1],@x[$b1],12)", - "&vprold (@x[$b2],@x[$b2],12)", - "&vprold (@x[$b3],@x[$b3],12)", - - "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", - "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", - "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", - "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", - "&vpxord (@x[$d0],@x[$d0],@x[$a0])", - "&vpxord (@x[$d1],@x[$d1],@x[$a1])", - "&vpxord (@x[$d2],@x[$d2],@x[$a2])", - "&vpxord (@x[$d3],@x[$d3],@x[$a3])", - "&vprold (@x[$d0],@x[$d0],8)", - "&vprold (@x[$d1],@x[$d1],8)", - "&vprold (@x[$d2],@x[$d2],8)", - "&vprold (@x[$d3],@x[$d3],8)", - - "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", - "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", - "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", - "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", - "&vpxord (@x[$b0],@x[$b0],@x[$c0])", - "&vpxord (@x[$b1],@x[$b1],@x[$c1])", - "&vpxord (@x[$b2],@x[$b2],@x[$c2])", - "&vpxord (@x[$b3],@x[$b3],@x[$c3])", - "&vprold (@x[$b0],@x[$b0],7)", - "&vprold (@x[$b1],@x[$b1],7)", - "&vprold (@x[$b2],@x[$b2],7)", - "&vprold (@x[$b3],@x[$b3],7)" - ); -} - -my $xframe = $win64 ? 
0xa8 : 8; - -$code.=<<___; -.type chacha20_16x,\@function,5 -.align 32 -chacha20_16x: -.cfi_startproc -.Lchacha20_16x: - lea 8(%rsp),%r10 # frame register -.cfi_def_cfa_register %r10 - sub \$64+$xframe,%rsp - and \$-64,%rsp -___ -$code.=<<___ if ($win64); - movaps %xmm6,-0xb0(%r10) - movaps %xmm7,-0xa0(%r10) - movaps %xmm8,-0x90(%r10) - movaps %xmm9,-0x80(%r10) - movaps %xmm10,-0x70(%r10) - movaps %xmm11,-0x60(%r10) - movaps %xmm12,-0x50(%r10) - movaps %xmm13,-0x40(%r10) - movaps %xmm14,-0x30(%r10) - movaps %xmm15,-0x20(%r10) -.L16x_body: -___ -$code.=<<___; - vzeroupper - - lea .Lsigma(%rip),%r9 - vbroadcasti32x4 (%r9),$xa3 # key[0] - vbroadcasti32x4 ($key),$xb3 # key[1] - vbroadcasti32x4 16($key),$xc3 # key[2] - vbroadcasti32x4 ($counter),$xd3 # key[3] - - vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... - vpshufd \$0x55,$xa3,$xa1 - vpshufd \$0xaa,$xa3,$xa2 - vpshufd \$0xff,$xa3,$xa3 - vmovdqa64 $xa0,@key[0] - vmovdqa64 $xa1,@key[1] - vmovdqa64 $xa2,@key[2] - vmovdqa64 $xa3,@key[3] - - vpshufd \$0x00,$xb3,$xb0 - vpshufd \$0x55,$xb3,$xb1 - vpshufd \$0xaa,$xb3,$xb2 - vpshufd \$0xff,$xb3,$xb3 - vmovdqa64 $xb0,@key[4] - vmovdqa64 $xb1,@key[5] - vmovdqa64 $xb2,@key[6] - vmovdqa64 $xb3,@key[7] - - vpshufd \$0x00,$xc3,$xc0 - vpshufd \$0x55,$xc3,$xc1 - vpshufd \$0xaa,$xc3,$xc2 - vpshufd \$0xff,$xc3,$xc3 - vmovdqa64 $xc0,@key[8] - vmovdqa64 $xc1,@key[9] - vmovdqa64 $xc2,@key[10] - vmovdqa64 $xc3,@key[11] - - vpshufd \$0x00,$xd3,$xd0 - vpshufd \$0x55,$xd3,$xd1 - vpshufd \$0xaa,$xd3,$xd2 - vpshufd \$0xff,$xd3,$xd3 - vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet - vmovdqa64 $xd0,@key[12] - vmovdqa64 $xd1,@key[13] - vmovdqa64 $xd2,@key[14] - vmovdqa64 $xd3,@key[15] - - mov \$10,%eax - jmp .Loop16x - -.align 32 -.Loop_outer16x: - vpbroadcastd 0(%r9),$xa0 # reload key - vpbroadcastd 4(%r9),$xa1 - vpbroadcastd 8(%r9),$xa2 - vpbroadcastd 12(%r9),$xa3 - vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters - vmovdqa64 @key[4],$xb0 - vmovdqa64 @key[5],$xb1 - vmovdqa64 @key[6],$xb2 - vmovdqa64 @key[7],$xb3 - vmovdqa64 @key[8],$xc0 - vmovdqa64 @key[9],$xc1 - vmovdqa64 @key[10],$xc2 - vmovdqa64 @key[11],$xc3 - vmovdqa64 @key[12],$xd0 - vmovdqa64 @key[13],$xd1 - vmovdqa64 @key[14],$xd2 - vmovdqa64 @key[15],$xd3 - - vmovdqa64 $xa0,@key[0] - vmovdqa64 $xa1,@key[1] - vmovdqa64 $xa2,@key[2] - vmovdqa64 $xa3,@key[3] - - mov \$10,%eax - jmp .Loop16x - -.align 32 -.Loop16x: -___ - foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } - foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } -$code.=<<___; - dec %eax - jnz .Loop16x - - vpaddd @key[0],$xa0,$xa0 # accumulate key - vpaddd @key[1],$xa1,$xa1 - vpaddd @key[2],$xa2,$xa2 - vpaddd @key[3],$xa3,$xa3 - - vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data - vpunpckldq $xa3,$xa2,$xt3 - vpunpckhdq $xa1,$xa0,$xa0 - vpunpckhdq $xa3,$xa2,$xa2 - vpunpcklqdq $xt3,$xt2,$xa1 # "a0" - vpunpckhqdq $xt3,$xt2,$xt2 # "a1" - vpunpcklqdq $xa2,$xa0,$xa3 # "a2" - vpunpckhqdq $xa2,$xa0,$xa0 # "a3" -___ - ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); -$code.=<<___; - vpaddd @key[4],$xb0,$xb0 - vpaddd @key[5],$xb1,$xb1 - vpaddd @key[6],$xb2,$xb2 - vpaddd @key[7],$xb3,$xb3 - - vpunpckldq $xb1,$xb0,$xt2 - vpunpckldq $xb3,$xb2,$xt3 - vpunpckhdq $xb1,$xb0,$xb0 - vpunpckhdq $xb3,$xb2,$xb2 - vpunpcklqdq $xt3,$xt2,$xb1 # "b0" - vpunpckhqdq $xt3,$xt2,$xt2 # "b1" - vpunpcklqdq $xb2,$xb0,$xb3 # "b2" - vpunpckhqdq $xb2,$xb0,$xb0 # "b3" -___ - ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); -$code.=<<___; - vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further - 
vshufi32x4 \$0xee,$xb0,$xa0,$xb0 - vshufi32x4 \$0x44,$xb1,$xa1,$xa0 - vshufi32x4 \$0xee,$xb1,$xa1,$xb1 - vshufi32x4 \$0x44,$xb2,$xa2,$xa1 - vshufi32x4 \$0xee,$xb2,$xa2,$xb2 - vshufi32x4 \$0x44,$xb3,$xa3,$xa2 - vshufi32x4 \$0xee,$xb3,$xa3,$xb3 -___ - ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); -$code.=<<___; - vpaddd @key[8],$xc0,$xc0 - vpaddd @key[9],$xc1,$xc1 - vpaddd @key[10],$xc2,$xc2 - vpaddd @key[11],$xc3,$xc3 - - vpunpckldq $xc1,$xc0,$xt2 - vpunpckldq $xc3,$xc2,$xt3 - vpunpckhdq $xc1,$xc0,$xc0 - vpunpckhdq $xc3,$xc2,$xc2 - vpunpcklqdq $xt3,$xt2,$xc1 # "c0" - vpunpckhqdq $xt3,$xt2,$xt2 # "c1" - vpunpcklqdq $xc2,$xc0,$xc3 # "c2" - vpunpckhqdq $xc2,$xc0,$xc0 # "c3" -___ - ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); -$code.=<<___; - vpaddd @key[12],$xd0,$xd0 - vpaddd @key[13],$xd1,$xd1 - vpaddd @key[14],$xd2,$xd2 - vpaddd @key[15],$xd3,$xd3 - - vpunpckldq $xd1,$xd0,$xt2 - vpunpckldq $xd3,$xd2,$xt3 - vpunpckhdq $xd1,$xd0,$xd0 - vpunpckhdq $xd3,$xd2,$xd2 - vpunpcklqdq $xt3,$xt2,$xd1 # "d0" - vpunpckhqdq $xt3,$xt2,$xt2 # "d1" - vpunpcklqdq $xd2,$xd0,$xd3 # "d2" - vpunpckhqdq $xd2,$xd0,$xd0 # "d3" -___ - ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); -$code.=<<___; - vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further - vshufi32x4 \$0xee,$xd0,$xc0,$xd0 - vshufi32x4 \$0x44,$xd1,$xc1,$xc0 - vshufi32x4 \$0xee,$xd1,$xc1,$xd1 - vshufi32x4 \$0x44,$xd2,$xc2,$xc1 - vshufi32x4 \$0xee,$xd2,$xc2,$xd2 - vshufi32x4 \$0x44,$xd3,$xc3,$xc2 - vshufi32x4 \$0xee,$xd3,$xc3,$xd3 -___ - ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); -$code.=<<___; - vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further - vshufi32x4 \$0xdd,$xc0,$xa0,$xa0 - vshufi32x4 \$0x88,$xd0,$xb0,$xc0 - vshufi32x4 \$0xdd,$xd0,$xb0,$xd0 - vshufi32x4 \$0x88,$xc1,$xa1,$xt1 - vshufi32x4 \$0xdd,$xc1,$xa1,$xa1 - vshufi32x4 \$0x88,$xd1,$xb1,$xc1 - vshufi32x4 \$0xdd,$xd1,$xb1,$xd1 - vshufi32x4 \$0x88,$xc2,$xa2,$xt2 - vshufi32x4 \$0xdd,$xc2,$xa2,$xa2 - vshufi32x4 \$0x88,$xd2,$xb2,$xc2 - vshufi32x4 \$0xdd,$xd2,$xb2,$xd2 - vshufi32x4 \$0x88,$xc3,$xa3,$xt3 - vshufi32x4 \$0xdd,$xc3,$xa3,$xa3 - vshufi32x4 \$0x88,$xd3,$xb3,$xc3 - vshufi32x4 \$0xdd,$xd3,$xb3,$xd3 -___ - ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)= - ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3); - - ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1, - $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) = - ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, - $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); -$code.=<<___; - cmp \$64*16,$len - jb .Ltail16x - - vpxord 0x00($inp),$xa0,$xa0 # xor with input - vpxord 0x40($inp),$xb0,$xb0 - vpxord 0x80($inp),$xc0,$xc0 - vpxord 0xc0($inp),$xd0,$xd0 - vmovdqu32 $xa0,0x00($out) - vmovdqu32 $xb0,0x40($out) - vmovdqu32 $xc0,0x80($out) - vmovdqu32 $xd0,0xc0($out) - - vpxord 0x100($inp),$xa1,$xa1 - vpxord 0x140($inp),$xb1,$xb1 - vpxord 0x180($inp),$xc1,$xc1 - vpxord 0x1c0($inp),$xd1,$xd1 - vmovdqu32 $xa1,0x100($out) - vmovdqu32 $xb1,0x140($out) - vmovdqu32 $xc1,0x180($out) - vmovdqu32 $xd1,0x1c0($out) - - vpxord 0x200($inp),$xa2,$xa2 - vpxord 0x240($inp),$xb2,$xb2 - vpxord 0x280($inp),$xc2,$xc2 - vpxord 0x2c0($inp),$xd2,$xd2 - vmovdqu32 $xa2,0x200($out) - vmovdqu32 $xb2,0x240($out) - vmovdqu32 $xc2,0x280($out) - vmovdqu32 $xd2,0x2c0($out) - - vpxord 0x300($inp),$xa3,$xa3 - vpxord 0x340($inp),$xb3,$xb3 - vpxord 0x380($inp),$xc3,$xc3 - vpxord 0x3c0($inp),$xd3,$xd3 - lea 0x400($inp),$inp - vmovdqu32 $xa3,0x300($out) - vmovdqu32 $xb3,0x340($out) - vmovdqu32 $xc3,0x380($out) - vmovdqu32 $xd3,0x3c0($out) - lea 0x400($out),$out - - sub \$64*16,$len - 
jnz .Loop_outer16x - - jmp .Ldone16x - -.align 32 -.Ltail16x: - xor %r9,%r9 - sub $inp,$out - cmp \$64*1,$len - jb .Less_than_64_16x - vpxord ($inp),$xa0,$xa0 # xor with input - vmovdqu32 $xa0,($out,$inp) - je .Ldone16x - vmovdqa32 $xb0,$xa0 - lea 64($inp),$inp - - cmp \$64*2,$len - jb .Less_than_64_16x - vpxord ($inp),$xb0,$xb0 - vmovdqu32 $xb0,($out,$inp) - je .Ldone16x - vmovdqa32 $xc0,$xa0 - lea 64($inp),$inp - - cmp \$64*3,$len - jb .Less_than_64_16x - vpxord ($inp),$xc0,$xc0 - vmovdqu32 $xc0,($out,$inp) - je .Ldone16x - vmovdqa32 $xd0,$xa0 - lea 64($inp),$inp - - cmp \$64*4,$len - jb .Less_than_64_16x - vpxord ($inp),$xd0,$xd0 - vmovdqu32 $xd0,($out,$inp) - je .Ldone16x - vmovdqa32 $xa1,$xa0 - lea 64($inp),$inp - - cmp \$64*5,$len - jb .Less_than_64_16x - vpxord ($inp),$xa1,$xa1 - vmovdqu32 $xa1,($out,$inp) - je .Ldone16x - vmovdqa32 $xb1,$xa0 - lea 64($inp),$inp - - cmp \$64*6,$len - jb .Less_than_64_16x - vpxord ($inp),$xb1,$xb1 - vmovdqu32 $xb1,($out,$inp) - je .Ldone16x - vmovdqa32 $xc1,$xa0 - lea 64($inp),$inp - - cmp \$64*7,$len - jb .Less_than_64_16x - vpxord ($inp),$xc1,$xc1 - vmovdqu32 $xc1,($out,$inp) - je .Ldone16x - vmovdqa32 $xd1,$xa0 - lea 64($inp),$inp - - cmp \$64*8,$len - jb .Less_than_64_16x - vpxord ($inp),$xd1,$xd1 - vmovdqu32 $xd1,($out,$inp) - je .Ldone16x - vmovdqa32 $xa2,$xa0 - lea 64($inp),$inp - - cmp \$64*9,$len - jb .Less_than_64_16x - vpxord ($inp),$xa2,$xa2 - vmovdqu32 $xa2,($out,$inp) - je .Ldone16x - vmovdqa32 $xb2,$xa0 - lea 64($inp),$inp - - cmp \$64*10,$len - jb .Less_than_64_16x - vpxord ($inp),$xb2,$xb2 - vmovdqu32 $xb2,($out,$inp) - je .Ldone16x - vmovdqa32 $xc2,$xa0 - lea 64($inp),$inp - - cmp \$64*11,$len - jb .Less_than_64_16x - vpxord ($inp),$xc2,$xc2 - vmovdqu32 $xc2,($out,$inp) - je .Ldone16x - vmovdqa32 $xd2,$xa0 - lea 64($inp),$inp - - cmp \$64*12,$len - jb .Less_than_64_16x - vpxord ($inp),$xd2,$xd2 - vmovdqu32 $xd2,($out,$inp) - je .Ldone16x - vmovdqa32 $xa3,$xa0 - lea 64($inp),$inp - - cmp \$64*13,$len - jb .Less_than_64_16x - vpxord ($inp),$xa3,$xa3 - vmovdqu32 $xa3,($out,$inp) - je .Ldone16x - vmovdqa32 $xb3,$xa0 - lea 64($inp),$inp - - cmp \$64*14,$len - jb .Less_than_64_16x - vpxord ($inp),$xb3,$xb3 - vmovdqu32 $xb3,($out,$inp) - je .Ldone16x - vmovdqa32 $xc3,$xa0 - lea 64($inp),$inp - - cmp \$64*15,$len - jb .Less_than_64_16x - vpxord ($inp),$xc3,$xc3 - vmovdqu32 $xc3,($out,$inp) - je .Ldone16x - vmovdqa32 $xd3,$xa0 - lea 64($inp),$inp - -.Less_than_64_16x: - vmovdqa32 $xa0,0x00(%rsp) - lea ($out,$inp),$out - and \$63,$len - -.Loop_tail16x: - movzb ($inp,%r9),%eax - movzb (%rsp,%r9),%ecx - lea 1(%r9),%r9 - xor %ecx,%eax - mov %al,-1($out,%r9) - dec $len - jnz .Loop_tail16x - - vpxord $xa0,$xa0,$xa0 - vmovdqa32 $xa0,0(%rsp) - -.Ldone16x: - vzeroall -___ -$code.=<<___ if ($win64); - movaps -0xb0(%r10),%xmm6 - movaps -0xa0(%r10),%xmm7 - movaps -0x90(%r10),%xmm8 - movaps -0x80(%r10),%xmm9 - movaps -0x70(%r10),%xmm10 - movaps -0x60(%r10),%xmm11 - movaps -0x50(%r10),%xmm12 - movaps -0x40(%r10),%xmm13 - movaps -0x30(%r10),%xmm14 - movaps -0x20(%r10),%xmm15 -___ -$code.=<<___; - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -.L16x_epilogue: - ret -.cfi_endproc -.size chacha20_16x,.-chacha20_16x -___ - -# switch to %ymm domain -($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, - $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15)); -@xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, - $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); -@key=map("%ymm$_",(16..31)); -($xt0,$xt1,$xt2,$xt3)=@key[0..3]; - -$code.=<<___; -.type 
chacha20_8xvl,\@function,5 -.align 32 -chacha20_8xvl: -.cfi_startproc -.Lchacha20_8xvl: - lea 8(%rsp),%r10 # frame register -.cfi_def_cfa_register %r10 - sub \$64+$xframe,%rsp - and \$-64,%rsp -___ -$code.=<<___ if ($win64); - movaps %xmm6,-0xb0(%r10) - movaps %xmm7,-0xa0(%r10) - movaps %xmm8,-0x90(%r10) - movaps %xmm9,-0x80(%r10) - movaps %xmm10,-0x70(%r10) - movaps %xmm11,-0x60(%r10) - movaps %xmm12,-0x50(%r10) - movaps %xmm13,-0x40(%r10) - movaps %xmm14,-0x30(%r10) - movaps %xmm15,-0x20(%r10) -.L8xvl_body: -___ -$code.=<<___; - vzeroupper - - lea .Lsigma(%rip),%r9 - vbroadcasti128 (%r9),$xa3 # key[0] - vbroadcasti128 ($key),$xb3 # key[1] - vbroadcasti128 16($key),$xc3 # key[2] - vbroadcasti128 ($counter),$xd3 # key[3] - - vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... - vpshufd \$0x55,$xa3,$xa1 - vpshufd \$0xaa,$xa3,$xa2 - vpshufd \$0xff,$xa3,$xa3 - vmovdqa64 $xa0,@key[0] - vmovdqa64 $xa1,@key[1] - vmovdqa64 $xa2,@key[2] - vmovdqa64 $xa3,@key[3] - - vpshufd \$0x00,$xb3,$xb0 - vpshufd \$0x55,$xb3,$xb1 - vpshufd \$0xaa,$xb3,$xb2 - vpshufd \$0xff,$xb3,$xb3 - vmovdqa64 $xb0,@key[4] - vmovdqa64 $xb1,@key[5] - vmovdqa64 $xb2,@key[6] - vmovdqa64 $xb3,@key[7] - - vpshufd \$0x00,$xc3,$xc0 - vpshufd \$0x55,$xc3,$xc1 - vpshufd \$0xaa,$xc3,$xc2 - vpshufd \$0xff,$xc3,$xc3 - vmovdqa64 $xc0,@key[8] - vmovdqa64 $xc1,@key[9] - vmovdqa64 $xc2,@key[10] - vmovdqa64 $xc3,@key[11] - - vpshufd \$0x00,$xd3,$xd0 - vpshufd \$0x55,$xd3,$xd1 - vpshufd \$0xaa,$xd3,$xd2 - vpshufd \$0xff,$xd3,$xd3 - vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet - vmovdqa64 $xd0,@key[12] - vmovdqa64 $xd1,@key[13] - vmovdqa64 $xd2,@key[14] - vmovdqa64 $xd3,@key[15] - - mov \$10,%eax - jmp .Loop8xvl - -.align 32 -.Loop_outer8xvl: - #vpbroadcastd 0(%r9),$xa0 # reload key - #vpbroadcastd 4(%r9),$xa1 - vpbroadcastd 8(%r9),$xa2 - vpbroadcastd 12(%r9),$xa3 - vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters - vmovdqa64 @key[4],$xb0 - vmovdqa64 @key[5],$xb1 - vmovdqa64 @key[6],$xb2 - vmovdqa64 @key[7],$xb3 - vmovdqa64 @key[8],$xc0 - vmovdqa64 @key[9],$xc1 - vmovdqa64 @key[10],$xc2 - vmovdqa64 @key[11],$xc3 - vmovdqa64 @key[12],$xd0 - vmovdqa64 @key[13],$xd1 - vmovdqa64 @key[14],$xd2 - vmovdqa64 @key[15],$xd3 - - vmovdqa64 $xa0,@key[0] - vmovdqa64 $xa1,@key[1] - vmovdqa64 $xa2,@key[2] - vmovdqa64 $xa3,@key[3] - - mov \$10,%eax - jmp .Loop8xvl - -.align 32 -.Loop8xvl: -___ - foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } - foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } -$code.=<<___; - dec %eax - jnz .Loop8xvl - - vpaddd @key[0],$xa0,$xa0 # accumulate key - vpaddd @key[1],$xa1,$xa1 - vpaddd @key[2],$xa2,$xa2 - vpaddd @key[3],$xa3,$xa3 - - vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data - vpunpckldq $xa3,$xa2,$xt3 - vpunpckhdq $xa1,$xa0,$xa0 - vpunpckhdq $xa3,$xa2,$xa2 - vpunpcklqdq $xt3,$xt2,$xa1 # "a0" - vpunpckhqdq $xt3,$xt2,$xt2 # "a1" - vpunpcklqdq $xa2,$xa0,$xa3 # "a2" - vpunpckhqdq $xa2,$xa0,$xa0 # "a3" -___ - ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); -$code.=<<___; - vpaddd @key[4],$xb0,$xb0 - vpaddd @key[5],$xb1,$xb1 - vpaddd @key[6],$xb2,$xb2 - vpaddd @key[7],$xb3,$xb3 - - vpunpckldq $xb1,$xb0,$xt2 - vpunpckldq $xb3,$xb2,$xt3 - vpunpckhdq $xb1,$xb0,$xb0 - vpunpckhdq $xb3,$xb2,$xb2 - vpunpcklqdq $xt3,$xt2,$xb1 # "b0" - vpunpckhqdq $xt3,$xt2,$xt2 # "b1" - vpunpcklqdq $xb2,$xb0,$xb3 # "b2" - vpunpckhqdq $xb2,$xb0,$xb0 # "b3" -___ - ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); -$code.=<<___; - vshufi32x4 \$0,$xb0,$xa0,$xt3 # "de-interlace" further - vshufi32x4 \$3,$xb0,$xa0,$xb0 - 
vshufi32x4 \$0,$xb1,$xa1,$xa0 - vshufi32x4 \$3,$xb1,$xa1,$xb1 - vshufi32x4 \$0,$xb2,$xa2,$xa1 - vshufi32x4 \$3,$xb2,$xa2,$xb2 - vshufi32x4 \$0,$xb3,$xa3,$xa2 - vshufi32x4 \$3,$xb3,$xa3,$xb3 -___ - ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); -$code.=<<___; - vpaddd @key[8],$xc0,$xc0 - vpaddd @key[9],$xc1,$xc1 - vpaddd @key[10],$xc2,$xc2 - vpaddd @key[11],$xc3,$xc3 - - vpunpckldq $xc1,$xc0,$xt2 - vpunpckldq $xc3,$xc2,$xt3 - vpunpckhdq $xc1,$xc0,$xc0 - vpunpckhdq $xc3,$xc2,$xc2 - vpunpcklqdq $xt3,$xt2,$xc1 # "c0" - vpunpckhqdq $xt3,$xt2,$xt2 # "c1" - vpunpcklqdq $xc2,$xc0,$xc3 # "c2" - vpunpckhqdq $xc2,$xc0,$xc0 # "c3" -___ - ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); -$code.=<<___; - vpaddd @key[12],$xd0,$xd0 - vpaddd @key[13],$xd1,$xd1 - vpaddd @key[14],$xd2,$xd2 - vpaddd @key[15],$xd3,$xd3 - - vpunpckldq $xd1,$xd0,$xt2 - vpunpckldq $xd3,$xd2,$xt3 - vpunpckhdq $xd1,$xd0,$xd0 - vpunpckhdq $xd3,$xd2,$xd2 - vpunpcklqdq $xt3,$xt2,$xd1 # "d0" - vpunpckhqdq $xt3,$xt2,$xt2 # "d1" - vpunpcklqdq $xd2,$xd0,$xd3 # "d2" - vpunpckhqdq $xd2,$xd0,$xd0 # "d3" -___ - ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); -$code.=<<___; - vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further - vperm2i128 \$0x31,$xd0,$xc0,$xd0 - vperm2i128 \$0x20,$xd1,$xc1,$xc0 - vperm2i128 \$0x31,$xd1,$xc1,$xd1 - vperm2i128 \$0x20,$xd2,$xc2,$xc1 - vperm2i128 \$0x31,$xd2,$xc2,$xd2 - vperm2i128 \$0x20,$xd3,$xc3,$xc2 - vperm2i128 \$0x31,$xd3,$xc3,$xd3 -___ - ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); - ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= - ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); -$code.=<<___; - cmp \$64*8,$len - jb .Ltail8xvl - - mov \$0x80,%eax # size optimization - vpxord 0x00($inp),$xa0,$xa0 # xor with input - vpxor 0x20($inp),$xb0,$xb0 - vpxor 0x40($inp),$xc0,$xc0 - vpxor 0x60($inp),$xd0,$xd0 - lea ($inp,%rax),$inp # size optimization - vmovdqu32 $xa0,0x00($out) - vmovdqu $xb0,0x20($out) - vmovdqu $xc0,0x40($out) - vmovdqu $xd0,0x60($out) - lea ($out,%rax),$out # size optimization - - vpxor 0x00($inp),$xa1,$xa1 - vpxor 0x20($inp),$xb1,$xb1 - vpxor 0x40($inp),$xc1,$xc1 - vpxor 0x60($inp),$xd1,$xd1 - lea ($inp,%rax),$inp # size optimization - vmovdqu $xa1,0x00($out) - vmovdqu $xb1,0x20($out) - vmovdqu $xc1,0x40($out) - vmovdqu $xd1,0x60($out) - lea ($out,%rax),$out # size optimization - - vpxord 0x00($inp),$xa2,$xa2 - vpxor 0x20($inp),$xb2,$xb2 - vpxor 0x40($inp),$xc2,$xc2 - vpxor 0x60($inp),$xd2,$xd2 - lea ($inp,%rax),$inp # size optimization - vmovdqu32 $xa2,0x00($out) - vmovdqu $xb2,0x20($out) - vmovdqu $xc2,0x40($out) - vmovdqu $xd2,0x60($out) - lea ($out,%rax),$out # size optimization - - vpxor 0x00($inp),$xa3,$xa3 - vpxor 0x20($inp),$xb3,$xb3 - vpxor 0x40($inp),$xc3,$xc3 - vpxor 0x60($inp),$xd3,$xd3 - lea ($inp,%rax),$inp # size optimization - vmovdqu $xa3,0x00($out) - vmovdqu $xb3,0x20($out) - vmovdqu $xc3,0x40($out) - vmovdqu $xd3,0x60($out) - lea ($out,%rax),$out # size optimization - - vpbroadcastd 0(%r9),%ymm0 # reload key - vpbroadcastd 4(%r9),%ymm1 - - sub \$64*8,$len - jnz .Loop_outer8xvl - - jmp .Ldone8xvl - -.align 32 -.Ltail8xvl: - vmovdqa64 $xa0,%ymm8 # size optimization -___ -$xa0 = "%ymm8"; -$code.=<<___; - xor %r9,%r9 - sub $inp,$out - cmp \$64*1,$len - jb .Less_than_64_8xvl - vpxor 0x00($inp),$xa0,$xa0 # xor with input - vpxor 0x20($inp),$xb0,$xb0 - vmovdqu $xa0,0x00($out,$inp) - vmovdqu $xb0,0x20($out,$inp) - je .Ldone8xvl - vmovdqa $xc0,$xa0 - vmovdqa $xd0,$xb0 - lea 64($inp),$inp - - cmp \$64*2,$len - jb .Less_than_64_8xvl - vpxor 0x00($inp),$xc0,$xc0 
- vpxor 0x20($inp),$xd0,$xd0 - vmovdqu $xc0,0x00($out,$inp) - vmovdqu $xd0,0x20($out,$inp) - je .Ldone8xvl - vmovdqa $xa1,$xa0 - vmovdqa $xb1,$xb0 - lea 64($inp),$inp - - cmp \$64*3,$len - jb .Less_than_64_8xvl - vpxor 0x00($inp),$xa1,$xa1 - vpxor 0x20($inp),$xb1,$xb1 - vmovdqu $xa1,0x00($out,$inp) - vmovdqu $xb1,0x20($out,$inp) - je .Ldone8xvl - vmovdqa $xc1,$xa0 - vmovdqa $xd1,$xb0 - lea 64($inp),$inp - - cmp \$64*4,$len - jb .Less_than_64_8xvl - vpxor 0x00($inp),$xc1,$xc1 - vpxor 0x20($inp),$xd1,$xd1 - vmovdqu $xc1,0x00($out,$inp) - vmovdqu $xd1,0x20($out,$inp) - je .Ldone8xvl - vmovdqa32 $xa2,$xa0 - vmovdqa $xb2,$xb0 - lea 64($inp),$inp - - cmp \$64*5,$len - jb .Less_than_64_8xvl - vpxord 0x00($inp),$xa2,$xa2 - vpxor 0x20($inp),$xb2,$xb2 - vmovdqu32 $xa2,0x00($out,$inp) - vmovdqu $xb2,0x20($out,$inp) - je .Ldone8xvl - vmovdqa $xc2,$xa0 - vmovdqa $xd2,$xb0 - lea 64($inp),$inp - - cmp \$64*6,$len - jb .Less_than_64_8xvl - vpxor 0x00($inp),$xc2,$xc2 - vpxor 0x20($inp),$xd2,$xd2 - vmovdqu $xc2,0x00($out,$inp) - vmovdqu $xd2,0x20($out,$inp) - je .Ldone8xvl - vmovdqa $xa3,$xa0 - vmovdqa $xb3,$xb0 - lea 64($inp),$inp - - cmp \$64*7,$len - jb .Less_than_64_8xvl - vpxor 0x00($inp),$xa3,$xa3 - vpxor 0x20($inp),$xb3,$xb3 - vmovdqu $xa3,0x00($out,$inp) - vmovdqu $xb3,0x20($out,$inp) - je .Ldone8xvl - vmovdqa $xc3,$xa0 - vmovdqa $xd3,$xb0 - lea 64($inp),$inp - -.Less_than_64_8xvl: - vmovdqa $xa0,0x00(%rsp) - vmovdqa $xb0,0x20(%rsp) - lea ($out,$inp),$out - and \$63,$len - -.Loop_tail8xvl: - movzb ($inp,%r9),%eax - movzb (%rsp,%r9),%ecx - lea 1(%r9),%r9 - xor %ecx,%eax - mov %al,-1($out,%r9) - dec $len - jnz .Loop_tail8xvl - - vpxor $xa0,$xa0,$xa0 - vmovdqa $xa0,0x00(%rsp) - vmovdqa $xa0,0x20(%rsp) - -.Ldone8xvl: - vzeroall -___ -$code.=<<___ if ($win64); - movaps -0xb0(%r10),%xmm6 - movaps -0xa0(%r10),%xmm7 - movaps -0x90(%r10),%xmm8 - movaps -0x80(%r10),%xmm9 - movaps -0x70(%r10),%xmm10 - movaps -0x60(%r10),%xmm11 - movaps -0x50(%r10),%xmm12 - movaps -0x40(%r10),%xmm13 - movaps -0x30(%r10),%xmm14 - movaps -0x20(%r10),%xmm15 -___ -$code.=<<___; - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -.L8xvl_epilogue: - ret -.cfi_endproc -.size chacha20_8xvl,.-chacha20_8xvl -___ -if($kernel) { - $code .= "#endif\n"; -} -} - -# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, -# CONTEXT *context,DISPATCHER_CONTEXT *disp) -if ($win64) { -$rec="%rcx"; -$frame="%rdx"; -$context="%r8"; -$disp="%r9"; - -$code.=<<___; -.extern __imp_RtlVirtualUnwind -.type se_handler,\@abi-omnipotent -.align 16 -se_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - lea .Lctr32_body(%rip),%r10 - cmp %r10,%rbx # context->Rip<.Lprologue - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - lea .Lno_data(%rip),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=.Lepilogue - jae .Lcommon_seh_tail - - lea 64+24+48(%rax),%rax - - mov -8(%rax),%rbx - mov -16(%rax),%rbp - mov -24(%rax),%r12 - mov -32(%rax),%r13 - mov -40(%rax),%r14 - mov -48(%rax),%r15 - mov %rbx,144($context) # restore context->Rbx - mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore context->R12 - mov %r13,224($context) # restore context->R13 - mov %r14,232($context) # restore context->R14 - mov %r15,240($context) # restore context->R14 - 
-.Lcommon_seh_tail: - mov 8(%rax),%rdi - mov 16(%rax),%rsi - mov %rax,152($context) # restore context->Rsp - mov %rsi,168($context) # restore context->Rsi - mov %rdi,176($context) # restore context->Rdi - - mov 40($disp),%rdi # disp->ContextRecord - mov $context,%rsi # context - mov \$154,%ecx # sizeof(CONTEXT) - .long 0xa548f3fc # cld; rep movsq - - mov $disp,%rsi - xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER - mov 8(%rsi),%rdx # arg2, disp->ImageBase - mov 0(%rsi),%r8 # arg3, disp->ControlPc - mov 16(%rsi),%r9 # arg4, disp->FunctionEntry - mov 40(%rsi),%r10 # disp->ContextRecord - lea 56(%rsi),%r11 # &disp->HandlerData - lea 24(%rsi),%r12 # &disp->EstablisherFrame - mov %r10,32(%rsp) # arg5 - mov %r11,40(%rsp) # arg6 - mov %r12,48(%rsp) # arg7 - mov %rcx,56(%rsp) # arg8, (NULL) - call *__imp_RtlVirtualUnwind(%rip) - - mov \$1,%eax # ExceptionContinueSearch - add \$64,%rsp - popfq - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - pop %rdi - pop %rsi - ret -.size se_handler,.-se_handler - -.type simd_handler,\@abi-omnipotent -.align 16 -simd_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<prologue label - jb .Lcommon_seh_tail - - mov 192($context),%rax # pull context->R9 - - mov 4(%r11),%r10d # HandlerData[1] - mov 8(%r11),%ecx # HandlerData[2] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=epilogue label - jae .Lcommon_seh_tail - - neg %rcx - lea -8(%rax,%rcx),%rsi - lea 512($context),%rdi # &context.Xmm6 - neg %ecx - shr \$3,%ecx - .long 0xa548f3fc # cld; rep movsq - - jmp .Lcommon_seh_tail -.size simd_handler,.-simd_handler - -.section .pdata -.align 4 - .rva .LSEH_begin_chacha20_ctr32 - .rva .LSEH_end_chacha20_ctr32 - .rva .LSEH_info_chacha20_ctr32 - - .rva .LSEH_begin_chacha20_ssse3 - .rva .LSEH_end_chacha20_ssse3 - .rva .LSEH_info_chacha20_ssse3 - - .rva .LSEH_begin_chacha20_128 - .rva .LSEH_end_chacha20_128 - .rva .LSEH_info_chacha20_128 - - .rva .LSEH_begin_chacha20_4x - .rva .LSEH_end_chacha20_4x - .rva .LSEH_info_chacha20_4x -___ -$code.=<<___ if ($avx); - .rva .LSEH_begin_chacha20_xop - .rva .LSEH_end_chacha20_xop - .rva .LSEH_info_chacha20_xop -___ -$code.=<<___ if ($avx>1); - .rva .LSEH_begin_chacha20_avx2 - .rva .LSEH_end_chacha20_avx2 - .rva .LSEH_info_chacha20_avx2 -___ -$code.=<<___ if ($avx>2); - .rva .LSEH_begin_chacha20_avx512 - .rva .LSEH_end_chacha20_avx512 - .rva .LSEH_info_chacha20_avx512 - - .rva .LSEH_begin_chacha20_avx512vl - .rva .LSEH_end_chacha20_avx512vl - .rva .LSEH_info_chacha20_avx512vl - - .rva .LSEH_begin_chacha20_16x - .rva .LSEH_end_chacha20_16x - .rva .LSEH_info_chacha20_16x - - .rva .LSEH_begin_chacha20_8xvl - .rva .LSEH_end_chacha20_8xvl - .rva .LSEH_info_chacha20_8xvl -___ -$code.=<<___; -.section .xdata -.align 8 -.LSEH_info_chacha20_ctr32: - .byte 9,0,0,0 - .rva se_handler - -.LSEH_info_chacha20_ssse3: - .byte 9,0,0,0 - .rva simd_handler - .rva .Lssse3_body,.Lssse3_epilogue - .long 0x20,0 - -.LSEH_info_chacha20_128: - .byte 9,0,0,0 - .rva simd_handler - .rva .L128_body,.L128_epilogue - .long 0x60,0 - -.LSEH_info_chacha20_4x: - .byte 9,0,0,0 - .rva simd_handler - .rva .L4x_body,.L4x_epilogue - .long 0xa0,0 -___ -$code.=<<___ if ($avx); 
-.LSEH_info_chacha20_xop: - .byte 9,0,0,0 - .rva simd_handler - .rva .L4xop_body,.L4xop_epilogue # HandlerData[] - .long 0xa0,0 -___ -$code.=<<___ if ($avx>1); -.LSEH_info_chacha20_avx2: - .byte 9,0,0,0 - .rva simd_handler - .rva .L8x_body,.L8x_epilogue # HandlerData[] - .long 0xa0,0 -___ -$code.=<<___ if ($avx>2); -.LSEH_info_chacha20_avx512: - .byte 9,0,0,0 - .rva simd_handler - .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[] - .long 0x20,0 - -.LSEH_info_chacha20_avx512vl: - .byte 9,0,0,0 - .rva simd_handler - .rva .Lavx512vl_body,.Lavx512vl_epilogue # HandlerData[] - .long 0x20,0 - -.LSEH_info_chacha20_16x: - .byte 9,0,0,0 - .rva simd_handler - .rva .L16x_body,.L16x_epilogue # HandlerData[] - .long 0xa0,0 - -.LSEH_info_chacha20_8xvl: - .byte 9,0,0,0 - .rva simd_handler - .rva .L8xvl_body,.L8xvl_epilogue # HandlerData[] - .long 0xa0,0 -___ -} - -open SELF,$0; -while(<SELF>) { - next if (/^#!/); - last if (!s/^#/\/\// and !/^$/); - print; -} -close SELF; - -foreach (split("\n",$code)) { - s/\`([^\`]*)\`/eval $1/ge; - - s/%x#%[yz]/%x/g; # "down-shift" - - if ($kernel) { - s/(^\.type.*),[0-9]+$/\1/; - next if /^\.cfi.*/; - } - - print $_,"\n"; -} - -close STDOUT; diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20.c b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20.c deleted file mode 100644 index b78f19975b1d..000000000000 --- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20.c +++ /dev/null @@ -1,238 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - * - * Implementation of the ChaCha20 stream cipher. - * - * Information: https://cr.yp.to/chacha.html - */ - -#include <zinc/chacha20.h> -#include "../selftest/run.h" -#define IS_ENABLED_CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 1 - -#define IS_ENABLED_CONFIG_64BIT (sizeof(void*) == 8) - -void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int len) -{ - int relalign = 0; - - if (!IS_ENABLED_CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) { - int size = sizeof(unsigned long); - int d = (((unsigned long)dst ^ (unsigned long)src1) | - ((unsigned long)dst ^ (unsigned long)src2)) & - (size - 1); - - relalign = d ? 1 << ffs(d) : size; - - /* - * If we care about alignment, process as many bytes as - * needed to advance dst and src to values whose alignments - * equal their relative alignment. This will allow us to - * process the remainder of the input using optimal strides. 
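The alignment comment in the deleted __crypto_xor above is the crux of that routine: compute the relative misalignment of the three pointers once, XOR byte-by-byte only until they reach a mutually aligned boundary, then do the bulk of the work in word-sized strides. A minimal standalone sketch of that idea, assuming POSIX ffs() from <strings.h>; names here are illustrative, not the driver's API, and the original also falls back to 4- and 2-byte strides that this sketch omits:

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>
	#include <strings.h>		/* ffs() */

	static void
	xor_cpy_sketch(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
	    size_t len)
	{
		const size_t word = sizeof(unsigned long);
		size_t d = (((uintptr_t)dst ^ (uintptr_t)src1) |
		    ((uintptr_t)dst ^ (uintptr_t)src2)) & (word - 1);
		/* ffs() is 1-based: mutual alignment is the lowest differing bit. */
		size_t relalign = d ? (size_t)1 << (ffs((int)d) - 1) : word;

		/* Advance to a mutually aligned boundary one byte at a time. */
		while (((uintptr_t)dst & (relalign - 1)) != 0 && len > 0) {
			*dst++ = *src1++ ^ *src2++;
			len--;
		}
		/* Bulk of the work at the full word stride. */
		while (relalign == word && len >= word) {
			unsigned long a, b;

			memcpy(&a, src1, word);
			memcpy(&b, src2, word);
			a ^= b;
			memcpy(dst, &a, word);
			dst += word; src1 += word; src2 += word; len -= word;
		}
		while (len-- > 0)
			*dst++ = *src1++ ^ *src2++;
	}

The memcpy word loads sidestep the strict-aliasing and unaligned-access pitfalls that the kernel code handles by other means.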
- */ - while (((unsigned long)dst & (relalign - 1)) && len > 0) { - *dst++ = *src1++ ^ *src2++; - len--; - } - } - - while (IS_ENABLED(CONFIG_64BIT) && len >= 8 && !(relalign & 7)) { - *(u64 *)dst = *(const u64 *)src1 ^ *(const u64 *)src2; - dst += 8; - src1 += 8; - src2 += 8; - len -= 8; - } - - while (len >= 4 && !(relalign & 3)) { - *(u32 *)dst = *(const u32 *)src1 ^ *(const u32 *)src2; - dst += 4; - src1 += 4; - src2 += 4; - len -= 4; - } - - while (len >= 2 && !(relalign & 1)) { - *(u16 *)dst = *(const u16 *)src1 ^ *(const u16 *)src2; - dst += 2; - src1 += 2; - src2 += 2; - len -= 2; - } - - while (len--) - *dst++ = *src1++ ^ *src2++; -} - -#if defined(CONFIG_ZINC_ARCH_X86_64) -#include "chacha20-x86_64-glue.c" -#elif defined(CONFIG_ZINC_ARCH_ARM) || defined(CONFIG_ZINC_ARCH_ARM64) -#include "chacha20-arm-glue.c" -#elif defined(CONFIG_ZINC_ARCH_MIPS) -#include "chacha20-mips-glue.c" -#else -static bool *const chacha20_nobs[] __initconst = { }; -static void __init chacha20_fpu_init(void) -{ -} -static inline bool chacha20_arch(struct chacha20_ctx *ctx, u8 *dst, - const u8 *src, size_t len, - simd_context_t *simd_context) -{ - return false; -} -static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS], - const u8 nonce[HCHACHA20_NONCE_SIZE], - const u8 key[HCHACHA20_KEY_SIZE], - simd_context_t *simd_context) -{ - return false; -} -#endif - -#define QUARTER_ROUND(x, a, b, c, d) ( \ - x[a] += x[b], \ - x[d] = rol32((x[d] ^ x[a]), 16), \ - x[c] += x[d], \ - x[b] = rol32((x[b] ^ x[c]), 12), \ - x[a] += x[b], \ - x[d] = rol32((x[d] ^ x[a]), 8), \ - x[c] += x[d], \ - x[b] = rol32((x[b] ^ x[c]), 7) \ -) - -#define C(i, j) (i * 4 + j) - -#define DOUBLE_ROUND(x) ( \ - /* Column Round */ \ - QUARTER_ROUND(x, C(0, 0), C(1, 0), C(2, 0), C(3, 0)), \ - QUARTER_ROUND(x, C(0, 1), C(1, 1), C(2, 1), C(3, 1)), \ - QUARTER_ROUND(x, C(0, 2), C(1, 2), C(2, 2), C(3, 2)), \ - QUARTER_ROUND(x, C(0, 3), C(1, 3), C(2, 3), C(3, 3)), \ - /* Diagonal Round */ \ - QUARTER_ROUND(x, C(0, 0), C(1, 1), C(2, 2), C(3, 3)), \ - QUARTER_ROUND(x, C(0, 1), C(1, 2), C(2, 3), C(3, 0)), \ - QUARTER_ROUND(x, C(0, 2), C(1, 3), C(2, 0), C(3, 1)), \ - QUARTER_ROUND(x, C(0, 3), C(1, 0), C(2, 1), C(3, 2)) \ -) - -#define TWENTY_ROUNDS(x) ( \ - DOUBLE_ROUND(x), \ - DOUBLE_ROUND(x), \ - DOUBLE_ROUND(x), \ - DOUBLE_ROUND(x), \ - DOUBLE_ROUND(x), \ - DOUBLE_ROUND(x), \ - DOUBLE_ROUND(x), \ - DOUBLE_ROUND(x), \ - DOUBLE_ROUND(x), \ - DOUBLE_ROUND(x) \ -) - -static void chacha20_block_generic(struct chacha20_ctx *ctx, __le32 *stream) -{ - u32 x[CHACHA20_BLOCK_WORDS]; - int i; - - for (i = 0; i < ARRAY_SIZE(x); ++i) - x[i] = ctx->state[i]; - - TWENTY_ROUNDS(x); - - for (i = 0; i < ARRAY_SIZE(x); ++i) - stream[i] = cpu_to_le32(x[i] + ctx->state[i]); - - ctx->counter[0] += 1; -} - -static void chacha20_generic(struct chacha20_ctx *ctx, u8 *out, const u8 *in, - u32 len) -{ - __le32 buf[CHACHA20_BLOCK_WORDS]; - - while (len >= CHACHA20_BLOCK_SIZE) { - chacha20_block_generic(ctx, buf); - crypto_xor_cpy(out, in, (u8 *)buf, CHACHA20_BLOCK_SIZE); - len -= CHACHA20_BLOCK_SIZE; - out += CHACHA20_BLOCK_SIZE; - in += CHACHA20_BLOCK_SIZE; - } - if (len) { - chacha20_block_generic(ctx, buf); - crypto_xor_cpy(out, in, (u8 *)buf, len); - } -} - -void chacha20(struct chacha20_ctx *ctx, u8 *dst, const u8 *src, u32 len, - simd_context_t *simd_context) -{ - if (!chacha20_arch(ctx, dst, src, len, simd_context)) - chacha20_generic(ctx, dst, src, len); -} -EXPORT_SYMBOL(chacha20); - -static void hchacha20_generic(u32 derived_key[CHACHA20_KEY_WORDS], - 
const u8 nonce[HCHACHA20_NONCE_SIZE], - const u8 key[HCHACHA20_KEY_SIZE]) -{ - u32 x[] = { CHACHA20_CONSTANT_EXPA, - CHACHA20_CONSTANT_ND_3, - CHACHA20_CONSTANT_2_BY, - CHACHA20_CONSTANT_TE_K, - get_unaligned_le32(key + 0), - get_unaligned_le32(key + 4), - get_unaligned_le32(key + 8), - get_unaligned_le32(key + 12), - get_unaligned_le32(key + 16), - get_unaligned_le32(key + 20), - get_unaligned_le32(key + 24), - get_unaligned_le32(key + 28), - get_unaligned_le32(nonce + 0), - get_unaligned_le32(nonce + 4), - get_unaligned_le32(nonce + 8), - get_unaligned_le32(nonce + 12) - }; - - TWENTY_ROUNDS(x); - - memcpy(derived_key + 0, x + 0, sizeof(u32) * 4); - memcpy(derived_key + 4, x + 12, sizeof(u32) * 4); -} - -/* Derived key should be 32-bit aligned */ -void hchacha20(u32 derived_key[CHACHA20_KEY_WORDS], - const u8 nonce[HCHACHA20_NONCE_SIZE], - const u8 key[HCHACHA20_KEY_SIZE], simd_context_t *simd_context) -{ - if (!hchacha20_arch(derived_key, nonce, key, simd_context)) - hchacha20_generic(derived_key, nonce, key); -} -EXPORT_SYMBOL(hchacha20); - -#include "../selftest/chacha20.c" - -static bool nosimd __initdata = false; - -#ifndef COMPAT_ZINC_IS_A_MODULE -int __init chacha20_mod_init(void) -#else -static int __init mod_init(void) -#endif -{ - if (!nosimd) - chacha20_fpu_init(); - if (!selftest_run("chacha20", chacha20_selftest, chacha20_nobs, - ARRAY_SIZE(chacha20_nobs))) - return -ENOTRECOVERABLE; - return 0; -} - -#ifdef COMPAT_ZINC_IS_A_MODULE -static void __exit mod_exit(void) -{ -} - -module_init(mod_init); -module_exit(mod_exit); -#endif diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20poly1305.c b/sys/dev/if_wg/module/crypto/zinc/chacha20poly1305.c deleted file mode 100644 index 701666c78eb8..000000000000 --- a/sys/dev/if_wg/module/crypto/zinc/chacha20poly1305.c +++ /dev/null @@ -1,196 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - * - * This is an implementation of the ChaCha20Poly1305 AEAD construction. 
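As that header says, the construction is RFC 8439: the one-time Poly1305 key comes from the first ChaCha20 keystream block, and the tag is computed over the associated data and the ciphertext, each zero-padded to a 16-byte boundary, followed by both lengths as little-endian 64-bit words. A small sketch of just that transcript layout, with hypothetical names and no actual crypto, to make the padding rules in __chacha20poly1305_encrypt below easier to follow (it assumes 'out' is large enough and a little-endian host for the length encoding):

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/*
	 * Lays out the Poly1305 input used by the RFC 8439 AEAD:
	 *   AD || pad to 16 || ciphertext || pad to 16 ||
	 *   le64(ad_len) || le64(ct_len)
	 */
	static size_t
	aead_mac_transcript(uint8_t *out, const uint8_t *ad, size_t ad_len,
	    const uint8_t *ct, size_t ct_len)
	{
		uint64_t lens[2] = { ad_len, ct_len };
		size_t off = 0;

		memcpy(out + off, ad, ad_len);
		off += ad_len;
		memset(out + off, 0, (0x10 - ad_len) & 0xf);
		off += (0x10 - ad_len) & 0xf;
		memcpy(out + off, ct, ct_len);
		off += ct_len;
		memset(out + off, 0, (0x10 - ct_len) & 0xf);
		off += (0x10 - ct_len) & 0xf;
		memcpy(out + off, lens, sizeof(lens));
		off += sizeof(lens);
		return (off);
	}

The (0x10 - len) & 0xf idiom is the same one the deleted code feeds to poly1305_update with the pad0 block.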
- * - * Information: https://tools.ietf.org/html/rfc8439 - */ - -#include <sys/support.h> -#include <zinc/chacha20poly1305.h> -#include <zinc/chacha20.h> -#include <zinc/poly1305.h> -#include "selftest/run.h" - -static const u8 pad0[CHACHA20_BLOCK_SIZE] = { 0 }; - -static inline void -__chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, - const u8 *ad, const size_t ad_len, const u64 nonce, - const u8 key[CHACHA20POLY1305_KEY_SIZE], - simd_context_t *simd_context) -{ - struct poly1305_ctx poly1305_state; - struct chacha20_ctx chacha20_state; - union { - u8 block0[POLY1305_KEY_SIZE]; - __le64 lens[2]; - } b = { { 0 } }; - - chacha20_init(&chacha20_state, key, nonce); - chacha20(&chacha20_state, b.block0, b.block0, sizeof(b.block0), - simd_context); - poly1305_init(&poly1305_state, b.block0); - - poly1305_update(&poly1305_state, ad, ad_len, simd_context); - poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf, - simd_context); - - chacha20(&chacha20_state, dst, src, src_len, simd_context); - - poly1305_update(&poly1305_state, dst, src_len, simd_context); - poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf, - simd_context); - - b.lens[0] = cpu_to_le64(ad_len); - b.lens[1] = cpu_to_le64(src_len); - poly1305_update(&poly1305_state, (u8 *)b.lens, sizeof(b.lens), - simd_context); - - poly1305_final(&poly1305_state, dst + src_len, simd_context); - - memzero_explicit(&chacha20_state, sizeof(chacha20_state)); - memzero_explicit(&b, sizeof(b)); -} - -void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, - const u8 *ad, const size_t ad_len, - const u64 nonce, - const u8 key[CHACHA20POLY1305_KEY_SIZE]) -{ - simd_context_t simd_context; - - simd_get(&simd_context); - __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, nonce, key, - &simd_context); - simd_put(&simd_context); -} -EXPORT_SYMBOL(chacha20poly1305_encrypt); -static inline bool -__chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, - const u8 *ad, const size_t ad_len, const u64 nonce, - const u8 key[CHACHA20POLY1305_KEY_SIZE], - simd_context_t *simd_context) -{ - struct poly1305_ctx poly1305_state; - struct chacha20_ctx chacha20_state; - int ret; - size_t dst_len; - union { - u8 block0[POLY1305_KEY_SIZE]; - u8 mac[POLY1305_MAC_SIZE]; - __le64 lens[2]; - } b = { { 0 } }; - - if (unlikely(src_len < POLY1305_MAC_SIZE)) { - printf("src_len too short\n"); - return false; - } - - chacha20_init(&chacha20_state, key, nonce); - chacha20(&chacha20_state, b.block0, b.block0, sizeof(b.block0), - simd_context); - poly1305_init(&poly1305_state, b.block0); - - poly1305_update(&poly1305_state, ad, ad_len, simd_context); - poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf, - simd_context); - - dst_len = src_len - POLY1305_MAC_SIZE; - poly1305_update(&poly1305_state, src, dst_len, simd_context); - poly1305_update(&poly1305_state, pad0, (0x10 - dst_len) & 0xf, - simd_context); - - b.lens[0] = cpu_to_le64(ad_len); - b.lens[1] = cpu_to_le64(dst_len); - poly1305_update(&poly1305_state, (u8 *)b.lens, sizeof(b.lens), - simd_context); - - poly1305_final(&poly1305_state, b.mac, simd_context); - - ret = crypto_memneq(b.mac, src + dst_len, POLY1305_MAC_SIZE); - if (likely(!ret)) - chacha20(&chacha20_state, dst, src, dst_len, simd_context); - else { - printf("calculated: %16D\n", b.mac, ""); - printf("sent : %16D\n", src + dst_len, ""); - } - memzero_explicit(&chacha20_state, sizeof(chacha20_state)); - memzero_explicit(&b, sizeof(b)); - - return !ret; -} - -bool 
chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, - const u8 *ad, const size_t ad_len, - const u64 nonce, - const u8 key[CHACHA20POLY1305_KEY_SIZE]) -{ - simd_context_t simd_context; - bool ret; - - simd_get(&simd_context); - ret = __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len, nonce, - key, &simd_context); - simd_put(&simd_context); - return ret; -} -EXPORT_SYMBOL(chacha20poly1305_decrypt); - -void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, - const u8 *ad, const size_t ad_len, - const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE], - const u8 key[CHACHA20POLY1305_KEY_SIZE]) -{ - simd_context_t simd_context; - u32 derived_key[CHACHA20_KEY_WORDS] __aligned(16); - - simd_get(&simd_context); - hchacha20(derived_key, nonce, key, &simd_context); - cpu_to_le32_array(derived_key, ARRAY_SIZE(derived_key)); - __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, - get_unaligned_le64(nonce + 16), - (u8 *)derived_key, &simd_context); - memzero_explicit(derived_key, CHACHA20POLY1305_KEY_SIZE); - simd_put(&simd_context); -} -EXPORT_SYMBOL(xchacha20poly1305_encrypt); - -bool xchacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, - const u8 *ad, const size_t ad_len, - const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE], - const u8 key[CHACHA20POLY1305_KEY_SIZE]) -{ - bool ret; - simd_context_t simd_context; - u32 derived_key[CHACHA20_KEY_WORDS] __aligned(16); - - simd_get(&simd_context); - hchacha20(derived_key, nonce, key, &simd_context); - cpu_to_le32_array(derived_key, ARRAY_SIZE(derived_key)); - ret = __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len, - get_unaligned_le64(nonce + 16), - (u8 *)derived_key, &simd_context); - memzero_explicit(derived_key, CHACHA20POLY1305_KEY_SIZE); - simd_put(&simd_context); - return ret; -} -EXPORT_SYMBOL(xchacha20poly1305_decrypt); - -#include "selftest/chacha20poly1305.c" - -static int __init mod_init(void) -{ - if (!selftest_run("chacha20poly1305", chacha20poly1305_selftest, - NULL, 0)) - return -ENOTRECOVERABLE; - return 0; -} - -static void __exit mod_exit(void) -{ -} - -module_init(mod_init); -module_exit(mod_exit); diff --git a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm-glue.c b/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm-glue.c deleted file mode 100644 index 291fe4ba98b0..000000000000 --- a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm-glue.c +++ /dev/null @@ -1,140 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 
- */ - -#include <asm/hwcap.h> -#include <asm/neon.h> - -asmlinkage void poly1305_init_arm(void *ctx, const u8 key[16]); -asmlinkage void poly1305_blocks_arm(void *ctx, const u8 *inp, const size_t len, - const u32 padbit); -asmlinkage void poly1305_emit_arm(void *ctx, u8 mac[16], const u32 nonce[4]); -asmlinkage void poly1305_blocks_neon(void *ctx, const u8 *inp, const size_t len, - const u32 padbit); -asmlinkage void poly1305_emit_neon(void *ctx, u8 mac[16], const u32 nonce[4]); - -static bool poly1305_use_neon __ro_after_init; -static bool *const poly1305_nobs[] __initconst = { &poly1305_use_neon }; - -static void __init poly1305_fpu_init(void) -{ -#if defined(CONFIG_ZINC_ARCH_ARM64) - poly1305_use_neon = cpu_have_named_feature(ASIMD); -#elif defined(CONFIG_ZINC_ARCH_ARM) - poly1305_use_neon = elf_hwcap & HWCAP_NEON; -#endif -} - -#if defined(CONFIG_ZINC_ARCH_ARM64) -struct poly1305_arch_internal { - union { - u32 h[5]; - struct { - u64 h0, h1, h2; - }; - }; - u64 is_base2_26; - u64 r[2]; -}; -#elif defined(CONFIG_ZINC_ARCH_ARM) -struct poly1305_arch_internal { - union { - u32 h[5]; - struct { - u64 h0, h1; - u32 h2; - } __packed; - }; - u32 r[4]; - u32 is_base2_26; -}; -#endif - -/* The NEON code uses base 2^26, while the scalar code uses base 2^64 on 64-bit - * and base 2^32 on 32-bit. If we hit the unfortunate situation of using NEON - * and then having to go back to scalar -- because the user is silly and has - * called the update function from two separate contexts -- then we need to - * convert back to the original base before proceeding. The below function is - * written for 64-bit integers, and so we have to swap words at the end on - * big-endian 32-bit. It is possible to reason that the initial reduction below - * is sufficient given the implementation invariants. However, for an avoidance - * of doubt and because this is not performance critical, we do the full - * reduction anyway. - */ -static void convert_to_base2_64(void *ctx) -{ - struct poly1305_arch_internal *state = ctx; - u32 cy; - - if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !state->is_base2_26) - return; - - cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; - cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; - cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; - cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; - state->h0 = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; - state->h1 = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); - state->h2 = state->h[4] >> 24; - if (IS_ENABLED(CONFIG_ZINC_ARCH_ARM) && IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) { - state->h0 = rol64(state->h0, 32); - state->h1 = rol64(state->h1, 32); - } -#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) - cy = (state->h2 >> 2) + (state->h2 & ~3ULL); - state->h2 &= 3; - state->h0 += cy; - state->h1 += (cy = ULT(state->h0, cy)); - state->h2 += ULT(state->h1, cy); -#undef ULT - state->is_base2_26 = 0; -} - -static inline bool poly1305_init_arch(void *ctx, - const u8 key[POLY1305_KEY_SIZE]) -{ - poly1305_init_arm(ctx, key); - return true; -} - -static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp, - size_t len, const u32 padbit, - simd_context_t *simd_context) -{ - /* SIMD disables preemption, so relax after processing each page. 
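The convert_to_base2_64 routine above repacks five base-2^26 limbs into the 64-bit representation the scalar code expects. With h = h[0] + h[1]*2^26 + h[2]*2^52 + h[3]*2^78 + h[4]*2^104, the shift pattern follows directly: bits 0..63 land in h0, bits 64..127 in h1, and the top bits in h2. A self-contained check of that arithmetic (illustrative only, not driver code):

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Check of the limb repacking in convert_to_base2_64: the same
	 * shifts the deleted function uses, applied to five carry-
	 * propagated limbs, each below 2^26.
	 */
	int
	main(void)
	{
		uint32_t h[5] = { 0x2aaaaaa, 0x1555555, 0x0123456,
		    0x3abcdef, 0x3000001 };
		uint64_t h0 = ((uint64_t)h[2] << 52) | ((uint64_t)h[1] << 26) |
		    h[0];
		uint64_t h1 = ((uint64_t)h[4] << 40) | ((uint64_t)h[3] << 14) |
		    (h[2] >> 12);
		uint64_t h2 = h[4] >> 24;

		/* bits 0..63, 64..127 and 128.. of the same value */
		printf("h0=%016jx h1=%016jx h2=%jx\n", (uintmax_t)h0,
		    (uintmax_t)h1, (uintmax_t)h2);
		return (0);
	}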
*/ - BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE || - PAGE_SIZE % POLY1305_BLOCK_SIZE); - - if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !poly1305_use_neon || - !simd_use(simd_context)) { - convert_to_base2_64(ctx); - poly1305_blocks_arm(ctx, inp, len, padbit); - return true; - } - - for (;;) { - const size_t bytes = min_t(size_t, len, PAGE_SIZE); - - poly1305_blocks_neon(ctx, inp, bytes, padbit); - len -= bytes; - if (!len) - break; - inp += bytes; - simd_relax(simd_context); - } - return true; -} - -static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE], - const u32 nonce[4], - simd_context_t *simd_context) -{ - if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !poly1305_use_neon || - !simd_use(simd_context)) { - convert_to_base2_64(ctx); - poly1305_emit_arm(ctx, mac, nonce); - } else - poly1305_emit_neon(ctx, mac, nonce); - return true; -} diff --git a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm.pl b/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm.pl deleted file mode 100755 index 468f41b76fbd..000000000000 --- a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm.pl +++ /dev/null @@ -1,1276 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause -# -# This code is taken from the OpenSSL project but the author, Andy Polyakov, -# has relicensed it under the licenses specified in the SPDX header above. -# The original headers, including the original license headers, are -# included below for completeness. -# -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# IALU(*)/gcc-4.4 NEON -# -# ARM11xx(ARMv6) 7.78/+100% - -# Cortex-A5 6.35/+130% 3.00 -# Cortex-A8 6.25/+115% 2.36 -# Cortex-A9 5.10/+95% 2.55 -# Cortex-A15 3.85/+85% 1.25(**) -# Snapdragon S4 5.70/+100% 1.48(**) -# -# (*) this is for -march=armv6, i.e. 
with bunch of ldrb loading data; -# (**) these are trade-off results, they can be improved by ~8% but at -# the cost of 15/12% regression on Cortex-A5/A7, it's even possible -# to improve Cortex-A9 result, but then A5/A7 loose more than 20%; - -$flavour = shift; -if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } -else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -($ctx,$inp,$len,$padbit)=map("r$_",(0..3)); - -$code.=<<___; -#ifndef __KERNEL__ -# include "arm_arch.h" -#else -# define __ARM_ARCH__ __LINUX_ARM_ARCH__ -# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ -# define poly1305_init poly1305_init_arm -# define poly1305_blocks poly1305_blocks_arm -# define poly1305_emit poly1305_emit_arm -#endif - -.text -#if defined(__thumb2__) -.syntax unified -.thumb -#else -.code 32 -#endif - -.globl poly1305_emit -.globl poly1305_blocks -.globl poly1305_init -.type poly1305_init,%function -.align 5 -poly1305_init: -.Lpoly1305_init: - stmdb sp!,{r4-r11} - - eor r3,r3,r3 - cmp $inp,#0 - str r3,[$ctx,#0] @ zero hash value - str r3,[$ctx,#4] - str r3,[$ctx,#8] - str r3,[$ctx,#12] - str r3,[$ctx,#16] - str r3,[$ctx,#36] @ is_base2_26 - add $ctx,$ctx,#20 - -#ifdef __thumb2__ - it eq -#endif - moveq r0,#0 - beq .Lno_key - -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - adr r11,.Lpoly1305_init - ldr r12,.LOPENSSL_armcap -#endif - ldrb r4,[$inp,#0] - mov r10,#0x0fffffff - ldrb r5,[$inp,#1] - and r3,r10,#-4 @ 0x0ffffffc - ldrb r6,[$inp,#2] - ldrb r7,[$inp,#3] - orr r4,r4,r5,lsl#8 - ldrb r5,[$inp,#4] - orr r4,r4,r6,lsl#16 - ldrb r6,[$inp,#5] - orr r4,r4,r7,lsl#24 - ldrb r7,[$inp,#6] - and r4,r4,r10 - -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - ldr r12,[r11,r12] @ OPENSSL_armcap_P -# ifdef __APPLE__ - ldr r12,[r12] -# endif -#endif - ldrb r8,[$inp,#7] - orr r5,r5,r6,lsl#8 - ldrb r6,[$inp,#8] - orr r5,r5,r7,lsl#16 - ldrb r7,[$inp,#9] - orr r5,r5,r8,lsl#24 - ldrb r8,[$inp,#10] - and r5,r5,r3 - -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - tst r12,#ARMV7_NEON @ check for NEON -# ifdef __APPLE__ - adr r9,poly1305_blocks_neon - adr r11,poly1305_blocks -# ifdef __thumb2__ - it ne -# endif - movne r11,r9 - adr r12,poly1305_emit - adr r10,poly1305_emit_neon -# ifdef __thumb2__ - it ne -# endif - movne r12,r10 -# else -# ifdef __thumb2__ - itete eq -# endif - addeq r12,r11,#(poly1305_emit-.Lpoly1305_init) - addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init) - addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init) - addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init) -# endif -# ifdef __thumb2__ - orr r12,r12,#1 @ thumb-ify address - orr r11,r11,#1 -# endif -#endif - ldrb r9,[$inp,#11] - orr r6,r6,r7,lsl#8 - ldrb r7,[$inp,#12] - orr r6,r6,r8,lsl#16 - ldrb r8,[$inp,#13] - orr r6,r6,r9,lsl#24 - ldrb r9,[$inp,#14] - and r6,r6,r3 - - ldrb r10,[$inp,#15] - orr r7,r7,r8,lsl#8 - str r4,[$ctx,#0] - orr r7,r7,r9,lsl#16 - str r5,[$ctx,#4] - orr r7,r7,r10,lsl#24 - str r6,[$ctx,#8] - and r7,r7,r3 - str r7,[$ctx,#12] -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - stmia r2,{r11,r12} @ fill functions table - mov r0,#1 -#else - mov r0,#0 -#endif -.Lno_key: - ldmia sp!,{r4-r11} -#if __ARM_ARCH__>=5 - ret @ bx lr -#else - tst lr,#1 - moveq pc,lr @ be binary compatible 
with V4, yet - bx lr @ interoperable with Thumb ISA:-) -#endif -.size poly1305_init,.-poly1305_init -___ -{ -my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12)); -my ($s1,$s2,$s3)=($r1,$r2,$r3); - -$code.=<<___; -.type poly1305_blocks,%function -.align 5 -poly1305_blocks: -.Lpoly1305_blocks: - stmdb sp!,{r3-r11,lr} - - ands $len,$len,#-16 - beq .Lno_data - - cmp $padbit,#0 - add $len,$len,$inp @ end pointer - sub sp,sp,#32 - - ldmia $ctx,{$h0-$r3} @ load context - - str $ctx,[sp,#12] @ offload stuff - mov lr,$inp - str $len,[sp,#16] - str $r1,[sp,#20] - str $r2,[sp,#24] - str $r3,[sp,#28] - b .Loop - -.Loop: -#if __ARM_ARCH__<7 - ldrb r0,[lr],#16 @ load input -# ifdef __thumb2__ - it hi -# endif - addhi $h4,$h4,#1 @ 1<<128 - ldrb r1,[lr,#-15] - ldrb r2,[lr,#-14] - ldrb r3,[lr,#-13] - orr r1,r0,r1,lsl#8 - ldrb r0,[lr,#-12] - orr r2,r1,r2,lsl#16 - ldrb r1,[lr,#-11] - orr r3,r2,r3,lsl#24 - ldrb r2,[lr,#-10] - adds $h0,$h0,r3 @ accumulate input - - ldrb r3,[lr,#-9] - orr r1,r0,r1,lsl#8 - ldrb r0,[lr,#-8] - orr r2,r1,r2,lsl#16 - ldrb r1,[lr,#-7] - orr r3,r2,r3,lsl#24 - ldrb r2,[lr,#-6] - adcs $h1,$h1,r3 - - ldrb r3,[lr,#-5] - orr r1,r0,r1,lsl#8 - ldrb r0,[lr,#-4] - orr r2,r1,r2,lsl#16 - ldrb r1,[lr,#-3] - orr r3,r2,r3,lsl#24 - ldrb r2,[lr,#-2] - adcs $h2,$h2,r3 - - ldrb r3,[lr,#-1] - orr r1,r0,r1,lsl#8 - str lr,[sp,#8] @ offload input pointer - orr r2,r1,r2,lsl#16 - add $s1,$r1,$r1,lsr#2 - orr r3,r2,r3,lsl#24 -#else - ldr r0,[lr],#16 @ load input -# ifdef __thumb2__ - it hi -# endif - addhi $h4,$h4,#1 @ padbit - ldr r1,[lr,#-12] - ldr r2,[lr,#-8] - ldr r3,[lr,#-4] -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif - adds $h0,$h0,r0 @ accumulate input - str lr,[sp,#8] @ offload input pointer - adcs $h1,$h1,r1 - add $s1,$r1,$r1,lsr#2 - adcs $h2,$h2,r2 -#endif - add $s2,$r2,$r2,lsr#2 - adcs $h3,$h3,r3 - add $s3,$r3,$r3,lsr#2 - - umull r2,r3,$h1,$r0 - adc $h4,$h4,#0 - umull r0,r1,$h0,$r0 - umlal r2,r3,$h4,$s1 - umlal r0,r1,$h3,$s1 - ldr $r1,[sp,#20] @ reload $r1 - umlal r2,r3,$h2,$s3 - umlal r0,r1,$h1,$s3 - umlal r2,r3,$h3,$s2 - umlal r0,r1,$h2,$s2 - umlal r2,r3,$h0,$r1 - str r0,[sp,#0] @ future $h0 - mul r0,$s2,$h4 - ldr $r2,[sp,#24] @ reload $r2 - adds r2,r2,r1 @ d1+=d0>>32 - eor r1,r1,r1 - adc lr,r3,#0 @ future $h2 - str r2,[sp,#4] @ future $h1 - - mul r2,$s3,$h4 - eor r3,r3,r3 - umlal r0,r1,$h3,$s3 - ldr $r3,[sp,#28] @ reload $r3 - umlal r2,r3,$h3,$r0 - umlal r0,r1,$h2,$r0 - umlal r2,r3,$h2,$r1 - umlal r0,r1,$h1,$r1 - umlal r2,r3,$h1,$r2 - umlal r0,r1,$h0,$r2 - umlal r2,r3,$h0,$r3 - ldr $h0,[sp,#0] - mul $h4,$r0,$h4 - ldr $h1,[sp,#4] - - adds $h2,lr,r0 @ d2+=d1>>32 - ldr lr,[sp,#8] @ reload input pointer - adc r1,r1,#0 - adds $h3,r2,r1 @ d3+=d2>>32 - ldr r0,[sp,#16] @ reload end pointer - adc r3,r3,#0 - add $h4,$h4,r3 @ h4+=d3>>32 - - and r1,$h4,#-4 - and $h4,$h4,#3 - add r1,r1,r1,lsr#2 @ *=5 - adds $h0,$h0,r1 - adcs $h1,$h1,#0 - adcs $h2,$h2,#0 - adcs $h3,$h3,#0 - adc $h4,$h4,#0 - - cmp r0,lr @ done yet? 
- bhi .Loop - - ldr $ctx,[sp,#12] - add sp,sp,#32 - stmia $ctx,{$h0-$h4} @ store the result - -.Lno_data: -#if __ARM_ARCH__>=5 - ldmia sp!,{r3-r11,pc} -#else - ldmia sp!,{r3-r11,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) -#endif -.size poly1305_blocks,.-poly1305_blocks -___ -} -{ -my ($ctx,$mac,$nonce)=map("r$_",(0..2)); -my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11)); -my $g4=$h4; - -$code.=<<___; -.type poly1305_emit,%function -.align 5 -poly1305_emit: - stmdb sp!,{r4-r11} -.Lpoly1305_emit_enter: - - ldmia $ctx,{$h0-$h4} - adds $g0,$h0,#5 @ compare to modulus - adcs $g1,$h1,#0 - adcs $g2,$h2,#0 - adcs $g3,$h3,#0 - adc $g4,$h4,#0 - tst $g4,#4 @ did it carry/borrow? - -#ifdef __thumb2__ - it ne -#endif - movne $h0,$g0 - ldr $g0,[$nonce,#0] -#ifdef __thumb2__ - it ne -#endif - movne $h1,$g1 - ldr $g1,[$nonce,#4] -#ifdef __thumb2__ - it ne -#endif - movne $h2,$g2 - ldr $g2,[$nonce,#8] -#ifdef __thumb2__ - it ne -#endif - movne $h3,$g3 - ldr $g3,[$nonce,#12] - - adds $h0,$h0,$g0 - adcs $h1,$h1,$g1 - adcs $h2,$h2,$g2 - adc $h3,$h3,$g3 - -#if __ARM_ARCH__>=7 -# ifdef __ARMEB__ - rev $h0,$h0 - rev $h1,$h1 - rev $h2,$h2 - rev $h3,$h3 -# endif - str $h0,[$mac,#0] - str $h1,[$mac,#4] - str $h2,[$mac,#8] - str $h3,[$mac,#12] -#else - strb $h0,[$mac,#0] - mov $h0,$h0,lsr#8 - strb $h1,[$mac,#4] - mov $h1,$h1,lsr#8 - strb $h2,[$mac,#8] - mov $h2,$h2,lsr#8 - strb $h3,[$mac,#12] - mov $h3,$h3,lsr#8 - - strb $h0,[$mac,#1] - mov $h0,$h0,lsr#8 - strb $h1,[$mac,#5] - mov $h1,$h1,lsr#8 - strb $h2,[$mac,#9] - mov $h2,$h2,lsr#8 - strb $h3,[$mac,#13] - mov $h3,$h3,lsr#8 - - strb $h0,[$mac,#2] - mov $h0,$h0,lsr#8 - strb $h1,[$mac,#6] - mov $h1,$h1,lsr#8 - strb $h2,[$mac,#10] - mov $h2,$h2,lsr#8 - strb $h3,[$mac,#14] - mov $h3,$h3,lsr#8 - - strb $h0,[$mac,#3] - strb $h1,[$mac,#7] - strb $h2,[$mac,#11] - strb $h3,[$mac,#15] -#endif - ldmia sp!,{r4-r11} -#if __ARM_ARCH__>=5 - ret @ bx lr -#else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) -#endif -.size poly1305_emit,.-poly1305_emit -___ -{ -my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9)); -my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14)); -my ($T0,$T1,$MASK) = map("q$_",(15,4,0)); - -my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7)); - -$code.=<<___; -#if (defined(__KERNEL__) && defined(CONFIG_KERNEL_MODE_NEON)) || (!defined(__KERNEL__) && __ARM_MAX_ARCH__>=7) -.fpu neon - -.type poly1305_init_neon,%function -.align 5 -poly1305_init_neon: -.Lpoly1305_init_neon: - ldr r4,[$ctx,#20] @ load key base 2^32 - ldr r5,[$ctx,#24] - ldr r6,[$ctx,#28] - ldr r7,[$ctx,#32] - - and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 - mov r3,r4,lsr#26 - mov r4,r5,lsr#20 - orr r3,r3,r5,lsl#6 - mov r5,r6,lsr#14 - orr r4,r4,r6,lsl#12 - mov r6,r7,lsr#8 - orr r5,r5,r7,lsl#18 - and r3,r3,#0x03ffffff - and r4,r4,#0x03ffffff - and r5,r5,#0x03ffffff - - vdup.32 $R0,r2 @ r^1 in both lanes - add r2,r3,r3,lsl#2 @ *5 - vdup.32 $R1,r3 - add r3,r4,r4,lsl#2 - vdup.32 $S1,r2 - vdup.32 $R2,r4 - add r4,r5,r5,lsl#2 - vdup.32 $S2,r3 - vdup.32 $R3,r5 - add r5,r6,r6,lsl#2 - vdup.32 $S3,r4 - vdup.32 $R4,r6 - vdup.32 $S4,r5 - - mov $zeros,#2 @ counter - -.Lsquare_neon: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - @ d4 = 
h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - - vmull.u32 $D0,$R0,${R0}[1] - vmull.u32 $D1,$R1,${R0}[1] - vmull.u32 $D2,$R2,${R0}[1] - vmull.u32 $D3,$R3,${R0}[1] - vmull.u32 $D4,$R4,${R0}[1] - - vmlal.u32 $D0,$R4,${S1}[1] - vmlal.u32 $D1,$R0,${R1}[1] - vmlal.u32 $D2,$R1,${R1}[1] - vmlal.u32 $D3,$R2,${R1}[1] - vmlal.u32 $D4,$R3,${R1}[1] - - vmlal.u32 $D0,$R3,${S2}[1] - vmlal.u32 $D1,$R4,${S2}[1] - vmlal.u32 $D3,$R1,${R2}[1] - vmlal.u32 $D2,$R0,${R2}[1] - vmlal.u32 $D4,$R2,${R2}[1] - - vmlal.u32 $D0,$R2,${S3}[1] - vmlal.u32 $D3,$R0,${R3}[1] - vmlal.u32 $D1,$R3,${S3}[1] - vmlal.u32 $D2,$R4,${S3}[1] - vmlal.u32 $D4,$R1,${R3}[1] - - vmlal.u32 $D3,$R4,${S4}[1] - vmlal.u32 $D0,$R1,${S4}[1] - vmlal.u32 $D1,$R2,${S4}[1] - vmlal.u32 $D2,$R3,${S4}[1] - vmlal.u32 $D4,$R0,${R4}[1] - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - @ and P. Schwabe - @ - @ H0>>+H1>>+H2>>+H3>>+H4 - @ H3>>+H4>>*5+H0>>+H1 - @ - @ Trivia. - @ - @ Result of multiplication of n-bit number by m-bit number is - @ n+m bits wide. However! Even though 2^n is a n+1-bit number, - @ m-bit number multiplied by 2^n is still n+m bits wide. - @ - @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, - @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit - @ one is n+1 bits wide. - @ - @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that - @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 - @ can be 27. However! In cases when their width exceeds 26 bits - @ they are limited by 2^26+2^6. This in turn means that *sum* - @ of the products with these values can still be viewed as sum - @ of 52-bit numbers as long as the amount of addends is not a - @ power of 2. For example, - @ - @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, - @ - @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or - @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than - @ 8 * (2^52) or 2^55. However, the value is then multiplied by - @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), - @ which is less than 32 * (2^52) or 2^57. And when processing - @ data we are looking at triple as many addends... - @ - @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and - @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the - @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while - @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 - @ instruction accepts 2x32-bit input and writes 2x64-bit result. - @ This means that result of reduction have to be compressed upon - @ loop wrap-around. This can be done in the process of reduction - @ to minimize amount of instructions [as well as amount of - @ 128-bit instructions, which benefits low-end processors], but - @ one has to watch for H2 (which is narrower than H0) and 5*H4 - @ not being wider than 58 bits, so that result of right shift - @ by 26 bits fits in 32 bits. This is also useful on x86, - @ because it allows to use paddd in place for paddq, which - @ benefits Atom, where paddq is ridiculously slow. 
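The width argument in that commentary is easy to verify numerically: a limb bounded by 2^26 + 2^6 squares to just over 2^52, and even the worst case it cites, 5*5 times three accumulation passes, stays near 2^59, comfortably inside the 64-bit vmlal.u32 accumulators. A quick standalone check (illustrative only):

	#include <stdint.h>
	#include <stdio.h>

	static int
	bitwidth(uint64_t v)
	{
		int n = 0;

		while (v != 0) {
			n++;
			v >>= 1;
		}
		return (n);
	}

	/*
	 * One product of two limbs bounded by 2^26 + 2^6, and the cited
	 * worst case of 5*5*3 such addends (5*H4, s = 5*r, triple
	 * accumulation while hashing).
	 */
	int
	main(void)
	{
		uint64_t limb = (1ULL << 26) + (1ULL << 6);
		uint64_t prod = limb * limb;
		uint64_t worst = 5 * 5 * 3 * prod;

		printf("product: %d bits, worst-case sum: %d bits (< 64)\n",
		    bitwidth(prod), bitwidth(worst));
		return (0);
	}

This prints 53 and 59 bits, matching the "59[!] bits" figure in the comment.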
- - vshr.u64 $T0,$D3,#26 - vmovn.i64 $D3#lo,$D3 - vshr.u64 $T1,$D0,#26 - vmovn.i64 $D0#lo,$D0 - vadd.i64 $D4,$D4,$T0 @ h3 -> h4 - vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff - vadd.i64 $D1,$D1,$T1 @ h0 -> h1 - vbic.i32 $D0#lo,#0xfc000000 - - vshrn.u64 $T0#lo,$D4,#26 - vmovn.i64 $D4#lo,$D4 - vshr.u64 $T1,$D1,#26 - vmovn.i64 $D1#lo,$D1 - vadd.i64 $D2,$D2,$T1 @ h1 -> h2 - vbic.i32 $D4#lo,#0xfc000000 - vbic.i32 $D1#lo,#0xfc000000 - - vadd.i32 $D0#lo,$D0#lo,$T0#lo - vshl.u32 $T0#lo,$T0#lo,#2 - vshrn.u64 $T1#lo,$D2,#26 - vmovn.i64 $D2#lo,$D2 - vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0 - vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 - vbic.i32 $D2#lo,#0xfc000000 - - vshr.u32 $T0#lo,$D0#lo,#26 - vbic.i32 $D0#lo,#0xfc000000 - vshr.u32 $T1#lo,$D3#lo,#26 - vbic.i32 $D3#lo,#0xfc000000 - vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 - vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 - - subs $zeros,$zeros,#1 - beq .Lsquare_break_neon - - add $tbl0,$ctx,#(48+0*9*4) - add $tbl1,$ctx,#(48+1*9*4) - - vtrn.32 $R0,$D0#lo @ r^2:r^1 - vtrn.32 $R2,$D2#lo - vtrn.32 $R3,$D3#lo - vtrn.32 $R1,$D1#lo - vtrn.32 $R4,$D4#lo - - vshl.u32 $S2,$R2,#2 @ *5 - vshl.u32 $S3,$R3,#2 - vshl.u32 $S1,$R1,#2 - vshl.u32 $S4,$R4,#2 - vadd.i32 $S2,$S2,$R2 - vadd.i32 $S1,$S1,$R1 - vadd.i32 $S3,$S3,$R3 - vadd.i32 $S4,$S4,$R4 - - vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! - vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! - vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vst1.32 {${S4}[0]},[$tbl0,:32] - vst1.32 {${S4}[1]},[$tbl1,:32] - - b .Lsquare_neon - -.align 4 -.Lsquare_break_neon: - add $tbl0,$ctx,#(48+2*4*9) - add $tbl1,$ctx,#(48+3*4*9) - - vmov $R0,$D0#lo @ r^4:r^3 - vshl.u32 $S1,$D1#lo,#2 @ *5 - vmov $R1,$D1#lo - vshl.u32 $S2,$D2#lo,#2 - vmov $R2,$D2#lo - vshl.u32 $S3,$D3#lo,#2 - vmov $R3,$D3#lo - vshl.u32 $S4,$D4#lo,#2 - vmov $R4,$D4#lo - vadd.i32 $S1,$S1,$D1#lo - vadd.i32 $S2,$S2,$D2#lo - vadd.i32 $S3,$S3,$D3#lo - vadd.i32 $S4,$S4,$D4#lo - - vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! - vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! - vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vst1.32 {${S4}[0]},[$tbl0] - vst1.32 {${S4}[1]},[$tbl1] - - ret @ bx lr -.size poly1305_init_neon,.-poly1305_init_neon - -#ifdef __KERNEL__ -.globl poly1305_blocks_neon -#endif -.type poly1305_blocks_neon,%function -.align 5 -poly1305_blocks_neon: - ldr ip,[$ctx,#36] @ is_base2_26 - ands $len,$len,#-16 - beq .Lno_data_neon - - cmp $len,#64 - bhs .Lenter_neon - tst ip,ip @ is_base2_26? - beq .Lpoly1305_blocks - -.Lenter_neon: - stmdb sp!,{r4-r7} - vstmdb sp!,{d8-d15} @ ABI specification says so - - tst ip,ip @ is_base2_26? 
- bne .Lbase2_26_neon - - stmdb sp!,{r1-r3,lr} - bl .Lpoly1305_init_neon - - ldr r4,[$ctx,#0] @ load hash value base 2^32 - ldr r5,[$ctx,#4] - ldr r6,[$ctx,#8] - ldr r7,[$ctx,#12] - ldr ip,[$ctx,#16] - - and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 - mov r3,r4,lsr#26 - veor $D0#lo,$D0#lo,$D0#lo - mov r4,r5,lsr#20 - orr r3,r3,r5,lsl#6 - veor $D1#lo,$D1#lo,$D1#lo - mov r5,r6,lsr#14 - orr r4,r4,r6,lsl#12 - veor $D2#lo,$D2#lo,$D2#lo - mov r6,r7,lsr#8 - orr r5,r5,r7,lsl#18 - veor $D3#lo,$D3#lo,$D3#lo - and r3,r3,#0x03ffffff - orr r6,r6,ip,lsl#24 - veor $D4#lo,$D4#lo,$D4#lo - and r4,r4,#0x03ffffff - mov r1,#1 - and r5,r5,#0x03ffffff - str r1,[$ctx,#36] @ is_base2_26 - - vmov.32 $D0#lo[0],r2 - vmov.32 $D1#lo[0],r3 - vmov.32 $D2#lo[0],r4 - vmov.32 $D3#lo[0],r5 - vmov.32 $D4#lo[0],r6 - adr $zeros,.Lzeros - - ldmia sp!,{r1-r3,lr} - b .Lbase2_32_neon - -.align 4 -.Lbase2_26_neon: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ load hash value - - veor $D0#lo,$D0#lo,$D0#lo - veor $D1#lo,$D1#lo,$D1#lo - veor $D2#lo,$D2#lo,$D2#lo - veor $D3#lo,$D3#lo,$D3#lo - veor $D4#lo,$D4#lo,$D4#lo - vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! - adr $zeros,.Lzeros - vld1.32 {$D4#lo[0]},[$ctx] - sub $ctx,$ctx,#16 @ rewind - -.Lbase2_32_neon: - add $in2,$inp,#32 - mov $padbit,$padbit,lsl#24 - tst $len,#31 - beq .Leven - - vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]! - vmov.32 $H4#lo[0],$padbit - sub $len,$len,#16 - add $in2,$inp,#32 - -# ifdef __ARMEB__ - vrev32.8 $H0,$H0 - vrev32.8 $H3,$H3 - vrev32.8 $H1,$H1 - vrev32.8 $H2,$H2 -# endif - vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26 - vshl.u32 $H3#lo,$H3#lo,#18 - - vsri.u32 $H3#lo,$H2#lo,#14 - vshl.u32 $H2#lo,$H2#lo,#12 - vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi - - vbic.i32 $H3#lo,#0xfc000000 - vsri.u32 $H2#lo,$H1#lo,#20 - vshl.u32 $H1#lo,$H1#lo,#6 - - vbic.i32 $H2#lo,#0xfc000000 - vsri.u32 $H1#lo,$H0#lo,#26 - vadd.i32 $H3#hi,$H3#lo,$D3#lo - - vbic.i32 $H0#lo,#0xfc000000 - vbic.i32 $H1#lo,#0xfc000000 - vadd.i32 $H2#hi,$H2#lo,$D2#lo - - vadd.i32 $H0#hi,$H0#lo,$D0#lo - vadd.i32 $H1#hi,$H1#lo,$D1#lo - - mov $tbl1,$zeros - add $tbl0,$ctx,#48 - - cmp $len,$len - b .Long_tail - -.align 4 -.Leven: - subs $len,$len,#64 - it lo - movlo $in2,$zeros - - vmov.i32 $H4,#1<<24 @ padbit, yes, always - vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] - add $inp,$inp,#64 - vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) - add $in2,$in2,#64 - itt hi - addhi $tbl1,$ctx,#(48+1*9*4) - addhi $tbl0,$ctx,#(48+3*9*4) - -# ifdef __ARMEB__ - vrev32.8 $H0,$H0 - vrev32.8 $H3,$H3 - vrev32.8 $H1,$H1 - vrev32.8 $H2,$H2 -# endif - vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 - vshl.u32 $H3,$H3,#18 - - vsri.u32 $H3,$H2,#14 - vshl.u32 $H2,$H2,#12 - - vbic.i32 $H3,#0xfc000000 - vsri.u32 $H2,$H1,#20 - vshl.u32 $H1,$H1,#6 - - vbic.i32 $H2,#0xfc000000 - vsri.u32 $H1,$H0,#26 - - vbic.i32 $H0,#0xfc000000 - vbic.i32 $H1,#0xfc000000 - - bls .Lskip_loop - - vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2 - vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 - vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 
- b .Loop_neon - -.align 5 -.Loop_neon: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - @ \___________________/ - @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - @ \___________________/ \____________________/ - @ - @ Note that we start with inp[2:3]*r^2. This is because it - @ doesn't depend on reduction in previous iteration. - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ inp[2:3]*r^2 - - vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1] - vmull.u32 $D2,$H2#hi,${R0}[1] - vadd.i32 $H0#lo,$H0#lo,$D0#lo - vmull.u32 $D0,$H0#hi,${R0}[1] - vadd.i32 $H3#lo,$H3#lo,$D3#lo - vmull.u32 $D3,$H3#hi,${R0}[1] - vmlal.u32 $D2,$H1#hi,${R1}[1] - vadd.i32 $H1#lo,$H1#lo,$D1#lo - vmull.u32 $D1,$H1#hi,${R0}[1] - - vadd.i32 $H4#lo,$H4#lo,$D4#lo - vmull.u32 $D4,$H4#hi,${R0}[1] - subs $len,$len,#64 - vmlal.u32 $D0,$H4#hi,${S1}[1] - it lo - movlo $in2,$zeros - vmlal.u32 $D3,$H2#hi,${R1}[1] - vld1.32 ${S4}[1],[$tbl1,:32] - vmlal.u32 $D1,$H0#hi,${R1}[1] - vmlal.u32 $D4,$H3#hi,${R1}[1] - - vmlal.u32 $D0,$H3#hi,${S2}[1] - vmlal.u32 $D3,$H1#hi,${R2}[1] - vmlal.u32 $D4,$H2#hi,${R2}[1] - vmlal.u32 $D1,$H4#hi,${S2}[1] - vmlal.u32 $D2,$H0#hi,${R2}[1] - - vmlal.u32 $D3,$H0#hi,${R3}[1] - vmlal.u32 $D0,$H2#hi,${S3}[1] - vmlal.u32 $D4,$H1#hi,${R3}[1] - vmlal.u32 $D1,$H3#hi,${S3}[1] - vmlal.u32 $D2,$H4#hi,${S3}[1] - - vmlal.u32 $D3,$H4#hi,${S4}[1] - vmlal.u32 $D0,$H1#hi,${S4}[1] - vmlal.u32 $D4,$H0#hi,${R4}[1] - vmlal.u32 $D1,$H2#hi,${S4}[1] - vmlal.u32 $D2,$H3#hi,${S4}[1] - - vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) - add $in2,$in2,#64 - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ (hash+inp[0:1])*r^4 and accumulate - - vmlal.u32 $D3,$H3#lo,${R0}[0] - vmlal.u32 $D0,$H0#lo,${R0}[0] - vmlal.u32 $D4,$H4#lo,${R0}[0] - vmlal.u32 $D1,$H1#lo,${R0}[0] - vmlal.u32 $D2,$H2#lo,${R0}[0] - vld1.32 ${S4}[0],[$tbl0,:32] - - vmlal.u32 $D3,$H2#lo,${R1}[0] - vmlal.u32 $D0,$H4#lo,${S1}[0] - vmlal.u32 $D4,$H3#lo,${R1}[0] - vmlal.u32 $D1,$H0#lo,${R1}[0] - vmlal.u32 $D2,$H1#lo,${R1}[0] - - vmlal.u32 $D3,$H1#lo,${R2}[0] - vmlal.u32 $D0,$H3#lo,${S2}[0] - vmlal.u32 $D4,$H2#lo,${R2}[0] - vmlal.u32 $D1,$H4#lo,${S2}[0] - vmlal.u32 $D2,$H0#lo,${R2}[0] - - vmlal.u32 $D3,$H0#lo,${R3}[0] - vmlal.u32 $D0,$H2#lo,${S3}[0] - vmlal.u32 $D4,$H1#lo,${R3}[0] - vmlal.u32 $D1,$H3#lo,${S3}[0] - vmlal.u32 $D3,$H4#lo,${S4}[0] - - vmlal.u32 $D2,$H4#lo,${S3}[0] - vmlal.u32 $D0,$H1#lo,${S4}[0] - vmlal.u32 $D4,$H0#lo,${R4}[0] - vmov.i32 $H4,#1<<24 @ padbit, yes, always - vmlal.u32 $D1,$H2#lo,${S4}[0] - vmlal.u32 $D2,$H3#lo,${S4}[0] - - vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] - add $inp,$inp,#64 -# ifdef __ARMEB__ - vrev32.8 $H0,$H0 - vrev32.8 $H1,$H1 - vrev32.8 $H2,$H2 - vrev32.8 $H3,$H3 -# endif - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ lazy reduction interleaved with base 2^32 -> base 2^26 of - @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4. 
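The diagram in that comment is a 2-way interleaved Horner evaluation: even-numbered blocks are folded with r^2 strides in one accumulator, odd-numbered blocks in another, and the two streams only meet at the end, so each iteration's multiplies never wait on the previous iteration's reduction. A scalar toy model of the same scheduling, over a small prime rather than the real 2^130 - 5 field (purely illustrative):

	#include <stdint.h>
	#include <stdio.h>

	#define P 1000003ULL	/* toy prime standing in for 2^130 - 5 */

	/* Plain Horner: h = ((m0*r + m1)*r + ...)*r mod P. */
	static uint64_t
	horner(const uint64_t *m, int n, uint64_t r)
	{
		uint64_t h = 0;

		for (int i = 0; i < n; i++)
			h = (h + m[i]) * r % P;
		return (h);
	}

	/*
	 * 2-way interleaved version: both streams stride by r^2, the odd
	 * stream runs one power of r behind, and the final multiply by r
	 * reconciles them.  n is assumed even to keep the sketch short.
	 */
	static uint64_t
	horner2way(const uint64_t *m, int n, uint64_t r)
	{
		uint64_t r2 = r * r % P, he = 0, ho = 0;

		for (int i = 0; i < n; i += 2) {
			he = (he + m[i]) * r2 % P;	/* even blocks */
			ho = (ho * r2 + m[i + 1]) % P;	/* odd blocks */
		}
		return ((he + ho * r) % P);
	}

	int
	main(void)
	{
		uint64_t m[6] = { 11, 22, 33, 44, 55, 66 };

		printf("serial=%ju interleaved=%ju\n",
		    (uintmax_t)horner(m, 6, 123457),
		    (uintmax_t)horner2way(m, 6, 123457));
		return (0);
	}

Both calls print the same value, which is the point: the interleaved schedule computes the identical polynomial while exposing twice the multiply-level parallelism the NEON loop exploits.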
- - vshr.u64 $T0,$D3,#26 - vmovn.i64 $D3#lo,$D3 - vshr.u64 $T1,$D0,#26 - vmovn.i64 $D0#lo,$D0 - vadd.i64 $D4,$D4,$T0 @ h3 -> h4 - vbic.i32 $D3#lo,#0xfc000000 - vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 - vadd.i64 $D1,$D1,$T1 @ h0 -> h1 - vshl.u32 $H3,$H3,#18 - vbic.i32 $D0#lo,#0xfc000000 - - vshrn.u64 $T0#lo,$D4,#26 - vmovn.i64 $D4#lo,$D4 - vshr.u64 $T1,$D1,#26 - vmovn.i64 $D1#lo,$D1 - vadd.i64 $D2,$D2,$T1 @ h1 -> h2 - vsri.u32 $H3,$H2,#14 - vbic.i32 $D4#lo,#0xfc000000 - vshl.u32 $H2,$H2,#12 - vbic.i32 $D1#lo,#0xfc000000 - - vadd.i32 $D0#lo,$D0#lo,$T0#lo - vshl.u32 $T0#lo,$T0#lo,#2 - vbic.i32 $H3,#0xfc000000 - vshrn.u64 $T1#lo,$D2,#26 - vmovn.i64 $D2#lo,$D2 - vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec] - vsri.u32 $H2,$H1,#20 - vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 - vshl.u32 $H1,$H1,#6 - vbic.i32 $D2#lo,#0xfc000000 - vbic.i32 $H2,#0xfc000000 - - vshrn.u64 $T0#lo,$D0,#26 @ re-narrow - vmovn.i64 $D0#lo,$D0 - vsri.u32 $H1,$H0,#26 - vbic.i32 $H0,#0xfc000000 - vshr.u32 $T1#lo,$D3#lo,#26 - vbic.i32 $D3#lo,#0xfc000000 - vbic.i32 $D0#lo,#0xfc000000 - vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 - vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 - vbic.i32 $H1,#0xfc000000 - - bhi .Loop_neon - -.Lskip_loop: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - add $tbl1,$ctx,#(48+0*9*4) - add $tbl0,$ctx,#(48+1*9*4) - adds $len,$len,#32 - it ne - movne $len,#0 - bne .Long_tail - - vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi - vadd.i32 $H0#hi,$H0#lo,$D0#lo - vadd.i32 $H3#hi,$H3#lo,$D3#lo - vadd.i32 $H1#hi,$H1#lo,$D1#lo - vadd.i32 $H4#hi,$H4#lo,$D4#lo - -.Long_tail: - vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1 - vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2 - - vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant - vmull.u32 $D2,$H2#hi,$R0 - vadd.i32 $H0#lo,$H0#lo,$D0#lo - vmull.u32 $D0,$H0#hi,$R0 - vadd.i32 $H3#lo,$H3#lo,$D3#lo - vmull.u32 $D3,$H3#hi,$R0 - vadd.i32 $H1#lo,$H1#lo,$D1#lo - vmull.u32 $D1,$H1#hi,$R0 - vadd.i32 $H4#lo,$H4#lo,$D4#lo - vmull.u32 $D4,$H4#hi,$R0 - - vmlal.u32 $D0,$H4#hi,$S1 - vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vmlal.u32 $D3,$H2#hi,$R1 - vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vmlal.u32 $D1,$H0#hi,$R1 - vmlal.u32 $D4,$H3#hi,$R1 - vmlal.u32 $D2,$H1#hi,$R1 - - vmlal.u32 $D3,$H1#hi,$R2 - vld1.32 ${S4}[1],[$tbl1,:32] - vmlal.u32 $D0,$H3#hi,$S2 - vld1.32 ${S4}[0],[$tbl0,:32] - vmlal.u32 $D4,$H2#hi,$R2 - vmlal.u32 $D1,$H4#hi,$S2 - vmlal.u32 $D2,$H0#hi,$R2 - - vmlal.u32 $D3,$H0#hi,$R3 - it ne - addne $tbl1,$ctx,#(48+2*9*4) - vmlal.u32 $D0,$H2#hi,$S3 - it ne - addne $tbl0,$ctx,#(48+3*9*4) - vmlal.u32 $D4,$H1#hi,$R3 - vmlal.u32 $D1,$H3#hi,$S3 - vmlal.u32 $D2,$H4#hi,$S3 - - vmlal.u32 $D3,$H4#hi,$S4 - vorn $MASK,$MASK,$MASK @ all-ones, can be redundant - vmlal.u32 $D0,$H1#hi,$S4 - vshr.u64 $MASK,$MASK,#38 - vmlal.u32 $D4,$H0#hi,$R4 - vmlal.u32 $D1,$H2#hi,$S4 - vmlal.u32 $D2,$H3# |