Diffstat (limited to 'sys/netlink/route/nexthop.c')
-rw-r--r--  sys/netlink/route/nexthop.c  1123
1 file changed, 1123 insertions, 0 deletions
diff --git a/sys/netlink/route/nexthop.c b/sys/netlink/route/nexthop.c
new file mode 100644
index 000000000000..30aa3dd72534
--- /dev/null
+++ b/sys/netlink/route/nexthop.c
@@ -0,0 +1,1123 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_route.h"
+#include <sys/types.h>
+#include <sys/ck.h>
+#include <sys/epoch.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_utils.h>
+
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <netinet6/scope6_var.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_route.h>
+#include <netlink/route/route_var.h>
+
+#define DEBUG_MOD_NAME nl_nhop
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_INFO);
+
+/*
+ * This file contains the logic to maintain kernel nexthops and
+ * nexthop groups based on the data provided by the user.
+ *
+ * The kernel stores (nearly) all of the routing data in the nexthops,
+ * including the prefix-specific flags (NHF_HOST and NHF_DEFAULT).
+ *
+ * The Netlink API provides a higher-level abstraction for the user. Each
+ * user-created nexthop may map to multiple kernel nexthops.
+ *
+ * The following variations require a separate kernel nexthop to be
+ * created:
+ *  * prefix flags (NHF_HOST, NHF_DEFAULT)
+ *  * using an IPv6 gateway for IPv4 routes
+ *  * different fibnum
+ *
+ * These kernel nexthops have their lifetime bound to the lifetime of
+ * the user_nhop object. They are not collected until the user requests
+ * deletion of the created user_nhop.
+ *
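+ * For example, a user nexthop with an IPv4 gateway that is referenced by
+ * a host route in fib 0 and by the default route in fib 1 is backed by
+ * the template kernel nexthop plus one clone per fib/flags combination.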
+ */
+struct user_nhop {
+ uint32_t un_idx; /* Userland-provided index */
+ uint32_t un_fibfam; /* fibnum + af (as the highest byte) */
+ uint8_t un_protocol; /* protocol that installed the record */
+ struct nhop_object *un_nhop; /* "production" nexthop */
+ struct nhop_object *un_nhop_src; /* nexthop to copy from */
+ struct weightened_nhop *un_nhgrp_src; /* nexthops for nhg */
+ uint32_t un_nhgrp_count; /* number of nexthops */
+ struct user_nhop *un_next; /* next item in hash chain */
+ struct user_nhop *un_nextchild; /* master -> children */
+ struct epoch_context un_epoch_ctx; /* epoch ctl helper */
+};
+
+/* produce hash value for an object */
+#define unhop_hash_obj(_obj) (hash_unhop(_obj))
+/* compare two objects */
+#define unhop_cmp(_one, _two) (cmp_unhop(_one, _two))
+/* next object accessor */
+#define unhop_next(_obj) (_obj)->un_next
+
+CHT_SLIST_DEFINE(unhop, struct user_nhop);
+
+struct unhop_ctl {
+ struct unhop_head un_head;
+ struct rmlock un_lock;
+};
+#define UN_LOCK_INIT(_ctl) rm_init(&(_ctl)->un_lock, "unhop_ctl")
+#define UN_TRACKER struct rm_priotracker un_tracker
+#define UN_RLOCK(_ctl) rm_rlock(&((_ctl)->un_lock), &un_tracker)
+#define UN_RUNLOCK(_ctl) rm_runlock(&((_ctl)->un_lock), &un_tracker)
+
+#define UN_WLOCK(_ctl) rm_wlock(&(_ctl)->un_lock)
+#define UN_WUNLOCK(_ctl) rm_wunlock(&(_ctl)->un_lock)
+
+VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL;
+#define V_un_ctl VNET(un_ctl)
+
+static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size);
+static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b);
+static unsigned int hash_unhop(const struct user_nhop *obj);
+
+static void destroy_unhop(struct user_nhop *unhop);
+static struct nhop_object *clone_unhop(const struct user_nhop *unhop,
+ uint32_t fibnum, int family, int nh_flags);
+
+static int
+cmp_unhop(const struct user_nhop *a, const struct user_nhop *b)
+{
+ return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam);
+}
+
+/*
+ * Hash callback: calculate hash of an object
+ */
+static unsigned int
+hash_unhop(const struct user_nhop *obj)
+{
+ return (obj->un_idx ^ obj->un_fibfam);
+}
+
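+/*
+ * The "master" unhop is the template created directly from the userland
+ * request and is stored with un_fibfam == 0. Per-fib/family clones are
+ * linked to it via un_nextchild.
+ */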
+#define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0)
+
+/*
+ * Factory interface for creating matching kernel nexthops/nexthop groups
+ *
+ * @uidx: userland nexthop index used to create the nexthop
+ * @fibnum: fibnum the nexthop will be used in
+ * @family: upper family the nexthop will be used in
+ * @nh_flags: desired nexthop prefix flags
+ * @perror: pointer to store the error code to
+ *
+ * Returns a referenced nexthop linked to the @fibnum/@family rib on success.
+ */
+struct nhop_object *
+nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx,
+ int nh_flags, int *perror)
+{
+ struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
+ UN_TRACKER;
+
+ if (__predict_false(ctl == NULL)) {
+ *perror = ESRCH;
+ return (NULL);
+ }
+
+ struct user_nhop key = {
+ .un_idx = uidx,
+ .un_fibfam = fibnum | ((uint32_t)family) << 24,
+ };
+ struct user_nhop *unhop;
+
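+ /* Only the prefix-specific flags participate in nexthop cloning */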
+ nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT);
+
+ if (__predict_false(family == 0)) {
+ *perror = EINVAL;
+ return (NULL);
+ }
+
+ UN_RLOCK(ctl);
+ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
+ if (unhop != NULL) {
+ struct nhop_object *nh = unhop->un_nhop;
+ UN_RUNLOCK(ctl);
+ *perror = 0;
+ nhop_ref_any(nh);
+ return (nh);
+ }
+
+ /*
+ * Exact nexthop not found. Search for template nexthop to clone from.
+ */
+ key.un_fibfam = 0;
+ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
+ if (unhop == NULL) {
+ UN_RUNLOCK(ctl);
+ *perror = ESRCH;
+ return (NULL);
+ }
+
+ UN_RUNLOCK(ctl);
+
+ /* Create entry to insert first */
+ struct user_nhop *un_new, *un_tmp;
+ un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
+ if (un_new == NULL) {
+ *perror = ENOMEM;
+ return (NULL);
+ }
+ un_new->un_idx = uidx;
+ un_new->un_fibfam = fibnum | ((uint32_t)family) << 24;
+
+ /* Relying on epoch to protect unhop here */
+ un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags);
+ if (un_new->un_nhop == NULL) {
+ free(un_new, M_NETLINK);
+ *perror = ENOMEM;
+ return (NULL);
+ }
+
+ /* Insert back and report */
+ UN_WLOCK(ctl);
+
+ /* First, find template record once again */
+ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
+ if (unhop == NULL) {
+ /* Someone deleted the nexthop during the call */
+ UN_WUNLOCK(ctl);
+ *perror = ESRCH;
+ destroy_unhop(un_new);
+ return (NULL);
+ }
+
+ /* Second, check the direct match */
+ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp);
+ struct nhop_object *nh;
+ if (un_tmp != NULL) {
+ /* Another thread already created the desired nexthop, use it */
+ nh = un_tmp->un_nhop;
+ } else {
+ /* Finally, insert the new nexthop and link it to the primary */
+ nh = un_new->un_nhop;
+ CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new);
+ un_new->un_nextchild = unhop->un_nextchild;
+ unhop->un_nextchild = un_new;
+ un_new = NULL;
+ NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh);
+ }
+
+ UN_WUNLOCK(ctl);
+
+ if (un_new != NULL)
+ destroy_unhop(un_new);
+
+ *perror = 0;
+ nhop_ref_any(nh);
+ return (nh);
+}
+
+static struct user_nhop *
+nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx)
+{
+ struct user_nhop key = { .un_idx = uidx };
+ struct user_nhop *unhop = NULL;
+ UN_TRACKER;
+
+ UN_RLOCK(ctl);
+ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
+ UN_RUNLOCK(ctl);
+
+ return (unhop);
+}
+
+#define MAX_STACK_NHOPS 4
+static struct nhop_object *
+clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags)
+{
+#ifdef ROUTE_MPATH
+ const struct weightened_nhop *wn;
+ struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS];
+ uint32_t num_nhops;
+#endif
+ struct nhop_object *nh = NULL;
+ int error;
+
+ if (unhop->un_nhop_src != NULL) {
+ IF_DEBUG_LEVEL(LOG_DEBUG2) {
+ char nhbuf[NHOP_PRINT_BUFSIZE];
+ nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf));
+ FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src,
+ "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum,
+ family, nh_flags);
+ }
+ nh = nhop_alloc(fibnum, AF_UNSPEC);
+ if (nh == NULL)
+ return (NULL);
+ nhop_copy(nh, unhop->un_nhop_src);
+ /* Check that nexthop gateway is compatible with the new family */
+ if (!nhop_set_upper_family(nh, family)) {
+ nhop_free(nh);
+ return (NULL);
+ }
+ nhop_set_uidx(nh, unhop->un_idx);
+ nhop_set_pxtype_flag(nh, nh_flags);
+ return (nhop_get_nhop(nh, &error));
+ }
+#ifdef ROUTE_MPATH
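+ /* The template is a nexthop group: clone each member for the target fib/family */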
+ wn = unhop->un_nhgrp_src;
+ num_nhops = unhop->un_nhgrp_count;
+
+ if (num_nhops > MAX_STACK_NHOPS) {
+ wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT);
+ if (wn_new == NULL)
+ return (NULL);
+ } else
+ wn_new = wn_base;
+
+ for (int i = 0; i < num_nhops; i++) {
+ uint32_t uidx = nhop_get_uidx(wn[i].nh);
+ MPASS(uidx != 0);
+ wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error);
+ if (error != 0)
+ break;
+ wn_new[i].weight = wn[i].weight;
+ }
+
+ if (error == 0) {
+ struct rib_head *rh = nhop_get_rh(wn_new[0].nh);
+ struct nhgrp_object *nhg;
+
+ error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg);
+ nh = (struct nhop_object *)nhg;
+ }
+
+ if (wn_new != wn_base)
+ free(wn_new, M_TEMP);
+#endif
+ return (nh);
+}
+
+static void
+destroy_unhop(struct user_nhop *unhop)
+{
+ if (unhop->un_nhop != NULL)
+ nhop_free_any(unhop->un_nhop);
+ if (unhop->un_nhop_src != NULL)
+ nhop_free_any(unhop->un_nhop_src);
+ if (unhop->un_nhgrp_src != NULL)
+ free(unhop->un_nhgrp_src, M_NETLINK);
+ free(unhop, M_NETLINK);
+}
+
+static void
+destroy_unhop_epoch(epoch_context_t ctx)
+{
+ struct user_nhop *unhop;
+
+ unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx);
+
+ destroy_unhop(unhop);
+}
+
+static uint32_t
+find_spare_uidx(struct unhop_ctl *ctl)
+{
+ struct user_nhop *unhop, key = {};
+ uint32_t uidx = 0;
+ UN_TRACKER;
+
+ UN_RLOCK(ctl);
+ /* With 16 random probes this finds a spare uidx in ~99/100 cases, even with 75% of the 64k range in use */
+ for (int i = 0; i < 16; i++) {
+ key.un_idx = (arc4random() % 65536) + 65536 * 4;
+ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
+ if (unhop == NULL) {
+ uidx = key.un_idx;
+ break;
+ }
+ }
+ UN_RUNLOCK(ctl);
+
+ return (uidx);
+}
+
+
+/*
+ * Actual netlink code
+ */
+struct netlink_walkargs {
+ struct nl_writer *nw;
+ struct nlmsghdr hdr;
+ struct nlpcb *so;
+ int family;
+ int error;
+ int count;
+ int dumped;
+};
+#define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem
+
+static bool
+dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr,
+ struct nl_writer *nw)
+{
+
+ if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
+ goto enomem;
+
+ struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
+ ENOMEM_IF_NULL(nhm);
+ nhm->nh_family = AF_UNSPEC;
+ nhm->nh_scope = 0;
+ nhm->nh_protocol = unhop->un_protocol;
+ nhm->nh_flags = 0;
+
+ nlattr_add_u32(nw, NHA_ID, unhop->un_idx);
+ nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH);
+
+ struct weightened_nhop *wn = unhop->un_nhgrp_src;
+ uint32_t num_nhops = unhop->un_nhgrp_count;
+ /* TODO: a better API? */
+ int nla_len = sizeof(struct nlattr);
+ nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp));
+ struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr);
+ if (nla == NULL)
+ goto enomem;
+ nla->nla_type = NHA_GROUP;
+ nla->nla_len = nla_len;
+ for (int i = 0; i < num_nhops; i++) {
+ struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i];
+ grp->id = nhop_get_uidx(wn[i].nh);
+ grp->weight = wn[i].weight;
+ grp->resvd1 = 0;
+ grp->resvd2 = 0;
+ }
+
+ if (nlmsg_end(nw))
+ return (true);
+enomem:
+ NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory");
+ nlmsg_abort(nw);
+ return (false);
+}
+
+static bool
+dump_nhop(const struct nhop_object *nh, uint32_t uidx, struct nlmsghdr *hdr,
+ struct nl_writer *nw)
+{
+ if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
+ goto enomem;
+
+ struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
+ ENOMEM_IF_NULL(nhm);
+ nhm->nh_family = nhop_get_neigh_family(nh);
+ nhm->nh_scope = 0; /* XXX: what's that? */
+ nhm->nh_protocol = nhop_get_origin(nh);
+ nhm->nh_flags = 0;
+
+ if (uidx != 0)
+ nlattr_add_u32(nw, NHA_ID, uidx);
+ if (nh->nh_flags & NHF_BLACKHOLE) {
+ nlattr_add_flag(nw, NHA_BLACKHOLE);
+ goto done;
+ }
+ nlattr_add_u32(nw, NHA_OIF, if_getindex(nh->nh_ifp));
+
+ switch (nh->gw_sa.sa_family) {
+#ifdef INET
+ case AF_INET:
+ nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr);
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ {
+ struct in6_addr addr = nh->gw6_sa.sin6_addr;
+ in6_clearscope(&addr);
+ nlattr_add(nw, NHA_GATEWAY, 16, &addr);
+ break;
+ }
+#endif
+ }
+
+ int off = nlattr_add_nested(nw, NHA_FREEBSD);
+ if (off != 0) {
+ nlattr_add_u32(nw, NHAF_AIF, if_getindex(nh->nh_aifp));
+
+ if (uidx == 0) {
+ nlattr_add_u32(nw, NHAF_KID, nhop_get_idx(nh));
+ nlattr_add_u32(nw, NHAF_FAMILY, nhop_get_upper_family(nh));
+ nlattr_add_u32(nw, NHAF_TABLE, nhop_get_fibnum(nh));
+ }
+
+ nlattr_set_len(nw, off);
+ }
+
+done:
+ if (nlmsg_end(nw))
+ return (true);
+enomem:
+ nlmsg_abort(nw);
+ return (false);
+}
+
+static void
+dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
+ struct nl_writer *nw)
+{
+ if (unhop->un_nhop_src != NULL)
+ dump_nhop(unhop->un_nhop_src, unhop->un_idx, hdr, nw);
+ else
+ dump_nhgrp(unhop, hdr, nw);
+}
+
+static int
+delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx)
+{
+ struct user_nhop *unhop_ret, *unhop_base, *unhop_chain;
+ struct nl_writer nw;
+ struct user_nhop key = { .un_idx = uidx };
+
+ UN_WLOCK(ctl);
+
+ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base);
+
+ if (unhop_base != NULL) {
+ CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret);
+ IF_DEBUG_LEVEL(LOG_DEBUG2) {
+ char nhbuf[NHOP_PRINT_BUFSIZE];
+ nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf));
+ FIB_NH_LOG(LOG_DEBUG2, unhop_base->un_nhop,
+ "removed base nhop %u: %s", uidx, nhbuf);
+ }
+ /* Unlink all child nexthops as well, keeping the chain intact */
+ unhop_chain = unhop_base->un_nextchild;
+ while (unhop_chain != NULL) {
+ CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain,
+ unhop_ret);
+ MPASS(unhop_chain == unhop_ret);
+ IF_DEBUG_LEVEL(LOG_DEBUG3) {
+ char nhbuf[NHOP_PRINT_BUFSIZE];
+ nhop_print_buf_any(unhop_chain->un_nhop,
+ nhbuf, sizeof(nhbuf));
+ FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop,
+ "removed child nhop %u: %s", uidx, nhbuf);
+ }
+ unhop_chain = unhop_chain->un_nextchild;
+ }
+ }
+
+ UN_WUNLOCK(ctl);
+
+ if (unhop_base == NULL) {
+ NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx);
+ return (ENOENT);
+ }
+
+ /* Report nexthop deletion */
+ struct netlink_walkargs wa = {
+ .hdr.nlmsg_pid = hdr->nlmsg_pid,
+ .hdr.nlmsg_seq = hdr->nlmsg_seq,
+ .hdr.nlmsg_flags = hdr->nlmsg_flags,
+ .hdr.nlmsg_type = NL_RTM_DELNEXTHOP,
+ };
+
+ if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP,
+ 0, false)) {
+ NL_LOG(LOG_DEBUG, "error allocating message writer");
+ return (ENOMEM);
+ }
+
+ dump_unhop(unhop_base, &wa.hdr, &nw);
+ nlmsg_flush(&nw);
+
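+ /* Defer freeing the removed unhops until all epoch readers are done */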
+ while (unhop_base != NULL) {
+ unhop_chain = unhop_base->un_nextchild;
+ NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx);
+ unhop_base = unhop_chain;
+ }
+
+ return (0);
+}
+
+static void
+consider_resize(struct unhop_ctl *ctl, uint32_t new_size)
+{
+ void *new_ptr = NULL;
+ size_t alloc_size;
+
+ if (new_size == 0)
+ return;
+
+ alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size);
+ new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
+ if (new_ptr == NULL)
+ return;
+
+ NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size);
+ UN_WLOCK(ctl);
+ CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size);
+ UN_WUNLOCK(ctl);
+
+ free(new_ptr, M_NETLINK);
+}
+
+static bool __noinline
+vnet_init_unhops(void)
+{
+ uint32_t num_buckets = 16;
+ size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
+
+ struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK,
+ M_NOWAIT | M_ZERO);
+ if (ctl == NULL)
+ return (false);
+
+ void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
+ if (ptr == NULL) {
+ free(ctl, M_NETLINK);
+ return (false);
+ }
+ CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets);
+ UN_LOCK_INIT(ctl);
+
+ if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) {
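+ /* Lost the initialization race: free ours and use the already-installed ctl */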
+ free(ptr, M_NETLINK);
+ free(ctl, M_NETLINK);
+ }
+
+ if (atomic_load_ptr(&V_un_ctl) == NULL)
+ return (false);
+
+ NL_LOG(LOG_NOTICE, "UNHOPS init done");
+
+ return (true);
+}
+
+static void
+vnet_destroy_unhops(const void *unused __unused)
+{
+ struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
+ struct user_nhop *unhop, *tmp;
+
+ if (ctl == NULL)
+ return;
+ V_un_ctl = NULL;
+
+ /* Wait till all unhop users finish their reads */
+ NET_EPOCH_WAIT();
+
+ UN_WLOCK(ctl);
+ CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) {
+ destroy_unhop(unhop);
+ } CHT_SLIST_FOREACH_SAFE_END;
+ UN_WUNLOCK(ctl);
+
+ free(ctl->un_head.ptr, M_NETLINK);
+ free(ctl, M_NETLINK);
+}
+VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY,
+ vnet_destroy_unhops, NULL);
+
+static int
+nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+ int error = 0;
+
+ /* Verify attribute correctness */
+ struct nexthop_grp *grp = NLA_DATA(nla);
+ int data_len = NLA_DATA_LEN(nla);
+
+ int count = data_len / sizeof(*grp);
+ if (count == 0 || (count * sizeof(*grp) != data_len)) {
+ NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len);
+ return (EINVAL);
+ }
+
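+ /* Store the raw attribute; newnhg() re-parses the member array */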
+ *((struct nlattr **)target) = nla;
+ return (error);
+}
+
+static void
+set_scope6(struct sockaddr *sa, if_t ifp)
+{
+#ifdef INET6
+ if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) {
+ struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa;
+
+ if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr))
+ in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp));
+ }
+#endif
+}
+
+struct nl_parsed_nhop {
+ uint32_t nha_id;
+ uint8_t nha_blackhole;
+ uint8_t nha_groups;
+ uint8_t nhaf_knhops;
+ uint8_t nhaf_family;
+ struct ifnet *nha_oif;
+ struct sockaddr *nha_gw;
+ struct nlattr *nha_group;
+ uint8_t nh_family;
+ uint8_t nh_protocol;
+ uint32_t nhaf_table;
+ uint32_t nhaf_kid;
+ uint32_t nhaf_aif;
+};
+
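+/* Mapping of nhmsg header fields and NHA_/NHAF_ attributes into struct nl_parsed_nhop */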
+#define _IN(_field) offsetof(struct nhmsg, _field)
+#define _OUT(_field) offsetof(struct nl_parsed_nhop, _field)
+static const struct nlattr_parser nla_p_nh_fbsd[] = {
+ { .type = NHAF_KNHOPS, .off = _OUT(nhaf_knhops), .cb = nlattr_get_flag },
+ { .type = NHAF_TABLE, .off = _OUT(nhaf_table), .cb = nlattr_get_uint32 },
+ { .type = NHAF_FAMILY, .off = _OUT(nhaf_family), .cb = nlattr_get_uint8 },
+ { .type = NHAF_KID, .off = _OUT(nhaf_kid), .cb = nlattr_get_uint32 },
+ { .type = NHAF_AIF, .off = _OUT(nhaf_aif), .cb = nlattr_get_uint32 },
+};
+NL_DECLARE_ATTR_PARSER(nh_fbsd_parser, nla_p_nh_fbsd);
+
+static const struct nlfield_parser nlf_p_nh[] = {
+ { .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 },
+ { .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 },
+};
+
+static const struct nlattr_parser nla_p_nh[] = {
+ { .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 },
+ { .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg },
+ { .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag },
+ { .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp },
+ { .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip },
+ { .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag },
+ { .type = NHA_FREEBSD, .arg = &nh_fbsd_parser, .cb = nlattr_get_nested },
+};
+#undef _IN
+#undef _OUT
+
+static bool
+post_p_nh(void *_attrs, struct nl_pstate *npt)
+{
+ struct nl_parsed_nhop *attrs = (struct nl_parsed_nhop *)_attrs;
+
+ set_scope6(attrs->nha_gw, attrs->nha_oif);
+ return (true);
+}
+NL_DECLARE_PARSER_EXT(nhmsg_parser, struct nhmsg, NULL, nlf_p_nh, nla_p_nh, post_p_nh);
+
+static bool
+eligible_nhg(const struct nhop_object *nh)
+{
+ return (nh->nh_flags & NHF_GATEWAY);
+}
+
+static int
+newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
+{
+ struct nexthop_grp *grp = NLA_DATA(attrs->nha_group);
+ int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp);
+ struct weightened_nhop *wn;
+
+ wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO);
+ if (wn == NULL)
+ return (ENOMEM);
+
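+ /* Resolve each member uidx into its template ("master") user nexthop */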
+ for (int i = 0; i < count; i++) {
+ struct user_nhop *unhop;
+ unhop = nl_find_base_unhop(ctl, grp[i].id);
+ if (unhop == NULL) {
+ NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id);
+ free(wn, M_NETLINK);
+ return (ESRCH);
+ } else if (unhop->un_nhop_src == NULL) {
+ NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported",
+ grp[i].id);
+ free(wn, M_NETLINK);
+ return (ENOTSUP);
+ } else if (!eligible_nhg(unhop->un_nhop_src)) {
+ NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible",
+ grp[i].id);
+ free(wn, M_NETLINK);
+ return (ENOTSUP);
+ }
+ /*
+ * TODO: consider more rigid eligibility checks:
+ * restrict nexthops with the same gateway
+ */
+ wn[i].nh = unhop->un_nhop_src;
+ wn[i].weight = grp[i].weight;
+ }
+ unhop->un_nhgrp_src = wn;
+ unhop->un_nhgrp_count = count;
+ return (0);
+}
+
+/*
+ * Sets the gateway of nexthop @nh to @gw.
+ * If the gateway is an IPv6 link-local address, alters @gw to embed the
+ * scope id (ifindex) of @ifp.
+ * Returns 0 on success or an errno.
+ */
+int
+nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, if_t ifp,
+ struct nl_pstate *npt)
+{
+#ifdef INET6
+ if (gw->sa_family == AF_INET6) {
+ struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)gw;
+ if (IN6_IS_ADDR_LINKLOCAL(&gw6->sin6_addr)) {
+ if (ifp == NULL) {
+ NLMSG_REPORT_ERR_MSG(npt, "interface not set");
+ return (EINVAL);
+ }
+ in6_set_unicast_scopeid(&gw6->sin6_addr, if_getindex(ifp));
+ }
+ }
+#endif
+ nhop_set_gw(nh, gw, true);
+ return (0);
+}
+
+static int
+newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt)
+{
+ struct ifaddr *ifa = NULL;
+ struct nhop_object *nh;
+ int error;
+
+ if (!attrs->nha_blackhole) {
+ if (attrs->nha_gw == NULL) {
+ NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY");
+ return (EINVAL);
+ }
+ if (attrs->nha_oif == NULL) {
+ NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF");
+ return (EINVAL);
+ }
+ ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif);
+ if (ifa == NULL) {
+ NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP");
+ return (EINVAL);
+ }
+ }
+
+ int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family;
+
+ nh = nhop_alloc(RT_DEFAULT_FIB, family);
+ if (nh == NULL) {
+ NL_LOG(LOG_DEBUG, "Unable to allocate nexthop");
+ return (ENOMEM);
+ }
+ nhop_set_uidx(nh, attrs->nha_id);
+ nhop_set_origin(nh, attrs->nh_protocol);
+
+ if (attrs->nha_blackhole)
+ nhop_set_blackhole(nh, NHF_BLACKHOLE);
+ else {
+ error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt);
+ if (error != 0) {
+ nhop_free(nh);
+ return (error);
+ }
+ nhop_set_transmit_ifp(nh, attrs->nha_oif);
+ nhop_set_src(nh, ifa);
+ }
+
+ error = nhop_get_unlinked(nh);
+ if (error != 0) {
+ NL_LOG(LOG_DEBUG, "unable to finalize nexthop");
+ return (error);
+ }
+
+ IF_DEBUG_LEVEL(LOG_DEBUG2) {
+ char nhbuf[NHOP_PRINT_BUFSIZE];
+ nhop_print_buf(nh, nhbuf, sizeof(nhbuf));
+ NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf);
+ }
+
+ unhop->un_nhop_src = nh;
+ return (0);
+}
+
+static int
+rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
+ struct nl_pstate *npt)
+{
+ struct nl_writer nw;
+ struct user_nhop *unhop;
+ int error;
+
+ if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops()))
+ return (ENOMEM);
+ struct unhop_ctl *ctl = V_un_ctl;
+
+ struct nl_parsed_nhop attrs = {};
+ error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Make sure we have a valid nha_id: nha_id == 0 requests auto-assignment,
+ * so pick a spare index.
+ */
+ if (attrs.nha_id == 0) {
+ attrs.nha_id = find_spare_uidx(ctl);
+ if (attrs.nha_id == 0) {
+ NL_LOG(LOG_DEBUG, "Unable to get spare uidx");
+ return (ENOSPC);
+ }
+ }
+
+ NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? if_getindex(attrs.nha_oif) : 0);
+
+ unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
+ if (unhop == NULL) {
+ NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop");
+ return (ENOMEM);
+ }
+ unhop->un_idx = attrs.nha_id;
+ unhop->un_protocol = attrs.nh_protocol;
+
+ if (attrs.nha_group)
+ error = newnhg(ctl, &attrs, unhop);
+ else
+ error = newnhop(&attrs, unhop, npt);
+
+ if (error != 0) {
+ free(unhop, M_NETLINK);
+ return (error);
+ }
+
+ UN_WLOCK(ctl);
+ /* Check if uidx already exists */
+ struct user_nhop *tmp = NULL;
+ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp);
+ if (tmp != NULL) {
+ UN_WUNLOCK(ctl);
+ NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id);
+ destroy_unhop(unhop);
+ return (EEXIST);
+ }
+ CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop);
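+ /* Snapshot the desired bucket count; the resize itself happens after the lock is dropped */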
+ uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head);
+ UN_WUNLOCK(ctl);
+
+ /* Report addition of the new nexthop */
+ struct netlink_walkargs wa = {
+ .hdr.nlmsg_pid = hdr->nlmsg_pid,
+ .hdr.nlmsg_seq = hdr->nlmsg_seq,
+ .hdr.nlmsg_flags = hdr->nlmsg_flags,
+ .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
+ };
+
+ if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP,
+ 0, false)) {
+ NL_LOG(LOG_DEBUG, "error allocating message writer");
+ return (ENOMEM);
+ }
+
+ dump_unhop(unhop, &wa.hdr, &nw);
+ nlmsg_flush(&nw);
+
+ consider_resize(ctl, num_buckets_new);
+
+ return (0);
+}
+
+static int
+rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
+ struct nl_pstate *npt)
+{
+ struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
+ int error;
+
+ if (__predict_false(ctl == NULL))
+ return (ESRCH);
+
+ struct nl_parsed_nhop attrs = {};
+ error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
+ if (error != 0)
+ return (error);
+
+ if (attrs.nha_id == 0) {
+ NL_LOG(LOG_DEBUG, "NHA_ID not set");
+ return (EINVAL);
+ }
+
+ error = delete_unhop(ctl, hdr, attrs.nha_id);
+
+ return (error);
+}
+
+static bool
+match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
+{
+ if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id)
+ return (false);
+ if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL)
+ return (false);
+ if (attrs->nha_oif != NULL &&
+ (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif))
+ return (false);
+
+ return (true);
+}
+
+static int
+rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
+ struct nl_pstate *npt)
+{
+ struct user_nhop *unhop;
+ UN_TRACKER;
+ int error;
+
+ struct nl_parsed_nhop attrs = {};
+ error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
+ if (error != 0)
+ return (error);
+
+ struct netlink_walkargs wa = {
+ .nw = npt->nw,
+ .hdr.nlmsg_pid = hdr->nlmsg_pid,
+ .hdr.nlmsg_seq = hdr->nlmsg_seq,
+ .hdr.nlmsg_flags = hdr->nlmsg_flags,
+ .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
+ };
+
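+ /*
+ * Four request flavors: one unhop by NHA_ID, one kernel nexthop by
+ * NHAF_KID, a dump of all kernel nexthops (NHAF_KNHOPS), or a dump of
+ * all userland-created unhops.
+ */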
+ if (attrs.nha_id != 0) {
+ struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
+ struct user_nhop key = { .un_idx = attrs.nha_id };
+
+ if (__predict_false(ctl == NULL))
+ return (ESRCH);
+
+ NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id);
+ UN_RLOCK(ctl);
+ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
+ UN_RUNLOCK(ctl);
+
+ if (unhop == NULL)
+ return (ESRCH);
+ dump_unhop(unhop, &wa.hdr, wa.nw);
+ return (0);
+ } else if (attrs.nhaf_kid != 0) {
+ struct nhop_iter iter = {
+ .fibnum = attrs.nhaf_table,
+ .family = attrs.nhaf_family,
+ };
+ int error = ESRCH;
+
+ NL_LOG(LOG_DEBUG2, "START table %u family %d", attrs.nhaf_table, attrs.nhaf_family);
+ for (struct nhop_object *nh = nhops_iter_start(&iter); nh;
+ nh = nhops_iter_next(&iter)) {
+ NL_LOG(LOG_DEBUG3, "get %u", nhop_get_idx(nh));
+ if (nhop_get_idx(nh) == attrs.nhaf_kid) {
+ dump_nhop(nh, 0, &wa.hdr, wa.nw);
+ error = 0;
+ break;
+ }
+ }
+ nhops_iter_stop(&iter);
+ return (error);
+ } else if (attrs.nhaf_knhops) {
+ struct nhop_iter iter = {
+ .fibnum = attrs.nhaf_table,
+ .family = attrs.nhaf_family,
+ };
+
+ NL_LOG(LOG_DEBUG2, "DUMP table %u family %d", attrs.nhaf_table, attrs.nhaf_family);
+ wa.hdr.nlmsg_flags |= NLM_F_MULTI;
+ for (struct nhop_object *nh = nhops_iter_start(&iter); nh;
+ nh = nhops_iter_next(&iter)) {
+ dump_nhop(nh, 0, &wa.hdr, wa.nw);
+ }
+ nhops_iter_stop(&iter);
+ } else {
+ struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
+
+ if (__predict_false(ctl == NULL))
+ return (ESRCH);
+
+ NL_LOG(LOG_DEBUG2, "DUMP unhops");
+ UN_RLOCK(ctl);
+ wa.hdr.nlmsg_flags |= NLM_F_MULTI;
+ CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) {
+ if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop))
+ dump_unhop(unhop, &wa.hdr, wa.nw);
+ } CHT_SLIST_FOREACH_END;
+ UN_RUNLOCK(ctl);
+ }
+
+ if (wa.error == 0) {
+ if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr))
+ return (ENOMEM);
+ }
+ return (0);
+}
+
+static const struct rtnl_cmd_handler cmd_handlers[] = {
+ {
+ .cmd = NL_RTM_NEWNEXTHOP,
+ .name = "RTM_NEWNEXTHOP",
+ .cb = &rtnl_handle_newnhop,
+ .priv = PRIV_NET_ROUTE,
+ },
+ {
+ .cmd = NL_RTM_DELNEXTHOP,
+ .name = "RTM_DELNEXTHOP",
+ .cb = &rtnl_handle_delnhop,
+ .priv = PRIV_NET_ROUTE,
+ },
+ {
+ .cmd = NL_RTM_GETNEXTHOP,
+ .name = "RTM_GETNEXTHOP",
+ .cb = &rtnl_handle_getnhop,
+ }
+};
+
+static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser, &nh_fbsd_parser };
+
+void
+rtnl_nexthops_init(void)
+{
+ NL_VERIFY_PARSERS(all_parsers);
+ rtnl_register_messages(cmd_handlers, nitems(cmd_handlers));
+}