diff options
Diffstat (limited to 'sys/netlink/route/nexthop.c')
-rw-r--r-- | sys/netlink/route/nexthop.c | 1123 |
1 file changed, 1123 insertions, 0 deletions
diff --git a/sys/netlink/route/nexthop.c b/sys/netlink/route/nexthop.c new file mode 100644 index 000000000000..30aa3dd72534 --- /dev/null +++ b/sys/netlink/route/nexthop.c @@ -0,0 +1,1123 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_route.h" +#include <sys/types.h> +#include <sys/ck.h> +#include <sys/epoch.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/rmlock.h> +#include <sys/socket.h> + +#include <net/if.h> +#include <net/route.h> +#include <net/route/nhop.h> +#include <net/route/nhop_utils.h> + +#include <net/route/route_ctl.h> +#include <net/route/route_var.h> +#include <netinet6/scope6_var.h> +#include <netlink/netlink.h> +#include <netlink/netlink_ctl.h> +#include <netlink/netlink_route.h> +#include <netlink/route/route_var.h> + +#define DEBUG_MOD_NAME nl_nhop +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include <netlink/netlink_debug.h> +_DECLARE_DEBUG(LOG_INFO); + +/* + * This file contains the logic to maintain kernel nexthops and + * nexhop groups based om the data provided by the user. + * + * Kernel stores (nearly) all of the routing data in the nexthops, + * including the prefix-specific flags (NHF_HOST and NHF_DEFAULT). + * + * Netlink API provides higher-level abstraction for the user. Each + * user-created nexthop may map to multiple kernel nexthops. + * + * The following variations require separate kernel nexthop to be + * created: + * * prefix flags (NHF_HOST, NHF_DEFAULT) + * * using IPv6 gateway for IPv4 routes + * * different fibnum + * + * These kernel nexthops have the lifetime bound to the lifetime of + * the user_nhop object. They are not collected until user requests + * to delete the created user_nhop. 
+ * + */ +struct user_nhop { + uint32_t un_idx; /* Userland-provided index */ + uint32_t un_fibfam; /* fibnum+af(as highest byte) */ + uint8_t un_protocol; /* protocol that install the record */ + struct nhop_object *un_nhop; /* "production" nexthop */ + struct nhop_object *un_nhop_src; /* nexthop to copy from */ + struct weightened_nhop *un_nhgrp_src; /* nexthops for nhg */ + uint32_t un_nhgrp_count; /* number of nexthops */ + struct user_nhop *un_next; /* next item in hash chain */ + struct user_nhop *un_nextchild; /* master -> children */ + struct epoch_context un_epoch_ctx; /* epoch ctl helper */ +}; + +/* produce hash value for an object */ +#define unhop_hash_obj(_obj) (hash_unhop(_obj)) +/* compare two objects */ +#define unhop_cmp(_one, _two) (cmp_unhop(_one, _two)) +/* next object accessor */ +#define unhop_next(_obj) (_obj)->un_next + +CHT_SLIST_DEFINE(unhop, struct user_nhop); + +struct unhop_ctl { + struct unhop_head un_head; + struct rmlock un_lock; +}; +#define UN_LOCK_INIT(_ctl) rm_init(&(_ctl)->un_lock, "unhop_ctl") +#define UN_TRACKER struct rm_priotracker un_tracker +#define UN_RLOCK(_ctl) rm_rlock(&((_ctl)->un_lock), &un_tracker) +#define UN_RUNLOCK(_ctl) rm_runlock(&((_ctl)->un_lock), &un_tracker) + +#define UN_WLOCK(_ctl) rm_wlock(&(_ctl)->un_lock); +#define UN_WUNLOCK(_ctl) rm_wunlock(&(_ctl)->un_lock); + +VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL; +#define V_un_ctl VNET(un_ctl) + +static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size); +static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b); +static unsigned int hash_unhop(const struct user_nhop *obj); + +static void destroy_unhop(struct user_nhop *unhop); +static struct nhop_object *clone_unhop(const struct user_nhop *unhop, + uint32_t fibnum, int family, int nh_flags); + +static int +cmp_unhop(const struct user_nhop *a, const struct user_nhop *b) +{ + return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam); +} + +/* + * Hash callback: 
calculate hash of an object + */ +static unsigned int +hash_unhop(const struct user_nhop *obj) +{ + return (obj->un_idx ^ obj->un_fibfam); +} + +#define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0) + +/* + * Factory interface for creating matching kernel nexthops/nexthop groups + * + * @uidx: userland nexhop index used to create the nexthop + * @fibnum: fibnum nexthop will be used in + * @family: upper family nexthop will be used in + * @nh_flags: desired nexthop prefix flags + * @perror: pointer to store error to + * + * Returns referenced nexthop linked to @fibnum/@family rib on success. + */ +struct nhop_object * +nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx, + int nh_flags, int *perror) +{ + struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); + UN_TRACKER; + + if (__predict_false(ctl == NULL)) + return (NULL); + + struct user_nhop key= { + .un_idx = uidx, + .un_fibfam = fibnum | ((uint32_t)family) << 24, + }; + struct user_nhop *unhop; + + nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT); + + if (__predict_false(family == 0)) + return (NULL); + + UN_RLOCK(ctl); + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); + if (unhop != NULL) { + struct nhop_object *nh = unhop->un_nhop; + UN_RLOCK(ctl); + *perror = 0; + nhop_ref_any(nh); + return (nh); + } + + /* + * Exact nexthop not found. Search for template nexthop to clone from. 
+ */ + key.un_fibfam = 0; + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); + if (unhop == NULL) { + UN_RUNLOCK(ctl); + *perror = ESRCH; + return (NULL); + } + + UN_RUNLOCK(ctl); + + /* Create entry to insert first */ + struct user_nhop *un_new, *un_tmp; + un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); + if (un_new == NULL) { + *perror = ENOMEM; + return (NULL); + } + un_new->un_idx = uidx; + un_new->un_fibfam = fibnum | ((uint32_t)family) << 24; + + /* Relying on epoch to protect unhop here */ + un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags); + if (un_new->un_nhop == NULL) { + free(un_new, M_NETLINK); + *perror = ENOMEM; + return (NULL); + } + + /* Insert back and report */ + UN_WLOCK(ctl); + + /* First, find template record once again */ + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); + if (unhop == NULL) { + /* Someone deleted the nexthop during the call */ + UN_WUNLOCK(ctl); + *perror = ESRCH; + destroy_unhop(un_new); + return (NULL); + } + + /* Second, check the direct match */ + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp); + struct nhop_object *nh; + if (un_tmp != NULL) { + /* Another thread already created the desired nextop, use it */ + nh = un_tmp->un_nhop; + } else { + /* Finally, insert the new nexthop and link it to the primary */ + nh = un_new->un_nhop; + CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new); + un_new->un_nextchild = unhop->un_nextchild; + unhop->un_nextchild = un_new; + un_new = NULL; + NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh); + } + + UN_WUNLOCK(ctl); + + if (un_new != NULL) + destroy_unhop(un_new); + + *perror = 0; + nhop_ref_any(nh); + return (nh); +} + +static struct user_nhop * +nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx) +{ + struct user_nhop key= { .un_idx = uidx }; + struct user_nhop *unhop = NULL; + UN_TRACKER; + + UN_RLOCK(ctl); + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); + UN_RUNLOCK(ctl); + + return (unhop); +} 
+ +#define MAX_STACK_NHOPS 4 +static struct nhop_object * +clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags) +{ +#ifdef ROUTE_MPATH + const struct weightened_nhop *wn; + struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS]; + uint32_t num_nhops; +#endif + struct nhop_object *nh = NULL; + int error; + + if (unhop->un_nhop_src != NULL) { + IF_DEBUG_LEVEL(LOG_DEBUG2) { + char nhbuf[NHOP_PRINT_BUFSIZE]; + nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf)); + FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src, + "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum, + family, nh_flags); + } + struct nhop_object *nh; + nh = nhop_alloc(fibnum, AF_UNSPEC); + if (nh == NULL) + return (NULL); + nhop_copy(nh, unhop->un_nhop_src); + /* Check that nexthop gateway is compatible with the new family */ + if (!nhop_set_upper_family(nh, family)) { + nhop_free(nh); + return (NULL); + } + nhop_set_uidx(nh, unhop->un_idx); + nhop_set_pxtype_flag(nh, nh_flags); + return (nhop_get_nhop(nh, &error)); + } +#ifdef ROUTE_MPATH + wn = unhop->un_nhgrp_src; + num_nhops = unhop->un_nhgrp_count; + + if (num_nhops > MAX_STACK_NHOPS) { + wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT); + if (wn_new == NULL) + return (NULL); + } else + wn_new = wn_base; + + for (int i = 0; i < num_nhops; i++) { + uint32_t uidx = nhop_get_uidx(wn[i].nh); + MPASS(uidx != 0); + wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error); + if (error != 0) + break; + wn_new[i].weight = wn[i].weight; + } + + if (error == 0) { + struct rib_head *rh = nhop_get_rh(wn_new[0].nh); + struct nhgrp_object *nhg; + + error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg); + nh = (struct nhop_object *)nhg; + } + + if (wn_new != wn_base) + free(wn_new, M_TEMP); +#endif + return (nh); +} + +static void +destroy_unhop(struct user_nhop *unhop) +{ + if (unhop->un_nhop != NULL) + nhop_free_any(unhop->un_nhop); + if (unhop->un_nhop_src != NULL) + 
nhop_free_any(unhop->un_nhop_src); + free(unhop, M_NETLINK); +} + +static void +destroy_unhop_epoch(epoch_context_t ctx) +{ + struct user_nhop *unhop; + + unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx); + + destroy_unhop(unhop); +} + +static uint32_t +find_spare_uidx(struct unhop_ctl *ctl) +{ + struct user_nhop *unhop, key = {}; + uint32_t uidx = 0; + UN_TRACKER; + + UN_RLOCK(ctl); + /* This should return spare uid with 75% of 65k used in ~99/100 cases */ + for (int i = 0; i < 16; i++) { + key.un_idx = (arc4random() % 65536) + 65536 * 4; + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); + if (unhop == NULL) { + uidx = key.un_idx; + break; + } + } + UN_RUNLOCK(ctl); + + return (uidx); +} + + +/* + * Actual netlink code + */ +struct netlink_walkargs { + struct nl_writer *nw; + struct nlmsghdr hdr; + struct nlpcb *so; + int family; + int error; + int count; + int dumped; +}; +#define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem + +static bool +dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr, + struct nl_writer *nw) +{ + + if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) + goto enomem; + + struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); + nhm->nh_family = AF_UNSPEC; + nhm->nh_scope = 0; + nhm->nh_protocol = unhop->un_protocol; + nhm->nh_flags = 0; + + nlattr_add_u32(nw, NHA_ID, unhop->un_idx); + nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH); + + struct weightened_nhop *wn = unhop->un_nhgrp_src; + uint32_t num_nhops = unhop->un_nhgrp_count; + /* TODO: a better API? 
*/ + int nla_len = sizeof(struct nlattr); + nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp)); + struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); + if (nla == NULL) + goto enomem; + nla->nla_type = NHA_GROUP; + nla->nla_len = nla_len; + for (int i = 0; i < num_nhops; i++) { + struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i]; + grp->id = nhop_get_uidx(wn[i].nh); + grp->weight = wn[i].weight; + grp->resvd1 = 0; + grp->resvd2 = 0; + } + + if (nlmsg_end(nw)) + return (true); +enomem: + NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory"); + nlmsg_abort(nw); + return (false); +} + +static bool +dump_nhop(const struct nhop_object *nh, uint32_t uidx, struct nlmsghdr *hdr, + struct nl_writer *nw) +{ + if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) + goto enomem; + + struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); + ENOMEM_IF_NULL(nhm); + nhm->nh_family = nhop_get_neigh_family(nh); + nhm->nh_scope = 0; // XXX: what's that? 
+ nhm->nh_protocol = nhop_get_origin(nh); + nhm->nh_flags = 0; + + if (uidx != 0) + nlattr_add_u32(nw, NHA_ID, uidx); + if (nh->nh_flags & NHF_BLACKHOLE) { + nlattr_add_flag(nw, NHA_BLACKHOLE); + goto done; + } + nlattr_add_u32(nw, NHA_OIF, if_getindex(nh->nh_ifp)); + + switch (nh->gw_sa.sa_family) { +#ifdef INET + case AF_INET: + nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr); + break; +#endif +#ifdef INET6 + case AF_INET6: + { + struct in6_addr addr = nh->gw6_sa.sin6_addr; + in6_clearscope(&addr); + nlattr_add(nw, NHA_GATEWAY, 16, &addr); + break; + } +#endif + } + + int off = nlattr_add_nested(nw, NHA_FREEBSD); + if (off != 0) { + nlattr_add_u32(nw, NHAF_AIF, if_getindex(nh->nh_aifp)); + + if (uidx == 0) { + nlattr_add_u32(nw, NHAF_KID, nhop_get_idx(nh)); + nlattr_add_u32(nw, NHAF_FAMILY, nhop_get_upper_family(nh)); + nlattr_add_u32(nw, NHAF_TABLE, nhop_get_fibnum(nh)); + } + + nlattr_set_len(nw, off); + } + +done: + if (nlmsg_end(nw)) + return (true); +enomem: + nlmsg_abort(nw); + return (false); +} + +static void +dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr, + struct nl_writer *nw) +{ + if (unhop->un_nhop_src != NULL) + dump_nhop(unhop->un_nhop_src, unhop->un_idx, hdr, nw); + else + dump_nhgrp(unhop, hdr, nw); +} + +static int +delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx) +{ + struct user_nhop *unhop_ret, *unhop_base, *unhop_chain; + struct nl_writer nw; + struct user_nhop key = { .un_idx = uidx }; + + UN_WLOCK(ctl); + + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base); + + if (unhop_base != NULL) { + CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret); + IF_DEBUG_LEVEL(LOG_DEBUG2) { + char nhbuf[NHOP_PRINT_BUFSIZE]; + nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf)); + FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop, + "removed base nhop %u: %s", uidx, nhbuf); + } + /* Unlink all child nexhops as well, keeping the chain intact */ + unhop_chain = unhop_base->un_nextchild; + 
while (unhop_chain != NULL) { + CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain, + unhop_ret); + MPASS(unhop_chain == unhop_ret); + IF_DEBUG_LEVEL(LOG_DEBUG3) { + char nhbuf[NHOP_PRINT_BUFSIZE]; + nhop_print_buf_any(unhop_chain->un_nhop, + nhbuf, sizeof(nhbuf)); + FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop, + "removed child nhop %u: %s", uidx, nhbuf); + } + unhop_chain = unhop_chain->un_nextchild; + } + } + + UN_WUNLOCK(ctl); + + if (unhop_base == NULL) { + NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx); + return (ENOENT); + } + + /* Report nexthop deletion */ + struct netlink_walkargs wa = { + .hdr.nlmsg_pid = hdr->nlmsg_pid, + .hdr.nlmsg_seq = hdr->nlmsg_seq, + .hdr.nlmsg_flags = hdr->nlmsg_flags, + .hdr.nlmsg_type = NL_RTM_DELNEXTHOP, + }; + + if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP, + 0, false)) { + NL_LOG(LOG_DEBUG, "error allocating message writer"); + return (ENOMEM); + } + + dump_unhop(unhop_base, &wa.hdr, &nw); + nlmsg_flush(&nw); + + while (unhop_base != NULL) { + unhop_chain = unhop_base->un_nextchild; + NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx); + unhop_base = unhop_chain; + } + + return (0); +} + +static void +consider_resize(struct unhop_ctl *ctl, uint32_t new_size) +{ + void *new_ptr = NULL; + size_t alloc_size; + + if (new_size == 0) + return; + + if (new_size != 0) { + alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size); + new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); + if (new_ptr == NULL) + return; + } + + NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size); + UN_WLOCK(ctl); + if (new_ptr != NULL) { + CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size); + } + UN_WUNLOCK(ctl); + + + if (new_ptr != NULL) + free(new_ptr, M_NETLINK); +} + +static bool __noinline +vnet_init_unhops(void) +{ + uint32_t num_buckets = 16; + size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); + + struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK, 
+ M_NOWAIT | M_ZERO); + if (ctl == NULL) + return (false); + + void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); + if (ptr == NULL) { + free(ctl, M_NETLINK); + return (false); + } + CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets); + UN_LOCK_INIT(ctl); + + if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) { + free(ptr, M_NETLINK); + free(ctl, M_NETLINK); + } + + if (atomic_load_ptr(&V_un_ctl) == NULL) + return (false); + + NL_LOG(LOG_NOTICE, "UNHOPS init done"); + + return (true); +} + +static void +vnet_destroy_unhops(const void *unused __unused) +{ + struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); + struct user_nhop *unhop, *tmp; + + if (ctl == NULL) + return; + V_un_ctl = NULL; + + /* Wait till all unhop users finish their reads */ + NET_EPOCH_WAIT(); + + UN_WLOCK(ctl); + CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) { + destroy_unhop(unhop); + } CHT_SLIST_FOREACH_SAFE_END; + UN_WUNLOCK(ctl); + + free(ctl->un_head.ptr, M_NETLINK); + free(ctl, M_NETLINK); +} +VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY, + vnet_destroy_unhops, NULL); + +static int +nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) +{ + int error = 0; + + /* Verify attribute correctness */ + struct nexthop_grp *grp = NLA_DATA(nla); + int data_len = NLA_DATA_LEN(nla); + + int count = data_len / sizeof(*grp); + if (count == 0 || (count * sizeof(*grp) != data_len)) { + NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len); + return (EINVAL); + } + + *((struct nlattr **)target) = nla; + return (error); +} + +static void +set_scope6(struct sockaddr *sa, if_t ifp) +{ +#ifdef INET6 + if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) { + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; + + if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) + in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp)); + } +#endif +} + +struct nl_parsed_nhop { + uint32_t nha_id; + uint8_t 
nha_blackhole; + uint8_t nha_groups; + uint8_t nhaf_knhops; + uint8_t nhaf_family; + struct ifnet *nha_oif; + struct sockaddr *nha_gw; + struct nlattr *nha_group; + uint8_t nh_family; + uint8_t nh_protocol; + uint32_t nhaf_table; + uint32_t nhaf_kid; + uint32_t nhaf_aif; +}; + +#define _IN(_field) offsetof(struct nhmsg, _field) +#define _OUT(_field) offsetof(struct nl_parsed_nhop, _field) +static struct nlattr_parser nla_p_nh_fbsd[] = { + { .type = NHAF_KNHOPS, .off = _OUT(nhaf_knhops), .cb = nlattr_get_flag }, + { .type = NHAF_TABLE, .off = _OUT(nhaf_table), .cb = nlattr_get_uint32 }, + { .type = NHAF_FAMILY, .off = _OUT(nhaf_family), .cb = nlattr_get_uint8 }, + { .type = NHAF_KID, .off = _OUT(nhaf_kid), .cb = nlattr_get_uint32 }, + { .type = NHAF_AIF, .off = _OUT(nhaf_aif), .cb = nlattr_get_uint32 }, +}; +NL_DECLARE_ATTR_PARSER(nh_fbsd_parser, nla_p_nh_fbsd); + +static const struct nlfield_parser nlf_p_nh[] = { + { .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 }, + { .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 }, +}; + +static const struct nlattr_parser nla_p_nh[] = { + { .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 }, + { .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg }, + { .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag }, + { .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp }, + { .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip }, + { .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag }, + { .type = NHA_FREEBSD, .arg = &nh_fbsd_parser, .cb = nlattr_get_nested }, +}; +#undef _IN +#undef _OUT + +static bool +post_p_nh(void *_attrs, struct nl_pstate *npt) +{ + struct nl_parsed_nhop *attrs = (struct nl_parsed_nhop *)_attrs; + + set_scope6(attrs->nha_gw, attrs->nha_oif); + return (true); +} +NL_DECLARE_PARSER_EXT(nhmsg_parser, struct nhmsg, NULL, nlf_p_nh, nla_p_nh, post_p_nh); + +static bool +eligible_nhg(const 
struct nhop_object *nh) +{ + return (nh->nh_flags & NHF_GATEWAY); +} + +static int +newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop) +{ + struct nexthop_grp *grp = NLA_DATA(attrs->nha_group); + int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp); + struct weightened_nhop *wn; + + wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO); + if (wn == NULL) + return (ENOMEM); + + for (int i = 0; i < count; i++) { + struct user_nhop *unhop; + unhop = nl_find_base_unhop(ctl, grp[i].id); + if (unhop == NULL) { + NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id); + free(wn, M_NETLINK); + return (ESRCH); + } else if (unhop->un_nhop_src == NULL) { + NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported", + grp[i].id); + free(wn, M_NETLINK); + return (ENOTSUP); + } else if (!eligible_nhg(unhop->un_nhop_src)) { + NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible", + grp[i].id); + free(wn, M_NETLINK); + return (ENOTSUP); + } + /* + * TODO: consider more rigid eligibility checks: + * restrict nexthops with the same gateway + */ + wn[i].nh = unhop->un_nhop_src; + wn[i].weight = grp[i].weight; + } + unhop->un_nhgrp_src = wn; + unhop->un_nhgrp_count = count; + return (0); +} + +/* + * Sets nexthop @nh gateway specified by @gw. + * If gateway is IPv6 link-local, alters @gw to include scopeid equal to + * @ifp ifindex. + * Returns 0 on success or errno. 
+ */ +int +nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, if_t ifp, + struct nl_pstate *npt) +{ +#ifdef INET6 + if (gw->sa_family == AF_INET6) { + struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)gw; + if (IN6_IS_ADDR_LINKLOCAL(&gw6->sin6_addr)) { + if (ifp == NULL) { + NLMSG_REPORT_ERR_MSG(npt, "interface not set"); + return (EINVAL); + } + in6_set_unicast_scopeid(&gw6->sin6_addr, if_getindex(ifp)); + } + } +#endif + nhop_set_gw(nh, gw, true); + return (0); +} + +static int +newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt) +{ + struct ifaddr *ifa = NULL; + struct nhop_object *nh; + int error; + + if (!attrs->nha_blackhole) { + if (attrs->nha_gw == NULL) { + NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY"); + return (EINVAL); + } + if (attrs->nha_oif == NULL) { + NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF"); + return (EINVAL); + } + if (ifa == NULL) + ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif); + if (ifa == NULL) { + NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP"); + return (EINVAL); + } + } + + int family = attrs->nha_gw != NULL ? 
attrs->nha_gw->sa_family : attrs->nh_family; + + nh = nhop_alloc(RT_DEFAULT_FIB, family); + if (nh == NULL) { + NL_LOG(LOG_DEBUG, "Unable to allocate nexthop"); + return (ENOMEM); + } + nhop_set_uidx(nh, attrs->nha_id); + nhop_set_origin(nh, attrs->nh_protocol); + + if (attrs->nha_blackhole) + nhop_set_blackhole(nh, NHF_BLACKHOLE); + else { + error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt); + if (error != 0) { + nhop_free(nh); + return (error); + } + nhop_set_transmit_ifp(nh, attrs->nha_oif); + nhop_set_src(nh, ifa); + } + + error = nhop_get_unlinked(nh); + if (error != 0) { + NL_LOG(LOG_DEBUG, "unable to finalize nexthop"); + return (error); + } + + IF_DEBUG_LEVEL(LOG_DEBUG2) { + char nhbuf[NHOP_PRINT_BUFSIZE]; + nhop_print_buf(nh, nhbuf, sizeof(nhbuf)); + NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf); + } + + unhop->un_nhop_src = nh; + return (0); +} + +static int +rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct nl_pstate *npt) +{ + struct nl_writer nw; + struct user_nhop *unhop; + int error; + + if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops())) + return (ENOMEM); + struct unhop_ctl *ctl = V_un_ctl; + + struct nl_parsed_nhop attrs = {}; + error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); + if (error != 0) + return (error); + + /* + * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class + * citizen. + */ + if (attrs.nha_id == 0) { + attrs.nha_id = find_spare_uidx(ctl); + if (attrs.nha_id == 0) { + NL_LOG(LOG_DEBUG, "Unable to get spare uidx"); + return (ENOSPC); + } + } + + NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? 
if_getindex(attrs.nha_oif) : 0); + + unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); + if (unhop == NULL) { + NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop"); + return (ENOMEM); + } + unhop->un_idx = attrs.nha_id; + unhop->un_protocol = attrs.nh_protocol; + + if (attrs.nha_group) + error = newnhg(ctl, &attrs, unhop); + else + error = newnhop(&attrs, unhop, npt); + + if (error != 0) { + free(unhop, M_NETLINK); + return (error); + } + + UN_WLOCK(ctl); + /* Check if uidx already exists */ + struct user_nhop *tmp = NULL; + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp); + if (tmp != NULL) { + UN_WUNLOCK(ctl); + NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id); + destroy_unhop(unhop); + return (EEXIST); + } + CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop); + uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head); + UN_WUNLOCK(ctl); + + /* Report addition of the next nexhop */ + struct netlink_walkargs wa = { + .hdr.nlmsg_pid = hdr->nlmsg_pid, + .hdr.nlmsg_seq = hdr->nlmsg_seq, + .hdr.nlmsg_flags = hdr->nlmsg_flags, + .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, + }; + + if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP, + 0, false)) { + NL_LOG(LOG_DEBUG, "error allocating message writer"); + return (ENOMEM); + } + + dump_unhop(unhop, &wa.hdr, &nw); + nlmsg_flush(&nw); + + consider_resize(ctl, num_buckets_new); + + return (0); +} + +static int +rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct nl_pstate *npt) +{ + struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); + int error; + + if (__predict_false(ctl == NULL)) + return (ESRCH); + + struct nl_parsed_nhop attrs = {}; + error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); + if (error != 0) + return (error); + + if (attrs.nha_id == 0) { + NL_LOG(LOG_DEBUG, "NHA_ID not set"); + return (EINVAL); + } + + error = delete_unhop(ctl, hdr, attrs.nha_id); + + return (error); +} + +static bool +match_unhop(const struct 
nl_parsed_nhop *attrs, struct user_nhop *unhop) +{ + if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id) + return (false); + if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL) + return (false); + if (attrs->nha_oif != NULL && + (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif)) + return (false); + + return (true); +} + +static int +rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct nl_pstate *npt) +{ + struct user_nhop *unhop; + UN_TRACKER; + int error; + + struct nl_parsed_nhop attrs = {}; + error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); + if (error != 0) + return (error); + + struct netlink_walkargs wa = { + .nw = npt->nw, + .hdr.nlmsg_pid = hdr->nlmsg_pid, + .hdr.nlmsg_seq = hdr->nlmsg_seq, + .hdr.nlmsg_flags = hdr->nlmsg_flags, + .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, + }; + + if (attrs.nha_id != 0) { + struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); + struct user_nhop key = { .un_idx = attrs.nha_id }; + + if (__predict_false(ctl == NULL)) + return (ESRCH); + + NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id); + UN_RLOCK(ctl); + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); + UN_RUNLOCK(ctl); + + if (unhop == NULL) + return (ESRCH); + dump_unhop(unhop, &wa.hdr, wa.nw); + return (0); + } else if (attrs.nhaf_kid != 0) { + struct nhop_iter iter = { + .fibnum = attrs.nhaf_table, + .family = attrs.nhaf_family, + }; + int error = ESRCH; + + NL_LOG(LOG_DEBUG2, "START table %u family %d", attrs.nhaf_table, attrs.nhaf_family); + for (struct nhop_object *nh = nhops_iter_start(&iter); nh; + nh = nhops_iter_next(&iter)) { + NL_LOG(LOG_DEBUG3, "get %u", nhop_get_idx(nh)); + if (nhop_get_idx(nh) == attrs.nhaf_kid) { + dump_nhop(nh, 0, &wa.hdr, wa.nw); + error = 0; + break; + } + } + nhops_iter_stop(&iter); + return (error); + } else if (attrs.nhaf_knhops) { + struct nhop_iter iter = { + .fibnum = attrs.nhaf_table, + .family = attrs.nhaf_family, + }; + + NL_LOG(LOG_DEBUG2, "DUMP table 
%u family %d", attrs.nhaf_table, attrs.nhaf_family); + wa.hdr.nlmsg_flags |= NLM_F_MULTI; + for (struct nhop_object *nh = nhops_iter_start(&iter); nh; + nh = nhops_iter_next(&iter)) { + dump_nhop(nh, 0, &wa.hdr, wa.nw); + } + nhops_iter_stop(&iter); + } else { + struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); + + if (__predict_false(ctl == NULL)) + return (ESRCH); + + NL_LOG(LOG_DEBUG2, "DUMP unhops"); + UN_RLOCK(ctl); + wa.hdr.nlmsg_flags |= NLM_F_MULTI; + CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) { + if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop)) + dump_unhop(unhop, &wa.hdr, wa.nw); + } CHT_SLIST_FOREACH_END; + UN_RUNLOCK(ctl); + } + + if (wa.error == 0) { + if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) + return (ENOMEM); + } + return (0); +} + +static const struct rtnl_cmd_handler cmd_handlers[] = { + { + .cmd = NL_RTM_NEWNEXTHOP, + .name = "RTM_NEWNEXTHOP", + .cb = &rtnl_handle_newnhop, + .priv = PRIV_NET_ROUTE, + }, + { + .cmd = NL_RTM_DELNEXTHOP, + .name = "RTM_DELNEXTHOP", + .cb = &rtnl_handle_delnhop, + .priv = PRIV_NET_ROUTE, + }, + { + .cmd = NL_RTM_GETNEXTHOP, + .name = "RTM_GETNEXTHOP", + .cb = &rtnl_handle_getnhop, + } +}; + +static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser, &nh_fbsd_parser }; + +void +rtnl_nexthops_init(void) +{ + NL_VERIFY_PARSERS(all_parsers); + rtnl_register_messages(cmd_handlers, nitems(cmd_handlers)); +} |