diff options
Diffstat (limited to 'sys')
-rw-r--r-- | sys/conf/NOTES | 4 | ||||
-rw-r--r-- | sys/conf/files | 4 | ||||
-rw-r--r-- | sys/conf/options | 1 | ||||
-rw-r--r-- | sys/net/radix.c | 4 | ||||
-rw-r--r-- | sys/net/route.c | 1 | ||||
-rw-r--r-- | sys/net/route.h | 5 | ||||
-rw-r--r-- | sys/net/route/mpath_ctl.c | 165 | ||||
-rw-r--r-- | sys/net/route/nhgrp.c | 344 | ||||
-rw-r--r-- | sys/net/route/nhgrp_ctl.c | 788 | ||||
-rw-r--r-- | sys/net/route/nhgrp_var.h | 72 | ||||
-rw-r--r-- | sys/net/route/nhop.c | 8 | ||||
-rw-r--r-- | sys/net/route/nhop.h | 37 | ||||
-rw-r--r-- | sys/net/route/nhop_ctl.c | 7 | ||||
-rw-r--r-- | sys/net/route/nhop_var.h | 11 | ||||
-rw-r--r-- | sys/net/route/route_ctl.c | 275 | ||||
-rw-r--r-- | sys/net/route/route_ctl.h | 18 | ||||
-rw-r--r-- | sys/net/route/route_helpers.c | 164 | ||||
-rw-r--r-- | sys/net/route/route_var.h | 80 | ||||
-rw-r--r-- | sys/net/rtsock.c | 111 | ||||
-rw-r--r-- | sys/netinet/in.c | 10 | ||||
-rw-r--r-- | sys/netinet/in_fib.c | 59 | ||||
-rw-r--r-- | sys/netinet/in_rmx.c | 5 | ||||
-rw-r--r-- | sys/netinet/ip_output.c | 5 | ||||
-rw-r--r-- | sys/netinet6/in6_fib.c | 55 | ||||
-rw-r--r-- | sys/netinet6/in6_rmx.c | 5 | ||||
-rw-r--r-- | sys/netinet6/nd6.c | 5 | ||||
-rw-r--r-- | sys/sys/socket.h | 1 |
27 files changed, 2020 insertions, 224 deletions
diff --git a/sys/conf/NOTES b/sys/conf/NOTES index 7aa957efa271..0d9fac844365 100644 --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -1002,7 +1002,7 @@ device lagg # # TCP_HHOOK enables the hhook(9) framework hooks for the TCP stack. # -# RADIX_MPATH provides support for equal-cost multi-path routing. +# ROUTE_MPATH provides support for multipath routing. # options MROUTING # Multicast routing options IPFIREWALL #firewall @@ -1023,7 +1023,7 @@ options TCPDEBUG options TCPPCAP options TCP_BLACKBOX options TCP_HHOOK -options RADIX_MPATH +options ROUTE_MPATH # The MBUF_STRESS_TEST option enables options which create # various random failures / extreme cases related to mbuf diff --git a/sys/conf/files b/sys/conf/files index e3c142441653..8ec5eacd053e 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -4143,10 +4143,12 @@ net/debugnet.c optional inet debugnet net/debugnet_inet.c optional inet debugnet net/pfil.c optional ether | inet net/radix.c standard -net/radix_mpath.c standard net/raw_cb.c standard net/raw_usrreq.c standard net/route.c standard +net/route/mpath_ctl.c optional route_mpath +net/route/nhgrp.c optional route_mpath +net/route/nhgrp_ctl.c optional route_mpath net/route/nhop.c standard net/route/nhop_ctl.c standard net/route/nhop_utils.c standard diff --git a/sys/conf/options b/sys/conf/options index e22197093f58..e68621d61a37 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -454,6 +454,7 @@ NFSLOCKD PCBGROUP opt_pcbgroup.h PF_DEFAULT_TO_DROP opt_pf.h RADIX_MPATH opt_mpath.h +ROUTE_MPATH opt_route.h ROUTETABLES opt_route.h RSS opt_rss.h SLIP_IFF_OPTS opt_slip.h diff --git a/sys/net/radix.c b/sys/net/radix.c index 3d9ed0a69538..f65153393d74 100644 --- a/sys/net/radix.c +++ b/sys/net/radix.c @@ -44,10 +44,6 @@ #include <sys/malloc.h> #include <sys/syslog.h> #include <net/radix.h> -#include "opt_mpath.h" -#ifdef RADIX_MPATH -#include <net/radix_mpath.h> -#endif #else /* !_KERNEL */ #include <stdio.h> #include <strings.h> diff --git a/sys/net/route.c 
b/sys/net/route.c index d19a4cfc0afe..dac3211bc1f5 100644 --- a/sys/net/route.c +++ b/sys/net/route.c @@ -39,7 +39,6 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mrouting.h" -#include "opt_mpath.h" #include "opt_route.h" #include <sys/param.h> diff --git a/sys/net/route.h b/sys/net/route.h index 19c9ce0eb51b..021b4621692b 100644 --- a/sys/net/route.h +++ b/sys/net/route.h @@ -178,6 +178,7 @@ VNET_DECLARE(u_int, rt_add_addr_allfibs); /* Announce interfaces to all fibs */ */ /* Consumer-visible nexthop info flags */ +#define NHF_MULTIPATH 0x0008 /* Nexhop is a nexthop group */ #define NHF_REJECT 0x0010 /* RTF_REJECT */ #define NHF_BLACKHOLE 0x0020 /* RTF_BLACKHOLE */ #define NHF_REDIRECT 0x0040 /* RTF_DYNAMIC|RTF_MODIFIED */ @@ -208,6 +209,10 @@ struct rtstat { uint64_t rts_wildcard; /* lookups satisfied by a wildcard */ uint64_t rts_nh_idx_alloc_failure; /* nexthop index alloc failure*/ uint64_t rts_nh_alloc_failure; /* nexthop allocation failure*/ + uint64_t rts_add_failure; /* # of route addition failures */ + uint64_t rts_add_retry; /* # of route addition retries */ + uint64_t rts_del_failure; /* # of route deletion failure */ + uint64_t rts_del_retry; /* # of route deletion retries */ }; /* diff --git a/sys/net/route/mpath_ctl.c b/sys/net/route/mpath_ctl.c new file mode 100644 index 000000000000..1ac7c191ed05 --- /dev/null +++ b/sys/net/route/mpath_ctl.c @@ -0,0 +1,165 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_route.h" + +#include <sys/cdefs.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/rmlock.h> +#include <sys/rwlock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/kernel.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_dl.h> +#include <net/route.h> +#include <net/route/route_ctl.h> +#include <net/route/route_var.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet/in_fib.h> + +#include <net/route/nhop_utils.h> +#include <net/route/nhop.h> +#include <net/route/nhop_var.h> + +/* + * This file contains the supporting functions for adding/deleting/updating + * multipath routes to the routing table. + */ + +SYSCTL_DECL(_net_route); + +/* + * Tries to add @rnd_add nhop to the existing set of nhops (@nh_orig) for the + * prefix specified by @rt. 
+ * + * Returns 0 and consumes rt / rnd_add nhop references. @rc gets populated + * with the operation result. + * Otherwise errno is returned. + * + * Caller's responsibility is to unlock/free rt and + * rt->rt_nhop. + */ +int +add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info, + struct rtentry *rt, struct route_nhop_data *rnd_add, + struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) +{ + RIB_RLOCK_TRACKER; + struct route_nhop_data rnd_new; + int error = 0; + + /* + * It is possible that multiple rtsock speakers will try to update + * the same route simultaneously. Reduce the chance of failing the + * request by retrying the cycle multiple times. + * At most RIB_MAX_RETRIES iterations are attempted; the error of + * the last attempt is what gets returned. + */ + for (int i = 0; i < RIB_MAX_RETRIES; i++) { + error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add, + &rnd_new); + if (error != 0) { + if (error != EAGAIN) + break; + + /* + * Group creation failed, most probably because + * @rnd_orig data got scheduled for deletion. + * Refresh @rnd_orig data and retry. + * Note that this retry path does not bump rts_add_retry. + */ + RIB_RLOCK(rnh); + lookup_prefix(rnh, info, rnd_orig); + RIB_RUNLOCK(rnh); + continue; + } + + error = change_route_conditional(rnh, rt, info, rnd_orig, + &rnd_new, rc); + if (error != EAGAIN) + break; + RTSTAT_INC(rts_add_retry); + } + + return (error); +} +/* Request context handed to gw_filter_func() through its opaque pointer. */ +struct rt_match_info { + struct rt_addrinfo *info; + struct rtentry *rt; +}; +/* + * Nexthop filter callback: returns true when check_info_match_nhop() + * returns 0 for @nh and the request stored in @_data + * (a struct rt_match_info). + */ +static bool +gw_filter_func(const struct nhop_object *nh, void *_data) +{ + struct rt_match_info *ri = (struct rt_match_info *)_data; + + return (check_info_match_nhop(ri->info, ri->rt, nh) == 0); +} + +/* + * Tries to delete matching paths from @nhg. + * Returns 0 on success and updates operation result in @rc. 
+ */ +int +del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info, + struct rtentry *rt, struct nhgrp_object *nhg, + struct rib_cmd_info *rc) +{ + struct route_nhop_data rnd; + struct rt_match_info ri = { .info = info, .rt = rt }; + int error; + + RIB_WLOCK_ASSERT(rh); + + /* + * Require gateway to delete multipath routes, to forbid + * deleting all paths at once. + * If the filter function is provided, skip gateway check to + * allow rib_walk_del() to delete routes for any criteria based + * on provided callback. + */ + if ((info->rti_info[RTAX_GATEWAY] == NULL) && (info->rti_filter == NULL)) + return (ESRCH); + /* + * Compute the replacement nexthop data in @rnd from @nhg, using + * gw_filter_func() as the filter, and switch the route over to it + * only if that step succeeded. + */ + error = nhgrp_get_filtered_group(rh, nhg, gw_filter_func, (void *)&ri, + &rnd); + if (error == 0) + error = change_route_nhop(rh, rt, info, &rnd, rc); + return (error); +} + diff --git a/sys/net/route/nhgrp.c b/sys/net/route/nhgrp.c new file mode 100644 index 000000000000..c25f4f09865b --- /dev/null +++ b/sys/net/route/nhgrp.c @@ -0,0 +1,344 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_route.h" + +#include <sys/cdefs.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/rmlock.h> +#include <sys/rwlock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/refcount.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/kernel.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_dl.h> +#include <net/route.h> +#include <net/route/route_ctl.h> +#include <net/route/route_var.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet/in_fib.h> + +#include <net/route/nhop_utils.h> +#include <net/route/nhop.h> +#include <net/route/nhop_var.h> +#include <net/route/nhgrp_var.h> + +/* + * This file contains data structures management logic for the nexthop + * groups ("nhgrp") route subsystem. + * + * Nexthop groups are used to store multiple routes available for the specific + * prefix. Nexthop groups are immutable and can be shared across multiple + * prefixes. + * + * Each group consists of a control plane part and a dataplane part. + * Control plane is basically a collection of nexthop objects with + * weights and refcount. + * + * Datapath consists of a array of nexthop pointers, compiled from control + * plane data to support O(1) nexthop selection. 
+ * + * For example, consider the following group: + * [(nh1, weight=100), (nh2, weight=200)] + * It will compile to the following array: + * [nh1, nh2, nh2] + * + */ + +static void consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, + uint32_t new_idx_items); + +static int cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b); +static unsigned int hash_nhgrp(const struct nhgrp_priv *obj); +/* + * Hashes @len bytes starting at @h: for every byte, + * result = (33 * result) ^ byte, starting from 0. + */ +static unsigned +djb_hash(const unsigned char *h, const int len) +{ + unsigned int result = 0; + int i; + + for (i = 0; i < len; i++) + result = 33 * result ^ h[i]; + + return (result); +} +/* + * Compare callback: returns 1 when both groups carry the same number of + * nexthops and byte-identical nhg_nh_weights arrays, 0 otherwise. + */ +static int +cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b) +{ + + /* + * In case of consistent hashing, there can be multiple nexthop groups + * with the same "control plane" list of nexthops with weights and a + * different set of "data plane" nexthops. + * For now, ignore the data plane and focus on the control plane list. + */ + if (a->nhg_nh_count != b->nhg_nh_count) + return (0); + return !memcmp(a->nhg_nh_weights, b->nhg_nh_weights, + sizeof(struct weightened_nhop) * a->nhg_nh_count); +} + +/* + * Hash callback: calculate hash of an object + * + * The hash covers the raw bytes of the nhg_nh_weights array + * (nhg_nh_count entries), i.e. the same bytes cmp_nhgrp() compares. + * NOTE(review): any padding inside struct weightened_nhop takes part in + * both the hash and the comparison -- confirm it is always initialized + * consistently by the callers building these arrays. + */ +static unsigned int +hash_nhgrp(const struct nhgrp_priv *obj) +{ + const unsigned char *key; + + key = (const unsigned char *)obj->nhg_nh_weights; + + return (djb_hash(key, sizeof(struct weightened_nhop) * obj->nhg_nh_count)); +} + +/* + * Looks up the group matching @key in ctl->gr_head under the nhops + * read lock. + * Returns object referenced and unlocked, or NULL when nothing matched + * or the matching group already has a zero refcount. + */ +struct nhgrp_priv * +find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key) +{ + struct nhgrp_priv *priv_ret; + + NHOPS_RLOCK(ctl); + CHT_SLIST_FIND_BYOBJ(&ctl->gr_head, mpath, key, priv_ret); + if (priv_ret != NULL) { + if (refcount_acquire_if_not_zero(&priv_ret->nhg_refcount) == 0) { + /* refcount is 0 -> group is being deleted */ + priv_ret = NULL; + } + } + NHOPS_RUNLOCK(ctl); + + return (priv_ret); +} +/* + * Allocates a group index for @grp_priv and inserts the group into + * ctl->gr_head. Returns 1 on success and 0 if no index could be + * allocated. + */ +int +link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv) +{ + uint16_t idx; + uint32_t 
new_num_buckets, new_num_items; + + NHOPS_WLOCK(ctl); + /* + * Check if we need to resize hash and index. + * Only the target sizes are sampled here; the resize itself is + * done by consider_resize() after the lock has been dropped. + */ + new_num_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->gr_head); + new_num_items = bitmask_get_resize_items(&ctl->gr_idx_head); + + if (bitmask_alloc_idx(&ctl->gr_idx_head, &idx) != 0) { + NHOPS_WUNLOCK(ctl); + DPRINTF("Unable to allocate mpath index"); + consider_resize(ctl, new_num_buckets, new_num_items); + return (0); + } + + grp_priv->nhg_idx = idx; + grp_priv->nh_control = ctl; + CHT_SLIST_INSERT_HEAD(&ctl->gr_head, mpath, grp_priv); + + NHOPS_WUNLOCK(ctl); + + consider_resize(ctl, new_num_buckets, new_num_items); + + return (1); +} +/* + * Removes the group matching @key from ctl->gr_head, releases its index + * in ctl->gr_idx_head and clears nhg_idx / nh_control. + * Returns the unlinked group or NULL if no matching group was found. + * The result of bitmask_free_idx() is stored in @ret but not used. + */ +struct nhgrp_priv * +unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key) +{ + struct nhgrp_priv *nhg_priv_ret; + int ret, idx; + + NHOPS_WLOCK(ctl); + + CHT_SLIST_REMOVE_BYOBJ(&ctl->gr_head, mpath, key, nhg_priv_ret); + + if (nhg_priv_ret == NULL) { + DPRINTF("Unable to find nhop group!"); + NHOPS_WUNLOCK(ctl); + return (NULL); + } + + idx = nhg_priv_ret->nhg_idx; + ret = bitmask_free_idx(&ctl->gr_idx_head, idx); + nhg_priv_ret->nhg_idx = 0; + nhg_priv_ret->nh_control = NULL; + + NHOPS_WUNLOCK(ctl); + + return (nhg_priv_ret); +} + +/* + * Checks if hash needs resizing and performs this resize if necessary + * + * @new_nh_buckets / @new_idx_items are the target sizes for the hash and + * for the index bitmask; 0 means that structure needs no resize. + * New storage is allocated with M_NOWAIT before the write lock is taken. + */ +__noinline static void +consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items) +{ + void *nh_ptr, *nh_idx_ptr; + void *old_idx_ptr; + size_t alloc_size; + + nh_ptr = NULL ; + if (new_nh_buckets != 0) { + alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets); + nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); + } + + nh_idx_ptr = NULL; + if (new_idx_items != 0) { + alloc_size = bitmask_get_size(new_idx_items); + nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); + } + + if (nh_ptr == NULL && nh_idx_ptr == NULL) { + /* Either resize is not required or allocations have failed. 
*/ + return; + } + + DPRINTF("mp: going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]", + nh_ptr, new_nh_buckets, nh_idx_ptr, new_idx_items); + + old_idx_ptr = NULL; + + NHOPS_WLOCK(ctl); + if (nh_ptr != NULL) { + CHT_SLIST_RESIZE(&ctl->gr_head, mpath, nh_ptr, new_nh_buckets); + } + if (nh_idx_ptr != NULL) { + /* + * Both the copy and the swap must operate on the group index + * bitmask (gr_idx_head), the one every other group index + * operation in this file uses. Swapping into nh_idx_head + * would replace a different bitmask with group data. + * NOTE(review): the swap is taken when bitmask_copy() returns + * non-zero -- confirm this matches its return convention. + */ + if (bitmask_copy(&ctl->gr_idx_head, nh_idx_ptr, new_idx_items)) { + bitmask_swap(&ctl->gr_idx_head, nh_idx_ptr, new_idx_items, &old_idx_ptr); + } else { + /* New bitmask was not installed: release it below */ + old_idx_ptr = nh_idx_ptr; + } + } + NHOPS_WUNLOCK(ctl); + + /* + * Free the leftovers outside of the lock. + * NOTE(review): presumably CHT_SLIST_RESIZE() leaves the previous + * bucket array in nh_ptr -- confirm against the macro. + */ + if (nh_ptr != NULL) + free(nh_ptr, M_NHOP); + if (old_idx_ptr != NULL) + free(old_idx_ptr, M_NHOP); +} + +/* + * Function allocating the necessary group data structures. + * + * Allocates the initial hash (8 buckets) and index bitmask (128 items) + * and installs them if ctl->gr_head is still empty. + * Only M_NOWAIT / M_WAITOK are honoured from @malloc_flags; M_ZERO is + * always added. + * + * Returns false if an allocation failed and true otherwise, including + * the case where another thread has installed the structures first. + */ +bool +nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags) +{ + size_t alloc_size; + uint32_t num_buckets, num_items; + void *cht_ptr, *mask_ptr; + + malloc_flags = (malloc_flags & (M_NOWAIT | M_WAITOK)) | M_ZERO; + + num_buckets = 8; + alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); + cht_ptr = malloc(alloc_size, M_NHOP, malloc_flags); + + if (cht_ptr == NULL) { + DPRINTF("mpath init failed"); + return (false); + } + + /* + * Allocate nexthop index bitmask. + */ + num_items = 128; + mask_ptr = malloc(bitmask_get_size(num_items), M_NHOP, malloc_flags); + if (mask_ptr == NULL) { + DPRINTF("mpath bitmask init failed"); + free(cht_ptr, M_NHOP); + return (false); + } + + NHOPS_WLOCK(ctl); + + if (ctl->gr_head.hash_size == 0) { + /* Init hash and bitmask */ + CHT_SLIST_INIT(&ctl->gr_head, cht_ptr, num_buckets); + bitmask_init(&ctl->gr_idx_head, mask_ptr, num_items); + NHOPS_WUNLOCK(ctl); + } else { + /* Other thread has already initialized hash/bitmask */ + NHOPS_WUNLOCK(ctl); + free(cht_ptr, M_NHOP); + free(mask_ptr, M_NHOP); + } + + DPRINTF("mpath init done for fib/af %d/%d", ctl->rh->rib_fibnum, + ctl->rh->rib_family); + + return (true); +} + +/* + * Initializes the group hash and index bitmask heads of @ctl as empty. + */ +int +nhgrp_ctl_init(struct nh_control *ctl) +{ + + /* + * By default, do not allocate datastructures as multipath + * routes will not be necessarily used. 
+ * Both heads start with NULL storage and a size of 0. + */ + CHT_SLIST_INIT(&ctl->gr_head, NULL, 0); + bitmask_init(&ctl->gr_idx_head, NULL, 0); + return (0); +} +/* + * Frees the storage of the group hash table and of the group index + * bitmask, if any was allocated. + */ +void +nhgrp_ctl_free(struct nh_control *ctl) +{ + + if (ctl->gr_head.ptr != NULL) + free(ctl->gr_head.ptr, M_NHOP); + if (ctl->gr_idx_head.idx != NULL) + free(ctl->gr_idx_head.idx, M_NHOP); +} +/* + * Drops one nhg_linked reference on every group present in ctl->gr_head. + * The nhops write lock must be held by the caller. + */ +void +nhgrp_ctl_unlink_all(struct nh_control *ctl) +{ + struct nhgrp_priv *nhg_priv; + + NHOPS_WLOCK_ASSERT(ctl); + + CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) { + DPRINTF("Marking nhgrp %u unlinked", nhg_priv->nhg_idx); + refcount_release(&nhg_priv->nhg_linked); + } CHT_SLIST_FOREACH_END; +} + diff --git a/sys/net/route/nhgrp_ctl.c b/sys/net/route/nhgrp_ctl.c new file mode 100644 index 000000000000..a3a824992e08 --- /dev/null +++ b/sys/net/route/nhgrp_ctl.c @@ -0,0 +1,788 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +#define RTDEBUG +#include "opt_inet.h" +#include "opt_route.h" + +#include <sys/cdefs.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/rmlock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/refcount.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/kernel.h> +#include <sys/epoch.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <net/route.h> +#include <net/route/route_ctl.h> +#include <net/route/route_var.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet/in_fib.h> + +#include <net/route/nhop_utils.h> +#include <net/route/nhop.h> +#include <net/route/nhop_var.h> +#include <net/route/nhgrp_var.h> + +/* + * This file contains the supporting functions for creating multipath groups + * and compiling their dataplane parts. 
+ */ + +/* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */ +_Static_assert(MPF_MULTIPATH == NHF_MULTIPATH, + "MPF_MULTIPATH must be the same as NHF_MULTIPATH"); +/* Offset and size of flags field has to be the same for nhop/nhop groups */ +CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, nhg_flags); +/* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */ +CTASSERT(RIB_MAX_MPATH_WIDTH <= 64); + +static int wn_cmp(const void *a, const void *b); +static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops); + +static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl, + struct weightened_nhop *wn, int num_nhops, int *perror); +static void destroy_nhgrp(struct nhgrp_priv *nhg_priv); +static void destroy_nhgrp_epoch(epoch_context_t ctx); +static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv); +/* + * qsort() comparator for struct weightened_nhop: orders entries by weight + * ascending and, for equal weights, by nexthop pointer value. + * Returns -1, 0 or 1. + */ +static int +wn_cmp(const void *a, const void *b) +{ + const struct weightened_nhop *wa = a; + const struct weightened_nhop *wb = b; + + if (wa->weight > wb->weight) + return (1); + else if (wa->weight < wb->weight) + return (-1); + + /* Compare nexthops by pointer */ + if (wa->nh > wb->nh) + return (1); + else if (wa->nh < wb->nh) + return (-1); + else + return (0); +} + +/* + * Perform in-place sorting for array of nexthops in @wn. + * + * To avoid nh groups duplication, nexthops/weights in the + * @wn need to be ordered deterministically. + * As this sorting is needed only for the control plane functionality, + * there are no specific external requirements. + * + * Sort by weight first, to ease calculation of the slot sizes. + * Entries with the same weight are ordered by nexthop pointer (wn_cmp). + */ +static void +sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops) +{ + + qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp); +} + +/* + * Calculate minimum number of slots required to fit the existing + * set of weights in the common use case where weights are "easily" + * comparable. 
+ * Assumes @wn is sorted by weight ascending and each weight is > 0. + * Returns number of slots or 0 if precise calculation failed. + * Failure includes an empty @wn, a zero weight (reported instead of + * dividing by zero) and a slot count that does not fit in 32 bits. + * + * Some examples: + * note: (i, X) pair means (nhop=i, weight=X): + * (1, 1) (2, 2) -> 3 slots [1, 2, 2] + * (1, 100), (2, 200) -> 3 slots [1, 2, 2] + * (1, 100), (2, 200), (3, 400) -> 7 slots [1, 2, 2, 3, 3, 3, 3] + */ +static uint32_t +calc_min_mpath_slots_fast(const struct weightened_nhop *wn, size_t num_items) +{ + uint32_t i, last, xmin; + uint64_t total = 0; + + /* Nothing to size; also avoids reading wn[0] of an empty array */ + if (num_items == 0) + return (0); + + last = 0; + xmin = wn[0].weight; + for (i = 0; i < num_items; i++) { + total += wn[i].weight; + if ((wn[i].weight - last < xmin) && (wn[i].weight != last)) + xmin = wn[i].weight - last; + last = wn[i].weight; + } + /* + * xmin is the minimum unit of desired capacity. + * It can only be 0 when the smallest weight is 0: the loop above + * never stores a zero difference. Fail instead of dividing by zero. + */ + if (xmin == 0) + return (0); + if ((total % xmin) != 0) + return (0); + for (i = 0; i < num_items; i++) { + if ((wn[i].weight % xmin) != 0) + return (0); + } + + /* Do not let the cast below silently truncate the slot count */ + total /= xmin; + if ((total >> 32) != 0) + return (0); + + return ((uint32_t)total); +} + +/* + * Calculate minimum number of slots required to fit the existing + * set of weights while maintaining weight coefficients. + * + * Assume @wn is sorted by weight ascending and each weight is > 0. + * + * Tries to find simple precise solution first and falls back to + * RIB_MAX_MPATH_WIDTH in case of any failure, or when the precise + * solution needs more than RIB_MAX_MPATH_WIDTH slots. + */ +static uint32_t +calc_min_mpath_slots(const struct weightened_nhop *wn, size_t num_items) +{ + uint32_t v; + + v = calc_min_mpath_slots_fast(wn, num_items); + if ((v == 0) || (v > RIB_MAX_MPATH_WIDTH)) + v = RIB_MAX_MPATH_WIDTH; + + return (v); +} + +/* + * Nexthop group data consists of + * 1) dataplane part, with nhgrp_object as a header followed by an + * arbitrary number of nexthop pointers. + * 2) control plane part, with nhgrp_priv as a header, followed by + * an arbitrary number of 'struct weightened_nhop' objects. + * + * Given nexthop groups are (mostly) immutable, allocate all data + * in one go. 
+ * + * Returns the size in bytes of one allocation holding a group with + * @nhg_size dataplane slots and @num_nhops control plane entries. + */ +__noinline static size_t +get_nhgrp_alloc_size(uint32_t nhg_size, uint32_t num_nhops) +{ + size_t sz; + + sz = sizeof(struct nhgrp_object); + sz += nhg_size * sizeof(struct nhop_object *); + sz += sizeof(struct nhgrp_priv); + sz += num_nhops * sizeof(struct weightened_nhop); + return (sz); +} + +/* + * Compile actual list of nexthops to be used by datapath from + * the weightened nexthops in @x into the nexthop group of @dst_priv, + * filling at most @num_slots entries of its nhops[] array. + * + * For example, compiling control plane list of 2 nexthops + * [(200, A), (100, B)] would result in the datapath array + * [A, A, B] + * + * Every nexthop receives weight * remaining_slots / remaining_sum slots + * (integer division), where both remaining values shrink as nexthops + * are processed. A nexthop with a comparatively small weight can + * therefore end up with no slot at all. + */ +static void +compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x, + uint32_t num_slots) +{ + struct nhgrp_object *dst; + int i, slot_idx, remaining_slots; + uint64_t remaining_sum, nh_weight, nh_slots; + + slot_idx = 0; + dst = dst_priv->nhg; + /* Calculate sum of all weights */ + remaining_sum = 0; + for (i = 0; i < dst_priv->nhg_nh_count; i++) + remaining_sum += x[i].weight; + remaining_slots = num_slots; + DPRINTF("O: %u/%u", (uint32_t)remaining_sum, remaining_slots); + for (i = 0; i < dst_priv->nhg_nh_count; i++) { + /* + * Calculate number of slots for the current nexthop. + * No slots are handed out once the remaining weight sum is 0. + */ + if (remaining_sum > 0) { + nh_weight = (uint64_t)x[i].weight; + nh_slots = (nh_weight * remaining_slots / remaining_sum); + } else + nh_slots = 0; + + remaining_sum -= x[i].weight; + remaining_slots -= nh_slots; + + DPRINTF(" OO[%d]: %u/%u curr=%d slot_idx=%d", i, + (uint32_t)remaining_sum, remaining_slots, + (int)nh_slots, slot_idx); + + KASSERT((slot_idx + nh_slots <= num_slots), + ("index overflow during nhg compilation")); + while (nh_slots-- > 0) + dst->nhops[slot_idx++] = x[i].nh; + } +} + +/* + * Allocates new nexthop group for the list of weightened nexthops. + * Assume sorted list. + * Does NOT reference any nexthops in the group. + * Returns group with refcount=1 or NULL. 
+ */ +static struct nhgrp_priv * +alloc_nhgrp(struct weightened_nhop *wn, int num_nhops) +{ + uint32_t nhgrp_size; + int flags = M_NOWAIT; + struct nhgrp_object *nhg; + struct nhgrp_priv *nhg_priv; + + nhgrp_size = calc_min_mpath_slots(wn, num_nhops); + if (nhgrp_size == 0) { + /* + * Zero weights, abort. + * NOTE(review): calc_min_mpath_slots() as written substitutes + * RIB_MAX_MPATH_WIDTH for a 0 result, so this branch looks + * unreachable -- confirm. + */ + return (NULL); + } + + size_t sz = get_nhgrp_alloc_size(nhgrp_size, num_nhops); + nhg = malloc(sz, M_NHOP, flags | M_ZERO); + if (nhg == NULL) { + return (NULL); + } + + /* Has to be the first to make NHGRP_PRIV() work */ + nhg->nhg_size = nhgrp_size; + DPRINTF("new mpath group: num_nhops: %u", (uint32_t)nhgrp_size); + nhg->nhg_flags = MPF_MULTIPATH; + + nhg_priv = NHGRP_PRIV(nhg); + nhg_priv->nhg_nh_count = num_nhops; + refcount_init(&nhg_priv->nhg_refcount, 1); + + /* Please see nhgrp_free() comments on the initial value */ + refcount_init(&nhg_priv->nhg_linked, 2); + + nhg_priv->nhg = nhg; + memcpy(&nhg_priv->nhg_nh_weights[0], wn, + num_nhops * sizeof(struct weightened_nhop)); + + compile_nhgrp(nhg_priv, wn, nhg->nhg_size); + + return (nhg_priv); +} +/* + * Drops one reference on @nhg. Nothing else happens unless it was the + * last one. + */ +void +nhgrp_free(struct nhgrp_object *nhg) +{ + struct nhgrp_priv *nhg_priv; + struct nh_control *ctl; + struct epoch_tracker et; + + nhg_priv = NHGRP_PRIV(nhg); + + if (!refcount_release(&nhg_priv->nhg_refcount)) + return; + + /* + * group objects don't have an explicit lock attached to it. + * As groups are reclaimed based on reference count, it is possible + * that some groups will persist after vnet destruction callback + * called. Given that, handle scenario with nhgrp_free() being + * called either after or simultaneously with nhgrp_ctl_unlink_all() + * by using another reference counter: nhg_linked. + * + * There are only 2 places, where nhg_linked can be decreased: + * rib destroy (nhgrp_ctl_unlink_all) and this function. + * nhg_linked can never be increased. + * + * Hence, use initial value of 2 to make use of + * refcount_release_if_not_last(). 
+ * + * There can be two scenarios when calling this function: + * + * 1) nhg_linked value is 2. This means that either + * nhgrp_ctl_unlink_all() has not been called OR it is running, + * but we are guaranteed that nh_control won't be freed in + * this epoch. Hence, the group can be safely unlinked. + * + * 2) nhg_linked value is 1. In that case, nhgrp_ctl_unlink_all() + * has been called and nhgrp unlink can be skipped. + */ + + NET_EPOCH_ENTER(et); + if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) { + ctl = nhg_priv->nh_control; + if (unlink_nhgrp(ctl, nhg_priv) == NULL) { + /* + * Do not try to reclaim: the group is left + * allocated and no destruction is scheduled. + */ + DPRINTF("Failed to unlink nexhop group %p", nhg_priv); + NET_EPOCH_EXIT(et); + return; + } + } + NET_EPOCH_EXIT(et); + + epoch_call(net_epoch_preempt, destroy_nhgrp_epoch, + &nhg_priv->nhg_epoch_ctx); +} + +/* + * Destroys all local resources belonging to @nhg_priv. + * This frees the nhg_priv->nhg allocation only; references on the + * group's nexthops are not touched here. + */ +__noinline static void +destroy_nhgrp_int(struct nhgrp_priv *nhg_priv) +{ + + free(nhg_priv->nhg, M_NHOP); +} +/* + * Final destructor: asserts that the group has no references left and + * that its index is 0, drops the nexthop references held by the group + * and frees the group memory. + */ +__noinline static void +destroy_nhgrp(struct nhgrp_priv *nhg_priv) +{ + + KASSERT((nhg_priv->nhg_refcount == 0), ("nhg_refcount != 0")); + + DPRINTF("DEL MPATH %p", nhg_priv); + + KASSERT((nhg_priv->nhg_idx == 0), ("gr_idx != 0")); + + free_nhgrp_nhops(nhg_priv); + + destroy_nhgrp_int(nhg_priv); +} + +/* + * Epoch callback indicating group is safe to destroy + * @ctx is the nhg_epoch_ctx member embedded in the group being destroyed. + */ +static void +destroy_nhgrp_epoch(epoch_context_t ctx) +{ + struct nhgrp_priv *nhg_priv; + + nhg_priv = __containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx); + + destroy_nhgrp(nhg_priv); +} +/* + * Takes a reference on every nexthop listed in the group. + * Returns true on success. If some nexthop cannot be referenced, the + * references taken so far are dropped again and false is returned. + */ +static bool +ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv) +{ + + for (int i = 0; i < nhg_priv->nhg_nh_count; i++) { + if (nhop_try_ref_object(nhg_priv->nhg_nh_weights[i].nh) != 0) + continue; + + /* + * Failed to ref the nexthop, b/c it's deleted. + * Need to rollback references back. 
+ * Only entries [0, i) hold a reference at this point. + */ + for (int j = 0; j < i; j++) + nhop_free(nhg_priv->nhg_nh_weights[j].nh); + return (false); + } + + return (true); +} +/* + * Drops one reference on every nexthop listed in the group. + */ +static void +free_nhgrp_nhops(struct nhgrp_priv *nhg_priv) +{ + + for (int i = 0; i < nhg_priv->nhg_nh_count; i++) + nhop_free(nhg_priv->nhg_nh_weights[i].nh); +} + +/* + * Creates or looks up an existing nexthop group based on @wn and @num_nhops. + * Note that @wn gets sorted in place. + * + * Returns referenced nhop group or NULL, passing error code in @perror. + * Error codes set by the checks below: E2BIG (more than + * RIB_MAX_MPATH_WIDTH nexthops), ENOMEM (allocation failure), EEXIST + * (duplicate nexthop detected) and EAGAIN (a nexthop could not be + * referenced). + */ +struct nhgrp_priv * +get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops, + int *perror) +{ + struct nhgrp_priv *key, *nhg_priv; + + if (num_nhops > RIB_MAX_MPATH_WIDTH) { + *perror = E2BIG; + return (NULL); + } + + if (ctl->gr_head.hash_size == 0) { + /* First multipath request. Bootstrap mpath datastructures. */ + if (nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) { + *perror = ENOMEM; + return (NULL); + } + } + + /* + * Sort nexthops & check there are no duplicates. + * Only neighbouring entries of the sorted array are compared. + * NOTE(review): the array is ordered by (weight, pointer), so the + * same nexthop listed with two different weights does not have to + * end up adjacent -- confirm such input is rejected elsewhere. + */ + sort_weightened_nhops(wn, num_nhops); + uint32_t last_id = 0; + for (int i = 0; i < num_nhops; i++) { + if (wn[i].nh->nh_priv->nh_idx == last_id) { + *perror = EEXIST; + return (NULL); + } + last_id = wn[i].nh->nh_priv->nh_idx; + } + + if ((key = alloc_nhgrp(wn, num_nhops)) == NULL) { + *perror = ENOMEM; + return (NULL); + } + + nhg_priv = find_nhgrp(ctl, key); + if (nhg_priv != NULL) { + /* + * Free originally-created group. As it hasn't been linked + * and the dependent nexthops haven't been referenced, just free + * the group. + */ + destroy_nhgrp_int(key); + *perror = 0; + return (nhg_priv); + } else { + /* No existing group, try to link the new one */ + if (!ref_nhgrp_nhops(key)) { + /* + * Some of the nexthops have been scheduled for deletion. + * As the group hasn't been linked / no nexthops have been + * referenced, call the final destructor immediately. + */ + destroy_nhgrp_int(key); + *perror = EAGAIN; + return (NULL); + } + if (link_nhgrp(ctl, key) == 0) { + /* Unable to allocate index? 
*/ + *perror = EAGAIN; + /* + * The group was never linked: drop the nexthop + * references taken by ref_nhgrp_nhops() and free it. + * It must not be returned to the caller. + */ + free_nhgrp_nhops(key); + destroy_nhgrp_int(key); + return (NULL); + } + *perror = 0; + return (key); + } + + /* NOTREACHED */ +} + +/* + * Appends one or more nexthops denoted by @wn to the nexthop group @gr_orig. + * + * Returns referenced nexthop group or NULL. In the latter case, @perror is + * filled with an error code. + * Note that the function does NOT care if the new nexthops already exist + * in the @gr_orig. As a result, they will be added, resulting in the + * same nexthop being present multiple times in the new group. + */ +static struct nhgrp_priv * +append_nhops(struct nh_control *ctl, const struct nhgrp_object *gr_orig, + struct weightened_nhop *wn, int num_nhops, int *perror) +{ + char storage[64]; + struct weightened_nhop *pnhops; + struct nhgrp_priv *nhg_priv; + const struct nhgrp_priv *src_priv; + size_t sz; + int curr_nhops; + + src_priv = NHGRP_PRIV_CONST(gr_orig); + curr_nhops = src_priv->nhg_nh_count; + + *perror = 0; + + sz = (src_priv->nhg_nh_count + num_nhops) * (sizeof(struct weightened_nhop)); + /* optimize for <= 4 paths, each path=16 bytes */ + if (sz <= sizeof(storage)) + pnhops = (struct weightened_nhop *)&storage[0]; + else { + pnhops = malloc(sz, M_TEMP, M_NOWAIT); + if (pnhops == NULL) { + *perror = ENOMEM; + return (NULL); + } + } + + /* Copy nhops from original group first */ + memcpy(pnhops, src_priv->nhg_nh_weights, + curr_nhops * sizeof(struct weightened_nhop)); + memcpy(&pnhops[curr_nhops], wn, num_nhops * sizeof(struct weightened_nhop)); + curr_nhops += num_nhops; + + /* On failure get_nhgrp() returns NULL with the reason stored in @perror */ + nhg_priv = get_nhgrp(ctl, pnhops, curr_nhops, perror); + + if (pnhops != (struct weightened_nhop *)&storage[0]) + free(pnhops, M_TEMP); + + return (nhg_priv); +} + + +/* + * Creates/finds nexthop group based on @wn and @num_nhops. + * Returns 0 on success with referenced group in @rnd, or + * errno. + * + * If the error is EAGAIN, then the operation can be retried.
+ */ +int +nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops, + struct route_nhop_data *rnd) +{ + struct nh_control *ctl = rh->nh_control; + struct nhgrp_priv *nhg_priv; + int error; + + nhg_priv = get_nhgrp(ctl, wn, num_nhops, &error); + if (nhg_priv != NULL) + rnd->rnd_nhgrp = nhg_priv->nhg; + rnd->rnd_weight = 0; + + return (error); +} + +/* + * Creates new nexthop group based on @src group with the nexthops for which + * @flt_func returns non-zero removed. + * Returns 0 on success or errno on failure. The result is stored in @rnd: + * NULL if no nexthops are left, a single nexthop if exactly one is left, + * or a nexthop group otherwise. + */ +int +nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src, + nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd) +{ + char storage[64]; + struct nh_control *ctl = rh->nh_control; + struct weightened_nhop *pnhops; + const struct nhgrp_priv *mp_priv, *src_priv; + size_t sz; + int error, i, num_nhops; + + src_priv = NHGRP_PRIV_CONST(src); + + sz = src_priv->nhg_nh_count * (sizeof(struct weightened_nhop)); + /* optimize for <= 4 paths, each path=16 bytes */ + if (sz <= sizeof(storage)) + pnhops = (struct weightened_nhop *)&storage[0]; + else { + if ((pnhops = malloc(sz, M_TEMP, M_NOWAIT)) == NULL) + return (ENOMEM); + } + + /* Filter nexthops */ + error = 0; + num_nhops = 0; + for (i = 0; i < src_priv->nhg_nh_count; i++) { + if (flt_func(src_priv->nhg_nh_weights[i].nh, flt_data)) + continue; + memcpy(&pnhops[num_nhops++], &src_priv->nhg_nh_weights[i], + sizeof(struct weightened_nhop)); + } + + if (num_nhops == 0) { + rnd->rnd_nhgrp = NULL; + rnd->rnd_weight = 0; + } else if (num_nhops == 1) { + rnd->rnd_nhop = pnhops[0].nh; + rnd->rnd_weight = pnhops[0].weight; + if (nhop_try_ref_object(rnd->rnd_nhop) == 0) + error = EAGAIN; + } else { + mp_priv = get_nhgrp(ctl, pnhops, num_nhops, &error); + if (mp_priv != NULL) + rnd->rnd_nhgrp = mp_priv->nhg; + rnd->rnd_weight = 0; + } + + if (pnhops != (struct weightened_nhop *)&storage[0]) + free(pnhops, M_TEMP); + + return (error); +} + +/* + *
Creates new multipath group based on existing group/nhop in @rnd_orig and + * to-be-added nhop in @rnd_add. + * Returns 0 on success and stores result in @rnd_new. + */ +int +nhgrp_get_addition_group(struct rib_head *rh, struct route_nhop_data *rnd_orig, + struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_new) +{ + struct nh_control *ctl = rh->nh_control; + struct nhgrp_priv *nhg_priv; + struct weightened_nhop wn[2]; + int error; + + if (rnd_orig->rnd_nhop == NULL) { + /* No paths to add to, just reference current nhop */ + *rnd_new = *rnd_add; + if (nhop_try_ref_object(rnd_new->rnd_nhop) == 0) + return (EAGAIN); + return (0); + } + + wn[0].nh = rnd_add->rnd_nhop; + wn[0].weight = rnd_add->rnd_weight; + + if (!NH_IS_NHGRP(rnd_orig->rnd_nhop)) { + /* Simple merge of 2 non-multipath nexthops */ + wn[1].nh = rnd_orig->rnd_nhop; + wn[1].weight = rnd_orig->rnd_weight; + nhg_priv = get_nhgrp(ctl, wn, 2, &error); + } else { + /* Get new nhop group with @rnd_add->rnd_nhop as an additional nhop */ + nhg_priv = append_nhops(ctl, rnd_orig->rnd_nhgrp, &wn[0], 1, + &error); + } + + if (nhg_priv == NULL) + return (error); + rnd_new->rnd_nhgrp = nhg_priv->nhg; + rnd_new->rnd_weight = 0; + + return (0); +} + +/* + * Returns pointer to array of nexthops with weights for + * given @nhg. Stores number of items in the array into @pnum_nhops.
+ */ +struct weightened_nhop * +nhgrp_get_nhops(struct nhgrp_object *nhg, uint32_t *pnum_nhops) +{ + struct nhgrp_priv *nhg_priv; + + KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath")); + + nhg_priv = NHGRP_PRIV(nhg); + *pnum_nhops = nhg_priv->nhg_nh_count; + + return (nhg_priv->nhg_nh_weights); +} + +__noinline static int +dump_nhgrp_entry(struct rib_head *rh, const struct nhgrp_priv *nhg_priv, + char *buffer, size_t buffer_size, struct sysctl_req *w) +{ + struct rt_msghdr *rtm; + struct nhgrp_external *nhge; + struct nhgrp_container *nhgc; + const struct nhgrp_object *nhg; + struct nhgrp_nhop_external *ext; + int error; + size_t sz; + + nhg = nhg_priv->nhg; + + sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external); + /* controlplane nexthops */ + sz += sizeof(struct nhgrp_container); + sz += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count; + /* dataplane nexthops */ + sz += sizeof(struct nhgrp_container); + sz += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size; + + KASSERT(sz <= buffer_size, ("increase nhgrp buffer size")); + + bzero(buffer, sz); + + rtm = (struct rt_msghdr *)buffer; + rtm->rtm_msglen = sz; + rtm->rtm_version = RTM_VERSION; + rtm->rtm_type = RTM_GET; + + nhge = (struct nhgrp_external *)(rtm + 1); + + nhge->nhg_idx = nhg_priv->nhg_idx; + nhge->nhg_refcount = nhg_priv->nhg_refcount; + + /* fill in control plane nexthops firs */ + nhgc = (struct nhgrp_container *)(nhge + 1); + nhgc->nhgc_type = NHG_C_TYPE_CNHOPS; + nhgc->nhgc_subtype = 0; + nhgc->nhgc_len = sizeof(struct nhgrp_container); + nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count; + nhgc->nhgc_count = nhg_priv->nhg_nh_count; + + ext = (struct nhgrp_nhop_external *)(nhgc + 1); + for (int i = 0; i < nhg_priv->nhg_nh_count; i++) { + ext[i].nh_idx = nhg_priv->nhg_nh_weights[i].nh->nh_priv->nh_idx; + ext[i].nh_weight = nhg_priv->nhg_nh_weights[i].weight; + } + + /* fill in dataplane nexthops */ + nhgc = (struct 
nhgrp_container *)(&ext[nhg_priv->nhg_nh_count]); + nhgc->nhgc_type = NHG_C_TYPE_DNHOPS; + nhgc->nhgc_subtype = 0; + nhgc->nhgc_len = sizeof(struct nhgrp_container); + nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size; + nhgc->nhgc_count = nhg->nhg_size; + + ext = (struct nhgrp_nhop_external *)(nhgc + 1); + for (int i = 0; i < nhg->nhg_size; i++) { + ext[i].nh_idx = nhg->nhops[i]->nh_priv->nh_idx; + ext[i].nh_weight = 0; + } + + error = SYSCTL_OUT(w, buffer, sz); + + return (error); +} + +int +nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w) +{ + struct nh_control *ctl = rh->nh_control; + struct epoch_tracker et; + struct nhgrp_priv *nhg_priv; + char *buffer; + size_t sz; + int error = 0; + + if (ctl->gr_head.items_count == 0) + return (0); + + /* Calculate the maximum nhop group size in bytes */ + sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external); + sz += 2 * sizeof(struct nhgrp_container); + sz += 2 * sizeof(struct nhgrp_nhop_external) * RIB_MAX_MPATH_WIDTH; + buffer = malloc(sz, M_TEMP, M_WAITOK); + + NET_EPOCH_ENTER(et); + NHOPS_RLOCK(ctl); + CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) { + error = dump_nhgrp_entry(rh, nhg_priv, buffer, sz, w); + if (error != 0) + break; + } CHT_SLIST_FOREACH_END; + NHOPS_RUNLOCK(ctl); + NET_EPOCH_EXIT(et); + + free(buffer, M_TEMP); + + return (error); +} diff --git a/sys/net/route/nhgrp_var.h b/sys/net/route/nhgrp_var.h new file mode 100644 index 000000000000..ba90a3feedc8 --- /dev/null +++ b/sys/net/route/nhgrp_var.h @@ -0,0 +1,72 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This header file contains private definitions for the nexthop groups. + * + * Header is not intended to be included by the code external to the + * routing subsystem. 
+ */ + +#ifndef _NET_ROUTE_NHGRP_VAR_H_ +#define _NET_ROUTE_NHGRP_VAR_H_ + +/* nhgrp hash definition */ +/* produce hash value for an object */ +#define mpath_hash_obj(_obj) (hash_nhgrp(_obj)) +/* compare two objects */ +#define mpath_cmp(_one, _two) (cmp_nhgrp(_one, _two)) +/* next object accessor */ +#define mpath_next(_obj) (_obj)->nhg_priv_next + +struct nhgrp_priv { + uint32_t nhg_idx; + uint8_t nhg_nh_count; /* number of items in nh_weights */ + uint8_t nhg_spare[3]; + u_int nhg_refcount; /* use refcount */ + u_int nhg_linked; /* refcount(9), == 2 if linked to the list */ + struct nh_control *nh_control; /* parent control structure */ + struct nhgrp_priv *nhg_priv_next; + struct nhgrp_object *nhg; + struct epoch_context nhg_epoch_ctx; /* epoch data for nhop */ + struct weightened_nhop nhg_nh_weights[0]; +}; + +#define _NHGRP_PRIV(_src) (&(_src)->nhops[(_src)->nhg_size]) +#define NHGRP_PRIV(_src) ((struct nhgrp_priv *)_NHGRP_PRIV(_src)) +#define NHGRP_PRIV_CONST(_src) ((const struct nhgrp_priv *)_NHGRP_PRIV(_src)) + +/* nhgrp.c */ +bool nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags); +struct nhgrp_priv *find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key); +int link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv); +struct nhgrp_priv *unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key); + +#endif + diff --git a/sys/net/route/nhop.c b/sys/net/route/nhop.c index 4b9a79ffbf20..0db47db9916e 100644 --- a/sys/net/route/nhop.c +++ b/sys/net/route/nhop.c @@ -64,7 +64,7 @@ __FBSDID("$FreeBSD$"); * is backed by the bitmask array. 
*/ -static MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data"); +MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data"); /* Hash management functions */ @@ -112,6 +112,9 @@ destroy_ctl(struct nh_control *ctl) NHOPS_LOCK_DESTROY(ctl); free(ctl->nh_head.ptr, M_NHOP); free(ctl->nh_idx_head.idx, M_NHOP); +#ifdef ROUTE_MPATH + nhgrp_ctl_free(ctl); +#endif free(ctl, M_NHOP); } @@ -154,6 +157,9 @@ nhops_destroy_rib(struct rib_head *rh) DPRINTF("Marking nhop %u unlinked", nh_priv->nh_idx); refcount_release(&nh_priv->nh_linked); } CHT_SLIST_FOREACH_END; +#ifdef ROUTE_MPATH + nhgrp_ctl_unlink_all(ctl); +#endif NHOPS_WUNLOCK(ctl); /* diff --git a/sys/net/route/nhop.h b/sys/net/route/nhop.h index 1f6aff134c2d..3944d8946b07 100644 --- a/sys/net/route/nhop.h +++ b/sys/net/route/nhop.h @@ -155,7 +155,7 @@ struct nhop_object { */ #define NH_IS_VALID(_nh) RT_LINK_IS_UP((_nh)->nh_ifp) -#define NH_IS_MULTIPATH(_nh) ((_nh)->nh_flags & NHF_MULTIPATH) +#define NH_IS_NHGRP(_nh) ((_nh)->nh_flags & NHF_MULTIPATH) #define RT_GATEWAY(_rt) ((struct sockaddr *)&(_rt)->rt_nhop->gw4_sa) #define RT_GATEWAY_CONST(_rt) ((const struct sockaddr *)&(_rt)->rt_nhop->gw4_sa) @@ -166,6 +166,11 @@ struct nhop_object { _nh = NULL; \ } while (0) +struct weightened_nhop { + struct nhop_object *nh; + uint32_t weight; +}; + void nhop_free(struct nhop_object *nh); struct sysctl_req; @@ -209,16 +214,34 @@ struct nhop_addrs { uint16_t src_sa_off; /* offset of src address SA */ }; -struct mpath_nhop_external { +#define NHG_C_TYPE_CNHOPS 0x1 /* Control plane nhops list */ +#define NHG_C_TYPE_DNHOPS 0x2 /* Dataplane nhops list */ +struct nhgrp_container { + uint32_t nhgc_len; /* container length */ + uint16_t nhgc_count; /* number of items */ + uint8_t nhgc_type; /* container type */ + uint8_t nhgc_subtype; /* container subtype */ +}; + +struct nhgrp_nhop_external { uint32_t nh_idx; uint32_t nh_weight; }; -struct mpath_external { - uint32_t mp_idx; - uint32_t mp_refcount; - uint32_t mp_nh_count; - uint32_t mp_group_size; +/* + 
* Layout: + * - nhgrp_external + * - nhgrp_container (control plane nhops list) + * - nhgrp_nhop_external + * - nhgrp_nhop_external + * .. + * - nhgrp_container (dataplane nhops list) + * - nhgrp_nhop_external + * - nhgrp_nhop_external + */ +struct nhgrp_external { + uint32_t nhg_idx; /* Nexthop group index */ + uint32_t nhg_refcount; /* number of references */ }; #endif diff --git a/sys/net/route/nhop_ctl.c b/sys/net/route/nhop_ctl.c index b9ac4d63218d..150ae5c4be58 100644 --- a/sys/net/route/nhop_ctl.c +++ b/sys/net/route/nhop_ctl.c @@ -695,7 +695,14 @@ void nhop_free_any(struct nhop_object *nh) { +#ifdef ROUTE_MPATH + if (!NH_IS_NHGRP(nh)) + nhop_free(nh); + else + nhgrp_free((struct nhgrp_object *)nh); +#else nhop_free(nh); +#endif } /* Helper functions */ diff --git a/sys/net/route/nhop_var.h b/sys/net/route/nhop_var.h index 220b6c9a7634..6e1aba670e3c 100644 --- a/sys/net/route/nhop_var.h +++ b/sys/net/route/nhop_var.h @@ -37,6 +37,8 @@ #ifndef _NET_ROUTE_NHOP_VAR_H_ #define _NET_ROUTE_NHOP_VAR_H_ +MALLOC_DECLARE(M_NHOP); + /* define nhop hash table */ struct nhop_priv; CHT_SLIST_DEFINE(nhops, struct nhop_priv); @@ -47,9 +49,15 @@ CHT_SLIST_DEFINE(nhops, struct nhop_priv); /* next object accessor */ #define nhops_next(_obj) (_obj)->nh_next +/* define multipath hash table */ +struct nhgrp_priv; +CHT_SLIST_DEFINE(nhgroups, struct nhgrp_priv); + struct nh_control { struct nhops_head nh_head; /* hash table head */ struct bitmask_head nh_idx_head; /* nhop index head */ + struct nhgroups_head gr_head; /* nhgrp hash table head */ + struct bitmask_head gr_idx_head; /* nhgrp index head */ struct rwlock ctl_lock; /* overall ctl lock */ struct rib_head *ctl_rh; /* pointer back to rnh */ struct epoch_context ctl_epoch_ctx; /* epoch ctl helper */ @@ -80,7 +88,8 @@ struct nhop_priv { struct epoch_context nh_epoch_ctx; /* epoch data for nhop */ }; -#define NH_IS_PINNED(_nh) ((_nh)->nh_priv->rt_flags & RTF_PINNED) +#define NH_IS_PINNED(_nh) ((!NH_IS_NHGRP(_nh)) && \ + 
((_nh)->nh_priv->rt_flags & RTF_PINNED)) /* nhop.c */ struct nhop_priv *find_nhop(struct nh_control *ctl, diff --git a/sys/net/route/route_ctl.c b/sys/net/route/route_ctl.c index 37c23e2cb1cb..f720d08f1f52 100644 --- a/sys/net/route/route_ctl.c +++ b/sys/net/route/route_ctl.c @@ -29,7 +29,7 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" -#include "opt_mpath.h" +#include "opt_route.h" #include <sys/param.h> #include <sys/systm.h> @@ -83,9 +83,6 @@ static int del_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc); static int change_route(struct rib_head *rnh, struct rt_addrinfo *info, struct route_nhop_data *nhd_orig, struct rib_cmd_info *rc); -static int change_route_nhop(struct rib_head *rnh, struct rtentry *rt, - struct rt_addrinfo *info, struct route_nhop_data *rnd, - struct rib_cmd_info *rc); static int rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc); @@ -94,6 +91,20 @@ static void rib_notify(struct rib_head *rnh, enum rib_subscription_type type, struct rib_cmd_info *rc); static void destroy_subscription_epoch(epoch_context_t ctx); +static bool rib_can_multipath(struct rib_head *rh); + +/* Per-vnet multipath routing configuration */ +SYSCTL_DECL(_net_route); +#define V_rib_route_multipath VNET(rib_route_multipath) +#ifdef ROUTE_MPATH +#define _MP_FLAGS CTLFLAG_RW +#else +#define _MP_FLAGS CTLFLAG_RD +#endif +VNET_DEFINE(u_int, rib_route_multipath) = 0; +SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET, + &VNET_NAME(rib_route_multipath), 0, "Enable route multipath"); +#undef _MP_FLAGS /* Routing table UMA zone */ VNET_DEFINE_STATIC(uma_zone_t, rtzone); @@ -128,7 +139,7 @@ destroy_rtentry(struct rtentry *rt) CURVNET_SET(nhop_get_vnet(rt->rt_nhop)); /* Unreference nexthop */ - nhop_free(rt->rt_nhop); + nhop_free_any(rt->rt_nhop); uma_zfree(V_rtzone, rt); @@ -175,6 +186,41 @@ get_rnh(uint32_t fibnum, const struct rt_addrinfo *info) return (rnh); } +#ifdef 
ROUTE_MPATH +static bool +rib_can_multipath(struct rib_head *rh) +{ + int result; + + CURVNET_SET(rh->rib_vnet); + result = !!V_rib_route_multipath; + CURVNET_RESTORE(); + + return (result); +} + +/* + * Check is nhop is multipath-eligible. + * Avoid nhops without gateways and redirects. + * + * Returns 1 for multipath-eligible nexthop, + * 0 otherwise. + */ +bool +nhop_can_multipath(const struct nhop_object *nh) +{ + + if ((nh->nh_flags & NHF_MULTIPATH) != 0) + return (1); + if ((nh->nh_flags & NHF_GATEWAY) == 0) + return (0); + if ((nh->nh_flags & NHF_REDIRECT) != 0) + return (0); + + return (1); +} +#endif + static int get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight) { @@ -206,7 +252,7 @@ rt_set_expire_info(struct rtentry *rt, const struct rt_addrinfo *info) * * Returns true if matches, false otherwise. */ -static bool +bool match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw) { @@ -461,7 +507,7 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc) { struct nhop_object *nh_orig; - struct route_nhop_data rnd; + struct route_nhop_data rnd_orig, rnd_add; struct nhop_object *nh; struct rtentry *rt, *rt_orig; int error; @@ -470,32 +516,19 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info, if (error != 0) return (error); - rnd.rnd_nhop = rt->rt_nhop; - rnd.rnd_weight = rt->rt_weight; + rnd_add.rnd_nhop = rt->rt_nhop; + rnd_add.rnd_weight = rt->rt_weight; nh = rt->rt_nhop; RIB_WLOCK(rnh); -#ifdef RADIX_MPATH - struct sockaddr *netmask; - netmask = info->rti_info[RTAX_NETMASK]; - /* do not permit exactly the same dst/mask/gw pair */ - if (rt_mpath_capable(rnh) && - rt_mpath_conflict(rnh, rt, netmask)) { - RIB_WUNLOCK(rnh); - - nhop_free(nh); - uma_zfree(V_rtzone, rt); - return (EEXIST); - } -#endif - error = add_route_nhop(rnh, rt, info, &rnd, rc); + error = add_route_nhop(rnh, rt, info, &rnd_add, rc); if (error == 0) { RIB_WUNLOCK(rnh); return (0); } /* addition failed. 
Lookup prefix in the rib to determine the cause */ - rt_orig = lookup_prefix(rnh, info, &rnd); + rt_orig = lookup_prefix(rnh, info, &rnd_orig); if (rt_orig == NULL) { /* No prefix -> rnh_addaddr() failed to allocate memory */ RIB_WUNLOCK(rnh); @@ -505,11 +538,11 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info, } /* We have existing route in the RIB. */ - nh_orig = rnd.rnd_nhop; + nh_orig = rnd_orig.rnd_nhop; /* Check if new route has higher preference */ if (can_override_nhop(info, nh_orig) > 0) { /* Update nexthop to the new route */ - change_route_nhop(rnh, rt_orig, info, &rnd, rc); + change_route_nhop(rnh, rt_orig, info, &rnd_add, rc); RIB_WUNLOCK(rnh); uma_zfree(V_rtzone, rt); nhop_free(nh_orig); @@ -518,11 +551,26 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info, RIB_WUNLOCK(rnh); +#ifdef ROUTE_MPATH + if (rib_can_multipath(rnh) && nhop_can_multipath(rnd_add.rnd_nhop) && + nhop_can_multipath(rnd_orig.rnd_nhop)) + error = add_route_mpath(rnh, info, rt, &rnd_add, &rnd_orig, rc); + else +#endif /* Unable to add - another route with the same preference exists */ error = EEXIST; + /* + * ROUTE_MPATH disabled: failed to add route, free both nhop and rt. + * ROUTE_MPATH enabled: original nhop reference is unused in any case, + * free rt only if not _adding_ new route to rib (e.g. the case + * when initial lookup returned existing route, but then it got + * deleted prior to multipath group insertion, leading to a simple + * non-multipath add as a result). 
+ */ nhop_free(nh); - uma_zfree(V_rtzone, rt); + if ((error != 0) || rc->rc_cmd != RTM_ADD) + uma_zfree(V_rtzone, rt); return (error); } @@ -588,7 +636,13 @@ rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info return (ESRCH); nh = rt->rt_nhop; - +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh)) { + error = del_route_mpath(rnh, info, rt, + (struct nhgrp_object *)nh, rc); + return (error); + } +#endif error = check_info_match_nhop(info, rt, nh); if (error != 0) return (error); @@ -600,14 +654,6 @@ rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info * Remove the item from the tree and return it. * Complain if it is not there and do no more processing. */ -#ifdef RADIX_MPATH - info->rti_info[RTAX_GATEWAY] = &nh->gw_sa; - if (rt_mpath_capable(rnh)) { - rn = rt_mpath_unlink(rnh, info, rt, &error); - if (error != 0) - return (error); - } else -#endif rn = rnh->rnh_deladdr(info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], &rnh->head); if (rn == NULL) @@ -648,7 +694,18 @@ del_route(struct rib_head *rnh, struct rt_addrinfo *info, * If the caller wants it, then it can have it, * the entry will be deleted after the end of the current epoch. */ - rtfree(rc->rc_rt); + if (rc->rc_cmd == RTM_DELETE) + rtfree(rc->rc_rt); +#ifdef ROUTE_MPATH + else { + /* + * Deleting 1 path may result in RTM_CHANGE to + * a different mpath group/nhop. + * Free old mpath group. + */ + nhop_free_any(rc->rc_nh_old); + } +#endif return (0); } @@ -694,19 +751,6 @@ rib_change_route(uint32_t fibnum, struct rt_addrinfo *info, return (ESRCH); } -#ifdef RADIX_MPATH - /* - * If we got multipath routes, - * we require users to specify a matching RTAX_GATEWAY. 
- */ - if (rt_mpath_capable(rnh)) { - rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]); - if (rt == NULL) { - RIB_RUNLOCK(rnh); - return (ESRCH); - } - } -#endif rnd_orig.rnd_nhop = rt->rt_nhop; rnd_orig.rnd_weight = rt->rt_weight; @@ -722,18 +766,11 @@ rib_change_route(uint32_t fibnum, struct rt_addrinfo *info, } static int -change_route(struct rib_head *rnh, struct rt_addrinfo *info, - struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) +change_nhop(struct rib_head *rnh, struct rt_addrinfo *info, + struct nhop_object *nh_orig, struct nhop_object **nh_new) { - int error = 0; int free_ifa = 0; - struct nhop_object *nh, *nh_orig; - struct route_nhop_data rnd_new; - - nh = NULL; - nh_orig = rnd_orig->rnd_nhop; - if (nh_orig == NULL) - return (ESRCH); + int error; /* * New gateway could require new ifaddr, ifp; @@ -759,24 +796,101 @@ change_route(struct rib_head *rnh, struct rt_addrinfo *info, } } - error = nhop_create_from_nhop(rnh, nh_orig, info, &nh); + error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new); if (free_ifa) { ifa_free(info->rti_ifa); info->rti_ifa = NULL; } + + return (error); +} + +#ifdef ROUTE_MPATH +/* + * Changes a single path of the multipath route described by @rnd_orig. + * Finds the group member matching @info, creates an updated nexthop for it + * and switches the route to a nexthop group containing the updated path. + * + * Returns 0 on success or errno (ESRCH if no path matches @info). + */ +static int +change_mpath_route(struct rib_head *rnh, struct rt_addrinfo *info, + struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) +{ + int error = 0; + struct nhop_object *nh_orig, *nh_new; + struct route_nhop_data rnd_new; + + nh_orig = rnd_orig->rnd_nhop; + + struct weightened_nhop *wn = NULL, *wn_new; + uint32_t num_nhops; + + wn = nhgrp_get_nhops((struct nhgrp_object *)nh_orig, &num_nhops); + nh_orig = NULL; + for (int i = 0; i < num_nhops; i++) { + /* check_info_match_nhop() returns 0 when the nexthop matches */ + if (check_info_match_nhop(info, NULL, wn[i].nh) == 0) { + nh_orig = wn[i].nh; + break; + } + } + + if (nh_orig == NULL) + return (ESRCH); + + error = change_nhop(rnh, info, nh_orig, &nh_new); if (error != 0) return (error); - rnd_new.rnd_nhop = nh; - if (info->rti_mflags & RTV_WEIGHT) - rnd_new.rnd_weight = info->rti_rmx->rmx_weight; - else - rnd_new.rnd_weight
= rnd_orig->rnd_weight; + wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop), + M_TEMP, M_NOWAIT | M_ZERO); + if (wn_new == NULL) { + nhop_free(nh_new); + return (EAGAIN); + } + + memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop)); + for (int i = 0; i < num_nhops; i++) { + if (wn[i].nh == nh_orig) { + /* + * Update the private copy, never the array of the + * live group. Keep the current weight of the path + * unless @info overrides it. + */ + wn_new[i].nh = nh_new; + wn_new[i].weight = get_info_weight(info, wn[i].weight); + break; + } + } + + error = nhgrp_get_group(rnh, wn_new, num_nhops, &rnd_new); + nhop_free(nh_new); + free(wn_new, M_TEMP); + + if (error != 0) + return (error); error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc); return (error); } +#endif + +static int +change_route(struct rib_head *rnh, struct rt_addrinfo *info, + struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) +{ + int error = 0; + struct nhop_object *nh, *nh_orig; + struct route_nhop_data rnd_new; + + nh = NULL; + nh_orig = rnd_orig->rnd_nhop; + if (nh_orig == NULL) + return (ESRCH); + +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh_orig)) + return (change_mpath_route(rnh, info, rnd_orig, rc)); +#endif + + rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight); + error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop); + if (error != 0) + return (error); + error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc); + + return (error); +} /* * Insert @rt with nhop data from @rnd_new to @rnh. @@ -827,7 +941,7 @@ add_route_nhop(struct rib_head *rnh, struct rtentry *rt, * Conditionally set rt_expire if set in @info. * Returns 0 on success.
*/ -static int +int change_route_nhop(struct rib_head *rnh, struct rtentry *rt, struct rt_addrinfo *info, struct route_nhop_data *rnd, struct rib_cmd_info *rc) @@ -855,6 +969,8 @@ change_route_nhop(struct rib_head *rnh, struct rtentry *rt, rn = rnh->rnh_deladdr(ndst, netmask, &rnh->head); if (rn == NULL) return (ESRCH); + rt = RNTORT(rn); + rt->rte_flags &= ~RTF_UP; } /* Finalize notification */ @@ -989,7 +1105,6 @@ rt_checkdelroute(struct radix_node *rn, void *arg) info->rti_info[RTAX_DST] = rt_key(rt); info->rti_info[RTAX_NETMASK] = rt_mask(rt); - info->rti_info[RTAX_GATEWAY] = &rt->rt_nhop->gw_sa; error = rt_unlinkrte(di->rnh, info, &di->rc); @@ -1000,7 +1115,7 @@ rt_checkdelroute(struct radix_node *rn, void *arg) * XXX: Delayed notifications not implemented * for nexthop updates. */ - if (error == 0) { + if ((error == 0) && (di->rc.rc_cmd == RTM_DELETE)) { /* Add to the list and return */ rt->rt_chain = di->head; di->head = rt; @@ -1024,6 +1139,7 @@ rib_walk_del(u_int fibnum, int family, rt_filter_f_t *filter_f, void *arg, bool struct rib_head *rnh; struct rt_delinfo di; struct rtentry *rt; + struct nhop_object *nh; struct epoch_tracker et; rnh = rt_tables_get_rnh(fibnum, family); @@ -1049,18 +1165,31 @@ rib_walk_del(u_int fibnum, int family, rt_filter_f_t *filter_f, void *arg, bool rt = di.head; di.head = rt->rt_chain; rt->rt_chain = NULL; + nh = rt->rt_nhop; di.rc.rc_rt = rt; - di.rc.rc_nh_old = rt->rt_nhop; + di.rc.rc_nh_old = nh; rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc); /* TODO std rt -> rt_addrinfo export */ di.info.rti_info[RTAX_DST] = rt_key(rt); di.info.rti_info[RTAX_NETMASK] = rt_mask(rt); - if (report) - rt_routemsg(RTM_DELETE, rt, rt->rt_nhop->nh_ifp, 0, - fibnum); + if (report) { +#ifdef ROUTE_MPATH + struct nhgrp_object *nhg; + struct weightened_nhop *wn; + uint32_t num_nhops; + if (NH_IS_NHGRP(nh)) { + nhg = (struct nhgrp_object *)nh; + wn = nhgrp_get_nhops(nhg, &num_nhops); + for (int i = 0; i < num_nhops; i++) + rt_routemsg(RTM_DELETE, rt, + 
wn[i].nh->nh_ifp, 0, fibnum); + } else +#endif + rt_routemsg(RTM_DELETE, rt, nh->nh_ifp, 0, fibnum); + } rtfree(rt); } diff --git a/sys/net/route/route_ctl.h b/sys/net/route/route_ctl.h index fb6dda47b3ba..151771146e65 100644 --- a/sys/net/route/route_ctl.h +++ b/sys/net/route/route_ctl.h @@ -53,6 +53,10 @@ int rib_change_route(uint32_t fibnum, struct rt_addrinfo *info, int rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info, struct rib_cmd_info *rc); +typedef void route_notification_t(struct rib_cmd_info *rc, void *); +void rib_decompose_notification(struct rib_cmd_info *rc, + route_notification_t *cb, void *cbdata); + int rib_add_redirect(u_int fibnum, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp, int flags, int expire_sec); @@ -66,6 +70,20 @@ typedef void rt_setwarg_t(struct rib_head *, uint32_t, int, void *); void rt_foreach_fib_walk(int af, rt_setwarg_t *, rt_walktree_f_t *, void *); void rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg); +struct route_nhop_data; +const struct rtentry *rib_lookup_prefix(uint32_t fibnum, int family, + const struct sockaddr *dst, const struct sockaddr *netmask, + struct route_nhop_data *rnd); +const struct rtentry *rib_lookup_lpm(uint32_t fibnum, int family, + const struct sockaddr *dst, struct route_nhop_data *rnd); + +/* Multipath */ +struct nhgrp_object; +struct weightened_nhop; + +struct weightened_nhop *nhgrp_get_nhops(struct nhgrp_object *nhg, + uint32_t *pnum_nhops); + enum rib_subscription_type { RIB_NOTIFY_IMMEDIATE, RIB_NOTIFY_DELAYED diff --git a/sys/net/route/route_helpers.c b/sys/net/route/route_helpers.c index b5b45ef662cc..dfa573d23a66 100644 --- a/sys/net/route/route_helpers.c +++ b/sys/net/route/route_helpers.c @@ -131,3 +131,167 @@ rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, return (nh); } + +#ifdef ROUTE_MPATH +static void +decompose_change_notification(struct rib_cmd_info *rc, route_notification_t *cb, + 
void *cbdata) +{ + uint32_t num_old, num_new; + uint32_t nh_idx_old, nh_idx_new; + struct weightened_nhop *wn_old, *wn_new; + struct weightened_nhop tmp = { NULL, 0 }; + uint32_t idx_old = 0, idx_new = 0; + + struct rib_cmd_info rc_del = { .rc_cmd = RTM_DELETE, .rc_rt = rc->rc_rt }; + struct rib_cmd_info rc_add = { .rc_cmd = RTM_ADD, .rc_rt = rc->rc_rt }; + + if (NH_IS_NHGRP(rc->rc_nh_old)) { + wn_old = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_old); + } else { + tmp.nh = rc->rc_nh_old; + tmp.weight = rc->rc_nh_weight; + wn_old = &tmp; + num_old = 1; + } + if (NH_IS_NHGRP(rc->rc_nh_new)) { + wn_new = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_new); + } else { + tmp.nh = rc->rc_nh_new; + tmp.weight = rc->rc_nh_weight; + wn_new = &tmp; + num_new = 1; + } + + /* Use the fact that each @wn array is sorted */ + /* + * Want to convert into set of add and delete operations + * [1] -> [1, 2] = A{2} + * [2] -> [1, 2] = A{1} + * [1, 2, 4]->[1, 3, 4] = A{2}, D{3} + * [1, 2, 4]->[1, 4] = D{2} + * [1, 2, 4] -> [3, 4] = D{1}, C{2,3} OR C{1,3}, D{2} OR D{1},D{2},A{3} + * [1, 2] -> [3, 4] = + * + */ + idx_old = 0; + while ((idx_old < num_old) && (idx_new < num_new)) { + nh_idx_old = wn_old[idx_old].nh->nh_priv->nh_idx; + nh_idx_new = wn_new[idx_new].nh->nh_priv->nh_idx; + + if (nh_idx_old == nh_idx_new) { + if (wn_old[idx_old].weight != wn_new[idx_new].weight) { + /* Update weight by providing del/add notifications */ + rc_del.rc_nh_old = wn_old[idx_old].nh; + rc_del.rc_nh_weight = wn_old[idx_old].weight; + cb(&rc_del, cbdata); + + rc_add.rc_nh_new = wn_new[idx_new].nh; + rc_add.rc_nh_weight = wn_new[idx_new].weight; + cb(&rc_add, cbdata); + } + idx_old++; + idx_new++; + } else if (nh_idx_old < nh_idx_new) { + /* + * [1, ~2~, 4], [1, ~3~, 4] + * [1, ~2~, 5], [1, ~3~, 4] + * [1, ~2~], [1, ~3~, 4] + */ + if ((idx_old + 1 >= num_old) || + (wn_old[idx_old + 1].nh->nh_priv->nh_idx > nh_idx_new)) { + /* Add new unless the next old item is still <= new */ 
+ rc_add.rc_nh_new = wn_new[idx_new].nh; + rc_add.rc_nh_weight = wn_new[idx_new].weight; + cb(&rc_add, cbdata); + idx_new++; + } + /* In any case, delete current old */ + rc_del.rc_nh_old = wn_old[idx_old].nh; + rc_del.rc_nh_weight = wn_old[idx_old].weight; + cb(&rc_del, cbdata); + idx_old++; + } else { + /* + * nh_idx_old > nh_idx_new + * + * [1, ~3~, 4], [1, ~2~, 4] + * [1, ~3~, 5], [1, ~2~, 4] + * [1, ~3~, 4], [1, ~2~] + */ + if ((idx_new + 1 >= num_new) || + (wn_new[idx_new + 1].nh->nh_priv->nh_idx > nh_idx_old)) { + /* No next item or next item is > current one */ + rc_add.rc_nh_new = wn_new[idx_new].nh; + rc_add.rc_nh_weight = wn_new[idx_new].weight; + cb(&rc_add, cbdata); + idx_new++; + } + /* In any case, delete current old */ + rc_del.rc_nh_old = wn_old[idx_old].nh; + rc_del.rc_nh_weight = wn_old[idx_old].weight; + cb(&rc_del, cbdata); + idx_old++; + } + } + + while (idx_old < num_old) { + rc_del.rc_nh_old = wn_old[idx_old].nh; + rc_del.rc_nh_weight = wn_old[idx_old].weight; + cb(&rc_del, cbdata); + idx_old++; + } + + while (idx_new < num_new) { + rc_add.rc_nh_new = wn_new[idx_new].nh; + rc_add.rc_nh_weight = wn_new[idx_new].weight; + cb(&rc_add, cbdata); + idx_new++; + } +} + +/* + * Decompose multipath cmd info @rc into a list of add/del/change + * single-path operations, calling @cb callback for each operation. + * Assumes at least one of the nexthops in @rc is multipath. 
+ */ +void +rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb, + void *cbdata) +{ + struct weightened_nhop *wn; + uint32_t num_nhops; + struct rib_cmd_info rc_new; + + rc_new = *rc; + DPRINTF("cb=%p cmd=%d nh_old=%p nh_new=%p", + cb, rc->cmd, rc->nh_old, rc->nh_new); + switch (rc->rc_cmd) { + case RTM_ADD: + if (!NH_IS_NHGRP(rc->rc_nh_new)) + return; + wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_nhops); + for (uint32_t i = 0; i < num_nhops; i++) { + rc_new.rc_nh_new = wn[i].nh; + rc_new.rc_nh_weight = wn[i].weight; + cb(&rc_new, cbdata); + } + break; + case RTM_DELETE: + if (!NH_IS_NHGRP(rc->rc_nh_old)) + return; + wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_nhops); + for (uint32_t i = 0; i < num_nhops; i++) { + rc_new.rc_nh_old = wn[i].nh; + rc_new.rc_nh_weight = wn[i].weight; + cb(&rc_new, cbdata); + } + break; + case RTM_CHANGE: + if (!NH_IS_NHGRP(rc->rc_nh_old) && !NH_IS_NHGRP(rc->rc_nh_new)) + return; + decompose_change_notification(rc, cb, cbdata); + break; + } +} +#endif diff --git a/sys/net/route/route_var.h b/sys/net/route/route_var.h index 6164ec08850c..12d081d410a2 100644 --- a/sys/net/route/route_var.h +++ b/sys/net/route/route_var.h @@ -87,6 +87,7 @@ struct rib_head { /* Constants */ #define RIB_MAX_RETRIES 3 #define RT_MAXFIBS UINT16_MAX +#define RIB_MAX_MPATH_WIDTH 64 /* Macro for verifying fields in af-specific 'struct route' structures */ #define CHK_STRUCT_FIELD_GENERIC(_s1, _f1, _s2, _f2) \ @@ -113,12 +114,7 @@ _Static_assert(__offsetof(struct route, ro_dst) == __offsetof(_ro_new, _dst_new) "ro_dst and " #_dst_new " are at different offset") struct rib_head *rt_tables_get_rnh(uint32_t table, sa_family_t family); -void rt_mpath_init_rnh(struct rib_head *rnh); int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum); -#ifdef RADIX_MPATH -struct radix_node *rt_mpath_unlink(struct rib_head *rnh, - struct rt_addrinfo *info, struct rtentry *rto, int *perror); -#endif struct 
rib_cmd_info; VNET_PCPUSTAT_DECLARE(struct rtstat, rtstat); @@ -202,14 +198,6 @@ struct rtentry { /* rtentry rt flag mask */ #define RTE_RT_FLAG_MASK (RTF_UP | RTF_HOST) -/* Nexthop selection */ -#define _NH2MP(_nh) ((struct nhgrp_object *)(_nh)) -#define _SELECT_NHOP(_nh, _flowid) \ - (_NH2MP(_nh))->nhops[(_flowid) % (_NH2MP(_nh))->mp_size] -#define _RT_SELECT_NHOP(_nh, _flowid) \ - ((!NH_IS_MULTIPATH(_nh)) ? (_nh) : _SELECT_NHOP(_nh, _flowid)) -#define RT_SELECT_NHOP(_rt, _flowid) _RT_SELECT_NHOP((_rt)->rt_nhop, _flowid) - /* route_temporal.c */ void tmproutes_update(struct rib_head *rnh, struct rtentry *rt); void tmproutes_init(struct rib_head *rh); @@ -217,14 +205,24 @@ void tmproutes_destroy(struct rib_head *rh); /* route_ctl.c */ struct route_nhop_data { - struct nhop_object *rnd_nhop; - uint32_t rnd_weight; + union { + struct nhop_object *rnd_nhop; + struct nhgrp_object *rnd_nhgrp; + }; + uint32_t rnd_weight; }; + +int change_route_nhop(struct rib_head *rnh, struct rtentry *rt, + struct rt_addrinfo *info, struct route_nhop_data *rnd, + struct rib_cmd_info *rc); int change_route_conditional(struct rib_head *rnh, struct rtentry *rt, struct rt_addrinfo *info, struct route_nhop_data *nhd_orig, struct route_nhop_data *nhd_new, struct rib_cmd_info *rc); struct rtentry *lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info, struct route_nhop_data *rnd); + +bool nhop_can_multipath(const struct nhop_object *nh); +bool match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw); int check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt, const struct nhop_object *nh); int can_override_nhop(const struct rt_addrinfo *info, @@ -256,5 +254,57 @@ int nhop_create_from_nhop(struct rib_head *rnh, const struct nhop_object *nh_ori void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu); int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w); +/* MULTIPATH */ +#define MPF_MULTIPATH 0x08 /* need to be 
consistent with NHF_MULTIPATH */ + +struct nhgrp_object { + uint16_t nhg_flags; /* nexthop group flags */ + uint8_t nhg_size; /* dataplain group size */ + uint8_t spare; + struct nhop_object *nhops[0]; /* nhops */ +}; + +static inline struct nhop_object * +nhop_select(struct nhop_object *nh, uint32_t flowid) +{ + +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh)) { + struct nhgrp_object *nhg = (struct nhgrp_object *)nh; + nh = nhg->nhops[flowid % nhg->nhg_size]; + } +#endif + return (nh); +} + + +struct weightened_nhop; + +/* mpath_ctl.c */ +int add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info, + struct rtentry *rt, struct route_nhop_data *rnd_add, + struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc); +int del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info, + struct rtentry *rt, struct nhgrp_object *nhg, struct rib_cmd_info *rc); + +/* nhgrp.c */ +int nhgrp_ctl_init(struct nh_control *ctl); +void nhgrp_ctl_free(struct nh_control *ctl); +void nhgrp_ctl_unlink_all(struct nh_control *ctl); + + +/* nhgrp_ctl.c */ +int nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w); + +int nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, + int num_nhops, struct route_nhop_data *rnd); +typedef bool nhgrp_filter_cb_t(const struct nhop_object *nh, void *data); +int nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src, + nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd); +int nhgrp_get_addition_group(struct rib_head *rnh, + struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_add, + struct route_nhop_data *rnd_new); + +void nhgrp_free(struct nhgrp_object *nhg); #endif diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c index f3b0ecec2430..c2e2273d0d31 100644 --- a/sys/net/rtsock.c +++ b/sys/net/rtsock.c @@ -32,7 +32,7 @@ * $FreeBSD$ */ #include "opt_ddb.h" -#include "opt_mpath.h" +#include "opt_route.h" #include "opt_inet.h" #include "opt_inet6.h" @@ -158,8 +158,7 @@ 
MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF); #define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx) #define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED) -static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, - ""); +SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); struct walkarg { int w_tmemsize; @@ -650,6 +649,25 @@ fill_addrinfo(struct rt_msghdr *rtm, int len, u_int fibnum, struct rt_addrinfo * return (0); } +static struct nhop_object * +select_nhop(struct nhop_object *nh, const struct sockaddr *gw) +{ + if (!NH_IS_NHGRP(nh)) + return (nh); +#ifdef ROUTE_MPATH + struct weightened_nhop *wn; + uint32_t num_nhops; + wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); + if (gw == NULL) + return (wn[0].nh); + for (int i = 0; i < num_nhops; i++) { + if (match_nhop_gw(wn[i].nh, gw)) + return (wn[i].nh); + } +#endif + return (NULL); +} + /* * Handles RTM_GET message from routing socket, returning matching rt. * @@ -663,6 +681,7 @@ handle_rtm_get(struct rt_addrinfo *info, u_int fibnum, { RIB_RLOCK_TRACKER; struct rib_head *rnh; + struct nhop_object *nh; sa_family_t saf; saf = info->rti_info[RTAX_DST]->sa_family; @@ -690,21 +709,12 @@ handle_rtm_get(struct rt_addrinfo *info, u_int fibnum, RIB_RUNLOCK(rnh); return (ESRCH); } -#ifdef RADIX_MPATH - /* - * for RTM_GET, gate is optional even with multipath. - * if gate == NULL the first match is returned. 
- * (no need to call rt_mpath_matchgate if gate == NULL) - */ - if (rt_mpath_capable(rnh) && info->rti_info[RTAX_GATEWAY]) { - rc->rc_rt = rt_mpath_matchgate(rc->rc_rt, - info->rti_info[RTAX_GATEWAY]); - if (rc->rc_rt == NULL) { - RIB_RUNLOCK(rnh); - return (ESRCH); - } + + nh = select_nhop(rc->rc_rt->rt_nhop, info->rti_info[RTAX_GATEWAY]); + if (nh == NULL) { + RIB_RUNLOCK(rnh); + return (ESRCH); } -#endif /* * If performing proxied L2 entry insertion, and * the actual PPP host entry is found, perform @@ -740,8 +750,13 @@ handle_rtm_get(struct rt_addrinfo *info, u_int fibnum, RIB_RUNLOCK(rnh); return (ESRCH); } + nh = select_nhop(rc->rc_rt->rt_nhop, info->rti_info[RTAX_GATEWAY]); + if (nh == NULL) { + RIB_RUNLOCK(rnh); + return (ESRCH); + } } - rc->rc_nh_new = rc->rc_rt->rt_nhop; + rc->rc_nh_new = nh; rc->rc_nh_weight = rc->rc_rt->rt_weight; RIB_RUNLOCK(rnh); @@ -832,6 +847,24 @@ update_rtm_from_rc(struct rt_addrinfo *info, struct rt_msghdr **prtm, return (0); } +static void +save_del_notification(struct rib_cmd_info *rc, void *_cbdata) +{ + struct rib_cmd_info *rc_new = (struct rib_cmd_info *)_cbdata; + + if (rc->rc_cmd == RTM_DELETE) + *rc_new = *rc; +} + +static void +save_add_notification(struct rib_cmd_info *rc, void *_cbdata) +{ + struct rib_cmd_info *rc_new = (struct rib_cmd_info *)_cbdata; + + if (rc->rc_cmd == RTM_ADD) + *rc_new = *rc; +} + /*ARGSUSED*/ static int route_output(struct mbuf *m, struct socket *so, ...) @@ -919,6 +952,15 @@ route_output(struct mbuf *m, struct socket *so, ...) #ifdef INET6 rti_need_deembed = 1; #endif +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(rc.rc_nh_new) || + (rc.rc_nh_old && NH_IS_NHGRP(rc.rc_nh_old))) { + struct rib_cmd_info rc_simple = {}; + rib_decompose_notification(&rc, + save_add_notification, (void *)&rc_simple); + rc = rc_simple; + } +#endif nh = rc.rc_nh_new; rtm->rtm_index = nh->nh_ifp->if_index; } @@ -927,6 +969,15 @@ route_output(struct mbuf *m, struct socket *so, ...) 
case RTM_DELETE: error = rib_action(fibnum, RTM_DELETE, &info, &rc); if (error == 0) { +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(rc.rc_nh_old) || + (rc.rc_nh_new && NH_IS_NHGRP(rc.rc_nh_new))) { + struct rib_cmd_info rc_simple = {}; + rib_decompose_notification(&rc, + save_del_notification, (void *)&rc_simple); + rc = rc_simple; + } +#endif nh = rc.rc_nh_old; goto report; } @@ -1708,7 +1759,19 @@ sysctl_dumpentry(struct radix_node *rn, void *vw) if (!can_export_rte(w->w_req->td->td_ucred, rt)) return (0); nh = rt->rt_nhop; - error = sysctl_dumpnhop(rt, nh, rt->rt_weight, w); +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh)) { + struct weightened_nhop *wn; + uint32_t num_nhops; + wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); + for (int i = 0; i < num_nhops; i++) { + error = sysctl_dumpnhop(rt, wn[i].nh, wn[i].weight, w); + if (error != 0) + return (error); + } + } else +#endif + error = sysctl_dumpnhop(rt, nh, rt->rt_weight, w); return (0); } @@ -1748,6 +1811,7 @@ sysctl_dumpnhop(struct rtentry *rt, struct nhop_object *nh, uint32_t weight, rtm->rtm_flags = rt->rte_flags; rtm->rtm_flags |= nhop_get_rtflags(nh); rt_getmetrics(rt, nh, &rtm->rtm_rmx); + rtm->rtm_rmx.rmx_weight = weight; rtm->rtm_index = nh->nh_ifp->if_index; rtm->rtm_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); @@ -2028,7 +2092,7 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS) namelen--; if (req->newptr) return (EPERM); - if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP) { + if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP || name[1] == NET_RT_NHGRP) { if (namelen == 3) fib = req->td->td_proc->p_fibnum; else if (namelen == 4) @@ -2096,6 +2160,7 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS) } break; case NET_RT_NHOP: + case NET_RT_NHGRP: /* Allow dumping one specific af/fib at a time */ if (namelen < 4) { error = EINVAL; @@ -2113,6 +2178,12 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS) } if (w.w_op == NET_RT_NHOP) error = nhops_dump_sysctl(rnh, w.w_req); + else +#ifdef ROUTE_MPATH + error 
= nhgrp_dump_sysctl(rnh, w.w_req); +#else + error = ENOTSUP; +#endif break; case NET_RT_IFLIST: case NET_RT_IFLISTL: diff --git a/sys/netinet/in.c b/sys/netinet/in.c index 5553530628bf..0bc02b5f20d3 100644 --- a/sys/netinet/in.c +++ b/sys/netinet/in.c @@ -35,8 +35,6 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#include "opt_mpath.h" - #include <sys/param.h> #include <sys/eventhandler.h> #include <sys/systm.h> @@ -699,14 +697,6 @@ in_addprefix(struct in_ifaddr *target, int flags) * interface address, we are done here. */ if (ia->ia_flags & IFA_ROUTE) { -#ifdef RADIX_MPATH - if (ia->ia_addr.sin_addr.s_addr == - target->ia_addr.sin_addr.s_addr) { - IN_IFADDR_RUNLOCK(&in_ifa_tracker); - return (EEXIST); - } else - break; -#endif if (V_nosameprefix) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); diff --git a/sys/netinet/in_fib.c b/sys/netinet/in_fib.c index c46c55bd7d00..4c84de2c7281 100644 --- a/sys/netinet/in_fib.c +++ b/sys/netinet/in_fib.c @@ -32,7 +32,6 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_route.h" -#include "opt_mpath.h" #include <sys/param.h> #include <sys/systm.h> @@ -48,14 +47,11 @@ __FBSDID("$FreeBSD$"); #include <net/if_var.h> #include <net/if_dl.h> #include <net/route.h> +#include <net/route/route_ctl.h> #include <net/route/route_var.h> #include <net/route/nhop.h> #include <net/vnet.h> -#ifdef RADIX_MPATH -#include <net/radix_mpath.h> -#endif - #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/in_fib.h> @@ -80,7 +76,6 @@ fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; - struct rtentry *rt; struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib4_lookup: bad fibnum")); @@ -99,12 +94,7 @@ fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rt = RNTORT(rn); -#ifdef RADIX_MPATH - if 
(rt_mpath_next(rt) != NULL) - rt = rt_mpath_selectrte(rt, flowid); -#endif - nh = rt->rt_nhop; + nh = nhop_select((RNTORT(rn))->rt_nhop, flowid); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(nh->nh_ifp)) { if (flags & NHR_REF) @@ -120,7 +110,7 @@ fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, } inline static int -check_urpf(const struct nhop_object *nh, uint32_t flags, +check_urpf_nhop(const struct nhop_object *nh, uint32_t flags, const struct ifnet *src_if) { @@ -137,21 +127,24 @@ check_urpf(const struct nhop_object *nh, uint32_t flags, return (0); } -#ifdef RADIX_MPATH -inline static int -check_urpf_mpath(struct rtentry *rt, uint32_t flags, +static int +check_urpf(struct nhop_object *nh, uint32_t flags, const struct ifnet *src_if) { - - while (rt != NULL) { - if (check_urpf(rt->rt_nhop, flags, src_if) != 0) - return (1); - rt = rt_mpath_next(rt); - } - - return (0); -} +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh)) { + struct weightened_nhop *wn; + uint32_t num_nhops; + wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); + for (int i = 0; i < num_nhops; i++) { + if (check_urpf_nhop(wn[i].nh, flags, src_if) != 0) + return (1); + } + return (0); + } else #endif + return (check_urpf_nhop(nh, flags, src_if)); +} /* * Performs reverse path forwarding lookup. 
@@ -169,7 +162,6 @@ fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; - struct rtentry *rt; int ret; KASSERT((fibnum < rt_numfibs), ("fib4_check_urpf: bad fibnum")); @@ -186,12 +178,7 @@ fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rt = RNTORT(rn); -#ifdef RADIX_MPATH - ret = check_urpf_mpath(rt, flags, src_if); -#else - ret = check_urpf(rt->rt_nhop, flags, src_if); -#endif + ret = check_urpf(RNTORT(rn)->rt_nhop, flags, src_if); RIB_RUNLOCK(rh); return (ret); } @@ -206,7 +193,6 @@ fib4_lookup_debugnet(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, { struct rib_head *rh; struct radix_node *rn; - struct rtentry *rt; struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib4_lookup_debugnet: bad fibnum")); @@ -225,12 +211,7 @@ fib4_lookup_debugnet(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, /* unlocked lookup */ rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rt = RNTORT(rn); -#ifdef RADIX_MPATH - if (rt_mpath_next(rt) != NULL) - rt = rt_mpath_selectrte(rt, 0); -#endif - nh = rt->rt_nhop; + nh = nhop_select((RNTORT(rn))->rt_nhop, 0); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(nh->nh_ifp)) { if (flags & NHR_REF) diff --git a/sys/netinet/in_rmx.c b/sys/netinet/in_rmx.c index ef40fdc6af6c..6dfa1e56eff1 100644 --- a/sys/netinet/in_rmx.c +++ b/sys/netinet/in_rmx.c @@ -30,8 +30,6 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#include "opt_mpath.h" - #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> @@ -127,9 +125,6 @@ in_inithead(uint32_t fibnum) return (NULL); rh->rnh_preadd = rib4_preadd; -#ifdef RADIX_MPATH - rt_mpath_init_rnh(rh); -#endif return (rh); } diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index 
a26722c97f88..a7e72f4ec407 100644 --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -38,7 +38,6 @@ __FBSDID("$FreeBSD$"); #include "opt_ipsec.h" #include "opt_kern_tls.h" #include "opt_mbuf_stress_test.h" -#include "opt_mpath.h" #include "opt_ratelimit.h" #include "opt_route.h" #include "opt_rss.h" @@ -470,11 +469,7 @@ again: * for correct operation (as it is for ARP). */ uint32_t flowid; -#ifdef RADIX_MPATH - flowid = ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr); -#else flowid = m->m_pkthdr.flowid; -#endif ro->ro_nh = fib4_lookup(fibnum, dst->sin_addr, 0, NHR_REF, flowid); diff --git a/sys/netinet6/in6_fib.c b/sys/netinet6/in6_fib.c index a0e4dacc86e1..9fd869b2730b 100644 --- a/sys/netinet6/in6_fib.c +++ b/sys/netinet6/in6_fib.c @@ -33,7 +33,6 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" -#include "opt_mpath.h" #include <sys/param.h> #include <sys/systm.h> @@ -49,14 +48,11 @@ __FBSDID("$FreeBSD$"); #include <net/if_var.h> #include <net/if_dl.h> #include <net/route.h> +#include <net/route/route_ctl.h> #include <net/route/route_var.h> #include <net/route/nhop.h> #include <net/vnet.h> -#ifdef RADIX_MPATH -#include <net/radix_mpath.h> -#endif - #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/ip_mroute.h> @@ -88,7 +84,6 @@ fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6, RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; - struct rtentry *rt; struct nhop_object *nh; struct sockaddr_in6 sin6; @@ -111,12 +106,7 @@ fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6, RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rt = RNTORT(rn); -#ifdef RADIX_MPATH - if (rt_mpath_next(rt) != NULL) - rt = rt_mpath_selectrte(rt, flowid); -#endif - nh = rt->rt_nhop; + nh = nhop_select((RNTORT(rn))->rt_nhop, flowid); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(nh->nh_ifp)) { if (flags & NHR_REF) @@ -132,7 
+122,7 @@ fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6, } inline static int -check_urpf(const struct nhop_object *nh, uint32_t flags, +check_urpf_nhop(const struct nhop_object *nh, uint32_t flags, const struct ifnet *src_if) { @@ -149,21 +139,24 @@ check_urpf(const struct nhop_object *nh, uint32_t flags, return (0); } -#ifdef RADIX_MPATH -inline static int -check_urpf_mpath(struct rtentry *rt, uint32_t flags, +static int +check_urpf(struct nhop_object *nh, uint32_t flags, const struct ifnet *src_if) { - - while (rt != NULL) { - if (check_urpf(rt->rt_nhop, flags, src_if) != 0) - return (1); - rt = rt_mpath_next(rt); - } - - return (0); -} +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh)) { + struct weightened_nhop *wn; + uint32_t num_nhops; + wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); + for (int i = 0; i < num_nhops; i++) { + if (check_urpf_nhop(wn[i].nh, flags, src_if) != 0) + return (1); + } + return (0); + } else #endif + return (check_urpf_nhop(nh, flags, src_if)); +} /* * Performs reverse path forwarding lookup. 
@@ -181,7 +174,6 @@ fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6, RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; - struct rtentry *rt; struct sockaddr_in6 sin6; int ret; @@ -203,12 +195,7 @@ fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6, RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rt = RNTORT(rn); -#ifdef RADIX_MPATH - ret = check_urpf_mpath(rt, flags, src_if); -#else - ret = check_urpf(rt->rt_nhop, flags, src_if); -#endif + ret = check_urpf(RNTORT(rn)->rt_nhop, flags, src_if); RIB_RUNLOCK(rh); return (ret); } @@ -223,7 +210,6 @@ fib6_lookup_debugnet(uint32_t fibnum, const struct in6_addr *dst6, { struct rib_head *rh; struct radix_node *rn; - struct rtentry *rt; struct nhop_object *nh; struct sockaddr_in6 sin6; @@ -245,8 +231,7 @@ fib6_lookup_debugnet(uint32_t fibnum, const struct in6_addr *dst6, rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rt = RNTORT(rn); - nh = rt->rt_nhop; + nh = nhop_select((RNTORT(rn))->rt_nhop, 0); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(nh->nh_ifp)) { if (flags & NHR_REF) diff --git a/sys/netinet6/in6_rmx.c b/sys/netinet6/in6_rmx.c index 5f2e2fe3ae6e..54136f9983b2 100644 --- a/sys/netinet6/in6_rmx.c +++ b/sys/netinet6/in6_rmx.c @@ -64,8 +64,6 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#include "opt_mpath.h" - #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> @@ -153,9 +151,6 @@ in6_inithead(uint32_t fibnum) return (NULL); rh->rnh_preadd = rib6_preadd; -#ifdef RADIX_MPATH - rt_mpath_init_rnh(rh); -#endif rs = rib_subscribe_internal(rh, nd6_subscription_cb, NULL, RIB_NOTIFY_IMMEDIATE, true); diff --git a/sys/netinet6/nd6.c b/sys/netinet6/nd6.c index ad31d750eb70..1597a4cb6b93 100644 --- a/sys/netinet6/nd6.c +++ b/sys/netinet6/nd6.c @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" +#include 
"opt_route.h" #include <sys/param.h> #include <sys/systm.h> @@ -1591,7 +1592,11 @@ void nd6_subscription_cb(struct rib_head *rnh, struct rib_cmd_info *rc, void *arg) { +#ifdef ROUTE_MPATH + rib_decompose_notification(rc, check_release_defrouter, NULL); +#else check_release_defrouter(rc, NULL); +#endif } int diff --git a/sys/sys/socket.h b/sys/sys/socket.h index 038c4d3ef8b9..311d65671051 100644 --- a/sys/sys/socket.h +++ b/sys/sys/socket.h @@ -417,6 +417,7 @@ struct sockproto { #define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en * versions of msghdr structs. */ #define NET_RT_NHOP 6 /* dump routing nexthops */ +#define NET_RT_NHGRP 7 /* dump routing nexthop groups */ #endif /* __BSD_VISIBLE */ /* |