aboutsummaryrefslogtreecommitdiff
path: root/sys
diff options
context:
space:
mode:
Diffstat (limited to 'sys')
-rw-r--r--sys/conf/NOTES4
-rw-r--r--sys/conf/files4
-rw-r--r--sys/conf/options1
-rw-r--r--sys/net/radix.c4
-rw-r--r--sys/net/route.c1
-rw-r--r--sys/net/route.h5
-rw-r--r--sys/net/route/mpath_ctl.c165
-rw-r--r--sys/net/route/nhgrp.c344
-rw-r--r--sys/net/route/nhgrp_ctl.c788
-rw-r--r--sys/net/route/nhgrp_var.h72
-rw-r--r--sys/net/route/nhop.c8
-rw-r--r--sys/net/route/nhop.h37
-rw-r--r--sys/net/route/nhop_ctl.c7
-rw-r--r--sys/net/route/nhop_var.h11
-rw-r--r--sys/net/route/route_ctl.c275
-rw-r--r--sys/net/route/route_ctl.h18
-rw-r--r--sys/net/route/route_helpers.c164
-rw-r--r--sys/net/route/route_var.h80
-rw-r--r--sys/net/rtsock.c111
-rw-r--r--sys/netinet/in.c10
-rw-r--r--sys/netinet/in_fib.c59
-rw-r--r--sys/netinet/in_rmx.c5
-rw-r--r--sys/netinet/ip_output.c5
-rw-r--r--sys/netinet6/in6_fib.c55
-rw-r--r--sys/netinet6/in6_rmx.c5
-rw-r--r--sys/netinet6/nd6.c5
-rw-r--r--sys/sys/socket.h1
27 files changed, 2020 insertions, 224 deletions
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index 7aa957efa271..0d9fac844365 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -1002,7 +1002,7 @@ device lagg
#
# TCP_HHOOK enables the hhook(9) framework hooks for the TCP stack.
#
-# RADIX_MPATH provides support for equal-cost multi-path routing.
+# ROUTE_MPATH provides support for multipath routing.
#
options MROUTING # Multicast routing
options IPFIREWALL #firewall
@@ -1023,7 +1023,7 @@ options TCPDEBUG
options TCPPCAP
options TCP_BLACKBOX
options TCP_HHOOK
-options RADIX_MPATH
+options ROUTE_MPATH
# The MBUF_STRESS_TEST option enables options which create
# various random failures / extreme cases related to mbuf
diff --git a/sys/conf/files b/sys/conf/files
index e3c142441653..8ec5eacd053e 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -4143,10 +4143,12 @@ net/debugnet.c optional inet debugnet
net/debugnet_inet.c optional inet debugnet
net/pfil.c optional ether | inet
net/radix.c standard
-net/radix_mpath.c standard
net/raw_cb.c standard
net/raw_usrreq.c standard
net/route.c standard
+net/route/mpath_ctl.c optional route_mpath
+net/route/nhgrp.c optional route_mpath
+net/route/nhgrp_ctl.c optional route_mpath
net/route/nhop.c standard
net/route/nhop_ctl.c standard
net/route/nhop_utils.c standard
diff --git a/sys/conf/options b/sys/conf/options
index e22197093f58..e68621d61a37 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -454,6 +454,7 @@ NFSLOCKD
PCBGROUP opt_pcbgroup.h
PF_DEFAULT_TO_DROP opt_pf.h
RADIX_MPATH opt_mpath.h
+ROUTE_MPATH opt_route.h
ROUTETABLES opt_route.h
RSS opt_rss.h
SLIP_IFF_OPTS opt_slip.h
diff --git a/sys/net/radix.c b/sys/net/radix.c
index 3d9ed0a69538..f65153393d74 100644
--- a/sys/net/radix.c
+++ b/sys/net/radix.c
@@ -44,10 +44,6 @@
#include <sys/malloc.h>
#include <sys/syslog.h>
#include <net/radix.h>
-#include "opt_mpath.h"
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
#else /* !_KERNEL */
#include <stdio.h>
#include <strings.h>
diff --git a/sys/net/route.c b/sys/net/route.c
index d19a4cfc0afe..dac3211bc1f5 100644
--- a/sys/net/route.c
+++ b/sys/net/route.c
@@ -39,7 +39,6 @@
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_mrouting.h"
-#include "opt_mpath.h"
#include "opt_route.h"
#include <sys/param.h>
diff --git a/sys/net/route.h b/sys/net/route.h
index 19c9ce0eb51b..021b4621692b 100644
--- a/sys/net/route.h
+++ b/sys/net/route.h
@@ -178,6 +178,7 @@ VNET_DECLARE(u_int, rt_add_addr_allfibs); /* Announce interfaces to all fibs */
*/
/* Consumer-visible nexthop info flags */
+#define	NHF_MULTIPATH		0x0008	/* Nexthop is a nexthop group */
#define NHF_REJECT 0x0010 /* RTF_REJECT */
#define NHF_BLACKHOLE 0x0020 /* RTF_BLACKHOLE */
#define NHF_REDIRECT 0x0040 /* RTF_DYNAMIC|RTF_MODIFIED */
@@ -208,6 +209,10 @@ struct rtstat {
uint64_t rts_wildcard; /* lookups satisfied by a wildcard */
uint64_t rts_nh_idx_alloc_failure; /* nexthop index alloc failure*/
uint64_t rts_nh_alloc_failure; /* nexthop allocation failure*/
+ uint64_t rts_add_failure; /* # of route addition failures */
+ uint64_t rts_add_retry; /* # of route addition retries */
+ uint64_t rts_del_failure; /* # of route deletion failure */
+ uint64_t rts_del_retry; /* # of route deletion retries */
};
/*
diff --git a/sys/net/route/mpath_ctl.c b/sys/net/route/mpath_ctl.c
new file mode 100644
index 000000000000..1ac7c191ed05
--- /dev/null
+++ b/sys/net/route/mpath_ctl.c
@@ -0,0 +1,165 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+
+/*
+ * This file contains the supporting functions for adding/deleting/updating
+ * multipath routes to the routing table.
+ */
+
+SYSCTL_DECL(_net_route);
+
+/*
+ * Tries to add @rnd_add nhop to the existing set of nhops (@nh_orig) for the
+ * prefix specified by @rt.
+ *
+ * Returns 0 and consumes rt / rnd_add nhop references. @rc gets populated
+ * with the operation result.
+ * Otherwise errno is returned.
+ *
+ * caller responsibility is to unlock/free rt and
+ * rt->rt_nhop.
+ */
+int
+add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct rtentry *rt, struct route_nhop_data *rnd_add,
+ struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+{
+ RIB_RLOCK_TRACKER;
+ struct route_nhop_data rnd_new;
+ int error = 0;
+
+ /*
+ * It is possible that multiple rtsock speakers will try to update
+ * the same route simultaneously. Reduce the chance of failing the
+ * request by retrying the cycle multiple times.
+ */
+ for (int i = 0; i < RIB_MAX_RETRIES; i++) {
+ error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add,
+ &rnd_new);
+ if (error != 0) {
+ if (error != EAGAIN)
+ break;
+
+ /*
+ * Group creation failed, most probably because
+ * @rnd_orig data got scheduled for deletion.
+ * Refresh @rnd_orig data and retry.
+ */
+ RIB_RLOCK(rnh);
+ lookup_prefix(rnh, info, rnd_orig);
+ RIB_RUNLOCK(rnh);
+ continue;
+ }
+
+ error = change_route_conditional(rnh, rt, info, rnd_orig,
+ &rnd_new, rc);
+ if (error != EAGAIN)
+ break;
+ RTSTAT_INC(rts_add_retry);
+ }
+
+ return (error);
+}
+
+struct rt_match_info {
+ struct rt_addrinfo *info;
+ struct rtentry *rt;
+};
+
+static bool
+gw_filter_func(const struct nhop_object *nh, void *_data)
+{
+ struct rt_match_info *ri = (struct rt_match_info *)_data;
+
+ return (check_info_match_nhop(ri->info, ri->rt, nh) == 0);
+}
+
+/*
+ * Tries to delete matching paths from @nhg.
+ * Returns 0 on success and updates operation result in @rc.
+ */
+int
+del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info,
+ struct rtentry *rt, struct nhgrp_object *nhg,
+ struct rib_cmd_info *rc)
+{
+ struct route_nhop_data rnd;
+ struct rt_match_info ri = { .info = info, .rt = rt };
+ int error;
+
+ RIB_WLOCK_ASSERT(rh);
+
+ /*
+ * Require gateway to delete multipath routes, to forbid
+ * deleting all paths at once.
+ * If the filter function is provided, skip gateway check to
+ * allow rib_walk_del() delete routes for any criteria based
+ * on provided callback.
+ */
+ if ((info->rti_info[RTAX_GATEWAY] == NULL) && (info->rti_filter == NULL))
+ return (ESRCH);
+
+ error = nhgrp_get_filtered_group(rh, nhg, gw_filter_func, (void *)&ri,
+ &rnd);
+ if (error == 0)
+ error = change_route_nhop(rh, rt, info, &rnd, rc);
+ return (error);
+}
+
diff --git a/sys/net/route/nhgrp.c b/sys/net/route/nhgrp.c
new file mode 100644
index 000000000000..c25f4f09865b
--- /dev/null
+++ b/sys/net/route/nhgrp.c
@@ -0,0 +1,344 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/refcount.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/nhgrp_var.h>
+
+/*
+ * This file contains data structures management logic for the nexthop
+ * groups ("nhgrp") route subsystem.
+ *
+ * Nexthop groups are used to store multiple routes available for the specific
+ * prefix. Nexthop groups are immutable and can be shared across multiple
+ * prefixes.
+ *
+ * Each group consists of a control plane part and a dataplane part.
+ * Control plane is basically a collection of nexthop objects with
+ * weights and refcount.
+ *
+ * Datapath consists of a array of nexthop pointers, compiled from control
+ * plane data to support O(1) nexthop selection.
+ *
+ * For example, consider the following group:
+ * [(nh1, weight=100), (nh2, weight=200)]
+ * It will compile to the following array:
+ * [nh1, nh2, nh2]
+ *
+ */
+
+static void consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets,
+ uint32_t new_idx_items);
+
+static int cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b);
+static unsigned int hash_nhgrp(const struct nhgrp_priv *obj);
+
+static unsigned
+djb_hash(const unsigned char *h, const int len)
+{
+ unsigned int result = 0;
+ int i;
+
+ for (i = 0; i < len; i++)
+ result = 33 * result ^ h[i];
+
+ return (result);
+}
+
+static int
+cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b)
+{
+
+ /*
+ * In case of consistent hashing, there can be multiple nexthop groups
+ * with the same "control plane" list of nexthops with weights and a
+ * different set of "data plane" nexthops.
+ * For now, ignore the data plane and focus on the control plane list.
+ */
+ if (a->nhg_nh_count != b->nhg_nh_count)
+ return (0);
+ return !memcmp(a->nhg_nh_weights, b->nhg_nh_weights,
+ sizeof(struct weightened_nhop) * a->nhg_nh_count);
+}
+
+/*
+ * Hash callback: calculate hash of an object
+ */
+static unsigned int
+hash_nhgrp(const struct nhgrp_priv *obj)
+{
+ const unsigned char *key;
+
+ key = (const unsigned char *)obj->nhg_nh_weights;
+
+ return (djb_hash(key, sizeof(struct weightened_nhop) * obj->nhg_nh_count));
+}
+
+/*
+ * Returns object referenced and unlocked
+ */
+struct nhgrp_priv *
+find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key)
+{
+ struct nhgrp_priv *priv_ret;
+
+ NHOPS_RLOCK(ctl);
+ CHT_SLIST_FIND_BYOBJ(&ctl->gr_head, mpath, key, priv_ret);
+ if (priv_ret != NULL) {
+ if (refcount_acquire_if_not_zero(&priv_ret->nhg_refcount) == 0) {
+ /* refcount is 0 -> group is being deleted */
+ priv_ret = NULL;
+ }
+ }
+ NHOPS_RUNLOCK(ctl);
+
+ return (priv_ret);
+}
+
+int
+link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv)
+{
+ uint16_t idx;
+ uint32_t new_num_buckets, new_num_items;
+
+ NHOPS_WLOCK(ctl);
+ /* Check if we need to resize hash and index */
+ new_num_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->gr_head);
+ new_num_items = bitmask_get_resize_items(&ctl->gr_idx_head);
+
+ if (bitmask_alloc_idx(&ctl->gr_idx_head, &idx) != 0) {
+ NHOPS_WUNLOCK(ctl);
+ DPRINTF("Unable to allocate mpath index");
+ consider_resize(ctl, new_num_buckets, new_num_items);
+ return (0);
+ }
+
+ grp_priv->nhg_idx = idx;
+ grp_priv->nh_control = ctl;
+ CHT_SLIST_INSERT_HEAD(&ctl->gr_head, mpath, grp_priv);
+
+ NHOPS_WUNLOCK(ctl);
+
+ consider_resize(ctl, new_num_buckets, new_num_items);
+
+ return (1);
+}
+
+struct nhgrp_priv *
+unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key)
+{
+ struct nhgrp_priv *nhg_priv_ret;
+ int ret, idx;
+
+ NHOPS_WLOCK(ctl);
+
+ CHT_SLIST_REMOVE_BYOBJ(&ctl->gr_head, mpath, key, nhg_priv_ret);
+
+ if (nhg_priv_ret == NULL) {
+ DPRINTF("Unable to find nhop group!");
+ NHOPS_WUNLOCK(ctl);
+ return (NULL);
+ }
+
+ idx = nhg_priv_ret->nhg_idx;
+ ret = bitmask_free_idx(&ctl->gr_idx_head, idx);
+ nhg_priv_ret->nhg_idx = 0;
+ nhg_priv_ret->nh_control = NULL;
+
+ NHOPS_WUNLOCK(ctl);
+
+ return (nhg_priv_ret);
+}
+
+/*
+ * Checks if hash needs resizing and performs this resize if necessary
+ *
+ */
+__noinline static void
+consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items)
+{
+	void *nh_ptr, *nh_idx_ptr;
+	void *old_idx_ptr;
+	size_t alloc_size;
+
+	nh_ptr = NULL;
+	if (new_nh_buckets != 0) {
+		alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets);
+		nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
+	}
+
+	nh_idx_ptr = NULL;
+	if (new_idx_items != 0) {
+		alloc_size = bitmask_get_size(new_idx_items);
+		nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
+	}
+
+	if (nh_ptr == NULL && nh_idx_ptr == NULL) {
+		/* Either resize is not required or allocations have failed. */
+		return;
+	}
+
+	DPRINTF("mp: going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]",
+	    nh_ptr, new_nh_buckets, nh_idx_ptr, new_idx_items);
+
+	old_idx_ptr = NULL;
+
+	NHOPS_WLOCK(ctl);
+	if (nh_ptr != NULL) {
+		CHT_SLIST_RESIZE(&ctl->gr_head, mpath, nh_ptr, new_nh_buckets);
+	}
+	if (nh_idx_ptr != NULL) {
+		/* Both copy and swap must target the group index (gr_idx_head),
+		 * not the nexthop index (nh_idx_head). */
+		if (bitmask_copy(&ctl->gr_idx_head, nh_idx_ptr, new_idx_items))
+			bitmask_swap(&ctl->gr_idx_head, nh_idx_ptr, new_idx_items, &old_idx_ptr);
+	}
+	NHOPS_WUNLOCK(ctl);
+
+	if (nh_ptr != NULL)
+		free(nh_ptr, M_NHOP);
+	if (old_idx_ptr != NULL)
+		free(old_idx_ptr, M_NHOP);
+}
+
+/*
+ * Function allocating the necessary group data structures.
+ */
+bool
+nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags)
+{
+ size_t alloc_size;
+ uint32_t num_buckets, num_items;
+ void *cht_ptr, *mask_ptr;
+
+ malloc_flags = (malloc_flags & (M_NOWAIT | M_WAITOK)) | M_ZERO;
+
+ num_buckets = 8;
+ alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
+ cht_ptr = malloc(alloc_size, M_NHOP, malloc_flags);
+
+ if (cht_ptr == NULL) {
+ DPRINTF("mpath init failed");
+ return (false);
+ }
+
+ /*
+ * Allocate nexthop index bitmask.
+ */
+ num_items = 128;
+ mask_ptr = malloc(bitmask_get_size(num_items), M_NHOP, malloc_flags);
+ if (mask_ptr == NULL) {
+ DPRINTF("mpath bitmask init failed");
+ free(cht_ptr, M_NHOP);
+ return (false);
+ }
+
+ NHOPS_WLOCK(ctl);
+
+ if (ctl->gr_head.hash_size == 0) {
+ /* Init hash and bitmask */
+ CHT_SLIST_INIT(&ctl->gr_head, cht_ptr, num_buckets);
+ bitmask_init(&ctl->gr_idx_head, mask_ptr, num_items);
+ NHOPS_WUNLOCK(ctl);
+ } else {
+		/* Other thread has already initialized hash/bitmask */
+ NHOPS_WUNLOCK(ctl);
+ free(cht_ptr, M_NHOP);
+ free(mask_ptr, M_NHOP);
+ }
+
+ DPRINTF("mpath init done for fib/af %d/%d", ctl->rh->rib_fibnum,
+ ctl->rh->rib_family);
+
+ return (true);
+}
+
+int
+nhgrp_ctl_init(struct nh_control *ctl)
+{
+
+ /*
+ * By default, do not allocate datastructures as multipath
+ * routes will not be necessarily used.
+ */
+ CHT_SLIST_INIT(&ctl->gr_head, NULL, 0);
+ bitmask_init(&ctl->gr_idx_head, NULL, 0);
+ return (0);
+}
+
+void
+nhgrp_ctl_free(struct nh_control *ctl)
+{
+
+ if (ctl->gr_head.ptr != NULL)
+ free(ctl->gr_head.ptr, M_NHOP);
+ if (ctl->gr_idx_head.idx != NULL)
+ free(ctl->gr_idx_head.idx, M_NHOP);
+}
+
+void
+nhgrp_ctl_unlink_all(struct nh_control *ctl)
+{
+ struct nhgrp_priv *nhg_priv;
+
+ NHOPS_WLOCK_ASSERT(ctl);
+
+ CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
+ DPRINTF("Marking nhgrp %u unlinked", nhg_priv->nhg_idx);
+ refcount_release(&nhg_priv->nhg_linked);
+ } CHT_SLIST_FOREACH_END;
+}
+
diff --git a/sys/net/route/nhgrp_ctl.c b/sys/net/route/nhgrp_ctl.c
new file mode 100644
index 000000000000..a3a824992e08
--- /dev/null
+++ b/sys/net/route/nhgrp_ctl.c
@@ -0,0 +1,788 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+#define RTDEBUG
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/refcount.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+#include <sys/epoch.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/nhgrp_var.h>
+
+/*
+ * This file contains the supporting functions for creating multipath groups
+ * and compiling their dataplane parts.
+ */
+
+/* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */
+_Static_assert(MPF_MULTIPATH == NHF_MULTIPATH,
+ "MPF_MULTIPATH must be the same as NHF_MULTIPATH");
+/* Offset and size of flags field has to be the same for nhop/nhop groups */
+CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, nhg_flags);
+/* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */
+CTASSERT(RIB_MAX_MPATH_WIDTH <= 64);
+
+static int wn_cmp(const void *a, const void *b);
+static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops);
+
+static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl,
+ struct weightened_nhop *wn, int num_nhops, int *perror);
+static void destroy_nhgrp(struct nhgrp_priv *nhg_priv);
+static void destroy_nhgrp_epoch(epoch_context_t ctx);
+static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv);
+
+static int
+wn_cmp(const void *a, const void *b)
+{
+ const struct weightened_nhop *wa = a;
+ const struct weightened_nhop *wb = b;
+
+ if (wa->weight > wb->weight)
+ return (1);
+ else if (wa->weight < wb->weight)
+ return (-1);
+
+ /* Compare nexthops by pointer */
+ if (wa->nh > wb->nh)
+ return (1);
+ else if (wa->nh < wb->nh)
+ return (-1);
+ else
+ return (0);
+}
+
+/*
+ * Perform in-place sorting for array of nexthops in @wn.
+ *
+ * To avoid nh groups duplication, nexthops/weights in the
+ * @wn need to be ordered deterministically.
+ * As this sorting is needed only for the control plane functionality,
+ * there are no specific external requirements.
+ *
+ * Sort by weight first, to ease calculation of the slot sizes.
+ */
+static void
+sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops)
+{
+
+ qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp);
+}
+
+/*
+ * Calculate minimum number of slots required to fit the existing
+ * set of weights in the common use case where weights are "easily"
+ * comparable.
+ * Assumes @wn is sorted by weight ascending and each weight is > 0.
+ * Returns number of slots or 0 if precise calculation failed.
+ *
+ * Some examples:
+ * note: (i, X) pair means (nhop=i, weight=X):
+ * (1, 1) (2, 2) -> 3 slots [1, 2, 2]
+ * (1, 100), (2, 200) -> 3 slots [1, 2, 2]
+ * (1, 100), (2, 200), (3, 400) -> 7 slots [1, 2, 2, 3, 3, 3, 3]
+ */
+static uint32_t
+calc_min_mpath_slots_fast(const struct weightened_nhop *wn, size_t num_items)
+{
+ uint32_t i, last, xmin;
+ uint64_t total = 0;
+
+ last = 0;
+ xmin = wn[0].weight;
+ for (i = 0; i < num_items; i++) {
+ total += wn[i].weight;
+ if ((wn[i].weight - last < xmin) && (wn[i].weight != last))
+ xmin = wn[i].weight - last;
+ last = wn[i].weight;
+ }
+ /* xmin is the minimum unit of desired capacity */
+ if ((total % xmin) != 0)
+ return (0);
+ for (i = 0; i < num_items; i++) {
+ if ((wn[i].weight % xmin) != 0)
+ return (0);
+ }
+
+ return ((uint32_t)(total / xmin));
+}
+
+/*
+ * Calculate minimum number of slots required to fit the existing
+ * set of weights while maintaining weight coefficients.
+ *
+ * Assume @wn is sorted by weight ascending and each weight is > 0.
+ *
+ * Tries to find simple precise solution first and falls back to
+ * RIB_MAX_MPATH_WIDTH in case of any failure.
+ */
+static uint32_t
+calc_min_mpath_slots(const struct weightened_nhop *wn, size_t num_items)
+{
+ uint32_t v;
+
+ v = calc_min_mpath_slots_fast(wn, num_items);
+ if ((v == 0) || (v > RIB_MAX_MPATH_WIDTH))
+ v = RIB_MAX_MPATH_WIDTH;
+
+ return (v);
+}
+
+/*
+ * Nexthop group data consists of
+ * 1) dataplane part, with nhgrp_object as a header followed by an
+ * arbitrary number of nexthop pointers.
+ * 2) control plane part, with nhgrp_priv as a header, followed by
+ * an arbitrary number of 'struct weightened_nhop' objects.
+ *
+ * Given nexthop groups are (mostly) immutable, allocate all data
+ * in one go.
+ *
+ */
+__noinline static size_t
+get_nhgrp_alloc_size(uint32_t nhg_size, uint32_t num_nhops)
+{
+ size_t sz;
+
+ sz = sizeof(struct nhgrp_object);
+ sz += nhg_size * sizeof(struct nhop_object *);
+ sz += sizeof(struct nhgrp_priv);
+ sz += num_nhops * sizeof(struct weightened_nhop);
+ return (sz);
+}
+
+/*
+ * Compile actual list of nexthops to be used by datapath from
+ * the nexthop group @dst.
+ *
+ * For example, compiling control plane list of 2 nexthops
+ * [(200, A), (100, B)] would result in the datapath array
+ * [A, A, B]
+ */
+static void
+compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x,
+ uint32_t num_slots)
+{
+ struct nhgrp_object *dst;
+ int i, slot_idx, remaining_slots;
+ uint64_t remaining_sum, nh_weight, nh_slots;
+
+ slot_idx = 0;
+ dst = dst_priv->nhg;
+ /* Calculate sum of all weights */
+ remaining_sum = 0;
+ for (i = 0; i < dst_priv->nhg_nh_count; i++)
+ remaining_sum += x[i].weight;
+ remaining_slots = num_slots;
+ DPRINTF("O: %u/%u", (uint32_t)remaining_sum, remaining_slots);
+ for (i = 0; i < dst_priv->nhg_nh_count; i++) {
+ /* Calculate number of slots for the current nexthop */
+ if (remaining_sum > 0) {
+ nh_weight = (uint64_t)x[i].weight;
+ nh_slots = (nh_weight * remaining_slots / remaining_sum);
+ } else
+ nh_slots = 0;
+
+ remaining_sum -= x[i].weight;
+ remaining_slots -= nh_slots;
+
+ DPRINTF(" OO[%d]: %u/%u curr=%d slot_idx=%d", i,
+ (uint32_t)remaining_sum, remaining_slots,
+ (int)nh_slots, slot_idx);
+
+ KASSERT((slot_idx + nh_slots <= num_slots),
+ ("index overflow during nhg compilation"));
+ while (nh_slots-- > 0)
+ dst->nhops[slot_idx++] = x[i].nh;
+ }
+}
+
+/*
+ * Allocates new nexthop group for the list of weightened nexthops.
+ * Assume sorted list.
+ * Does NOT reference any nexthops in the group.
+ * Returns group with refcount=1 or NULL.
+ */
+static struct nhgrp_priv *
+alloc_nhgrp(struct weightened_nhop *wn, int num_nhops)
+{
+ uint32_t nhgrp_size;
+ int flags = M_NOWAIT;
+ struct nhgrp_object *nhg;
+ struct nhgrp_priv *nhg_priv;
+
+ nhgrp_size = calc_min_mpath_slots(wn, num_nhops);
+ if (nhgrp_size == 0) {
+ /* Zero weights, abort */
+ return (NULL);
+ }
+
+ size_t sz = get_nhgrp_alloc_size(nhgrp_size, num_nhops);
+ nhg = malloc(sz, M_NHOP, flags | M_ZERO);
+ if (nhg == NULL) {
+ return (NULL);
+ }
+
+ /* Has to be the first to make NHGRP_PRIV() work */
+ nhg->nhg_size = nhgrp_size;
+ DPRINTF("new mpath group: num_nhops: %u", (uint32_t)nhgrp_size);
+ nhg->nhg_flags = MPF_MULTIPATH;
+
+ nhg_priv = NHGRP_PRIV(nhg);
+ nhg_priv->nhg_nh_count = num_nhops;
+ refcount_init(&nhg_priv->nhg_refcount, 1);
+
+ /* Please see nhgrp_free() comments on the initial value */
+ refcount_init(&nhg_priv->nhg_linked, 2);
+
+ nhg_priv->nhg = nhg;
+ memcpy(&nhg_priv->nhg_nh_weights[0], wn,
+ num_nhops * sizeof(struct weightened_nhop));
+
+ compile_nhgrp(nhg_priv, wn, nhg->nhg_size);
+
+ return (nhg_priv);
+}
+
+void
+nhgrp_free(struct nhgrp_object *nhg)
+{
+ struct nhgrp_priv *nhg_priv;
+ struct nh_control *ctl;
+ struct epoch_tracker et;
+
+ nhg_priv = NHGRP_PRIV(nhg);
+
+ if (!refcount_release(&nhg_priv->nhg_refcount))
+ return;
+
+ /*
+ * group objects don't have an explicit lock attached to it.
+ * As groups are reclaimed based on reference count, it is possible
+ * that some groups will persist after vnet destruction callback
+ * called. Given that, handle scenario with nhgrp_free_group() being
+ * called either after or simultaneously with nhgrp_ctl_unlink_all()
+ * by using another reference counter: nhg_linked.
+ *
+ * There are only 2 places, where nhg_linked can be decreased:
+ * rib destroy (nhgrp_ctl_unlink_all) and this function.
+ * nhg_linked can never be increased.
+ *
+ * Hence, use initial value of 2 to make use of
+ * refcount_release_if_not_last().
+ *
+ * There can be two scenarios when calling this function:
+ *
+ * 1) nhg_linked value is 2. This means that either
+ * nhgrp_ctl_unlink_all() has not been called OR it is running,
+ * but we are guaranteed that nh_control won't be freed in
+ * this epoch. Hence, nexthop can be safely unlinked.
+ *
+ * 2) nhg_linked value is 1. In that case, nhgrp_ctl_unlink_all()
+ * has been called and nhgrp unlink can be skipped.
+ */
+
+ NET_EPOCH_ENTER(et);
+ if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) {
+ ctl = nhg_priv->nh_control;
+ if (unlink_nhgrp(ctl, nhg_priv) == NULL) {
+ /* Do not try to reclaim */
+ DPRINTF("Failed to unlink nexhop group %p", nhg_priv);
+ NET_EPOCH_EXIT(et);
+ return;
+ }
+ }
+ NET_EPOCH_EXIT(et);
+
+ epoch_call(net_epoch_preempt, destroy_nhgrp_epoch,
+ &nhg_priv->nhg_epoch_ctx);
+}
+
+/*
+ * Destroys all local resources belonging to @nhg_priv.
+ */
+__noinline static void
+destroy_nhgrp_int(struct nhgrp_priv *nhg_priv)
+{
+
+ free(nhg_priv->nhg, M_NHOP);
+}
+
+__noinline static void
+destroy_nhgrp(struct nhgrp_priv *nhg_priv)
+{
+
+ KASSERT((nhg_priv->nhg_refcount == 0), ("nhg_refcount != 0"));
+
+ DPRINTF("DEL MPATH %p", nhg_priv);
+
+ KASSERT((nhg_priv->nhg_idx == 0), ("gr_idx != 0"));
+
+ free_nhgrp_nhops(nhg_priv);
+
+ destroy_nhgrp_int(nhg_priv);
+}
+
+/*
+ * Epoch callback indicating group is safe to destroy
+ */
+static void
+destroy_nhgrp_epoch(epoch_context_t ctx)
+{
+ struct nhgrp_priv *nhg_priv;
+
+ nhg_priv = __containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx);
+
+ destroy_nhgrp(nhg_priv);
+}
+
+static bool
+ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
+{
+
+ for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
+ if (nhop_try_ref_object(nhg_priv->nhg_nh_weights[i].nh) != 0)
+ continue;
+
+ /*
+ * Failed to ref the nexthop, b/c it's deleted.
+ * Need to rollback references back.
+ */
+ for (int j = 0; j < i; j++)
+ nhop_free(nhg_priv->nhg_nh_weights[j].nh);
+ return (false);
+ }
+
+ return (true);
+}
+
+static void
+free_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
+{
+
+ for (int i = 0; i < nhg_priv->nhg_nh_count; i++)
+ nhop_free(nhg_priv->nhg_nh_weights[i].nh);
+}
+
+/*
+ * Creates or looks up an existing nexthop group based on @wn and @num_nhops.
+ *
+ * Returns referenced nhop group or NULL, passing error code in @perror.
+ */
+struct nhgrp_priv *
+get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops,
+ int *perror)
+{
+ struct nhgrp_priv *key, *nhg_priv;
+
+ if (num_nhops > RIB_MAX_MPATH_WIDTH) {
+ *perror = E2BIG;
+ return (NULL);
+ }
+
+ if (ctl->gr_head.hash_size == 0) {
+ /* First multipath request. Bootstrap mpath datastructures. */
+ if (nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) {
+ *perror = ENOMEM;
+ return (NULL);
+ }
+ }
+
+ /* Sort nexthops & check there are no duplicates */
+ sort_weightened_nhops(wn, num_nhops);
+ uint32_t last_id = 0;
+ for (int i = 0; i < num_nhops; i++) {
+ if (wn[i].nh->nh_priv->nh_idx == last_id) {
+ *perror = EEXIST;
+ return (NULL);
+ }
+ last_id = wn[i].nh->nh_priv->nh_idx;
+ }
+
+ if ((key = alloc_nhgrp(wn, num_nhops)) == NULL) {
+ *perror = ENOMEM;
+ return (NULL);
+ }
+
+ nhg_priv = find_nhgrp(ctl, key);
+ if (nhg_priv != NULL) {
+ /*
+ * Free originally-created group. As it hasn't been linked
+		 * and the dependent nexthops haven't been referenced, just free
+ * the group.
+ */
+ destroy_nhgrp_int(key);
+ *perror = 0;
+ return (nhg_priv);
+ } else {
+ /* No existing group, try to link the new one */
+ if (!ref_nhgrp_nhops(key)) {
+ /*
+ * Some of the nexthops have been scheduled for deletion.
+			 * As the group hasn't been linked / no nexthops have been
+ * referenced, call the final destructor immediately.
+ */
+ destroy_nhgrp_int(key);
+ *perror = EAGAIN;
+ return (NULL);
+ }
+ if (link_nhgrp(ctl, key) == 0) {
+ /* Unable to allocate index? */
+ *perror = EAGAIN;
+ destroy_nhgrp(key);
+ }
+ *perror = 0;
+ return (key);
+ }
+
+ /* NOTREACHED */
+}
+
+/*
+ * Appends one or more nexthops denoted by @wn to the nexthop group @gr_orig.
+ *
+ * Returns referenced nexthop group or NULL. In the latter case, @perror is
+ * filled with an error code.
+ * Note that the function does NOT care if the new nexthops already exist
+ * in the @gr_orig. As a result, they will be added, resulting in the
+ * same nexthop being present multiple times in the new group.
+ */
+static struct nhgrp_priv *
+append_nhops(struct nh_control *ctl, const struct nhgrp_object *gr_orig,
+ struct weightened_nhop *wn, int num_nhops, int *perror)
+{
+ char storage[64];
+ struct weightened_nhop *pnhops;
+ struct nhgrp_priv *nhg_priv;
+ const struct nhgrp_priv *src_priv;
+ size_t sz;
+ int curr_nhops;
+
+ src_priv = NHGRP_PRIV_CONST(gr_orig);
+ curr_nhops = src_priv->nhg_nh_count;
+
+ *perror = 0;
+
+ sz = (src_priv->nhg_nh_count + num_nhops) * (sizeof(struct weightened_nhop));
+ /* optimize for <= 4 paths, each path=16 bytes */
+ if (sz <= sizeof(storage))
+ pnhops = (struct weightened_nhop *)&storage[0];
+ else {
+ pnhops = malloc(sz, M_TEMP, M_NOWAIT);
+ if (pnhops == NULL) {
+ *perror = ENOMEM;
+ return (NULL);
+ }
+ }
+
+ /* Copy nhops from original group first */
+ memcpy(pnhops, src_priv->nhg_nh_weights,
+ curr_nhops * sizeof(struct weightened_nhop));
+ memcpy(&pnhops[curr_nhops], wn, num_nhops * sizeof(struct weightened_nhop));
+ curr_nhops += num_nhops;
+
+ nhg_priv = get_nhgrp(ctl, pnhops, curr_nhops, perror);
+
+ if (pnhops != (struct weightened_nhop *)&storage[0])
+ free(pnhops, M_TEMP);
+
+ if (nhg_priv == NULL)
+ return (NULL);
+
+ return (nhg_priv);
+}
+
+
+/*
+ * Creates/finds nexthop group based on @wn and @num_nhops.
+ * Returns 0 on success with referenced group in @rnd, or
+ * errno.
+ *
+ * If the error is EAGAIN, then the operation can be retried.
+ */
+int
+nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops,
+ struct route_nhop_data *rnd)
+{
+ struct nh_control *ctl = rh->nh_control;
+ struct nhgrp_priv *nhg_priv;
+ int error;
+
+ nhg_priv = get_nhgrp(ctl, wn, num_nhops, &error);
+ if (nhg_priv != NULL)
+ rnd->rnd_nhgrp = nhg_priv->nhg;
+ rnd->rnd_weight = 0;
+
+ return (error);
+}
+
+/*
+ * Creates new nexthop group based on @src group with the nexthops for which
+ * @flt_func returns true removed.
+ * Returns referenced nexthop group or NULL on failure.
+ */
+int
+nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src,
+ nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd)
+{
+ char storage[64];
+ struct nh_control *ctl = rh->nh_control;
+ struct weightened_nhop *pnhops;
+ const struct nhgrp_priv *mp_priv, *src_priv;
+ size_t sz;
+ int error, i, num_nhops;
+
+ src_priv = NHGRP_PRIV_CONST(src);
+
+ sz = src_priv->nhg_nh_count * (sizeof(struct weightened_nhop));
+ /* optimize for <= 4 paths, each path=16 bytes */
+ if (sz <= sizeof(storage))
+ pnhops = (struct weightened_nhop *)&storage[0];
+ else {
+ if ((pnhops = malloc(sz, M_TEMP, M_NOWAIT)) == NULL)
+ return (ENOMEM);
+ }
+
+ /* Filter nexthops */
+ error = 0;
+ num_nhops = 0;
+ for (i = 0; i < src_priv->nhg_nh_count; i++) {
+ if (flt_func(src_priv->nhg_nh_weights[i].nh, flt_data))
+ continue;
+ memcpy(&pnhops[num_nhops++], &src_priv->nhg_nh_weights[i],
+ sizeof(struct weightened_nhop));
+ }
+
+ if (num_nhops == 0) {
+ rnd->rnd_nhgrp = NULL;
+ rnd->rnd_weight = 0;
+ } else if (num_nhops == 1) {
+ rnd->rnd_nhop = pnhops[0].nh;
+ rnd->rnd_weight = pnhops[0].weight;
+ if (nhop_try_ref_object(rnd->rnd_nhop) == 0)
+ error = EAGAIN;
+ } else {
+ mp_priv = get_nhgrp(ctl, pnhops, num_nhops, &error);
+ if (mp_priv != NULL)
+ rnd->rnd_nhgrp = mp_priv->nhg;
+ rnd->rnd_weight = 0;
+ }
+
+ if (pnhops != (struct weightened_nhop *)&storage[0])
+ free(pnhops, M_TEMP);
+
+ return (error);
+}
+
+/*
+ * Creates new multipath group based on existing group/nhop in @rnd_orig and
+ * to-be-added nhop @rnd_add.
+ * Returns 0 on success and stores result in @rnd_new.
+ */
+int
+nhgrp_get_addition_group(struct rib_head *rh, struct route_nhop_data *rnd_orig,
+ struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_new)
+{
+ struct nh_control *ctl = rh->nh_control;
+ struct nhgrp_priv *nhg_priv;
+ struct weightened_nhop wn[2];
+ int error;
+
+ if (rnd_orig->rnd_nhop == NULL) {
+ /* No paths to add to, just reference current nhop */
+ *rnd_new = *rnd_add;
+ if (nhop_try_ref_object(rnd_new->rnd_nhop) == 0)
+ return (EAGAIN);
+ return (0);
+ }
+
+ wn[0].nh = rnd_add->rnd_nhop;
+ wn[0].weight = rnd_add->rnd_weight;
+
+ if (!NH_IS_NHGRP(rnd_orig->rnd_nhop)) {
+ /* Simple merge of 2 non-multipath nexthops */
+ wn[1].nh = rnd_orig->rnd_nhop;
+ wn[1].weight = rnd_orig->rnd_weight;
+ nhg_priv = get_nhgrp(ctl, wn, 2, &error);
+ } else {
+ /* Get new nhop group with @rt->rt_nhop as an additional nhop */
+ nhg_priv = append_nhops(ctl, rnd_orig->rnd_nhgrp, &wn[0], 1,
+ &error);
+ }
+
+ if (nhg_priv == NULL)
+ return (error);
+ rnd_new->rnd_nhgrp = nhg_priv->nhg;
+ rnd_new->rnd_weight = 0;
+
+ return (0);
+}
+
+/*
+ * Returns pointer to array of nexthops with weights for
+ * given @nhg. Stores number of items in the array into @pnum_nhops.
+ */
+struct weightened_nhop *
+nhgrp_get_nhops(struct nhgrp_object *nhg, uint32_t *pnum_nhops)
+{
+ struct nhgrp_priv *nhg_priv;
+
+ KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
+
+ nhg_priv = NHGRP_PRIV(nhg);
+ *pnum_nhops = nhg_priv->nhg_nh_count;
+
+ return (nhg_priv->nhg_nh_weights);
+}
+
+__noinline static int
+dump_nhgrp_entry(struct rib_head *rh, const struct nhgrp_priv *nhg_priv,
+ char *buffer, size_t buffer_size, struct sysctl_req *w)
+{
+ struct rt_msghdr *rtm;
+ struct nhgrp_external *nhge;
+ struct nhgrp_container *nhgc;
+ const struct nhgrp_object *nhg;
+ struct nhgrp_nhop_external *ext;
+ int error;
+ size_t sz;
+
+ nhg = nhg_priv->nhg;
+
+ sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
+ /* controlplane nexthops */
+ sz += sizeof(struct nhgrp_container);
+ sz += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
+ /* dataplane nexthops */
+ sz += sizeof(struct nhgrp_container);
+ sz += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
+
+ KASSERT(sz <= buffer_size, ("increase nhgrp buffer size"));
+
+ bzero(buffer, sz);
+
+ rtm = (struct rt_msghdr *)buffer;
+ rtm->rtm_msglen = sz;
+ rtm->rtm_version = RTM_VERSION;
+ rtm->rtm_type = RTM_GET;
+
+ nhge = (struct nhgrp_external *)(rtm + 1);
+
+ nhge->nhg_idx = nhg_priv->nhg_idx;
+ nhge->nhg_refcount = nhg_priv->nhg_refcount;
+
+	/* fill in control plane nexthops first */
+ nhgc = (struct nhgrp_container *)(nhge + 1);
+ nhgc->nhgc_type = NHG_C_TYPE_CNHOPS;
+ nhgc->nhgc_subtype = 0;
+ nhgc->nhgc_len = sizeof(struct nhgrp_container);
+ nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
+ nhgc->nhgc_count = nhg_priv->nhg_nh_count;
+
+ ext = (struct nhgrp_nhop_external *)(nhgc + 1);
+ for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
+ ext[i].nh_idx = nhg_priv->nhg_nh_weights[i].nh->nh_priv->nh_idx;
+ ext[i].nh_weight = nhg_priv->nhg_nh_weights[i].weight;
+ }
+
+ /* fill in dataplane nexthops */
+ nhgc = (struct nhgrp_container *)(&ext[nhg_priv->nhg_nh_count]);
+ nhgc->nhgc_type = NHG_C_TYPE_DNHOPS;
+ nhgc->nhgc_subtype = 0;
+ nhgc->nhgc_len = sizeof(struct nhgrp_container);
+ nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
+ nhgc->nhgc_count = nhg->nhg_size;
+
+ ext = (struct nhgrp_nhop_external *)(nhgc + 1);
+ for (int i = 0; i < nhg->nhg_size; i++) {
+ ext[i].nh_idx = nhg->nhops[i]->nh_priv->nh_idx;
+ ext[i].nh_weight = 0;
+ }
+
+ error = SYSCTL_OUT(w, buffer, sz);
+
+ return (error);
+}
+
+int
+nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
+{
+ struct nh_control *ctl = rh->nh_control;
+ struct epoch_tracker et;
+ struct nhgrp_priv *nhg_priv;
+ char *buffer;
+ size_t sz;
+ int error = 0;
+
+ if (ctl->gr_head.items_count == 0)
+ return (0);
+
+ /* Calculate the maximum nhop group size in bytes */
+ sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
+ sz += 2 * sizeof(struct nhgrp_container);
+ sz += 2 * sizeof(struct nhgrp_nhop_external) * RIB_MAX_MPATH_WIDTH;
+ buffer = malloc(sz, M_TEMP, M_WAITOK);
+
+ NET_EPOCH_ENTER(et);
+ NHOPS_RLOCK(ctl);
+ CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
+ error = dump_nhgrp_entry(rh, nhg_priv, buffer, sz, w);
+ if (error != 0)
+ break;
+ } CHT_SLIST_FOREACH_END;
+ NHOPS_RUNLOCK(ctl);
+ NET_EPOCH_EXIT(et);
+
+ free(buffer, M_TEMP);
+
+ return (error);
+}
diff --git a/sys/net/route/nhgrp_var.h b/sys/net/route/nhgrp_var.h
new file mode 100644
index 000000000000..ba90a3feedc8
--- /dev/null
+++ b/sys/net/route/nhgrp_var.h
@@ -0,0 +1,72 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This header file contains private definitions for the nexthop groups.
+ *
+ * Header is not intended to be included by the code external to the
+ * routing subsystem.
+ */
+
+#ifndef _NET_ROUTE_NHGRP_VAR_H_
+#define _NET_ROUTE_NHGRP_VAR_H_
+
+/* nhgrp hash definition */
+/* produce hash value for an object */
+#define mpath_hash_obj(_obj) (hash_nhgrp(_obj))
+/* compare two objects */
+#define mpath_cmp(_one, _two) (cmp_nhgrp(_one, _two))
+/* next object accessor */
+#define mpath_next(_obj) (_obj)->nhg_priv_next
+
+struct nhgrp_priv {
+ uint32_t nhg_idx;
+ uint8_t nhg_nh_count; /* number of items in nh_weights */
+ uint8_t nhg_spare[3];
+ u_int nhg_refcount; /* use refcount */
+ u_int nhg_linked; /* refcount(9), == 2 if linked to the list */
+ struct nh_control *nh_control; /* parent control structure */
+ struct nhgrp_priv *nhg_priv_next;
+ struct nhgrp_object *nhg;
+ struct epoch_context nhg_epoch_ctx; /* epoch data for nhop */
+ struct weightened_nhop nhg_nh_weights[0];
+};
+
+#define _NHGRP_PRIV(_src) (&(_src)->nhops[(_src)->nhg_size])
+#define NHGRP_PRIV(_src) ((struct nhgrp_priv *)_NHGRP_PRIV(_src))
+#define NHGRP_PRIV_CONST(_src) ((const struct nhgrp_priv *)_NHGRP_PRIV(_src))
+
+/* nhgrp.c */
+bool nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags);
+struct nhgrp_priv *find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key);
+int link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv);
+struct nhgrp_priv *unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key);
+
+#endif
+
diff --git a/sys/net/route/nhop.c b/sys/net/route/nhop.c
index 4b9a79ffbf20..0db47db9916e 100644
--- a/sys/net/route/nhop.c
+++ b/sys/net/route/nhop.c
@@ -64,7 +64,7 @@ __FBSDID("$FreeBSD$");
* is backed by the bitmask array.
*/
-static MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data");
+MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data");
/* Hash management functions */
@@ -112,6 +112,9 @@ destroy_ctl(struct nh_control *ctl)
NHOPS_LOCK_DESTROY(ctl);
free(ctl->nh_head.ptr, M_NHOP);
free(ctl->nh_idx_head.idx, M_NHOP);
+#ifdef ROUTE_MPATH
+ nhgrp_ctl_free(ctl);
+#endif
free(ctl, M_NHOP);
}
@@ -154,6 +157,9 @@ nhops_destroy_rib(struct rib_head *rh)
DPRINTF("Marking nhop %u unlinked", nh_priv->nh_idx);
refcount_release(&nh_priv->nh_linked);
} CHT_SLIST_FOREACH_END;
+#ifdef ROUTE_MPATH
+ nhgrp_ctl_unlink_all(ctl);
+#endif
NHOPS_WUNLOCK(ctl);
/*
diff --git a/sys/net/route/nhop.h b/sys/net/route/nhop.h
index 1f6aff134c2d..3944d8946b07 100644
--- a/sys/net/route/nhop.h
+++ b/sys/net/route/nhop.h
@@ -155,7 +155,7 @@ struct nhop_object {
*/
#define NH_IS_VALID(_nh) RT_LINK_IS_UP((_nh)->nh_ifp)
-#define NH_IS_MULTIPATH(_nh) ((_nh)->nh_flags & NHF_MULTIPATH)
+#define NH_IS_NHGRP(_nh) ((_nh)->nh_flags & NHF_MULTIPATH)
#define RT_GATEWAY(_rt) ((struct sockaddr *)&(_rt)->rt_nhop->gw4_sa)
#define RT_GATEWAY_CONST(_rt) ((const struct sockaddr *)&(_rt)->rt_nhop->gw4_sa)
@@ -166,6 +166,11 @@ struct nhop_object {
_nh = NULL; \
} while (0)
+struct weightened_nhop {
+ struct nhop_object *nh;
+ uint32_t weight;
+};
+
void nhop_free(struct nhop_object *nh);
struct sysctl_req;
@@ -209,16 +214,34 @@ struct nhop_addrs {
uint16_t src_sa_off; /* offset of src address SA */
};
-struct mpath_nhop_external {
+#define NHG_C_TYPE_CNHOPS 0x1 /* Control plane nhops list */
+#define NHG_C_TYPE_DNHOPS 0x2 /* Dataplane nhops list */
+struct nhgrp_container {
+ uint32_t nhgc_len; /* container length */
+ uint16_t nhgc_count; /* number of items */
+ uint8_t nhgc_type; /* container type */
+ uint8_t nhgc_subtype; /* container subtype */
+};
+
+struct nhgrp_nhop_external {
uint32_t nh_idx;
uint32_t nh_weight;
};
-struct mpath_external {
- uint32_t mp_idx;
- uint32_t mp_refcount;
- uint32_t mp_nh_count;
- uint32_t mp_group_size;
+/*
+ * Layout:
+ * - nhgrp_external
+ * - nhgrp_container (control plane nhops list)
+ * - nhgrp_nhop_external
+ * - nhgrp_nhop_external
+ * ..
+ * - nhgrp_container (dataplane nhops list)
+ * - nhgrp_nhop_external
+ * - nhgrp_nhop_external
+ */
+struct nhgrp_external {
+ uint32_t nhg_idx; /* Nexthop group index */
+ uint32_t nhg_refcount; /* number of references */
};
#endif
diff --git a/sys/net/route/nhop_ctl.c b/sys/net/route/nhop_ctl.c
index b9ac4d63218d..150ae5c4be58 100644
--- a/sys/net/route/nhop_ctl.c
+++ b/sys/net/route/nhop_ctl.c
@@ -695,7 +695,14 @@ void
nhop_free_any(struct nhop_object *nh)
{
+#ifdef ROUTE_MPATH
+ if (!NH_IS_NHGRP(nh))
+ nhop_free(nh);
+ else
+ nhgrp_free((struct nhgrp_object *)nh);
+#else
nhop_free(nh);
+#endif
}
/* Helper functions */
diff --git a/sys/net/route/nhop_var.h b/sys/net/route/nhop_var.h
index 220b6c9a7634..6e1aba670e3c 100644
--- a/sys/net/route/nhop_var.h
+++ b/sys/net/route/nhop_var.h
@@ -37,6 +37,8 @@
#ifndef _NET_ROUTE_NHOP_VAR_H_
#define _NET_ROUTE_NHOP_VAR_H_
+MALLOC_DECLARE(M_NHOP);
+
/* define nhop hash table */
struct nhop_priv;
CHT_SLIST_DEFINE(nhops, struct nhop_priv);
@@ -47,9 +49,15 @@ CHT_SLIST_DEFINE(nhops, struct nhop_priv);
/* next object accessor */
#define nhops_next(_obj) (_obj)->nh_next
+/* define multipath hash table */
+struct nhgrp_priv;
+CHT_SLIST_DEFINE(nhgroups, struct nhgrp_priv);
+
struct nh_control {
struct nhops_head nh_head; /* hash table head */
struct bitmask_head nh_idx_head; /* nhop index head */
+ struct nhgroups_head gr_head; /* nhgrp hash table head */
+ struct bitmask_head gr_idx_head; /* nhgrp index head */
struct rwlock ctl_lock; /* overall ctl lock */
struct rib_head *ctl_rh; /* pointer back to rnh */
struct epoch_context ctl_epoch_ctx; /* epoch ctl helper */
@@ -80,7 +88,8 @@ struct nhop_priv {
struct epoch_context nh_epoch_ctx; /* epoch data for nhop */
};
-#define NH_IS_PINNED(_nh) ((_nh)->nh_priv->rt_flags & RTF_PINNED)
+#define NH_IS_PINNED(_nh) ((!NH_IS_NHGRP(_nh)) && \
+ ((_nh)->nh_priv->rt_flags & RTF_PINNED))
/* nhop.c */
struct nhop_priv *find_nhop(struct nh_control *ctl,
diff --git a/sys/net/route/route_ctl.c b/sys/net/route/route_ctl.c
index 37c23e2cb1cb..f720d08f1f52 100644
--- a/sys/net/route/route_ctl.c
+++ b/sys/net/route/route_ctl.c
@@ -29,7 +29,7 @@
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
-#include "opt_mpath.h"
+#include "opt_route.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -83,9 +83,6 @@ static int del_route(struct rib_head *rnh, struct rt_addrinfo *info,
struct rib_cmd_info *rc);
static int change_route(struct rib_head *rnh, struct rt_addrinfo *info,
struct route_nhop_data *nhd_orig, struct rib_cmd_info *rc);
-static int change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
- struct rt_addrinfo *info, struct route_nhop_data *rnd,
- struct rib_cmd_info *rc);
static int rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info,
struct rib_cmd_info *rc);
@@ -94,6 +91,20 @@ static void rib_notify(struct rib_head *rnh, enum rib_subscription_type type,
struct rib_cmd_info *rc);
static void destroy_subscription_epoch(epoch_context_t ctx);
+static bool rib_can_multipath(struct rib_head *rh);
+
+/* Per-vnet multipath routing configuration */
+SYSCTL_DECL(_net_route);
+#define V_rib_route_multipath VNET(rib_route_multipath)
+#ifdef ROUTE_MPATH
+#define _MP_FLAGS CTLFLAG_RW
+#else
+#define _MP_FLAGS CTLFLAG_RD
+#endif
+VNET_DEFINE(u_int, rib_route_multipath) = 0;
+SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
+ &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
+#undef _MP_FLAGS
/* Routing table UMA zone */
VNET_DEFINE_STATIC(uma_zone_t, rtzone);
@@ -128,7 +139,7 @@ destroy_rtentry(struct rtentry *rt)
CURVNET_SET(nhop_get_vnet(rt->rt_nhop));
/* Unreference nexthop */
- nhop_free(rt->rt_nhop);
+ nhop_free_any(rt->rt_nhop);
uma_zfree(V_rtzone, rt);
@@ -175,6 +186,41 @@ get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
return (rnh);
}
+#ifdef ROUTE_MPATH
+static bool
+rib_can_multipath(struct rib_head *rh)
+{
+ int result;
+
+ CURVNET_SET(rh->rib_vnet);
+ result = !!V_rib_route_multipath;
+ CURVNET_RESTORE();
+
+ return (result);
+}
+
+/*
+ * Check if nhop is multipath-eligible.
+ * Avoid nhops without gateways and redirects.
+ *
+ * Returns 1 for multipath-eligible nexthop,
+ * 0 otherwise.
+ */
+bool
+nhop_can_multipath(const struct nhop_object *nh)
+{
+
+ if ((nh->nh_flags & NHF_MULTIPATH) != 0)
+ return (1);
+ if ((nh->nh_flags & NHF_GATEWAY) == 0)
+ return (0);
+ if ((nh->nh_flags & NHF_REDIRECT) != 0)
+ return (0);
+
+ return (1);
+}
+#endif
+
static int
get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
{
@@ -206,7 +252,7 @@ rt_set_expire_info(struct rtentry *rt, const struct rt_addrinfo *info)
*
* Returns true if matches, false otherwise.
*/
-static bool
+bool
match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
{
@@ -461,7 +507,7 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info,
struct rib_cmd_info *rc)
{
struct nhop_object *nh_orig;
- struct route_nhop_data rnd;
+ struct route_nhop_data rnd_orig, rnd_add;
struct nhop_object *nh;
struct rtentry *rt, *rt_orig;
int error;
@@ -470,32 +516,19 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info,
if (error != 0)
return (error);
- rnd.rnd_nhop = rt->rt_nhop;
- rnd.rnd_weight = rt->rt_weight;
+ rnd_add.rnd_nhop = rt->rt_nhop;
+ rnd_add.rnd_weight = rt->rt_weight;
nh = rt->rt_nhop;
RIB_WLOCK(rnh);
-#ifdef RADIX_MPATH
- struct sockaddr *netmask;
- netmask = info->rti_info[RTAX_NETMASK];
- /* do not permit exactly the same dst/mask/gw pair */
- if (rt_mpath_capable(rnh) &&
- rt_mpath_conflict(rnh, rt, netmask)) {
- RIB_WUNLOCK(rnh);
-
- nhop_free(nh);
- uma_zfree(V_rtzone, rt);
- return (EEXIST);
- }
-#endif
- error = add_route_nhop(rnh, rt, info, &rnd, rc);
+ error = add_route_nhop(rnh, rt, info, &rnd_add, rc);
if (error == 0) {
RIB_WUNLOCK(rnh);
return (0);
}
/* addition failed. Lookup prefix in the rib to determine the cause */
- rt_orig = lookup_prefix(rnh, info, &rnd);
+ rt_orig = lookup_prefix(rnh, info, &rnd_orig);
if (rt_orig == NULL) {
/* No prefix -> rnh_addaddr() failed to allocate memory */
RIB_WUNLOCK(rnh);
@@ -505,11 +538,11 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info,
}
/* We have existing route in the RIB. */
- nh_orig = rnd.rnd_nhop;
+ nh_orig = rnd_orig.rnd_nhop;
/* Check if new route has higher preference */
if (can_override_nhop(info, nh_orig) > 0) {
/* Update nexthop to the new route */
- change_route_nhop(rnh, rt_orig, info, &rnd, rc);
+ change_route_nhop(rnh, rt_orig, info, &rnd_add, rc);
RIB_WUNLOCK(rnh);
uma_zfree(V_rtzone, rt);
nhop_free(nh_orig);
@@ -518,11 +551,26 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info,
RIB_WUNLOCK(rnh);
+#ifdef ROUTE_MPATH
+ if (rib_can_multipath(rnh) && nhop_can_multipath(rnd_add.rnd_nhop) &&
+ nhop_can_multipath(rnd_orig.rnd_nhop))
+ error = add_route_mpath(rnh, info, rt, &rnd_add, &rnd_orig, rc);
+ else
+#endif
/* Unable to add - another route with the same preference exists */
error = EEXIST;
+ /*
+ * ROUTE_MPATH disabled: failed to add route, free both nhop and rt.
+ * ROUTE_MPATH enabled: original nhop reference is unused in any case,
+ * free rt only if not _adding_ new route to rib (e.g. the case
+ * when initial lookup returned existing route, but then it got
+ * deleted prior to multipath group insertion, leading to a simple
+ * non-multipath add as a result).
+ */
nhop_free(nh);
- uma_zfree(V_rtzone, rt);
+ if ((error != 0) || rc->rc_cmd != RTM_ADD)
+ uma_zfree(V_rtzone, rt);
return (error);
}
@@ -588,7 +636,13 @@ rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info
return (ESRCH);
nh = rt->rt_nhop;
-
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh)) {
+ error = del_route_mpath(rnh, info, rt,
+ (struct nhgrp_object *)nh, rc);
+ return (error);
+ }
+#endif
error = check_info_match_nhop(info, rt, nh);
if (error != 0)
return (error);
@@ -600,14 +654,6 @@ rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info
* Remove the item from the tree and return it.
* Complain if it is not there and do no more processing.
*/
-#ifdef RADIX_MPATH
- info->rti_info[RTAX_GATEWAY] = &nh->gw_sa;
- if (rt_mpath_capable(rnh)) {
- rn = rt_mpath_unlink(rnh, info, rt, &error);
- if (error != 0)
- return (error);
- } else
-#endif
rn = rnh->rnh_deladdr(info->rti_info[RTAX_DST],
info->rti_info[RTAX_NETMASK], &rnh->head);
if (rn == NULL)
@@ -648,7 +694,18 @@ del_route(struct rib_head *rnh, struct rt_addrinfo *info,
* If the caller wants it, then it can have it,
* the entry will be deleted after the end of the current epoch.
*/
- rtfree(rc->rc_rt);
+ if (rc->rc_cmd == RTM_DELETE)
+ rtfree(rc->rc_rt);
+#ifdef ROUTE_MPATH
+ else {
+ /*
+ * Deleting 1 path may result in RTM_CHANGE to
+ * a different mpath group/nhop.
+ * Free old mpath group.
+ */
+ nhop_free_any(rc->rc_nh_old);
+ }
+#endif
return (0);
}
@@ -694,19 +751,6 @@ rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
return (ESRCH);
}
-#ifdef RADIX_MPATH
- /*
- * If we got multipath routes,
- * we require users to specify a matching RTAX_GATEWAY.
- */
- if (rt_mpath_capable(rnh)) {
- rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]);
- if (rt == NULL) {
- RIB_RUNLOCK(rnh);
- return (ESRCH);
- }
- }
-#endif
rnd_orig.rnd_nhop = rt->rt_nhop;
rnd_orig.rnd_weight = rt->rt_weight;
@@ -722,18 +766,11 @@ rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
}
static int
-change_route(struct rib_head *rnh, struct rt_addrinfo *info,
- struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct nhop_object *nh_orig, struct nhop_object **nh_new)
{
- int error = 0;
int free_ifa = 0;
- struct nhop_object *nh, *nh_orig;
- struct route_nhop_data rnd_new;
-
- nh = NULL;
- nh_orig = rnd_orig->rnd_nhop;
- if (nh_orig == NULL)
- return (ESRCH);
+ int error;
/*
* New gateway could require new ifaddr, ifp;
@@ -759,24 +796,101 @@ change_route(struct rib_head *rnh, struct rt_addrinfo *info,
}
}
- error = nhop_create_from_nhop(rnh, nh_orig, info, &nh);
+ error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
if (free_ifa) {
ifa_free(info->rti_ifa);
info->rti_ifa = NULL;
}
+
+ return (error);
+}
+
+#ifdef ROUTE_MPATH
+static int
+change_mpath_route(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+{
+ int error = 0;
+ struct nhop_object *nh, *nh_orig, *nh_new;
+ struct route_nhop_data rnd_new;
+
+ nh = NULL;
+ nh_orig = rnd_orig->rnd_nhop;
+
+ struct weightened_nhop *wn = NULL, *wn_new;
+ uint32_t num_nhops;
+
+ wn = nhgrp_get_nhops((struct nhgrp_object *)nh_orig, &num_nhops);
+ nh_orig = NULL;
+ for (int i = 0; i < num_nhops; i++) {
+ if (check_info_match_nhop(info, NULL, wn[i].nh)) {
+ nh_orig = wn[i].nh;
+ break;
+ }
+ }
+
+ if (nh_orig == NULL)
+ return (ESRCH);
+
+ error = change_nhop(rnh, info, nh_orig, &nh_new);
if (error != 0)
return (error);
- rnd_new.rnd_nhop = nh;
- if (info->rti_mflags & RTV_WEIGHT)
- rnd_new.rnd_weight = info->rti_rmx->rmx_weight;
- else
- rnd_new.rnd_weight = rnd_orig->rnd_weight;
+ wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
+ M_TEMP, M_NOWAIT | M_ZERO);
+ if (wn_new == NULL) {
+ nhop_free(nh_new);
+ return (EAGAIN);
+ }
+
+ memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
+ for (int i = 0; i < num_nhops; i++) {
+ if (wn[i].nh == nh_orig) {
+ wn[i].nh = nh_new;
+ wn[i].weight = get_info_weight(info, rnd_orig->rnd_weight);
+ break;
+ }
+ }
+
+ error = nhgrp_get_group(rnh, wn_new, num_nhops, &rnd_new);
+ nhop_free(nh_new);
+ free(wn_new, M_TEMP);
+
+ if (error != 0)
+ return (error);
error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc);
return (error);
}
+#endif
+
+static int
+change_route(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+{
+ int error = 0;
+ struct nhop_object *nh, *nh_orig;
+ struct route_nhop_data rnd_new;
+
+ nh = NULL;
+ nh_orig = rnd_orig->rnd_nhop;
+ if (nh_orig == NULL)
+ return (ESRCH);
+
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh_orig))
+ return (change_mpath_route(rnh, info, rnd_orig, rc));
+#endif
+
+ rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
+ error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
+ if (error != 0)
+ return (error);
+ error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc);
+
+ return (error);
+}
/*
* Insert @rt with nhop data from @rnd_new to @rnh.
@@ -827,7 +941,7 @@ add_route_nhop(struct rib_head *rnh, struct rtentry *rt,
* Conditionally set rt_expire if set in @info.
* Returns 0 on success.
*/
-static int
+int
change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
struct rt_addrinfo *info, struct route_nhop_data *rnd,
struct rib_cmd_info *rc)
@@ -855,6 +969,8 @@ change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
rn = rnh->rnh_deladdr(ndst, netmask, &rnh->head);
if (rn == NULL)
return (ESRCH);
+ rt = RNTORT(rn);
+ rt->rte_flags &= ~RTF_UP;
}
/* Finalize notification */
@@ -989,7 +1105,6 @@ rt_checkdelroute(struct radix_node *rn, void *arg)
info->rti_info[RTAX_DST] = rt_key(rt);
info->rti_info[RTAX_NETMASK] = rt_mask(rt);
- info->rti_info[RTAX_GATEWAY] = &rt->rt_nhop->gw_sa;
error = rt_unlinkrte(di->rnh, info, &di->rc);
@@ -1000,7 +1115,7 @@ rt_checkdelroute(struct radix_node *rn, void *arg)
* XXX: Delayed notifications not implemented
* for nexthop updates.
*/
- if (error == 0) {
+ if ((error == 0) && (di->rc.rc_cmd == RTM_DELETE)) {
/* Add to the list and return */
rt->rt_chain = di->head;
di->head = rt;
@@ -1024,6 +1139,7 @@ rib_walk_del(u_int fibnum, int family, rt_filter_f_t *filter_f, void *arg, bool
struct rib_head *rnh;
struct rt_delinfo di;
struct rtentry *rt;
+ struct nhop_object *nh;
struct epoch_tracker et;
rnh = rt_tables_get_rnh(fibnum, family);
@@ -1049,18 +1165,31 @@ rib_walk_del(u_int fibnum, int family, rt_filter_f_t *filter_f, void *arg, bool
rt = di.head;
di.head = rt->rt_chain;
rt->rt_chain = NULL;
+ nh = rt->rt_nhop;
di.rc.rc_rt = rt;
- di.rc.rc_nh_old = rt->rt_nhop;
+ di.rc.rc_nh_old = nh;
rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
/* TODO std rt -> rt_addrinfo export */
di.info.rti_info[RTAX_DST] = rt_key(rt);
di.info.rti_info[RTAX_NETMASK] = rt_mask(rt);
- if (report)
- rt_routemsg(RTM_DELETE, rt, rt->rt_nhop->nh_ifp, 0,
- fibnum);
+ if (report) {
+#ifdef ROUTE_MPATH
+ struct nhgrp_object *nhg;
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+ if (NH_IS_NHGRP(nh)) {
+ nhg = (struct nhgrp_object *)nh;
+ wn = nhgrp_get_nhops(nhg, &num_nhops);
+ for (int i = 0; i < num_nhops; i++)
+ rt_routemsg(RTM_DELETE, rt,
+ wn[i].nh->nh_ifp, 0, fibnum);
+ } else
+#endif
+ rt_routemsg(RTM_DELETE, rt, nh->nh_ifp, 0, fibnum);
+ }
rtfree(rt);
}
diff --git a/sys/net/route/route_ctl.h b/sys/net/route/route_ctl.h
index fb6dda47b3ba..151771146e65 100644
--- a/sys/net/route/route_ctl.h
+++ b/sys/net/route/route_ctl.h
@@ -53,6 +53,10 @@ int rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
int rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
struct rib_cmd_info *rc);
+typedef void route_notification_t(struct rib_cmd_info *rc, void *);
+void rib_decompose_notification(struct rib_cmd_info *rc,
+ route_notification_t *cb, void *cbdata);
+
int rib_add_redirect(u_int fibnum, struct sockaddr *dst,
struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp,
int flags, int expire_sec);
@@ -66,6 +70,20 @@ typedef void rt_setwarg_t(struct rib_head *, uint32_t, int, void *);
void rt_foreach_fib_walk(int af, rt_setwarg_t *, rt_walktree_f_t *, void *);
void rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg);
+struct route_nhop_data;
+const struct rtentry *rib_lookup_prefix(uint32_t fibnum, int family,
+ const struct sockaddr *dst, const struct sockaddr *netmask,
+ struct route_nhop_data *rnd);
+const struct rtentry *rib_lookup_lpm(uint32_t fibnum, int family,
+ const struct sockaddr *dst, struct route_nhop_data *rnd);
+
+/* Multipath */
+struct nhgrp_object;
+struct weightened_nhop;
+
+struct weightened_nhop *nhgrp_get_nhops(struct nhgrp_object *nhg,
+ uint32_t *pnum_nhops);
+
enum rib_subscription_type {
RIB_NOTIFY_IMMEDIATE,
RIB_NOTIFY_DELAYED
diff --git a/sys/net/route/route_helpers.c b/sys/net/route/route_helpers.c
index b5b45ef662cc..dfa573d23a66 100644
--- a/sys/net/route/route_helpers.c
+++ b/sys/net/route/route_helpers.c
@@ -131,3 +131,167 @@ rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags,
return (nh);
}
+
+#ifdef ROUTE_MPATH
+static void
+decompose_change_notification(struct rib_cmd_info *rc, route_notification_t *cb,
+ void *cbdata)
+{
+ uint32_t num_old, num_new;
+ uint32_t nh_idx_old, nh_idx_new;
+ struct weightened_nhop *wn_old, *wn_new;
+ struct weightened_nhop tmp = { NULL, 0 };
+ uint32_t idx_old = 0, idx_new = 0;
+
+ struct rib_cmd_info rc_del = { .rc_cmd = RTM_DELETE, .rc_rt = rc->rc_rt };
+ struct rib_cmd_info rc_add = { .rc_cmd = RTM_ADD, .rc_rt = rc->rc_rt };
+
+ if (NH_IS_NHGRP(rc->rc_nh_old)) {
+ wn_old = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_old);
+ } else {
+ tmp.nh = rc->rc_nh_old;
+ tmp.weight = rc->rc_nh_weight;
+ wn_old = &tmp;
+ num_old = 1;
+ }
+ if (NH_IS_NHGRP(rc->rc_nh_new)) {
+ wn_new = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_new);
+ } else {
+ tmp.nh = rc->rc_nh_new;
+ tmp.weight = rc->rc_nh_weight;
+ wn_new = &tmp;
+ num_new = 1;
+ }
+
+ /* Use the fact that each @wn array is sorted */
+ /*
+ * Want to convert into set of add and delete operations
+ * [1] -> [1, 2] = A{2}
+ * [2] -> [1, 2] = A{1}
+ * [1, 2, 4]->[1, 3, 4] = A{2}, D{3}
+ * [1, 2, 4]->[1, 4] = D{2}
+ * [1, 2, 4] -> [3, 4] = D{1}, C{2,3} OR C{1,3}, D{2} OR D{1},D{2},A{3}
+ * [1, 2] -> [3, 4] =
+ *
+ */
+ idx_old = 0;
+ while ((idx_old < num_old) && (idx_new < num_new)) {
+ nh_idx_old = wn_old[idx_old].nh->nh_priv->nh_idx;
+ nh_idx_new = wn_new[idx_new].nh->nh_priv->nh_idx;
+
+ if (nh_idx_old == nh_idx_new) {
+ if (wn_old[idx_old].weight != wn_new[idx_new].weight) {
+ /* Update weight by providing del/add notifications */
+ rc_del.rc_nh_old = wn_old[idx_old].nh;
+ rc_del.rc_nh_weight = wn_old[idx_old].weight;
+ cb(&rc_del, cbdata);
+
+ rc_add.rc_nh_new = wn_new[idx_new].nh;
+ rc_add.rc_nh_weight = wn_new[idx_new].weight;
+ cb(&rc_add, cbdata);
+ }
+ idx_old++;
+ idx_new++;
+ } else if (nh_idx_old < nh_idx_new) {
+ /*
+ * [1, ~2~, 4], [1, ~3~, 4]
+ * [1, ~2~, 5], [1, ~3~, 4]
+ * [1, ~2~], [1, ~3~, 4]
+ */
+ if ((idx_old + 1 >= num_old) ||
+ (wn_old[idx_old + 1].nh->nh_priv->nh_idx > nh_idx_new)) {
+ /* Add new unless the next old item is still <= new */
+ rc_add.rc_nh_new = wn_new[idx_new].nh;
+ rc_add.rc_nh_weight = wn_new[idx_new].weight;
+ cb(&rc_add, cbdata);
+ idx_new++;
+ }
+ /* In any case, delete current old */
+ rc_del.rc_nh_old = wn_old[idx_old].nh;
+ rc_del.rc_nh_weight = wn_old[idx_old].weight;
+ cb(&rc_del, cbdata);
+ idx_old++;
+ } else {
+ /*
+ * nh_idx_old > nh_idx_new
+ *
+ * [1, ~3~, 4], [1, ~2~, 4]
+ * [1, ~3~, 5], [1, ~2~, 4]
+ * [1, ~3~, 4], [1, ~2~]
+ */
+ if ((idx_new + 1 >= num_new) ||
+ (wn_new[idx_new + 1].nh->nh_priv->nh_idx > nh_idx_old)) {
+ /* No next item or next item is > current one */
+ rc_add.rc_nh_new = wn_new[idx_new].nh;
+ rc_add.rc_nh_weight = wn_new[idx_new].weight;
+ cb(&rc_add, cbdata);
+ idx_new++;
+ }
+ /* In any case, delete current old */
+ rc_del.rc_nh_old = wn_old[idx_old].nh;
+ rc_del.rc_nh_weight = wn_old[idx_old].weight;
+ cb(&rc_del, cbdata);
+ idx_old++;
+ }
+ }
+
+ while (idx_old < num_old) {
+ rc_del.rc_nh_old = wn_old[idx_old].nh;
+ rc_del.rc_nh_weight = wn_old[idx_old].weight;
+ cb(&rc_del, cbdata);
+ idx_old++;
+ }
+
+ while (idx_new < num_new) {
+ rc_add.rc_nh_new = wn_new[idx_new].nh;
+ rc_add.rc_nh_weight = wn_new[idx_new].weight;
+ cb(&rc_add, cbdata);
+ idx_new++;
+ }
+}
+
+/*
+ * Decompose multipath cmd info @rc into a list of add/del/change
+ * single-path operations, calling @cb callback for each operation.
+ * Assumes at least one of the nexthops in @rc is multipath.
+ */
+void
+rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb,
+ void *cbdata)
+{
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+ struct rib_cmd_info rc_new;
+
+ rc_new = *rc;
+	DPRINTF("cb=%p cmd=%d nh_old=%p nh_new=%p",
+	    cb, rc->rc_cmd, rc->rc_nh_old, rc->rc_nh_new);
+ switch (rc->rc_cmd) {
+ case RTM_ADD:
+ if (!NH_IS_NHGRP(rc->rc_nh_new))
+ return;
+ wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_nhops);
+ for (uint32_t i = 0; i < num_nhops; i++) {
+ rc_new.rc_nh_new = wn[i].nh;
+ rc_new.rc_nh_weight = wn[i].weight;
+ cb(&rc_new, cbdata);
+ }
+ break;
+ case RTM_DELETE:
+ if (!NH_IS_NHGRP(rc->rc_nh_old))
+ return;
+ wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_nhops);
+ for (uint32_t i = 0; i < num_nhops; i++) {
+ rc_new.rc_nh_old = wn[i].nh;
+ rc_new.rc_nh_weight = wn[i].weight;
+ cb(&rc_new, cbdata);
+ }
+ break;
+ case RTM_CHANGE:
+ if (!NH_IS_NHGRP(rc->rc_nh_old) && !NH_IS_NHGRP(rc->rc_nh_new))
+ return;
+ decompose_change_notification(rc, cb, cbdata);
+ break;
+ }
+}
+#endif
diff --git a/sys/net/route/route_var.h b/sys/net/route/route_var.h
index 6164ec08850c..12d081d410a2 100644
--- a/sys/net/route/route_var.h
+++ b/sys/net/route/route_var.h
@@ -87,6 +87,7 @@ struct rib_head {
/* Constants */
#define RIB_MAX_RETRIES 3
#define RT_MAXFIBS UINT16_MAX
+#define RIB_MAX_MPATH_WIDTH 64
/* Macro for verifying fields in af-specific 'struct route' structures */
#define CHK_STRUCT_FIELD_GENERIC(_s1, _f1, _s2, _f2) \
@@ -113,12 +114,7 @@ _Static_assert(__offsetof(struct route, ro_dst) == __offsetof(_ro_new, _dst_new)
"ro_dst and " #_dst_new " are at different offset")
struct rib_head *rt_tables_get_rnh(uint32_t table, sa_family_t family);
-void rt_mpath_init_rnh(struct rib_head *rnh);
int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum);
-#ifdef RADIX_MPATH
-struct radix_node *rt_mpath_unlink(struct rib_head *rnh,
- struct rt_addrinfo *info, struct rtentry *rto, int *perror);
-#endif
struct rib_cmd_info;
VNET_PCPUSTAT_DECLARE(struct rtstat, rtstat);
@@ -202,14 +198,6 @@ struct rtentry {
/* rtentry rt flag mask */
#define RTE_RT_FLAG_MASK (RTF_UP | RTF_HOST)
-/* Nexthop selection */
-#define _NH2MP(_nh) ((struct nhgrp_object *)(_nh))
-#define _SELECT_NHOP(_nh, _flowid) \
- (_NH2MP(_nh))->nhops[(_flowid) % (_NH2MP(_nh))->mp_size]
-#define _RT_SELECT_NHOP(_nh, _flowid) \
- ((!NH_IS_MULTIPATH(_nh)) ? (_nh) : _SELECT_NHOP(_nh, _flowid))
-#define RT_SELECT_NHOP(_rt, _flowid) _RT_SELECT_NHOP((_rt)->rt_nhop, _flowid)
-
/* route_temporal.c */
void tmproutes_update(struct rib_head *rnh, struct rtentry *rt);
void tmproutes_init(struct rib_head *rh);
@@ -217,14 +205,24 @@ void tmproutes_destroy(struct rib_head *rh);
/* route_ctl.c */
struct route_nhop_data {
- struct nhop_object *rnd_nhop;
- uint32_t rnd_weight;
+ union {
+ struct nhop_object *rnd_nhop;
+ struct nhgrp_object *rnd_nhgrp;
+ };
+ uint32_t rnd_weight;
};
+
+int change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
+ struct rt_addrinfo *info, struct route_nhop_data *rnd,
+ struct rib_cmd_info *rc);
int change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
struct route_nhop_data *nhd_new, struct rib_cmd_info *rc);
struct rtentry *lookup_prefix(struct rib_head *rnh,
const struct rt_addrinfo *info, struct route_nhop_data *rnd);
+
+bool nhop_can_multipath(const struct nhop_object *nh);
+bool match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw);
int check_info_match_nhop(const struct rt_addrinfo *info,
const struct rtentry *rt, const struct nhop_object *nh);
int can_override_nhop(const struct rt_addrinfo *info,
@@ -256,5 +254,57 @@ int nhop_create_from_nhop(struct rib_head *rnh, const struct nhop_object *nh_ori
void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu);
int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
+/* MULTIPATH */
+#define	MPF_MULTIPATH	0x08	/* must be consistent with NHF_MULTIPATH */
+
+struct nhgrp_object {
+ uint16_t nhg_flags; /* nexthop group flags */
+	uint8_t			nhg_size;	/* dataplane group size */
+ uint8_t spare;
+ struct nhop_object *nhops[0]; /* nhops */
+};
+
+static inline struct nhop_object *
+nhop_select(struct nhop_object *nh, uint32_t flowid)
+{
+
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh)) {
+ struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
+ nh = nhg->nhops[flowid % nhg->nhg_size];
+ }
+#endif
+ return (nh);
+}
+
+
+struct weightened_nhop;
+
+/* mpath_ctl.c */
+int add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct rtentry *rt, struct route_nhop_data *rnd_add,
+ struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc);
+int del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info,
+ struct rtentry *rt, struct nhgrp_object *nhg, struct rib_cmd_info *rc);
+
+/* nhgrp.c */
+int nhgrp_ctl_init(struct nh_control *ctl);
+void nhgrp_ctl_free(struct nh_control *ctl);
+void nhgrp_ctl_unlink_all(struct nh_control *ctl);
+
+
+/* nhgrp_ctl.c */
+int nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
+
+int nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn,
+ int num_nhops, struct route_nhop_data *rnd);
+typedef bool nhgrp_filter_cb_t(const struct nhop_object *nh, void *data);
+int nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src,
+ nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd);
+int nhgrp_get_addition_group(struct rib_head *rnh,
+ struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_add,
+ struct route_nhop_data *rnd_new);
+
+void nhgrp_free(struct nhgrp_object *nhg);
#endif
diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c
index f3b0ecec2430..c2e2273d0d31 100644
--- a/sys/net/rtsock.c
+++ b/sys/net/rtsock.c
@@ -32,7 +32,7 @@
* $FreeBSD$
*/
#include "opt_ddb.h"
-#include "opt_mpath.h"
+#include "opt_route.h"
#include "opt_inet.h"
#include "opt_inet6.h"
@@ -158,8 +158,7 @@ MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF);
#define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx)
#define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED)
-static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
- "");
+SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
struct walkarg {
int w_tmemsize;
@@ -650,6 +649,25 @@ fill_addrinfo(struct rt_msghdr *rtm, int len, u_int fibnum, struct rt_addrinfo *
return (0);
}
+static struct nhop_object *
+select_nhop(struct nhop_object *nh, const struct sockaddr *gw)
+{
+ if (!NH_IS_NHGRP(nh))
+ return (nh);
+#ifdef ROUTE_MPATH
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+ wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
+ if (gw == NULL)
+ return (wn[0].nh);
+ for (int i = 0; i < num_nhops; i++) {
+ if (match_nhop_gw(wn[i].nh, gw))
+ return (wn[i].nh);
+ }
+#endif
+ return (NULL);
+}
+
/*
* Handles RTM_GET message from routing socket, returning matching rt.
*
@@ -663,6 +681,7 @@ handle_rtm_get(struct rt_addrinfo *info, u_int fibnum,
{
RIB_RLOCK_TRACKER;
struct rib_head *rnh;
+ struct nhop_object *nh;
sa_family_t saf;
saf = info->rti_info[RTAX_DST]->sa_family;
@@ -690,21 +709,12 @@ handle_rtm_get(struct rt_addrinfo *info, u_int fibnum,
RIB_RUNLOCK(rnh);
return (ESRCH);
}
-#ifdef RADIX_MPATH
- /*
- * for RTM_GET, gate is optional even with multipath.
- * if gate == NULL the first match is returned.
- * (no need to call rt_mpath_matchgate if gate == NULL)
- */
- if (rt_mpath_capable(rnh) && info->rti_info[RTAX_GATEWAY]) {
- rc->rc_rt = rt_mpath_matchgate(rc->rc_rt,
- info->rti_info[RTAX_GATEWAY]);
- if (rc->rc_rt == NULL) {
- RIB_RUNLOCK(rnh);
- return (ESRCH);
- }
+
+ nh = select_nhop(rc->rc_rt->rt_nhop, info->rti_info[RTAX_GATEWAY]);
+ if (nh == NULL) {
+ RIB_RUNLOCK(rnh);
+ return (ESRCH);
}
-#endif
/*
* If performing proxied L2 entry insertion, and
* the actual PPP host entry is found, perform
@@ -740,8 +750,13 @@ handle_rtm_get(struct rt_addrinfo *info, u_int fibnum,
RIB_RUNLOCK(rnh);
return (ESRCH);
}
+ nh = select_nhop(rc->rc_rt->rt_nhop, info->rti_info[RTAX_GATEWAY]);
+ if (nh == NULL) {
+ RIB_RUNLOCK(rnh);
+ return (ESRCH);
+ }
}
- rc->rc_nh_new = rc->rc_rt->rt_nhop;
+ rc->rc_nh_new = nh;
rc->rc_nh_weight = rc->rc_rt->rt_weight;
RIB_RUNLOCK(rnh);
@@ -832,6 +847,24 @@ update_rtm_from_rc(struct rt_addrinfo *info, struct rt_msghdr **prtm,
return (0);
}
+static void
+save_del_notification(struct rib_cmd_info *rc, void *_cbdata)
+{
+ struct rib_cmd_info *rc_new = (struct rib_cmd_info *)_cbdata;
+
+ if (rc->rc_cmd == RTM_DELETE)
+ *rc_new = *rc;
+}
+
+static void
+save_add_notification(struct rib_cmd_info *rc, void *_cbdata)
+{
+ struct rib_cmd_info *rc_new = (struct rib_cmd_info *)_cbdata;
+
+ if (rc->rc_cmd == RTM_ADD)
+ *rc_new = *rc;
+}
+
/*ARGSUSED*/
static int
route_output(struct mbuf *m, struct socket *so, ...)
@@ -919,6 +952,15 @@ route_output(struct mbuf *m, struct socket *so, ...)
#ifdef INET6
rti_need_deembed = 1;
#endif
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(rc.rc_nh_new) ||
+ (rc.rc_nh_old && NH_IS_NHGRP(rc.rc_nh_old))) {
+ struct rib_cmd_info rc_simple = {};
+ rib_decompose_notification(&rc,
+ save_add_notification, (void *)&rc_simple);
+ rc = rc_simple;
+ }
+#endif
nh = rc.rc_nh_new;
rtm->rtm_index = nh->nh_ifp->if_index;
}
@@ -927,6 +969,15 @@ route_output(struct mbuf *m, struct socket *so, ...)
case RTM_DELETE:
error = rib_action(fibnum, RTM_DELETE, &info, &rc);
if (error == 0) {
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(rc.rc_nh_old) ||
+ (rc.rc_nh_new && NH_IS_NHGRP(rc.rc_nh_new))) {
+ struct rib_cmd_info rc_simple = {};
+ rib_decompose_notification(&rc,
+ save_del_notification, (void *)&rc_simple);
+ rc = rc_simple;
+ }
+#endif
nh = rc.rc_nh_old;
goto report;
}
@@ -1708,7 +1759,19 @@ sysctl_dumpentry(struct radix_node *rn, void *vw)
if (!can_export_rte(w->w_req->td->td_ucred, rt))
return (0);
nh = rt->rt_nhop;
- error = sysctl_dumpnhop(rt, nh, rt->rt_weight, w);
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh)) {
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+ wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
+ for (int i = 0; i < num_nhops; i++) {
+ error = sysctl_dumpnhop(rt, wn[i].nh, wn[i].weight, w);
+ if (error != 0)
+ return (error);
+ }
+ } else
+#endif
+ error = sysctl_dumpnhop(rt, nh, rt->rt_weight, w);
return (0);
}
@@ -1748,6 +1811,7 @@ sysctl_dumpnhop(struct rtentry *rt, struct nhop_object *nh, uint32_t weight,
rtm->rtm_flags = rt->rte_flags;
rtm->rtm_flags |= nhop_get_rtflags(nh);
rt_getmetrics(rt, nh, &rtm->rtm_rmx);
+ rtm->rtm_rmx.rmx_weight = weight;
rtm->rtm_index = nh->nh_ifp->if_index;
rtm->rtm_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size);
@@ -2028,7 +2092,7 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS)
namelen--;
if (req->newptr)
return (EPERM);
- if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP) {
+ if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP || name[1] == NET_RT_NHGRP) {
if (namelen == 3)
fib = req->td->td_proc->p_fibnum;
else if (namelen == 4)
@@ -2096,6 +2160,7 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS)
}
break;
case NET_RT_NHOP:
+ case NET_RT_NHGRP:
/* Allow dumping one specific af/fib at a time */
if (namelen < 4) {
error = EINVAL;
@@ -2113,6 +2178,12 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS)
}
if (w.w_op == NET_RT_NHOP)
error = nhops_dump_sysctl(rnh, w.w_req);
+ else
+#ifdef ROUTE_MPATH
+ error = nhgrp_dump_sysctl(rnh, w.w_req);
+#else
+ error = ENOTSUP;
+#endif
break;
case NET_RT_IFLIST:
case NET_RT_IFLISTL:
diff --git a/sys/netinet/in.c b/sys/netinet/in.c
index 5553530628bf..0bc02b5f20d3 100644
--- a/sys/netinet/in.c
+++ b/sys/netinet/in.c
@@ -35,8 +35,6 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include "opt_mpath.h"
-
#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/systm.h>
@@ -699,14 +697,6 @@ in_addprefix(struct in_ifaddr *target, int flags)
* interface address, we are done here.
*/
if (ia->ia_flags & IFA_ROUTE) {
-#ifdef RADIX_MPATH
- if (ia->ia_addr.sin_addr.s_addr ==
- target->ia_addr.sin_addr.s_addr) {
- IN_IFADDR_RUNLOCK(&in_ifa_tracker);
- return (EEXIST);
- } else
- break;
-#endif
if (V_nosameprefix) {
IN_IFADDR_RUNLOCK(&in_ifa_tracker);
return (EEXIST);
diff --git a/sys/netinet/in_fib.c b/sys/netinet/in_fib.c
index c46c55bd7d00..4c84de2c7281 100644
--- a/sys/netinet/in_fib.c
+++ b/sys/netinet/in_fib.c
@@ -32,7 +32,6 @@ __FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_route.h"
-#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -48,14 +47,11 @@ __FBSDID("$FreeBSD$");
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
+#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/nhop.h>
#include <net/vnet.h>
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
-
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_fib.h>
@@ -80,7 +76,6 @@ fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib4_lookup: bad fibnum"));
@@ -99,12 +94,7 @@ fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- if (rt_mpath_next(rt) != NULL)
- rt = rt_mpath_selectrte(rt, flowid);
-#endif
- nh = rt->rt_nhop;
+ nh = nhop_select((RNTORT(rn))->rt_nhop, flowid);
/* Ensure route & ifp is UP */
if (RT_LINK_IS_UP(nh->nh_ifp)) {
if (flags & NHR_REF)
@@ -120,7 +110,7 @@ fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
}
inline static int
-check_urpf(const struct nhop_object *nh, uint32_t flags,
+check_urpf_nhop(const struct nhop_object *nh, uint32_t flags,
const struct ifnet *src_if)
{
@@ -137,21 +127,24 @@ check_urpf(const struct nhop_object *nh, uint32_t flags,
return (0);
}
-#ifdef RADIX_MPATH
-inline static int
-check_urpf_mpath(struct rtentry *rt, uint32_t flags,
+static int
+check_urpf(struct nhop_object *nh, uint32_t flags,
const struct ifnet *src_if)
{
-
- while (rt != NULL) {
- if (check_urpf(rt->rt_nhop, flags, src_if) != 0)
- return (1);
- rt = rt_mpath_next(rt);
- }
-
- return (0);
-}
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh)) {
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+ wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
+ for (int i = 0; i < num_nhops; i++) {
+ if (check_urpf_nhop(wn[i].nh, flags, src_if) != 0)
+ return (1);
+ }
+ return (0);
+ } else
#endif
+ return (check_urpf_nhop(nh, flags, src_if));
+}
/*
* Performs reverse path forwarding lookup.
@@ -169,7 +162,6 @@ fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
int ret;
KASSERT((fibnum < rt_numfibs), ("fib4_check_urpf: bad fibnum"));
@@ -186,12 +178,7 @@ fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- ret = check_urpf_mpath(rt, flags, src_if);
-#else
- ret = check_urpf(rt->rt_nhop, flags, src_if);
-#endif
+ ret = check_urpf(RNTORT(rn)->rt_nhop, flags, src_if);
RIB_RUNLOCK(rh);
return (ret);
}
@@ -206,7 +193,6 @@ fib4_lookup_debugnet(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
{
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib4_lookup_debugnet: bad fibnum"));
@@ -225,12 +211,7 @@ fib4_lookup_debugnet(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
/* unlocked lookup */
rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- if (rt_mpath_next(rt) != NULL)
- rt = rt_mpath_selectrte(rt, 0);
-#endif
- nh = rt->rt_nhop;
+ nh = nhop_select((RNTORT(rn))->rt_nhop, 0);
/* Ensure route & ifp is UP */
if (RT_LINK_IS_UP(nh->nh_ifp)) {
if (flags & NHR_REF)
diff --git a/sys/netinet/in_rmx.c b/sys/netinet/in_rmx.c
index ef40fdc6af6c..6dfa1e56eff1 100644
--- a/sys/netinet/in_rmx.c
+++ b/sys/netinet/in_rmx.c
@@ -30,8 +30,6 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include "opt_mpath.h"
-
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
@@ -127,9 +125,6 @@ in_inithead(uint32_t fibnum)
return (NULL);
rh->rnh_preadd = rib4_preadd;
-#ifdef RADIX_MPATH
- rt_mpath_init_rnh(rh);
-#endif
return (rh);
}
diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c
index a26722c97f88..a7e72f4ec407 100644
--- a/sys/netinet/ip_output.c
+++ b/sys/netinet/ip_output.c
@@ -38,7 +38,6 @@ __FBSDID("$FreeBSD$");
#include "opt_ipsec.h"
#include "opt_kern_tls.h"
#include "opt_mbuf_stress_test.h"
-#include "opt_mpath.h"
#include "opt_ratelimit.h"
#include "opt_route.h"
#include "opt_rss.h"
@@ -470,11 +469,7 @@ again:
* for correct operation (as it is for ARP).
*/
uint32_t flowid;
-#ifdef RADIX_MPATH
- flowid = ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr);
-#else
flowid = m->m_pkthdr.flowid;
-#endif
ro->ro_nh = fib4_lookup(fibnum, dst->sin_addr, 0,
NHR_REF, flowid);
diff --git a/sys/netinet6/in6_fib.c b/sys/netinet6/in6_fib.c
index a0e4dacc86e1..9fd869b2730b 100644
--- a/sys/netinet6/in6_fib.c
+++ b/sys/netinet6/in6_fib.c
@@ -33,7 +33,6 @@ __FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_route.h"
-#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -49,14 +48,11 @@ __FBSDID("$FreeBSD$");
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
+#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/nhop.h>
#include <net/vnet.h>
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
-
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip_mroute.h>
@@ -88,7 +84,6 @@ fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6,
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct nhop_object *nh;
struct sockaddr_in6 sin6;
@@ -111,12 +106,7 @@ fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6,
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- if (rt_mpath_next(rt) != NULL)
- rt = rt_mpath_selectrte(rt, flowid);
-#endif
- nh = rt->rt_nhop;
+ nh = nhop_select((RNTORT(rn))->rt_nhop, flowid);
/* Ensure route & ifp is UP */
if (RT_LINK_IS_UP(nh->nh_ifp)) {
if (flags & NHR_REF)
@@ -132,7 +122,7 @@ fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6,
}
inline static int
-check_urpf(const struct nhop_object *nh, uint32_t flags,
+check_urpf_nhop(const struct nhop_object *nh, uint32_t flags,
const struct ifnet *src_if)
{
@@ -149,21 +139,24 @@ check_urpf(const struct nhop_object *nh, uint32_t flags,
return (0);
}
-#ifdef RADIX_MPATH
-inline static int
-check_urpf_mpath(struct rtentry *rt, uint32_t flags,
+static int
+check_urpf(struct nhop_object *nh, uint32_t flags,
const struct ifnet *src_if)
{
-
- while (rt != NULL) {
- if (check_urpf(rt->rt_nhop, flags, src_if) != 0)
- return (1);
- rt = rt_mpath_next(rt);
- }
-
- return (0);
-}
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh)) {
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+ wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
+ for (int i = 0; i < num_nhops; i++) {
+ if (check_urpf_nhop(wn[i].nh, flags, src_if) != 0)
+ return (1);
+ }
+ return (0);
+ } else
#endif
+ return (check_urpf_nhop(nh, flags, src_if));
+}
/*
* Performs reverse path forwarding lookup.
@@ -181,7 +174,6 @@ fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6,
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct sockaddr_in6 sin6;
int ret;
@@ -203,12 +195,7 @@ fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6,
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- ret = check_urpf_mpath(rt, flags, src_if);
-#else
- ret = check_urpf(rt->rt_nhop, flags, src_if);
-#endif
+ ret = check_urpf(RNTORT(rn)->rt_nhop, flags, src_if);
RIB_RUNLOCK(rh);
return (ret);
}
@@ -223,7 +210,6 @@ fib6_lookup_debugnet(uint32_t fibnum, const struct in6_addr *dst6,
{
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct nhop_object *nh;
struct sockaddr_in6 sin6;
@@ -245,8 +231,7 @@ fib6_lookup_debugnet(uint32_t fibnum, const struct in6_addr *dst6,
rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
- nh = rt->rt_nhop;
+ nh = nhop_select((RNTORT(rn))->rt_nhop, 0);
/* Ensure route & ifp is UP */
if (RT_LINK_IS_UP(nh->nh_ifp)) {
if (flags & NHR_REF)
diff --git a/sys/netinet6/in6_rmx.c b/sys/netinet6/in6_rmx.c
index 5f2e2fe3ae6e..54136f9983b2 100644
--- a/sys/netinet6/in6_rmx.c
+++ b/sys/netinet6/in6_rmx.c
@@ -64,8 +64,6 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include "opt_mpath.h"
-
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
@@ -153,9 +151,6 @@ in6_inithead(uint32_t fibnum)
return (NULL);
rh->rnh_preadd = rib6_preadd;
-#ifdef RADIX_MPATH
- rt_mpath_init_rnh(rh);
-#endif
rs = rib_subscribe_internal(rh, nd6_subscription_cb, NULL,
RIB_NOTIFY_IMMEDIATE, true);
diff --git a/sys/netinet6/nd6.c b/sys/netinet6/nd6.c
index ad31d750eb70..1597a4cb6b93 100644
--- a/sys/netinet6/nd6.c
+++ b/sys/netinet6/nd6.c
@@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
+#include "opt_route.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -1591,7 +1592,11 @@ void
nd6_subscription_cb(struct rib_head *rnh, struct rib_cmd_info *rc, void *arg)
{
+#ifdef ROUTE_MPATH
+ rib_decompose_notification(rc, check_release_defrouter, NULL);
+#else
check_release_defrouter(rc, NULL);
+#endif
}
int
diff --git a/sys/sys/socket.h b/sys/sys/socket.h
index 038c4d3ef8b9..311d65671051 100644
--- a/sys/sys/socket.h
+++ b/sys/sys/socket.h
@@ -417,6 +417,7 @@ struct sockproto {
#define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en
* versions of msghdr structs. */
#define NET_RT_NHOP 6 /* dump routing nexthops */
+#define NET_RT_NHGRP 7 /* dump routing nexthop groups */
#endif /* __BSD_VISIBLE */
/*