aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexander V. Chernikov <melifaro@FreeBSD.org>2020-04-12 14:30:00 +0000
committerAlexander V. Chernikov <melifaro@FreeBSD.org>2020-04-12 14:30:00 +0000
commita666325282eaed4b044459d121f339b2d6d0224b (patch)
tree64aab98b0911750e1f0625db916b74583e682bdf
parent07ddae2822b0e0cb4b1b63307dfa422e82297e15 (diff)
downloadsrc-a666325282eaed4b044459d121f339b2d6d0224b.tar.gz
src-a666325282eaed4b044459d121f339b2d6d0224b.zip
Introduce nexthop objects and new routing KPI.
This is the foundational change for the routing subsystem rearchitecture. More details and goals are available in https://reviews.freebsd.org/D24141 . This patch introduces the concept of nexthop objects and a new nexthop-based routing KPI. Nexthops are objects containing all necessary information for performing the packet output decision. Output interface, mtu, flags, gw address go there. For most of the cases, these objects will serve the same role as the struct rtentry is currently serving. Typically there will be low tens of such objects for the router even with multiple BGP full-views, as these objects will be shared between routing entries. This allows to store more information in the nexthop. New KPI: struct nhop_object *fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, uint32_t flags, uint32_t flowid); struct nhop_object *fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags, uint32_t flowid); These 2 functions are intended to replace all flavours of <in_|in6_>rtalloc[1]<_ign><_fib>, mpath functions and the previous fib[46]-generation functions. Upon successful lookup, they return a nexthop object which is guaranteed to exist within the current NET_EPOCH. If longer lifetime is desired, one can specify NHR_REF as a flag and get a referenced version of the nexthop. Reference semantics closely resemble the rtentry ones, allowing sed-style conversion. Additionally, another 2 functions are introduced to support uRPF functionality inside a variety of our firewalls. 
Their primary goal is to hide the multipath implementation details inside the routing subsystem, greatly simplifying firewall implementation: int fib4_lookup_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, uint32_t flags, const struct ifnet *src_if); int fib6_lookup_urpf(uint32_t fibnum, const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags, const struct ifnet *src_if); All functions have a separate scopeid argument, paving the way to eliminating IPv6 scope embedding and allowing to support IPv4 link-locals in the future. Structure changes: * rtentry gets new 'rt_nhop' pointer, slightly growing the overall size. * rib_head gets new 'rnh_preadd' callback pointer, slightly growing the overall size. Old KPI: During the transition state the old and new KPI will coexist. As there are another 4-5 decent-sized conversion patches, it will probably take a couple of weeks. To support both KPIs, fields not required by the new KPI (most of rtentry) have to be kept, resulting in a temporary size increase. Once conversion is finished, rtentry will notably shrink. More details: * architectural overview: https://reviews.freebsd.org/D24141 * list of the next changes: https://reviews.freebsd.org/D24232 Reviewed by: ae,glebius(initial version) Differential Revision: https://reviews.freebsd.org/D24232
Notes
Notes: svn path=/head/; revision=359823
-rw-r--r--etc/mtree/BSD.include.dist2
-rw-r--r--include/Makefile1
-rw-r--r--lib/libc/gen/sysctl.34
-rw-r--r--sys/conf/files5
-rw-r--r--sys/net/radix_mpath.c2
-rw-r--r--sys/net/radix_mpath.h17
-rw-r--r--sys/net/route.c48
-rw-r--r--sys/net/route.h12
-rw-r--r--sys/net/route/nhop.c388
-rw-r--r--sys/net/route/nhop.h229
-rw-r--r--sys/net/route/nhop_ctl.c827
-rw-r--r--sys/net/route/nhop_utils.c219
-rw-r--r--sys/net/route/nhop_utils.h200
-rw-r--r--sys/net/route/nhop_var.h96
-rw-r--r--sys/net/route/route_ctl.c65
-rw-r--r--sys/net/route/route_helpers.c83
-rw-r--r--sys/net/route/shared.h68
-rw-r--r--sys/net/route_var.h44
-rw-r--r--sys/net/rtsock.c24
-rw-r--r--sys/netinet/in_fib.c198
-rw-r--r--sys/netinet/in_fib.h4
-rw-r--r--sys/netinet/in_rmx.c64
-rw-r--r--sys/netinet6/in6_fib.c221
-rw-r--r--sys/netinet6/in6_fib.h6
-rw-r--r--sys/netinet6/in6_rmx.c40
-rw-r--r--sys/sys/socket.h1
-rw-r--r--usr.bin/netstat/Makefile2
-rw-r--r--usr.bin/netstat/common.c140
-rw-r--r--usr.bin/netstat/common.h58
-rw-r--r--usr.bin/netstat/main.c14
-rw-r--r--usr.bin/netstat/netstat.h5
-rw-r--r--usr.bin/netstat/nhops.c472
-rw-r--r--usr.bin/netstat/route.c88
33 files changed, 3471 insertions, 176 deletions
diff --git a/etc/mtree/BSD.include.dist b/etc/mtree/BSD.include.dist
index cf279ca1a554..f2597ba65052 100644
--- a/etc/mtree/BSD.include.dist
+++ b/etc/mtree/BSD.include.dist
@@ -208,6 +208,8 @@
net
altq
..
+ route
+ ..
..
net80211
..
diff --git a/include/Makefile b/include/Makefile
index 462bcb001566..cbfb75951c62 100644
--- a/include/Makefile
+++ b/include/Makefile
@@ -53,6 +53,7 @@ LSUBDIRS= cam/ata cam/mmc cam/nvme cam/scsi \
geom/mirror geom/mountver geom/multipath geom/nop \
geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \
net/altq \
+ net/route \
netgraph/atm netgraph/netflow \
netinet/cc \
netinet/netdump \
diff --git a/lib/libc/gen/sysctl.3 b/lib/libc/gen/sysctl.3
index e44455df5cec..f383d61c36ef 100644
--- a/lib/libc/gen/sysctl.3
+++ b/lib/libc/gen/sysctl.3
@@ -563,6 +563,7 @@ The fifth, sixth, and seventh level names are as follows:
.It Dv NET_RT_IFLIST Ta 0 or if_index Ta None
.It Dv NET_RT_IFMALIST Ta 0 or if_index Ta None
.It Dv NET_RT_IFLISTL Ta 0 or if_index Ta None
+.It Dv NET_RT_NHOPS Ta None Ta fib number
.El
.Pp
The
@@ -583,6 +584,9 @@ uses 'l' versions of the message header structures:
.Va struct if_msghdrl
and
.Va struct ifa_msghdrl .
+.Pp
+.Dv NET_RT_NHOPS
+returns all nexthops for the specified address family in the given fib.
.It Li PF_INET
Get or set various global information about the IPv4
(Internet Protocol version 4).
diff --git a/sys/conf/files b/sys/conf/files
index 6805a4ddeb0b..1f11498138ef 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -4091,6 +4091,11 @@ net/raw_cb.c standard
net/raw_usrreq.c standard
net/route.c standard
net/route_temporal.c standard
+net/route/nhop.c standard
+net/route/nhop_ctl.c standard
+net/route/nhop_utils.c standard
+net/route/route_ctl.c standard
+net/route/route_helpers.c standard
net/rss_config.c optional inet rss | inet6 rss
net/rtsock.c standard
net/slcompress.c optional netgraph_vjc | sppp | \
diff --git a/sys/net/radix_mpath.c b/sys/net/radix_mpath.c
index 8a341458cea2..698ddbb516e8 100644
--- a/sys/net/radix_mpath.c
+++ b/sys/net/radix_mpath.c
@@ -211,7 +211,7 @@ rt_mpath_conflict(struct rib_head *rnh, struct rtentry *rt,
return (0);
}
-static struct rtentry *
+struct rtentry *
rt_mpath_selectrte(struct rtentry *rte, uint32_t hash)
{
struct radix_node *rn0, *rn;
diff --git a/sys/net/radix_mpath.h b/sys/net/radix_mpath.h
index 8f73dd032c9c..e4f513847545 100644
--- a/sys/net/radix_mpath.h
+++ b/sys/net/radix_mpath.h
@@ -56,10 +56,27 @@ int rt_mpath_conflict(struct rib_head *, struct rtentry *,
struct sockaddr *);
void rtalloc_mpath_fib(struct route *, u_int32_t, u_int);
struct rtentry *rt_mpath_select(struct rtentry *, uint32_t);
+struct rtentry *rt_mpath_selectrte(struct rtentry *, uint32_t);
int rt_mpath_deldup(struct rtentry *, struct rtentry *);
int rn4_mpath_inithead(void **, int, u_int);
int rn6_mpath_inithead(void **, int, u_int);
+/*
+ * Returns the rtentry chained after @rt through rn_dupedkey when both
+ * radix nodes reference the same rn_mask, or NULL otherwise.
+ */
+static inline struct rtentry *
+rt_mpath_next(struct rtentry *rt)
+{
+	struct radix_node *cur, *dup;
+
+	cur = (struct radix_node *)rt;
+	dup = cur->rn_dupedkey;
+
+	/* No duplicated key at all, or one that belongs to another mask. */
+	if (dup == NULL || dup->rn_mask != cur->rn_mask)
+		return (NULL);
+
+	return ((struct rtentry *)dup);
+}
+
#endif
#endif /* _NET_RADIX_MPATH_H_ */
diff --git a/sys/net/route.c b/sys/net/route.c
index 402373277ef4..9ffaf9570a26 100644
--- a/sys/net/route.c
+++ b/sys/net/route.c
@@ -62,6 +62,8 @@
#include <net/if_dl.h>
#include <net/route.h>
#include <net/route_var.h>
+#include <net/route/nhop.h>
+#include <net/route/shared.h>
#include <net/vnet.h>
#ifdef RADIX_MPATH
@@ -108,10 +110,7 @@ VNET_DEFINE(u_int, rt_add_addr_allfibs) = 1;
SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET,
&VNET_NAME(rt_add_addr_allfibs), 0, "");
-VNET_PCPUSTAT_DEFINE_STATIC(struct rtstat, rtstat);
-#define RTSTAT_ADD(name, val) \
- VNET_PCPUSTAT_ADD(struct rtstat, rtstat, name, (val))
-#define RTSTAT_INC(name) RTSTAT_ADD(name, 1)
+VNET_PCPUSTAT_DEFINE(struct rtstat, rtstat);
VNET_PCPUSTAT_SYSINIT(rtstat);
#ifdef VIMAGE
@@ -240,6 +239,7 @@ route_init(void)
rt_numfibs = RT_MAXFIBS;
if (rt_numfibs == 0)
rt_numfibs = 1;
+ nhops_init();
}
SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, NULL);
@@ -377,6 +377,8 @@ rt_table_init(int offset, int family, u_int fibnum)
/* Init locks */
RIB_LOCK_INIT(rh);
+ nhops_init_rib(rh);
+
/* Finally, set base callbacks */
rh->rnh_addaddr = rn_addroute;
rh->rnh_deladdr = rn_delete;
@@ -408,6 +410,8 @@ rt_table_destroy(struct rib_head *rh)
rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head);
+ nhops_destroy_rib(rh);
+
/* Assume table is already empty */
RIB_LOCK_DESTROY(rh);
free(rh, M_RTABLE);
@@ -586,6 +590,9 @@ rtfree(struct rtentry *rt)
*/
R_Free(rt_key(rt));
+ /* Unreference nexthop */
+ nhop_free(rt->rt_nhop);
+
/*
* and the rtentry itself of course
*/
@@ -1400,6 +1407,7 @@ rt_updatemtu(struct ifnet *ifp)
RIB_WLOCK(rnh);
rnh->rnh_walktree(&rnh->head, if_updatemtu_cb, &ifmtu);
RIB_WUNLOCK(rnh);
+ nhops_update_ifmtu(rnh, ifp, ifmtu.mtu);
}
}
}
@@ -1544,6 +1552,7 @@ int
rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
u_int fibnum)
{
+ struct epoch_tracker et;
const struct sockaddr *dst;
struct rib_head *rnh;
int error;
@@ -1592,9 +1601,11 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
error = add_route(rnh, info, ret_nrt);
break;
case RTM_CHANGE:
+ NET_EPOCH_ENTER(et);
RIB_WLOCK(rnh);
error = change_route(rnh, info, ret_nrt);
RIB_WUNLOCK(rnh);
+ NET_EPOCH_EXIT(et);
break;
default:
error = EOPNOTSUPP;
@@ -1609,9 +1620,11 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info,
{
struct sockaddr *dst, *ndst, *gateway, *netmask;
struct rtentry *rt, *rt_old;
+ struct nhop_object *nh;
struct radix_node *rn;
struct ifaddr *ifa;
int error, flags;
+ struct epoch_tracker et;
dst = info->rti_info[RTAX_DST];
gateway = info->rti_info[RTAX_GATEWAY];
@@ -1631,18 +1644,30 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info,
} else {
ifa_ref(info->rti_ifa);
}
+
+ NET_EPOCH_ENTER(et);
+ error = nhop_create_from_info(rnh, info, &nh);
+ NET_EPOCH_EXIT(et);
+ if (error != 0) {
+ ifa_free(info->rti_ifa);
+ return (error);
+ }
+
rt = uma_zalloc(V_rtzone, M_NOWAIT);
if (rt == NULL) {
ifa_free(info->rti_ifa);
+ nhop_free(nh);
return (ENOBUFS);
}
rt->rt_flags = RTF_UP | flags;
rt->rt_fibnum = rnh->rib_fibnum;
+ rt->rt_nhop = nh;
/*
* Add the gateway. Possibly re-malloc-ing the storage for it.
*/
if ((error = rt_setgate(rt, dst, gateway)) != 0) {
ifa_free(info->rti_ifa);
+ nhop_free(nh);
uma_zfree(V_rtzone, rt);
return (error);
}
@@ -1682,6 +1707,7 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info,
ifa_free(rt->rt_ifa);
R_Free(rt_key(rt));
+ nhop_free(nh);
uma_zfree(V_rtzone, rt);
return (EEXIST);
}
@@ -1723,6 +1749,7 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info,
if (rn == NULL) {
ifa_free(rt->rt_ifa);
R_Free(rt_key(rt));
+ nhop_free(nh);
uma_zfree(V_rtzone, rt);
return (EEXIST);
}
@@ -1802,6 +1829,7 @@ change_route(struct rib_head *rnh, struct rt_addrinfo *info,
int error = 0;
int free_ifa = 0;
int family, mtu;
+ struct nhop_object *nh;
struct if_mtuinfo ifmtu;
RIB_WLOCK_ASSERT(rnh);
@@ -1824,6 +1852,7 @@ change_route(struct rib_head *rnh, struct rt_addrinfo *info,
}
#endif
+ nh = NULL;
RT_LOCK(rt);
rt_setmetrics(info, rt);
@@ -1852,6 +1881,10 @@ change_route(struct rib_head *rnh, struct rt_addrinfo *info,
goto bad;
}
+ error = nhop_create_from_nhop(rnh, rt->rt_nhop, info, &nh);
+ if (error != 0)
+ goto bad;
+
/* Check if outgoing interface has changed */
if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa &&
rt->rt_ifa != NULL) {
@@ -1897,6 +1930,11 @@ change_route(struct rib_head *rnh, struct rt_addrinfo *info,
}
}
+ /* Update nexthop */
+ nhop_free(rt->rt_nhop);
+ rt->rt_nhop = nh;
+ nh = NULL;
+
/*
* This route change may have modified the route's gateway. In that
* case, any inpcbs that have cached this route need to invalidate their
@@ -1910,6 +1948,8 @@ change_route(struct rib_head *rnh, struct rt_addrinfo *info,
}
bad:
RT_UNLOCK(rt);
+ if (nh != NULL)
+ nhop_free(nh);
if (free_ifa != 0) {
ifa_free(info->rti_ifa);
info->rti_ifa = NULL;
diff --git a/sys/net/route.h b/sys/net/route.h
index b5646246320c..4bdb6e84389e 100644
--- a/sys/net/route.h
+++ b/sys/net/route.h
@@ -90,7 +90,8 @@ struct rt_metrics {
u_long rmx_rttvar; /* estimated rtt variance */
u_long rmx_pksent; /* packets sent using this route */
u_long rmx_weight; /* route weight */
- u_long rmx_filler[3]; /* will be used for T/TCP later */
+ u_long rmx_nhidx; /* route nexhop index */
+ u_long rmx_filler[2]; /* will be used for T/TCP later */
};
/*
@@ -150,6 +151,7 @@ struct rtentry {
struct sockaddr *rt_gateway; /* value */
struct ifnet *rt_ifp; /* the answer: interface to use */
struct ifaddr *rt_ifa; /* the answer: interface address to use */
+ struct nhop_object *rt_nhop; /* nexthop data */
int rt_flags; /* up/down?, host/net */
int rt_refcnt; /* # held references */
u_int rt_fibnum; /* which FIB */
@@ -215,9 +217,13 @@ struct rtentry {
#define NHF_HOST 0x0400 /* RTF_HOST */
/* Nexthop request flags */
+#define NHR_NONE 0x00 /* empty flags field */
#define NHR_IFAIF 0x01 /* Return ifa_ifp interface */
#define NHR_REF 0x02 /* For future use */
+/* uRPF */
+#define NHR_NODEFAULT 0x04 /* do not consider default route */
+
/* Control plane route request flags */
#define NHR_COPY 0x100 /* Copy rte data */
@@ -245,6 +251,8 @@ struct rtstat {
uint64_t rts_newgateway; /* routes modified by redirects */
uint64_t rts_unreach; /* lookups which failed */
uint64_t rts_wildcard; /* lookups satisfied by a wildcard */
+ uint64_t rts_nh_idx_alloc_failure; /* nexthop index alloc failure*/
+ uint64_t rts_nh_alloc_failure; /* nexthop allocation failure*/
};
/*
@@ -507,6 +515,8 @@ int rib_add_redirect(u_int fibnum, struct sockaddr *dst,
struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp,
int flags, int expire_sec);
+/* New API */
+void rib_walk(int af, u_int fibnum, rt_walktree_f_t *wa_f, void *arg);
#endif
#endif
diff --git a/sys/net/route/nhop.c b/sys/net/route/nhop.c
new file mode 100644
index 000000000000..d71ba79c1295
--- /dev/null
+++ b/sys/net/route/nhop.c
@@ -0,0 +1,388 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/kernel.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/route.h>
+#include <net/route_var.h>
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/shared.h>
+#include <net/vnet.h>
+
+/*
+ * This file contains data structures management logic for the nexthop ("nhop")
+ * route subsystem.
+ *
+ * Nexthops in the original sense are the objects containing all the necessary
+ * information to forward the packet to the selected destination.
+ * In particular, nexthop is defined by a combination of
+ * ifp, ifa, aifp, mtu, gw addr(if set), nh_type, nh_family, mask of rt_flags and
+ * NHF_DEFAULT
+ *
+ * All nexthops are stored in the resizable hash table.
+ * Additionally, each nexthop gets assigned its unique index (nexthop index)
+ * so userland programs can interact with the nexthops easier. Index allocation
+ * is backed by the bitmask array.
+ */
+
+static MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data");
+
+
+/* Hash management functions */
+
+/*
+ * Allocates the nexthop control structure for @rh together with its
+ * nexthop hash table and nexthop index bitmask, then attaches it to @rh.
+ * Every allocation uses M_WAITOK; the function always returns 0.
+ */
+int
+nhops_init_rib(struct rib_head *rh)
+{
+	struct nh_control *ctl;
+	uint32_t hash_buckets, idx_items;
+	void *hash_mem, *idx_mem;
+
+	ctl = malloc(sizeof(struct nh_control), M_NHOP, M_WAITOK | M_ZERO);
+
+	/*
+	 * Nexthop hash: start with 16 buckets, which will be enough for
+	 * most of the cases.
+	 */
+	hash_buckets = 16;
+	hash_mem = malloc(CHT_SLIST_GET_RESIZE_SIZE(hash_buckets), M_NHOP,
+	    M_WAITOK | M_ZERO);
+	CHT_SLIST_INIT(&ctl->nh_head, hash_mem, hash_buckets);
+
+	/* Nexthop index bitmask: 128 bytes worth of items. */
+	idx_items = 128 * 8;
+	idx_mem = malloc(bitmask_get_size(idx_items), M_NHOP,
+	    M_WAITOK | M_ZERO);
+	bitmask_init(&ctl->nh_idx_head, idx_mem, idx_items);
+
+	NHOPS_LOCK_INIT(ctl);
+
+	/* Cross-link the rib head and its nexthop control data. */
+	rh->nh_control = ctl;
+	ctl->ctl_rh = rh;
+
+	DPRINTF("NHOPS init for fib %u af %u: ctl %p rh %p", rh->rib_fibnum,
+	    rh->rib_family, ctl, rh);
+
+	return (0);
+}
+
+/*
+ * Releases everything owned by @ctl: its lock, the index bitmask storage,
+ * the hash table storage and finally the control structure itself.
+ */
+static void
+destroy_ctl(struct nh_control *ctl)
+{
+
+	NHOPS_LOCK_DESTROY(ctl);
+
+	/* Storage referenced from the control structure goes first. */
+	free(ctl->nh_idx_head.idx, M_NHOP);
+	free(ctl->nh_head.ptr, M_NHOP);
+
+	free(ctl, M_NHOP);
+}
+
+/*
+ * Epoch callback: invoked once it is safe to destroy the nh_control
+ * that embeds @ctx as its ctl_epoch_ctx member.
+ */
+static void
+destroy_ctl_epoch(epoch_context_t ctx)
+{
+
+	destroy_ctl(__containerof(ctx, struct nh_control, ctl_epoch_ctx));
+}
+
+void
+nhops_destroy_rib(struct rib_head *rh)
+{
+ struct nh_control *ctl;
+ struct nhop_priv *nh_priv;
+
+ ctl = rh->nh_control;
+
+ /*
+ * Tears down the nexthop control data attached to @rh: every
+ * nexthop still present in the hash gets its nh_linked reference
+ * released, then the control structure is handed to epoch_call()
+ * for deferred destruction.
+ *
+ * All routes should have been deleted in rt_table_destroy().
+ * However, TCP stack or other consumers may store referenced
+ * nexthop pointers. When these references go to zero,
+ * nhop_free() will try to unlink these records from the
+ * datastructures, most likely leading to panic.
+ *
+ * Avoid that by explicitly marking all of the remaining
+ * nexthops as unlinked by removing a reference from a special
+ * counter. Please see nhop_free() comments for more
+ * details.
+ */
+
+ NHOPS_WLOCK(ctl);
+ CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) {
+ DPRINTF("Marking nhop %u unlinked", nh_priv->nh_idx);
+ refcount_release(&nh_priv->nh_linked);
+ } CHT_SLIST_FOREACH_END;
+ NHOPS_WUNLOCK(ctl);
+
+ /*
+ * Postpone destruction till the end of current epoch
+ * so nhop_free() can safely use nh_control pointer.
+ * The actual release is done by destroy_ctl_epoch().
+ */
+ epoch_call(net_epoch_preempt, destroy_ctl_epoch,
+ &ctl->ctl_epoch_ctx);
+}
+
+/*
+ * Nexthop hash calculation:
+ *
+ * Nexthops distribution:
+ * 2 "mandatory" nexthops per interface ("interface route", "loopback").
+ * For direct peering: 1 nexthop for the peering router per ifp/af.
+ * For IX-like peering: tens to hundreds of neighbor nexthops per ifp/af.
+ * IGP control plane & broadcast segment: tens of nexthops per ifp/af.
+ *
+ * Each fib/af combination has its own hash table.
+ * With that in mind, hash nexthops by the combination of the interface
+ * and GW IP address.
+ *
+ * To optimize hash calculation, ignore higher bytes of ifindex, as they
+ * give very little entropy.
+ * Similarly, use lower 4 bytes of IPv6 address to distinguish between the
+ * neighbors.
+ */
+struct _hash_data {
+ uint16_t ifindex; /* lower 16 bits of nh_ifp->if_index */
+ uint8_t family; /* gw_sa.sa_family of the nexthop */
+ uint8_t nh_type; /* lower 8 bits of the nhop_priv nh_type */
+ uint32_t gw_addr; /* IPv4 GW address or last 4 bytes of the IPv6 one */
+};
+
+/*
+ * XOR flavour of the DJB hash: for each of the @len bytes at @h the
+ * running value is multiplied by 33 and XORed with the byte.
+ * Returns 0 when @len is not positive.
+ */
+static unsigned
+djb_hash(const unsigned char *h, const int len)
+{
+	const unsigned char *cur;
+	unsigned int acc;
+	int left;
+
+	acc = 0;
+	cur = h;
+	for (left = len; left > 0; left--) {
+		acc = (acc * 33) ^ *cur;
+		cur++;
+	}
+
+	return (acc);
+}
+
+/*
+ * Calculates the hash of the nexthop described by @priv.
+ * The key consists of the lower 16 bits of the interface index, the
+ * gateway address family, the lower 8 bits of the nexthop type and
+ * 4 bytes of the gateway address (the IPv4 address or the last 32 bits
+ * of the IPv6 address; left zeroed for any other family).
+ */
+static uint32_t
+hash_priv(const struct nhop_priv *priv)
+{
+	struct nhop_object *nh;
+	struct _hash_data key;
+
+	nh = priv->nh;
+
+	/* Zero the key first so every hashed byte has a defined value. */
+	memset(&key, 0, sizeof(key));
+	key.ifindex = nh->nh_ifp->if_index & 0xFFFF;
+	key.family = nh->gw_sa.sa_family;
+	key.nh_type = priv->nh_type & 0xFF;
+
+	switch (nh->gw_sa.sa_family) {
+	case AF_INET6:
+		memcpy(&key.gw_addr, &nh->gw6_sa.sin6_addr.s6_addr32[3], 4);
+		break;
+	case AF_INET:
+		memcpy(&key.gw_addr, &nh->gw4_sa.sin_addr, 4);
+		break;
+	default:
+		break;
+	}
+
+	return (uint32_t)(djb_hash((const unsigned char *)&key, sizeof(key)));
+}
+
+/*
+ * Resizes the nexthop hash and/or the nexthop index bitmask of @ctl.
+ *
+ * @new_nh_buckets: new number of hash buckets, 0 if no hash resize is needed.
+ * @new_idx_items: new number of index items, 0 if no bitmask resize is needed.
+ *
+ * New storage is allocated with M_NOWAIT before the lock is taken; a failed
+ * allocation just skips the corresponding resize. The function acquires
+ * NHOPS_WLOCK itself; the callers visible in this file invoke it after
+ * dropping that lock.
+ */
+static void
+consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items)
+{
+	void *nh_ptr, *nh_idx_ptr;
+	void *old_idx_ptr;
+	size_t alloc_size;
+
+	nh_ptr = NULL;
+	if (new_nh_buckets != 0) {
+		alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets);
+		nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
+	}
+
+	nh_idx_ptr = NULL;
+	if (new_idx_items != 0) {
+		alloc_size = bitmask_get_size(new_idx_items);
+		nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
+	}
+
+	if (nh_ptr == NULL && nh_idx_ptr == NULL) {
+		/* Either resize is not required or allocations have failed. */
+		return;
+	}
+
+	DPRINTF("going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]", nh_ptr,
+	    new_nh_buckets, nh_idx_ptr, new_idx_items);
+
+	old_idx_ptr = NULL;
+
+	NHOPS_WLOCK(ctl);
+	if (nh_ptr != NULL) {
+		/*
+		 * NOTE(review): nh_ptr is freed below, so the macro
+		 * presumably leaves the old hash storage in it - confirm
+		 * against CHT_SLIST_RESIZE in nhop_utils.h.
+		 */
+		CHT_SLIST_RESIZE(&ctl->nh_head, nhops, nh_ptr, new_nh_buckets);
+	}
+	if (nh_idx_ptr != NULL) {
+		if (bitmask_copy(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items) == 0) {
+			bitmask_swap(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items, &old_idx_ptr);
+		} else {
+			/*
+			 * Copy failed, so the new bitmask was never
+			 * installed. Schedule it for release instead of
+			 * leaking it.
+			 */
+			old_idx_ptr = nh_idx_ptr;
+		}
+	}
+	NHOPS_WUNLOCK(ctl);
+
+	/* Release the no longer used storage outside of the lock. */
+	if (nh_ptr != NULL)
+		free(nh_ptr, M_NHOP);
+	if (old_idx_ptr != NULL)
+		free(old_idx_ptr, M_NHOP);
+}
+
+/*
+ * Links nexthop @nh_priv to the nexthop hash table of @ctl and allocates
+ * a nexthop index for it.
+ * Returns allocated index or 0 on failure.
+ */
+int
+link_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv)
+{
+	uint32_t resize_buckets, resize_items;
+	uint16_t new_idx;
+	int failed;
+
+	KASSERT((nh_priv->nh_idx == 0), ("nhop index is already allocated"));
+
+	NHOPS_WLOCK(ctl);
+
+	/*
+	 * Find out whether the hash or the index has to be resized.
+	 * Each of the values below is either the new size or 0 if
+	 * resize is not required.
+	 */
+	resize_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nh_head);
+	resize_items = bitmask_get_resize_items(&ctl->nh_idx_head);
+
+	failed = (bitmask_alloc_idx(&ctl->nh_idx_head, &new_idx) != 0);
+	if (!failed) {
+		nh_priv->nh_idx = new_idx;
+		nh_priv->nh_control = ctl;
+
+		CHT_SLIST_INSERT_HEAD(&ctl->nh_head, nhops, nh_priv);
+	}
+
+	NHOPS_WUNLOCK(ctl);
+
+	if (failed) {
+		DPRINTF("Unable to allocate nhop index");
+		RTSTAT_INC(rts_nh_idx_alloc_failure);
+		/* The resize is attempted even though linking has failed. */
+		consider_resize(ctl, resize_buckets, resize_items);
+		return (0);
+	}
+
+	DPRINTF("Linked nhop priv %p to %d, hash %u, ctl %p", nh_priv, new_idx,
+	    hash_priv(nh_priv), ctl);
+	consider_resize(ctl, resize_buckets, resize_items);
+
+	return (new_idx);
+}
+
+/*
+ * Unlinks nexthop specified by @nh_priv_del data from the hash of @ctl
+ * and releases the nexthop index of the unlinked record.
+ *
+ * Returns found nexthop or NULL.
+ */
+struct nhop_priv *
+unlink_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv_del)
+{
+	struct nhop_priv *removed;
+	uint32_t resize_buckets, resize_items;
+	int old_idx = 0;
+
+	NHOPS_WLOCK(ctl);
+
+	CHT_SLIST_REMOVE_BYOBJ(&ctl->nh_head, nhops, nh_priv_del, removed);
+	if (removed != NULL) {
+		/* Reset the index of the record before returning it to the bitmask. */
+		old_idx = removed->nh_idx;
+		removed->nh_idx = 0;
+
+		KASSERT((old_idx != 0), ("bogus nhop index 0"));
+		if ((bitmask_free_idx(&ctl->nh_idx_head, old_idx)) != 0) {
+			DPRINTF("Unable to remove index %d from fib %u af %d",
+			    old_idx, ctl->ctl_rh->rib_fibnum,
+			    ctl->ctl_rh->rib_family);
+		}
+	}
+
+	/* Check if hash or index needs to be resized */
+	resize_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nh_head);
+	resize_items = bitmask_get_resize_items(&ctl->nh_idx_head);
+
+	NHOPS_WUNLOCK(ctl);
+
+	if (removed != NULL) {
+		DPRINTF("Unlinked nhop %p priv idx %d", removed, old_idx);
+	} else {
+		DPRINTF("Unable to unlink nhop priv %p from hash, hash %u ctl %p",
+		    nh_priv_del, hash_priv(nh_priv_del), ctl);
+	}
+
+	consider_resize(ctl, resize_buckets, resize_items);
+
+	return (removed);
+}
+
+/*
+ * Searches the hash of @ctl for the nexthop matching the data specified
+ * in @nh_priv.
+ * Returns referenced nexthop or NULL.
+ */
+struct nhop_priv *
+find_nhop(struct nh_control *ctl, const struct nhop_priv *nh_priv)
+{
+	struct nhop_priv *match;
+
+	NHOPS_RLOCK(ctl);
+	CHT_SLIST_FIND_BYOBJ(&ctl->nh_head, nhops, nh_priv, match);
+	/* refcount was 0 -> nhop is being deleted, report it as not found */
+	if (match != NULL &&
+	    refcount_acquire_if_not_zero(&match->nh_refcnt) == 0)
+		match = NULL;
+	NHOPS_RUNLOCK(ctl);
+
+	return (match);
+}
+
diff --git a/sys/net/route/nhop.h b/sys/net/route/nhop.h
new file mode 100644
index 000000000000..c747a6399c2c
--- /dev/null
+++ b/sys/net/route/nhop.h
@@ -0,0 +1,229 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This header file contains public definitions for the nexthop routing subsystem.
+ */
+
+#ifndef _NET_ROUTE_NHOP_H_
+#define _NET_ROUTE_NHOP_H_
+
+#include <netinet/in.h> /* sockaddr_in && sockaddr_in6 */
+
+#include <sys/counter.h>
+
+enum nhop_type {
+ NH_TYPE_IPV4_ETHER_RSLV = 1, /* IPv4 ethernet without GW */
+ NH_TYPE_IPV4_ETHER_NHOP = 2, /* IPv4 with pre-calculated ethernet encap */
+ NH_TYPE_IPV6_ETHER_RSLV = 3, /* IPv6 ethernet, without GW */
+ NH_TYPE_IPV6_ETHER_NHOP = 4 /* IPv6 with pre-calculated ethernet encap*/
+};
+
+#ifdef _KERNEL
+
+/*
+ * Define shorter version of AF_LINK sockaddr.
+ *
+ * Currently the only use case of AF_LINK gateway is storing
+ * interface index of the interface of the source IPv6 address.
+ * This is used by the IPv6 code for the connections over loopback
+ * interface.
+ *
+ * The structure below copies 'struct sockaddr_dl', reducing the
+ * size of sdl_data buffer, as it is not used. This change
+ * allows to store the AF_LINK gateways in the nhop gateway itself,
+ * simplifying control plane handling.
+ */
+struct sockaddr_dl_short {
+ u_char sdl_len; /* Total length of sockaddr */
+ u_char sdl_family; /* AF_LINK */
+ u_short sdl_index; /* if != 0, system given index for interface */
+ u_char sdl_type; /* interface type */
+ u_char sdl_nlen; /* interface name length, no trailing 0 reqd. */
+ u_char sdl_alen; /* link level address length */
+ u_char sdl_slen; /* link layer selector length */
+ char sdl_data[8]; /* unused */
+};
+
+#define NHOP_RELATED_FLAGS \
+ (RTF_GATEWAY | RTF_HOST | RTF_REJECT | RTF_BLACKHOLE | \
+ RTF_FIXEDMTU | RTF_LOCAL | RTF_BROADCAST | RTF_MULTICAST)
+
+struct nh_control;
+struct nhop_priv;
+
+/*
+ * Struct 'nhop_object' field description:
+ *
+ * nh_flags: NHF_ flags used in the dataplane code. NHF_GATEWAY or NHF_BLACKHOLE
+ * can be examples of such flags.
+ * nh_mtu: ready-to-use nexthop mtu. Already accounts for the link-level header,
+ * interface MTU and protocol-specific limitations.
+ * nh_prepend_len: link-level prepend length. Currently unused.
+ * nh_ifp: logical transmit interface. The one from which if_transmit() will be
+ * called. Guaranteed to be non-NULL.
+ * nh_aifp: ifnet of the source address. Same as nh_ifp except IPv6 loopback
+ * routes. See the example below.
+ * nh_ifa: interface address to use. Guaranteed to be non-NULL.
+ * nh_pksent: counter(9) reflecting the number of packets transmitted.
+ *
+ * gw_: storage suitable to hold AF_INET, AF_INET6 or AF_LINK gateway. More
+ * details are available in the examples below.
+ *
+ * Examples:
+ *
+ * Direct routes (routes w/o gateway):
+ * NHF_GATEWAY is NOT set.
+ * nh_ifp denotes the logical transmit interface.
+ * nh_aifp is the same as nh_ifp
+ * gw_sa contains AF_LINK sa with nh_aifp ifindex (compat)
+ * Loopback routes:
+ * NHF_GATEWAY is NOT set.
+ * nh_ifp points to the loopback interface (lo0).
+ * nh_aifp points to the interface where the destination address belongs to.
+ * This is useful in IPv6 link-local-over-loopback communications.
+ * gw_sa contains AF_LINK sa with nh_aifp ifindex (compat)
+ * GW routes:
+ * NHF_GATEWAY is set.
+ * nh_ifp denotes the logical transmit interface.
+ * nh_aifp is the same as nh_ifp
+ * gw_sa contains L3 address (either AF_INET or AF_INET6).
+ *
+ *
+ * Note: struct nhop_object fields are ordered in a way that
+ * supports memcmp-based comparisons.
+ *
+ */
+#define NHOP_END_CMP (__offsetof(struct nhop_object, nh_pksent))
+
+struct nhop_object {
+ uint16_t nh_flags; /* nhop flags */
+ uint16_t nh_mtu; /* nexthop mtu */
+ union {
+ struct sockaddr_in gw4_sa; /* GW accessor as IPv4 */
+ struct sockaddr_in6 gw6_sa; /* GW accessor as IPv6 */
+ struct sockaddr gw_sa; /* GW accessor as generic sockaddr */
+ struct sockaddr_dl_short gwl_sa; /* AF_LINK gw (compat) */
+ char gw_buf[28]; /* raw byte view of the GW storage */
+ };
+ struct ifnet *nh_ifp; /* Logical egress interface. Always != NULL */
+ struct ifaddr *nh_ifa; /* interface address to use. Always != NULL */
+ struct ifnet *nh_aifp; /* ifnet of the source address. Always != NULL */
+ counter_u64_t nh_pksent; /* packets sent using this nhop */
+ /* 32 bytes + 4xPTR == 64(amd64) / 48(i386) */
+ uint8_t nh_prepend_len; /* length of prepend data */
+ uint8_t spare[3];
+ uint32_t spare1; /* alignment */
+ char nh_prepend[48]; /* L2 prepend */
+ struct nhop_priv *nh_priv; /* control plane data */
+ /* -- 128 bytes -- */
+};
+
+/*
+ * Nhop validness.
+ *
+ * Currently we verify whether link is up or not on every packet, which can be
+ * quite costly.
+ * TODO: subscribe for the interface notifications and update the nexthops
+ * with NHF_INVALID flag.
+ */
+
+#define NH_IS_VALID(_nh) RT_LINK_IS_UP((_nh)->nh_ifp)
+#define NH_IS_MULTIPATH(_nh) ((_nh)->nh_flags & NHF_MULTIPATH)
+
+#define RT_GATEWAY(_rt) ((struct sockaddr *)&(_rt)->rt_nhop->gw4_sa)
+#define RT_GATEWAY_CONST(_rt) ((const struct sockaddr *)&(_rt)->rt_nhop->gw4_sa)
+
+#define NH_FREE(_nh) do { \
+ nhop_free(_nh); \
+ /* guard against invalid refs */ \
+ _nh = NULL; \
+} while (0)
+
+
+void nhop_free(struct nhop_object *nh);
+
+struct sysctl_req;
+struct sockaddr_dl;
+struct rib_head;
+
+uint32_t nhop_get_idx(const struct nhop_object *nh);
+enum nhop_type nhop_get_type(const struct nhop_object *nh);
+int nhop_get_rtflags(const struct nhop_object *nh);
+
+int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
+
+#endif /* _KERNEL */
+
+/* Kernel <> userland structures */
+
+/* Structure usage and layout are described in dump_nhop_entry() */
+struct nhop_external {
+ uint32_t nh_len; /* length of the datastructure */
+ uint32_t nh_idx; /* Nexthop index */
+ uint32_t nh_fib; /* Fib nexthop is attached to */
+ uint32_t ifindex; /* transmit interface ifindex */
+ uint32_t aifindex; /* address ifindex */
+ uint8_t prepend_len; /* length of the prepend */
+ uint8_t nh_family; /* address family */
+ uint16_t nh_type; /* nexthop type */
+ uint16_t nh_mtu; /* nexthop mtu */
+
+ uint16_t nh_flags; /* nhop flags */
+ struct in_addr nh_addr; /* GW/DST IPv4 address */
+ struct in_addr nh_src; /* default source IPv4 address */
+ uint64_t nh_pksent; /* presumably packets sent using this nhop - confirm in dump_nhop_entry() */
+ /* control plane */
+ /* lookup key: address, family, type */
+ char nh_prepend[64]; /* L2 prepend */
+ uint64_t nh_refcount; /* number of references */
+};
+
+struct nhop_addrs {
+ uint32_t na_len; /* length of the datastructure */
+ uint16_t gw_sa_off; /* offset of gateway SA */
+ uint16_t src_sa_off; /* offset of src address SA */
+};
+
+struct mpath_nhop_external {
+ uint32_t nh_idx;
+ uint32_t nh_weight;
+};
+
+struct mpath_external {
+ uint32_t mp_idx;
+ uint32_t mp_refcount;
+ uint32_t mp_nh_count;
+ uint32_t mp_group_size;
+};
+
+
+#endif
+
+
diff --git a/sys/net/route/nhop_ctl.c b/sys/net/route/nhop_ctl.c
new file mode 100644
index 000000000000..cb1617e1cc3a
--- /dev/null
+++ b/sys/net/route/nhop_ctl.c
@@ -0,0 +1,827 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+#include <sys/epoch.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route_var.h>
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/shared.h>
+#include <net/vnet.h>
+
+/*
+ * This file contains core functionality for the nexthop ("nhop") route subsystem.
+ * The business logic needed to create nexthop objects is implemented here.
+ *
+ * Nexthops in the original sense are the objects containing all the necessary
+ * information to forward the packet to the selected destination.
+ * In particular, a nexthop is defined by a combination of
+ * ifp, ifa, aifp, mtu, gw addr (if set), nh_type, nh_family, mask of rt_flags and
+ * NHF_DEFAULT.
+ *
+ * Additionally, each nexthop gets assigned its unique index (nexthop index).
+ * It serves two purposes: the first one is to make it easy for userland programs
+ * to reference nexthops by their index. The second one is to allow lookup
+ * algorithms to store an index instead of a pointer (2 bytes vs 8) as a lookup
+ * result.
+ * All nexthops are stored in the resizable hash table.
+ *
+ * Basically, this file revolves around supporting the following functions:
+ * 1) nhop_create_from_info / nhop_create_from_nhop, which contain all
+ * business logic on filling the nexthop fields based on the provided request.
+ * 2) get_nhop(), which gets a usable referenced nexthop.
+ *
+ * Conventions:
+ * 1) non-exported functions start with a verb
+ * 2) exported functions start with the subsystem prefix: "nhop"
+ */
+
+static int dump_nhop_entry(struct rib_head *rh, struct nhop_object *nh, struct sysctl_req *w);
+
+static struct nhop_priv *alloc_nhop_structure(void);
+static int get_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct nhop_priv **pnh_priv);
+static int finalize_nhop(struct nh_control *ctl, struct rt_addrinfo *info,
+ struct nhop_priv *nh_priv);
+static struct ifnet *get_aifp(const struct nhop_object *nh, int reference);
+static void fill_sdl_from_ifp(struct sockaddr_dl_short *sdl, const struct ifnet *ifp);
+
+static void destroy_nhop_epoch(epoch_context_t ctx);
+static void destroy_nhop(struct nhop_priv *nh_priv);
+
+static void print_nhop(const char *prefix, const struct nhop_object *nh);
+
+_Static_assert(__offsetof(struct nhop_object, nh_ifp) == 32,
+ "nhop_object: wrong nh_ifp offset");
+_Static_assert(sizeof(struct nhop_object) <= 128,
+ "nhop_object: size exceeds 128 bytes");
+
+static uma_zone_t nhops_zone; /* Global zone for each and every nexthop */
+
+
+/*
+ * Sizes of the public (nhop_object) and private (nhop_priv) parts of a
+ * nexthop, each rounded up to a multiple of two cache lines. Both parts
+ * live in a single zone item: the private part starts
+ * NHOP_OBJECT_ALIGNED_SIZE bytes after the public one.
+ */
+#define NHOP_OBJECT_ALIGNED_SIZE roundup2(sizeof(struct nhop_object), \
+ 2 * CACHE_LINE_SIZE)
+#define NHOP_PRIV_ALIGNED_SIZE roundup2(sizeof(struct nhop_priv), \
+ 2 * CACHE_LINE_SIZE)
+/*
+ * Creates the global UMA zone all nexthops are allocated from.
+ * Each zone item holds the public and the private nexthop parts back to back.
+ */
+void
+nhops_init(void)
+{
+	const size_t nhop_item_size =
+	    NHOP_OBJECT_ALIGNED_SIZE + NHOP_PRIV_ALIGNED_SIZE;
+
+	nhops_zone = uma_zcreate("routing nhops", nhop_item_size,
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+
+/*
+ * Returns the interface owning the source address used by @nh.
+ *
+ * Normally this is simply the transmit interface. The exception is a
+ * nexthop pointing to a loopback interface with an AF_LINK gateway
+ * (interface address route): the packet is looped back to ourselves, yet
+ * we would rather know the interface the address belongs to. That
+ * interface is looked up by the index stored in the gateway sockaddr,
+ * which is needed to support link-local IPv6 loopback communications.
+ * If the lookup fails, the transmit interface is used.
+ *
+ * If @reference is non-zero, the returned ifp is referenced.
+ */
+static struct ifnet *
+get_aifp(const struct nhop_object *nh, int reference)
+{
+	struct ifnet *found;
+
+	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0 &&
+	    nh->gw_sa.sa_family == AF_LINK) {
+		found = (reference != 0) ?
+		    ifnet_byindex_ref(nh->gwl_sa.sdl_index) :
+		    ifnet_byindex(nh->gwl_sa.sdl_index);
+		if (found != NULL)
+			return (found);
+		DPRINTF("unable to get aifp for %s index %d",
+		    if_name(nh->nh_ifp), nh->gwl_sa.sdl_index);
+	}
+
+	/* Regular case or failed lookup: use the transmit interface */
+	found = nh->nh_ifp;
+	if (reference != 0)
+		if_ref(found);
+
+	return (found);
+}
+
+/*
+ * Compares two nexthops.
+ * They are considered equal when the first NHOP_END_CMP bytes of their
+ * public parts match and both nh_type and nh_family are the same.
+ *
+ * Returns 1 if equal, 0 otherwise.
+ */
+int
+cmp_priv(const struct nhop_priv *_one, const struct nhop_priv *_two)
+{
+
+	return (memcmp(_one->nh, _two->nh, NHOP_END_CMP) == 0 &&
+	    _one->nh_type == _two->nh_type &&
+	    _one->nh_family == _two->nh_family);
+}
+
+/*
+ * Applies the MTU from @info to @nh, but only when the request carries
+ * one (RTV_MTU is set in rti_mflags).
+ *
+ * A non-zero MTU is treated as explicitly provided by the user and marks
+ * the nexthop with RTF_FIXEDMTU. A zero MTU is treated as a rollback to
+ * the default and clears RTF_FIXEDMTU.
+ */
+static void
+set_nhop_mtu_from_info(struct nhop_object *nh, const struct rt_addrinfo *info)
+{
+
+	if ((info->rti_mflags & RTV_MTU) == 0)
+		return;
+
+	if (info->rti_rmx->rmx_mtu == 0)
+		nh->nh_priv->rt_flags &= ~RTF_FIXEDMTU;
+	else
+		nh->nh_priv->rt_flags |= RTF_FIXEDMTU;
+
+	nh->nh_mtu = info->rti_rmx->rmx_mtu;
+}
+
+/*
+ * Fills in the shortened link-level sockaddr version suitable to be stored
+ * inside the nexthop gateway buffer. Only family, length, interface index
+ * and interface type are written.
+ */
+static void
+fill_sdl_from_ifp(struct sockaddr_dl_short *sdl, const struct ifnet *ifp)
+{
+
+	sdl->sdl_len = sizeof(*sdl);
+	sdl->sdl_family = AF_LINK;
+	sdl->sdl_type = ifp->if_type;
+	sdl->sdl_index = ifp->if_index;
+}
+
+/*
+ * Stores the gateway described by @info in @nh.
+ *
+ * For gateway routes (RTF_GATEWAY) the gateway sockaddr is copied as is,
+ * provided it is not larger than struct sockaddr_in6.
+ * For interface routes a short AF_LINK sockaddr describing nh->nh_ifp is
+ * written instead.
+ *
+ * Returns 0 on success or ENOMEM if the gateway sockaddr is too large.
+ */
+static int
+set_nhop_gw_from_info(struct nhop_object *nh, struct rt_addrinfo *info)
+{
+	struct sockaddr *gw;
+
+	if ((info->rti_flags & RTF_GATEWAY) == 0) {
+		/*
+		 * Interface route. Currently the route.c code adds
+		 * sa of type AF_LINK, which is 56 bytes long. The only
+		 * meaningful data there is the interface index. It is
+		 * used in the IPv6 loopback output, where we need to
+		 * preserve the original interface to maintain proper scoping.
+		 * Despite the fact that nexthop code stores original interface
+		 * in the separate field (nh_aifp), write AF_LINK
+		 * compatible sa with shorter total length.
+		 */
+		fill_sdl_from_ifp(&nh->gwl_sa, nh->nh_ifp);
+		return (0);
+	}
+
+	gw = info->rti_info[RTAX_GATEWAY];
+	if (gw->sa_len > sizeof(struct sockaddr_in6)) {
+		DPRINTF("nhop SA size too big: AF %d len %u",
+		    gw->sa_family, gw->sa_len);
+		return (ENOMEM);
+	}
+	memcpy(&nh->gw_sa, gw, gw->sa_len);
+
+	return (0);
+}
+
+/*
+ * Fills the freshly allocated nexthop @nh_priv with the data derived
+ * from the route request @info: route flags (masked by
+ * NHOP_RT_FLAG_MASK), address family of the destination, datapath flags,
+ * MTU, transmit interface, interface address and gateway.
+ *
+ * Returns 0 on success or the error from set_nhop_gw_from_info().
+ * The nexthop is not freed here on failure.
+ */
+static int
+fill_nhop_from_info(struct nhop_priv *nh_priv, struct rt_addrinfo *info)
+{
+ int error, rt_flags;
+ struct nhop_object *nh;
+
+ nh = nh_priv->nh;
+
+ rt_flags = info->rti_flags & NHOP_RT_FLAG_MASK;
+
+ /* rt_flags must be stored before the MTU helper toggles RTF_FIXEDMTU */
+ nh->nh_priv->rt_flags = rt_flags;
+ nh_priv->nh_family = info->rti_info[RTAX_DST]->sa_family;
+ nh_priv->nh_type = 0; // hook responsibility to set nhop type
+
+ nh->nh_flags = fib_rte_to_nh_flags(rt_flags);
+ set_nhop_mtu_from_info(nh, info);
+ nh->nh_ifp = info->rti_ifa->ifa_ifp;
+ nh->nh_ifa = info->rti_ifa;
+ /*
+ * Unreferenced aifp. NOTE(review): this runs before the gateway is
+ * stored below; finalize_nhop() recalculates nh_aifp later.
+ */
+ nh->nh_aifp = get_aifp(nh, 0);
+
+ if ((error = set_nhop_gw_from_info(nh, info)) != 0)
+ return (error);
+
+ /*
+ * Note some of the remaining data is set by the
+ * per-address-family pre-add hook.
+ */
+
+ return (0);
+}
+
+/*
+ * Creates a new nexthop based on the information in @info.
+ *
+ * Has to be called within the network epoch.
+ *
+ * Returns:
+ * 0 on success, filling @nh_ret with the desired nexthop object ptr
+ * ENOMEM if the nexthop structure cannot be allocated
+ * errno otherwise. @nh_ret is left untouched on failure.
+ */
+int
+nhop_create_from_info(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct nhop_object **nh_ret)
+{
+	struct nhop_priv *nh_priv;
+	int error;
+
+	NET_EPOCH_ASSERT();
+
+	/* The allocation is M_NOWAIT and is allowed to fail */
+	nh_priv = alloc_nhop_structure();
+	if (nh_priv == NULL)
+		return (ENOMEM);
+
+	error = fill_nhop_from_info(nh_priv, info);
+	if (error != 0) {
+		uma_zfree(nhops_zone, nh_priv->nh);
+		return (error);
+	}
+
+	/* get_nhop() frees the provided nexthop on failure */
+	error = get_nhop(rnh, info, &nh_priv);
+	if (error == 0)
+		*nh_ret = nh_priv->nh;
+
+	return (error);
+}
+
+/*
+ * Gets linked nhop using the provided @pnh_priv nexthop data.
+ * If linked nhop is found, returns it, freeing the provided one.
+ * If there is no such nexthop, attaches the remaining data to the
+ * provided nexthop and links it.
+ *
+ * The per-family pre-add hook (rnh_preadd) is called first to let the
+ * protocol augment the nexthop.
+ *
+ * Returns 0 on success, storing referenced nexthop in @pnh_priv.
+ * Otherwise, errno is returned and the provided nexthop is freed.
+ */
+static int
+get_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct nhop_priv **pnh_priv)
+{
+	const struct sockaddr *dst, *netmask;
+	struct nhop_priv *nh_priv, *tmp_priv;
+	int error;
+
+	nh_priv = *pnh_priv;
+
+	/* Give the protocols chance to augment the request data */
+	dst = info->rti_info[RTAX_DST];
+	netmask = info->rti_info[RTAX_NETMASK];
+
+	error = rnh->rnh_preadd(rnh->rib_fibnum, dst, netmask, nh_priv->nh);
+	if (error != 0) {
+		uma_zfree(nhops_zone, nh_priv->nh);
+		return (error);
+	}
+
+	tmp_priv = find_nhop(rnh->nh_control, nh_priv);
+	if (tmp_priv != NULL) {
+		/* Equal nexthop already exists, drop the temporary one */
+		uma_zfree(nhops_zone, nh_priv->nh);
+		*pnh_priv = tmp_priv;
+		return (0);
+	}
+
+	/*
+	 * Existing nexthop not found, need to create new one.
+	 * Note: multiple simultaneous get_nhop() requests
+	 * can result in multiple equal nexthops existing in the
+	 * nexthop table. This is not a problem until the
+	 * relative number of such nexthops is significant, which
+	 * is extremely unlikely.
+	 */
+
+	/* finalize_nhop() frees @nh_priv itself in case of error */
+	return (finalize_nhop(rnh->nh_control, info, nh_priv));
+}
+
+/*
+ * Updates @nh with data supplied in @info.
+ * This is a helper function to support route changes.
+ *
+ * The changes that can be done to the route are limited to the following:
+ * 1) all combinations of gateway changes (gw, interface, blackhole/reject)
+ * 2) route flags (FLAG[123],STATIC,BLACKHOLE,REJECT)
+ * 3) route MTU
+ *
+ * Returns:
+ * 0 on success
+ * errno if the new gateway cannot be stored
+ */
+static int
+alter_nhop_from_info(struct nhop_object *nh, struct rt_addrinfo *info)
+{
+	struct nhop_priv *priv = nh->nh_priv;
+	int error;
+
+	/* Pick up the MTU if the request carries one */
+	set_nhop_mtu_from_info(nh, info);
+
+	/* XXX: allow only one of BLACKHOLE,REJECT,GATEWAY */
+
+	/* Flags covered by RTF_FMASK are taken from the request verbatim */
+	priv->rt_flags = (priv->rt_flags & ~RTF_FMASK) |
+	    (info->rti_flags & RTF_FMASK);
+
+	/* Gateway is replaced only if the request provides one */
+	if (info->rti_info[RTAX_GATEWAY] != NULL) {
+		error = set_nhop_gw_from_info(nh, info);
+		if (error != 0)
+			return (error);
+		/* RTF_GATEWAY follows the request as well */
+		priv->rt_flags = (priv->rt_flags & ~RTF_GATEWAY) |
+		    (info->rti_flags & RTF_GATEWAY);
+	}
+
+	/* Datapath flags are derived from the resulting route flags */
+	nh->nh_flags = fib_rte_to_nh_flags(priv->rt_flags);
+
+	if (info->rti_ifa != NULL)
+		nh->nh_ifa = info->rti_ifa;
+	if (info->rti_ifp != NULL)
+		nh->nh_ifp = info->rti_ifp;
+	nh->nh_aifp = get_aifp(nh, 0);
+
+	return (0);
+}
+
+/*
+ * Creates new nexthop based on @nh_orig and augmentation data from @info.
+ * Helper function used in the route changes, please see
+ * alter_nhop_from_info() comments for more details.
+ *
+ * Has to be called within the network epoch.
+ *
+ * Returns:
+ * 0 on success, filling @pnh with the desired nexthop object
+ * ENOMEM if the nexthop structure cannot be allocated
+ * errno otherwise. @pnh is left untouched on failure.
+ */
+int
+nhop_create_from_nhop(struct rib_head *rnh, const struct nhop_object *nh_orig,
+    struct rt_addrinfo *info, struct nhop_object **pnh)
+{
+	struct nhop_priv *nh_priv;
+	struct nhop_object *nh;
+	int error;
+
+	NET_EPOCH_ASSERT();
+
+	/* The allocation is M_NOWAIT and is allowed to fail */
+	nh_priv = alloc_nhop_structure();
+	if (nh_priv == NULL)
+		return (ENOMEM);
+	nh = nh_priv->nh;
+
+	/* Start with copying data from original nexthop */
+	nh_priv->nh_family = nh_orig->nh_priv->nh_family;
+	nh_priv->rt_flags = nh_orig->nh_priv->rt_flags;
+	nh_priv->nh_type = nh_orig->nh_priv->nh_type;
+
+	nh->nh_ifp = nh_orig->nh_ifp;
+	nh->nh_ifa = nh_orig->nh_ifa;
+	nh->nh_aifp = nh_orig->nh_aifp;
+	nh->nh_mtu = nh_orig->nh_mtu;
+	nh->nh_flags = nh_orig->nh_flags;
+	memcpy(&nh->gw_sa, &nh_orig->gw_sa, nh_orig->gw_sa.sa_len);
+
+	/* Apply the requested changes on top of the copy */
+	error = alter_nhop_from_info(nh, info);
+	if (error != 0) {
+		uma_zfree(nhops_zone, nh_priv->nh);
+		return (error);
+	}
+
+	/* get_nhop() frees the provided nexthop on failure */
+	error = get_nhop(rnh, info, &nh_priv);
+	if (error == 0)
+		*pnh = nh_priv->nh;
+
+	return (error);
+}
+
+/*
+ * Allocates memory for public/private nexthop structures.
+ *
+ * Both parts come from a single zeroed zone item: the public nhop_object
+ * is placed first and the nhop_priv follows it at the
+ * NHOP_OBJECT_ALIGNED_SIZE offset. The two parts are cross-linked
+ * before returning.
+ *
+ * Returns pointer to nhop_priv or NULL if the (non-sleeping) allocation
+ * fails.
+ */
+static struct nhop_priv *
+alloc_nhop_structure(void)
+{
+	struct nhop_object *nh;
+	struct nhop_priv *nh_priv;
+
+	nh = (struct nhop_object *)uma_zalloc(nhops_zone, M_NOWAIT | M_ZERO);
+	if (nh == NULL)
+		return (NULL);
+	nh_priv = (struct nhop_priv *)((char *)nh + NHOP_OBJECT_ALIGNED_SIZE);
+
+	nh->nh_priv = nh_priv;
+	nh_priv->nh = nh;
+
+	return (nh_priv);
+}
+
+/*
+ * Allocates/references the remaining bits of nexthop data and links
+ * it to the hash table.
+ *
+ * @info is not used by the code below.
+ *
+ * Returns 0 if successful,
+ * errno otherwise (ENOMEM if the packet counter cannot be allocated,
+ * ENOBUFS if linking fails). @nh_priv is freed in case of error.
+ */
+static int
+finalize_nhop(struct nh_control *ctl, struct rt_addrinfo *info,
+ struct nhop_priv *nh_priv)
+{
+ struct nhop_object *nh;
+
+ nh = nh_priv->nh;
+
+ /* Allocate per-cpu packet counter */
+ nh->nh_pksent = counter_u64_alloc(M_NOWAIT);
+ if (nh->nh_pksent == NULL) {
+ /* No references are held yet, so free the zone item directly */
+ uma_zfree(nhops_zone, nh);
+ RTSTAT_INC(rts_nh_alloc_failure);
+ DPRINTF("nh_alloc_finalize failed");
+ return (ENOMEM);
+ }
+
+ /* Reference external objects and calculate (referenced) aifp */
+ if_ref(nh->nh_ifp);
+ ifa_ref(nh->nh_ifa);
+ nh->nh_aifp = get_aifp(nh, 1);
+ DPRINTF("AIFP: %p nh_ifp %p", nh->nh_aifp, nh->nh_ifp);
+
+ /* The caller gets the initial reference */
+ refcount_init(&nh_priv->nh_refcnt, 1);
+
+ /* Please see nhop_free() comments on the initial value */
+ refcount_init(&nh_priv->nh_linked, 2);
+
+ print_nhop("FINALIZE", nh);
+
+ if (link_nhop(ctl, nh_priv) == 0) {
+
+ /*
+ * Adding nexthop to the datastructures
+ * failed. Call destructor w/o waiting for
+ * the epoch end, as nexthop is not used
+ * and return.
+ */
+ DPRINTF("link_nhop failed!");
+ destroy_nhop(nh_priv);
+
+ return (ENOBUFS);
+ }
+
+ return (0);
+}
+
+/*
+ * Writes a printable representation of @sa to @buf of size @buflen:
+ * the address for AF_INET/AF_INET6, "if#<index>" for AF_LINK and
+ * "af:<family>" for everything else.
+ */
+static void
+print_nhop_sa(char *buf, size_t buflen, const struct sockaddr *sa)
+{
+
+	switch (sa->sa_family) {
+	case AF_INET:
+		inet_ntop(AF_INET,
+		    &((const struct sockaddr_in *)sa)->sin_addr, buf, buflen);
+		break;
+	case AF_INET6:
+		inet_ntop(AF_INET6,
+		    &((const struct sockaddr_in6 *)sa)->sin6_addr, buf, buflen);
+		break;
+	case AF_LINK:
+		snprintf(buf, buflen, "if#%d",
+		    ((const struct sockaddr_dl *)sa)->sdl_index);
+		break;
+	default:
+		snprintf(buf, buflen, "af:%d", sa->sa_family);
+		break;
+	}
+}
+
+/*
+ * Debug helper: prints the key fields of @nh via DPRINTF, with @prefix
+ * identifying the call site.
+ */
+static void
+print_nhop(const char *prefix, const struct nhop_object *nh)
+{
+	char gw_str[INET6_ADDRSTRLEN];
+	char ifa_str[INET6_ADDRSTRLEN];
+
+	/* Textual forms of the gateway and of the interface address */
+	print_nhop_sa(gw_str, sizeof(gw_str), &nh->gw_sa);
+	print_nhop_sa(ifa_str, sizeof(ifa_str), nh->nh_ifa->ifa_addr);
+
+	DPRINTF("%s nhop priv %p: AF %d ifp %p %s addr %s src %p %s aifp %p %s mtu %d nh_flags %X",
+	    prefix, nh->nh_priv, nh->nh_priv->nh_family, nh->nh_ifp,
+	    if_name(nh->nh_ifp), gw_str, nh->nh_ifa, ifa_str, nh->nh_aifp,
+	    if_name(nh->nh_aifp), nh->nh_mtu, nh->nh_flags);
+}
+
+/*
+ * Releases everything the finalized nexthop holds (both interface
+ * references, the interface address reference and the packet counter)
+ * and returns the nexthop memory to the zone.
+ */
+static void
+destroy_nhop(struct nhop_priv *nh_priv)
+{
+	struct nhop_object *nh = nh_priv->nh;
+
+	print_nhop("DEL", nh);
+
+	/* Drop the references taken in finalize_nhop() */
+	if_rele(nh->nh_ifp);
+	if_rele(nh->nh_aifp);
+	ifa_free(nh->nh_ifa);
+	counter_u64_free(nh->nh_pksent);
+
+	/* The public part is the start of the zone item */
+	uma_zfree(nhops_zone, nh);
+}
+
+/*
+ * Epoch callback indicating nhop is safe to destroy.
+ * @ctx is the nh_epoch_ctx member embedded into the nexthop private part.
+ */
+static void
+destroy_nhop_epoch(epoch_context_t ctx)
+{
+
+	destroy_nhop(__containerof(ctx, struct nhop_priv, nh_epoch_ctx));
+}
+
+/*
+ * Tries to acquire an additional reference on @nh.
+ * The reference is taken only if the reference counter is not zero.
+ *
+ * Returns the result of refcount_acquire_if_not_zero().
+ */
+int
+nhop_ref_object(struct nhop_object *nh)
+{
+
+ return (refcount_acquire_if_not_zero(&nh->nh_priv->nh_refcnt));
+}
+
+/*
+ * Drops one reference on @nh.
+ * If it was the last one, the nexthop is unlinked from its nh_control
+ * (unless rib destruction has already done so) and its destruction is
+ * scheduled via an epoch callback.
+ */
+void
+nhop_free(struct nhop_object *nh)
+{
+ struct nh_control *ctl;
+ struct nhop_priv *nh_priv = nh->nh_priv;
+ struct epoch_tracker et;
+
+ /* Nothing more to do unless the last reference has been dropped */
+ if (!refcount_release(&nh_priv->nh_refcnt))
+ return;
+
+ /*
+ * There are only 2 places, where nh_linked can be decreased:
+ * rib destroy (nhops_destroy_rib) and this function.
+ * nh_linked can never be increased.
+ *
+ * Hence, use initial value of 2 to make use of
+ * refcount_release_if_not_last().
+ *
+ * There can be two scenarios when calling this function:
+ *
+ * 1) nh_linked value is 2. This means that either
+ * nhops_destroy_rib() has not been called OR it is running,
+ * but we are guaranteed that nh_control won't be freed in
+ * this epoch. Hence, nexthop can be safely unlinked.
+ *
+ * 2) nh_linked value is 1. In that case, nhops_destroy_rib()
+ * has been called and nhop unlink can be skipped.
+ */
+
+ NET_EPOCH_ENTER(et);
+ if (refcount_release_if_not_last(&nh_priv->nh_linked)) {
+ ctl = nh_priv->nh_control;
+ if (unlink_nhop(ctl, nh_priv) == NULL) {
+ /* Do not try to reclaim */
+ DPRINTF("Failed to unlink nexhop %p", nh_priv);
+ NET_EPOCH_EXIT(et);
+ return;
+ }
+ }
+ NET_EPOCH_EXIT(et);
+
+ /* Actual destruction happens in destroy_nhop_epoch() */
+ epoch_call(net_epoch_preempt, destroy_nhop_epoch,
+ &nh_priv->nh_epoch_ctx);
+}
+
+/*
+ * Tries to reference @nh. Currently a plain wrapper around
+ * nhop_ref_object().
+ */
+int
+nhop_ref_any(struct nhop_object *nh)
+{
+
+ return (nhop_ref_object(nh));
+}
+
+/*
+ * Releases a reference on @nh. Currently a plain wrapper around
+ * nhop_free().
+ */
+void
+nhop_free_any(struct nhop_object *nh)
+{
+
+ nhop_free(nh);
+}
+
+
+/* Helper functions */
+
+/* Returns the nexthop index stored in the private part of @nh. */
+uint32_t
+nhop_get_idx(const struct nhop_object *nh)
+{
+
+ return (nh->nh_priv->nh_idx);
+}
+
+/* Returns the nexthop type stored in the private part of @nh. */
+enum nhop_type
+nhop_get_type(const struct nhop_object *nh)
+{
+
+ return (nh->nh_priv->nh_type);
+}
+
+/*
+ * Sets the nexthop type of @nh to @nh_type.
+ * The type takes part in the nexthop comparison, see cmp_priv().
+ */
+void
+nhop_set_type(struct nhop_object *nh, enum nhop_type nh_type)
+{
+
+ nh->nh_priv->nh_type = nh_type;
+}
+
+/* Returns the route flags (rt_flags) stored in the private part of @nh. */
+int
+nhop_get_rtflags(const struct nhop_object *nh)
+{
+
+ return (nh->nh_priv->rt_flags);
+}
+
+/*
+ * Overwrites the route flags of @nh with @rt_flags.
+ * Only the private rt_flags are changed here; the datapath nh_flags are
+ * left as they are.
+ */
+void
+nhop_set_rtflags(struct nhop_object *nh, int rt_flags)
+{
+
+ nh->nh_priv->rt_flags = rt_flags;
+}
+
+/*
+ * Propagates the new interface MTU @mtu to all nexthops of @rh which
+ * transmit via @ifp.
+ *
+ * Nexthops without RTF_FIXEDMTU always get the new value. Nexthops with
+ * RTF_FIXEDMTU are only lowered: their MTU is changed when it is greater
+ * than @mtu.
+ */
+void
+nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu)
+{
+	struct nh_control *ctl = rh->nh_control;
+	struct nhop_priv *nh_priv;
+	struct nhop_object *nh;
+
+	NHOPS_WLOCK(ctl);
+	CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) {
+		nh = nh_priv->nh;
+		if (nh->nh_ifp != ifp)
+			continue;
+		/* Fixed MTU that already fits: leave it alone */
+		if ((nh_priv->rt_flags & RTF_FIXEDMTU) != 0 &&
+		    nh->nh_mtu <= mtu)
+			continue;
+		/* Update MTU directly */
+		nh->nh_mtu = mtu;
+	} CHT_SLIST_FOREACH_END;
+	NHOPS_WUNLOCK(ctl);
+}
+
+/*
+ * Dumps a single entry to sysctl buffer.
+ *
+ * Layout:
+ * rt_msghdr - generic RTM header to allow users to skip non-understood messages
+ * nhop_external - nexthop description structure (with length)
+ * nhop_addrs - structure encapsulating GW/SRC sockaddrs
+ *
+ * The gateway sockaddr and the source address sockaddr are written right
+ * after the three fixed structures; rtm_msglen covers all of them.
+ *
+ * Returns 0 on success or the error reported by SYSCTL_OUT().
+ */
+static int
+dump_nhop_entry(struct rib_head *rh, struct nhop_object *nh, struct sysctl_req *w)
+{
+ struct {
+ struct rt_msghdr rtm;
+ struct nhop_external nhe;
+ struct nhop_addrs na;
+ } arpc;
+ struct nhop_external *pnhe;
+ struct sockaddr *gw_sa, *src_sa;
+ struct sockaddr_storage ss;
+ size_t addrs_len;
+ int error;
+
+ //DPRINTF("Dumping: head %p nh %p flags %X req %p\n", rh, nh, nh->nh_flags, w);
+
+ /* Fields not assigned below (e.g. nh_addr, nh_src) stay zeroed */
+ memset(&arpc, 0, sizeof(arpc));
+
+ /* Length of the fixed part; the sockaddr sizes are added below */
+ arpc.rtm.rtm_msglen = sizeof(arpc);
+ arpc.rtm.rtm_version = RTM_VERSION;
+ arpc.rtm.rtm_type = RTM_GET;
+ //arpc.rtm.rtm_flags = RTF_UP;
+ arpc.rtm.rtm_flags = nh->nh_priv->rt_flags;
+
+ /* nhop_external */
+ pnhe = &arpc.nhe;
+ pnhe->nh_len = sizeof(struct nhop_external);
+ pnhe->nh_idx = nh->nh_priv->nh_idx;
+ pnhe->nh_fib = rh->rib_fibnum;
+ pnhe->ifindex = nh->nh_ifp->if_index;
+ pnhe->aifindex = nh->nh_aifp->if_index;
+ pnhe->nh_family = nh->nh_priv->nh_family;
+ pnhe->nh_type = nh->nh_priv->nh_type;
+ /* NOTE(review): the external nh_mtu field is only 16 bits wide */
+ pnhe->nh_mtu = nh->nh_mtu;
+ pnhe->nh_flags = nh->nh_flags;
+
+ memcpy(pnhe->nh_prepend, nh->nh_prepend, sizeof(nh->nh_prepend));
+ pnhe->prepend_len = nh->nh_prepend_len;
+ pnhe->nh_refcount = nh->nh_priv->nh_refcnt;
+ pnhe->nh_pksent = counter_u64_fetch(nh->nh_pksent);
+
+ /* sockaddr container */
+ addrs_len = sizeof(struct nhop_addrs);
+ arpc.na.gw_sa_off = addrs_len;
+ gw_sa = (struct sockaddr *)&nh->gw4_sa;
+ addrs_len += gw_sa->sa_len;
+
+ src_sa = nh->nh_ifa->ifa_addr;
+ if (src_sa->sa_family == AF_LINK) {
+ /* Shorten structure */
+ memset(&ss, 0, sizeof(struct sockaddr_storage));
+ fill_sdl_from_ifp((struct sockaddr_dl_short *)&ss,
+ nh->nh_ifa->ifa_ifp);
+ src_sa = (struct sockaddr *)&ss;
+ }
+ arpc.na.src_sa_off = addrs_len;
+ addrs_len += src_sa->sa_len;
+
+ /* Write total container length */
+ arpc.na.na_len = addrs_len;
+
+ /* sizeof(arpc) already accounts for the nhop_addrs header itself */
+ arpc.rtm.rtm_msglen += arpc.na.na_len - sizeof(struct nhop_addrs);
+
+ /* Fixed part first, then both sockaddrs in the advertised order */
+ error = SYSCTL_OUT(w, &arpc, sizeof(arpc));
+ if (error == 0)
+ error = SYSCTL_OUT(w, gw_sa, gw_sa->sa_len);
+ if (error == 0)
+ error = SYSCTL_OUT(w, src_sa, src_sa->sa_len);
+
+ return (error);
+}
+
+/*
+ * Dumps all nexthops linked to @rh into the sysctl request @w, one
+ * dump_nhop_entry() record per nexthop. The nexthop table is read-locked
+ * for the duration of the dump.
+ *
+ * Returns 0 on success or the first error returned by dump_nhop_entry().
+ */
+int
+nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
+{
+	struct nh_control *ctl = rh->nh_control;
+	struct nhop_priv *nh_priv;
+	int error = 0;
+
+	NHOPS_RLOCK(ctl);
+	DPRINTF("NHDUMP: count=%u", ctl->nh_head.items_count);
+	CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) {
+		error = dump_nhop_entry(rh, nh_priv->nh, w);
+		/* The iterator is a nested loop, so leave it with goto */
+		if (error != 0)
+			goto done;
+	} CHT_SLIST_FOREACH_END;
+done:
+	NHOPS_RUNLOCK(ctl);
+
+	return (error);
+}
+
diff --git a/sys/net/route/nhop_utils.c b/sys/net/route/nhop_utils.c
new file mode 100644
index 000000000000..56bca99c9ed8
--- /dev/null
+++ b/sys/net/route/nhop_utils.c
@@ -0,0 +1,219 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_route.h"
+#include "opt_mpath.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+
+#include <net/route/nhop_utils.h>
+
+/*
+ * The bitmask is an array of u_long blocks; every bit describes one item.
+ * A set bit means the item is free, a cleared bit means it is in use.
+ */
+#define BLOCK_ITEMS (8 * sizeof(u_long)) /* Number of items for ffsl() */
+
+/* Conversions between block count, byte size and item count */
+#define _BLOCKS_TO_SZ(_blocks) ((size_t)(_blocks) * sizeof(u_long))
+#define _BLOCKS_TO_ITEMS(_blocks) ((uint32_t)(_blocks) * BLOCK_ITEMS)
+#define _ITEMS_TO_BLOCKS(_items) ((_items) / BLOCK_ITEMS)
+
+
+static void _bitmask_init_idx(void *index, uint32_t items);
+
+/*
+ * Initializes the bitmask head @bh to use the caller-provided storage
+ * @idx able to hold @num_items items.
+ * If @idx is not NULL, it is initialized as well: all items are marked
+ * free, except index 0.
+ */
+void
+bitmask_init(struct bitmask_head *bh, void *idx, uint32_t num_items)
+{
+
+	if (idx != NULL)
+		_bitmask_init_idx(idx, num_items);
+
+	memset(bh, 0, sizeof(*bh));
+	bh->idx = (u_long *)idx;
+	bh->blocks = _ITEMS_TO_BLOCKS(num_items);
+}
+
+/*
+ * Returns the number of items the bitmask should be resized to, or 0 if
+ * no resize is needed.
+ * The bitmask is doubled once more than half of its items are in use,
+ * provided fewer than 65536 items are allocated.
+ */
+uint32_t
+bitmask_get_resize_items(const struct bitmask_head *bh)
+{
+	const size_t capacity = _BLOCKS_TO_ITEMS(bh->blocks);
+
+	if (bh->items_count * 2 <= capacity)
+		return (0);
+	if (bh->items_count >= 65536)
+		return (0);
+
+	return (capacity * 2);
+}
+
+/*
+ * Returns non-zero if bitmask_get_resize_items() suggests a new size for
+ * @bh, 0 otherwise.
+ */
+int
+bitmask_should_resize(const struct bitmask_head *bh)
+{
+
+ return (bitmask_get_resize_items(bh) != 0);
+}
+
+#if 0
+uint32_t
+_bitmask_get_blocks(uint32_t items)
+{
+
+ return (items / BLOCK_ITEMS);
+}
+#endif
+
+/*
+ * Returns the number of bytes needed to store a bitmask of @items items
+ * (one bit per item).
+ * @items has to be a multiple of BLOCK_ITEMS, which is asserted.
+ */
+size_t
+bitmask_get_size(uint32_t items)
+{
+#ifdef _KERNEL
+	KASSERT((items % BLOCK_ITEMS) == 0,
+	    ("bitmask size needs to be a multiple of %zu", BLOCK_ITEMS));
+#else
+	assert((items % BLOCK_ITEMS) == 0);
+#endif
+
+	return (items / 8);
+}
+
+/*
+ * Initializes the bitmask storage @_idx sized for @items items:
+ * every item is marked free (bit set), except for index 0, which is
+ * never handed out.
+ */
+static void
+_bitmask_init_idx(void *_idx, uint32_t items)
+{
+	u_long *blocks = (u_long *)_idx;
+
+	/* Mark all as free */
+	memset(blocks, 0xFF, bitmask_get_size(items));
+	/* Always skip index 0 */
+	blocks[0] &= ~(u_long)1;
+}
+
+
+/*
+ * Prepares @new_idx, sized for @new_items items, as a replacement storage
+ * for the bitmask @bi: the new storage is initialized (everything free)
+ * and the current allocation state is copied over.
+ *
+ * Only growing (or keeping the size) is supported: if the new storage has
+ * fewer blocks than the current one, nothing is copied.
+ * XXX: _try_merge api to allow shrinking?
+ *
+ * Returns 0 on success, 1 if the new storage is too small.
+ */
+int
+bitmask_copy(const struct bitmask_head *bi, void *new_idx, uint32_t new_items)
+{
+	/* Number of u_long blocks in the new storage */
+	uint32_t new_blocks = _ITEMS_TO_BLOCKS(new_items);
+
+	_bitmask_init_idx(new_idx, new_items);
+
+	if (bi->blocks > new_blocks) {
+		/*
+		 * XXX: shrinking would require all the blocks past
+		 * new_blocks to have no items in use.
+		 */
+		return (1);
+	}
+
+	/* extend current blocks */
+	if (bi->blocks > 0)
+		memcpy(new_idx, bi->idx, _BLOCKS_TO_SZ(bi->blocks));
+
+	return (0);
+}
+
+/*
+ * Switches @bh to the new storage @new_idx sized for @new_items items.
+ * The previous storage pointer is stored in @pidx if it is not NULL, so
+ * the caller can dispose of it.
+ */
+void
+bitmask_swap(struct bitmask_head *bh, void *new_idx, uint32_t new_items, void **pidx)
+{
+	u_long *prev_idx = bh->idx;
+
+	bh->blocks = _ITEMS_TO_BLOCKS(new_items);
+	bh->idx = (u_long *)new_idx;
+
+	if (pidx != NULL)
+		*pidx = prev_idx;
+}
+
+/*
+ * Allocates a new index in the given instance and stores it in @pidx.
+ * The search starts at the first potentially free block (free_off) and
+ * picks the lowest free bit found.
+ *
+ * Returns 0 on success, 1 if there are no free indexes.
+ */
+int
+bitmask_alloc_idx(struct bitmask_head *bi, uint16_t *pidx)
+{
+	int blk, bit;
+
+	for (blk = bi->free_off; blk < bi->blocks; blk++) {
+		/* ffsl() numbers bits from 1; 0 means nothing is free here */
+		bit = ffsl(bi->idx[blk]);
+		if (bit == 0)
+			continue;
+
+		/* Mark as busy */
+		bi->idx[blk] &= ~((u_long)1 << (bit - 1));
+
+		/* Blocks before this one are known to be full */
+		bi->free_off = blk;
+
+		*pidx = BLOCK_ITEMS * blk + bit - 1;
+		bi->items_count++;
+		return (0);
+	}
+
+	return (1);
+}
+
+/*
+ * Removes index @idx from the given set, making it available again.
+ *
+ * Returns 0 on success, 1 if @idx is 0, lies outside of the bitmask or
+ * is not currently allocated.
+ */
+int
+bitmask_free_idx(struct bitmask_head *bi, uint16_t idx)
+{
+	const int blk = idx / BLOCK_ITEMS;
+	const u_long bit = (u_long)1 << (idx % BLOCK_ITEMS);
+
+	/* Index 0 is reserved and never allocated */
+	if (idx == 0 || blk >= bi->blocks)
+		return (1);
+
+	/* Set bit means the index is already free */
+	if ((bi->idx[blk] & bit) != 0)
+		return (1);
+
+	/* Mark as free */
+	bi->idx[blk] |= bit;
+	bi->items_count--;
+
+	/* Update free offset */
+	if (bi->free_off > blk)
+		bi->free_off = blk;
+
+	return (0);
+}
+
diff --git a/sys/net/route/nhop_utils.h b/sys/net/route/nhop_utils.h
new file mode 100644
index 000000000000..a2876178cbb0
--- /dev/null
+++ b/sys/net/route/nhop_utils.h
@@ -0,0 +1,200 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NET_ROUTE_NHOP_UTILS_H_
+#define _NET_ROUTE_NHOP_UTILS_H_
+
+/*
+ * Chained hash table.
+ * Generic view of the table head. Typed heads produced by
+ * CHT_SLIST_DEFINE() have the same layout and are cast to this
+ * structure by the resize helpers.
+ */
+struct _cht_head {
+ uint32_t hash_size; /* number of buckets */
+ uint32_t items_count; /* number of items stored */
+ void **ptr; /* array of bucket list heads */
+};
+
+/*
+ * Returns the number of buckets the table should be resized to, or 0 if
+ * the current size is fine.
+ * The table is doubled when there are more than hash_size / 2 items
+ * (up to 65536 buckets) and halved when there are fewer than
+ * hash_size / 4 items (down to 16 buckets).
+ */
+static inline uint32_t
+_cht_get_resize_size(const struct _cht_head *head)
+{
+
+	/* Grow */
+	if (head->items_count * 2 > head->hash_size &&
+	    head->hash_size < 65536)
+		return (head->hash_size * 2);
+
+	/* Shrink */
+	if (head->items_count * 4 < head->hash_size &&
+	    head->hash_size > 16)
+		return (head->hash_size / 2);
+
+	return (0);
+}
+
+/*
+ * Returns non-zero if _cht_get_resize_size() suggests a new size for the
+ * table, 0 otherwise.
+ */
+static inline int
+_cht_need_resize(const struct _cht_head *head)
+{
+
+ return (_cht_get_resize_size(head) > 0);
+}
+
+
+#ifndef typeof
+#define typeof __typeof
+#endif
+
+/*
+ * Resize helpers for typed table heads. The head is cast to the generic
+ * struct _cht_head before being passed to the inline helpers.
+ * CHT_SLIST_GET_RESIZE_SIZE() converts a bucket count to the size of the
+ * bucket array in bytes.
+ */
+#define CHT_SLIST_NEED_RESIZE(_head) \
+ _cht_need_resize((const struct _cht_head *)(_head))
+#define CHT_SLIST_GET_RESIZE_BUCKETS(_head) \
+ _cht_get_resize_size((const struct _cht_head *)(_head))
+#define CHT_SLIST_GET_RESIZE_SIZE(_buckets) ((_buckets) * sizeof(void *))
+
+/*
+ * Declares the typed table head "struct <_HNAME>_head" holding items of
+ * type @_ITEM_TYPE. The field layout mirrors struct _cht_head.
+ */
+#define CHT_SLIST_DEFINE(_HNAME, _ITEM_TYPE) \
+struct _HNAME##_head { \
+ uint32_t hash_size; \
+ uint32_t items_count; \
+ _ITEM_TYPE **ptr; \
+}
+
+#define CHT_SLIST_INIT(_head, _ptr, _num_buckets) \
+ (_head)->hash_size = _num_buckets; \
+ (_head)->items_count = 0; \
+ (_head)->ptr = _ptr;
+
+/*
+ * Default hash method for constant-size keys.
+ * Bucket number for a key / an object: the value of the user-supplied
+ * <_PX>_hash_key() / <_PX>_hash_obj() masked with (hash_size - 1).
+ * The whole expansion is parenthesized so that the macros can be used
+ * safely inside larger expressions.
+ */
+
+#define CHT_GET_BUCK(_head, _PX, _key) \
+	(_PX##_hash_key(_key) & ((_head)->hash_size - 1))
+#define CHT_GET_BUCK_OBJ(_head, _PX, _obj) \
+	(_PX##_hash_obj(_obj) & ((_head)->hash_size - 1))
+
+/* First item of the bucket @idx (an lvalue, so it can be assigned to) */
+#define CHT_FIRST(_head, idx) _CHT_FIRST((_head)->ptr, idx)
+#define _CHT_FIRST(_ptr, idx) (_ptr)[idx]
+
+/*
+ * Looks up an item by @_key.
+ * The bucket is selected with <_PX>_hash_key() and its chain is walked
+ * with <_PX>_next() until <_PX>_cmp(_key, item) is non-zero.
+ * @_ret is set to the matching item or to NULL if nothing matches.
+ */
+#define CHT_SLIST_FIND(_head, _PX, _key, _ret) do { \
+ uint32_t _buck = CHT_GET_BUCK(_head, _PX, _key); \
+ _ret = CHT_FIRST(_head, _buck); \
+ for ( ; _ret != NULL; _ret = _PX##_next(_ret)) { \
+ if (_PX##_cmp(_key, (_ret))) \
+ break; \
+ } \
+} while(0)
+
+/*
+ * Looks up an item equal to the object @_obj.
+ * The bucket is selected with <_PX>_hash_obj() and its chain is walked
+ * with <_PX>_next() until <_PX>_cmp(_obj, item) is non-zero.
+ * @_ret is set to the matching item or to NULL if nothing matches.
+ */
+#define CHT_SLIST_FIND_BYOBJ(_head, _PX, _obj, _ret) do { \
+ uint32_t _buck = CHT_GET_BUCK_OBJ(_head, _PX, _obj); \
+ _ret = CHT_FIRST(_head, _buck); \
+ for ( ; _ret != NULL; _ret = _PX##_next(_ret)) { \
+ if (_PX##_cmp(_obj, _ret)) \
+ break; \
+ } \
+} while(0)
+
+/*
+ * Inserts @_obj at the head of its bucket chain and bumps the item
+ * counter. No check for an already existing equal item is made.
+ */
+#define CHT_SLIST_INSERT_HEAD(_head, _PX, _obj) do { \
+ uint32_t _buck = CHT_GET_BUCK_OBJ(_head, _PX, _obj); \
+ _PX##_next(_obj) = CHT_FIRST(_head, _buck); \
+ CHT_FIRST(_head, _buck) = _obj; \
+ (_head)->items_count++; \
+} while(0)
+
+/*
+ * Removes the first item matching @_key from the table.
+ * The chain is walked while remembering the previous item (_tmp), so the
+ * match can be unlinked either from the bucket head or from the middle
+ * of the chain. The item counter is decremented on success.
+ * @_ret is set to the removed item or to NULL if nothing matches.
+ */
+#define CHT_SLIST_REMOVE(_head, _PX, _key, _ret) do { \
+ typeof(*(_head)->ptr) _tmp; \
+ uint32_t _buck = CHT_GET_BUCK(_head, _PX, _key); \
+ _ret = CHT_FIRST(_head, _buck); \
+ _tmp = NULL; \
+ for ( ; _ret != NULL; _tmp = _ret, _ret = _PX##_next(_ret)) { \
+ if (_PX##_cmp(_key, _ret)) \
+ break; \
+ } \
+ if (_ret != NULL) { \
+ if (_tmp == NULL) \
+ CHT_FIRST(_head, _buck) = _PX##_next(_ret); \
+ else \
+ _PX##_next(_tmp) = _PX##_next(_ret); \
+ (_head)->items_count--; \
+ } \
+} while(0)
+
+/*
+ * Removes the first item equal to the object @_obj from the table.
+ * Same as CHT_SLIST_REMOVE(), but the bucket is selected with
+ * <_PX>_hash_obj() and items are compared with <_PX>_cmp(_obj, item).
+ * @_ret is set to the removed item or to NULL if nothing matches.
+ */
+#define CHT_SLIST_REMOVE_BYOBJ(_head, _PX, _obj, _ret) do { \
+ typeof(*(_head)->ptr) _tmp; \
+ uint32_t _buck = CHT_GET_BUCK_OBJ(_head, _PX, _obj); \
+ _ret = CHT_FIRST(_head, _buck); \
+ _tmp = NULL; \
+ for ( ; _ret != NULL; _tmp = _ret, _ret = _PX##_next(_ret)) { \
+ if (_PX##_cmp(_obj, _ret)) \
+ break; \
+ } \
+ if (_ret != NULL) { \
+ if (_tmp == NULL) \
+ CHT_FIRST(_head, _buck) = _PX##_next(_ret); \
+ else \
+ _PX##_next(_tmp) = _PX##_next(_ret); \
+ (_head)->items_count--; \
+ } \
+} while(0)
+
+
+/*
+ * Iterates over all items of the table, assigning each one to @_x.
+ * The expansion is a loop over buckets with a nested loop over the bucket
+ * chain; the user-supplied body becomes the body of the inner loop and
+ * CHT_SLIST_FOREACH_END closes the outer one.
+ * Note that "break" in the body leaves only the inner (chain) loop, so
+ * the iteration continues with the next bucket.
+ */
+#define CHT_SLIST_FOREACH(_head, _PX, _x) \
+ for (uint32_t _i = 0; _i < (_head)->hash_size; _i++) { \
+ for (_x = CHT_FIRST(_head, _i); _x; _x = _PX##_next(_x))
+
+#define CHT_SLIST_FOREACH_END }
+
+/*
+ * Moves all items to the new bucket array @_new_void_ptr of @_new_hsize
+ * buckets, re-hashing every item with <_PX>_hash_obj().
+ * On completion the head uses the new array and @_new_void_ptr is
+ * overwritten with the old array pointer, so the caller can free it.
+ *
+ * The macro is not wrapped in a block: it declares the variables
+ * _new_idx, _new_ptr, _x and _y in the scope of the caller.
+ * NOTE(review): the new array is not cleared here, so it presumably has
+ * to be zeroed by the caller - confirm against the users.
+ */
+#define CHT_SLIST_RESIZE(_head, _PX, _new_void_ptr, _new_hsize) \
+ uint32_t _new_idx; \
+ typeof((_head)->ptr) _new_ptr = (void *)_new_void_ptr; \
+ typeof(*(_head)->ptr) _x, _y; \
+ for (uint32_t _old_idx = 0; _old_idx < (_head)->hash_size; _old_idx++) {\
+ _x = CHT_FIRST(_head, _old_idx); \
+ _y = _x; \
+ while (_y != NULL) { \
+ _y = _PX##_next(_x); \
+ _new_idx = _PX##_hash_obj(_x) & (_new_hsize - 1);\
+ _PX##_next(_x) = _CHT_FIRST(_new_ptr, _new_idx);\
+ _CHT_FIRST(_new_ptr, _new_idx) = _x; \
+ _x = _y; \
+ } \
+ } \
+ (_head)->hash_size = _new_hsize; \
+ _new_void_ptr = (void *)(_head)->ptr; \
+ (_head)->ptr = _new_ptr;
+
+/*
+ * bitmasks
+ *
+ * Index allocator state. The storage pointed to by idx is provided by
+ * the caller, see bitmask_init() and bitmask_swap().
+ */
+
+struct bitmask_head {
+ uint16_t free_off; /* index of the first potentially free block */
+ uint16_t blocks; /* number of 4/8-byte blocks in the index */
+ uint32_t items_count; /* total number of items */
+ u_long *idx; /* bitmask storage, one bit per item */
+};
+
+/* Storage size (in bytes) required for the given number of items */
+size_t bitmask_get_size(uint32_t items);
+/* Suggested new number of items, or 0 if no resize is needed */
+uint32_t bitmask_get_resize_items(const struct bitmask_head *nh);
+int bitmask_should_resize(const struct bitmask_head *bh);
+/* Switches the head to the new storage, returning the old one in @pidx */
+void bitmask_swap(struct bitmask_head *bh, void *new_idx, uint32_t new_items, void **pidx);
+void bitmask_init(struct bitmask_head *bh, void *idx, uint32_t num_items);
+/* Initializes the new storage and copies the current state into it */
+int bitmask_copy(const struct bitmask_head *bi, void *new_idx, uint32_t new_items);
+/* Index allocation/release; both return 0 on success */
+int bitmask_alloc_idx(struct bitmask_head *bi, uint16_t *pidx);
+int bitmask_free_idx(struct bitmask_head *bi, uint16_t idx);
+
+#endif
+
diff --git a/sys/net/route/nhop_var.h b/sys/net/route/nhop_var.h
new file mode 100644
index 000000000000..4bf26ff54269
--- /dev/null
+++ b/sys/net/route/nhop_var.h
@@ -0,0 +1,96 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This header file contains private definitions for nexthop routing.
+ *
+ * Header is not intended to be included by the code external to the
+ * routing subsystem.
+ */
+
+#ifndef _NET_ROUTE_NHOP_VAR_H_
+#define _NET_ROUTE_NHOP_VAR_H_
+
+/* define nhop hash table */
+struct nhop_priv;
+CHT_SLIST_DEFINE(nhops, struct nhop_priv);
+/* produce hash value for an object */
+#define nhops_hash_obj(_obj) hash_priv(_obj)
+/* compare two objects */
+#define nhops_cmp(_one, _two) cmp_priv(_one, _two)
+/* next object accessor */
+#define nhops_next(_obj) (_obj)->nh_next
+
+
+struct nh_control {
+ struct nhops_head nh_head; /* hash table head */
+ struct bitmask_head nh_idx_head; /* nhop index head */
+ struct rwlock ctl_lock; /* overall ctl lock */
+ struct rib_head *ctl_rh; /* pointer back to rnh */
+ struct epoch_context ctl_epoch_ctx; /* epoch ctl helper */
+};
+
+/*
+ * Wrappers around the rwlock (ctl_lock) embedded in struct nh_control:
+ * write/read lock and unlock, initialization, destruction and a
+ * "write-locked" assertion.
+ */
+#define NHOPS_WLOCK(ctl) rw_wlock(&(ctl)->ctl_lock)
+#define NHOPS_RLOCK(ctl) rw_rlock(&(ctl)->ctl_lock)
+#define NHOPS_WUNLOCK(ctl) rw_wunlock(&(ctl)->ctl_lock)
+#define NHOPS_RUNLOCK(ctl) rw_runlock(&(ctl)->ctl_lock)
+#define NHOPS_LOCK_INIT(ctl) rw_init(&(ctl)->ctl_lock, "nhop_ctl")
+#define NHOPS_LOCK_DESTROY(ctl) rw_destroy(&(ctl)->ctl_lock)
+#define NHOPS_WLOCK_ASSERT(ctl) rw_assert(&(ctl)->ctl_lock, RA_WLOCKED)
+
+
+/* Control plane-only nhop data */
+struct nhop_object;
+struct nhop_priv {
+ uint32_t nh_idx; /* nexthop index */
+ uint8_t nh_family; /* address family of the lookup */
+ uint16_t nh_type; /* nexthop type */
+ void *cb_func; /* function handling additional rewrite caps */
+ u_int nh_refcnt; /* number of references, refcount(9) */
+ u_int nh_linked; /* refcount(9), == 2 if linked to the list */
+ int rt_flags; /* routing flags for the control plane */
+ struct nhop_object *nh; /* backreference to the dataplane nhop */
+ struct nh_control *nh_control; /* backreference to the rnh */
+ struct nhop_priv *nh_next; /* hash table membership */
+ struct epoch_context nh_epoch_ctx; /* epoch data for nhop */
+};
+
+#define NH_IS_PINNED(_nh) ((_nh)->nh_priv->rt_flags & RTF_PINNED)
+
+/* nhop.c */
+struct nhop_priv *find_nhop(struct nh_control *ctl,
+ const struct nhop_priv *nh_priv);
+int link_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv);
+struct nhop_priv *unlink_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv);
+
+/* nhop_ctl.c */
+int cmp_priv(const struct nhop_priv *_one, const struct nhop_priv *_two);
+
+#endif
+
diff --git a/sys/net/route/route_ctl.c b/sys/net/route/route_ctl.c
new file mode 100644
index 000000000000..09c2ded80796
--- /dev/null
+++ b/sys/net/route/route_ctl.c
@@ -0,0 +1,65 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/vnet.h>
+#include <net/route.h>
+#include <net/route_var.h>
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/shared.h>
+#include <netinet/in.h>
+
+#include <vm/uma.h>
+
+
+/*
+ * This file contains control plane routing tables functions.
+ *
+ * All functions assume they are called in net epoch.
+ */
+
+
diff --git a/sys/net/route/route_helpers.c b/sys/net/route/route_helpers.c
new file mode 100644
index 000000000000..c124a52b0b77
--- /dev/null
+++ b/sys/net/route/route_helpers.c
@@ -0,0 +1,83 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_route.h"
+
+#include <sys/param.h>
+#include <sys/jail.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/sysproto.h>
+#include <sys/proc.h>
+#include <sys/domain.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route_var.h>
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/shared.h>
+#include <net/vnet.h>
+
+/*
+ * RIB helper functions.
+ */
+
+/*
+ * Walks the routing table identified by @af and @fibnum, invoking @wa_f
+ * with @arg on every entry.  Does nothing if no such table exists.
+ *
+ * The RIB read lock is held for the whole duration of the walk.
+ */
+void
+rib_walk(int af, u_int fibnum, rt_walktree_f_t *wa_f, void *arg)
+{
+	RIB_RLOCK_TRACKER;
+	struct rib_head *rnh;
+
+	rnh = rt_tables_get_rnh(fibnum, af);
+	if (rnh != NULL) {
+		RIB_RLOCK(rnh);
+		rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg);
+		RIB_RUNLOCK(rnh);
+	}
+}
+
diff --git a/sys/net/route/shared.h b/sys/net/route/shared.h
new file mode 100644
index 000000000000..a4476373dd97
--- /dev/null
+++ b/sys/net/route/shared.h
@@ -0,0 +1,68 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Contains various definitions shared between the parts of a routing subsystem.
+ *
+ * Header is not intended to be included by the code external to the
+ * routing subsystem.
+ */
+
+#ifndef _NET_ROUTE_SHARED_H_
+#define _NET_ROUTE_SHARED_H_
+
+/*
+ * Debug output helper.  When RTDEBUG is defined, prints the formatted
+ * message prefixed with the name of the calling function and followed by
+ * a newline; otherwise expands to nothing.
+ */
+#ifdef RTDEBUG
+#define DPRINTF(_fmt, ...) printf("%s: " _fmt "\n", __func__ , ## __VA_ARGS__)
+#else
+#define DPRINTF(_fmt, ...)
+#endif
+
+struct rib_head;
+
+/* Nexthops */
+void nhops_init(void);
+int nhops_init_rib(struct rib_head *rh);
+void nhops_destroy_rib(struct rib_head *rh);
+int nhop_ref_object(struct nhop_object *nh);
+int nhop_ref_any(struct nhop_object *nh);
+void nhop_free_any(struct nhop_object *nh);
+
+void nhop_set_type(struct nhop_object *nh, enum nhop_type nh_type);
+void nhop_set_rtflags(struct nhop_object *nh, int rt_flags);
+
+int nhop_create_from_info(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct nhop_object **nh_ret);
+int nhop_create_from_nhop(struct rib_head *rnh, const struct nhop_object *nh_orig,
+ struct rt_addrinfo *info, struct nhop_object **pnh_priv);
+
+void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu);
+int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
+
+#endif
+
diff --git a/sys/net/route_var.h b/sys/net/route_var.h
index db0aa07f60a9..92b5e433a972 100644
--- a/sys/net/route_var.h
+++ b/sys/net/route_var.h
@@ -32,6 +32,10 @@
#ifndef _NET_ROUTE_VAR_H_
#define _NET_ROUTE_VAR_H_
+struct nh_control;
+typedef int rnh_preadd_entry_f_t(u_int fibnum, const struct sockaddr *addr,
+ const struct sockaddr *mask, struct nhop_object *nh);
+
struct rib_head {
struct radix_head head;
rn_matchaddr_f_t *rnh_matchaddr; /* longest match for sockaddr */
@@ -41,6 +45,7 @@ struct rib_head {
rn_walktree_t *rnh_walktree; /* traverse tree */
rn_walktree_from_t *rnh_walktree_from; /* traverse tree below a */
rn_close_t *rnh_close; /*do something when the last ref drops*/
+ rnh_preadd_entry_f_t *rnh_preadd; /* hook to alter record prior to insertion */
rt_gen_t rnh_gen; /* generation counter */
int rnh_multipath; /* multipath capable ? */
struct radix_node rnh_nodes[3]; /* empty tree for common case */
@@ -51,6 +56,7 @@ struct rib_head {
u_int rib_fibnum; /* fib number */
struct callout expire_callout; /* Callout for expiring dynamic routes */
time_t next_expire; /* Next expire run ts */
+ struct nh_control *nh_control; /* nexthop subsystem data */
};
#define RIB_RLOCK_TRACKER struct rm_priotracker _rib_tracker
@@ -90,6 +96,44 @@ _Static_assert(__offsetof(struct route, ro_dst) == __offsetof(_ro_new, _dst_new)
struct rib_head *rt_tables_get_rnh(int fib, int family);
void rt_mpath_init_rnh(struct rib_head *rnh);
+VNET_PCPUSTAT_DECLARE(struct rtstat, rtstat);
+#define RTSTAT_ADD(name, val) \
+ VNET_PCPUSTAT_ADD(struct rtstat, rtstat, name, (val))
+#define RTSTAT_INC(name) RTSTAT_ADD(name, 1)
+
+/*
+ * With the split between the routing entry and the nexthop,
+ * rt_flags has to be split between these 2 entries. As rtentry
+ * mostly contains prefix data and is thought to be generic enough
+ * so one can transparently change the nexthop pointer w/o requiring
+ * any other rtentry changes, most of rt_flags shifts to the particular nexthop.
+ *
+ *
+ * RTF_UP: rtentry, as an indication that it is linked.
+ * RTF_HOST: rtentry, nhop. The latter indication is needed for the datapath
+ * RTF_DYNAMIC: nhop, to make rtentry generic.
+ * RTF_MODIFIED: nhop, to make rtentry generic. (legacy)
+ * -- "native" path (nhop) properties:
+ * RTF_GATEWAY, RTF_STATIC, RTF_PROTO1, RTF_PROTO2, RTF_PROTO3, RTF_FIXEDMTU,
+ * RTF_PINNED, RTF_REJECT, RTF_BLACKHOLE, RTF_BROADCAST
+ */
+
+/* Nexthop rt flags mask */
+#define NHOP_RT_FLAG_MASK (RTF_GATEWAY | RTF_HOST | RTF_REJECT | RTF_DYNAMIC | \
+ RTF_MODIFIED | RTF_STATIC | RTF_BLACKHOLE | RTF_PROTO1 | RTF_PROTO2 | \
+ RTF_PROTO3 | RTF_FIXEDMTU | RTF_PINNED | RTF_BROADCAST)
+
+/* rtentry rt flag mask */
+#define RTE_RT_FLAG_MASK (RTF_UP | RTF_HOST)
+
+/* Nexthop selection */
+#define _NH2MP(_nh) ((struct nhgrp_object *)(_nh))
+#define _SELECT_NHOP(_nh, _flowid) \
+ (_NH2MP(_nh))->nhops[(_flowid) % (_NH2MP(_nh))->mp_size]
+#define _RT_SELECT_NHOP(_nh, _flowid) \
+ ((!NH_IS_MULTIPATH(_nh)) ? (_nh) : _SELECT_NHOP(_nh, _flowid))
+#define RT_SELECT_NHOP(_rt, _flowid) _RT_SELECT_NHOP((_rt)->rt_nhop, _flowid)
+
/* rte<>nhop translation */
static inline uint16_t
fib_rte_to_nh_flags(int rt_flags)
diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c
index 521d2fdc3d99..0ba071459ca6 100644
--- a/sys/net/rtsock.c
+++ b/sys/net/rtsock.c
@@ -77,6 +77,7 @@
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#endif
+#include <net/route/nhop.h>
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
@@ -1076,6 +1077,7 @@ rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out)
out->rmx_mtu = rt->rt_mtu;
out->rmx_weight = rt->rt_weight;
out->rmx_pksent = counter_u64_fetch(rt->rt_pksent);
+ out->rmx_nhidx = nhop_get_idx(rt->rt_nhop);
/* Kernel -> userland timebase conversion. */
out->rmx_expire = rt->rt_expire ?
rt->rt_expire - time_uptime + time_second : 0;
@@ -2025,7 +2027,7 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS)
namelen--;
if (req->newptr)
return (EPERM);
- if (name[1] == NET_RT_DUMP) {
+ if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP) {
if (namelen == 3)
fib = req->td->td_proc->p_fibnum;
else if (namelen == 4)
@@ -2092,7 +2094,25 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS)
error = EAFNOSUPPORT;
}
break;
-
+	case NET_RT_NHOP:
+		/* Allow dumping one specific af/fib at a time */
+		if (namelen < 4) {
+			error = EINVAL;
+			break;
+		}
+		fib = name[3];
+		/*
+		 * Valid fib numbers are 0 .. rt_numfibs - 1; reject
+		 * rt_numfibs itself as well, so that an out-of-range
+		 * value is never passed to rt_tables_get_rnh().
+		 */
+		if (fib < 0 || fib >= rt_numfibs) {
+			error = EINVAL;
+			break;
+		}
+		rnh = rt_tables_get_rnh(fib, af);
+		if (rnh == NULL) {
+			error = EAFNOSUPPORT;
+			break;
+		}
+		if (w.w_op == NET_RT_NHOP)
+			error = nhops_dump_sysctl(rnh, w.w_req);
+		break;
case NET_RT_IFLIST:
case NET_RT_IFLISTL:
error = sysctl_iflist(af, &w);
diff --git a/sys/netinet/in_fib.c b/sys/netinet/in_fib.c
index f7a02e36a40b..4456856c426e 100644
--- a/sys/netinet/in_fib.c
+++ b/sys/netinet/in_fib.c
@@ -49,6 +49,8 @@ __FBSDID("$FreeBSD$");
#include <net/if_dl.h>
#include <net/route.h>
#include <net/route_var.h>
+#include <net/route/nhop.h>
+#include <net/route/shared.h>
#include <net/vnet.h>
#ifdef RADIX_MPATH
@@ -60,59 +62,49 @@ __FBSDID("$FreeBSD$");
#include <netinet/in_fib.h>
#ifdef INET
-static void fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst,
+static void fib4_rte_to_nh_basic(struct nhop_object *nh, struct in_addr dst,
uint32_t flags, struct nhop4_basic *pnh4);
-static void fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst,
+static void fib4_rte_to_nh_extended(struct nhop_object *nh, struct in_addr dst,
uint32_t flags, struct nhop4_extended *pnh4);
#define RNTORT(p) ((struct rtentry *)(p))
static void
-fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst,
+fib4_rte_to_nh_basic(struct nhop_object *nh, struct in_addr dst,
uint32_t flags, struct nhop4_basic *pnh4)
{
- struct sockaddr_in *gw;
if ((flags & NHR_IFAIF) != 0)
- pnh4->nh_ifp = rte->rt_ifa->ifa_ifp;
+ pnh4->nh_ifp = nh->nh_ifa->ifa_ifp;
+ else
+ pnh4->nh_ifp = nh->nh_ifp;
+ pnh4->nh_mtu = nh->nh_mtu;
+ if (nh->nh_flags & NHF_GATEWAY)
+ pnh4->nh_addr = nh->gw4_sa.sin_addr;
else
- pnh4->nh_ifp = rte->rt_ifp;
- pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
- if (rte->rt_flags & RTF_GATEWAY) {
- gw = (struct sockaddr_in *)rte->rt_gateway;
- pnh4->nh_addr = gw->sin_addr;
- } else
pnh4->nh_addr = dst;
/* Set flags */
- pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
- gw = (struct sockaddr_in *)rt_key(rte);
- if (gw->sin_addr.s_addr == 0)
- pnh4->nh_flags |= NHF_DEFAULT;
+ pnh4->nh_flags = nh->nh_flags;
/* TODO: Handle RTF_BROADCAST here */
}
static void
-fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst,
+fib4_rte_to_nh_extended(struct nhop_object *nh, struct in_addr dst,
uint32_t flags, struct nhop4_extended *pnh4)
{
- struct sockaddr_in *gw;
if ((flags & NHR_IFAIF) != 0)
- pnh4->nh_ifp = rte->rt_ifa->ifa_ifp;
+ pnh4->nh_ifp = nh->nh_ifa->ifa_ifp;
+ else
+ pnh4->nh_ifp = nh->nh_ifp;
+ pnh4->nh_mtu = nh->nh_mtu;
+ if (nh->nh_flags & NHF_GATEWAY)
+ pnh4->nh_addr = nh->gw4_sa.sin_addr;
else
- pnh4->nh_ifp = rte->rt_ifp;
- pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
- if (rte->rt_flags & RTF_GATEWAY) {
- gw = (struct sockaddr_in *)rte->rt_gateway;
- pnh4->nh_addr = gw->sin_addr;
- } else
pnh4->nh_addr = dst;
/* Set flags */
- pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
- gw = (struct sockaddr_in *)rt_key(rte);
- if (gw->sin_addr.s_addr == 0)
- pnh4->nh_flags |= NHF_DEFAULT;
- pnh4->nh_ia = ifatoia(rte->rt_ifa);
+ pnh4->nh_flags = nh->nh_flags;
+ pnh4->nh_ia = ifatoia(nh->nh_ifa);
pnh4->nh_src = IA_SIN(pnh4->nh_ia)->sin_addr;
}
@@ -135,7 +127,7 @@ fib4_lookup_nh_basic(uint32_t fibnum, struct in_addr dst, uint32_t flags,
struct rib_head *rh;
struct radix_node *rn;
struct sockaddr_in sin;
- struct rtentry *rte;
+ struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_basic: bad fibnum"));
rh = rt_tables_get_rnh(fibnum, AF_INET);
@@ -150,10 +142,10 @@ fib4_lookup_nh_basic(uint32_t fibnum, struct in_addr dst, uint32_t flags,
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rte = RNTORT(rn);
+ nh = RNTORT(rn)->rt_nhop;
/* Ensure route & ifp is UP */
- if (RT_LINK_IS_UP(rte->rt_ifp)) {
- fib4_rte_to_nh_basic(rte, dst, flags, pnh4);
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
+ fib4_rte_to_nh_basic(nh, dst, flags, pnh4);
RIB_RUNLOCK(rh);
return (0);
@@ -185,6 +177,7 @@ fib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flags,
struct radix_node *rn;
struct sockaddr_in sin;
struct rtentry *rte;
+ struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_ext: bad fibnum"));
rh = rt_tables_get_rnh(fibnum, AF_INET);
@@ -207,9 +200,10 @@ fib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flags,
return (ENOENT);
}
#endif
+ nh = rte->rt_nhop;
/* Ensure route & ifp is UP */
- if (RT_LINK_IS_UP(rte->rt_ifp)) {
- fib4_rte_to_nh_extended(rte, dst, flags, pnh4);
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
+ fib4_rte_to_nh_extended(nh, dst, flags, pnh4);
if ((flags & NHR_REF) != 0) {
/* TODO: lwref on egress ifp's ? */
}
@@ -229,4 +223,138 @@ fib4_free_nh_ext(uint32_t fibnum, struct nhop4_extended *pnh4)
}
+/*
+ * Finds the nexthop used to reach @dst in fib @fibnum.
+ *
+ * Returns the nexthop on success or NULL if there is no table, no matching
+ * route, or the egress interface link is not up.  In the last two cases
+ * the rts_unreach counter is bumped.
+ *
+ * The returned nexthop is safe to use within the current network epoch.
+ * Callers needing a longer lifetime pass NHR_REF in @flags to get a
+ * referenced nexthop.  With RADIX_MPATH, @flowid selects one of the
+ * multipath routes.
+ */
+struct nhop_object *
+fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
+    uint32_t flags, uint32_t flowid)
+{
+	RIB_RLOCK_TRACKER;
+	struct sockaddr_in key;
+	struct rib_head *rh;
+	struct radix_node *node;
+	struct rtentry *rte;
+	struct nhop_object *cand, *found;
+
+	KASSERT((fibnum < rt_numfibs), ("fib4_lookup: bad fibnum"));
+	rh = rt_tables_get_rnh(fibnum, AF_INET);
+	if (rh == NULL)
+		return (NULL);
+
+	/* Build the radix lookup key for @dst */
+	memset(&key, 0, sizeof(key));
+	key.sin_len = sizeof(struct sockaddr_in);
+	key.sin_family = AF_INET;
+	key.sin_addr = dst;
+
+	found = NULL;
+	RIB_RLOCK(rh);
+	node = rh->rnh_matchaddr((void *)&key, &rh->head);
+	if (node != NULL && (node->rn_flags & RNF_ROOT) == 0) {
+		rte = RNTORT(node);
+#ifdef RADIX_MPATH
+		if (rt_mpath_next(rte) != NULL)
+			rte = rt_mpath_selectrte(rte, flowid);
+#endif
+		cand = rte->rt_nhop;
+		/* Hand the nexthop out only if its interface link is up */
+		if (RT_LINK_IS_UP(cand->nh_ifp)) {
+			found = cand;
+			if (flags & NHR_REF)
+				nhop_ref_object(found);
+		}
+	}
+	RIB_RUNLOCK(rh);
+
+	if (found == NULL)
+		RTSTAT_INC(rts_unreach);
+	return (found);
+}
+
+/*
+ * Tests a single nexthop @nh against the reverse-path conditions.
+ *
+ * With a non-NULL @src_if the nexthop matches only if its nh_aifp is that
+ * interface.  With a NULL @src_if any nexthop matches, except one flagged
+ * NHF_DEFAULT when @flags contains NHR_NODEFAULT.
+ *
+ * Returns 1 on match, 0 otherwise.
+ */
+static inline int
+check_urpf(const struct nhop_object *nh, uint32_t flags,
+    const struct ifnet *src_if)
+{
+
+	/* Interface given: it alone decides the outcome */
+	if (src_if != NULL)
+		return ((nh->nh_aifp == src_if) ? 1 : 0);
+
+	/* No interface given: refuse only an excluded default route */
+	if ((flags & NHR_NODEFAULT) != 0 && (nh->nh_flags & NHF_DEFAULT) != 0)
+		return (0);
+
+	return (1);
+}
+
+#ifdef RADIX_MPATH
+/*
+ * Multipath flavour of check_urpf(): walks the chain of routes starting
+ * at @rt and returns 1 as soon as the nexthop of any of them satisfies
+ * check_urpf(), 0 if none does.
+ */
+static inline int
+check_urpf_mpath(struct rtentry *rt, uint32_t flags,
+    const struct ifnet *src_if)
+{
+	struct rtentry *path;
+
+	for (path = rt; path != NULL; path = rt_mpath_next(path)) {
+		if (check_urpf(path->rt_nhop, flags, src_if) != 0)
+			return (1);
+	}
+
+	return (0);
+}
+#endif
+
+/*
+ * Performs reverse path forwarding lookup for @dst in fib @fibnum.
+ * If @src_if is non-NULL, verifies that at least 1 path goes via
+ * this interface.
+ * If @src_if is NULL, verifies that a route exists; if @flags
+ * contains NHR_NODEFAULT, the default route is not considered
+ * in that case.
+ * @scopeid is not used for IPv4.
+ *
+ * Returns 1 if route matching conditions is found, 0 otherwise.
+ */
+int
+fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
+    uint32_t flags, const struct ifnet *src_if)
+{
+	RIB_RLOCK_TRACKER;
+	struct rib_head *rh;
+	struct radix_node *rn;
+	struct rtentry *rt;
+	int ret;
+
+	KASSERT((fibnum < rt_numfibs), ("fib4_check_urpf: bad fibnum"));
+	rh = rt_tables_get_rnh(fibnum, AF_INET);
+	if (rh == NULL)
+		return (0);
+
+	/* Prepare lookup key (same layout as in fib4_lookup()) */
+	struct sockaddr_in sin4;
+	memset(&sin4, 0, sizeof(sin4));
+	sin4.sin_family = AF_INET;
+	sin4.sin_len = sizeof(struct sockaddr_in);
+	sin4.sin_addr = dst;
+
+	RIB_RLOCK(rh);
+	rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
+	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+		rt = RNTORT(rn);
+#ifdef RADIX_MPATH
+		/* Any of the multipath routes may satisfy the check */
+		ret = check_urpf_mpath(rt, flags, src_if);
+#else
+		ret = check_urpf(rt->rt_nhop, flags, src_if);
+#endif
+		RIB_RUNLOCK(rh);
+		return (ret);
+	}
+	RIB_RUNLOCK(rh);
+
+	return (0);
+}
+
#endif
diff --git a/sys/netinet/in_fib.h b/sys/netinet/in_fib.h
index f0b4d159d5e1..ff78967061e4 100644
--- a/sys/netinet/in_fib.h
+++ b/sys/netinet/in_fib.h
@@ -58,5 +58,9 @@ int fib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flags,
uint32_t flowid, struct nhop4_extended *pnh4);
void fib4_free_nh_ext(uint32_t fibnum, struct nhop4_extended *pnh4);
+struct nhop_object *fib4_lookup(uint32_t fibnum, struct in_addr dst,
+ uint32_t scopeid, uint32_t flags, uint32_t flowid);
+int fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
+ uint32_t flags, const struct ifnet *src_if);
#endif
diff --git a/sys/netinet/in_rmx.c b/sys/netinet/in_rmx.c
index 8ea777b90158..eeb7760c5ccb 100644
--- a/sys/netinet/in_rmx.c
+++ b/sys/netinet/in_rmx.c
@@ -43,6 +43,8 @@ __FBSDID("$FreeBSD$");
#include <net/if_var.h>
#include <net/route.h>
#include <net/route_var.h>
+#include <net/route/nhop.h>
+#include <net/route/shared.h>
#include <net/vnet.h>
#include <netinet/in.h>
@@ -56,6 +58,67 @@ extern int in_inithead(void **head, int off, u_int fibnum);
extern int in_detachhead(void **head, int off);
#endif
+/*
+ * Pre-insertion hook for IPv4 routes (installed as rnh_preadd).
+ * Finalizes nexthop @nh of the route @addr/@mask:
+ *  - marks host routes to a broadcast address as broadcast
+ *  - inherits the interface MTU if unset, clamps it if too large
+ *  - flags the nexthop of the default route with NHF_DEFAULT
+ *  - assigns the basic IPv4 nexthop type if no type has been set yet
+ *
+ * @mask may be NULL (no netmask supplied); such a route is never
+ * treated as the default route.
+ *
+ * Always returns 0.
+ */
+static int
+rib4_preadd(u_int fibnum, const struct sockaddr *addr, const struct sockaddr *mask,
+    struct nhop_object *nh)
+{
+	const struct sockaddr_in *addr4 = (const struct sockaddr_in *)addr;
+	const struct sockaddr_in *mask4 = (const struct sockaddr_in *)mask;
+	uint16_t nh_type;
+	int rt_flags;
+
+	/* XXX: RTF_LOCAL && RTF_MULTICAST */
+
+	rt_flags = nhop_get_rtflags(nh);
+
+	if (rt_flags & RTF_HOST) {
+
+		/*
+		 * Backward compatibility:
+		 * if the destination is broadcast,
+		 * mark route as broadcast.
+		 * This behavior was useful when route cloning
+		 * was in place, so there was an explicit cloned
+		 * route for every broadcasted address.
+		 * Currently (2020-04) there is no kernel machinery
+		 * to do route cloning, though someone might explicitly
+		 * add these routes to support some cases with active-active
+		 * load balancing. Given that, retain this support.
+		 */
+		if (in_broadcast(addr4->sin_addr, nh->nh_ifp)) {
+			rt_flags |= RTF_BROADCAST;
+			nhop_set_rtflags(nh, rt_flags);
+			nh->nh_flags |= NHF_BROADCAST;
+		}
+	}
+
+	/*
+	 * Check route MTU:
+	 * inherit interface MTU if not set or
+	 * check if MTU is too large.
+	 */
+	if (nh->nh_mtu == 0 || nh->nh_mtu > nh->nh_ifp->if_mtu)
+		nh->nh_mtu = nh->nh_ifp->if_mtu;
+
+	/*
+	 * Ensure that default route nhop has special flag.
+	 * The netmask is optional, so it has to be checked for NULL
+	 * before being looked at.
+	 */
+	if ((rt_flags & RTF_HOST) == 0 && mask4 != NULL &&
+	    mask4->sin_addr.s_addr == 0)
+		nh->nh_flags |= NHF_DEFAULT;
+
+	/* Set nhop type to basic per-AF nhop */
+	if (nhop_get_type(nh) == 0) {
+		if (nh->nh_flags & NHF_GATEWAY)
+			nh_type = NH_TYPE_IPV4_ETHER_NHOP;
+		else
+			nh_type = NH_TYPE_IPV4_ETHER_RSLV;
+
+		nhop_set_type(nh, nh_type);
+	}
+
+	return (0);
+}
+
/*
* Do what we need to do when inserting a route.
*/
@@ -126,6 +189,7 @@ in_inithead(void **head, int off, u_int fibnum)
if (rh == NULL)
return (0);
+ rh->rnh_preadd = rib4_preadd;
rh->rnh_addaddr = in_addroute;
#ifdef RADIX_MPATH
rt_mpath_init_rnh(rh);
diff --git a/sys/netinet6/in6_fib.c b/sys/netinet6/in6_fib.c
index ae4beab3b5ce..b3effb2b422e 100644
--- a/sys/netinet6/in6_fib.c
+++ b/sys/netinet6/in6_fib.c
@@ -50,6 +50,8 @@ __FBSDID("$FreeBSD$");
#include <net/if_dl.h>
#include <net/route.h>
#include <net/route_var.h>
+#include <net/route/nhop.h>
+#include <net/route/shared.h>
#include <net/vnet.h>
#ifdef RADIX_MPATH
@@ -68,94 +70,63 @@ __FBSDID("$FreeBSD$");
#include <net/if_types.h>
#ifdef INET6
-static void fib6_rte_to_nh_extended(struct rtentry *rte,
+static void fib6_rte_to_nh_extended(const struct nhop_object *nh,
const struct in6_addr *dst, uint32_t flags, struct nhop6_extended *pnh6);
-static void fib6_rte_to_nh_basic(struct rtentry *rte, const struct in6_addr *dst,
+static void fib6_rte_to_nh_basic(const struct nhop_object *nh, const struct in6_addr *dst,
uint32_t flags, struct nhop6_basic *pnh6);
-static struct ifnet *fib6_get_ifaifp(struct rtentry *rte);
#define RNTORT(p) ((struct rtentry *)(p))
#define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa))
CHK_STRUCT_ROUTE_COMPAT(struct route_in6, ro_dst);
-/*
- * Gets real interface for the @rte.
- * Returns rt_ifp for !IFF_LOOPBACK routers.
- * Extracts "real" address interface from interface address
- * loopback routes.
- */
-static struct ifnet *
-fib6_get_ifaifp(struct rtentry *rte)
-{
- struct ifnet *ifp;
- struct sockaddr_dl *sdl;
-
- ifp = rte->rt_ifp;
- if ((ifp->if_flags & IFF_LOOPBACK) &&
- rte->rt_gateway->sa_family == AF_LINK) {
- sdl = (struct sockaddr_dl *)rte->rt_gateway;
- return (ifnet_byindex(sdl->sdl_index));
- }
- return (ifp);
-}
static void
-fib6_rte_to_nh_basic(struct rtentry *rte, const struct in6_addr *dst,
+fib6_rte_to_nh_basic(const struct nhop_object *nh, const struct in6_addr *dst,
uint32_t flags, struct nhop6_basic *pnh6)
{
- struct sockaddr_in6 *gw;
/* Do explicit nexthop zero unless we're copying it */
memset(pnh6, 0, sizeof(*pnh6));
if ((flags & NHR_IFAIF) != 0)
- pnh6->nh_ifp = fib6_get_ifaifp(rte);
+ pnh6->nh_ifp = nh->nh_aifp;
else
- pnh6->nh_ifp = rte->rt_ifp;
+ pnh6->nh_ifp = nh->nh_ifp;
- pnh6->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp));
- if (rte->rt_flags & RTF_GATEWAY) {
+ pnh6->nh_mtu = nh->nh_mtu;
+ if (nh->nh_flags & NHF_GATEWAY) {
/* Return address with embedded scope. */
- gw = (struct sockaddr_in6 *)rte->rt_gateway;
- pnh6->nh_addr = gw->sin6_addr;
+ pnh6->nh_addr = nh->gw6_sa.sin6_addr;
} else
pnh6->nh_addr = *dst;
/* Set flags */
- pnh6->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
- gw = (struct sockaddr_in6 *)rt_key(rte);
- if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr))
- pnh6->nh_flags |= NHF_DEFAULT;
+ pnh6->nh_flags = nh->nh_flags;
}
static void
-fib6_rte_to_nh_extended(struct rtentry *rte, const struct in6_addr *dst,
+fib6_rte_to_nh_extended(const struct nhop_object *nh, const struct in6_addr *dst,
uint32_t flags, struct nhop6_extended *pnh6)
{
- struct sockaddr_in6 *gw;
/* Do explicit nexthop zero unless we're copying it */
memset(pnh6, 0, sizeof(*pnh6));
if ((flags & NHR_IFAIF) != 0)
- pnh6->nh_ifp = fib6_get_ifaifp(rte);
+ pnh6->nh_ifp = nh->nh_aifp;
else
- pnh6->nh_ifp = rte->rt_ifp;
+ pnh6->nh_ifp = nh->nh_ifp;
- pnh6->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp));
- if (rte->rt_flags & RTF_GATEWAY) {
+ pnh6->nh_mtu = nh->nh_mtu;
+ if (nh->nh_flags & NHF_GATEWAY) {
/* Return address with embedded scope. */
- gw = (struct sockaddr_in6 *)rte->rt_gateway;
- pnh6->nh_addr = gw->sin6_addr;
+ pnh6->nh_addr = nh->gw6_sa.sin6_addr;
} else
pnh6->nh_addr = *dst;
/* Set flags */
- pnh6->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
- gw = (struct sockaddr_in6 *)rt_key(rte);
- if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr))
- pnh6->nh_flags |= NHF_DEFAULT;
- pnh6->nh_ia = ifatoia6(rte->rt_ifa);
+ pnh6->nh_flags = nh->nh_flags;
+ pnh6->nh_ia = ifatoia6(nh->nh_ifa);
}
/*
@@ -180,7 +151,7 @@ fib6_lookup_nh_basic(uint32_t fibnum, const struct in6_addr *dst, uint32_t scope
struct rib_head *rh;
struct radix_node *rn;
struct sockaddr_in6 sin6;
- struct rtentry *rte;
+ struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_basic: bad fibnum"));
rh = rt_tables_get_rnh(fibnum, AF_INET6);
@@ -198,10 +169,10 @@ fib6_lookup_nh_basic(uint32_t fibnum, const struct in6_addr *dst, uint32_t scope
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rte = RNTORT(rn);
+ nh = RNTORT(rn)->rt_nhop;
/* Ensure route & ifp is UP */
- if (RT_LINK_IS_UP(rte->rt_ifp)) {
- fib6_rte_to_nh_basic(rte, &sin6.sin6_addr, flags, pnh6);
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
+ fib6_rte_to_nh_basic(nh, &sin6.sin6_addr, flags, pnh6);
RIB_RUNLOCK(rh);
return (0);
}
@@ -231,6 +202,7 @@ fib6_lookup_nh_ext(uint32_t fibnum, const struct in6_addr *dst,uint32_t scopeid,
struct radix_node *rn;
struct sockaddr_in6 sin6;
struct rtentry *rte;
+ struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_ext: bad fibnum"));
rh = rt_tables_get_rnh(fibnum, AF_INET6);
@@ -256,9 +228,10 @@ fib6_lookup_nh_ext(uint32_t fibnum, const struct in6_addr *dst,uint32_t scopeid,
return (ENOENT);
}
#endif
+ nh = rte->rt_nhop;
/* Ensure route & ifp is UP */
- if (RT_LINK_IS_UP(rte->rt_ifp)) {
- fib6_rte_to_nh_extended(rte, &sin6.sin6_addr, flags,
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
+ fib6_rte_to_nh_extended(nh, &sin6.sin6_addr, flags,
pnh6);
if ((flags & NHR_REF) != 0) {
/* TODO: Do lwref on egress ifp's */
@@ -279,5 +252,145 @@ fib6_free_nh_ext(uint32_t fibnum, struct nhop6_extended *pnh6)
}
+/*
+ * Finds the nexthop used to reach @dst6 in routing table @fibnum.
+ * @dst6 is expected without an embedded scope; for link-local
+ * destinations the low 16 bits of @scopeid are embedded into the key.
+ * With RADIX_MPATH, @flowid picks one of the multipath routes.
+ *
+ * Returns the nexthop of the matched route when its interface passes
+ * RT_LINK_IS_UP(), NULL otherwise.  A failed lookup bumps rts_unreach;
+ * a missing routing table returns NULL without touching the counter.
+ * The returned nexthop is safe to use within the current network epoch.
+ * Pass NHR_REF in @flags to get a referenced nexthop when a longer
+ * lifetime is required.
+ */
+struct nhop_object *
+fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6,
+    uint32_t scopeid, uint32_t flags, uint32_t flowid)
+{
+	RIB_RLOCK_TRACKER;
+	struct sockaddr_in6 key;
+	struct rib_head *rnh;
+	struct radix_node *node;
+	struct rtentry *rte;
+	struct nhop_object *result;
+
+	KASSERT((fibnum < rt_numfibs), ("fib6_lookup: bad fibnum"));
+	rnh = rt_tables_get_rnh(fibnum, AF_INET6);
+	if (rnh == NULL)
+		return (NULL);
+
+	/* TODO: radix changes */
+	/* Build the sockaddr-shaped lookup key */
+	memset(&key, 0, sizeof(key));
+	key.sin6_addr = *dst6;
+	key.sin6_len = sizeof(struct sockaddr_in6);
+	if (IN6_IS_SCOPE_LINKLOCAL(dst6)) {
+		/* The scope id is assumed valid and embedded as-is */
+		key.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff);
+	}
+
+	result = NULL;
+	RIB_RLOCK(rnh);
+	node = rnh->rnh_matchaddr((void *)&key, &rnh->head);
+	if (node != NULL && (node->rn_flags & RNF_ROOT) == 0) {
+		rte = RNTORT(node);
+#ifdef RADIX_MPATH
+		if (rt_mpath_next(rte) != NULL)
+			rte = rt_mpath_selectrte(rte, flowid);
+#endif
+		/* Only hand out nexthops whose interface link is up */
+		if (RT_LINK_IS_UP(rte->rt_nhop->nh_ifp)) {
+			result = rte->rt_nhop;
+			if (flags & NHR_REF)
+				nhop_ref_object(result);
+		}
+	}
+	RIB_RUNLOCK(rnh);
+
+	if (result == NULL)
+		RTSTAT_INC(rts_unreach);
+	return (result);
+}
+
+/*
+ * Tests a single nexthop against the uRPF conditions.
+ * With a non-NULL @src_if the nexthop matches only if its nh_aifp is
+ * that interface.  With a NULL @src_if every nexthop matches, except
+ * that nexthops flagged NHF_DEFAULT are rejected when NHR_NODEFAULT is
+ * set in @flags.
+ *
+ * Returns 1 on match, 0 otherwise.
+ */
+inline static int
+check_urpf(const struct nhop_object *nh, uint32_t flags,
+    const struct ifnet *src_if)
+{
+
+	if (src_if != NULL)
+		return (nh->nh_aifp == src_if ? 1 : 0);
+
+	/* No interface constraint: only the default-route filter applies */
+	if ((flags & NHR_NODEFAULT) != 0 && (nh->nh_flags & NHF_DEFAULT) != 0)
+		return (0);
+
+	return (1);
+}
+
+#ifdef RADIX_MPATH
+/*
+ * Walks the multipath chain starting at @rt (via rt_mpath_next()) and
+ * returns 1 as soon as the nexthop of one route satisfies check_urpf(),
+ * 0 if none does.
+ */
+inline static int
+check_urpf_mpath(struct rtentry *rt, uint32_t flags,
+    const struct ifnet *src_if)
+{
+	struct rtentry *cur;
+
+	for (cur = rt; cur != NULL; cur = rt_mpath_next(cur)) {
+		if (check_urpf(cur->rt_nhop, flags, src_if) != 0)
+			return (1);
+	}
+
+	return (0);
+}
+#endif
+
+/*
+ * Performs reverse path forwarding lookup for @dst6 in fib @fibnum.
+ * Assumes scope is deembedded and provided in @scopeid.
+ * If @src_if is non-NULL, verifies that at least 1 path goes via
+ * this interface.
+ * If @src_if is NULL, verifies that a route exists.
+ * If @flags contains NHR_NODEFAULT, do not consider the default route
+ * (only evaluated when @src_if is NULL, see check_urpf()).
+ *
+ * Returns 1 if a route matching the conditions is found, 0 otherwise
+ * (including when the routing table for @fibnum does not exist).
+ */
+int
+fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6,
+    uint32_t scopeid, uint32_t flags, const struct ifnet *src_if)
+{
+	RIB_RLOCK_TRACKER;
+	struct rib_head *rh;
+	struct radix_node *rn;
+	struct rtentry *rt;
+	struct sockaddr_in6 sin6;
+	int ret;
+
+	KASSERT((fibnum < rt_numfibs), ("fib6_check_urpf: bad fibnum"));
+	rh = rt_tables_get_rnh(fibnum, AF_INET6);
+	if (rh == NULL)
+		return (0);
+
+	/*
+	 * Prepare lookup key.  The table is keyed by struct sockaddr_in6
+	 * with sin6_len set, exactly as fib6_lookup() builds it; passing a
+	 * bare struct in6_addr would make the lookup read the first address
+	 * byte as the key length.
+	 */
+	memset(&sin6, 0, sizeof(sin6));
+	sin6.sin6_len = sizeof(struct sockaddr_in6);
+	sin6.sin6_addr = *dst6;
+
+	/* Assume scopeid is valid and embed it directly */
+	if (IN6_IS_SCOPE_LINKLOCAL(dst6))
+		sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff);
+
+	ret = 0;
+	RIB_RLOCK(rh);
+	rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
+	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+		rt = RNTORT(rn);
+#ifdef RADIX_MPATH
+		ret = check_urpf_mpath(rt, flags, src_if);
+#else
+		ret = check_urpf(rt->rt_nhop, flags, src_if);
+#endif
+	}
+	RIB_RUNLOCK(rh);
+
+	return (ret);
+}
+
#endif
diff --git a/sys/netinet6/in6_fib.h b/sys/netinet6/in6_fib.h
index fa07a5ce9a3e..bf8d367309cc 100644
--- a/sys/netinet6/in6_fib.h
+++ b/sys/netinet6/in6_fib.h
@@ -58,5 +58,11 @@ int fib6_lookup_nh_ext(uint32_t fibnum, const struct in6_addr *dst,
uint32_t scopeid, uint32_t flags, uint32_t flowid,
struct nhop6_extended *pnh6);
void fib6_free_nh_ext(uint32_t fibnum, struct nhop6_extended *pnh6);
+
+struct nhop_object *fib6_lookup(uint32_t fibnum,
+ const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags,
+ uint32_t flowid);
+int fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6,
+ uint32_t scopeid, uint32_t flags, const struct ifnet *src_if);
#endif
diff --git a/sys/netinet6/in6_rmx.c b/sys/netinet6/in6_rmx.c
index 35756cf95868..7f10b290309b 100644
--- a/sys/netinet6/in6_rmx.c
+++ b/sys/netinet6/in6_rmx.c
@@ -82,6 +82,8 @@ __FBSDID("$FreeBSD$");
#include <net/if_var.h>
#include <net/route.h>
#include <net/route_var.h>
+#include <net/route/nhop.h>
+#include <net/route/shared.h>
#include <netinet/in.h>
#include <netinet/ip_var.h>
@@ -103,6 +105,43 @@ extern int in6_inithead(void **head, int off, u_int fibnum);
extern int in6_detachhead(void **head, int off);
#endif
+/*
+ * Pre-insertion hook for IPv6 routes (installed as rnh_preadd by
+ * in6_inithead()).  Normalizes the nexthop @nh before the route is added:
+ * clamps its MTU to the link MTU, marks default-route nexthops with
+ * NHF_DEFAULT and assigns a nexthop type if none is set yet.
+ * @fibnum and @addr are not used here.
+ *
+ * Always returns 0.
+ */
+static int
+rib6_preadd(u_int fibnum, const struct sockaddr *addr, const struct sockaddr *mask,
+    struct nhop_object *nh)
+{
+	const struct sockaddr_in6 *mask6;
+	uint16_t nh_type;
+
+	/* XXX: RTF_LOCAL */
+
+	/* An unset or oversized route MTU falls back to the interface MTU */
+	if (nh->nh_mtu == 0 || nh->nh_mtu > IN6_LINKMTU(nh->nh_ifp))
+		nh->nh_mtu = IN6_LINKMTU(nh->nh_ifp);
+
+	/* A non-host route with an all-zero mask is the default route */
+	mask6 = (const struct sockaddr_in6 *)mask;
+	if ((nhop_get_rtflags(nh) & RTF_HOST) == 0 &&
+	    IN6_IS_ADDR_UNSPECIFIED(&mask6->sin6_addr))
+		nh->nh_flags |= NHF_DEFAULT;
+
+	/* Pick the nexthop type only when the caller did not set one */
+	if (nhop_get_type(nh) == 0) {
+		nh_type = (nh->nh_flags & NHF_GATEWAY) ?
+		    NH_TYPE_IPV6_ETHER_NHOP : NH_TYPE_IPV6_ETHER_RSLV;
+		nhop_set_type(nh, nh_type);
+	}
+
+	return (0);
+}
+
/*
* Do what we need to do when inserting a route.
*/
@@ -169,6 +208,7 @@ in6_inithead(void **head, int off, u_int fibnum)
return (0);
rh->rnh_addaddr = in6_addroute;
+ rh->rnh_preadd = rib6_preadd;
#ifdef RADIX_MPATH
rt_mpath_init_rnh(rh);
#endif
diff --git a/sys/sys/socket.h b/sys/sys/socket.h
index eaad9b1bacdb..1768480cc8c8 100644
--- a/sys/sys/socket.h
+++ b/sys/sys/socket.h
@@ -416,6 +416,7 @@ struct sockproto {
#define NET_RT_IFMALIST 4 /* return multicast address list */
#define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en
* versions of msghdr structs. */
+#define NET_RT_NHOP 6 /* dump routing nexthops */
#endif /* __BSD_VISIBLE */
/*
diff --git a/usr.bin/netstat/Makefile b/usr.bin/netstat/Makefile
index 0e60b0b40359..b61afdc410b0 100644
--- a/usr.bin/netstat/Makefile
+++ b/usr.bin/netstat/Makefile
@@ -5,7 +5,7 @@
PROG= netstat
SRCS= if.c inet.c main.c mbuf.c mroute.c netisr.c nl_symbols.c route.c \
- unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c \
+ unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c common.c nhops.c \
nl_defs.h
nl_symbols.c: nlist_symbols
diff --git a/usr.bin/netstat/common.c b/usr.bin/netstat/common.c
new file mode 100644
index 000000000000..ac721b3e9ab0
--- /dev/null
+++ b/usr.bin/netstat/common.c
@@ -0,0 +1,140 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1983, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/time.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <arpa/inet.h>
+#include <ifaddrs.h>
+#include <libutil.h>
+#include <netdb.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+#include <err.h>
+#include <libxo/xo.h>
+#include "netstat.h"
+#include "common.h"
+
+/*
+ * Renders the flags word @f in compact form: for every entry of the
+ * zero-terminated table @p whose b_mask intersects @f, the entry's b_val
+ * character is appended, in table order.
+ *
+ * Returns a pointer to a static buffer that is overwritten by the next
+ * call.  Output is capped at sizeof(name) - 1 characters so that an
+ * oversized table or overlapping masks cannot overrun the buffer.
+ */
+const char *
+fmt_flags(const struct bits *p, int f)
+{
+	static char name[33];
+	char *flags = name;
+	char *const end = name + sizeof(name) - 1;
+
+	for (; p->b_mask != 0 && flags < end; p++) {
+		if (p->b_mask & f)
+			*flags++ = p->b_val;
+	}
+	*flags = '\0';
+	return (name);
+}
+
+/*
+ * Emits the flags word @flags twice through libxo: first the compact
+ * string from fmt_flags() using the caller-supplied @format, then a list
+ * named @tag_name with one "{le:<tag_name>/%s}" leaf holding the b_name of
+ * each table entry of @pbits whose mask is set in @flags.
+ */
+void
+print_flags_generic(int flags, const struct bits *pbits, const char *format,
+    const char *tag_name)
+{
+	char tag_fmt[64];
+	const struct bits *cur;
+
+	/* Per-flag leaf format, built once for the whole list */
+	snprintf(tag_fmt, sizeof(tag_fmt), "{le:%s/%%s}", tag_name);
+
+	xo_emit(format, fmt_flags(pbits, flags));
+
+	xo_open_list(tag_name);
+	cur = pbits;
+	while (cur->b_mask) {
+		if (cur->b_mask & flags)
+			xo_emit(tag_fmt, cur->b_name);
+		cur++;
+	}
+	xo_close_list(tag_name);
+}
+
+/*
+ * Builds an ifindex -> interface name table from getifaddrs().
+ * Only AF_LINK addresses are considered; the first name seen for an
+ * index wins.  The table grows in steps of 32 entries and unused slots
+ * are zero-filled (empty ifname).
+ *
+ * Returns the allocated table (NULL if no AF_LINK address was found) and
+ * stores its entry count in @pifmap_size.  The process exits on
+ * getifaddrs() or allocation failure.  The caller owns the table.
+ */
+struct ifmap_entry *
+prepare_ifmap(size_t *pifmap_size)
+{
+	int ifindex = 0, size;
+	struct ifaddrs *ifap, *ifa;
+	struct sockaddr_dl *sdl;
+
+	struct ifmap_entry *ifmap = NULL;
+	int ifmap_size = 0;
+
+	/*
+	 * Retrieve interface list at first
+	 * since we need #ifindex -> if_xname match
+	 */
+	if (getifaddrs(&ifap) != 0)
+		err(EX_OSERR, "getifaddrs");
+
+	for (ifa = ifap; ifa; ifa = ifa->ifa_next) {
+
+		/* getifaddrs() may report entries that carry no address */
+		if (ifa->ifa_addr == NULL ||
+		    ifa->ifa_addr->sa_family != AF_LINK)
+			continue;
+
+		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
+		ifindex = sdl->sdl_index;
+
+		if (ifindex >= ifmap_size) {
+			size = roundup(ifindex + 1, 32) *
+			    sizeof(struct ifmap_entry);
+			if ((ifmap = realloc(ifmap, size)) == NULL)
+				errx(2, "realloc(%d) failed", size);
+			/* Zero only the newly added tail of the table */
+			memset(&ifmap[ifmap_size], 0,
+			    size - ifmap_size *
+			    sizeof(struct ifmap_entry));
+
+			ifmap_size = roundup(ifindex + 1, 32);
+		}
+
+		/* Keep the first name recorded for this index */
+		if (*ifmap[ifindex].ifname != '\0')
+			continue;
+
+		strlcpy(ifmap[ifindex].ifname, ifa->ifa_name, IFNAMSIZ);
+	}
+
+	freeifaddrs(ifap);
+
+	*pifmap_size = ifmap_size;
+
+	return (ifmap);
+}
+
diff --git a/usr.bin/netstat/common.h b/usr.bin/netstat/common.h
new file mode 100644
index 000000000000..aafa45df8936
--- /dev/null
+++ b/usr.bin/netstat/common.h
@@ -0,0 +1,58 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1992, 1993
+ * Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)netstat.h 8.2 (Berkeley) 1/4/94
+ * $FreeBSD$
+ */
+
+#ifndef _NETSTAT_COMMON_H_
+#define _NETSTAT_COMMON_H_
+
+/*
+ * One row of a flag-description table.  Tables are terminated by an entry
+ * whose b_mask is 0; fmt_flags() and print_flags_generic() stop there.
+ */
+struct bits {
+ u_long b_mask; /* flag bit(s) tested against the flags word */
+ char b_val; /* character appended to the compact flags string */
+ const char *b_name; /* long name emitted in the per-flag list */
+};
+extern struct bits rt_bits[];
+
+const char *fmt_flags(const struct bits *p, int f);
+void print_flags_generic(int flags, const struct bits *pbits,
+ const char *format, const char *tag_name);
+int print_sockaddr(const char *name, struct sockaddr *sa,
+ struct sockaddr *mask, int flags, int width);
+
+/*
+ * Interface name slot.  prepare_ifmap() returns an array of these indexed
+ * by the link-level interface index; an empty ifname marks an unused slot.
+ */
+struct ifmap_entry {
+ char ifname[IFNAMSIZ];
+};
+
+struct ifmap_entry *prepare_ifmap(size_t *ifmap_size);
+
+#endif
+
diff --git a/usr.bin/netstat/main.c b/usr.bin/netstat/main.c
index 03dceab993cf..329c551cfc5d 100644
--- a/usr.bin/netstat/main.c
+++ b/usr.bin/netstat/main.c
@@ -214,6 +214,7 @@ int mflag; /* show memory stats */
int noutputs = 0; /* how much outputs before we exit */
int numeric_addr; /* show addresses numerically */
int numeric_port; /* show ports numerically */
+int oflag; /* show nexthop objects*/
int Pflag; /* show TCP log ID */
static int pflag; /* show given protocol */
static int Qflag; /* show netisr information */
@@ -248,7 +249,7 @@ main(int argc, char *argv[])
if (argc < 0)
exit(EXIT_FAILURE);
- while ((ch = getopt(argc, argv, "46AaBbdF:f:ghI:iLlM:mN:nPp:Qq:RrSTsuWw:xz"))
+ while ((ch = getopt(argc, argv, "46AaBbdF:f:ghI:iLlM:mN:noPp:Qq:RrSTsuWw:xz"))
!= -1)
switch(ch) {
case '4':
@@ -345,6 +346,9 @@ main(int argc, char *argv[])
case 'n':
numeric_addr = numeric_port = 1;
break;
+ case 'o':
+ oflag = 1;
+ break;
case 'P':
Pflag = 1;
break;
@@ -494,6 +498,14 @@ main(int argc, char *argv[])
xo_finish();
exit(0);
}
+ if (oflag) {
+ xo_open_container("statistics");
+ nhops_print(fib, af);
+ xo_close_container("statistics");
+ xo_finish();
+ exit(0);
+ }
+
if (gflag) {
xo_open_container("statistics");
diff --git a/usr.bin/netstat/netstat.h b/usr.bin/netstat/netstat.h
index 5f35ff097851..713608431a12 100644
--- a/usr.bin/netstat/netstat.h
+++ b/usr.bin/netstat/netstat.h
@@ -147,6 +147,10 @@ void rt_stats(void);
char *routename(struct sockaddr *, int);
const char *netname(struct sockaddr *, struct sockaddr *);
void routepr(int, int);
+int p_sockaddr(const char *name, struct sockaddr *sa,
+ struct sockaddr *mask, int flags, int width);
+const char *fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask,
+ int flags);
#ifdef NETGRAPH
void netgraphprotopr(u_long, const char *, int, int);
@@ -157,3 +161,4 @@ void unixpr(u_long, u_long, u_long, u_long, u_long, bool *);
void mroutepr(void);
void mrt_stats(void);
void bpf_stats(char *);
+void nhops_print(int fibnum, int af);
diff --git a/usr.bin/netstat/nhops.c b/usr.bin/netstat/nhops.c
new file mode 100644
index 000000000000..d62eb7290f5c
--- /dev/null
+++ b/usr.bin/netstat/nhops.c
@@ -0,0 +1,472 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1983, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/time.h>
+
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+
+#include <netinet/in.h>
+#include <netgraph/ng_socket.h>
+
+#include <arpa/inet.h>
+#include <ifaddrs.h>
+#include <libutil.h>
+#include <netdb.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+#include <err.h>
+#include <libxo/xo.h>
+#include "netstat.h"
+#include "common.h"
+
+/* column widths; each followed by one space */
+#ifndef INET6
+#define WID_DST_DEFAULT(af) 18 /* width of destination column */
+#define WID_GW_DEFAULT(af) 18 /* width of gateway column */
+#define WID_IF_DEFAULT(af) (Wflag ? 10 : 8) /* width of netif column */
+#else
+#define WID_DST_DEFAULT(af) \
+ ((af) == AF_INET6 ? (numeric_addr ? 33: 18) : 18)
+#define WID_GW_DEFAULT(af) \
+ ((af) == AF_INET6 ? (numeric_addr ? 29 : 18) : 18)
+#define WID_IF_DEFAULT(af) ((af) == AF_INET6 ? 8 : (Wflag ? 10 : 8))
+#endif /*INET6*/
+/*
+ * Effective column widths for the nexthop table.  All of them except
+ * wid_prepend are assigned in print_nhops_sysctl() before the header is
+ * printed; wid_prepend is never assigned in this file and therefore
+ * stays 0.
+ */
+static int wid_dst;
+static int wid_gw;
+static int wid_flags;
+static int wid_pksent;
+static int wid_mtu;
+static int wid_if;
+static int wid_nhidx;
+static int wid_nhtype;
+static int wid_refcnt;
+static int wid_prepend;
+
+/*
+ * Nexthop flag table: NHF_* mask, one-character code, long name.
+ * Terminated by the all-zero entry.  Consumed by p_nhflags().
+ */
+static struct bits nh_bits[] = {
+ { NHF_REJECT, 'R', "reject" },
+ { NHF_BLACKHOLE,'B', "blackhole" },
+ { NHF_REDIRECT, 'r', "redirect" },
+ { NHF_GATEWAY, 'G', "gateway" },
+ { NHF_DEFAULT, 'd', "default" },
+ { NHF_BROADCAST,'b', "broadcast" },
+ { 0 , 0, NULL }
+};
+
+/*
+ * Display names for nexthop types, indexed by the nh_type value of a
+ * struct nhop_external (see print_nhop_entry_sysctl()).  Five entries,
+ * covering values 0 through 4.
+ */
+static char *nh_types[] = {
+ "empty", /* 0 */
+ "v4/resolve", /* 1 */
+ "v4/gw",
+ "v6/resolve",
+ "v6/gw"
+};
+
+/* Cached printable gateway and interface name of one nexthop. */
+struct nhop_entry {
+ char gw[64];
+ char ifname[IFNAMSIZ];
+};
+
+/*
+ * Growable array of nhop_entry indexed by nexthop index.
+ * Filled by nhop_map_update() and queried by nhop_get().
+ */
+struct nhop_map {
+ struct nhop_entry *ptr; /* entry array, reallocated on growth */
+ size_t size; /* number of entries allocated in ptr */
+};
+/* Map updated for every nexthop printed by print_nhop_entry_sysctl(). */
+static struct nhop_map global_nhop_map;
+
+static void nhop_map_update(struct nhop_map *map, uint32_t idx,
+ char *gw, char *ifname);
+static struct nhop_entry *nhop_get(struct nhop_map *map, uint32_t idx);
+
+
+static struct ifmap_entry *ifmap;
+static size_t ifmap_size;
+
+/*
+ * Formats the address held in @sa into @buf (at most @bufsize bytes).
+ * AF_INET and AF_INET6 addresses are rendered numerically with
+ * inet_ntop(); any other family yields "unknown:<family>".
+ * If inet_ntop() fails (for example because @buf is too small) the
+ * buffer is set to "invalid" instead of being left indeterminate.
+ */
+static void
+print_sockaddr_buf(char *buf, size_t bufsize, const struct sockaddr *sa)
+{
+	const void *addr;
+
+	switch (sa->sa_family) {
+	case AF_INET:
+		addr = &((const struct sockaddr_in *)sa)->sin_addr;
+		break;
+	case AF_INET6:
+		addr = &((const struct sockaddr_in6 *)sa)->sin6_addr;
+		break;
+	default:
+		snprintf(buf, bufsize, "unknown:%d", sa->sa_family);
+		return;
+	}
+
+	if (inet_ntop(sa->sa_family, addr, buf, bufsize) == NULL)
+		snprintf(buf, bufsize, "invalid");
+}
+
+/*
+ * Emits the string @addr as libxo field @name.
+ * A negative @width prints the value unpadded.  Otherwise the value is
+ * padded to @width columns; it is additionally truncated to @width
+ * unless Wflag or numeric_addr is set.
+ *
+ * Returns the number of characters by which an untruncated value
+ * exceeded @width, 0 in every other case.
+ */
+static int
+print_addr(const char *name, const char *addr, int width)
+{
+	char fmt[128];
+	int overflow;
+
+	if (width < 0) {
+		snprintf(fmt, sizeof(fmt), "{:%s/%%s} ", name);
+		xo_emit(fmt, addr);
+		return (0);
+	}
+
+	if (Wflag == 0 && !numeric_addr) {
+		/* Narrow output: cut the value at the column width */
+		snprintf(fmt, sizeof(fmt), "{[:%d}{:%s/%%-.*s}{]:} ",
+		    -width, name);
+		xo_emit(fmt, width, addr);
+		return (0);
+	}
+
+	/* Wide or numeric output: never truncate, report the overshoot */
+	snprintf(fmt, sizeof(fmt), "{[:%d}{:%s/%%s}{]:} ",
+	    -width, name);
+	xo_emit(fmt, addr);
+	overflow = strlen(addr) - width;
+	return (overflow < 0 ? 0 : overflow);
+}
+
+
+/*
+ * Prints the column titles of the nexthop table.
+ * With Wflag the full set is shown (Idx, Type, IFA, Gateway, Flags, Use,
+ * Mtu, Netif, Addrif, Refcnt, Prepend); otherwise only Idx, IFA, Gateway,
+ * Flags, Netif and Refcnt.  Each "%*.*s" title consumes two width
+ * arguments (field width and precision) followed by the title string.
+ * @af1 is unused.
+ */
+static void
+print_nhop_header(int af1 __unused)
+{
+
+ if (Wflag) {
+ xo_emit("{T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%*.*s} "
+ "{T:/%*.*s} {T:/%-*.*s} {T:/%*.*s} {T:/%*.*s} {T:/%*.*s} {T:/%*s}\n",
+ wid_nhidx, wid_nhidx, "Idx",
+ wid_nhtype, wid_nhtype, "Type",
+ wid_dst, wid_dst, "IFA",
+ wid_gw, wid_gw, "Gateway",
+ wid_flags, wid_flags, "Flags",
+ wid_pksent, wid_pksent, "Use",
+ wid_mtu, wid_mtu, "Mtu",
+ wid_if, wid_if, "Netif",
+ wid_if, wid_if, "Addrif",
+ wid_refcnt, wid_refcnt, "Refcnt",
+ wid_prepend, "Prepend");
+ } else {
+ /*
+  * NOTE(review): the last title below is "Refcnt" but is sized by
+  * wid_prepend rather than wid_refcnt - confirm whether intended.
+  */
+ xo_emit("{T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%*.*s} "
+ " {T:/%*s}\n",
+ wid_nhidx, wid_nhidx, "Idx",
+ wid_dst, wid_dst, "IFA",
+ wid_gw, wid_gw, "Gateway",
+ wid_flags, wid_flags, "Flags",
+ wid_if, wid_if, "Netif",
+ wid_prepend, "Refcnt");
+ }
+}
+
+/*
+ * Records the printable gateway @gw and interface name @ifname for
+ * nexthop index @idx in @map, growing the map first if @idx is out of
+ * range.  The map starts at 32 entries and doubles; if doubling is still
+ * too small it is sized to @idx + 1 rounded up to a multiple of 32.
+ * Newly added entries are zero-filled.  Exits the process if the
+ * reallocation fails.
+ */
+static void
+nhop_map_update(struct nhop_map *map, uint32_t idx, char *gw, char *ifname)
+{
+	if (idx >= map->size) {
+		uint32_t new_size;
+		size_t sz;
+		if (map->size == 0)
+			new_size = 32;
+		else
+			new_size = map->size * 2;
+		if (new_size <= idx)
+			new_size = roundup(idx + 1, 32);
+
+		sz = new_size * (sizeof(struct nhop_entry));
+		/* sz is a size_t, so it must be printed with %zu */
+		if ((map->ptr = realloc(map->ptr, sz)) == NULL)
+			errx(2, "realloc(%zu) failed", sz);
+
+		memset(&map->ptr[map->size], 0, (new_size - map->size) * sizeof(struct nhop_entry));
+		map->size = new_size;
+	}
+
+	strlcpy(map->ptr[idx].ifname, ifname, sizeof(map->ptr[idx].ifname));
+	strlcpy(map->ptr[idx].gw, gw, sizeof(map->ptr[idx].gw));
+}
+
+/*
+ * Looks up the cached entry for nexthop index @idx in @map.
+ * Returns NULL when @idx is beyond the map or the slot has an empty
+ * interface name (i.e. was never filled); the entry otherwise.
+ */
+static struct nhop_entry *
+nhop_get(struct nhop_map *map, uint32_t idx)
+{
+	struct nhop_entry *entry;
+
+	if (idx >= map->size)
+		return (NULL);
+
+	entry = &map->ptr[idx];
+	return (entry->ifname[0] != '\0' ? entry : NULL);
+}
+
+/*
+ * Prints one nexthop as libxo instance @name.
+ * @rtm is the routing message carrying the nexthop, @nh the
+ * nhop_external record that follows the message header.  The address
+ * block (struct nhop_addrs) is located nh->nh_len bytes after @nh and
+ * holds the offsets of the gateway and source (IFA) sockaddrs.
+ *
+ * Also records the printable gateway and interface name of the nexthop
+ * in global_nhop_map.
+ */
+static void
+print_nhop_entry_sysctl(const char *name, struct rt_msghdr *rtm, struct nhop_external *nh)
+{
+	char buffer[128];
+	char iface_name[128];
+	int protrusion;
+	char gw_addr[64];
+	struct nhop_addrs *na;
+	struct sockaddr *sa_gw, *sa_ifa;
+
+	xo_open_instance(name);
+
+	snprintf(buffer, sizeof(buffer), "{[:-%d}{:index/%%lu}{]:} ", wid_nhidx);
+	xo_emit(buffer, nh->nh_idx);
+
+	if (Wflag) {
+		const char *cp;
+
+		/* Never index nh_types[] with a type this table lacks */
+		if (nh->nh_type < nitems(nh_types))
+			cp = nh_types[nh->nh_type];
+		else
+			cp = "unknown";
+		xo_emit("{t:type_str/%*s} ", wid_nhtype, cp);
+	}
+
+	/* Resolve the transmit interface name; "---" if the slot is empty */
+	memset(iface_name, 0, sizeof(iface_name));
+	if (nh->ifindex < (uint32_t)ifmap_size) {
+		strlcpy(iface_name, ifmap[nh->ifindex].ifname,
+		    sizeof(iface_name));
+		if (*iface_name == '\0')
+			strlcpy(iface_name, "---", sizeof(iface_name));
+	}
+
+	na = (struct nhop_addrs *)((char *)nh + nh->nh_len);
+	sa_gw = (struct sockaddr *)((char *)na + na->gw_sa_off);
+	sa_ifa = (struct sockaddr *)((char *)na + na->src_sa_off);
+	protrusion = p_sockaddr("ifa", sa_ifa, NULL, RTF_HOST, wid_dst);
+
+	/* Directly connected nexthops show "<ifname>/resolve" as gateway */
+	if (nh->nh_flags & NHF_GATEWAY) {
+		const char *cp;
+		cp = fmt_sockaddr(sa_gw, NULL, RTF_HOST);
+		strlcpy(gw_addr, cp, sizeof(gw_addr));
+	} else
+		snprintf(gw_addr, sizeof(gw_addr), "%s/resolve", iface_name);
+	protrusion = print_addr("gateway", gw_addr, wid_dst - protrusion);
+
+	nhop_map_update(&global_nhop_map, nh->nh_idx, gw_addr, iface_name);
+
+	snprintf(buffer, sizeof(buffer), "{[:-%d}{:flags/%%s}{]:} ",
+	    wid_flags - protrusion);
+
+	/*
+	 * NOTE(review): the flags column shows the route message flags
+	 * (rtm_flags/rt_bits), not nh->nh_flags - confirm this is intended.
+	 */
+	print_flags_generic(rtm->rtm_flags, rt_bits, buffer, "rt_flags_pretty");
+
+	if (Wflag) {
+		xo_emit("{t:use/%*lu} ", wid_pksent, nh->nh_pksent);
+		xo_emit("{t:mtu/%*lu} ", wid_mtu, nh->nh_mtu);
+	}
+
+	if (Wflag)
+		xo_emit("{t:interface-name/%*s}", wid_if, iface_name);
+	else
+		xo_emit("{t:interface-name/%*.*s}", wid_if, wid_if, iface_name);
+
+	/* The address interface is shown only if it differs from ifindex */
+	memset(iface_name, 0, sizeof(iface_name));
+	if (nh->aifindex < (uint32_t)ifmap_size && nh->ifindex != nh->aifindex) {
+		strlcpy(iface_name, ifmap[nh->aifindex].ifname,
+		    sizeof(iface_name));
+		if (*iface_name == '\0')
+			strlcpy(iface_name, "---", sizeof(iface_name));
+	}
+	if (Wflag)
+		xo_emit("{t:address-interface-name/%*s}", wid_if, iface_name);
+
+	xo_emit("{t:refcount/%*lu} ", wid_refcnt, nh->nh_refcount);
+	if (Wflag && nh->prepend_len) {
+		/* Placeholder text; the real prepend data is not decoded */
+		char *prepend_hex = "AABBCCDDEE";
+		xo_emit(" {:nhop-prepend/%*s}", wid_prepend, prepend_hex);
+	}
+
+	xo_emit("\n");
+	xo_close_instance(name);
+}
+
+/*
+ * Sort record used by print_nhops_sysctl(): the nexthop index together
+ * with the routing message it was read from.
+ */
+struct nhops_map {
+ uint32_t idx; /* nexthop index, the sort key of cmp_nh_idx() */
+ struct rt_msghdr *rtm; /* message inside the sysctl dump buffer */
+};
+
+/*
+ * qsort() comparator ordering struct nhops_map records by ascending idx.
+ * Returns -1, 0 or 1.
+ */
+static int
+cmp_nh_idx(const void *_a, const void *_b)
+{
+	const struct nhops_map *lhs = _a;
+	const struct nhops_map *rhs = _b;
+
+	return ((lhs->idx > rhs->idx) - (lhs->idx < rhs->idx));
+}
+
+/*
+ * Dumps the nexthops of fib @fibnum for address family @af via the
+ * NET_RT_NHOP routing sysctl and prints them sorted by nexthop index.
+ * The column widths are derived from the family of the first (lowest
+ * index) nexthop.  Exits the process on sysctl or allocation failure.
+ */
+static void
+print_nhops_sysctl(int fibnum, int af)
+{
+	size_t needed;
+	int mib[7];
+	char *buf, *next, *lim;
+	struct rt_msghdr *rtm;
+	struct nhop_external *nh;
+	int fam;
+	struct nhops_map *nh_map;
+	size_t nh_count, nh_size;
+
+	mib[0] = CTL_NET;
+	mib[1] = PF_ROUTE;
+	mib[2] = 0;
+	mib[3] = af;
+	mib[4] = NET_RT_NHOP;
+	mib[5] = 0;
+	mib[6] = fibnum;
+	/* First call only estimates the buffer size */
+	if (sysctl(mib, nitems(mib), NULL, &needed, NULL, 0) < 0)
+		err(EX_OSERR, "sysctl: net.route.0.%d.nhdump.%d estimate", af,
+		    fibnum);
+	if ((buf = malloc(needed)) == NULL)
+		errx(2, "malloc(%lu)", (unsigned long)needed);
+	if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0)
+		err(1, "sysctl: net.route.0.%d.nhdump.%d", af, fibnum);
+	lim = buf + needed;
+	xo_open_container("nhop-table");
+	xo_open_list("rt-family");
+
+	/*
+	 * Nexthops are received unsorted.  Collect everything first, sort
+	 * and then display sorted.
+	 */
+	nh_count = 0;
+	nh_size = 16;
+	nh_map = calloc(nh_size, sizeof(struct nhops_map));
+	if (nh_map == NULL)
+		errx(2, "calloc(%zu) failed",
+		    nh_size * sizeof(struct nhops_map));
+	for (next = buf; next < lim; next += rtm->rtm_msglen) {
+		rtm = (struct rt_msghdr *)next;
+		if (rtm->rtm_version != RTM_VERSION)
+			continue;
+
+		if (nh_count >= nh_size) {
+			nh_size *= 2;
+			nh_map = realloc(nh_map, nh_size * sizeof(struct nhops_map));
+			if (nh_map == NULL)
+				errx(2, "realloc(%zu) failed",
+				    nh_size * sizeof(struct nhops_map));
+		}
+
+		/* The nexthop record directly follows the message header */
+		nh = (struct nhop_external *)(rtm + 1);
+		nh_map[nh_count].idx = nh->nh_idx;
+		nh_map[nh_count].rtm = rtm;
+		nh_count++;
+	}
+
+	if (nh_count > 0) {
+		qsort(nh_map, nh_count, sizeof(struct nhops_map), cmp_nh_idx);
+		nh = (struct nhop_external *)(nh_map[0].rtm + 1);
+		fam = nh->nh_family;
+
+		wid_dst = WID_GW_DEFAULT(fam);
+		wid_gw = WID_GW_DEFAULT(fam);
+		wid_nhidx = 5;
+		wid_nhtype = 12;
+		wid_refcnt = 6;
+		wid_flags = 6;
+		wid_pksent = 8;
+		wid_mtu = 6;
+		wid_if = WID_IF_DEFAULT(fam);
+		xo_open_instance("rt-family");
+		pr_family(fam);
+		xo_open_list("nh-entry");
+
+		print_nhop_header(fam);
+
+		for (size_t i = 0; i < nh_count; i++) {
+			rtm = nh_map[i].rtm;
+			nh = (struct nhop_external *)(rtm + 1);
+			print_nhop_entry_sysctl("nh-entry", rtm, nh);
+		}
+
+		xo_close_list("nh-entry");
+		xo_close_instance("rt-family");
+	}
+	xo_close_list("rt-family");
+	xo_close_container("nhop-table");
+	/* nh_map entries point into buf; release both once printing is done */
+	free(nh_map);
+	free(buf);
+}
+
+/*
+ * Emits the nexthop flags word @f: the compact string through @format,
+ * followed by the "nh_flags_pretty" list of long flag names taken from
+ * nh_bits.
+ */
+static void
+p_nhflags(int f, const char *format)
+{
+
+	print_flags_generic(f, nh_bits, format, "nh_flags_pretty");
+}
+
+/*
+ * Entry point for the nexthop listing: prints the nexthops of fib
+ * @fibnum for address family @af.  A @fibnum of -1 selects the fib
+ * reported by the net.my_fibnum sysctl (0 if that sysctl fails).
+ * Exits with EX_USAGE if the fib number is outside [0, net.fibs).
+ */
+void
+nhops_print(int fibnum, int af)
+{
+	int numfibs;
+	size_t intsize = sizeof(int);
+
+	if (fibnum == -1) {
+		if (sysctlbyname("net.my_fibnum", &fibnum, &intsize,
+		    NULL, 0) == -1)
+			fibnum = 0;
+	}
+	/* Assume a single fib when the kernel does not report a count */
+	if (sysctlbyname("net.fibs", &numfibs, &intsize, NULL, 0) == -1)
+		numfibs = 1;
+	if (fibnum < 0 || fibnum >= numfibs)
+		errx(EX_USAGE, "%d: invalid fib", fibnum);
+
+	ifmap = prepare_ifmap(&ifmap_size);
+
+	xo_open_container("route-nhop-information");
+	xo_emit("{T:Nexthop data}");
+	if (fibnum != 0)
+		xo_emit(" ({L:fib}: {:fib/%d})", fibnum);
+	xo_emit("\n");
+	print_nhops_sysctl(fibnum, af);
+	xo_close_container("route-nhop-information");
+}
+
diff --git a/usr.bin/netstat/route.c b/usr.bin/netstat/route.c
index e15cf1578029..ba47a4b56ac5 100644
--- a/usr.bin/netstat/route.c
+++ b/usr.bin/netstat/route.c
@@ -69,16 +69,13 @@ __FBSDID("$FreeBSD$");
#include <err.h>
#include <libxo/xo.h>
#include "netstat.h"
+#include "common.h"
#include "nl_defs.h"
/*
* Definitions for showing gateway flags.
*/
-static struct bits {
- u_long b_mask;
- char b_val;
- const char *b_name;
-} bits[] = {
+struct bits rt_bits[] = {
{ RTF_UP, 'U', "up" },
{ RTF_GATEWAY, 'G', "gateway" },
{ RTF_HOST, 'H', "host" },
@@ -99,11 +96,8 @@ static struct bits {
{ 0 , 0, NULL }
};
-struct ifmap_entry {
- char ifname[IFNAMSIZ];
-};
static struct ifmap_entry *ifmap;
-static int ifmap_size;
+static size_t ifmap_size;
static struct timespec uptime;
static const char *netname4(in_addr_t, in_addr_t);
@@ -112,12 +106,7 @@ static const char *netname6(struct sockaddr_in6 *, struct sockaddr_in6 *);
#endif
static void p_rtable_sysctl(int, int);
static void p_rtentry_sysctl(const char *name, struct rt_msghdr *);
-static int p_sockaddr(const char *name, struct sockaddr *, struct sockaddr *,
- int, int);
-static const char *fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask,
- int flags);
static void p_flags(int, const char *);
-static const char *fmt_flags(int f);
static void domask(char *, size_t, u_long);
@@ -229,7 +218,7 @@ pr_rthdr(int af1 __unused)
wid_dst, wid_dst, "Destination",
wid_gw, wid_gw, "Gateway",
wid_flags, wid_flags, "Flags",
- wid_pksent, wid_pksent, "Use",
+ wid_mtu, wid_mtu, "Nhop#",
wid_mtu, wid_mtu, "Mtu",
wid_if, wid_if, "Netif",
wid_expire, "Expire");
@@ -252,46 +241,10 @@ p_rtable_sysctl(int fibnum, int af)
char *buf, *next, *lim;
struct rt_msghdr *rtm;
struct sockaddr *sa;
- int fam = AF_UNSPEC, ifindex = 0, size;
+ int fam = AF_UNSPEC;
int need_table_close = false;
- struct ifaddrs *ifap, *ifa;
- struct sockaddr_dl *sdl;
-
- /*
- * Retrieve interface list at first
- * since we need #ifindex -> if_xname match
- */
- if (getifaddrs(&ifap) != 0)
- err(EX_OSERR, "getifaddrs");
-
- for (ifa = ifap; ifa; ifa = ifa->ifa_next) {
-
- if (ifa->ifa_addr->sa_family != AF_LINK)
- continue;
-
- sdl = (struct sockaddr_dl *)ifa->ifa_addr;
- ifindex = sdl->sdl_index;
-
- if (ifindex >= ifmap_size) {
- size = roundup(ifindex + 1, 32) *
- sizeof(struct ifmap_entry);
- if ((ifmap = realloc(ifmap, size)) == NULL)
- errx(2, "realloc(%d) failed", size);
- memset(&ifmap[ifmap_size], 0,
- size - ifmap_size *
- sizeof(struct ifmap_entry));
-
- ifmap_size = roundup(ifindex + 1, 32);
- }
-
- if (*ifmap[ifindex].ifname != '\0')
- continue;
-
- strlcpy(ifmap[ifindex].ifname, ifa->ifa_name, IFNAMSIZ);
- }
-
- freeifaddrs(ifap);
+ ifmap = prepare_ifmap(&ifmap_size);
mib[0] = CTL_NET;
mib[1] = PF_ROUTE;
@@ -377,7 +330,8 @@ p_rtentry_sysctl(const char *name, struct rt_msghdr *rtm)
wid_flags - protrusion);
p_flags(rtm->rtm_flags, buffer);
if (Wflag) {
- xo_emit("{t:use/%*lu} ", wid_pksent, rtm->rtm_rmx.rmx_pksent);
+ /* XXX: use=0? */
+ xo_emit("{t:nhop/%*lu} ", wid_mtu, rtm->rtm_rmx.rmx_nhidx);
if (rtm->rtm_rmx.rmx_mtu != 0)
xo_emit("{t:mtu/%*lu} ", wid_mtu, rtm->rtm_rmx.rmx_mtu);
@@ -410,7 +364,7 @@ p_rtentry_sysctl(const char *name, struct rt_msghdr *rtm)
xo_close_instance(name);
}
-static int
+int
p_sockaddr(const char *name, struct sockaddr *sa, struct sockaddr *mask,
int flags, int width)
{
@@ -442,7 +396,7 @@ p_sockaddr(const char *name, struct sockaddr *sa, struct sockaddr *mask,
return (protrusion);
}
-static const char *
+const char *
fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask, int flags)
{
static char buf[128];
@@ -519,30 +473,10 @@ fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask, int flags)
static void
p_flags(int f, const char *format)
{
- struct bits *p;
-
- xo_emit(format, fmt_flags(f));
- xo_open_list("flags_pretty");
- for (p = bits; p->b_mask; p++)
- if (p->b_mask & f)
- xo_emit("{le:flags_pretty/%s}", p->b_name);
- xo_close_list("flags_pretty");
+ print_flags_generic(f, rt_bits, format, "flags_pretty");
}
-static const char *
-fmt_flags(int f)
-{
- static char name[33];
- char *flags;
- struct bits *p = bits;
-
- for (flags = name; p->b_mask; p++)
- if (p->b_mask & f)
- *flags++ = p->b_val;
- *flags = '\0';
- return (name);
-}
char *
routename(struct sockaddr *sa, int flags)