aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexander V. Chernikov <melifaro@FreeBSD.org>2022-01-20 21:39:21 +0000
committerAlexander V. Chernikov <melifaro@FreeBSD.org>2022-10-01 14:15:35 +0000
commit7e5bf68495cc0a8c9793a338a8a02009a7f6dbb6 (patch)
tree9ed5b89fbb30fccbebc050062a943288f63e043c
parent35d60ac2e5bdb63ea8c6e08caca699dede8674e6 (diff)
downloadsrc-7e5bf68495cc0a8c9793a338a8a02009a7f6dbb6.tar.gz
src-7e5bf68495cc0a8c9793a338a8a02009a7f6dbb6.zip
netlink: add netlink support
Netlink is a communication protocol currently used in the Linux kernel to modify, read and subscribe to nearly all networking state. Interfaces, addresses, routes, firewall, fibs, vnets, etc are controlled via netlink. It is an async, TLV-based protocol, providing 1-1 and 1-many communications. The current implementation supports a subset of the NETLINK_ROUTE family. To be more specific, the following is supported: * Dumps: - routes - nexthops / nexthop groups - interfaces - interface addresses - neighbors (arp/ndp) * Notifications: - interface arrival/departure - interface address arrival/departure - route addition/deletion * Modifications: - adding/deleting routes - adding/deleting nexthops/nexthop groups - adding/deleting neighbors - adding/deleting interfaces (basic support only) * Rtsock interaction - route events are bridged both ways The implementation also supports the NETLINK_GENERIC family framework. Implementation notes: Netlink is implemented via a loadable/unloadable kernel module, not touching many kernel parts. Each netlink socket uses a dedicated taskqueue to support async operations that can sleep, such as interface creation. All message processing is performed within these taskqueues. Compatibility: Most of the Netlink data models specified above map to FreeBSD concepts nicely. An unmodified ip(8) binary works correctly with interfaces, addresses, routes, nexthops and nexthop groups. Some software such as net/bird requires header-only modifications to compile and work with FreeBSD netlink. Reviewed by: imp Differential Revision: https://reviews.freebsd.org/D36002 MFC after: 2 months
-rw-r--r--etc/mtree/BSD.include.dist4
-rw-r--r--sys/modules/Makefile1
-rw-r--r--sys/modules/netlink/Makefile17
-rw-r--r--sys/net/route.c11
-rw-r--r--sys/net/route/route_ctl.h7
-rw-r--r--sys/net/rtsock.c42
-rw-r--r--sys/netlink/netlink.h257
-rw-r--r--sys/netlink/netlink_ctl.h102
-rw-r--r--sys/netlink/netlink_debug.h82
-rw-r--r--sys/netlink/netlink_domain.c689
-rw-r--r--sys/netlink/netlink_generic.c472
-rw-r--r--sys/netlink/netlink_generic.h112
-rw-r--r--sys/netlink/netlink_io.c528
-rw-r--r--sys/netlink/netlink_linux.h54
-rw-r--r--sys/netlink/netlink_message_parser.c472
-rw-r--r--sys/netlink/netlink_message_parser.h270
-rw-r--r--sys/netlink/netlink_message_writer.c686
-rw-r--r--sys/netlink/netlink_message_writer.h250
-rw-r--r--sys/netlink/netlink_module.c228
-rw-r--r--sys/netlink/netlink_route.c135
-rw-r--r--sys/netlink/netlink_route.h43
-rw-r--r--sys/netlink/netlink_var.h142
-rw-r--r--sys/netlink/route/common.h213
-rw-r--r--sys/netlink/route/iface.c857
-rw-r--r--sys/netlink/route/iface_drivers.c165
-rw-r--r--sys/netlink/route/ifaddrs.h90
-rw-r--r--sys/netlink/route/interface.h245
-rw-r--r--sys/netlink/route/neigh.c571
-rw-r--r--sys/netlink/route/neigh.h105
-rw-r--r--sys/netlink/route/nexthop.c1000
-rw-r--r--sys/netlink/route/nexthop.h102
-rw-r--r--sys/netlink/route/route.c972
-rw-r--r--sys/netlink/route/route.h366
-rw-r--r--sys/netlink/route/route_var.h101
34 files changed, 9391 insertions, 0 deletions
diff --git a/etc/mtree/BSD.include.dist b/etc/mtree/BSD.include.dist
index bb5453252d86..192508bbf6f1 100644
--- a/etc/mtree/BSD.include.dist
+++ b/etc/mtree/BSD.include.dist
@@ -269,6 +269,10 @@
..
netinet6
..
+ netlink
+ route
+ ..
+ ..
netipsec
..
netnatm
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
index 68b3dfcac776..a6aee9bbab36 100644
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -266,6 +266,7 @@ SUBDIR= \
my \
${_nctgpio} \
${_neta} \
+ netlink \
${_netgraph} \
${_nfe} \
nfscl \
diff --git a/sys/modules/netlink/Makefile b/sys/modules/netlink/Makefile
new file mode 100644
index 000000000000..046ecf5a2961
--- /dev/null
+++ b/sys/modules/netlink/Makefile
@@ -0,0 +1,17 @@
+.PATH: ${SRCTOP}/sys/netlink
+KMOD= netlink
+
+SRCS = netlink_module.c netlink_domain.c netlink_io.c \
+ netlink_message_parser.c netlink_message_writer.c netlink_generic.c \
+ netlink_route.c route/iface.c route/iface_drivers.c route/neigh.c \
+ route/nexthop.c route/route.c
+
+EXPORT_SYMS=
+EXPORT_SYMS+= nlmsg_get_chain_writer
+EXPORT_SYMS+= nlmsg_refill_buffer
+EXPORT_SYMS+= nlmsg_end
+EXPORT_SYMS+= nlmsg_flush
+
+# NOTE(review): the plain "=" below replaces the symbol list built above; confirm which form is intended.
+EXPORT_SYMS= YES
+
+.include <bsd.kmod.mk>
diff --git a/sys/net/route.c b/sys/net/route.c
index 7d46ba2588ed..9773f899f5af 100644
--- a/sys/net/route.c
+++ b/sys/net/route.c
@@ -694,3 +694,14 @@ rt_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum)
return (rtsock_routemsg_info(cmd, info, fibnum));
}
+
+/* Netlink-related callbacks needed to glue rtsock, netlink and linuxolator */
+static void
+ignore_route_event(uint32_t fibnum, const struct rib_cmd_info *rc)
+{ /* no-op handler: the event is dropped */
+}
+static struct rtbridge ignore_cb = { .route_f = ignore_route_event }; /* bridge whose handler does nothing */
+
+void *linux_netlink_p = NULL; /* Callback pointer for Linux translator functions */
+struct rtbridge *rtsock_callback_p = &ignore_cb; /* starts out pointing at the no-op bridge */
+struct rtbridge *netlink_callback_p = &ignore_cb; /* starts out pointing at the no-op bridge */
diff --git a/sys/net/route/route_ctl.h b/sys/net/route/route_ctl.h
index 0b331e5f7d2c..d150da6264d4 100644
--- a/sys/net/route/route_ctl.h
+++ b/sys/net/route/route_ctl.h
@@ -189,4 +189,11 @@ void rib_unsubscribe_locked(struct rib_subscription *rs);
void rib_notify(struct rib_head *rnh, enum rib_subscription_type type,
struct rib_cmd_info *rc);
+/* Event bridge: callback table reached through the two pointers declared below */
+typedef void route_event_f(uint32_t fibnum, const struct rib_cmd_info *rc);
+struct rtbridge{
+ route_event_f *route_f; /* invoked with the fib number and the rib command info */
+};
+extern struct rtbridge *rtsock_callback_p;
+extern struct rtbridge *netlink_callback_p;
#endif
diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c
index 91ad8c79a5eb..99d962c972cb 100644
--- a/sys/net/rtsock.c
+++ b/sys/net/rtsock.c
@@ -219,6 +219,7 @@ static void send_rtm_reply(struct socket *so, struct rt_msghdr *rtm,
int rtm_errno);
static bool can_export_rte(struct ucred *td_ucred, bool rt_is_host,
const struct sockaddr *rt_dst);
+static void rtsock_notify_event(uint32_t fibnum, const struct rib_cmd_info *rc);
static struct netisr_handler rtsock_nh = {
.nh_name = "rtsock",
@@ -275,6 +276,45 @@ VNET_SYSUNINIT(vnet_rts_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
#endif
static void
+report_route_event(const struct rib_cmd_info *rc, void *_cbdata)
+{ /* Hands the route change described by rc to rt_routemsg(). */
+ uint32_t fibnum = (uint32_t)(uintptr_t)_cbdata; /* fib number travels in the opaque pointer */
+ struct nhop_object *nh;
+ /* Deletions report the old nexthop; every other command reports the new one. */
+ nh = rc->rc_cmd == RTM_DELETE ? rc->rc_nh_old : rc->rc_nh_new;
+ rt_routemsg(rc->rc_cmd, rc->rc_rt, nh, fibnum);
+}
+/* Bridge handler: with ROUTE_MPATH, nexthop-group changes go through rib_decompose_notification(). */
+static void
+rts_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc)
+{
+#ifdef ROUTE_MPATH
+ if ((rc->rc_nh_new && NH_IS_NHGRP(rc->rc_nh_new)) ||
+ (rc->rc_nh_old && NH_IS_NHGRP(rc->rc_nh_old))) {
+ rib_decompose_notification(rc, report_route_event,
+ (void *)(uintptr_t)fibnum);
+ } else
+#endif
+ report_route_event(rc, (void *)(uintptr_t)fibnum); /* fib number is packed into the callback argument */
+}
+static struct rtbridge rtsbridge = { .route_f = rts_handle_route_event };
+static struct rtbridge *rtsbridge_orig_p; /* previous rtsock_callback_p, saved by rtsock_init() */
+/* Passes a route change to the handler currently installed in netlink_callback_p. */
+static void
+rtsock_notify_event(uint32_t fibnum, const struct rib_cmd_info *rc)
+{
+ netlink_callback_p->route_f(fibnum, rc);
+}
+/* Installs rtsbridge as the rtsock callback, remembering the previous pointer. */
+static void
+rtsock_init(void)
+{
+ rtsbridge_orig_p = rtsock_callback_p;
+ rtsock_callback_p = &rtsbridge;
+}
+SYSINIT(rtsock_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtsock_init, NULL);
+
+static void
rts_handle_ifnet_arrival(void *arg __unused, struct ifnet *ifp)
{
rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
@@ -1074,6 +1114,7 @@ rts_send(struct socket *so, int flags, struct mbuf *m,
}
error = rib_action(fibnum, rtm->rtm_type, &info, &rc);
if (error == 0) {
+ rtsock_notify_event(fibnum, &rc);
#ifdef ROUTE_MPATH
if (NH_IS_NHGRP(rc.rc_nh_new) ||
(rc.rc_nh_old && NH_IS_NHGRP(rc.rc_nh_old))) {
@@ -1095,6 +1136,7 @@ rts_send(struct socket *so, int flags, struct mbuf *m,
case RTM_DELETE:
error = rib_action(fibnum, RTM_DELETE, &info, &rc);
if (error == 0) {
+ rtsock_notify_event(fibnum, &rc);
#ifdef ROUTE_MPATH
if (NH_IS_NHGRP(rc.rc_nh_old) ||
(rc.rc_nh_new && NH_IS_NHGRP(rc.rc_nh_new))) {
diff --git a/sys/netlink/netlink.h b/sys/netlink/netlink.h
new file mode 100644
index 000000000000..6a68dcec1382
--- /dev/null
+++ b/sys/netlink/netlink.h
@@ -0,0 +1,257 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Copyright (C) The Internet Society (2003). All Rights Reserved.
+ *
+ * This document and translations of it may be copied and furnished to
+ * others, and derivative works that comment on or otherwise explain it
+ * or assist in its implementation may be prepared, copied, published
+ * and distributed, in whole or in part, without restriction of any
+ * kind, provided that the above copyright notice and this paragraph are
+ * included on all such copies and derivative works. However, this
+ * document itself may not be modified in any way, such as by removing
+ * the copyright notice or references to the Internet Society or other
+ * Internet organizations, except as needed for the purpose of
+ * developing Internet standards in which case the procedures for
+ * copyrights defined in the Internet Standards process must be
+ * followed, or as required to translate it into languages other than
+ * English.
+ *
+ * The limited permissions granted above are perpetual and will not be
+ * revoked by the Internet Society or its successors or assignees.
+ *
+ * This document and the information contained herein is provided on an
+ * "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
+ * TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
+ * HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+
+ */
+
+/*
+ * This file contains structures and constants for RFC 3549 (Netlink)
+ * protocol. Some values have been taken from Linux implementation.
+ */
+
+#ifndef _NETLINK_NETLINK_H_
+#define _NETLINK_NETLINK_H_
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+struct sockaddr_nl {
+ uint8_t nl_len; /* sizeof(sockaddr_nl) */
+ sa_family_t nl_family; /* netlink family */
+ uint16_t nl_pad; /* reserved, set to 0 */
+ uint32_t nl_pid; /* desired port ID, 0 for auto-select */
+ uint32_t nl_groups; /* multicast groups mask to bind to */
+};
+
+#define SOL_NETLINK 270
+
+/* Netlink socket options */
+#define NETLINK_ADD_MEMBERSHIP 1 /* Subscribe for the specified group notifications */
+#define NETLINK_DROP_MEMBERSHIP 2 /* Unsubscribe from the specified group */
+#define NETLINK_PKTINFO 3 /* XXX: not supported */
+#define NETLINK_BROADCAST_ERROR 4 /* XXX: not supported */
+#define NETLINK_NO_ENOBUFS 5 /* XXX: not supported */
+#define NETLINK_RX_RING 6 /* XXX: not supported */
+#define NETLINK_TX_RING 7 /* XXX: not supported */
+#define NETLINK_LISTEN_ALL_NSID 8 /* XXX: not supported */
+
+#define NETLINK_LIST_MEMBERSHIPS 9
+#define NETLINK_CAP_ACK 10 /* Send only original message header in the reply */
+#define NETLINK_EXT_ACK 11 /* Ack support for receiving additional TLVs in ack */
+#define NETLINK_GET_STRICT_CHK 12 /* Strict header checking */
+
+
+/*
+ * RFC 3549, 2.3.2 Netlink Message Header
+ */
+struct nlmsghdr {
+ uint32_t nlmsg_len; /* Length of message including header */
+ uint16_t nlmsg_type; /* Message type identifier */
+ uint16_t nlmsg_flags; /* Flags (NLM_F_) */
+ uint32_t nlmsg_seq; /* Sequence number */
+ uint32_t nlmsg_pid; /* Sending process port ID */
+};
+
+/*
+ * RFC 3549, 2.3.2 standard flag bits (nlmsg_flags)
+ */
+#define NLM_F_REQUEST 0x01 /* Indicates request to kernel */
+#define NLM_F_MULTI 0x02 /* Message is part of a group terminated by NLMSG_DONE msg */
+#define NLM_F_ACK 0x04 /* Reply with ack message containing resulting error code */
+#define NLM_F_ECHO 0x08 /* (not supported) Echo this request back */
+#define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */
+#define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */
+
+/*
+ * RFC 3549, 2.3.2 Additional flag bits for GET requests
+ */
+#define NLM_F_ROOT 0x100 /* Return the complete table */
+#define NLM_F_MATCH 0x200 /* Return all entries matching criteria */
+#define NLM_F_ATOMIC 0x400 /* Return an atomic snapshot (ignored) */
+#define NLM_F_DUMP (NLM_F_ROOT | NLM_F_MATCH)
+
+/*
+ * RFC 3549, 2.3.2 Additional flag bits for NEW requests
+ */
+#define NLM_F_REPLACE 0x100 /* Replace existing matching config object */
+#define NLM_F_EXCL 0x200 /* Don't replace the object if exists */
+#define NLM_F_CREATE 0x400 /* Create if it does not exist */
+#define NLM_F_APPEND 0x800 /* Add to end of list */
+
+/* Modifiers to DELETE requests */
+#define NLM_F_NONREC 0x100 /* Do not delete recursively */
+
+/* Flags for ACK message */
+#define NLM_F_CAPPED 0x100 /* request was capped */
+#define NLM_F_ACK_TLVS 0x200 /* extended ACK TLVs were included */
+
+/*
+ * RFC 3549, 2.3.2 standard message types (nlmsg_type).
+ */
+#define NLMSG_NOOP 0x1 /* Message is ignored. */
+#define NLMSG_ERROR 0x2 /* reply error code reporting */
+#define NLMSG_DONE 0x3 /* Message terminates a multipart message. */
+#define NLMSG_OVERRUN 0x4 /* overrun detected, data is lost */
+
+#define NLMSG_MIN_TYPE 0x10 /* < 0x10: reserved control messages */
+
+/*
+ * Definition of numbers assigned to the netlink subsystems.
+ */
+#define NETLINK_ROUTE 0 /* Routing/device hook */
+#define NETLINK_UNUSED 1 /* not supported */
+#define NETLINK_USERSOCK 2 /* not supported */
+#define NETLINK_FIREWALL 3 /* not supported */
+#define NETLINK_SOCK_DIAG 4 /* not supported */
+#define NETLINK_NFLOG 5 /* not supported */
+#define NETLINK_XFRM 6 /* (not supported) PF_SETKEY */
+#define NETLINK_SELINUX 7 /* not supported */
+#define NETLINK_ISCSI 8 /* not supported */
+#define NETLINK_AUDIT 9 /* not supported */
+#define NETLINK_FIB_LOOKUP 10 /* not supported */
+#define NETLINK_CONNECTOR 11 /* not supported */
+#define NETLINK_NETFILTER 12 /* not supported */
+#define NETLINK_IP6_FW 13 /* not supported */
+#define NETLINK_DNRTMSG 14 /* not supported */
+#define NETLINK_KOBJECT_UEVENT 15 /* not supported */
+#define NETLINK_GENERIC 16 /* Generic netlink (dynamic families) */
+
+/*
+ * RFC 3549, 2.3.2.2 The ACK Netlink Message
+ */
+struct nlmsgerr {
+ int error;
+ struct nlmsghdr msg;
+};
+
+enum nlmsgerr_attrs {
+ NLMSGERR_ATTR_UNUSED,
+ NLMSGERR_ATTR_MSG = 1, /* string, error message */
+ NLMSGERR_ATTR_OFFS = 2, /* u32, offset of the invalid attr from nl header */
+ NLMSGERR_ATTR_COOKIE = 3, /* binary, data to pass to userland */
+ NLMSGERR_ATTR_POLICY = 4, /* not supported */
+ __NLMSGERR_ATTR_MAX,
+ NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1
+};
+
+
+#ifndef roundup2
+#define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* valid only when y is a power of two */
+#endif
+/* Alignment unit used by the NL_ITEM_* helpers */
+#define NL_ITEM_ALIGN_SIZE sizeof(uint32_t)
+#define NL_ITEM_ALIGN(_len) roundup2(_len, NL_ITEM_ALIGN_SIZE)
+/* Pointer to the byte located _off bytes past _ptr; _off may be any expression */
+#define NL_ITEM_DATA(_ptr, _off) ((void *)((char *)(_ptr) + (_off)))
+#define NL_ITEM_DATA_CONST(_ptr, _off) ((const void *)((const char *)(_ptr) + (_off)))
+
+/*
+ * True when _len is at least _hlen and the length reported by _LEN_M(_ptr)
+ * is at least _hlen and at most _len.
+ */
+#define NL_ITEM_OK(_ptr, _len, _hlen, _LEN_M) \
+ ((_len) >= (_hlen) && _LEN_M(_ptr) >= (_hlen) && _LEN_M(_ptr) <= (_len))
+/* Pointer, of the same type as _ptr, advanced by _LEN_M(_ptr) bytes */
+#define NL_ITEM_NEXT(_ptr, _LEN_M) ((typeof(_ptr))((char *)(_ptr) + _LEN_M(_ptr)))
+/* Subtracts _LEN_MACRO(_ptr) from _len and yields the advanced pointer */
+#define NL_ITEM_ITER(_ptr, _len, _LEN_MACRO) \
+ ((_len) -= _LEN_MACRO(_ptr), NL_ITEM_NEXT(_ptr, _LEN_MACRO))
+
+
+#ifndef _KERNEL
+/* part of netlink(3) API */
+#define NLMSG_ALIGNTO NL_ITEM_ALIGN_SIZE
+#define NLMSG_ALIGN(_len) NL_ITEM_ALIGN(_len)
+#define NLMSG_HDRLEN ((int)sizeof(struct nlmsghdr))
+#define NLMSG_LENGTH(_len) ((_len) + NLMSG_HDRLEN)
+/* Aligned NLMSG_LENGTH(_len); the parameter name must match the one used in the expansion */
+#define NLMSG_SPACE(_len) NLMSG_ALIGN(NLMSG_LENGTH(_len))
+#define NLMSG_DATA(_hdr) NL_ITEM_DATA(_hdr, NLMSG_HDRLEN)
+#define _NLMSG_LEN(_hdr) ((int)(_hdr)->nlmsg_len)
+#define _NLMSG_ALIGNED_LEN(_hdr) NLMSG_ALIGN(_NLMSG_LEN(_hdr))
+#define NLMSG_OK(_hdr, _len) NL_ITEM_OK(_hdr, _len, NLMSG_HDRLEN, _NLMSG_LEN)
+#define NLMSG_PAYLOAD(_hdr,_len) (_NLMSG_LEN(_hdr) - NLMSG_SPACE((_len)))
+#define NLMSG_NEXT(_hdr, _len) NL_ITEM_ITER(_hdr, _len, _NLMSG_ALIGNED_LEN)
+
+#else
+#define NLMSG_ALIGNTO 4U
+#define NLMSG_ALIGN(len) (((len) + NLMSG_ALIGNTO - 1) & ~(NLMSG_ALIGNTO - 1))
+#define NLMSG_HDRLEN ((int)NLMSG_ALIGN(sizeof(struct nlmsghdr)))
+#endif
+
+/*
+ * Base netlink attribute TLV header.
+ */
+struct nlattr {
+ uint16_t nla_len; /* Total attribute length */
+ uint16_t nla_type; /* Attribute type */
+};
+
+/*
+ *
+ * nla_type field encoding:
+ *
+ * 0 1
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |N|O| Attribute type |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * N - attribute contains other attributes (mostly unused)
+ * O - encoded in network byte order (mostly unused)
+ * Note: N & O are mutually exclusive
+ *
+ * Note: attribute type value scope normally is either parent attribute
+ * or the message/message group.
+ */
+
+#define NLA_F_NESTED (1 << 15)
+#define NLA_F_NET_BYTEORDER (1 << 14)
+#define NLA_TYPE_MASK ~(NLA_F_NESTED | NLA_F_NET_BYTEORDER)
+
+#ifndef _KERNEL
+#define NLA_ALIGNTO NL_ITEM_ALIGN_SIZE
+#define NLA_ALIGN(_len) NL_ITEM_ALIGN(_len)
+#define NLA_HDRLEN ((int)sizeof(struct nlattr))
+#endif
+
+#endif
diff --git a/sys/netlink/netlink_ctl.h b/sys/netlink/netlink_ctl.h
new file mode 100644
index 000000000000..fb5a8b30e0aa
--- /dev/null
+++ b/sys/netlink/netlink_ctl.h
@@ -0,0 +1,102 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NETLINK_NETLINK_CTL_H_
+#define _NETLINK_NETLINK_CTL_H_
+
+/*
+ * This file provides headers for the public KPI of the netlink
+ * subsystem
+ */
+
+MALLOC_DECLARE(M_NETLINK);
+
+/*
+ * Macro for handling attribute TLVs
+ */
+#define _roundup2(x, y) (((x)+((y)-1))&(~((y)-1)))
+
+#define NETLINK_ALIGN_SIZE sizeof(uint32_t)
+#define NETLINK_ALIGN(_len) _roundup2(_len, NETLINK_ALIGN_SIZE)
+
+#define NLA_ALIGN_SIZE sizeof(uint32_t)
+#define NLA_ALIGN(_len) _roundup2(_len, NLA_ALIGN_SIZE)
+#define NLA_HDRLEN ((int)sizeof(struct nlattr))
+#define NLA_DATA_LEN(_nla) ((int)((_nla)->nla_len - NLA_HDRLEN))
+#define NLA_DATA(_nla) NL_ITEM_DATA(_nla, NLA_HDRLEN)
+#define NLA_DATA_CONST(_nla) NL_ITEM_DATA_CONST(_nla, NLA_HDRLEN)
+#define NLA_TYPE(_nla) ((_nla)->nla_type & 0x3FFF)
+
+#ifndef typeof
+#define typeof __typeof
+#endif
+
+/* Attribute that follows _attr, stepping by the aligned nla_len; fully parenthesized so it composes in any expression */
+#define NLA_NEXT(_attr) ((struct nlattr *)((char *)(_attr) + NLA_ALIGN((_attr)->nla_len)))
+#define _NLA_END(_start, _len) ((char *)(_start) + (_len))
+/*
+ * Walks the attributes in the _len bytes starting at _start.
+ * The init clause declares loop-scoped variables named _end and _attr (the
+ * latter typed via typeof(_attr)), and _len is decremented on every step.
+ * NOTE(review): an attribute with nla_len == 0 does not advance the cursor --
+ * confirm callers reject such input before iterating.
+ */
+#define NLA_FOREACH(_attr, _start, _len) \
+ for (typeof(_attr) _end = (typeof(_attr))_NLA_END(_start, _len), _attr = (_start); \
+ ((char *)_attr < (char *)_end) && \
+ ((char *)NLA_NEXT(_attr) <= (char *)_end); \
+ _attr = (_len -= NLA_ALIGN(_attr->nla_len), NLA_NEXT(_attr)))
+
+#define NL_ARRAY_LEN(_a) (sizeof(_a) / sizeof((_a)[0]))
+
+#include <netlink/netlink_message_writer.h>
+#include <netlink/netlink_message_parser.h>
+
+
+/* Protocol handlers */
+struct nl_pstate;
+typedef int (*nl_handler_f)(struct nlmsghdr *hdr, struct nl_pstate *npt);
+
+bool netlink_register_proto(int proto, const char *proto_name, nl_handler_f handler);
+bool netlink_unregister_proto(int proto);
+
+/* Common helpers */
+bool nl_has_listeners(int netlink_family, uint32_t groups_mask);
+bool nlp_has_priv(struct nlpcb *nlp, int priv);
+
+/* netlink_generic.c */
+struct genl_cmd {
+ const char *cmd_name;
+ nl_handler_f cmd_cb;
+ uint32_t cmd_flags;
+ uint32_t cmd_priv;
+ uint32_t cmd_num;
+};
+
+uint32_t genl_register_family(const char *family_name, size_t hdrsize,
+ int family_version, int max_attr_idx);
+bool genl_unregister_family(const char *family_name);
+bool genl_register_cmds(const char *family_name, const struct genl_cmd *cmds,
+ int count);
+uint32_t genl_register_group(const char *family_name, const char *group_name);
+
+/* Debug */
+uint32_t nlp_get_pid(const struct nlpcb *nlp);
+
+#endif
diff --git a/sys/netlink/netlink_debug.h b/sys/netlink/netlink_debug.h
new file mode 100644
index 000000000000..6ff6811c6a5a
--- /dev/null
+++ b/sys/netlink/netlink_debug.h
@@ -0,0 +1,82 @@
+/*-
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETLINK_NETLINK_DEBUG_H_
+#define _NETLINK_NETLINK_DEBUG_H_
+
+#define _DEBUG_SYSCTL_OID _net_netlink_debug
+#include <net/route/route_debug.h>
+
+SYSCTL_DECL(_net_netlink_debug);
+
+/*
+ * Generic debug
+ * [nl_domain] func_name: debug text
+ */
+#define NL_LOG RT_LOG
+
+/*
+ * Logging for events specific to a particular process
+ * Example: [nl_domain] PID 4834 fdump_sa: unsupported family: 45
+ */
+#define NL_RAW_PID_LOG(_l, _pid, _fmt, ...) NL_RAW_PID_LOG_##_l(_l, _pid, _fmt, ## __VA_ARGS__)
+#define _NL_RAW_PID_LOG(_l, _pid, _fmt, ...) if (_DEBUG_PASS_MSG(_l)) { \
+ _output("[" DEBUG_PREFIX_NAME "] PID %u %s: " _fmt "\n", _pid, __func__, ##__VA_ARGS__); \
+}
+/* NOTE(review): _NL_RAW_PID_LOG expands to a bare if-block, not do { } while (0); take care before an else. */
+#define NLP_LOG(_l, _nlp, _fmt, ...) NL_RAW_PID_LOG_##_l(_l, nlp_get_pid(_nlp), _fmt, ## __VA_ARGS__)
+
+#if DEBUG_MAX_LEVEL>=LOG_DEBUG3
+#define NL_RAW_PID_LOG_LOG_DEBUG3 _NL_RAW_PID_LOG
+#else
+#define NL_RAW_PID_LOG_LOG_DEBUG3(_l, _pid, _fmt, ...)
+#endif
+#if DEBUG_MAX_LEVEL>=LOG_DEBUG2
+#define NL_RAW_PID_LOG_LOG_DEBUG2 _NL_RAW_PID_LOG
+#else
+#define NL_RAW_PID_LOG_LOG_DEBUG2(_l, _pid, _fmt, ...)
+#endif
+#if DEBUG_MAX_LEVEL>=LOG_DEBUG
+#define NL_RAW_PID_LOG_LOG_DEBUG _NL_RAW_PID_LOG
+#else
+#define NL_RAW_PID_LOG_LOG_DEBUG(_l, _pid, _fmt, ...)
+#endif
+#if DEBUG_MAX_LEVEL>=LOG_INFO
+#define NL_RAW_PID_LOG_LOG_INFO _NL_RAW_PID_LOG
+#else
+#define NL_RAW_PID_LOG_LOG_INFO(_l, _pid, _fmt, ...)
+#endif
+#define NL_RAW_PID_LOG_LOG_NOTICE _NL_RAW_PID_LOG
+#define NL_RAW_PID_LOG_LOG_ERR _NL_RAW_PID_LOG
+#define NL_RAW_PID_LOG_LOG_WARNING _NL_RAW_PID_LOG
+
+
+
+#endif
diff --git a/sys/netlink/netlink_domain.c b/sys/netlink/netlink_domain.c
new file mode 100644
index 000000000000..159dfd03724d
--- /dev/null
+++ b/sys/netlink/netlink_domain.c
@@ -0,0 +1,689 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This file contains socket and protocol bindings for netlink.
+ */
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/domain.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/ck.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/priv.h> /* priv_check */
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+
+#define DEBUG_MOD_NAME nl_domain
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+
+/*
+ * Helpers for the rmlock embedded in struct nl_control (ctl_lock).
+ * NLCTL_TRACKER declares the local variable "nl_tracker" that
+ * NLCTL_RLOCK()/NLCTL_RUNLOCK() reference by name, so every function taking
+ * the read lock has to declare it first.
+ */
+#define NLCTL_TRACKER struct rm_priotracker nl_tracker
+#define NLCTL_RLOCK(_ctl) rm_rlock(&((_ctl)->ctl_lock), &nl_tracker)
+#define NLCTL_RUNLOCK(_ctl) rm_runlock(&((_ctl)->ctl_lock), &nl_tracker)
+
+#define NLCTL_WLOCK(_ctl) rm_wlock(&((_ctl)->ctl_lock))
+#define NLCTL_WUNLOCK(_ctl) rm_wunlock(&((_ctl)->ctl_lock))
+
+/* Send buffer size passed to soreserve() when a socket is attached */
+static u_long nl_sendspace = NLSNDQ;
+SYSCTL_ULONG(_net_netlink, OID_AUTO, sendspace, CTLFLAG_RW, &nl_sendspace, 0,
+ "Default netlink socket send space");
+
+/*
+ * Receive buffer size passed to soreserve() when a socket is attached.
+ * NOTE(review): initialised from NLSNDQ (the send queue constant) -- confirm
+ * this is intentional rather than a copy of the line above.
+ */
+static u_long nl_recvspace = NLSNDQ;
+SYSCTL_ULONG(_net_netlink, OID_AUTO, recvspace, CTLFLAG_RW, &nl_recvspace, 0,
+ "Default netlink socket receive space");
+
+extern u_long sb_max_adj;
+/* Upper limit handed to sbreserve_locked_limit() by nl_setsbopt() */
+static u_long nl_maxsockbuf = 512 * 1024 * 1024; /* 512M, XXX: init based on physmem */
+
+/*
+ * Returns the process id stored in @nlp (the nl_process_id field).
+ */
+uint32_t
+nlp_get_pid(const struct nlpcb *nlp)
+{
+	uint32_t pid = nlp->nl_process_id;
+
+	return (pid);
+}
+
+/*
+ * Searches the list of bound ports of the current VNET
+ * (V_nl_ctl->ctl_port_head) for a pcb whose nl_port equals @port_id.
+ * NOTE(review): the callers visible in this file hold the control rmlock
+ * (NLCTL_RLOCK/NLCTL_WLOCK) around the call -- confirm this is the required
+ * locking contract.
+ *
+ * Returns the matching nlpcb, or NULL if no socket is bound to @port_id.
+ */
+static struct nlpcb *
+nl_port_lookup(uint32_t port_id)
+{
+	struct nlpcb *found = NULL;
+	struct nlpcb *cur;
+
+	CK_LIST_FOREACH(cur, &V_nl_ctl->ctl_port_head, nl_port_next) {
+		if (cur->nl_port == port_id) {
+			found = cur;
+			break;
+		}
+	}
+	return (found);
+}
+
+/*
+ * Replaces the multicast group mask of @nlp with @nl_groups, logging the
+ * old and the new value at LOG_DEBUG2.
+ * The callers visible in this file invoke it with the control write lock
+ * held.
+ */
+static void
+nl_update_groups_locked(struct nlpcb *nlp, uint64_t nl_groups)
+{
+ /* Update group mask */
+ NL_LOG(LOG_DEBUG2, "socket %p, groups 0x%lX -> 0x%lX",
+ nlp->nl_socket, nlp->nl_groups, nl_groups);
+ nlp->nl_groups = nl_groups;
+}
+
+/*
+ * Broadcasts message @m to every socket of protocol @proto that is
+ * subscribed to the group @group_id.
+ *
+ * @m:            mbuf to deliver; always consumed by this function
+ * @num_messages: message count, passed through to nl_send_one()
+ * @proto:        netlink protocol the receiving sockets must be attached to
+ * @group_id:     1-based group number; bit (group_id - 1) of nl_groups
+ *
+ * Each receiver except the last one gets a copy of @m; the last receiver
+ * gets the original, so the common single-listener case does not copy.
+ */
+void
+nl_send_group(struct mbuf *m, int num_messages, int proto, int group_id)
+{
+	struct nlpcb *nlp_last = NULL;
+	struct nlpcb *nlp;
+	NLCTL_TRACKER;
+
+	IF_DEBUG_LEVEL(LOG_DEBUG2) {
+		struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
+		NL_LOG(LOG_DEBUG2, "MCAST mbuf len %u msg type %d len %u to group %d/%d",
+		    m->m_len, hdr->nlmsg_type, hdr->nlmsg_len, proto, group_id);
+	}
+
+	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
+	if (__predict_false(ctl == NULL)) {
+		/*
+		 * Can be the case when notification is sent within VNET
+		 * which doesn't have any netlink sockets.
+		 */
+		m_freem(m);
+		return;
+	}
+
+	/*
+	 * The group mask is 64 bits wide. A group id outside 1..64 cannot
+	 * match any subscriber and would make the shift below undefined.
+	 */
+	if (__predict_false(group_id < 1 || group_id > 64)) {
+		NL_LOG(LOG_DEBUG, "invalid group id %d, dropping message", group_id);
+		m_freem(m);
+		return;
+	}
+
+	/* Shift a 64-bit one: shifting a plain int is undefined past bit 30 */
+	uint64_t groups_mask = (uint64_t)1 << (group_id - 1);
+	int io_flags = NL_IOF_UNTRANSLATED;
+
+	NLCTL_RLOCK(ctl);
+
+	CK_LIST_FOREACH(nlp, &ctl->ctl_pcb_head, nl_next) {
+		if (nlp->nl_groups & groups_mask && nlp->nl_proto == proto) {
+			if (nlp_last != NULL) {
+				struct mbuf *m_copy;
+				m_copy = m_copym(m, 0, M_COPYALL, M_NOWAIT);
+				if (m_copy != NULL)
+					nl_send_one(m_copy, nlp_last, num_messages, io_flags);
+				else {
+					/* Copy failed: only wake the receiver up */
+					NLP_LOCK(nlp_last);
+					if (nlp_last->nl_socket != NULL)
+						sorwakeup(nlp_last->nl_socket);
+					NLP_UNLOCK(nlp_last);
+				}
+			}
+			nlp_last = nlp;
+		}
+	}
+	/* The final receiver takes the original mbuf; free it if nobody listens */
+	if (nlp_last != NULL)
+		nl_send_one(m, nlp_last, num_messages, io_flags);
+	else
+		m_freem(m);
+
+	NLCTL_RUNLOCK(ctl);
+}
+
+/*
+ * Reports whether the current VNET has netlink state allocated at all.
+ * Both @netlink_family and @groups_mask are ignored by this implementation.
+ */
+bool
+nl_has_listeners(int netlink_family, uint32_t groups_mask)
+{
+	if (V_nl_ctl == NULL)
+		return (false);
+	return (true);
+}
+
+/*
+ * Returns true when the credentials stored in @nlp pass priv_check_cred()
+ * for the privilege @priv.
+ */
+bool
+nlp_has_priv(struct nlpcb *nlp, int priv)
+{
+	int error = priv_check_cred(nlp->nl_cred, priv);
+
+	return (error == 0);
+}
+
+/*
+ * Picks a port id for a socket that did not request a specific one.
+ * Relies on nl_port_lookup(), so it is meant to be called with the control
+ * lock held (its only caller here is nl_bind_locked()).
+ *
+ * Returns the chosen port id. If every attempt collides, the pid is
+ * returned even though it is taken; nl_bind_locked() re-checks the value
+ * and fails with EADDRINUSE in that case.
+ */
+static uint32_t
+nl_find_port(void)
+{
+	/*
+	 * An app can open multiple netlink sockets.
+	 * Start with the current pid; if it is already taken, try random
+	 * numbers in the [256k, 256k + 64k) range, avoiding clashes
+	 * with pids.
+	 */
+	if (nl_port_lookup(curproc->p_pid) == NULL)
+		return (curproc->p_pid);
+	for (int i = 0; i < 16; i++) {
+		uint32_t nl_port = (arc4random() % 65536) + 65536 * 4;
+		if (nl_port_lookup(nl_port) == NULL)
+			return (nl_port);
+		NL_LOG(LOG_DEBUG3, "tried %u\n", nl_port);
+	}
+	return (curproc->p_pid);
+}
+
+/*
+ * Binds @nlp to the port in @snl and installs the group mask from @snl.
+ * Called with the control write lock and the pcb lock held (see
+ * nl_pru_bind() and nl_assign_port()).
+ *
+ * An already-bound socket may only "re-bind" to its current port; this
+ * path just updates the groups. For an unbound socket a zero nl_pid asks
+ * for automatic port selection, and the chosen value is written back into
+ * @snl.
+ *
+ * Returns 0 on success, EINVAL when a bound socket asks for a different
+ * port, EADDRINUSE when the port is already taken.
+ */
+static int
+nl_bind_locked(struct nlpcb *nlp, struct sockaddr_nl *snl)
+{
+	if (nlp->nl_bound) {
+		if (nlp->nl_port != snl->nl_pid) {
+			/* Port ids are unsigned: print them with %u */
+			NL_LOG(LOG_DEBUG,
+			    "bind() failed: program pid %u "
+			    "is different from provided pid %u",
+			    nlp->nl_port, snl->nl_pid);
+			return (EINVAL); // XXX: better error
+		}
+	} else {
+		if (snl->nl_pid == 0)
+			snl->nl_pid = nl_find_port();
+		if (nl_port_lookup(snl->nl_pid) != NULL)
+			return (EADDRINUSE);
+		nlp->nl_port = snl->nl_pid;
+		nlp->nl_bound = true;
+		CK_LIST_INSERT_HEAD(&V_nl_ctl->ctl_port_head, nlp, nl_port_next);
+	}
+	nl_update_groups_locked(nlp, snl->nl_groups);
+
+	return (0);
+}
+
+/*
+ * pr_attach handler: creates the netlink pcb for a new socket @so of
+ * netlink protocol @proto.
+ *
+ * Allocates and initialises the pcb, reserves socket buffers, starts the
+ * per-socket taskqueue thread, links the pcb into the per-VNET pcb list and
+ * marks the socket connected.
+ *
+ * Returns 0 on success, EAFNOSUPPORT while the module is unloading, or the
+ * error from nl_verify_proto()/soreserve().
+ */
+static int
+nl_pru_attach(struct socket *so, int proto, struct thread *td)
+{
+ struct nlpcb *nlp;
+ int error;
+
+ if (__predict_false(netlink_unloading != 0))
+ return (EAFNOSUPPORT);
+
+ error = nl_verify_proto(proto);
+ if (error != 0)
+ return (error);
+
+ bool is_linux = SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX;
+ NL_LOG(LOG_DEBUG2, "socket %p, %sPID %d: attaching socket to %s",
+ so, is_linux ? "(linux) " : "", curproc->p_pid,
+ nl_get_proto_name(proto));
+
+ /* Create per-VNET state on first socket init */
+ struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
+ if (ctl == NULL)
+ ctl = vnet_nl_ctl_init();
+ KASSERT(V_nl_ctl != NULL, ("nl_attach: vnet_sock_init() failed"));
+
+ MPASS(sotonlpcb(so) == NULL);
+
+ nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO);
+ error = soreserve(so, nl_sendspace, nl_recvspace);
+ if (error != 0) {
+ /* Nothing else references the pcb yet, plain free is enough */
+ free(nlp, M_PCB);
+ return (error);
+ }
+ so->so_pcb = nlp;
+ nlp->nl_socket = so;
+ /* Copy so_cred to avoid having socket_var.h in every header */
+ nlp->nl_cred = so->so_cred;
+ nlp->nl_proto = proto;
+ nlp->nl_process_id = curproc->p_pid;
+ nlp->nl_linux = is_linux;
+ nlp->nl_active = true;
+ NLP_LOCK_INIT(nlp);
+ refcount_init(&nlp->nl_refcount, 1);
+ nl_init_io(nlp);
+
+ /* One dedicated taskqueue with a single thread per socket */
+ nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK,
+ taskqueue_thread_enqueue, &nlp->nl_taskqueue);
+ TASK_INIT(&nlp->nl_task, 0, nl_taskqueue_handler, nlp);
+ taskqueue_start_threads(&nlp->nl_taskqueue, 1, PWAIT,
+ "netlink_socket (PID %u)", nlp->nl_process_id);
+
+ /* Publish the pcb: from here on nl_send_group() can find it */
+ NLCTL_WLOCK(ctl);
+ /* XXX: check ctl is still alive */
+ CK_LIST_INSERT_HEAD(&ctl->ctl_pcb_head, nlp, nl_next);
+ NLCTL_WUNLOCK(ctl);
+
+ soisconnected(so);
+
+ return (0);
+}
+
+/*
+ * pr_abort handler: logs the call and marks @so as disconnected.
+ * The pcb itself is released in nl_pru_detach().
+ */
+static void
+nl_pru_abort(struct socket *so)
+{
+ NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+ MPASS(sotonlpcb(so) != NULL);
+ soisdisconnected(so);
+}
+
+/*
+ * pr_bind handler: binds @so to the port and groups given in @sa, which is
+ * interpreted as a struct sockaddr_nl.
+ *
+ * Returns EINVAL when the address length does not match
+ * sizeof(struct sockaddr_nl), otherwise the result of nl_bind_locked().
+ */
+static int
+nl_pru_bind(struct socket *so, struct sockaddr *sa, struct thread *td)
+{
+	struct nl_control *nl_ctl = atomic_load_ptr(&V_nl_ctl);
+	struct nlpcb *pcb = sotonlpcb(so);
+	struct sockaddr_nl *addr = (struct sockaddr_nl *)sa;
+	int rc;
+
+	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+
+	/* Refuse addresses of unexpected size */
+	if (addr->nl_len != sizeof(*addr)) {
+		NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so);
+		return (EINVAL);
+	}
+
+	/* Control write lock first, then the pcb lock */
+	NLCTL_WLOCK(nl_ctl);
+	NLP_LOCK(pcb);
+	rc = nl_bind_locked(pcb, addr);
+	NLP_UNLOCK(pcb);
+	NLCTL_WUNLOCK(nl_ctl);
+
+	NL_LOG(LOG_DEBUG2, "socket %p, bind() to %u, groups %u, error %d", so,
+	    addr->nl_pid, addr->nl_groups, rc);
+
+	return (rc);
+}
+
+
+/*
+ * Binds @nlp to @port_id while keeping its current group mask.
+ * Takes the control write lock and the pcb lock itself.
+ *
+ * Returns the result of nl_bind_locked().
+ */
+static int
+nl_assign_port(struct nlpcb *nlp, uint32_t port_id)
+{
+	struct nl_control *nl_ctl = atomic_load_ptr(&V_nl_ctl);
+	struct sockaddr_nl addr = {
+		.nl_pid = port_id,
+	};
+	int rc;
+
+	NLCTL_WLOCK(nl_ctl);
+	NLP_LOCK(nlp);
+	/* Pass the existing groups so that the bind leaves them untouched */
+	addr.nl_groups = nlp->nl_groups;
+	rc = nl_bind_locked(nlp, &addr);
+	NLP_UNLOCK(nlp);
+	NLCTL_WUNLOCK(nl_ctl);
+
+	NL_LOG(LOG_DEBUG3, "socket %p, port assign: %d, error: %d", nlp->nl_socket, port_id, rc);
+	return (rc);
+}
+
+/*
+ * nl_autobind_port binds an unused port id to @nlp.
+ * @nlp: pcb data for the netlink socket
+ * @candidate_id: first id to consider
+ *
+ * Tries up to 10 consecutive ids starting at @candidate_id. An id is first
+ * probed under the read lock and then claimed via nl_assign_port(); the
+ * claim can still fail with EADDRINUSE if another socket takes the id in
+ * between, in which case the next id is tried.
+ *
+ * Returns 0 on success, EADDRINUSE if all candidates were taken, or
+ * another error from nl_assign_port().
+ */
+static int
+nl_autobind_port(struct nlpcb *nlp, uint32_t candidate_id)
+{
+	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
+	uint32_t port_id = candidate_id;
+	NLCTL_TRACKER;
+	bool exist;
+	/*
+	 * Start from EADDRINUSE: if every candidate already exists,
+	 * nl_assign_port() is never called and this value is returned.
+	 */
+	int error = EADDRINUSE;
+
+	for (int i = 0; i < 10; i++) {
+		NL_LOG(LOG_DEBUG3, "socket %p, trying to assign port %d", nlp->nl_socket, port_id);
+		NLCTL_RLOCK(ctl);
+		exist = nl_port_lookup(port_id) != NULL;
+		NLCTL_RUNLOCK(ctl);
+		if (!exist) {
+			error = nl_assign_port(nlp, port_id);
+			if (error != EADDRINUSE)
+				break;
+		}
+		port_id++;
+	}
+	NL_LOG(LOG_DEBUG3, "socket %p, autobind to %d, error: %d", nlp->nl_socket, port_id, error);
+	return (error);
+}
+
+/*
+ * pr_connect handler. The peer address in @sa is only validated for size
+ * and logged; an unbound socket is auto-bound to a port derived from the
+ * caller's pid, then the socket is marked connected.
+ *
+ * Returns 0 on success, EINVAL on wrong address length, or the error from
+ * nl_autobind_port().
+ */
+static int
+nl_pru_connect(struct socket *so, struct sockaddr *sa, struct thread *td)
+{
+	struct sockaddr_nl *snl = (struct sockaddr_nl *)sa;
+	struct nlpcb *nlp;
+
+	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+	if (snl->nl_len != sizeof(*snl)) {
+		/* The message names connect(): this is not the bind() path */
+		NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring connect()", so);
+		return (EINVAL);
+	}
+
+	nlp = sotonlpcb(so);
+	if (!nlp->nl_bound) {
+		int error = nl_autobind_port(nlp, td->td_proc->p_pid);
+		if (error != 0) {
+			NL_LOG(LOG_DEBUG, "socket %p, nl_autobind() failed: %d", so, error);
+			return (error);
+		}
+	}
+	/* XXX: Handle socket flags & multicast */
+	soisconnected(so);
+
+	NL_LOG(LOG_DEBUG2, "socket %p, connect to %u", so, snl->nl_pid);
+
+	return (0);
+}
+
+/*
+ * Final destruction of a pcb: releases its I/O state via nl_free_io(),
+ * destroys the pcb lock and frees the memory.
+ * NOTE(review): the lock is still held when NLP_LOCK_DESTROY() runs --
+ * confirm the underlying lock type allows destroying a held lock.
+ */
+static void
+destroy_nlpcb(struct nlpcb *nlp)
+{
+ NLP_LOCK(nlp);
+ nl_free_io(nlp);
+ NLP_LOCK_DESTROY(nlp);
+ free(nlp, M_PCB);
+}
+
+/*
+ * Epoch callback scheduled by nl_pru_detach(): maps the embedded epoch
+ * context back to its pcb and destroys that pcb.
+ */
+static void
+destroy_nlpcb_epoch(epoch_context_t ctx)
+{
+	destroy_nlpcb(__containerof(ctx, struct nlpcb, nl_epoch_ctx));
+}
+
+
+/*
+ * pr_detach handler: tears down the pcb attached to @so.
+ *
+ * Order matters: the pcb is first marked inactive, then its taskqueue is
+ * drained and freed, then the pcb is unlinked from the per-VNET lists and
+ * finally handed to epoch_call() for deferred destruction.
+ */
+static void
+nl_pru_detach(struct socket *so)
+{
+ struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
+ MPASS(sotonlpcb(so) != NULL);
+ struct nlpcb *nlp;
+
+ NL_LOG(LOG_DEBUG2, "detaching socket %p, PID %d", so, curproc->p_pid);
+ nlp = sotonlpcb(so);
+
+ /* Mark as inactive so no new work can be enqueued */
+ NLP_LOCK(nlp);
+ bool was_bound = nlp->nl_bound;
+ nlp->nl_active = false;
+ NLP_UNLOCK(nlp);
+
+ /* Wait till all scheduled work has been completed */
+ taskqueue_drain_all(nlp->nl_taskqueue);
+ taskqueue_free(nlp->nl_taskqueue);
+
+ NLCTL_WLOCK(ctl);
+ NLP_LOCK(nlp);
+ /* Only bound sockets were ever inserted into the port list */
+ if (was_bound) {
+ CK_LIST_REMOVE(nlp, nl_port_next);
+ NL_LOG(LOG_DEBUG3, "socket %p, unlinking bound pid %u", so, nlp->nl_port);
+ }
+ CK_LIST_REMOVE(nlp, nl_next);
+ /* nl_send_group() checks nl_socket for NULL before waking the socket */
+ nlp->nl_socket = NULL;
+ NLP_UNLOCK(nlp);
+ NLCTL_WUNLOCK(ctl);
+
+ so->so_pcb = NULL;
+
+ NL_LOG(LOG_DEBUG3, "socket %p, detached", so);
+
+ /* XXX: is delayed free needed? */
+ epoch_call(net_epoch_preempt, destroy_nlpcb_epoch, &nlp->nl_epoch_ctx);
+}
+
+/*
+ * pr_disconnect handler: not supported, always returns ENOTCONN.
+ */
+static int
+nl_pru_disconnect(struct socket *so)
+{
+ NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+ MPASS(sotonlpcb(so) != NULL);
+ return (ENOTCONN);
+}
+
+/*
+ * pr_peeraddr handler: no peer address is reported; @sa is left untouched
+ * and ENOTCONN is always returned.
+ */
+static int
+nl_pru_peeraddr(struct socket *so, struct sockaddr **sa)
+{
+ NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+ MPASS(sotonlpcb(so) != NULL);
+ return (ENOTCONN);
+}
+
+/*
+ * pr_shutdown handler: marks @so as unable to send more data via
+ * socantsendmore(). Always returns 0.
+ */
+static int
+nl_pru_shutdown(struct socket *so)
+{
+ NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+ MPASS(sotonlpcb(so) != NULL);
+ socantsendmore(so);
+ return (0);
+}
+
+/*
+ * pr_sockaddr handler: returns the local address of @so in @sa as a newly
+ * allocated (M_SONAME) struct sockaddr_nl carrying the bound port id.
+ * The remaining fields are left zeroed. Always returns 0.
+ */
+static int
+nl_pru_sockaddr(struct socket *so, struct sockaddr **sa)
+{
+	struct sockaddr_nl *addr;
+
+	addr = malloc(sizeof(*addr), M_SONAME, M_WAITOK | M_ZERO);
+	/* TODO: set other fields */
+	addr->nl_family = AF_NETLINK;
+	addr->nl_len = sizeof(*addr);
+	addr->nl_pid = sotonlpcb(so)->nl_port;
+
+	*sa = (struct sockaddr *)addr;
+	return (0);
+}
+
+/*
+ * pr_close handler: logs the call and marks @so as disconnected.
+ */
+static void
+nl_pru_close(struct socket *so)
+{
+ NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+ MPASS(sotonlpcb(so) != NULL);
+ soisdisconnected(so);
+}
+
+/*
+ * Queues message @m written to @so for asynchronous processing.
+ *
+ * Makes sure the first mbuf holds at least a full struct nlmsghdr
+ * (pulling data up if necessary) before handing the chain to
+ * nl_receive_async().
+ *
+ * Returns 0 on success, ENOBUFS when @m is NULL or m_pullup() fails.
+ */
+static int
+nl_pru_output(struct mbuf *m, struct socket *so, ...)
+{
+
+	if (__predict_false(m == NULL))
+		return (ENOBUFS);
+
+	if (__predict_false(m->m_len < sizeof(struct nlmsghdr))) {
+		m = m_pullup(m, sizeof(struct nlmsghdr));
+		if (m == NULL)
+			return (ENOBUFS);
+	}
+	MPASS((m->m_flags & M_PKTHDR) != 0);
+
+	NL_LOG(LOG_DEBUG3, "sending message to kernel async processing");
+	nl_receive_async(m, so);
+	return (0);
+}
+
+
+/*
+ * pr_send handler: passes @m to nl_pru_output().
+ *
+ * Control data is not supported: an empty @control mbuf is silently
+ * dropped, a non-empty one fails the call with EINVAL. @flags, @sa and @td
+ * are not used.
+ *
+ * Both @m and @control are consumed on every path. The socket layer does
+ * not free the data mbuf after calling pr_send, so the EINVAL path has to
+ * release @m itself.
+ *
+ * Returns 0 on success, EINVAL for non-empty control data, or the error
+ * from nl_pru_output().
+ */
+static int
+nl_pru_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *sa,
+    struct mbuf *control, struct thread *td)
+{
+	NL_LOG(LOG_DEBUG2, "sending message to kernel");
+
+	if (__predict_false(control != NULL)) {
+		bool has_data = (control->m_len != 0);
+
+		m_freem(control);
+		if (has_data) {
+			/* Do not leak the payload when rejecting the request */
+			m_freem(m);
+			return (EINVAL);
+		}
+	}
+
+	return (nl_pru_output(m, so));
+}
+
+/*
+ * pr_rcvd handler: notifies the netlink I/O code through nl_on_transmit()
+ * on the socket's pcb. Always returns 0.
+ */
+static int
+nl_pru_rcvd(struct socket *so, int flags)
+{
+	struct nlpcb *nlp;
+
+	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+
+	nlp = sotonlpcb(so);
+	MPASS(nlp != NULL);
+	nl_on_transmit(nlp);
+
+	return (0);
+}
+
+/*
+ * Maps a boolean netlink socket option name to the matching NLF_* pcb
+ * flag. Returns 0 for any other option name.
+ */
+static int
+nl_getoptflag(int sopt_name)
+{
+	int flag;
+
+	switch (sopt_name) {
+	case NETLINK_CAP_ACK:
+		flag = NLF_CAP_ACK;
+		break;
+	case NETLINK_EXT_ACK:
+		flag = NLF_EXT_ACK;
+		break;
+	case NETLINK_GET_STRICT_CHK:
+		flag = NLF_STRICT;
+		break;
+	default:
+		flag = 0;
+		break;
+	}
+
+	return (flag);
+}
+
+/*
+ * pr_ctloutput handler: implements the netlink-level socket options.
+ *
+ * SOPT_SET:
+ *   NETLINK_ADD_MEMBERSHIP / NETLINK_DROP_MEMBERSHIP - set or clear one
+ *     group bit; the group number must be in the range 1..63.
+ *   NETLINK_CAP_ACK / NETLINK_EXT_ACK / NETLINK_GET_STRICT_CHK - set or
+ *     clear the corresponding pcb flag.
+ * SOPT_GET:
+ *   NETLINK_LIST_MEMBERSHIPS - returns the group mask as an int (so only
+ *     the low bits of the 64-bit mask are reported).
+ *   NETLINK_CAP_ACK / NETLINK_EXT_ACK / NETLINK_GET_STRICT_CHK - returns 0
+ *     or 1.
+ *
+ * Returns 0 on success, ERANGE for an invalid group number, ENOPROTOOPT
+ * for unknown options, or the error from sooptcopyin()/sooptcopyout().
+ */
+static int
+nl_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
+	struct nlpcb *nlp = sotonlpcb(so);
+	uint32_t flag;
+	uint64_t groups, group_mask;
+	int optval, error = 0;
+	NLCTL_TRACKER;
+
+	NL_LOG(LOG_DEBUG2, "%ssockopt(%p, %d)", (sopt->sopt_dir) ? "set" : "get",
+	    so, sopt->sopt_name);
+
+	switch (sopt->sopt_dir) {
+	case SOPT_SET:
+		switch (sopt->sopt_name) {
+		case NETLINK_ADD_MEMBERSHIP:
+		case NETLINK_DROP_MEMBERSHIP:
+			/* optval is only valid if the copy-in succeeded */
+			error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
+			if (error != 0)
+				break;
+			if (optval <= 0 || optval >= 64) {
+				error = ERANGE;
+				break;
+			}
+			group_mask = (uint64_t)1 << (optval - 1);
+			NL_LOG(LOG_DEBUG2, "ADD/DEL group %d mask (%lX)", optval, group_mask);
+
+			NLCTL_WLOCK(ctl);
+			if (sopt->sopt_name == NETLINK_ADD_MEMBERSHIP)
+				groups = nlp->nl_groups | group_mask;
+			else
+				groups = nlp->nl_groups & ~group_mask;
+			nl_update_groups_locked(nlp, groups);
+			NLCTL_WUNLOCK(ctl);
+			break;
+		case NETLINK_CAP_ACK:
+		case NETLINK_EXT_ACK:
+		case NETLINK_GET_STRICT_CHK:
+			/* Do not toggle flags based on an uninitialised value */
+			error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
+			if (error != 0)
+				break;
+
+			flag = nl_getoptflag(sopt->sopt_name);
+
+			NLCTL_WLOCK(ctl);
+			if (optval != 0)
+				nlp->nl_flags |= flag;
+			else
+				nlp->nl_flags &= ~flag;
+			NLCTL_WUNLOCK(ctl);
+			break;
+		default:
+			error = ENOPROTOOPT;
+		}
+		break;
+	case SOPT_GET:
+		switch (sopt->sopt_name) {
+		case NETLINK_LIST_MEMBERSHIPS:
+			NLCTL_RLOCK(ctl);
+			optval = nlp->nl_groups;
+			NLCTL_RUNLOCK(ctl);
+			error = sooptcopyout(sopt, &optval, sizeof(optval));
+			break;
+		case NETLINK_CAP_ACK:
+		case NETLINK_EXT_ACK:
+		case NETLINK_GET_STRICT_CHK:
+			NLCTL_RLOCK(ctl);
+			optval = (nlp->nl_flags & nl_getoptflag(sopt->sopt_name)) != 0;
+			NLCTL_RUNLOCK(ctl);
+			error = sooptcopyout(sopt, &optval, sizeof(optval));
+			break;
+		default:
+			error = ENOPROTOOPT;
+		}
+		break;
+	default:
+		error = ENOPROTOOPT;
+	}
+
+	return (error);
+}
+
+/*
+ * pr_setsbopt handler. Every option except SO_RCVBUF is forwarded to
+ * sbsetopt(). For SO_RCVBUF the receive buffer is reserved against the
+ * netlink-specific limit nl_maxsockbuf; requesting more than sb_max_adj
+ * additionally requires PRIV_NET_ROUTE.
+ *
+ * Returns 0 on success, EPERM when the privilege check fails, ENOBUFS when
+ * the reservation fails, or the error from sooptcopyin().
+ */
+static int
+nl_setsbopt(struct socket *so, struct sockopt *sopt)
+{
+	int optval, error;
+	bool reserved;
+
+	if (sopt->sopt_name != SO_RCVBUF)
+		return (sbsetopt(so, sopt));
+
+	/* Allow to override max buffer size in certain conditions */
+
+	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
+	if (error != 0)
+		return (error);
+	NL_LOG(LOG_DEBUG2, "socket %p, PID %d, SO_RCVBUF=%d", so, curproc->p_pid, optval);
+
+	/* Sizes above the generic limit are for privileged callers only */
+	if (optval > sb_max_adj && priv_check(curthread, PRIV_NET_ROUTE) != 0)
+		return (EPERM);
+
+	SOCK_RECVBUF_LOCK(so);
+	reserved = sbreserve_locked_limit(so, SO_RCV, optval, nl_maxsockbuf, curthread);
+	SOCK_RECVBUF_UNLOCK(so);
+
+	if (!reserved)
+		return (ENOBUFS);
+	return (0);
+}
+
+/*
+ * Protocol switch for netlink sockets: a SOCK_RAW protocol whose handlers
+ * are the nl_pru_*(), nl_ctloutput() and nl_setsbopt() functions defined
+ * in this file.
+ */
+static struct protosw netlinksw = {
+ .pr_type = SOCK_RAW,
+ .pr_flags = PR_ATOMIC | PR_ADDR | PR_WANTRCVD,
+ .pr_ctloutput = nl_ctloutput,
+ .pr_setsbopt = nl_setsbopt,
+ .pr_abort = nl_pru_abort,
+ .pr_attach = nl_pru_attach,
+ .pr_bind = nl_pru_bind,
+ .pr_connect = nl_pru_connect,
+ .pr_detach = nl_pru_detach,
+ .pr_disconnect = nl_pru_disconnect,
+ .pr_peeraddr = nl_pru_peeraddr,
+ .pr_send = nl_pru_send,
+ .pr_rcvd = nl_pru_rcvd,
+ .pr_shutdown = nl_pru_shutdown,
+ .pr_sockaddr = nl_pru_sockaddr,
+ .pr_close = nl_pru_close
+};
+
+/*
+ * PF_NETLINK domain description: a single protocol switch entry
+ * (netlinksw), flagged as unloadable. DOMAIN_SET() hooks it into the
+ * kernel's domain list.
+ */
+static struct domain netlinkdomain = {
+ .dom_family = PF_NETLINK,
+ .dom_name = "netlink",
+ .dom_flags = DOMF_UNLOADABLE,
+ .dom_nprotosw = 1,
+ .dom_protosw = { &netlinksw },
+};
+
+DOMAIN_SET(netlink);
diff --git a/sys/netlink/netlink_generic.c b/sys/netlink/netlink_generic.c
new file mode 100644
index 000000000000..d422416cd9b4
--- /dev/null
+++ b/sys/netlink/netlink_generic.c
@@ -0,0 +1,472 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/priv.h>
+#include <sys/socket.h>
+#include <sys/ck.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+#include <netlink/netlink_generic.h>
+
+#define DEBUG_MOD_NAME nl_generic
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG3);
+
+/* Sizes of the static family and group tables */
+#define MAX_FAMILIES 20
+#define MAX_GROUPS 20
+
+/* Group ids are handed out as <slot index> + MIN_GROUP_NUM */
+#define MIN_GROUP_NUM 48
+
+/* Single exclusive sx lock used by the registration functions below */
+static struct sx sx_lock;
+
+#define GENL_LOCK_INIT() sx_init(&sx_lock, "genetlink lock")
+#define GENL_LOCK_DESTROY() sx_destroy(&sx_lock)
+#define GENL_LOCK() sx_xlock(&sx_lock)
+#define GENL_UNLOCK() sx_xunlock(&sx_lock)
+
+/*
+ * Registered generic netlink family. A slot in families[] is free when its
+ * family_name is NULL.
+ */
+struct genl_family {
+ const char *family_name; /* caller-owned name, not copied */
+ uint16_t family_hdrsize; /* reported as CTRL_ATTR_HDRSIZE */
+ uint16_t family_id; /* slot index + GENL_MIN_ID */
+ uint16_t family_version; /* reported as CTRL_ATTR_VERSION */
+ uint16_t family_attr_max; /* reported as CTRL_ATTR_MAXATTR */
+ uint16_t family_cmd_size; /* number of entries in family_cmds */
+ uint16_t family_num_groups; /* groups registered for the family */
+ struct genl_cmd *family_cmds; /* table indexed by cmd_num */
+};
+
+static struct genl_family families[MAX_FAMILIES];
+
+
+/*
+ * Registered multicast group. A slot in groups[] is free when its
+ * group_family is NULL.
+ */
+struct genl_group {
+ struct genl_family *group_family; /* owning family */
+ const char *group_name; /* caller-owned name, not copied */
+};
+static struct genl_group groups[MAX_GROUPS];
+
+
+static int dump_family(struct nlmsghdr *hdr, struct genlmsghdr *ghdr,
+ const struct genl_family *gf, struct nl_writer *nw);
+static void nlctrl_notify(const struct genl_family *gf, int action);
+
+/*
+ * Looks up a registered family by name.
+ * Returns the families[] entry whose name equals @family_name, or NULL.
+ */
+static struct genl_family *
+find_family(const char *family_name)
+{
+	for (int i = 0; i < MAX_FAMILIES; i++) {
+		struct genl_family *cand = &families[i];
+
+		/* Skip unused slots */
+		if (cand->family_name == NULL)
+			continue;
+		if (strcmp(cand->family_name, family_name) == 0)
+			return (cand);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Registers a generic netlink family.
+ *
+ * @family_name:    unique family name; the pointer is stored, not copied
+ * @hdrsize:        family header size, reported as CTRL_ATTR_HDRSIZE
+ * @family_version: family version, reported as CTRL_ATTR_VERSION
+ * @max_attr_idx:   highest attribute index, reported as CTRL_ATTR_MAXATTR
+ *
+ * Returns the allocated family id (slot index + GENL_MIN_ID), or 0 if the
+ * name is already registered or the table is full.
+ */
+uint32_t
+genl_register_family(const char *family_name, size_t hdrsize, int family_version,
+    int max_attr_idx)
+{
+	uint32_t family_id = 0;
+
+	MPASS(family_name != NULL);
+
+	GENL_LOCK();
+	/*
+	 * The duplicate check has to run under the lock. Otherwise two
+	 * concurrent registrations of the same name could both pass it.
+	 */
+	if (find_family(family_name) != NULL) {
+		GENL_UNLOCK();
+		return (0);
+	}
+
+	for (int i = 0; i < MAX_FAMILIES; i++) {
+		struct genl_family *gf = &families[i];
+		if (gf->family_name == NULL) {
+			gf->family_name = family_name;
+			gf->family_version = family_version;
+			gf->family_hdrsize = hdrsize;
+			gf->family_attr_max = max_attr_idx;
+			gf->family_id = i + GENL_MIN_ID;
+			NL_LOG(LOG_DEBUG2, "Registered family %s id %d",
+			    gf->family_name, gf->family_id);
+			family_id = gf->family_id;
+			nlctrl_notify(gf, CTRL_CMD_NEWFAMILY);
+			break;
+		}
+	}
+	GENL_UNLOCK();
+
+	return (family_id);
+}
+
+/*
+ * Releases the command table owned by @gf, if one was allocated.
+ * The family structure itself is not touched.
+ */
+static void
+free_family(struct genl_family *gf)
+{
+	if (gf->family_cmds == NULL)
+		return;
+	free(gf->family_cmds, M_NETLINK);
+}
+
+/*
+ * Unregisters the family named @family_name.
+ * May sleep: takes the exclusive sx lock.
+ *
+ * Sends a CTRL_CMD_DELFAMILY notification, releases the group slots and
+ * the command table of the family and clears its families[] slot.
+ *
+ * Returns true if the family was registered, false otherwise.
+ */
+bool
+genl_unregister_family(const char *family_name)
+{
+	bool found = false;
+
+	GENL_LOCK();
+	struct genl_family *gf = find_family(family_name);
+
+	if (gf != NULL) {
+		found = true;
+
+		/*
+		 * Notify only for a family that exists: the notification
+		 * path dereferences the family pointer.
+		 */
+		nlctrl_notify(gf, CTRL_CMD_DELFAMILY);
+
+		/*
+		 * Drop the groups that belong to the family. Left in place,
+		 * they would point at a cleared slot and be inherited by
+		 * the next family registered in it.
+		 */
+		for (int i = 0; i < MAX_GROUPS; i++) {
+			if (groups[i].group_family == gf)
+				bzero(&groups[i], sizeof(groups[i]));
+		}
+
+		/* TODO: zero pointer first */
+		free_family(gf);
+		bzero(gf, sizeof(*gf));
+	}
+	GENL_UNLOCK();
+
+	return (found);
+}
+
+/*
+ * Adds @count commands from @cmds to the family named @family_name.
+ *
+ * The per-family command table is indexed by cmd_num and is grown (by
+ * allocating a new zeroed table and copying the old entries) when a new
+ * command does not fit.
+ *
+ * Returns true on success, false if the family is not registered.
+ */
+bool
+genl_register_cmds(const char *family_name, const struct genl_cmd *cmds, int count)
+{
+	struct genl_family *gf;
+	int needed;
+
+	GENL_LOCK();
+	gf = find_family(family_name);
+	if (gf == NULL) {
+		GENL_UNLOCK();
+		return (false);
+	}
+
+	/* Find the table size required to hold every new command */
+	needed = gf->family_cmd_size;
+	for (int i = 0; i < count; i++) {
+		MPASS(cmds[i].cmd_cb != NULL);
+		if (cmds[i].cmd_num >= needed)
+			needed = cmds[i].cmd_num + 1;
+	}
+
+	if (needed > gf->family_cmd_size) {
+		/* need to realloc */
+		struct genl_cmd *old_tbl = gf->family_cmds;
+		struct genl_cmd *new_tbl;
+
+		new_tbl = malloc(needed * sizeof(struct genl_cmd), M_NETLINK,
+		    M_WAITOK | M_ZERO);
+		memcpy(new_tbl, old_tbl, gf->family_cmd_size * sizeof(struct genl_cmd));
+		gf->family_cmds = new_tbl;
+		gf->family_cmd_size = needed;
+		free(old_tbl, M_NETLINK);
+	}
+
+	/* Install the commands; a slot must not be registered twice */
+	for (int i = 0; i < count; i++) {
+		const struct genl_cmd *cmd = &cmds[i];
+
+		MPASS(gf->family_cmds[cmd->cmd_num].cmd_cb == NULL);
+		gf->family_cmds[cmd->cmd_num] = *cmd;
+		NL_LOG(LOG_DEBUG2, "Adding cmd %s(%d) to family %s",
+		    cmd->cmd_name, cmd->cmd_num, gf->family_name);
+	}
+	GENL_UNLOCK();
+	return (true);
+}
+
+/*
+ * Looks up the group named @group_name that belongs to family @gf.
+ * Returns the groups[] entry, or NULL if there is none.
+ */
+static struct genl_group *
+find_group(const struct genl_family *gf, const char *group_name)
+{
+	for (int i = 0; i < MAX_GROUPS; i++) {
+		struct genl_group *cand = &groups[i];
+
+		/* Only groups of the requested family are compared by name */
+		if (cand->group_family != gf)
+			continue;
+		if (strcmp(cand->group_name, group_name) == 0)
+			return (cand);
+	}
+	return (NULL);
+}
+
+/*
+ * Registers the multicast group @group_name for the family @family_name.
+ * The name pointer is stored, not copied.
+ *
+ * Returns the group id (slot index + MIN_GROUP_NUM), or 0 if the family is
+ * unknown, the group already exists or the group table is full.
+ */
+uint32_t
+genl_register_group(const char *family_name, const char *group_name)
+{
+	struct genl_family *gf;
+	uint32_t group_id = 0;
+
+	MPASS(family_name != NULL);
+	MPASS(group_name != NULL);
+
+	GENL_LOCK();
+	gf = find_family(family_name);
+	if (gf != NULL && find_group(gf, group_name) == NULL) {
+		/* Claim the first free slot */
+		for (int i = 0; i < MAX_GROUPS; i++) {
+			struct genl_group *slot = &groups[i];
+
+			if (slot->group_family != NULL)
+				continue;
+			gf->family_num_groups++;
+			slot->group_family = gf;
+			slot->group_name = group_name;
+			group_id = i + MIN_GROUP_NUM;
+			break;
+		}
+	}
+	GENL_UNLOCK();
+
+	return (group_id);
+}
+
+/*
+ * Handler called by netlink subsystem when matching netlink message is
+ * received.
+ *
+ * The message type selects the family (nlmsg_type - GENL_MIN_ID is the
+ * families[] index) and the cmd field of the generic header selects the
+ * command callback.
+ *
+ * Returns the callback's result, ENOTSUP for an unknown family or
+ * command, EINVAL for a message too short to carry a generic header, or
+ * EPERM when the command's privilege check fails.
+ */
+static int
+genl_handle_message(struct nlmsghdr *hdr, struct nl_pstate *npt)
+{
+	struct nlpcb *nlp = npt->nlp;
+	int error = 0;
+
+	int family_id = (int)hdr->nlmsg_type - GENL_MIN_ID;
+
+	/* Valid indices are 0 .. MAX_FAMILIES - 1 */
+	if (__predict_false(family_id < 0 || family_id >= MAX_FAMILIES)) {
+		NLP_LOG(LOG_DEBUG, nlp, "invalid message type: %d", hdr->nlmsg_type);
+		return (ENOTSUP);
+	}
+
+	/*
+	 * The message must hold the netlink header and the generic header.
+	 * sizeof(*hdr) is the header structure; sizeof(hdr) would only be
+	 * the size of a pointer.
+	 */
+	if (__predict_false(hdr->nlmsg_len < sizeof(*hdr) + GENL_HDRLEN)) {
+		NLP_LOG(LOG_DEBUG, nlp, "invalid message size: %d", hdr->nlmsg_len);
+		return (EINVAL);
+	}
+
+	struct genl_family *gf = &families[family_id];
+
+	struct genlmsghdr *ghdr = (struct genlmsghdr *)(hdr + 1);
+
+	/* An unused family slot has family_cmd_size == 0 and ends up here */
+	if (ghdr->cmd >= gf->family_cmd_size || gf->family_cmds[ghdr->cmd].cmd_cb == NULL) {
+		NLP_LOG(LOG_DEBUG, nlp, "family %s: invalid cmd %d",
+		    gf->family_name, ghdr->cmd);
+		return (ENOTSUP);
+	}
+
+	struct genl_cmd *cmd = &gf->family_cmds[ghdr->cmd];
+
+	if (cmd->cmd_priv != 0 && !nlp_has_priv(nlp, cmd->cmd_priv)) {
+		NLP_LOG(LOG_DEBUG, nlp, "family %s: cmd %d priv_check() failed",
+		    gf->family_name, ghdr->cmd);
+		return (EPERM);
+	}
+
+	NLP_LOG(LOG_DEBUG2, nlp, "received family %s cmd %s(%d) len %d",
+	    gf->family_name, cmd->cmd_name, ghdr->cmd, hdr->nlmsg_len);
+
+	error = cmd->cmd_cb(hdr, npt);
+
+	return (error);
+}
+
+/*
+ * Returns the flags to advertise for @cmd: its cmd_flags, with
+ * GENL_ADMIN_PERM added when the command requires a privilege.
+ */
+static uint32_t
+get_cmd_flags(const struct genl_cmd *cmd)
+{
+	if (cmd->cmd_priv == 0)
+		return (cmd->cmd_flags);
+	return (cmd->cmd_flags | GENL_ADMIN_PERM);
+}
+
+/*
+ * Writes one message describing the family @gf to the writer @nw.
+ *
+ * @hdr:  netlink header the reply is based on (passed to nlmsg_reply())
+ * @ghdr: supplies the cmd value for the generated generic header
+ *
+ * The message carries the family name, id, version, header size and max
+ * attribute, followed by a nested CTRL_ATTR_OPS list of the registered
+ * commands and a nested CTRL_ATTR_MCAST_GROUPS list of the family's
+ * groups.
+ *
+ * Returns 0 if nlmsg_end() succeeds; otherwise the message is aborted and
+ * ENOMEM is returned.
+ */
+static int
+dump_family(struct nlmsghdr *hdr, struct genlmsghdr *ghdr,
+ const struct genl_family *gf, struct nl_writer *nw)
+{
+ if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr)))
+ goto enomem;
+
+ /* NOTE(review): the result of nlmsg_reserve_object() is not checked here */
+ struct genlmsghdr *ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr);
+ ghdr_new->cmd = ghdr->cmd;
+ ghdr_new->version = gf->family_version;
+ ghdr_new->reserved = 0;
+
+ /* Return values of the nlattr_add_*() calls are not checked individually */
+ nlattr_add_string(nw, CTRL_ATTR_FAMILY_NAME, gf->family_name);
+ nlattr_add_u16(nw, CTRL_ATTR_FAMILY_ID, gf->family_id);
+ nlattr_add_u32(nw, CTRL_ATTR_VERSION, gf->family_version);
+ nlattr_add_u32(nw, CTRL_ATTR_HDRSIZE, gf->family_hdrsize);
+ nlattr_add_u32(nw, CTRL_ATTR_MAXATTR, gf->family_attr_max);
+
+ if (gf->family_cmd_size > 0) {
+ int off = nlattr_add_nested(nw, CTRL_ATTR_OPS);
+ if (off == 0)
+ goto enomem;
+ /* cnt numbers the nested entries from 1; unused table slots are skipped */
+ for (int i = 0, cnt=0; i < gf->family_cmd_size; i++) {
+ struct genl_cmd *cmd = &gf->family_cmds[i];
+ if (cmd->cmd_cb == NULL)
+ continue;
+ int cmd_off = nlattr_add_nested(nw, ++cnt);
+ if (cmd_off == 0)
+ goto enomem;
+
+ nlattr_add_u32(nw, CTRL_ATTR_OP_ID, cmd->cmd_num);
+ nlattr_add_u32(nw, CTRL_ATTR_OP_FLAGS, get_cmd_flags(cmd));
+ nlattr_set_len(nw, cmd_off);
+ }
+ nlattr_set_len(nw, off);
+ }
+ if (gf->family_num_groups > 0) {
+ int off = nlattr_add_nested(nw, CTRL_ATTR_MCAST_GROUPS);
+ if (off == 0)
+ goto enomem;
+ /* The group table is global: pick the entries owned by this family */
+ for (int i = 0, cnt = 0; i < MAX_GROUPS; i++) {
+ struct genl_group *gg = &groups[i];
+ if (gg->group_family != gf)
+ continue;
+
+ int cmd_off = nlattr_add_nested(nw, ++cnt);
+ if (cmd_off == 0)
+ goto enomem;
+ nlattr_add_u32(nw, CTRL_ATTR_MCAST_GRP_ID, i + MIN_GROUP_NUM);
+ nlattr_add_string(nw, CTRL_ATTR_MCAST_GRP_NAME, gg->group_name);
+ nlattr_set_len(nw, cmd_off);
+ }
+ nlattr_set_len(nw, off);
+ }
+ if (nlmsg_end(nw))
+ return (0);
+enomem:
+ NL_LOG(LOG_DEBUG, "unable to dump family %s state (ENOMEM)", gf->family_name);
+ nlmsg_abort(nw);
+ return (ENOMEM);
+}
+
+
+/* Declare ourselves as a user: the "nlctrl" family is registered via the same API */
+#define CTRL_FAMILY_NAME "nlctrl"
+
+/* Ids returned by genl_register_family()/genl_register_group() for nlctrl */
+static uint32_t ctrl_family_id;
+static uint32_t ctrl_group_id;
+
+/* Parsed form of a CTRL_CMD_GETFAMILY request; zero/NULL means "not given" */
+struct nl_parsed_family {
+ uint32_t family_id;
+ char *family_name;
+ uint8_t version;
+};
+
+/*
+ * Parser description for nlctrl requests: maps the generic header fields
+ * and the supported attributes onto struct nl_parsed_family.
+ */
+#define	_IN(_field)	offsetof(struct genlmsghdr, _field)
+#define	_OUT(_field)	offsetof(struct nl_parsed_family, _field)
+static const struct nlfield_parser nlf_p_generic[] = {
+	{ .off_in = _IN(version), .off_out = _OUT(version), .cb = nlf_get_u8 },
+};
+
+static struct nlattr_parser nla_p_generic[] = {
+	{ .type = CTRL_ATTR_FAMILY_ID , .off = _OUT(family_id), .cb = nlattr_get_uint32 },
+	/*
+	 * The name goes into family_name. Storing the string pointer at the
+	 * offset of family_id would clobber the id and leave family_name
+	 * NULL, so lookups by name would never filter anything.
+	 */
+	{ .type = CTRL_ATTR_FAMILY_NAME , .off = _OUT(family_name), .cb = nlattr_get_string },
+};
+#undef _IN
+#undef _OUT
+NL_DECLARE_PARSER(genl_parser, struct genlmsghdr, nlf_p_generic, nla_p_generic);
+
+/*
+ * CTRL_CMD_GETFAMILY handler: dumps every registered family that matches
+ * the optional family id and/or family name given in the request. With no
+ * filter all families are dumped.
+ *
+ * Returns 0 on success, the parser error, or the first dump_family()
+ * error.
+ */
+static int
+nlctrl_handle_getfamily(struct nlmsghdr *hdr, struct nl_pstate *npt)
+{
+	struct nl_parsed_family attrs = {};
+	struct genlmsghdr ghdr = {
+		.cmd = CTRL_CMD_NEWFAMILY,
+	};
+	int error;
+
+	error = nl_parse_nlmsg(hdr, &genl_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	/* Stop at the first family that cannot be written */
+	for (int i = 0; i < MAX_FAMILIES && error == 0; i++) {
+		struct genl_family *gf = &families[i];
+
+		if (gf->family_name == NULL)
+			continue;
+		if (attrs.family_id != 0 && attrs.family_id != gf->family_id)
+			continue;
+		if (attrs.family_name != NULL &&
+		    strcmp(attrs.family_name, gf->family_name) != 0)
+			continue;
+		error = dump_family(hdr, &ghdr, gf, npt->nw);
+	}
+
+	return (error);
+}
+
+/*
+ * Sends a notification about family @gf to the nlctrl multicast group
+ * (ctrl_group_id). @cmd becomes the cmd field of the generic header.
+ * A dump_family() failure is not reported to the caller.
+ */
+static void
+nlctrl_notify(const struct genl_family *gf, int cmd)
+{
+	struct nl_writer nw = {};
+	struct genlmsghdr ghdr = { .cmd = cmd };
+	struct nlmsghdr hdr = {.nlmsg_type = NETLINK_GENERIC };
+
+	if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_GENERIC, ctrl_group_id)) {
+		NL_LOG(LOG_DEBUG, "error allocating group writer");
+		return;
+	}
+
+	dump_family(&hdr, &ghdr, gf, &nw);
+	nlmsg_flush(&nw);
+}
+
+/*
+ * Commands implemented by the nlctrl family.
+ *
+ * All capability bits are OR-ed into cmd_flags. With a comma in place of
+ * the last '|', GENL_CMD_CAP_HASPOL would become a positional initializer
+ * for whichever member follows cmd_flags in struct genl_cmd.
+ */
+static const struct genl_cmd nlctrl_cmds[] = {
+	{
+		.cmd_num = CTRL_CMD_GETFAMILY,
+		.cmd_name = "GETFAMILY",
+		.cmd_cb = nlctrl_handle_getfamily,
+		.cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL,
+	},
+};
+
+/*
+ * Registers the built-in "nlctrl" family, its commands and its "notify"
+ * multicast group, and remembers the returned family and group ids.
+ */
+static void
+genl_nlctrl_init(void)
+{
+	ctrl_family_id = genl_register_family(CTRL_FAMILY_NAME, 0, 2, CTRL_ATTR_MAX);
+	genl_register_cmds(CTRL_FAMILY_NAME, nlctrl_cmds, NL_ARRAY_LEN(nlctrl_cmds));
+	ctrl_group_id = genl_register_group(CTRL_FAMILY_NAME, "notify");
+}
+
+/*
+ * Unregisters the built-in "nlctrl" family.
+ */
+static void
+genl_nlctrl_destroy(void)
+{
+	genl_unregister_family(CTRL_FAMILY_NAME);
+}
+
+/* Parsers passed to NL_VERIFY_PARSERS() at load time */
+static const struct nlhdr_parser *all_parsers[] = { &genl_parser };
+
+/*
+ * Load-time initialisation: sets up the lock, verifies the parsers,
+ * registers NETLINK_GENERIC with the netlink core and registers the nlctrl
+ * family.
+ */
+static void
+genl_load(void *u __unused)
+{
+ GENL_LOCK_INIT();
+ NL_VERIFY_PARSERS(all_parsers);
+ netlink_register_proto(NETLINK_GENERIC, "NETLINK_GENERIC", genl_handle_message);
+ genl_nlctrl_init();
+}
+SYSINIT(genl_load, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, genl_load, NULL);
+
+/*
+ * Unload-time teardown: unregisters the nlctrl family, destroys the lock
+ * and waits for the network epoch to drain.
+ * NOTE(review): the lock is destroyed before epoch_wait_preempt() returns
+ * -- confirm no epoch-protected path can still take it.
+ */
+static void
+genl_unload(void *u __unused)
+{
+ genl_nlctrl_destroy();
+ GENL_LOCK_DESTROY();
+ epoch_wait_preempt(net_epoch_preempt);
+}
+SYSUNINIT(genl_unload, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, genl_unload, NULL);
diff --git a/sys/netlink/netlink_generic.h b/sys/netlink/netlink_generic.h
new file mode 100644
index 000000000000..9b411a67ab2a
--- /dev/null
+++ b/sys/netlink/netlink_generic.h
@@ -0,0 +1,112 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Generic netlink message header and attributes
+ */
+#ifndef _NETLINK_NETLINK_GENERIC_H_
+#define _NETLINK_NETLINK_GENERIC_H_
+
+/* Base header for all of the relevant messages */
+struct genlmsghdr {
+ uint8_t cmd; /* CTRL_CMD_ */
+ uint8_t version; /* ABI version for the cmd */
+ uint16_t reserved; /* reserved: set to 0 */
+};
+#define GENL_HDRLEN NL_ITEM_ALIGN(sizeof(struct genlmsghdr))
+
+/* Dynamic family number range, inclusive */
+#define GENL_MIN_ID NLMSG_MIN_TYPE
+#define GENL_MAX_ID 1023
+
+/* Pre-defined family numbers */
+#define GENL_ID_CTRL GENL_MIN_ID
+
+/* Available commands */
+enum {
+ CTRL_CMD_UNSPEC = 0,
+ CTRL_CMD_NEWFAMILY = 1,
+ CTRL_CMD_DELFAMILY = 2,
+ CTRL_CMD_GETFAMILY = 3, /* lists all (or matching) genetlink families */
+ CTRL_CMD_NEWOPS = 4,
+ CTRL_CMD_DELOPS = 5,
+ CTRL_CMD_GETOPS = 6,
+ CTRL_CMD_NEWMCAST_GRP = 7,
+ CTRL_CMD_DELMCAST_GRP = 8,
+ CTRL_CMD_GETMCAST_GRP = 9,
+ CTRL_CMD_GETPOLICY = 10,
+ __CTRL_CMD_MAX,
+};
+#define CTRL_CMD_MAX (__CTRL_CMD_MAX - 1)
+
+/* Generic attributes */
+enum {
+ CTRL_ATTR_UNSPEC,
+ CTRL_ATTR_FAMILY_ID = 1, /* u16, dynamically-assigned ID */
+ CTRL_ATTR_FAMILY_NAME = 2, /* string, family name */
+ CTRL_ATTR_VERSION = 3, /* u32, command version */
+ CTRL_ATTR_HDRSIZE = 4, /* u32, family header size */
+ CTRL_ATTR_MAXATTR = 5, /* u32, maximum family attr # */
+ CTRL_ATTR_OPS = 6, /* nested, available operations */
+ CTRL_ATTR_MCAST_GROUPS = 7,
+ CTRL_ATTR_POLICY = 8,
+ CTRL_ATTR_OP_POLICY = 9,
+ CTRL_ATTR_OP = 10,
+ __CTRL_ATTR_MAX,
+};
+#define CTRL_ATTR_MAX (__CTRL_ATTR_MAX - 1)
+
+#define GENL_NAMSIZ 16 /* max family name length including \0 */
+
+/* CTRL_ATTR_OPS attributes */
+enum {
+ CTRL_ATTR_OP_UNSPEC,
+ CTRL_ATTR_OP_ID = 1, /* u32, operation # */
+ CTRL_ATTR_OP_FLAGS = 2, /* u32, flags-based op description */
+ __CTRL_ATTR_OP_MAX,
+};
+#define CTRL_ATTR_OP_MAX (__CTRL_ATTR_OP_MAX - 1)
+
+/* CTRL_ATTR_OP_FLAGS values */
+#define GENL_ADMIN_PERM 0x0001 /* Requires elevated permissions */
+#define GENL_CMD_CAP_DO 0x0002 /* Operation is a modification request */
+#define GENL_CMD_CAP_DUMP 0x0004 /* Operation is a get/dump request */
+#define GENL_CMD_CAP_HASPOL 0x0008 /* Operation has a validation policy */
+#define GENL_UNS_ADMIN_PERM 0x0010
+
+/* CTRL_ATTR_MCAST_GROUPS attributes */
+enum {
+	CTRL_ATTR_MCAST_GRP_UNSPEC,
+	CTRL_ATTR_MCAST_GRP_NAME,	/* string, group name */
+	CTRL_ATTR_MCAST_GRP_ID,		/* u32, dynamically-assigned group id */
+	__CTRL_ATTR_MCAST_GRP_MAX,
+};
+/* Highest valid attribute number, derived from the enum sentinel above */
+#define CTRL_ATTR_MCAST_GRP_MAX (__CTRL_ATTR_MCAST_GRP_MAX - 1)
+
+
+#endif
+
diff --git a/sys/netlink/netlink_io.c b/sys/netlink/netlink_io.c
new file mode 100644
index 000000000000..ef1c2c73a10e
--- /dev/null
+++ b/sys/netlink/netlink_io.c
@@ -0,0 +1,528 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/ck.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/syslog.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_linux.h>
+#include <netlink/netlink_var.h>
+
+#define DEBUG_MOD_NAME nl_io
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+/*
+ * The logic below provides a p2p interface for receiving and
+ * sending netlink data between the kernel and userland.
+ */
+
+static const struct sockaddr_nl _nl_empty_src = {
+ .nl_len = sizeof(struct sockaddr_nl),
+ .nl_family = PF_NETLINK,
+ .nl_pid = 0 /* comes from the kernel */
+};
+static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src;
+
+static struct mbuf *nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp);
+
+
+/*
+ * Appends every packet of the m_nextpkt-linked list @mq to the tail of
+ * @q, adding each packet's length to the queue byte counter.
+ */
+static void
+queue_push(struct nl_io_queue *q, struct mbuf *mq)
+{
+	struct mbuf *pkt, *next;
+
+	for (pkt = mq; pkt != NULL; pkt = next) {
+		/* Detach the packet from the list before linking it into @q */
+		next = pkt->m_nextpkt;
+		pkt->m_nextpkt = NULL;
+
+		q->length += m_length(pkt, NULL);
+		STAILQ_INSERT_TAIL(&q->head, pkt, m_stailqpkt);
+	}
+}
+
+/*
+ * Inserts the single packet @m at the head of @q and adds its length to
+ * the queue byte counter. @m must not be linked to another packet
+ * (asserted below).
+ */
+static void
+queue_push_head(struct nl_io_queue *q, struct mbuf *m)
+{
+ MPASS(m->m_nextpkt == NULL);
+
+ q->length += m_length(m, NULL);
+ STAILQ_INSERT_HEAD(&q->head, m, m_stailqpkt);
+}
+
+/*
+ * Removes and returns the first packet of @q, subtracting its length
+ * from the queue byte counter. Returns NULL if the queue has no packets.
+ */
+static struct mbuf *
+queue_pop(struct nl_io_queue *q)
+{
+	struct mbuf *first = STAILQ_FIRST(&q->head);
+
+	if (first == NULL)
+		return (NULL);
+
+	STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
+	first->m_nextpkt = NULL;
+	q->length -= m_length(first, NULL);
+
+	return (first);
+}
+
+/* Returns the first packet of @q without removing it from the queue. */
+static struct mbuf *
+queue_head(const struct nl_io_queue *q)
+{
+ return (STAILQ_FIRST(&q->head));
+}
+
+/*
+ * Reports whether @q holds no data. The decision is based on the queue
+ * byte counter, not on the packet list itself.
+ */
+static inline bool
+queue_empty(const struct nl_io_queue *q)
+{
+ return (q->length == 0);
+}
+
+/* Frees every packet held in @q and resets its byte counter to zero. */
+static void
+queue_free(struct nl_io_queue *q)
+{
+	struct mbuf *pkt;
+
+	while ((pkt = STAILQ_FIRST(&q->head)) != NULL) {
+		STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
+		pkt->m_nextpkt = NULL;
+		m_freem(pkt);
+	}
+	q->length = 0;
+}
+
+
+/*
+ * Enqueues the processing task of @nlp unless it is already marked as
+ * pending; nl_task_pending is set here to collapse repeated requests.
+ */
+static void
+nl_schedule_taskqueue(struct nlpcb *nlp)
+{
+	if (nlp->nl_task_pending) {
+		NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped");
+		return;
+	}
+
+	nlp->nl_task_pending = true;
+	taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
+	NL_LOG(LOG_DEBUG3, "taskqueue scheduled");
+}
+
+/*
+ * Accepts data written to a netlink socket for asynchronous processing.
+ *
+ * @m: mbuf chain with the data; consumed in any case
+ * @so: socket the data was written to
+ *
+ * On an active socket the chain is appended to the socket send buffer
+ * and the processing task is scheduled. Otherwise the whole chain is
+ * freed.
+ *
+ * Returns 0 on success or EINVAL if the socket is not active.
+ */
+int
+nl_receive_async(struct mbuf *m, struct socket *so)
+{
+	struct nlpcb *nlp = sotonlpcb(so);
+	int error = 0;
+
+	m->m_nextpkt = NULL;
+
+	NLP_LOCK(nlp);
+
+	if ((__predict_true(nlp->nl_active))) {
+		/*
+		 * Log before appending: once sbappend() has taken the chain
+		 * the socket buffer owns it and @m must not be touched.
+		 */
+		NL_LOG(LOG_DEBUG3, "enqueue %u bytes", m_length(m, NULL));
+		sbappend(&so->so_snd, m, 0);
+		nl_schedule_taskqueue(nlp);
+	} else {
+		NL_LOG(LOG_DEBUG, "ignoring %u bytes on non-active socket",
+		    m_length(m, NULL));
+		/* Free the whole chain, not just its first mbuf */
+		m_freem(m);
+		error = EINVAL;
+	}
+
+	NLP_UNLOCK(nlp);
+
+	return (error);
+}
+
+/*
+ * Moves as many packets as possible from the TX overflow queue of @nlp
+ * to the socket receive buffer and wakes up the reader if anything was
+ * appended.
+ * NOTE(review): the "_locked" suffix suggests the caller holds the nlp
+ * lock - confirm.
+ *
+ * Returns true when the overflow queue is empty afterwards.
+ */
+static bool
+tx_check_locked(struct nlpcb *nlp)
+{
+	struct sockbuf *sb;
+	struct mbuf *m;
+	bool appended = false;
+
+	if (queue_empty(&nlp->tx_queue))
+		return (true);
+
+	sb = &nlp->nl_socket->so_rcv;
+	SOCKBUF_LOCK(sb);
+	for (;;) {
+		m = queue_head(&nlp->tx_queue);
+		/* Stop once the queue is drained or the append is refused */
+		if (m == NULL || sbappendaddr_locked(sb, nl_empty_src, m, NULL) == 0)
+			break;
+		/* The packet now belongs to the socket buffer */
+		queue_pop(&nlp->tx_queue);
+		appended = true;
+	}
+	SOCKBUF_UNLOCK(sb);
+
+	if (appended)
+		sorwakeup(nlp->nl_socket);
+
+	return (queue_empty(&nlp->tx_queue));
+}
+
+/*
+ * Runs one processing pass for @nlp: retries pushing the TX overflow
+ * queue to the socket, refills the internal RX queue from the socket
+ * send buffer when that queue is empty, then feeds the queued mbufs to
+ * nl_process_mbuf().
+ *
+ * Returns true if the caller should run another pass.
+ */
+static bool
+nl_process_received_one(struct nlpcb *nlp)
+{
+ bool reschedule = false;
+
+ NLP_LOCK(nlp);
+ nlp->nl_task_pending = false;
+
+ if (!tx_check_locked(nlp)) {
+ /* TX overflow queue still not empty, ignore RX */
+ NLP_UNLOCK(nlp);
+ return (false);
+ }
+
+ if (queue_empty(&nlp->rx_queue)) {
+ /*
+ * Grab all data we have from the socket TX queue
+ * and store it in the internal queue, so it can be worked on
+ * w/o holding socket lock.
+ */
+ struct sockbuf *sb = &nlp->nl_socket->so_snd;
+
+ SOCKBUF_LOCK(sb);
+ unsigned int avail = sbavail(sb);
+ if (avail > 0) {
+ NL_LOG(LOG_DEBUG3, "grabbed %u bytes", avail);
+ queue_push(&nlp->rx_queue, sbcut_locked(sb, avail));
+ }
+ SOCKBUF_UNLOCK(sb);
+ } else {
+ /* Schedule another pass to read from the socket queue */
+ reschedule = true;
+ }
+
+ /* Remember the TX peak so that a new peak can be logged below */
+ int prev_hiwat = nlp->tx_queue.hiwat;
+ NLP_UNLOCK(nlp);
+
+ while (!queue_empty(&nlp->rx_queue)) {
+ struct mbuf *m = queue_pop(&nlp->rx_queue);
+
+ m = nl_process_mbuf(m, nlp);
+ if (m != NULL) {
+ /* Part of the mbuf is unprocessed: put it back and stop this pass */
+ queue_push_head(&nlp->rx_queue, m);
+ reschedule = false;
+ break;
+ }
+ }
+ if (nlp->tx_queue.hiwat > prev_hiwat) {
+ NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat);
+
+ }
+
+ return (reschedule);
+}
+
+/*
+ * Repeats nl_process_received_one() for @nlp until it reports that no
+ * further pass is needed.
+ */
+static void
+nl_process_received(struct nlpcb *nlp)
+{
+ NL_LOG(LOG_DEBUG3, "taskqueue called");
+
+ while (nl_process_received_one(nlp))
+ ;
+}
+
+/* Initializes the RX and TX packet lists of @nlp as empty lists. */
+void
+nl_init_io(struct nlpcb *nlp)
+{
+ STAILQ_INIT(&nlp->rx_queue.head);
+ STAILQ_INIT(&nlp->tx_queue.head);
+}
+
+/* Frees every mbuf still held in the RX and TX queues of @nlp. */
+void
+nl_free_io(struct nlpcb *nlp)
+{
+ queue_free(&nlp->rx_queue);
+ queue_free(&nlp->tx_queue);
+}
+
+/*
+ * Called after some data have been read from the socket.
+ *
+ * Logs and resets the dropped bytes/messages counters of @nlp if any
+ * data was dropped, then schedules the processing task.
+ */
+void
+nl_on_transmit(struct nlpcb *nlp)
+{
+	NLP_LOCK(nlp);
+
+	struct socket *so = nlp->nl_socket;
+	if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) {
+		uint64_t dropped_bytes = nlp->nl_dropped_bytes;
+		uint64_t dropped_messages = nlp->nl_dropped_messages;
+		nlp->nl_dropped_bytes = 0;
+		nlp->nl_dropped_messages = 0;
+
+		struct sockbuf *sb = &so->so_rcv;
+		/*
+		 * uint64_t is not "unsigned long" on every platform, so
+		 * print the counters as uintmax_t with %ju.
+		 */
+		NLP_LOG(LOG_DEBUG, nlp,
+		    "socket RX overflowed, %ju messages (%ju bytes) dropped. "
+		    "bytes: [%u/%u] mbufs: [%u/%u]",
+		    (uintmax_t)dropped_messages, (uintmax_t)dropped_bytes,
+		    sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax);
+		/* TODO: send netlink message */
+	}
+
+	nl_schedule_taskqueue(nlp);
+	NLP_UNLOCK(nlp);
+}
+
+/*
+ * Taskqueue callback: @_arg is the struct nlpcb to work on. Runs
+ * nl_process_received() with the current vnet set to the vnet of the
+ * socket. @pending is not used.
+ */
+void
+nl_taskqueue_handler(void *_arg, int pending)
+{
+ struct nlpcb *nlp = (struct nlpcb *)_arg;
+
+ CURVNET_SET(nlp->nl_socket->so_vnet);
+ nl_process_received(nlp);
+ CURVNET_RESTORE();
+}
+
+/*
+ * Stores @m in the TX overflow queue of @nlp, marks the socket as
+ * TX-blocked and records a new high-water mark when the queue grows
+ * beyond the previous one.
+ */
+static __noinline void
+queue_push_tx(struct nlpcb *nlp, struct mbuf *m)
+{
+	struct nl_io_queue *txq = &nlp->tx_queue;
+
+	queue_push(txq, m);
+	nlp->nl_tx_blocked = true;
+
+	/* Track the peak size of the overflow queue */
+	if (txq->hiwat < txq->length)
+		txq->hiwat = txq->length;
+}
+
+/*
+ * Tries to send @m to the socket @nlp.
+ *
+ * @m: mbuf(s) to send. Consumed in any case.
+ * @nlp: socket to send to
+ * @num_messages: number of messages in @m
+ * @io_flags: combination of NL_IOF_* flags
+ *
+ * With NL_IOF_IGNORE_LIMIT, data that does not fit into the socket
+ * buffer is kept in the TX overflow queue instead of being dropped.
+ *
+ * Returns true on success.
+ * If no queue overruns happened, wakes up socket owner.
+ */
+bool
+nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags)
+{
+	bool untranslated = io_flags & NL_IOF_UNTRANSLATED;
+	bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT;
+	bool result = true;
+
+	IF_DEBUG_LEVEL(LOG_DEBUG2) {
+		struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
+		NLP_LOG(LOG_DEBUG2, nlp,
+		    "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X",
+		    m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len,
+		    io_flags);
+	}
+
+	if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) {
+		m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp);
+		if (m == NULL)
+			return (false);
+	}
+
+	NLP_LOCK(nlp);
+
+	if (__predict_false(nlp->nl_socket == NULL)) {
+		NLP_UNLOCK(nlp);
+		m_freem(m);
+		return (false);
+	}
+
+	if (!queue_empty(&nlp->tx_queue)) {
+		/* Older data is still waiting: do not send ahead of it */
+		if (ignore_limits) {
+			queue_push_tx(nlp, m);
+		} else {
+			/* Free the whole chain, not just its first mbuf */
+			m_freem(m);
+			result = false;
+		}
+		NLP_UNLOCK(nlp);
+		return (result);
+	}
+
+	struct socket *so = nlp->nl_socket;
+	if (sbappendaddr(&so->so_rcv, nl_empty_src, m, NULL) != 0) {
+		sorwakeup(so);
+		NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up");
+	} else {
+		if (ignore_limits) {
+			queue_push_tx(nlp, m);
+		} else {
+			/*
+			 * Store dropped data so it can be reported
+			 * on the next read
+			 */
+			nlp->nl_dropped_bytes += m_length(m, NULL);
+			nlp->nl_dropped_messages += num_messages;
+			NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
+			    nlp->nl_dropped_messages, num_messages,
+			    nlp->nl_dropped_bytes, m_length(m, NULL));
+			soroverflow(so);
+			m_freem(m);
+			result = false;
+		}
+	}
+	NLP_UNLOCK(nlp);
+
+	return (result);
+}
+
+/*
+ * Validates and dispatches a single netlink message.
+ *
+ * @hdr: header of the message to process
+ * @remaining_length: number of bytes available starting at @hdr
+ * @nlp: socket the message was received on
+ * @npt: per-message parsing state
+ *
+ * Returns EINVAL if the message length is inconsistent and 0 otherwise.
+ * Handler errors are reported to the sender through nlmsg_ack() and are
+ * not propagated to the caller.
+ */
+static int
+nl_receive_message(struct nlmsghdr *hdr, int remaining_length,
+    struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	nl_handler_f handler = nl_handlers[nlp->nl_proto].cb;
+	int error = 0;
+
+	NL_LOG(LOG_DEBUG2, "msg len: %d type: %d", hdr->nlmsg_len,
+	    hdr->nlmsg_type);
+
+	if (__predict_false(hdr->nlmsg_len > remaining_length)) {
+		NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d",
+		    hdr->nlmsg_len, remaining_length);
+		return (EINVAL);
+	} else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) {
+		NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len);
+		return (EINVAL);
+	}
+	/* Stamp each message with sender pid */
+	hdr->nlmsg_pid = nlp->nl_port;
+
+	npt->hdr = hdr;
+
+	if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) {
+		NL_LOG(LOG_DEBUG2, "handling message with msg type: %d",
+		    hdr->nlmsg_type);
+
+		if (nlp->nl_linux && linux_netlink_p != NULL) {
+			struct nlmsghdr *hdr_orig = hdr;
+			hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt);
+			if (hdr == NULL) {
+				/*
+				 * Translation failed: fall back to the original
+				 * header, the translated one is NULL and must
+				 * not be dereferenced.
+				 */
+				hdr = hdr_orig;
+				npt->hdr = hdr;
+				if (hdr->nlmsg_flags & NLM_F_ACK)
+					nlmsg_ack(nlp, EAGAIN, hdr, npt);
+				return (0);
+			}
+		}
+		error = handler(hdr, npt);
+		NL_LOG(LOG_DEBUG2, "retcode: %d", error);
+	}
+	/* Ack when requested, and always report errors other than EINTR */
+	if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) {
+		NL_LOG(LOG_DEBUG3, "ack");
+		nlmsg_ack(nlp, error, hdr, npt);
+		NL_LOG(LOG_DEBUG3, "done");
+	}
+
+	return (0);
+}
+
+/*
+ * Resets the per-message part of the parsing state: clears the scratch
+ * buffer through lb_clear() and drops the stored error code, error
+ * message, error offset and current header pointer.
+ */
+static void
+npt_clear(struct nl_pstate *npt)
+{
+ lb_clear(&npt->lb);
+ npt->error = 0;
+ npt->err_msg = NULL;
+ npt->err_off = 0;
+ npt->hdr = NULL;
+}
+
+/*
+ * Processes an incoming packet, which can contain multiple netlink messages
+ *
+ * The mbuf data is copied into a linear buffer and each message is
+ * passed to nl_receive_message() in turn.
+ *
+ * Returns NULL when @m has been freed (fully handled, or dropped because
+ * a writer or the buffer could not be allocated). If sending replies got
+ * blocked, returns @m trimmed by the number of bytes already handled so
+ * that the caller can resume later.
+ */
+static struct mbuf *
+nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp)
+{
+ int offset, buffer_length;
+ struct nlmsghdr *hdr;
+ char *buffer;
+ int error;
+
+ NL_LOG(LOG_DEBUG3, "RX netlink mbuf %p on %p", m, nlp->nl_socket);
+
+ struct nl_writer nw = {};
+ if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) {
+ m_freem(m);
+ NL_LOG(LOG_DEBUG, "error allocating socket writer");
+ return (NULL);
+ }
+
+ nlmsg_ignore_limit(&nw);
+ /* TODO: alloc this buf once for nlp */
+ int data_length = m_length(m, NULL);
+ /* Buffer layout: message data rounded up to 8 bytes, then scratch space */
+ buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE;
+ /* NOTE(review): presumably extra room for translated Linux messages - confirm */
+ if (nlp->nl_linux)
+ buffer_length += roundup2(data_length, 8);
+ buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO);
+ if (buffer == NULL) {
+ m_freem(m);
+ nlmsg_flush(&nw);
+ NL_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory",
+ buffer_length);
+ return (NULL);
+ }
+ m_copydata(m, 0, data_length, buffer);
+
+ /* The scratch area starts right after the (aligned) message data */
+ struct nl_pstate npt = {
+ .nlp = nlp,
+ .lb.base = &buffer[roundup2(data_length, 8)],
+ .lb.size = buffer_length - roundup2(data_length, 8),
+ .nw = &nw,
+ .strict = nlp->nl_flags & NLF_STRICT,
+ };
+
+ for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) {
+ hdr = (struct nlmsghdr *)&buffer[offset];
+ /* Save length prior to calling handler */
+ int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
+ NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", offset, data_length);
+ npt_clear(&npt);
+ error = nl_receive_message(hdr, data_length - offset, nlp, &npt);
+ offset += msglen;
+ /* Stop on a malformed message or when replies can no longer be sent */
+ if (__predict_false(error != 0 || nlp->nl_tx_blocked))
+ break;
+ }
+ NL_LOG(LOG_DEBUG3, "packet parsing done");
+ free(buffer, M_NETLINK);
+ nlmsg_flush(&nw);
+
+ if (nlp->nl_tx_blocked) {
+ NLP_LOCK(nlp);
+ nlp->nl_tx_blocked = false;
+ NLP_UNLOCK(nlp);
+ /* Drop the bytes already handled and hand the rest back */
+ m_adj(m, offset);
+ return (m);
+ } else {
+ m_freem(m);
+ return (NULL);
+ }
+}
diff --git a/sys/netlink/netlink_linux.h b/sys/netlink/netlink_linux.h
new file mode 100644
index 000000000000..8841624be070
--- /dev/null
+++ b/sys/netlink/netlink_linux.h
@@ -0,0 +1,54 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NETLINK_LINUX_VAR_H_
+#define _NETLINK_LINUX_VAR_H_
+
+/*
+ * The file contains headers for the bridge interface between
+ * linux[_common] module and the netlink module
+ */
+struct nlpcb;
+struct nl_pstate;
+
+/*
+ * Callback types of the bridge interface.
+ * NOTE(review): judging by the names, the *_to_linux callbacks convert
+ * outgoing data to the Linux format and msg_from_linux converts an
+ * incoming message from it - confirm against the provider.
+ */
+typedef struct mbuf *mbufs_to_linux_cb_t(int netlink_family, struct mbuf *m,
+ struct nlpcb *nlp);
+typedef struct mbuf *msgs_to_linux_cb_t(int netlink_family, char *buf, int data_length,
+ struct nlpcb *nlp);
+typedef struct nlmsghdr *msg_from_linux_cb_t(int netlink_family, struct nlmsghdr *hdr,
+ struct nl_pstate *npt);
+
+/* Set of callbacks supplied by a provider, one pointer per callback type */
+struct linux_netlink_provider {
+ mbufs_to_linux_cb_t *mbufs_to_linux;
+ msgs_to_linux_cb_t *msgs_to_linux;
+ msg_from_linux_cb_t *msg_from_linux;
+
+};
+
+extern struct linux_netlink_provider *linux_netlink_p;
+
+#endif
diff --git a/sys/netlink/netlink_message_parser.c b/sys/netlink/netlink_message_parser.c
new file mode 100644
index 000000000000..d33eddb800e4
--- /dev/null
+++ b/sys/netlink/netlink_message_parser.c
@@ -0,0 +1,472 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+
+#include <machine/stdarg.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+
+#include <net/route/route_ctl.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+#include <netlink/netlink_route.h>
+
+#define DEBUG_MOD_NAME nl_parser
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+/*
+ * Formats an error message into memory taken from the parser state and
+ * stores it in npt->err_msg.
+ *
+ * Returns false, leaving the state untouched, if a message is already
+ * stored or the buffer cannot be allocated; true otherwise.
+ */
+bool
+nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...)
+{
+	va_list args;
+	char *msg;
+
+	/* Only the first reported message is kept */
+	if (npt->err_msg != NULL)
+		return (false);
+
+	msg = npt_alloc(npt, NL_MAX_ERROR_BUF);
+	if (msg == NULL)
+		return (false);
+
+	va_start(args, fmt);
+	vsnprintf(msg, NL_MAX_ERROR_BUF, fmt, args);
+	va_end(args);
+
+	npt->err_msg = msg;
+	return (true);
+}
+
+/*
+ * Records @off as the error offset unless a non-zero offset has been
+ * recorded already. Returns true if the offset was stored.
+ */
+bool
+nlmsg_report_err_offset(struct nl_pstate *npt, uint32_t off)
+{
+	if (npt->err_off == 0) {
+		npt->err_off = off;
+		return (true);
+	}
+	return (false);
+}
+
+/*
+ * Binary search for the parser entry whose type equals @key.
+ *
+ * @ps: parser table, which has to be sorted by ascending type
+ * @pslen: number of entries in @ps
+ * @key: attribute type to look up
+ *
+ * Returns the matching entry or NULL if there is none.
+ */
+static const struct nlattr_parser *
+search_states(const struct nlattr_parser *ps, int pslen, int key)
+{
+	int left_i = 0, right_i = pslen - 1;
+
+	/* An empty table has no first/last entry to compare against */
+	if (pslen <= 0)
+		return (NULL);
+
+	if (key < ps[0].type || key > ps[pslen - 1].type)
+		return (NULL);
+
+	while (left_i + 1 < right_i) {
+		int mid_i = (left_i + right_i) / 2;
+		if (key < ps[mid_i].type)
+			right_i = mid_i;
+		else if (key > ps[mid_i].type)
+			left_i = mid_i + 1;
+		else
+			return (&ps[mid_i]);
+	}
+	/* At most two candidates are left at this point */
+	if (ps[left_i].type == key)
+		return (&ps[left_i]);
+	else if (ps[right_i].type == key)
+		return (&ps[right_i]);
+	return (NULL);
+}
+
+/*
+ * Walks the attribute list that starts at @nla_head and spans @len
+ * bytes. For each attribute whose type is found in the parser table @ps
+ * (@pslen entries) the entry's callback is invoked with a destination
+ * of @target plus the entry's offset. Attributes without a table entry
+ * are ignored.
+ *
+ * Returns 0 on success, EINVAL for an attribute shorter than its own
+ * header, or the first error returned by a callback. On failure the
+ * offset of the offending attribute relative to npt->hdr is reported.
+ */
+int
+nl_parse_attrs_raw(struct nlattr *nla_head, int len, const struct nlattr_parser *ps, int pslen,
+ struct nl_pstate *npt, void *target)
+{
+ struct nlattr *nla = NULL;
+ int error = 0;
+
+ NL_LOG(LOG_DEBUG3, "parse %p remaining_len %d", nla_head, len);
+ int orig_len = len;
+ NLA_FOREACH(nla, nla_head, len) {
+ NL_LOG(LOG_DEBUG3, ">> parsing %p attr_type %d len %d (rem %d)", nla, nla->nla_type, nla->nla_len, len);
+ if (nla->nla_len < sizeof(struct nlattr)) {
+ NLMSG_REPORT_ERR_MSG(npt, "Invalid attr %p type %d len: %d",
+ nla, nla->nla_type, nla->nla_len);
+ uint32_t off = (char *)nla - (char *)npt->hdr;
+ nlmsg_report_err_offset(npt, off);
+ return (EINVAL);
+ }
+
+ /* Look the attribute up by its type with the flag bits masked off */
+ int nla_type = nla->nla_type & NLA_TYPE_MASK;
+ const struct nlattr_parser *s = search_states(ps, pslen, nla_type);
+ if (s != NULL) {
+ void *ptr = (void *)((char *)target + s->off);
+ error = s->cb(nla, npt, s->arg, ptr);
+ if (error != 0) {
+ uint32_t off = (char *)nla - (char *)npt->hdr;
+ nlmsg_report_err_offset(npt, off);
+ NL_LOG(LOG_DEBUG3, "parse failed att offset %u", off);
+ return (error);
+ }
+ } else {
+ /* Ignore non-specified attributes */
+ NL_LOG(LOG_DEBUG3, "ignoring attr %d", nla->nla_type);
+ }
+ }
+ /* Debug only: show what is left after the last attribute that was walked */
+ if (len >= sizeof(struct nlattr)) {
+ nla = (struct nlattr *)((char *)nla_head + (orig_len - len));
+ NL_LOG(LOG_DEBUG3, " >>> end %p attr_type %d len %d", nla,
+ nla->nla_type, nla->nla_len);
+ }
+ NL_LOG(LOG_DEBUG3, "end parse: %p remaining_len %d", nla, len);
+
+ return (0);
+}
+
+/*
+ * Parses the attributes of message @hdr. The attributes are expected
+ * after the netlink header and a further header of @hdrlen bytes
+ * (aligned with NETLINK_ALIGN); the rest of the message is handed to
+ * nl_parse_attrs_raw(), whose result is returned.
+ */
+int
+nl_parse_attrs(struct nlmsghdr *hdr, int hdrlen, struct nlattr_parser *ps, int pslen,
+ struct nl_pstate *npt, void *target)
+{
+ int off = NLMSG_HDRLEN + NETLINK_ALIGN(hdrlen);
+ int len = hdr->nlmsg_len - off;
+ struct nlattr *nla_head = (struct nlattr *)((char *)hdr + off);
+
+ return (nl_parse_attrs_raw(nla_head, len, ps, pslen, npt, target));
+}
+
+/*
+ * Attribute callback for flags: an attribute without payload sets the
+ * uint8_t at @target to 1. Returns EINVAL if the attribute carries data.
+ */
+int
+nlattr_get_flag(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+	if (__predict_true(NLA_DATA_LEN(nla) == 0)) {
+		*((uint8_t *)target) = 1;
+		return (0);
+	}
+
+	NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not a flag",
+	    nla->nla_type, NLA_DATA_LEN(nla));
+	return (EINVAL);
+}
+
+/*
+ * Builds an AF_INET sockaddr, allocated from the parser state, out of
+ * the IPv4 address found at @rta_data.
+ * Returns NULL and sets *perror to ENOBUFS if the allocation fails.
+ */
+static struct sockaddr *
+parse_rta_ip4(void *rta_data, struct nl_pstate *npt, int *perror)
+{
+	struct sockaddr_in *dst;
+
+	dst = (struct sockaddr_in *)npt_alloc_sockaddr(npt, sizeof(struct sockaddr_in));
+	if (__predict_false(dst == NULL)) {
+		*perror = ENOBUFS;
+		return (NULL);
+	}
+
+	dst->sin_family = AF_INET;
+	dst->sin_len = sizeof(struct sockaddr_in);
+	memcpy(&dst->sin_addr, rta_data, sizeof(struct in_addr));
+
+	return ((struct sockaddr *)dst);
+}
+
+/*
+ * Builds an AF_INET6 sockaddr, allocated from the parser state, out of
+ * the IPv6 address found at @rta_data.
+ * Returns NULL and sets *perror to ENOBUFS if the allocation fails.
+ */
+static struct sockaddr *
+parse_rta_ip6(void *rta_data, struct nl_pstate *npt, int *perror)
+{
+	struct sockaddr_in6 *dst;
+
+	dst = (struct sockaddr_in6 *)npt_alloc_sockaddr(npt, sizeof(struct sockaddr_in6));
+	if (__predict_false(dst == NULL)) {
+		*perror = ENOBUFS;
+		return (NULL);
+	}
+
+	dst->sin6_family = AF_INET6;
+	dst->sin6_len = sizeof(struct sockaddr_in6);
+	memcpy(&dst->sin6_addr, rta_data, sizeof(struct in6_addr));
+
+	return ((struct sockaddr *)dst);
+}
+
+/*
+ * Converts the payload of @rta into a sockaddr, choosing IPv4 or IPv6
+ * by the payload length. Any other length is reported as an error and
+ * yields NULL with *perror set to ENOTSUP.
+ */
+static struct sockaddr *
+parse_rta_ip(struct rtattr *rta, struct nl_pstate *npt, int *perror)
+{
+	void *rta_data = NL_RTA_DATA(rta);
+	int rta_len = NL_RTA_DATA_LEN(rta);
+
+	if (rta_len == sizeof(struct in_addr))
+		return (parse_rta_ip4(rta_data, npt, perror));
+	if (rta_len == sizeof(struct in6_addr))
+		return (parse_rta_ip6(rta_data, npt, perror));
+
+	NLMSG_REPORT_ERR_MSG(npt, "unknown IP len: %d for rta type %d",
+	    rta_len, rta->rta_type);
+	*perror = ENOTSUP;
+	return (NULL);
+}
+
+/*
+ * Attribute callback: stores the sockaddr built by parse_rta_ip() (NULL
+ * on failure) in the struct sockaddr pointer at @target and returns the
+ * error code it reported, 0 on success.
+ */
+int
+nlattr_get_ip(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+	struct sockaddr **psa = (struct sockaddr **)target;
+	int error = 0;
+
+	*psa = parse_rta_ip((struct rtattr *)nla, npt, &error);
+	return (error);
+}
+
+/*
+ * Converts an attribute carrying a struct rtvia into a sockaddr of the
+ * family named in it.
+ *
+ * Returns NULL and sets *perror to EINVAL if the payload is too short
+ * for the header or for the address, or to ENOTSUP for a family other
+ * than AF_INET and AF_INET6.
+ */
+static struct sockaddr *
+parse_rta_via(struct rtattr *rta, struct nl_pstate *npt, int *perror)
+{
+	struct rtvia *via = NL_RTA_DATA(rta);
+	int data_len = NL_RTA_DATA_LEN(rta);
+
+	/* The branch hint has to wrap the whole comparison */
+	if (__predict_false(data_len < sizeof(struct rtvia))) {
+		NLMSG_REPORT_ERR_MSG(npt, "undersized RTA_VIA(%d) attr: len %d",
+		    rta->rta_type, data_len);
+		*perror = EINVAL;
+		return (NULL);
+	}
+	/* What remains is the length of the address itself */
+	data_len -= offsetof(struct rtvia, rtvia_addr);
+
+	switch (via->rtvia_family) {
+	case AF_INET:
+		if (__predict_false(data_len < sizeof(struct in_addr))) {
+			*perror = EINVAL;
+			return (NULL);
+		}
+		return (parse_rta_ip4(via->rtvia_addr, npt, perror));
+	case AF_INET6:
+		if (__predict_false(data_len < sizeof(struct in6_addr))) {
+			*perror = EINVAL;
+			return (NULL);
+		}
+		return (parse_rta_ip6(via->rtvia_addr, npt, perror));
+	default:
+		*perror = ENOTSUP;
+		return (NULL);
+	}
+}
+
+/*
+ * Attribute callback: stores the sockaddr built by parse_rta_via() (NULL
+ * on failure) in the struct sockaddr pointer at @target and returns the
+ * error code it reported, 0 on success.
+ */
+int
+nlattr_get_ipvia(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+	struct sockaddr **psa = (struct sockaddr **)target;
+	int error = 0;
+
+	*psa = parse_rta_via((struct rtattr *)nla, npt, &error);
+	return (error);
+}
+
+
+/*
+ * Attribute callback: copies a 16-bit payload into the uint16_t at
+ * @target. Returns EINVAL if the payload is not exactly 2 bytes long.
+ */
+int
+nlattr_get_uint16(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+	if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint16_t))) {
+		/* Name the type that is actually checked here */
+		NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint16",
+		    nla->nla_type, NLA_DATA_LEN(nla));
+		return (EINVAL);
+	}
+	*((uint16_t *)target) = *((const uint16_t *)NL_RTA_DATA_CONST(nla));
+	return (0);
+}
+
+/*
+ * Attribute callback: copies a 32-bit payload into the uint32_t at
+ * @target. Returns EINVAL if the payload is not exactly 4 bytes long.
+ */
+int
+nlattr_get_uint32(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+	if (__predict_true(NLA_DATA_LEN(nla) == sizeof(uint32_t))) {
+		*((uint32_t *)target) = *((const uint32_t *)NL_RTA_DATA_CONST(nla));
+		return (0);
+	}
+
+	NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint32",
+	    nla->nla_type, NLA_DATA_LEN(nla));
+	return (EINVAL);
+}
+
+/*
+ * Attribute callback: copies a 64-bit payload into the 8 bytes at
+ * @target. Returns EINVAL if the payload is not exactly 8 bytes long.
+ */
+int
+nlattr_get_uint64(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+ if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint64_t))) {
+ NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint64",
+ nla->nla_type, NLA_DATA_LEN(nla));
+ return (EINVAL);
+ }
+ /* NOTE(review): memcpy presumably avoids a misaligned 64-bit load - confirm */
+ memcpy(target, NL_RTA_DATA_CONST(nla), sizeof(uint64_t));
+ return (0);
+}
+
+/*
+ * Resolves a 32-bit interface index attribute to a struct ifnet pointer
+ * stored at @target.
+ *
+ * With @zero_ok set, index 0 stores NULL and succeeds without a lookup.
+ * Returns EINVAL if the payload is not 4 bytes long and ENOENT if no
+ * interface has the given index. The lookup has to run inside the
+ * network epoch (asserted below).
+ */
+static int
+nlattr_get_ifp_internal(struct nlattr *nla, struct nl_pstate *npt,
+ void *target, bool zero_ok)
+{
+ if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint32_t))) {
+ NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint32",
+ nla->nla_type, NLA_DATA_LEN(nla));
+ return (EINVAL);
+ }
+ uint32_t ifindex = *((const uint32_t *)NLA_DATA_CONST(nla));
+
+ if (ifindex == 0 && zero_ok) {
+ *((struct ifnet **)target) = NULL;
+ return (0);
+ }
+
+ NET_EPOCH_ASSERT();
+
+ struct ifnet *ifp = ifnet_byindex(ifindex);
+ if (__predict_false(ifp == NULL)) {
+ NLMSG_REPORT_ERR_MSG(npt, "nla type %d: ifindex %u invalid",
+ nla->nla_type, ifindex);
+ return (ENOENT);
+ }
+ *((struct ifnet **)target) = ifp;
+ NL_LOG(LOG_DEBUG3, "nla type %d: ifindex %u -> %s", nla->nla_type,
+ ifindex, if_name(ifp));
+
+ return (0);
+}
+
+/*
+ * Attribute callback: resolves an interface index to a struct ifnet
+ * pointer; index 0 gets no special treatment and goes through the lookup.
+ */
+int
+nlattr_get_ifp(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+ return (nlattr_get_ifp_internal(nla, npt, target, false));
+}
+
+/*
+ * Attribute callback: resolves an interface index to a struct ifnet
+ * pointer; index 0 is accepted and stored as NULL.
+ */
+int
+nlattr_get_ifpz(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+ return (nlattr_get_ifp_internal(nla, npt, target, true));
+}
+
+/*
+ * Attribute callback: stores a pointer to the string payload (no copy is
+ * made) in the char pointer at @target. Returns EINVAL if the payload
+ * has no terminating NUL byte within its length.
+ */
+int
+nlattr_get_string(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+	char *str = (char *)NLA_DATA(nla);
+	int maxlen = NLA_DATA_LEN(nla);
+
+	if (__predict_false(strnlen(str, maxlen) >= maxlen)) {
+		NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not NULL-terminated",
+		    nla->nla_type, maxlen);
+		return (EINVAL);
+	}
+
+	*((char **)target) = str;
+	return (0);
+}
+
+/*
+ * Attribute callback: copies the payload into memory taken from the
+ * parser state, appends a terminating NUL byte and stores the copy in
+ * the char pointer at @target. Returns ENOMEM if the allocation fails.
+ */
+int
+nlattr_get_stringn(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+	int maxlen = NLA_DATA_LEN(nla);
+	char *copy;
+
+	copy = npt_alloc(npt, maxlen + 1);
+	if (copy == NULL)
+		return (ENOMEM);
+
+	memcpy(copy, NLA_DATA(nla), maxlen);
+	copy[maxlen] = '\0';
+
+	*((char **)target) = copy;
+	return (0);
+}
+/*
+ * Attribute callback: stores a pointer to the attribute itself in the
+ * struct nlattr pointer at @target; nothing is copied or validated.
+ */
+int
+nlattr_get_nla(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+ NL_LOG(LOG_DEBUG3, "STORING %p len %d", nla, nla->nla_len);
+ *((struct nlattr **)target) = nla;
+ return (0);
+}
+
+/*
+ * Attribute callback for nested data: parses the payload with the
+ * header parser passed as @arg and returns the result of that parse.
+ */
+int
+nlattr_get_nested(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+	const struct nlhdr_parser *parser = (const struct nlhdr_parser *)arg;
+
+	/* Assumes target points to the beginning of the structure */
+	return (nl_parse_header(NLA_DATA(nla), NLA_DATA_LEN(nla), parser, npt, target));
+}
+
+/*
+ * Field callback: reads an int interface index from @src and stores the
+ * matching struct ifnet pointer at @target. Returns ENOENT if no
+ * interface has that index. Has to run inside the network epoch
+ * (asserted below).
+ */
+int
+nlf_get_ifp(void *src, struct nl_pstate *npt, void *target)
+{
+ int ifindex = *((const int *)src);
+
+ NET_EPOCH_ASSERT();
+
+ struct ifnet *ifp = ifnet_byindex(ifindex);
+ if (ifp == NULL) {
+ NL_LOG(LOG_DEBUG, "ifindex %u invalid", ifindex);
+ return (ENOENT);
+ }
+ *((struct ifnet **)target) = ifp;
+
+ return (0);
+}
+
+/*
+ * Field callback: like nlf_get_ifp(), except that a failed lookup is not
+ * an error for index 0; whatever ifnet_byindex() returned is stored at
+ * @target in that case.
+ */
+int
+nlf_get_ifpz(void *src, struct nl_pstate *npt, void *target)
+{
+ int ifindex = *((const int *)src);
+
+ NET_EPOCH_ASSERT();
+
+ struct ifnet *ifp = ifnet_byindex(ifindex);
+ if (ifindex != 0 && ifp == NULL) {
+ NL_LOG(LOG_DEBUG, "ifindex %u invalid", ifindex);
+ return (ENOENT);
+ }
+ *((struct ifnet **)target) = ifp;
+
+ return (0);
+}
+
+/* Field callback: copies one byte from @src to @target. Always returns 0. */
+int
+nlf_get_u8(void *src, struct nl_pstate *npt, void *target)
+{
+	*((uint8_t *)target) = *((const uint8_t *)src);
+	return (0);
+}
+
+/*
+ * Field callback: reads one byte from @src and stores it, widened, in
+ * the uint32_t at @target. Always returns 0.
+ */
+int
+nlf_get_u8_u32(void *src, struct nl_pstate *npt, void *target)
+{
+	uint8_t val = *((const uint8_t *)src);
+
+	*((uint32_t *)target) = val;
+	return (0);
+}
+
+/* Field callback: copies a uint16_t from @src to @target. Always returns 0. */
+int
+nlf_get_u16(void *src, struct nl_pstate *npt, void *target)
+{
+ *((uint16_t *)target) = *((const uint16_t *)src);
+ return (0);
+}
+
+/* Field callback: copies a uint32_t from @src to @target. Always returns 0. */
+int
+nlf_get_u32(void *src, struct nl_pstate *npt, void *target)
+{
+ *((uint32_t *)target) = *((const uint32_t *)src);
+ return (0);
+}
+
diff --git a/sys/netlink/netlink_message_parser.h b/sys/netlink/netlink_message_parser.h
new file mode 100644
index 000000000000..06a6788b7de5
--- /dev/null
+++ b/sys/netlink/netlink_message_parser.h
@@ -0,0 +1,270 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NETLINK_NETLINK_MESSAGE_PARSER_H_
+#define _NETLINK_NETLINK_MESSAGE_PARSER_H_
+
+/*
+ * It is not meant to be included directly
+ */
+
+/* Parsing state */
+/*
+ * State of a simple linear allocator: lb_alloc() hands out consecutive
+ * chunks starting at base + offset, lb_clear() zeroes the memory and
+ * rewinds offset to 0.
+ */
+struct linear_buffer {
+ char *base; /* Base allocated memory pointer */
+ uint32_t offset; /* Currently used offset */
+ uint32_t size; /* Total buffer size */
+};
+
+/*
+ * Carves @len bytes, rounded up to a multiple of sizeof(uint64_t), out of
+ * the linear buffer @lb.
+ * Returns a pointer to the chunk, or NULL if @len is negative or the
+ * remaining space is insufficient.
+ */
+static inline void *
+lb_alloc(struct linear_buffer *lb, int len)
+{
+	/*
+	 * A negative length would be converted to a huge unsigned value,
+	 * letting "offset + len" wrap around the bounds check and move the
+	 * offset backwards.  Reject it up front.
+	 */
+	if (len < 0)
+		return (NULL);
+
+	uint32_t need = roundup2((uint32_t)len, sizeof(uint64_t));
+	/* Compare against the remaining space so the check cannot overflow */
+	if (need > lb->size - lb->offset)
+		return (NULL);
+	void *data = (void *)(lb->base + lb->offset);
+	lb->offset += need;
+	return (data);
+}
+
+/*
+ * Resets the linear buffer: rewinds the used offset and zeroes all
+ * lb->size bytes of the backing memory.
+ */
+static inline void
+lb_clear(struct linear_buffer *lb)
+{
+	lb->offset = 0;
+	memset(lb->base, 0, lb->size);
+}
+
+#define NL_MAX_ERROR_BUF 128
+#define SCRATCH_BUFFER_SIZE (1024 + NL_MAX_ERROR_BUF)
+/*
+ * Parsing state handed to every field and attribute callback
+ * (parse_field_f / parse_attr_f) while a message is processed.
+ */
+struct nl_pstate {
+ struct linear_buffer lb; /* Per-message scratch buffer */
+ struct nlpcb *nlp; /* Originator socket */
+ struct nl_writer *nw; /* Message writer to use */
+ struct nlmsghdr *hdr; /* Current parsed message header */
+ uint32_t err_off; /* error offset from hdr start */
+ int error; /* last operation error */
+ char *err_msg; /* Description of last error */
+ bool strict; /* Strict parsing required */
+};
+
+/*
+ * Allocates @len bytes from the scratch buffer embedded in @npt.
+ * Returns whatever lb_alloc() returns (NULL when out of space).
+ */
+static inline void *
+npt_alloc(struct nl_pstate *npt, int len)
+{
+	struct linear_buffer *scratch = &npt->lb;
+
+	return (lb_alloc(scratch, len));
+}
+#define npt_alloc_sockaddr(_npt, _len) ((struct sockaddr *)(npt_alloc(_npt, _len)))
+
+typedef int parse_field_f(void *hdr, struct nl_pstate *npt,
+ void *target);
+/*
+ * Describes how one fixed header field is extracted: nl_parse_header()
+ * calls cb with (header + off_in) as the source and (target + off_out)
+ * as the destination.
+ */
+struct nlfield_parser {
+ uint16_t off_in; /* field offset in the input header */
+ uint16_t off_out; /* field offset in the target structure */
+ parse_field_f *cb; /* extraction callback */
+};
+static const struct nlfield_parser nlf_p_empty[] = {};
+
+int nlf_get_ifp(void *src, struct nl_pstate *npt, void *target);
+int nlf_get_ifpz(void *src, struct nl_pstate *npt, void *target);
+int nlf_get_u8(void *src, struct nl_pstate *npt, void *target);
+int nlf_get_u16(void *src, struct nl_pstate *npt, void *target);
+int nlf_get_u32(void *src, struct nl_pstate *npt, void *target);
+int nlf_get_u8_u32(void *src, struct nl_pstate *npt, void *target);
+
+
+struct nlattr_parser;
+typedef int parse_attr_f(struct nlattr *attr, struct nl_pstate *npt,
+ const void *arg, void *target);
+/*
+ * Describes how a single attribute type is parsed.
+ * nl_verify_parsers() asserts that arrays of these are ordered by
+ * strictly increasing type.
+ */
+struct nlattr_parser {
+ uint16_t type; /* Attribute type */
+ uint16_t off; /* field offset in the target structure */
+ parse_attr_f *cb; /* parser function to call */
+ const void *arg; /* NOTE(review): presumably passed to cb as its arg parameter - confirm */
+};
+
+typedef bool strict_parser_f(void *hdr, struct nl_pstate *npt);
+
+/*
+ * Describes one parseable layout: a fixed-size header, processed by the
+ * fp field callbacks, followed by attributes, processed by the np
+ * attribute parsers.  Consumed by nl_parse_header().
+ */
+struct nlhdr_parser {
+ int nl_hdr_off; /* aligned netlink header size */
+ int out_hdr_off; /* target header size */
+ int fp_size; /* number of entries in fp */
+ int np_size; /* number of entries in np */
+ const struct nlfield_parser *fp; /* array of header field parsers */
+ const struct nlattr_parser *np; /* array of attribute parsers */
+ strict_parser_f *sp; /* Optional header check, called only when npt->strict is set */
+};
+
+/*
+ * Declares a static parser @_name for messages whose fixed header has
+ * type @_t, using the field parser array @_fp and the attribute parser
+ * array @_np.  Both must be real arrays, as their lengths are taken
+ * with NL_ARRAY_LEN().
+ */
+#define NL_DECLARE_PARSER(_name, _t, _fp, _np) \
+static const struct nlhdr_parser _name = { \
+ .nl_hdr_off = sizeof(_t), \
+ .fp = &((_fp)[0]), \
+ .np = &((_np)[0]), \
+ .fp_size = NL_ARRAY_LEN(_fp), \
+ .np_size = NL_ARRAY_LEN(_np), \
+}
+
+/*
+ * Same as NL_DECLARE_PARSER, but additionally installs @_sp as the
+ * header check that nl_parse_header() runs when strict parsing is
+ * requested.
+ */
+#define NL_DECLARE_STRICT_PARSER(_name, _t, _sp, _fp, _np)\
+static const struct nlhdr_parser _name = { \
+ .nl_hdr_off = sizeof(_t), \
+ .fp = &((_fp)[0]), \
+ .np = &((_np)[0]), \
+ .fp_size = NL_ARRAY_LEN(_fp), \
+ .np_size = NL_ARRAY_LEN(_np), \
+ .sp = _sp, \
+}
+
+/*
+ * Same as NL_DECLARE_PARSER, but also records sizeof(@_o) as the size
+ * of the target header (out_hdr_off).
+ * NOTE(review): presumably used for parsers that fill arrays of items -
+ * confirm against the users of out_hdr_off.
+ */
+#define NL_DECLARE_ARR_PARSER(_name, _t, _o, _fp, _np) \
+static const struct nlhdr_parser _name = { \
+ .nl_hdr_off = sizeof(_t), \
+ .out_hdr_off = sizeof(_o), \
+ .fp = &((_fp)[0]), \
+ .np = &((_np)[0]), \
+ .fp_size = NL_ARRAY_LEN(_fp), \
+ .np_size = NL_ARRAY_LEN(_np), \
+}
+
+/*
+ * Declares a static attribute-only parser @_name: no fixed header and no
+ * field parsers (those members stay zero-initialized), just the
+ * attribute parser array @_np.
+ */
+#define NL_DECLARE_ATTR_PARSER(_name, _np) \
+static const struct nlhdr_parser _name = { \
+ .np = &((_np)[0]), \
+ .np_size = NL_ARRAY_LEN(_np), \
+}
+
+/*
+ * Item counters for an array target.
+ * NOTE(review): presumably placed at the start of structures filled by
+ * array parsers (see NL_DECLARE_ARR_PARSER) - confirm against the users.
+ */
+struct nlarr_hdr {
+ int num_items;
+ int max_items;
+};
+
+int nl_parse_attrs_raw(struct nlattr *nla_head, int len, const struct nlattr_parser *ps,
+ int pslen, struct nl_pstate *npt, void *target);
+int nl_parse_attrs(struct nlmsghdr *hdr, int hdrlen, struct nlattr_parser *ps,
+ int pslen, struct nl_pstate *npt, void *target);
+
+int nlattr_get_flag(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_ip(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_uint16(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_uint32(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_uint64(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_ifp(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_ifpz(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_ipvia(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_string(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_stringn(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_nla(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_nested(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+
+bool nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...);
+
+/*
+ * Passes the formatted message to nlmsg_report_err_msg() and also logs it
+ * through NLP_LOG() at LOG_DEBUG level for the socket in (_npt)->nlp.
+ * Note that the format arguments are evaluated twice.
+ * NOTE(review): expands to a bare { } block rather than do { } while (0),
+ * so "if (x) NLMSG_REPORT_ERR_MSG(...); else ..." does not compile.
+ */
+#define NLMSG_REPORT_ERR_MSG(_npt, _fmt, ...) { \
+ nlmsg_report_err_msg(_npt, _fmt, ## __VA_ARGS__); \
+ NLP_LOG(LOG_DEBUG, (_npt)->nlp, _fmt, ## __VA_ARGS__); \
+}
+
+bool nlmsg_report_err_offset(struct nl_pstate *npt, uint32_t off);
+
+/*
+ * Parses a fixed-size header of @len bytes at @hdr, followed by its
+ * attributes, into @target according to @parser.
+ * Returns 0 on success, EINVAL if the data is shorter than the header or
+ * the strict-mode check rejects it, or the first non-zero error returned
+ * by a field callback or by nl_parse_attrs_raw().
+ *
+ * Have it inline so compiler can optimize field accesses into
+ * the list of direct function calls without iteration.
+ */
+static inline int
+nl_parse_header(void *hdr, int len, const struct nlhdr_parser *parser,
+    struct nl_pstate *npt, void *target)
+{
+	const int hdr_len = parser->nl_hdr_off;
+
+	if (__predict_false(len < hdr_len)) {
+		nlmsg_report_err_msg(npt, "header too short: expected %d, got %d",
+		    hdr_len, len);
+		return (EINVAL);
+	}
+
+	/* The optional header check runs only when strict parsing is on */
+	if (npt->strict && parser->sp != NULL) {
+		if (!parser->sp(hdr, npt))
+			return (EINVAL);
+	}
+
+	/* Extract fields first; the first failing callback stops parsing */
+	const struct nlfield_parser *field = parser->fp;
+	for (int i = 0; i < parser->fp_size; i++, field++) {
+		int error = field->cb((char *)hdr + field->off_in, npt,
+		    (char *)target + field->off_out);
+		if (error != 0)
+			return (error);
+	}
+
+	/* Attributes start right after the fixed header */
+	struct nlattr *nla_head = (struct nlattr *)((char *)hdr + hdr_len);
+
+	return (nl_parse_attrs_raw(nla_head, len - hdr_len, parser->np,
+	    parser->np_size, npt, target));
+}
+
+/*
+ * Parses the attributes contained in the payload of @nla using only the
+ * attribute parsers of @parser; header fields are not processed.
+ * Returns the nl_parse_attrs_raw() result.
+ */
+static inline int
+nl_parse_nested(struct nlattr *nla, const struct nlhdr_parser *parser,
+    struct nl_pstate *npt, void *target)
+{
+	struct nlattr *first_attr;
+
+	first_attr = (struct nlattr *)NLA_DATA(nla);
+	return (nl_parse_attrs_raw(first_attr, NLA_DATA_LEN(nla), parser->np,
+	    parser->np_size, npt, target));
+}
+
+/*
+ * Checks that attributes are sorted by attribute type.
+ *
+ * For each of the @count parsers in @parser, asserts that the attribute
+ * types are strictly increasing and greater than zero.  The check is
+ * assertion-only, so it is compiled in only for INVARIANTS kernels; this
+ * also avoids a set-but-unused variable in other builds.
+ */
+static inline void
+nl_verify_parsers(const struct nlhdr_parser **parser, int count)
+{
+#ifdef INVARIANTS
+	for (int i = 0; i < count; i++) {
+		const struct nlhdr_parser *p = parser[i];
+		int attr_type = 0;
+		for (int j = 0; j < p->np_size; j++) {
+			MPASS(p->np[j].type > attr_type);
+			attr_type = p->np[j].type;
+		}
+	}
+#endif
+}
+#define NL_VERIFY_PARSERS(_p) nl_verify_parsers((_p), NL_ARRAY_LEN(_p))
+
+/*
+ * Parses the payload of the netlink message @hdr (everything after the
+ * struct nlmsghdr) with @parser into @target.
+ * Returns the nl_parse_header() result.
+ */
+static inline int
+nl_parse_nlmsg(struct nlmsghdr *hdr, const struct nlhdr_parser *parser,
+    struct nl_pstate *npt, void *target)
+{
+	void *payload = hdr + 1;
+	int payload_len = hdr->nlmsg_len - sizeof(*hdr);
+
+	return (nl_parse_header(payload, payload_len, parser, npt, target));
+}
+
+#endif
diff --git a/sys/netlink/netlink_message_writer.c b/sys/netlink/netlink_message_writer.c
new file mode 100644
index 000000000000..1856f2859b01
--- /dev/null
+++ b/sys/netlink/netlink_message_writer.c
@@ -0,0 +1,686 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/mbuf.h>
+#include <sys/ck.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/syslog.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_linux.h>
+#include <netlink/netlink_var.h>
+
+#define DEBUG_MOD_NAME nl_writer
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+/*
+ * The goal of this file is to provide convenient message writing KPI on top of
+ * different storage methods (mbufs, uio, temporary memory chunks).
+ *
+ * The main KPI guarantee is that the (last) message always resides in the contiguous
+ * memory buffer, so one is able to update the header after writing the entire message.
+ *
+ * This guarantee comes with a side effect of potentially reallocating underlying
+ * buffer, so one needs to update the desired pointers after something is added
+ * to the header.
+ *
+ * Messaging layer contains hooks performing transparent Linux translation for the messages.
+ *
+ * There are 3 types of supported targets:
+ * * socket (adds mbufs to the socket buffer, used for message replies)
+ * * group (sends mbuf/chain to the specified groups, used for the notifications)
+ * * chain (returns mbuf chain, used in Linux message translation code)
+ *
+ * There are 3 types of storage:
+ * * NS_WRITER_TYPE_MBUF (mbuf-based, most efficient, used when a single message
+ * fits in MCLBYTES)
+ * * NS_WRITER_TYPE_BUF (fallback, malloc-based, used when a single message needs
+ * to be larger than one supported by NS_WRITER_TYPE_MBUF)
+ * * NS_WRITER_TYPE_LBUF (malloc-based, similar to NS_WRITER_TYPE_BUF, used for
+ * Linux sockets, calls translation hook prior to sending messages to the socket).
+ *
+ * Internally, KPI switches between different types of storage when memory requirements
+ * change. It happens transparently to the caller.
+ */
+
+
+typedef bool nlwriter_op_init(struct nl_writer *nw, int size, bool waitok);
+typedef bool nlwriter_op_write(struct nl_writer *nw, void *buf, int buflen, int cnt);
+
+/*
+ * Per-storage-type operations: init sets up the writer storage, and the
+ * write_* members are the flush callbacks that nlmsg_set_callback()
+ * selects according to the writer target.
+ */
+struct nlwriter_ops {
+ nlwriter_op_init *init; /* allocate storage and reset writer state */
+ nlwriter_op_write *write_socket; /* flush for NS_WRITER_TARGET_SOCKET */
+ nlwriter_op_write *write_group; /* flush for NS_WRITER_TARGET_GROUP */
+ nlwriter_op_write *write_chain; /* flush for NS_WRITER_TARGET_CHAIN */
+};
+
+/*
+ * NS_WRITER_TYPE_BUF
+ * Writes message to a temporary memory buffer,
+ * flushing to the socket/group when buffer size limit is reached
+ *
+ * Allocates @size zeroed bytes (sleeping only if @waitok is set) and
+ * resets the writer state to use them.  Returns false if the allocation
+ * fails.
+ */
+static bool
+nlmsg_get_ns_buf(struct nl_writer *nw, int size, bool waitok)
+{
+	const int mflag = waitok ? M_WAITOK : M_NOWAIT;
+	void *storage;
+
+	storage = malloc(size, M_NETLINK, mflag | M_ZERO);
+	nw->_storage = storage;
+	if (__predict_false(storage == NULL))
+		return (false);
+
+	/* Storage */
+	nw->data = storage;
+	nw->alloc_len = size;
+	nw->malloc_flag = mflag;
+	nw->writer_type = NS_WRITER_TYPE_BUF;
+
+	/* Nothing has been written yet */
+	nw->hdr = NULL;
+	nw->offset = 0;
+	nw->num_messages = 0;
+	nw->enomem = false;
+
+	return (true);
+}
+
+/*
+ * Flush callback for BUF storage with a socket target.
+ * Copies @datalen bytes from @buf into a freshly allocated mbuf chain and
+ * sends it to the socket stored in nw->arg_ptr.  Always frees @buf.
+ * Returns false if the mbuf allocation or the copy fails, otherwise the
+ * nl_send_one() result.
+ */
+static bool
+nlmsg_write_socket_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
+{
+	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw);
+	if (__predict_false(datalen == 0)) {
+		free(buf, M_NETLINK);
+		return (true);
+	}
+
+	struct mbuf *m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR);
+	if (__predict_false(m == NULL)) {
+		/* XXX: should we set sorcverr? */
+		free(buf, M_NETLINK);
+		return (false);
+	}
+	bool success = m_append(m, datalen, buf) != 0;
+	free(buf, M_NETLINK);
+
+	/* Do not send a truncated message if the copy failed */
+	if (__predict_false(!success)) {
+		m_freem(m);
+		return (false);
+	}
+
+	int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
+	return (nl_send_one(m, (struct nlpcb *)(nw->arg_ptr), cnt, io_flags));
+}
+
+/*
+ * Flush callback for BUF storage with a group target.
+ * Copies @datalen bytes from @buf into a freshly allocated mbuf chain and
+ * passes it to nl_send_group(); the protocol and group id are unpacked
+ * from nw->arg_uint.  Always frees @buf.
+ * Returns false if the mbuf allocation or the copy fails.
+ */
+static bool
+nlmsg_write_group_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
+{
+	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg_ptr);
+	if (__predict_false(datalen == 0)) {
+		free(buf, M_NETLINK);
+		return (true);
+	}
+
+	struct mbuf *m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR);
+	if (__predict_false(m == NULL)) {
+		free(buf, M_NETLINK);
+		return (false);
+	}
+	bool success = m_append(m, datalen, buf) != 0;
+	free(buf, M_NETLINK);
+
+	if (!success) {
+		/* The chain is not handed to anyone else, release it here */
+		m_freem(m);
+		return (false);
+	}
+
+	nl_send_group(m, cnt, nw->arg_uint >> 16, nw->arg_uint & 0xFFFF);
+	return (true);
+}
+
+/*
+ * Flush callback for BUF storage with a chain target.
+ * Appends @datalen bytes from @buf to the mbuf chain pointed to by
+ * nw->arg_ptr, allocating the chain first if it is still NULL.
+ * Always frees @buf.  Returns false if the allocation or the copy fails.
+ */
+static bool
+nlmsg_write_chain_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
+{
+	struct mbuf **m0 = (struct mbuf **)(nw->arg_ptr);
+	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg_ptr);
+
+	if (__predict_false(datalen == 0)) {
+		free(buf, M_NETLINK);
+		return (true);
+	}
+
+	if (*m0 == NULL) {
+		struct mbuf *m;
+
+		m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR);
+		if (__predict_false(m == NULL)) {
+			free(buf, M_NETLINK);
+			return (false);
+		}
+		*m0 = m;
+	}
+
+	/*
+	 * The data is copied into the chain, so the temporary buffer has to
+	 * be released on success as well as on failure.
+	 */
+	bool success = m_append(*m0, datalen, buf) != 0;
+	free(buf, M_NETLINK);
+
+	return (success);
+}
+
+
+/*
+ * NS_WRITER_TYPE_MBUF
+ * Writes message to the allocated mbuf,
+ * flushing to socket/group when mbuf size limit is reached.
+ * This is the most efficient mechanism as it avoids double-copying.
+ *
+ * Allocates a single mbuf suitable to store up to @size bytes of data.
+ * If size < MHLEN (around 160 bytes), allocates mbuf with pkthdr
+ * If size <= MCLBYTES (2k), allocate a single mbuf cluster
+ * Otherwise, return NULL.
+ */
+static bool
+nlmsg_get_ns_mbuf(struct nl_writer *nw, int size, bool waitok)
+{
+	const int mflag = waitok ? M_WAITOK : M_NOWAIT;
+	struct mbuf *m = m_get2(size, mflag, MT_DATA, M_PKTHDR);
+
+	if (__predict_false(m == NULL))
+		return (false);
+
+	/* Messages are written directly into the mbuf data area */
+	nw->_storage = (void *)m;
+	nw->data = mtod(m, void *);
+	nw->alloc_len = M_TRAILINGSPACE(m);
+	nw->malloc_flag = mflag;
+	nw->writer_type = NS_WRITER_TYPE_MBUF;
+
+	/* Nothing has been written yet */
+	nw->hdr = NULL;
+	nw->offset = 0;
+	nw->num_messages = 0;
+	nw->enomem = false;
+
+	NL_LOG(LOG_DEBUG2, "alloc mbuf %p req_len %d alloc_len %d data_ptr %p",
+	    m, size, nw->alloc_len, nw->data);
+	return (true);
+}
+
+/*
+ * Flush callback for MBUF storage with a socket target.
+ * @buf is the mbuf itself: its lengths are set to @datalen and it is
+ * sent to the socket stored in nw->arg_ptr.  An empty mbuf is freed.
+ */
+static bool
+nlmsg_write_socket_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
+{
+	struct mbuf *m = (struct mbuf *)buf;
+	int io_flags = 0;
+
+	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg_ptr);
+
+	if (__predict_false(datalen == 0)) {
+		m_freem(m);
+		return (true);
+	}
+
+	m->m_len = datalen;
+	m->m_pkthdr.len = datalen;
+	if (nw->ignore_limit)
+		io_flags = NL_IOF_IGNORE_LIMIT;
+
+	return (nl_send_one(m, (struct nlpcb *)(nw->arg_ptr), cnt, io_flags));
+}
+
+/*
+ * Flush callback for MBUF storage with a group target.
+ * @buf is the mbuf itself: its lengths are set to @datalen and it is
+ * passed to nl_send_group() with the protocol and group id unpacked from
+ * nw->arg_uint.  An empty mbuf is freed.  Always returns true.
+ */
+static bool
+nlmsg_write_group_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
+{
+	struct mbuf *m = (struct mbuf *)buf;
+
+	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg_ptr);
+
+	if (__predict_false(datalen == 0)) {
+		m_freem(m);
+	} else {
+		m->m_len = datalen;
+		m->m_pkthdr.len = datalen;
+		nl_send_group(m, cnt, nw->arg_uint >> 16, nw->arg_uint & 0xFFFF);
+	}
+
+	return (true);
+}
+
+/*
+ * Flush callback for MBUF storage with a chain target.
+ * @buf is the mbuf itself: it becomes the head of the chain pointed to by
+ * nw->arg_ptr, or is linked after the last mbuf of an existing chain, in
+ * which case the head packet length grows by @datalen.
+ * An empty mbuf is freed.  Always returns true.
+ */
+static bool
+nlmsg_write_chain_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
+{
+	struct mbuf **m0 = (struct mbuf **)(nw->arg_ptr);
+	struct mbuf *m_new = (struct mbuf *)buf;
+
+	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg_ptr);
+
+	if (__predict_false(datalen == 0)) {
+		m_freem(m_new);
+		return (true);
+	}
+
+	m_new->m_len = datalen;
+	m_new->m_pkthdr.len = datalen;
+
+	if (*m0 == NULL) {
+		/* First flush: the new mbuf starts the chain */
+		*m0 = m_new;
+		return (true);
+	}
+
+	/* Walk to the tail and link the new mbuf there */
+	struct mbuf *tail = *m0;
+	while (tail->m_next != NULL)
+		tail = tail->m_next;
+	tail->m_next = m_new;
+	(*m0)->m_pkthdr.len += datalen;
+
+	return (true);
+}
+
+/*
+ * NS_WRITER_TYPE_LBUF
+ * Writes message to the allocated memory buffer,
+ * flushing to socket/group when mbuf size limit is reached.
+ * Calls linux handler to rewrite messages before sending to the socket.
+ *
+ * A single zeroed allocation holds, in order: the struct linear_buffer
+ * header, the message area (@size rounded up to pointer size), and the
+ * memory managed by the linear buffer (same size plus
+ * SCRATCH_BUFFER_SIZE).  Returns false if the allocation fails.
+ */
+static bool
+nlmsg_get_ns_lbuf(struct nl_writer *nw, int size, bool waitok)
+{
+	const int mflag = waitok ? M_WAITOK : M_NOWAIT;
+	const int data_len = roundup2(size, sizeof(void *));
+	const int add_size = sizeof(struct linear_buffer) + SCRATCH_BUFFER_SIZE;
+	struct linear_buffer *lb;
+
+	lb = malloc(add_size + data_len * 2, M_NETLINK, mflag | M_ZERO);
+	if (__predict_false(lb == NULL))
+		return (false);
+
+	/* Message area follows the buffer header */
+	char *data = (char *)(lb + 1);
+
+	/* Fill buffer header first: its memory follows the message area */
+	lb->base = data + data_len;
+	lb->size = data_len + SCRATCH_BUFFER_SIZE;
+
+	nw->_storage = lb;
+	nw->data = data;
+	nw->alloc_len = data_len;
+	nw->malloc_flag = mflag;
+	nw->writer_type = NS_WRITER_TYPE_LBUF;
+
+	/* Nothing has been written yet */
+	nw->hdr = NULL;
+	nw->offset = 0;
+	nw->num_messages = 0;
+	nw->enomem = false;
+
+	return (true);
+}
+
+
+/*
+ * Flush callback for LBUF storage with a socket target.
+ * Hands the messages that follow the linear_buffer header in @buf to the
+ * Linux translation hook (if one is registered), frees @buf and sends the
+ * resulting mbuf to the socket stored in nw->arg_ptr.
+ * Returns false if no mbuf was produced.
+ */
+static bool
+nlmsg_write_socket_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
+{
+	struct nlpcb *nlp = (struct nlpcb *)(nw->arg_ptr);
+	struct linear_buffer *lb = (struct linear_buffer *)buf;
+	char *data = (char *)(lb + 1);
+	struct mbuf *m;
+
+	if (__predict_false(datalen == 0)) {
+		free(buf, M_NETLINK);
+		return (true);
+	}
+
+	if (linux_netlink_p != NULL)
+		m = linux_netlink_p->msgs_to_linux(nlp->nl_proto, data, datalen, nlp);
+	else
+		m = NULL;
+	free(buf, M_NETLINK);
+
+	if (__predict_false(m == NULL)) {
+		/* XXX: should we set sorcverr? */
+		return (false);
+	}
+
+	int io_flags = 0;
+	if (nw->ignore_limit)
+		io_flags = NL_IOF_IGNORE_LIMIT;
+
+	return (nl_send_one(m, nlp, cnt, io_flags));
+}
+
+/*
+ * Flush callback for LBUF storage with a group target.
+ * Shouldn't be called (maybe except Linux code originating message)
+ *
+ * Copies the @datalen message bytes that follow the linear_buffer header
+ * in @buf into a freshly allocated mbuf chain and passes it to
+ * nl_send_group().  Always frees @buf.
+ * Returns false if the mbuf allocation or the copy fails.
+ */
+static bool
+nlmsg_write_group_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
+{
+	struct linear_buffer *lb = (struct linear_buffer *)buf;
+	char *data = (char *)(lb + 1);
+
+	if (__predict_false(datalen == 0)) {
+		free(buf, M_NETLINK);
+		return (true);
+	}
+
+	struct mbuf *m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR);
+	if (__predict_false(m == NULL)) {
+		free(buf, M_NETLINK);
+		return (false);
+	}
+	bool success = m_append(m, datalen, data) != 0;
+	free(buf, M_NETLINK);
+
+	/* Do not broadcast a truncated message if the copy failed */
+	if (__predict_false(!success)) {
+		m_freem(m);
+		return (false);
+	}
+
+	nl_send_group(m, cnt, nw->arg_uint >> 16, nw->arg_uint & 0xFFFF);
+	return (true);
+}
+
+/*
+ * Operations table indexed by the writer storage type (nw->writer_type).
+ * NOTE(review): the entry order has to match the numeric values of the
+ * NS_WRITER_TYPE_* constants, which are defined elsewhere - confirm.
+ */
+struct nlwriter_ops nlmsg_writers[] = {
+ /* NS_WRITER_TYPE_MBUF */
+ {
+ .init = nlmsg_get_ns_mbuf,
+ .write_socket = nlmsg_write_socket_mbuf,
+ .write_group = nlmsg_write_group_mbuf,
+ .write_chain = nlmsg_write_chain_mbuf,
+ },
+ /* NS_WRITER_TYPE_BUF */
+ {
+ .init = nlmsg_get_ns_buf,
+ .write_socket = nlmsg_write_socket_buf,
+ .write_group = nlmsg_write_group_buf,
+ .write_chain = nlmsg_write_chain_buf,
+ },
+ /* NS_WRITER_TYPE_LBUF */
+ {
+ .init = nlmsg_get_ns_lbuf,
+ .write_socket = nlmsg_write_socket_lbuf,
+ .write_group = nlmsg_write_group_lbuf,
+ /* no .write_chain: it stays NULL for this storage type */
+ },
+};
+
+/*
+ * Selects the flush callback (nw->cb) that matches the writer's storage
+ * type and target.  Panics on an unknown target.
+ */
+static void
+nlmsg_set_callback(struct nl_writer *nw)
+{
+	struct nlwriter_ops *ops = &nlmsg_writers[nw->writer_type];
+
+	if (nw->writer_target == NS_WRITER_TARGET_SOCKET)
+		nw->cb = ops->write_socket;
+	else if (nw->writer_target == NS_WRITER_TARGET_GROUP)
+		nw->cb = ops->write_group;
+	else if (nw->writer_target == NS_WRITER_TARGET_CHAIN)
+		nw->cb = ops->write_chain;
+	else
+		panic("not implemented");
+}
+
+/*
+ * Initializes @nw with storage of the given @type by calling the init
+ * operation of the matching nlmsg_writers[] entry.
+ * Returns the result of that init operation.
+ */
+static bool
+nlmsg_get_buf_type(struct nl_writer *nw, int size, int type, bool waitok)
+{
+	struct nlwriter_ops *ops;
+
+	MPASS(type + 1 <= sizeof(nlmsg_writers) / sizeof(nlmsg_writers[0]));
+	NL_LOG(LOG_DEBUG3, "Setting up nw %p size %d type %d", nw, size, type);
+
+	ops = &nlmsg_writers[type];
+	return (ops->init(nw, size, waitok));
+}
+
+/*
+ * Picks the storage type for a writer and initializes it:
+ * Linux sockets always get LBUF; otherwise MBUF is used when @size fits
+ * in MCLBYTES and BUF when it does not.
+ */
+static bool
+nlmsg_get_buf(struct nl_writer *nw, int size, bool waitok, bool is_linux)
+{
+	int type = NS_WRITER_TYPE_LBUF;
+
+	if (!is_linux) {
+		type = __predict_true(size <= MCLBYTES) ?
+		    NS_WRITER_TYPE_MBUF : NS_WRITER_TYPE_BUF;
+	}
+
+	return (nlmsg_get_buf_type(nw, size, type, waitok));
+}
+
+/*
+ * Sets up @nw to write replies of at least @size bytes to the socket
+ * @nlp, using a non-sleeping allocation.
+ * Returns false if the storage cannot be allocated.
+ */
+bool
+nlmsg_get_unicast_writer(struct nl_writer *nw, int size, struct nlpcb *nlp)
+{
+	bool ok = nlmsg_get_buf(nw, size, false, nlp->nl_linux);
+
+	if (ok) {
+		nw->writer_target = NS_WRITER_TARGET_SOCKET;
+		nw->arg_ptr = (void *)nlp;
+		nlmsg_set_callback(nw);
+	}
+
+	return (ok);
+}
+
+/*
+ * Sets up @nw to write notifications of at least @size bytes for the
+ * given @protocol / @group_id pair, using a non-sleeping allocation.
+ * Both values are packed into nw->arg_uint (protocol in the bits above
+ * the low 16).  Returns false if the storage cannot be allocated.
+ */
+bool
+nlmsg_get_group_writer(struct nl_writer *nw, int size, int protocol, int group_id)
+{
+	bool ok = nlmsg_get_buf(nw, size, false, false);
+
+	if (ok) {
+		nw->writer_target = NS_WRITER_TARGET_GROUP;
+		nw->arg_uint = ((uint64_t)protocol << 16) | (uint64_t)group_id;
+		nlmsg_set_callback(nw);
+	}
+
+	return (ok);
+}
+
+/*
+ * Sets up @nw to collect messages of at least @size bytes into the mbuf
+ * chain returned through @pm, using a non-sleeping allocation.
+ * On success *@pm is reset to NULL; on failure it is left untouched and
+ * false is returned.
+ */
+bool
+nlmsg_get_chain_writer(struct nl_writer *nw, int size, struct mbuf **pm)
+{
+	bool ok = nlmsg_get_buf(nw, size, false, false);
+
+	if (ok) {
+		*pm = NULL;
+		nw->writer_target = NS_WRITER_TARGET_CHAIN;
+		nw->arg_ptr = (void *)pm;
+		nlmsg_set_callback(nw);
+		NL_LOG(LOG_DEBUG3, "setup cb %p (need %p)", nw->cb, &nlmsg_write_chain_mbuf);
+	}
+
+	return (ok);
+}
+
+/*
+ * Marks the writer so that the socket flush callbacks pass
+ * NL_IOF_IGNORE_LIMIT to nl_send_one().
+ */
+void
+nlmsg_ignore_limit(struct nl_writer *nw)
+{
+ nw->ignore_limit = true;
+}
+
+/*
+ * Hands all completed messages to the writer's flush callback.
+ * A message that was started but not finished is dropped.  The storage
+ * pointer is cleared after the callback runs.
+ * Returns the callback result.
+ */
+bool
+nlmsg_flush(struct nl_writer *nw)
+{
+	bool result;
+
+	if (__predict_false(nw->hdr != NULL)) {
+		/*
+		 * Last message has not been completed, skip it:
+		 * send completed messages only.
+		 */
+		nw->offset = (int)((char *)nw->hdr - nw->data);
+		nw->hdr = NULL;
+	}
+
+	NL_LOG(LOG_DEBUG2, "OUT");
+	result = nw->cb(nw, nw->_storage, nw->offset, nw->num_messages);
+	nw->_storage = NULL;
+
+	if (!result)
+		NL_LOG(LOG_DEBUG, "nw %p offset %d: flush with %p() failed", nw, nw->offset, nw->cb);
+
+	return (result);
+}
+
+/*
+ * Flushes previous data and allocates new underlying storage
+ * sufficient for holding at least @required_len bytes.
+ * Return true on success.
+ *
+ * The unfinished message (if any) is copied to the start of the new
+ * storage, so nw->data and nw->hdr change on success.
+ * If the allocation fails, nw->enomem is set and every later call
+ * returns false immediately.
+ */
+bool
+nlmsg_refill_buffer(struct nl_writer *nw, int required_len)
+{
+ struct nl_writer ns_new = {};
+ int completed_len, new_len;
+
+ if (nw->enomem)
+ return (false);
+
+ NL_LOG(LOG_DEBUG3, "no space at offset %d/%d (want %d), trying to reclaim",
+ nw->offset, nw->alloc_len, required_len);
+
+ /* Calculate the new buffer size and allocate it */
+ /* completed_len: bytes occupied by fully written messages */
+ completed_len = (nw->hdr != NULL) ? (char *)nw->hdr - nw->data : nw->offset;
+ if (completed_len > 0 && required_len < MCLBYTES) {
+ /* We already ran out of space, use the largest effective size */
+ new_len = max(nw->alloc_len, MCLBYTES);
+ } else {
+ /* Grow to at least MCLBYTES, then keep doubling until it fits */
+ if (nw->alloc_len < MCLBYTES)
+ new_len = MCLBYTES;
+ else
+ new_len = nw->alloc_len * 2;
+ while (new_len < required_len)
+ new_len *= 2;
+ }
+ bool waitok = (nw->malloc_flag == M_WAITOK);
+ bool is_linux = (nw->writer_type == NS_WRITER_TYPE_LBUF);
+ if (!nlmsg_get_buf(&ns_new, new_len, waitok, is_linux)) {
+ nw->enomem = true;
+ NL_LOG(LOG_DEBUG, "getting new buf failed, setting ENOMEM");
+ return (false);
+ }
+ if (nw->ignore_limit)
+ nlmsg_ignore_limit(&ns_new);
+
+ /* Update callback data */
+ ns_new.writer_target = nw->writer_target;
+ nlmsg_set_callback(&ns_new);
+ /*
+ * NOTE(review): only arg_uint is copied; presumably arg_ptr shares
+ * storage with it (union) - confirm against struct nl_writer.
+ */
+ ns_new.arg_uint = nw->arg_uint;
+
+ /* Copy last (unfinished) header to the new storage */
+ int last_len = nw->offset - completed_len;
+ if (last_len > 0) {
+ memcpy(ns_new.data, nw->hdr, last_len);
+ ns_new.hdr = (struct nlmsghdr *)ns_new.data;
+ ns_new.offset = last_len;
+ }
+
+ NL_LOG(LOG_DEBUG2, "completed: %d bytes, copied: %d bytes", completed_len, last_len);
+
+ /* Flush completed headers & switch to the new nw */
+ /* The flush result is not checked here */
+ nlmsg_flush(nw);
+ memcpy(nw, &ns_new, sizeof(struct nl_writer));
+ NL_LOG(LOG_DEBUG2, "switched buffer: used %d/%d bytes", nw->offset, nw->alloc_len);
+
+ return (true);
+}
+
+/*
+ * Starts a new message in the writer: makes sure there is room for the
+ * header plus @len payload bytes (refilling the storage if needed),
+ * fills in the netlink header and makes it the current message.
+ * The previous message must have been finished or aborted.
+ * Returns false if the storage cannot be refilled.
+ */
+bool
+nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type,
+    uint16_t flags, uint32_t len)
+{
+	MPASS(nw->hdr == NULL);
+
+	const int required_len = NETLINK_ALIGN(len + sizeof(struct nlmsghdr));
+
+	if (__predict_false(nw->offset + required_len > nw->alloc_len) &&
+	    !nlmsg_refill_buffer(nw, required_len))
+		return (false);
+
+	struct nlmsghdr *hdr = (struct nlmsghdr *)(&nw->data[nw->offset]);
+
+	hdr->nlmsg_pid = portid;
+	hdr->nlmsg_seq = seq;
+	hdr->nlmsg_type = type;
+	hdr->nlmsg_flags = flags;
+	hdr->nlmsg_len = len;
+
+	/* Only the header is accounted for here */
+	nw->offset += sizeof(struct nlmsghdr);
+	nw->hdr = hdr;
+
+	return (true);
+}
+
+/*
+ * Finishes the current message: stores its final length in the header
+ * and counts it as completed.
+ * If the writer has hit an allocation failure, the message is aborted
+ * instead and false is returned.
+ */
+bool
+nlmsg_end(struct nl_writer *nw)
+{
+	MPASS(nw->hdr != NULL);
+
+	if (!nw->enomem) {
+		char *msg_end = nw->data + nw->offset;
+
+		nw->hdr->nlmsg_len = (uint32_t)(msg_end - (char *)nw->hdr);
+		nw->hdr = NULL;
+		nw->num_messages++;
+		return (true);
+	}
+
+	NL_LOG(LOG_DEBUG, "ENOMEM when dumping message");
+	nlmsg_abort(nw);
+	return (false);
+}
+
+/*
+ * Discards the message currently being written, if any, by rewinding
+ * the write offset to the start of its header.
+ */
+void
+nlmsg_abort(struct nl_writer *nw)
+{
+	if (nw->hdr == NULL)
+		return;
+
+	nw->offset = (uint32_t)((char *)nw->hdr - nw->data);
+	nw->hdr = NULL;
+}
+
+/*
+ * Writes an NLMSG_ERROR reply for the message @hdr carrying @error
+ * using the writer from @npt.
+ * The original message is echoed in full only when @error is non-zero
+ * and the socket has not set NLF_CAP_ACK; otherwise just its header is
+ * included.  When the socket has NLF_EXT_ACK set, the error text and
+ * error offset recorded in @npt are appended as attributes.
+ * On failure the partially written reply is aborted.
+ */
+void
+nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *hdr,
+ struct nl_pstate *npt)
+{
+ struct nlmsgerr *errmsg;
+ int payload_len;
+ uint32_t flags = nlp->nl_flags;
+ struct nl_writer *nw = npt->nw;
+ bool cap_ack;
+
+ payload_len = sizeof(struct nlmsgerr);
+
+ /*
+ * The only case when we send the full message in the
+ * reply is when there is an error and NETLINK_CAP_ACK
+ * is not set.
+ */
+ cap_ack = (error == 0) || (flags & NLF_CAP_ACK);
+ if (!cap_ack)
+ payload_len += hdr->nlmsg_len - sizeof(struct nlmsghdr);
+ payload_len = NETLINK_ALIGN(payload_len);
+
+ uint16_t nl_flags = cap_ack ? NLM_F_CAPPED : 0;
+ /* Announce TLVs only if there is something to report and the socket wants it */
+ if ((npt->err_msg || npt->err_off) && nlp->nl_flags & NLF_EXT_ACK)
+ nl_flags |= NLM_F_ACK_TLVS;
+
+ /*
+ * TODO: handle cookies
+ */
+
+ NL_LOG(LOG_DEBUG3, "acknowledging message type %d seq %d",
+ hdr->nlmsg_type, hdr->nlmsg_seq);
+
+ if (!nlmsg_add(nw, nlp->nl_port, hdr->nlmsg_seq, NLMSG_ERROR, nl_flags, payload_len))
+ goto enomem;
+
+ errmsg = nlmsg_reserve_data(nw, payload_len, struct nlmsgerr);
+ errmsg->error = error;
+ /* In case of error copy the whole message, else just the header */
+ memcpy(&errmsg->msg, hdr, cap_ack ? sizeof(*hdr) : hdr->nlmsg_len);
+
+ if (npt->err_msg != NULL && nlp->nl_flags & NLF_EXT_ACK)
+ nlattr_add_string(nw, NLMSGERR_ATTR_MSG, npt->err_msg);
+ if (npt->err_off != 0 && nlp->nl_flags & NLF_EXT_ACK)
+ nlattr_add_u32(nw, NLMSGERR_ATTR_OFFS, npt->err_off);
+
+ if (nlmsg_end(nw))
+ return;
+ /* Reached via goto or when nlmsg_end() fails: log and drop the reply */
+enomem:
+ NLP_LOG(LOG_DEBUG, nlp, "error allocating ack data for message %d seq %u",
+ hdr->nlmsg_type, hdr->nlmsg_seq);
+ nlmsg_abort(nw);
+}
+
+/*
+ * Terminates a dump by writing an NLMSG_DONE message that carries
+ * @error as its payload, addressed with the pid and sequence number
+ * taken from the request header @hdr.
+ * Returns false if the message cannot be started or completed.
+ */
+bool
+nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr)
+{
+	if (!nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, NLMSG_DONE, 0, sizeof(int))) {
+		NL_LOG(LOG_DEBUG, "Error finalizing table dump");
+		return (false);
+	}
+	/* Save operation result */
+	int *perror = nlmsg_reserve_object(nw, int);
+	NL_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", error,
+	    nw->offset, perror);
+	*perror = error;
+
+	/*
+	 * nlmsg_end() aborts the message when the writer has hit an
+	 * allocation failure; report that instead of claiming success.
+	 */
+	return (nlmsg_end(nw));
+}
diff --git a/sys/netlink/netlink_message_writer.h b/sys/netlink/netlink_message_writer.h
new file mode 100644
index 000000000000..95f6dd8e6da0
--- /dev/null
+++ b/sys/netlink/netlink_message_writer.h
@@ -0,0 +1,250 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NETLINK_NETLINK_MESSAGE_WRITER_H_
+#define _NETLINK_NETLINK_MESSAGE_WRITER_H_
+
+/*
+ * It is not meant to be included directly
+ */
+
+struct mbuf;
+struct nl_writer;
+typedef bool nl_writer_cb(struct nl_writer *nw, void *buf, int buflen, int cnt);
+
+/*
+ * Bookkeeping for composing netlink messages into one contiguous buffer.
+ * The inline helpers in this header write at data[offset] and call
+ * nlmsg_refill_buffer() when a request would go past alloc_len.
+ */
+struct nl_writer {
+ int alloc_len; /* allocated buffer length */
+ int offset; /* offset from the start of the buffer */
+ struct nlmsghdr *hdr; /* Pointer to the currently-filled msg */
+ char *data; /* pointer to the contiguous storage */
+ void *_storage; /* Underlying storage pointer */
+ nl_writer_cb *cb; /* Callback to flush data */
+ union {
+ void *arg_ptr; /* Callback argument as pointer */
+ uint64_t arg_uint; /* Callback argument as int */
+ };
+ int num_messages; /* Number of messages in the buffer */
+ int malloc_flag; /* M_WAITOK or M_NOWAIT */
+ uint8_t writer_type; /* NS_WRITER_TYPE_* */
+ uint8_t writer_target; /* NS_WRITER_TARGET_* */
+ bool ignore_limit; /* If true, ignores RCVBUF limit */
+ bool enomem; /* True if ENOMEM occurred */
+};
+/* Values stored in nl_writer.writer_target */
+#define NS_WRITER_TARGET_SOCKET 0
+#define NS_WRITER_TARGET_GROUP 1
+#define NS_WRITER_TARGET_CHAIN 2
+
+/* Values stored in nl_writer.writer_type */
+#define NS_WRITER_TYPE_MBUF 0
+#define NS_WRITER_TYPE_BUF 1
+#define NS_WRITER_TYPE_LBUF 2
+#define NS_WRITER_TYPE_MBUFC 3
+
+
+#define NLMSG_SMALL 128
+#define NLMSG_LARGE 2048
+
+/* Message and attribute writing */
+
+struct nlpcb;
+bool nlmsg_get_unicast_writer(struct nl_writer *nw, int expected_size, struct nlpcb *nlp);
+bool nlmsg_get_group_writer(struct nl_writer *nw, int expected_size, int proto, int group_id);
+bool nlmsg_get_chain_writer(struct nl_writer *nw, int expected_size, struct mbuf **pm);
+bool nlmsg_flush(struct nl_writer *nw);
+void nlmsg_ignore_limit(struct nl_writer *nw);
+
+bool nlmsg_refill_buffer(struct nl_writer *nw, int required_size);
+bool nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type,
+ uint16_t flags, uint32_t len);
+bool nlmsg_end(struct nl_writer *nw);
+void nlmsg_abort(struct nl_writer *nw);
+
+bool nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr);
+
+/*
+ * Starts a reply to the request @hdr inside @nw.
+ *
+ * The new message copies the request's port id, sequence number, type and
+ * flags; @payload_len is handed to nlmsg_add() as the length argument.
+ * Returns whatever nlmsg_add() returns.
+ */
+static inline bool
+nlmsg_reply(struct nl_writer *nw, const struct nlmsghdr *hdr, int payload_len)
+{
+	const uint32_t req_port = hdr->nlmsg_pid;
+	const uint32_t req_seq = hdr->nlmsg_seq;
+	const uint16_t req_type = hdr->nlmsg_type;
+	const uint16_t req_flags = hdr->nlmsg_flags;
+
+	return (nlmsg_add(nw, req_port, req_seq, req_type, req_flags,
+	    payload_len));
+}
+
+#define nlmsg_data(_hdr) ((void *)((_hdr) + 1))
+
+/*
+ * KPI similar to mtodo():
+ * current (uncompleted) header is guaranteed to be contiguous,
+ * but can be reallocated, thus pointers may need to be readjusted.
+ */
+/*
+ * Returns the current write position expressed relative to the header of
+ * the message being filled (nw->hdr) instead of the start of the buffer.
+ */
+static inline int
+nlattr_save_offset(const struct nl_writer *nw)
+{
+	const char *msg_start = (const char *)nw->hdr;
+
+	/* (msg_start - nw->data) is where the header sits in the buffer */
+	return (nw->offset - (msg_start - nw->data));
+}
+
+/*
+ * Converts a header-relative offset, as produced by nlattr_save_offset(),
+ * back into a pointer based on the current location of nw->hdr.
+ */
+static inline void *
+_nlattr_restore_offset(const struct nl_writer *nw, int off)
+{
+	char *msg_start = (char *)nw->hdr;
+
+	return ((void *)&msg_start[off]);
+}
+#define nlattr_restore_offset(_ns, _off, _t) ((_t *)_nlattr_restore_offset(_ns, _off))
+
+/*
+ * Stores the length of the attribute that starts at header-relative
+ * offset @off: the distance from @off to the current write position.
+ */
+static inline void
+nlattr_set_len(const struct nl_writer *nw, int off)
+{
+	struct nlattr *attr;
+
+	attr = nlattr_restore_offset(nw, off, struct nlattr);
+	attr->nla_len = nlattr_save_offset(nw) - off;
+}
+
+/*
+ * Reserves @sz bytes (rounded up for alignment) at the current write
+ * position and advances the position past them.
+ *
+ * If the aligned size does not fit into the allocated buffer,
+ * nlmsg_refill_buffer() is asked for more room first.  Returns a pointer
+ * to the start of the reserved area, or NULL when the refill fails.
+ */
+static inline void *
+nlmsg_reserve_data_raw(struct nl_writer *nw, size_t sz)
+{
+	void *reserved;
+
+	if (__predict_false(nw->offset + NETLINK_ALIGN(sz) > nw->alloc_len) &&
+	    !nlmsg_refill_buffer(nw, NETLINK_ALIGN(sz)))
+		return (NULL);
+
+	/* Take the address only now: it is derived from nw->data and nw->offset */
+	reserved = nw->data + nw->offset;
+	nw->offset += NLMSG_ALIGN(sz);
+
+	return (reserved);
+}
+#define nlmsg_reserve_object(_ns, _t) ((_t *)nlmsg_reserve_data_raw(_ns, NLA_ALIGN(sizeof(_t))))
+#define nlmsg_reserve_data(_ns, _sz, _t) ((_t *)nlmsg_reserve_data_raw(_ns, _sz))
+
+/*
+ * Opens a nested attribute of type @nla_type by reserving just its header.
+ *
+ * Returns the header-relative offset of that header, or 0 if the space
+ * could not be reserved.  nla_len is not written here; nlattr_set_len()
+ * can fill it in from the returned offset.
+ */
+static inline int
+nlattr_add_nested(struct nl_writer *nw, uint16_t nla_type)
+{
+	const int start_off = nlattr_save_offset(nw);
+	struct nlattr *nested;
+
+	nested = nlmsg_reserve_data(nw, sizeof(struct nlattr), struct nlattr);
+	if (__predict_false(nested == NULL))
+		return (0);
+
+	nested->nla_type = nla_type;
+	return (start_off);
+}
+
+/*
+ * Reserves an attribute of type @nla_type with @sz bytes of payload and
+ * fills in its header (nla_len covers the header plus @sz).
+ *
+ * Returns a pointer to the payload that follows the header, or NULL if
+ * the space could not be reserved.
+ */
+static inline void *
+_nlmsg_reserve_attr(struct nl_writer *nw, uint16_t nla_type, uint16_t sz)
+{
+	struct nlattr *attr;
+
+	/* From here on sz is the full attribute size, header included */
+	sz += sizeof(struct nlattr);
+
+	attr = nlmsg_reserve_data(nw, sz, struct nlattr);
+	if (__predict_false(attr == NULL))
+		return (NULL);
+
+	attr->nla_type = nla_type;
+	attr->nla_len = sz;
+
+	return ((void *)&attr[1]);
+}
+#define nlmsg_reserve_attr(_ns, _at, _t) ((_t *)_nlmsg_reserve_attr(_ns, _at, NLA_ALIGN(sizeof(_t))))
+
+/*
+ * Appends one attribute to @nw: a header of type @attr_type followed by
+ * @attr_len bytes copied from @data.
+ *
+ * The space consumed is header plus payload rounded up with NLA_ALIGN();
+ * padding bytes are zeroed, while nla_len records the unpadded size.  The
+ * buffer is grown through nlmsg_refill_buffer() when it is too small.
+ *
+ * Returns true on success.  Returns false, leaving the writer untouched,
+ * if @attr_len is negative, if header plus payload cannot be represented
+ * in the 16-bit nla_len field, or if the buffer cannot be extended.
+ * @data is not read when @attr_len is 0.
+ */
+static inline bool
+nlattr_add(struct nl_writer *nw, int attr_type, int attr_len, const void *data)
+{
+	if (__predict_false(attr_len < 0))
+		return (false);
+
+	size_t nla_len = attr_len + sizeof(struct nlattr);
+
+	/* Refuse sizes that would be silently truncated when stored in nla_len */
+	if (__predict_false(nla_len != (uint16_t)nla_len))
+		return (false);
+
+	int required_len = NLA_ALIGN(nla_len);
+
+	if (__predict_false(nw->offset + required_len > nw->alloc_len)) {
+		if (!nlmsg_refill_buffer(nw, required_len))
+			return (false);
+	}
+
+	struct nlattr *nla = (struct nlattr *)(&nw->data[nw->offset]);
+
+	nla->nla_len = nla_len;
+	nla->nla_type = attr_type;
+	if (attr_len > 0) {
+		if ((attr_len % 4) != 0) {
+			/* clear padding bytes */
+			bzero((char *)nla + required_len - 4, 4);
+		}
+		memcpy((nla + 1), data, attr_len);
+	}
+	nw->offset += required_len;
+	return (true);
+}
+
+/*
+ * Typed convenience wrappers around nlattr_add().  Each one appends a
+ * single attribute of type @attrtype whose payload is the given value and
+ * returns the result of nlattr_add().
+ */
+
+/* Appends an unsigned 8-bit attribute */
+static inline bool
+nlattr_add_u8(struct nl_writer *nw, int attrtype, uint8_t value)
+{
+	return (nlattr_add(nw, attrtype, sizeof(value), &value));
+}
+
+/* Appends an unsigned 16-bit attribute */
+static inline bool
+nlattr_add_u16(struct nl_writer *nw, int attrtype, uint16_t value)
+{
+	return (nlattr_add(nw, attrtype, sizeof(value), &value));
+}
+
+/* Appends an unsigned 32-bit attribute */
+static inline bool
+nlattr_add_u32(struct nl_writer *nw, int attrtype, uint32_t value)
+{
+	return (nlattr_add(nw, attrtype, sizeof(value), &value));
+}
+
+/* Appends an unsigned 64-bit attribute */
+static inline bool
+nlattr_add_u64(struct nl_writer *nw, int attrtype, uint64_t value)
+{
+	return (nlattr_add(nw, attrtype, sizeof(value), &value));
+}
+
+/* Appends a signed 8-bit attribute */
+static inline bool
+nlattr_add_s8(struct nl_writer *nw, int attrtype, int8_t value)
+{
+	return (nlattr_add(nw, attrtype, sizeof(value), &value));
+}
+
+/* Appends a signed 16-bit attribute */
+static inline bool
+nlattr_add_s16(struct nl_writer *nw, int attrtype, int16_t value)
+{
+	return (nlattr_add(nw, attrtype, sizeof(value), &value));
+}
+
+/* Appends a signed 32-bit attribute */
+static inline bool
+nlattr_add_s32(struct nl_writer *nw, int attrtype, int32_t value)
+{
+	return (nlattr_add(nw, attrtype, sizeof(value), &value));
+}
+
+/* Appends a signed 64-bit attribute */
+static inline bool
+nlattr_add_s64(struct nl_writer *nw, int attrtype, int64_t value)
+{
+	return (nlattr_add(nw, attrtype, sizeof(value), &value));
+}
+
+/* Appends an attribute that has a header but no payload */
+static inline bool
+nlattr_add_flag(struct nl_writer *nw, int attrtype)
+{
+	return (nlattr_add(nw, attrtype, 0, NULL));
+}
+
+/* Appends @str as an attribute, terminating NUL byte included */
+static inline bool
+nlattr_add_string(struct nl_writer *nw, int attrtype, const char *str)
+{
+	const size_t len_with_nul = strlen(str) + 1;
+
+	return (nlattr_add(nw, attrtype, len_with_nul, str));
+}
+
+
+
+
+#endif
diff --git a/sys/netlink/netlink_module.c b/sys/netlink/netlink_module.c
new file mode 100644
index 000000000000..a1bcb8a29511
--- /dev/null
+++ b/sys/netlink/netlink_module.c
@@ -0,0 +1,228 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/ck.h>
+#include <sys/syslog.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+
+#include <machine/atomic.h>
+
+MALLOC_DEFINE(M_NETLINK, "netlink", "Memory used for netlink packets");
+
+#define DEBUG_MOD_NAME nl_mod
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+SYSCTL_NODE(_net, OID_AUTO, netlink, CTLFLAG_RD, 0, "");
+SYSCTL_NODE(_net_netlink, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+
+/*
+ * Protocol handler table.  It is indexed by protocol number, so valid
+ * numbers are 0 .. NL_MAX_HANDLERS - 1; a slot with a NULL cb is unused.
+ */
+#define NL_MAX_HANDLERS 20
+struct nl_proto_handler _nl_handlers[NL_MAX_HANDLERS];
+struct nl_proto_handler *nl_handlers = _nl_handlers;
+
+/* List of all control blocks published by vnet_nl_ctl_init(), linked via ctl_next */
+CK_LIST_HEAD(nl_control_head, nl_control);
+static struct nl_control_head vnets_head = CK_LIST_HEAD_INITIALIZER();
+
+VNET_DEFINE(struct nl_control *, nl_ctl) = NULL;
+
+/*
+ * Taken in this file around updates of nl_handlers, V_nl_ctl and
+ * vnets_head, and while can_unload() walks vnets_head.
+ */
+struct mtx nl_global_mtx;
+MTX_SYSINIT(nl_global_mtx, &nl_global_mtx, "global netlink lock", MTX_DEF);
+
+#define NL_GLOBAL_LOCK() mtx_lock(&nl_global_mtx)
+#define NL_GLOBAL_UNLOCK() mtx_unlock(&nl_global_mtx)
+
+/* Set to 1 by the MOD_UNLOAD handler once can_unload() has agreed */
+int netlink_unloading = 0;
+
+/*
+ * Releases a control block: destroys its rmlock (ctl_lock) and then frees
+ * the structure itself, which is accounted to M_NETLINK.
+ */
+static void
+free_nl_ctl(struct nl_control *ctl)
+{
+ rm_destroy(&ctl->ctl_lock);
+ free(ctl, M_NETLINK);
+}
+
+/*
+ * Returns the netlink control block of the current VNET, creating it on
+ * first use.
+ *
+ * A candidate block is allocated and initialized before the global lock
+ * is taken.  Under the lock it is either published in V_nl_ctl and linked
+ * into vnets_head, or, if another block is already published, discarded
+ * in favour of the existing one.
+ */
+struct nl_control *
+vnet_nl_ctl_init(void)
+{
+	struct nl_control *candidate, *published;
+
+	candidate = malloc(sizeof(struct nl_control), M_NETLINK, M_WAITOK | M_ZERO);
+	rm_init(&candidate->ctl_lock, "netlink lock");
+	CK_LIST_INIT(&candidate->ctl_port_head);
+	CK_LIST_INIT(&candidate->ctl_pcb_head);
+
+	NL_GLOBAL_LOCK();
+
+	published = atomic_load_ptr(&V_nl_ctl);
+	if (published != NULL) {
+		/* Lost the race: keep the block that is already visible */
+		NL_LOG(LOG_DEBUG, "per-VNET init clash, dropping this instance");
+		free_nl_ctl(candidate);
+	} else {
+		atomic_store_ptr(&V_nl_ctl, candidate);
+		CK_LIST_INSERT_HEAD(&vnets_head, candidate, ctl_next);
+		NL_LOG(LOG_DEBUG2, "VNET %p init done, inserted %p into global list",
+		    curvnet, candidate);
+		published = candidate;
+	}
+
+	NL_GLOBAL_UNLOCK();
+
+	return (published);
+}
+
+/*
+ * VNET teardown hook: unpublishes the control block of the current VNET,
+ * unlinks it from the global list and frees it.  Does nothing beyond
+ * clearing V_nl_ctl when no block was ever created.
+ */
+static void
+vnet_nl_ctl_destroy(const void *unused __unused)
+{
+	struct nl_control *old_ctl;
+
+	/* Assume at the time all of the processes / sockets are dead */
+
+	NL_GLOBAL_LOCK();
+	old_ctl = atomic_load_ptr(&V_nl_ctl);
+	atomic_store_ptr(&V_nl_ctl, NULL);
+	if (old_ctl != NULL) {
+		NL_LOG(LOG_DEBUG2, "Removing %p from global list", old_ctl);
+		CK_LIST_REMOVE(old_ctl, ctl_next);
+	}
+	NL_GLOBAL_UNLOCK();
+
+	if (old_ctl == NULL)
+		return;
+
+	/* The block is freed only after the global lock has been dropped */
+	free_nl_ctl(old_ctl);
+}
+VNET_SYSUNINIT(vnet_nl_ctl_destroy, SI_SUB_PROTO_IF, SI_ORDER_ANY,
+    vnet_nl_ctl_destroy, NULL);
+
+/*
+ * Checks whether netlink protocol @proto can be used.
+ *
+ * Returns EINVAL if @proto is outside the handler table, EPROTONOSUPPORT
+ * if the slot has no handler callback, and 0 otherwise.
+ */
+int
+nl_verify_proto(int proto)
+{
+	if (proto < 0 || proto >= NL_MAX_HANDLERS)
+		return (EINVAL);
+
+	if (nl_handlers[proto].cb == NULL)
+		return (EPROTONOSUPPORT);
+
+	return (0);
+}
+
+/*
+ * Returns the name recorded for netlink protocol @proto.
+ *
+ * The result is NULL when @proto lies outside the handler table or when
+ * its slot carries no name (netlink_unregister_proto() resets the name to
+ * NULL), so callers have to be prepared for a NULL return.
+ */
+const char *
+nl_get_proto_name(int proto)
+{
+	/* Same range check as nl_verify_proto(): never index past the table */
+	if (proto < 0 || proto >= NL_MAX_HANDLERS)
+		return (NULL);
+
+	return (nl_handlers[proto].proto_name);
+}
+
+/*
+ * Installs @handler, labelled @proto_name, as the handler for netlink
+ * protocol @proto.
+ *
+ * Returns false if @proto is outside the handler table and true once the
+ * slot has been filled.  The slot is asserted (KASSERT) to be empty.
+ */
+bool
+netlink_register_proto(int proto, const char *proto_name, nl_handler_f handler)
+{
+	struct nl_proto_handler *slot;
+
+	if (!(proto >= 0 && proto < NL_MAX_HANDLERS))
+		return (false);
+
+	NL_GLOBAL_LOCK();
+	slot = &nl_handlers[proto];
+	KASSERT((slot->cb == NULL), ("netlink handler %d is already set", proto));
+	slot->cb = handler;
+	slot->proto_name = proto_name;
+	NL_GLOBAL_UNLOCK();
+
+	NL_LOG(LOG_DEBUG, "Registered netlink %s(%d) handler", proto_name, proto);
+	return (true);
+}
+
+/*
+ * Removes the handler of netlink protocol @proto by clearing both the
+ * callback and the name of its slot.
+ *
+ * Returns false if @proto is outside the handler table and true once the
+ * slot has been cleared.  The slot is asserted (KASSERT) to be in use.
+ */
+bool
+netlink_unregister_proto(int proto)
+{
+	struct nl_proto_handler *slot;
+
+	if (!(proto >= 0 && proto < NL_MAX_HANDLERS))
+		return (false);
+
+	NL_GLOBAL_LOCK();
+	slot = &nl_handlers[proto];
+	KASSERT((slot->cb != NULL), ("netlink handler %d is not set", proto));
+	slot->cb = NULL;
+	slot->proto_name = NULL;
+	NL_GLOBAL_UNLOCK();
+
+	NL_LOG(LOG_DEBUG, "Unregistered netlink proto %d handler", proto);
+	return (true);
+}
+
+/*
+ * Tells whether the module may be unloaded: true only if no control
+ * block on the global list has an entry in its ctl_pcb_head list.
+ * The scan stops at the first non-empty list.
+ */
+static bool
+can_unload(void)
+{
+	struct nl_control *vnet_ctl;
+	bool unloadable = true;
+
+	NL_GLOBAL_LOCK();
+
+	CK_LIST_FOREACH(vnet_ctl, &vnets_head, ctl_next) {
+		NL_LOG(LOG_DEBUG2, "Iterating VNET head %p", vnet_ctl);
+		if (CK_LIST_EMPTY(&vnet_ctl->ctl_pcb_head))
+			continue;
+
+		NL_LOG(LOG_NOTICE, "non-empty socket list in ctl %p", vnet_ctl);
+		unloadable = false;
+		break;
+	}
+
+	NL_GLOBAL_UNLOCK();
+
+	return (unloadable);
+}
+
+/*
+ * Module event handler.
+ *
+ * MOD_LOAD only logs.  MOD_UNLOAD succeeds and sets netlink_unloading
+ * when can_unload() agrees, otherwise it fails with EBUSY.  Every other
+ * event is answered with EOPNOTSUPP.
+ */
+static int
+netlink_modevent(module_t mod __unused, int what, void *priv __unused)
+{
+	if (what == MOD_LOAD) {
+		NL_LOG(LOG_DEBUG, "Loading");
+		NL_LOG(LOG_NOTICE, "netlink support is in BETA stage");
+		return (0);
+	}
+
+	if (what != MOD_UNLOAD)
+		return (EOPNOTSUPP);
+
+	NL_LOG(LOG_DEBUG, "Unload called");
+	if (!can_unload())
+		return (EBUSY);
+
+	NL_LOG(LOG_WARNING, "unloading");
+	netlink_unloading = 1;
+
+	return (0);
+}
+static moduledata_t netlink_mod = { "netlink", netlink_modevent, NULL };
+
+DECLARE_MODULE(netlink, netlink_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
+MODULE_VERSION(netlink, 1);
diff --git a/sys/netlink/netlink_route.c b/sys/netlink/netlink_route.c
new file mode 100644
index 000000000000..f12bf268e252
--- /dev/null
+++ b/sys/netlink/netlink_route.c
@@ -0,0 +1,135 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/ck.h>
+
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+#include <netlink/netlink_route.h>
+#include <netlink/route/route_var.h>
+
+#define DEBUG_MOD_NAME nl_route_core
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+#define HANDLER_MAX_NUM (NL_RTM_MAX + 10)
+static const struct rtnl_cmd_handler *rtnl_handler[HANDLER_MAX_NUM] = {};
+
+/*
+ * Registers @count command handlers from the @handlers array in the
+ * rtnl_handler dispatch table, keyed by each entry's cmd.
+ *
+ * All entries are validated before any of them is installed, so a false
+ * return (some cmd does not fit the table) leaves the table unchanged.
+ * The table keeps pointers into @handlers rather than copies.
+ */
+bool
+rtnl_register_messages(const struct rtnl_cmd_handler *handlers, int count)
+{
+	int idx;
+
+	/* First pass: validation only */
+	for (idx = 0; idx < count; idx++) {
+		const struct rtnl_cmd_handler *entry = &handlers[idx];
+
+		if (entry->cmd >= HANDLER_MAX_NUM)
+			return (false);
+		MPASS(rtnl_handler[entry->cmd] == NULL);
+	}
+
+	/* Second pass: install */
+	for (idx = 0; idx < count; idx++) {
+		const struct rtnl_cmd_handler *entry = &handlers[idx];
+
+		rtnl_handler[entry->cmd] = entry;
+	}
+
+	return (true);
+}
+
+/*
+ * Dispatches one message to the command handler registered for its
+ * nlmsg_type.
+ *
+ * Returns ENOTSUP (after recording an error message in @npt) when no
+ * handler exists for the type, EPERM when the handler demands a privilege
+ * the socket lacks, and otherwise the value returned by the handler.  The
+ * handler runs inside the network epoch unless it is flagged with
+ * RTNL_F_NOEPOCH.
+ */
+static int
+rtnl_handle_message(struct nlmsghdr *hdr, struct nl_pstate *npt)
+{
+	const struct rtnl_cmd_handler *handler = NULL;
+	struct nlpcb *nlp = npt->nlp;
+	struct epoch_tracker et;
+	int error;
+
+	/* Out-of-range and unregistered types are reported identically */
+	if (hdr->nlmsg_type < HANDLER_MAX_NUM)
+		handler = rtnl_handler[hdr->nlmsg_type];
+	if (__predict_false(handler == NULL)) {
+		NLMSG_REPORT_ERR_MSG(npt, "unknown message type: %d", hdr->nlmsg_type);
+		return (ENOTSUP);
+	}
+
+	NLP_LOG(LOG_DEBUG2, nlp, "received msg %s(%d) len %d", handler->name,
+	    hdr->nlmsg_type, hdr->nlmsg_len);
+
+	if (handler->priv != 0) {
+		if (!nlp_has_priv(nlp, handler->priv)) {
+			NLP_LOG(LOG_DEBUG2, nlp, "priv %d check failed for msg %s", handler->priv, handler->name);
+			return (EPERM);
+		}
+		NLP_LOG(LOG_DEBUG3, nlp, "priv %d check passed for msg %s", handler->priv, handler->name);
+	}
+
+	if (handler->flags & RTNL_F_NOEPOCH) {
+		error = handler->cb(hdr, nlp, npt);
+	} else {
+		NET_EPOCH_ENTER(et);
+		error = handler->cb(hdr, nlp, npt);
+		NET_EPOCH_EXIT(et);
+	}
+
+	NLP_LOG(LOG_DEBUG3, nlp, "message %s -> error %d", handler->name, error);
+
+	return (error);
+}
+
+/* Bridge whose route_f callback is rtnl_handle_route_event; installed by rtnl_load() */
+static struct rtbridge nlbridge = { .route_f = rtnl_handle_route_event };
+/* Value netlink_callback_p held before rtnl_load() replaced it */
+static struct rtbridge *nlbridge_orig_p;
+
+/*
+ * SYSINIT hook of the NETLINK_ROUTE family.
+ *
+ * Saves the current netlink_callback_p and points it at nlbridge, runs
+ * the init routines for neighbors, interfaces, nexthops and routes, and
+ * finally registers rtnl_handle_message() as the NETLINK_ROUTE handler.
+ */
+static void
+rtnl_load(void *u __unused)
+{
+ NL_LOG(LOG_NOTICE, "rtnl loading");
+ nlbridge_orig_p = netlink_callback_p;
+ netlink_callback_p = &nlbridge;
+ rtnl_neighs_init();
+ rtnl_ifaces_init();
+ rtnl_nexthops_init();
+ rtnl_routes_init();
+ /* NOTE(review): the result of netlink_register_proto() is not checked here */
+ netlink_register_proto(NETLINK_ROUTE, "NETLINK_ROUTE", rtnl_handle_message);
+}
+SYSINIT(rtnl_load, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtnl_load, NULL);
+
+/*
+ * SYSUNINIT hook of the NETLINK_ROUTE family.
+ *
+ * Restores the netlink_callback_p value saved by rtnl_load(), tears down
+ * the interface and neighbor parts and then waits for the network epoch
+ * to drain.
+ *
+ * NOTE(review): rtnl_load() also calls rtnl_nexthops_init(),
+ * rtnl_routes_init() and netlink_register_proto(); no counterpart of
+ * those is invoked here -- confirm this is intentional.
+ */
+static void
+rtnl_unload(void *u __unused)
+{
+ netlink_callback_p = nlbridge_orig_p;
+ rtnl_ifaces_destroy();
+ rtnl_neighs_destroy();
+
+ /* Wait till all consumers read nlbridge data */
+ epoch_wait_preempt(net_epoch_preempt);
+}
+SYSUNINIT(rtnl_unload, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtnl_unload, NULL);
diff --git a/sys/netlink/netlink_route.h b/sys/netlink/netlink_route.h
new file mode 100644
index 000000000000..93445f2e1699
--- /dev/null
+++ b/sys/netlink/netlink_route.h
@@ -0,0 +1,43 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _NETLINK_NETLINK_ROUTE_H_
+#define _NETLINK_NETLINK_ROUTE_H_
+
+#include <sys/types.h>
+
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/if_var.h>
+
+#include <netlink/route/common.h>
+#include <netlink/route/ifaddrs.h>
+#include <netlink/route/interface.h>
+#include <netlink/route/neigh.h>
+#include <netlink/route/route.h>
+#include <netlink/route/nexthop.h>
+
+#endif
diff --git a/sys/netlink/netlink_var.h b/sys/netlink/netlink_var.h
new file mode 100644
index 000000000000..40d3870fd795
--- /dev/null
+++ b/sys/netlink/netlink_var.h
@@ -0,0 +1,142 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _NETLINK_NETLINK_VAR_H_
+#define _NETLINK_NETLINK_VAR_H_
+
+#include <sys/ck.h>
+#include <sys/epoch.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <net/vnet.h>
+
+#define NLSNDQ 65536 /* Default socket sendspace */
+#define NLRCVQ 65536 /* Default socket recvspace */
+
+struct ucred;
+
+/*
+ * A tail queue of mbufs plus two counters; struct nlpcb embeds one as
+ * rx_queue and one as tx_queue.
+ */
+struct nl_io_queue {
+ STAILQ_HEAD(, mbuf) head;
+ int length; /* presumably the amount of queued data -- TODO confirm unit */
+ int hiwat; /* presumably the high watermark for length -- TODO confirm */
+};
+
+/*
+ * Netlink protocol control block.  A socket refers to it through so_pcb
+ * (see sotonlpcb() below).
+ */
+struct nlpcb {
+ struct socket *nl_socket;
+ uint64_t nl_groups;
+ uint32_t nl_port; /* passed as the port id of acks sent to this socket */
+ uint32_t nl_flags; /* NLF_* bits */
+ uint32_t nl_process_id;
+ int nl_proto;
+ bool nl_active;
+ bool nl_bound;
+ bool nl_task_pending;
+ bool nl_tx_blocked; /* No new requests accepted */
+ bool nl_linux; /* true if running under compat */
+ struct nl_io_queue rx_queue;
+ struct nl_io_queue tx_queue;
+ struct taskqueue *nl_taskqueue;
+ struct task nl_task;
+ struct ucred *nl_cred; /* Copy of nl_socket->so_cred */
+ uint64_t nl_dropped_bytes;
+ uint64_t nl_dropped_messages;
+ CK_LIST_ENTRY(nlpcb) nl_next; /* presumably linkage on nl_control.ctl_pcb_head -- TODO confirm */
+ CK_LIST_ENTRY(nlpcb) nl_port_next; /* presumably linkage on nl_control.ctl_port_head -- TODO confirm */
+ volatile u_int nl_refcount;
+ struct mtx nl_lock; /* taken via NLP_LOCK() / NLP_UNLOCK() */
+ struct epoch_context nl_epoch_ctx;
+};
+#define sotonlpcb(so) ((struct nlpcb *)(so)->so_pcb)
+
+#define NLP_LOCK_INIT(_nlp) mtx_init(&((_nlp)->nl_lock), "nlp mtx", NULL, MTX_DEF)
+#define NLP_LOCK_DESTROY(_nlp) mtx_destroy(&((_nlp)->nl_lock))
+#define NLP_LOCK(_nlp) mtx_lock(&((_nlp)->nl_lock))
+#define NLP_UNLOCK(_nlp) mtx_unlock(&((_nlp)->nl_lock))
+
+#define ALIGNED_NL_SZ(_data) roundup2((((struct nlmsghdr *)(_data))->nlmsg_len), 16)
+
+/* nl_flags */
+#define NLF_CAP_ACK 0x01 /* Do not send message body with errmsg */
+#define NLF_EXT_ACK 0x02 /* Allow including extended TLVs in ack */
+#define NLF_STRICT 0x04 /* Perform strict header checks */
+
+SYSCTL_DECL(_net_netlink);
+
+/*
+ * I/O state embedded in struct nl_control as ctl_io: a callout, an mbuf
+ * chain tracked by its first and last element, and a length counter.
+ */
+struct nl_io {
+ struct callout callout;
+ struct mbuf *head;
+ struct mbuf *last;
+ int64_t length; /* presumably the amount of data held in the chain -- TODO confirm */
+};
+
+/*
+ * Per-VNET netlink state; the instance of the current VNET is reachable
+ * through V_nl_ctl.
+ */
+struct nl_control {
+ CK_LIST_HEAD(nl_pid_head, nlpcb) ctl_port_head;
+ CK_LIST_HEAD(nlpcb_head, nlpcb) ctl_pcb_head;
+ CK_LIST_ENTRY(nl_control) ctl_next; /* linkage on a list of control blocks */
+ struct nl_io ctl_io;
+ struct rmlock ctl_lock;
+};
+VNET_DECLARE(struct nl_control *, nl_ctl);
+#define V_nl_ctl VNET(nl_ctl)
+
+
+struct sockaddr_nl;
+struct sockaddr;
+struct nlmsghdr;
+
+/* netlink_module.c */
+struct nl_control *vnet_nl_ctl_init(void);
+
+int nl_verify_proto(int proto);
+const char *nl_get_proto_name(int proto);
+
+extern int netlink_unloading;
+
+/*
+ * One slot of the protocol handler table exported as nl_handlers.
+ */
+struct nl_proto_handler {
+ nl_handler_f cb; /* handler callback for the protocol */
+ const char *proto_name; /* name of the protocol */
+};
+extern struct nl_proto_handler *nl_handlers;
+
+/* netlink_domain.c */
+void nl_send_group(struct mbuf *m, int cnt, int proto, int group_id);
+
+/* netlink_io.c */
+#define NL_IOF_UNTRANSLATED 0x01
+#define NL_IOF_IGNORE_LIMIT 0x02
+bool nl_send_one(struct mbuf *m, struct nlpcb *nlp, int cnt, int io_flags);
+void nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *nlmsg,
+ struct nl_pstate *npt);
+void nl_on_transmit(struct nlpcb *nlp);
+void nl_init_io(struct nlpcb *nlp);
+void nl_free_io(struct nlpcb *nlp);
+
+void nl_taskqueue_handler(void *_arg, int pending);
+int nl_receive_async(struct mbuf *m, struct socket *so);
+void nl_process_receive_locked(struct nlpcb *nlp);
+
+#endif
diff --git a/sys/netlink/route/common.h b/sys/netlink/route/common.h
new file mode 100644
index 000000000000..1bfb888b34c0
--- /dev/null
+++ b/sys/netlink/route/common.h
@@ -0,0 +1,213 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Common defines for all parts of the NETLINK_ROUTE family
+ */
+#ifndef _NETLINK_ROUTE_COMMON_H_
+#define _NETLINK_ROUTE_COMMON_H_
+
+/* Defined NETLINK_ROUTE messages */
+enum {
+ NL_RTM_BASE = 16,
+ NL_RTM_NEWLINK = 16, /* creates new interface */
+ NL_RTM_DELLINK = 17, /* deletes matching interface */
+ NL_RTM_GETLINK = 18, /* lists matching interfaces */
+ NL_RTM_SETLINK = 19, /* not supported */
+ NL_RTM_NEWADDR = 20, /* not supported */
+ NL_RTM_DELADDR = 21, /* not supported */
+ NL_RTM_GETADDR = 22, /* lists matching ifaddrs */
+ NL_RTM_NEWROUTE = 24, /* adds or changes a route */
+ NL_RTM_DELROUTE = 25, /* deletes matching route */
+ NL_RTM_GETROUTE = 26, /* lists matching routes */
+ NL_RTM_NEWNEIGH = 28, /* creates new arp/ndp entry */
+ NL_RTM_DELNEIGH = 29, /* deletes matching arp/ndp entry */
+ NL_RTM_GETNEIGH = 30, /* lists matching arp/ndp entry */
+ NL_RTM_NEWRULE = 32, /* not supported */
+ NL_RTM_DELRULE = 33, /* not supported */
+ NL_RTM_GETRULE = 34, /* not supported */
+ NL_RTM_NEWQDISC = 36, /* not supported */
+ NL_RTM_DELQDISC = 37, /* not supported */
+ NL_RTM_GETQDISC = 38, /* not supported */
+ NL_RTM_NEWTCLASS = 40, /* not supported */
+ NL_RTM_DELTCLASS = 41, /* not supported */
+ NL_RTM_GETTCLASS = 42, /* not supported */
+ NL_RTM_NEWTFILTER = 44, /* not supported */
+ NL_RTM_DELTFILTER = 45, /* not supported */
+ NL_RTM_GETTFILTER = 46, /* not supported */
+ NL_RTM_NEWACTION = 48, /* not supported */
+ NL_RTM_DELACTION = 49, /* not supported */
+ NL_RTM_GETACTION = 50, /* not supported */
+ NL_RTM_NEWPREFIX = 52, /* not supported */
+ NL_RTM_GETMULTICAST = 58, /* not supported */
+ NL_RTM_GETANYCAST = 62, /* not supported */
+ NL_RTM_NEWNEIGHTBL = 64, /* not supported */
+ NL_RTM_GETNEIGHTBL = 66, /* not supported */
+ NL_RTM_SETNEIGHTBL = 67, /* not supported */
+ NL_RTM_NEWNDUSEROPT = 68, /* not supported */
+ NL_RTM_NEWADDRLABEL = 72, /* not supported */
+ NL_RTM_DELADDRLABEL = 73, /* not supported */
+ NL_RTM_GETADDRLABEL = 74, /* not supported */
+ NL_RTM_GETDCB = 78, /* not supported */
+ NL_RTM_SETDCB = 79, /* not supported */
+ NL_RTM_NEWNETCONF = 80, /* not supported */
+ NL_RTM_GETNETCONF = 82, /* not supported */
+ NL_RTM_NEWMDB = 84, /* not supported */
+ NL_RTM_DELMDB = 85, /* not supported */
+ NL_RTM_GETMDB = 86, /* not supported */
+ NL_RTM_NEWNSID = 88, /* not supported */
+ NL_RTM_DELNSID = 89, /* not supported */
+ NL_RTM_GETNSID = 90, /* not supported */
+ NL_RTM_NEWSTATS = 92, /* not supported */
+ NL_RTM_GETSTATS = 94, /* not supported */
+ NL_RTM_NEWNEXTHOP = 104, /* creates new user nexthop */
+ NL_RTM_DELNEXTHOP = 105, /* deletes matching nexthop */
+ NL_RTM_GETNEXTHOP = 106, /* lists created user nexthops */
+ __NL_RTM_MAX,
+};
+/* __NL_RTM_MAX rounded up to a multiple of 4, minus 1 (107 for the list above) */
+#define NL_RTM_MAX (((__NL_RTM_MAX + 3) & ~3) - 1)
+
+#ifndef _KERNEL
+/*
+ * RTM_* namespace clashes with BSD rtsock namespace.
+ * Use NL_RTM_ prefix in the kernel and map it to RTM_
+ * for userland.
+ */
+#define RTM_BASE NL_RTM_BASE
+#define RTM_NEWLINK NL_RTM_NEWLINK
+#define RTM_DELLINK NL_RTM_DELLINK
+#define RTM_GETLINK NL_RTM_GETLINK
+#define RTM_SETLINK NL_RTM_SETLINK
+#define RTM_NEWADDR NL_RTM_NEWADDR
+#define RTM_DELADDR NL_RTM_DELADDR
+#define RTM_GETADDR NL_RTM_GETADDR
+#define RTM_NEWROUTE NL_RTM_NEWROUTE
+#define RTM_DELROUTE NL_RTM_DELROUTE
+#define RTM_GETROUTE NL_RTM_GETROUTE
+#define RTM_NEWNEXTHOP NL_RTM_NEWNEXTHOP
+#define RTM_DELNEXTHOP NL_RTM_DELNEXTHOP
+#define RTM_GETNEXTHOP NL_RTM_GETNEXTHOP
+#endif
+
+#ifndef _KERNEL
+/* rtnetlink multicast groups - backwards compatibility for userspace */
+#define RTMGRP_LINK 0x01
+#define RTMGRP_NOTIFY 0x02
+#define RTMGRP_NEIGH 0x04
+#define RTMGRP_TC 0x08
+
+#define RTMGRP_IPV4_IFADDR 0x10
+#define RTMGRP_IPV4_MROUTE 0x20
+#define RTMGRP_IPV4_ROUTE 0x40
+#define RTMGRP_IPV4_RULE 0x80
+
+#define RTMGRP_IPV6_IFADDR 0x100
+#define RTMGRP_IPV6_MROUTE 0x200
+#define RTMGRP_IPV6_ROUTE 0x400
+#define RTMGRP_IPV6_IFINFO 0x800
+
+#define RTMGRP_DECnet_IFADDR 0x1000
+#define RTMGRP_DECnet_ROUTE 0x4000
+
+#define RTMGRP_IPV6_PREFIX 0x20000
+#endif
+
+/*
+ * Defined NETLINK_ROUTE multicast groups.
+ *
+ * The enumerators are numbered sequentially starting at 0 (RTNLGRP_NONE).
+ * Most of them are also defined as a macro expanding to the enumerator
+ * itself, which lets code test for a group with #ifdef; the RTNLGRP_NOP2
+ * and RTNLGRP_NOP4 entries have no such macro.
+ */
+enum rtnetlink_groups {
+ RTNLGRP_NONE,
+#define RTNLGRP_NONE RTNLGRP_NONE
+ RTNLGRP_LINK,
+#define RTNLGRP_LINK RTNLGRP_LINK
+ RTNLGRP_NOTIFY,
+#define RTNLGRP_NOTIFY RTNLGRP_NOTIFY
+ RTNLGRP_NEIGH,
+#define RTNLGRP_NEIGH RTNLGRP_NEIGH
+ RTNLGRP_TC,
+#define RTNLGRP_TC RTNLGRP_TC
+ RTNLGRP_IPV4_IFADDR,
+#define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR
+ RTNLGRP_IPV4_MROUTE,
+#define RTNLGRP_IPV4_MROUTE RTNLGRP_IPV4_MROUTE
+ RTNLGRP_IPV4_ROUTE,
+#define RTNLGRP_IPV4_ROUTE RTNLGRP_IPV4_ROUTE
+ RTNLGRP_IPV4_RULE,
+#define RTNLGRP_IPV4_RULE RTNLGRP_IPV4_RULE
+ RTNLGRP_IPV6_IFADDR,
+#define RTNLGRP_IPV6_IFADDR RTNLGRP_IPV6_IFADDR
+ RTNLGRP_IPV6_MROUTE,
+#define RTNLGRP_IPV6_MROUTE RTNLGRP_IPV6_MROUTE
+ RTNLGRP_IPV6_ROUTE,
+#define RTNLGRP_IPV6_ROUTE RTNLGRP_IPV6_ROUTE
+ RTNLGRP_IPV6_IFINFO,
+#define RTNLGRP_IPV6_IFINFO RTNLGRP_IPV6_IFINFO
+ RTNLGRP_DECnet_IFADDR,
+#define RTNLGRP_DECnet_IFADDR RTNLGRP_DECnet_IFADDR
+ RTNLGRP_NOP2,
+ RTNLGRP_DECnet_ROUTE,
+#define RTNLGRP_DECnet_ROUTE RTNLGRP_DECnet_ROUTE
+ RTNLGRP_DECnet_RULE,
+#define RTNLGRP_DECnet_RULE RTNLGRP_DECnet_RULE
+ RTNLGRP_NOP4,
+ RTNLGRP_IPV6_PREFIX,
+#define RTNLGRP_IPV6_PREFIX RTNLGRP_IPV6_PREFIX
+ RTNLGRP_IPV6_RULE,
+#define RTNLGRP_IPV6_RULE RTNLGRP_IPV6_RULE
+ RTNLGRP_ND_USEROPT,
+#define RTNLGRP_ND_USEROPT RTNLGRP_ND_USEROPT
+ RTNLGRP_PHONET_IFADDR,
+#define RTNLGRP_PHONET_IFADDR RTNLGRP_PHONET_IFADDR
+ RTNLGRP_PHONET_ROUTE,
+#define RTNLGRP_PHONET_ROUTE RTNLGRP_PHONET_ROUTE
+ RTNLGRP_DCB,
+#define RTNLGRP_DCB RTNLGRP_DCB
+ RTNLGRP_IPV4_NETCONF,
+#define RTNLGRP_IPV4_NETCONF RTNLGRP_IPV4_NETCONF
+ RTNLGRP_IPV6_NETCONF,
+#define RTNLGRP_IPV6_NETCONF RTNLGRP_IPV6_NETCONF
+ RTNLGRP_MDB,
+#define RTNLGRP_MDB RTNLGRP_MDB
+ RTNLGRP_MPLS_ROUTE,
+#define RTNLGRP_MPLS_ROUTE RTNLGRP_MPLS_ROUTE
+ RTNLGRP_NSID,
+#define RTNLGRP_NSID RTNLGRP_NSID
+ RTNLGRP_MPLS_NETCONF,
+#define RTNLGRP_MPLS_NETCONF RTNLGRP_MPLS_NETCONF
+ RTNLGRP_IPV4_MROUTE_R,
+#define RTNLGRP_IPV4_MROUTE_R RTNLGRP_IPV4_MROUTE_R
+ RTNLGRP_IPV6_MROUTE_R,
+#define RTNLGRP_IPV6_MROUTE_R RTNLGRP_IPV6_MROUTE_R
+ RTNLGRP_NEXTHOP,
+#define RTNLGRP_NEXTHOP RTNLGRP_NEXTHOP
+ RTNLGRP_BRVLAN,
+#define RTNLGRP_BRVLAN RTNLGRP_BRVLAN
+ __RTNLGRP_MAX
+};
+/* Highest group number defined above */
+#define RTNLGRP_MAX (__RTNLGRP_MAX - 1)
+
+
+#endif
+
diff --git a/sys/netlink/route/iface.c b/sys/netlink/route/iface.c
new file mode 100644
index 000000000000..8db24b5507e4
--- /dev/null
+++ b/sys/netlink/route/iface.c
@@ -0,0 +1,857 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/syslog.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_media.h>
+#include <net/if_var.h>
+#include <net/if_clone.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_route.h>
+#include <netlink/route/route_var.h>
+
+#include <netinet6/scope6_var.h> /* scope deembedding */
+
+#define DEBUG_MOD_NAME nl_iface
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+/*
+ * Per-request state used by the dump handlers in this file: where replies
+ * are written, the header template they are stamped with and the
+ * iteration counters reported in the debug log.
+ */
+struct netlink_walkargs {
+ struct nl_writer *nw; /* writer receiving the reply messages */
+ struct nlmsghdr hdr; /* template header handed to the dump helpers */
+ struct nlpcb *so; /* netlink socket the request arrived on */
+ uint32_t fibnum;
+ int family; /* address family filter; 0 matches every family */
+ int error;
+ int count; /* number of objects iterated over */
+ int dumped; /* number of objects successfully dumped */
+};
+
+/* Tags returned by EVENTHANDLER_REGISTER(), kept for deregistration. */
+static eventhandler_tag ifdetach_event, ifattach_event, ifaddr_event;
+
+/* Registered interface cloners, looked up by name on RTM_NEWLINK. */
+static SLIST_HEAD(, nl_cloner) nl_cloners = SLIST_HEAD_INITIALIZER(nl_cloners);
+
+/* Protects nl_cloners: taken shared for lookups, exclusive for add/remove. */
+static struct sx rtnl_cloner_lock;
+SX_SYSINIT(rtnl_cloner_lock, &rtnl_cloner_lock, "rtnl cloner lock");
+
+/*
+ * RTM_GETLINK request
+ * sendto(3, {{len=32, type=RTM_GETLINK, flags=NLM_F_REQUEST|NLM_F_DUMP, seq=1641940952, pid=0},
+ * {ifi_family=AF_INET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}, 32, 0, NULL, 0) = 32
+ *
+ * Reply:
+ * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_ETHER, ifi_index=if_nametoindex("enp0s31f6"), ifi_flags=IFF_UP|IFF_BROADCAST|IFF_RUNNING|IFF_MULTICAST|IFF_LOWER_UP, ifi_change=0},
+{{nla_len=10, nla_type=IFLA_ADDRESS}, "\xfe\x54\x00\x52\x3e\x90"}
+
+[
+{{nla_len=14, nla_type=IFLA_IFNAME}, "enp0s31f6"},
+{{nla_len=8, nla_type=IFLA_TXQLEN}, 1000},
+{{nla_len=5, nla_type=IFLA_OPERSTATE}, 6},
+{{nla_len=5, nla_type=IFLA_LINKMODE}, 0},
+{{nla_len=8, nla_type=IFLA_MTU}, 1500},
+{{nla_len=8, nla_type=IFLA_MIN_MTU}, 68},
+ {{nla_len=8, nla_type=IFLA_MAX_MTU}, 9000},
+{{nla_len=8, nla_type=IFLA_GROUP}, 0},
+{{nla_len=8, nla_type=IFLA_PROMISCUITY}, 0},
+{{nla_len=8, nla_type=IFLA_NUM_TX_QUEUES}, 1},
+{{nla_len=8, nla_type=IFLA_GSO_MAX_SEGS}, 65535},
+{{nla_len=8, nla_type=IFLA_GSO_MAX_SIZE}, 65536},
+{{nla_len=8, nla_type=IFLA_NUM_RX_QUEUES}, 1},
+{{nla_len=5, nla_type=IFLA_CARRIER}, 1},
+{{nla_len=13, nla_type=IFLA_QDISC}, "fq_codel"},
+{{nla_len=8, nla_type=IFLA_CARRIER_CHANGES}, 2},
+{{nla_len=5, nla_type=IFLA_PROTO_DOWN}, 0},
+{{nla_len=8, nla_type=IFLA_CARRIER_UP_COUNT}, 1},
+{{nla_len=8, nla_type=IFLA_CARRIER_DOWN_COUNT}, 1},
+ */
+
+/* Link state reported through IFLA_OPERSTATE and IFLA_CARRIER. */
+struct if_state {
+ uint8_t ifla_operstate; /* IF_OPER_* value */
+ uint8_t ifla_carrier; /* 1 when carrier is present, 0 otherwise */
+};
+
+/*
+ * Derives operational state and carrier for an Ethernet interface from the
+ * result of the SIOCGIFMEDIA ioctl.
+ * @ifp: interface to query
+ * @pstate: state to update; left untouched when the ioctl fails or the
+ *  active media is not Ethernet
+ */
+static void
+get_operstate_ether(struct ifnet *ifp, struct if_state *pstate)
+{
+	struct ifmediareq media = {};
+	int rc;
+
+	rc = (*ifp->if_ioctl)(ifp, SIOCGIFMEDIA, (void *)&media);
+	if (rc != 0) {
+		NL_LOG(LOG_DEBUG, "error calling SIOCGIFMEDIA on %s: %d",
+		    if_name(ifp), rc);
+		return;
+	}
+
+	/* Only Ethernet media is interpreted here. */
+	if (IFM_TYPE(media.ifm_active) != IFM_ETHER)
+		return;
+
+	if ((media.ifm_status & IFM_ACTIVE) == 0) {
+		pstate->ifla_operstate = IF_OPER_DOWN;
+		return;
+	}
+
+	/* Active link: carrier is present; monitor mode reports dormant. */
+	pstate->ifla_carrier = 1;
+	pstate->ifla_operstate = (ifp->if_flags & IFF_MONITOR) ?
+	    IF_OPER_DORMANT : IF_OPER_UP;
+}
+
+/*
+ * Appends an IFLA_STATS64 attribute filled from the interface counters.
+ * @nw: message writer
+ * @ifp: interface whose counters are reported
+ *
+ * Returns false when the writer cannot reserve space for the attribute.
+ */
+static bool
+get_stats(struct nl_writer *nw, struct ifnet *ifp)
+{
+	struct rtnl_link_stats64 *stats;
+
+	int nla_len = sizeof(struct nlattr) + sizeof(*stats);
+	struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr);
+	if (nla == NULL)
+		return (false);
+	nla->nla_type = IFLA_STATS64;
+	nla->nla_len = nla_len;
+	stats = (struct rtnl_link_stats64 *)(nla + 1);
+
+	/*
+	 * Only a subset of the counters is filled in below.  Clear the whole
+	 * structure first so the remaining fields are reported as zero
+	 * instead of whatever the reserved buffer happened to contain.
+	 */
+	memset(stats, 0, sizeof(*stats));
+
+	stats->rx_packets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS);
+	stats->tx_packets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS);
+	stats->rx_bytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES);
+	stats->tx_bytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES);
+	stats->rx_errors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS);
+	stats->tx_errors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS);
+	stats->rx_dropped = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS);
+	stats->tx_dropped = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS);
+	stats->multicast = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS);
+	stats->rx_nohandler = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO);
+
+	return (true);
+}
+
+/*
+ * Fills @pstate with the operational state and carrier of @ifp.
+ * Starts from "unknown, no carrier"; only Ethernet and loopback
+ * interfaces get a more specific answer.
+ */
+static void
+get_operstate(struct ifnet *ifp, struct if_state *pstate)
+{
+	pstate->ifla_carrier = 0; /* no carrier */
+	pstate->ifla_operstate = IF_OPER_UNKNOWN;
+
+	if (ifp->if_type == IFT_ETHER) {
+		get_operstate_ether(ifp, pstate);
+		return;
+	}
+
+	if (ifp->if_type != IFT_LOOP)
+		return;
+
+	/* Loopback has no media: administrative state decides. */
+	if ((ifp->if_flags & IFF_UP) == 0) {
+		pstate->ifla_operstate = IF_OPER_DOWN;
+		return;
+	}
+	pstate->ifla_operstate = IF_OPER_UP;
+	pstate->ifla_carrier = 1;
+}
+
+/*
+ * Returns the value reported in ifi_flags: the bitwise OR of the
+ * interface flags and the driver flags.
+ */
+static unsigned
+ifp_flags_to_netlink(const struct ifnet *ifp)
+{
+	unsigned flags;
+
+	flags = ifp->if_flags;
+	flags |= ifp->if_drv_flags;
+
+	return (flags);
+}
+
+#define LLADDR_CONST(s) ((const void *)((s)->sdl_data + (s)->sdl_nlen))
+/*
+ * Appends the raw address carried by @sa as attribute @attr.
+ * @nw: message writer
+ * @attr: attribute type to emit
+ * @sa: address to dump; NULL and unsupported families are skipped
+ *
+ * Returns false only when adding the attribute fails.
+ */
+static bool
+dump_sa(struct nl_writer *nw, int attr, const struct sockaddr *sa)
+{
+	struct in6_addr addr6;
+	const void *data;
+	uint32_t len;
+	uint32_t zoneid;
+
+	if (sa == NULL)
+		return (true);
+
+	if (sa->sa_family == AF_INET) {
+		const struct sockaddr_in *sin = (const struct sockaddr_in *)sa;
+
+		data = &sin->sin_addr;
+		len = sizeof(struct in_addr);
+	} else if (sa->sa_family == AF_INET6) {
+		const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sa;
+
+		/* Report the address without the zone; the zone id is discarded. */
+		in6_splitscope(&sin6->sin6_addr, &addr6, &zoneid);
+		data = &addr6;
+		len = sizeof(struct in6_addr);
+	} else if (sa->sa_family == AF_LINK) {
+		const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)sa;
+
+		data = LLADDR_CONST(sdl);
+		len = sdl->sdl_alen;
+	} else {
+		NL_LOG(LOG_DEBUG, "unsupported family: %d, skipping", sa->sa_family);
+		return (true);
+	}
+
+	return (nlattr_add(nw, attr, len, data));
+}
+
+/*
+ * Dumps interface state, properties and metrics.
+ * @nw: message writer
+ * @ifp: target interface
+ * @hdr: template header
+ *
+ * This function is called without epoch and MAY sleep.
+ *
+ * Returns true when the message was completed with nlmsg_end().
+ * On failure of nlmsg_reply() or nlmsg_end() the partially written
+ * message is aborted and false is returned.  Return values of the
+ * individual attribute writers are not checked here.
+ */
+static bool
+dump_iface(struct nl_writer *nw, struct ifnet *ifp, const struct nlmsghdr *hdr)
+{
+ struct ifinfomsg *ifinfo;
+
+ NL_LOG(LOG_DEBUG3, "dumping interface %s data", if_name(ifp));
+
+ if (!nlmsg_reply(nw, hdr, sizeof(struct ifinfomsg)))
+ goto enomem;
+
+ /* Fixed-size family header comes first, attributes follow. */
+ ifinfo = nlmsg_reserve_object(nw, struct ifinfomsg);
+ ifinfo->ifi_family = AF_UNSPEC;
+ ifinfo->__ifi_pad = 0;
+ ifinfo->ifi_type = ifp->if_type;
+ ifinfo->ifi_index = ifp->if_index;
+ ifinfo->ifi_flags = ifp_flags_to_netlink(ifp);
+ ifinfo->ifi_change = 0;
+
+ nlattr_add_string(nw, IFLA_IFNAME, if_name(ifp));
+
+ struct if_state ifs = {};
+ get_operstate(ifp, &ifs);
+
+ nlattr_add_u8(nw, IFLA_OPERSTATE, ifs.ifla_operstate);
+ nlattr_add_u8(nw, IFLA_CARRIER, ifs.ifla_carrier);
+
+/*
+ nlattr_add_u8(nw, IFLA_PROTO_DOWN, val);
+ nlattr_add_u8(nw, IFLA_LINKMODE, val);
+*/
+ /* Link-level address, taken from the interface's own ifaddr. */
+ if ((ifp->if_addr != NULL)) {
+ dump_sa(nw, IFLA_ADDRESS, ifp->if_addr->ifa_addr);
+ }
+
+ if ((ifp->if_broadcastaddr != NULL)) {
+ nlattr_add(nw, IFLA_BROADCAST, ifp->if_addrlen,
+ ifp->if_broadcastaddr);
+ }
+
+ nlattr_add_u32(nw, IFLA_MTU, ifp->if_mtu);
+/*
+ nlattr_add_u32(nw, IFLA_MIN_MTU, 60);
+ nlattr_add_u32(nw, IFLA_MAX_MTU, 9000);
+ nlattr_add_u32(nw, IFLA_GROUP, 0);
+*/
+ get_stats(nw, ifp);
+
+ /* Reported as 0 or 1 depending on IFF_PROMISC. */
+ uint32_t val = (ifp->if_flags & IFF_PROMISC) != 0;
+ nlattr_add_u32(nw, IFLA_PROMISCUITY, val);
+
+ if (nlmsg_end(nw))
+ return (true);
+
+enomem:
+ NL_LOG(LOG_DEBUG, "unable to dump interface %s state (ENOMEM)", if_name(ifp));
+ nlmsg_abort(nw);
+ return (false);
+}
+
+/*
+ * Strict-mode validation of the ifinfomsg header: the padding, type,
+ * flags and change fields must all be zero.
+ * Reports an error message through @npt and returns false otherwise.
+ */
+static bool
+check_ifmsg(void *hdr, struct nl_pstate *npt)
+{
+	struct ifinfomsg *msg = hdr;
+	bool clean;
+
+	clean = (msg->__ifi_pad == 0 && msg->ifi_type == 0 &&
+	    msg->ifi_flags == 0 && msg->ifi_change == 0);
+	if (clean)
+		return (true);
+
+	nlmsg_report_err_msg(npt,
+	    "strict checking: non-zero values in ifinfomsg header");
+	return (false);
+}
+
+/*
+ * Parser tables for RTM_*LINK requests.  _IN() is an offset into the
+ * ifinfomsg header of the request, _OUT() an offset into the
+ * struct nl_parsed_link the parsed values are stored in.
+ */
+#define _IN(_field) offsetof(struct ifinfomsg, _field)
+#define _OUT(_field) offsetof(struct nl_parsed_link, _field)
+/* Fields copied from the fixed ifinfomsg header. */
+static const struct nlfield_parser nlf_p_if[] = {
+ { .off_in = _IN(ifi_type), .off_out = _OUT(ifi_type), .cb = nlf_get_u16 },
+ { .off_in = _IN(ifi_index), .off_out = _OUT(ifi_index), .cb = nlf_get_u32 },
+};
+
+/* Attributes nested inside IFLA_LINKINFO. */
+static const struct nlattr_parser nla_p_linfo[] = {
+ { .type = IFLA_INFO_KIND, .off = _OUT(ifla_cloner), .cb = nlattr_get_stringn },
+ { .type = IFLA_INFO_DATA, .off = _OUT(ifla_idata), .cb = nlattr_get_nla },
+};
+NL_DECLARE_ATTR_PARSER(linfo_parser, nla_p_linfo);
+
+/*
+ * Top-level link attributes.
+ * Note that IFLA_LINK is stored into ifi_index (the same slot as the header
+ * index) and IFLA_ALT_IFNAME into ifla_ifname (the same slot as IFLA_IFNAME).
+ * NOTE(review): IFLA_GROUP is parsed as a string here; confirm this matches
+ * the type of nl_parsed_link.ifla_group and what clients send.
+ */
+static const struct nlattr_parser nla_p_if[] = {
+ { .type = IFLA_IFNAME, .off = _OUT(ifla_ifname), .cb = nlattr_get_string },
+ { .type = IFLA_MTU, .off = _OUT(ifla_mtu), .cb = nlattr_get_uint32 },
+ { .type = IFLA_LINK, .off = _OUT(ifi_index), .cb = nlattr_get_uint32 },
+ { .type = IFLA_LINKINFO, .arg = &linfo_parser, .cb = nlattr_get_nested },
+ { .type = IFLA_GROUP, .off = _OUT(ifla_group), .cb = nlattr_get_string },
+ { .type = IFLA_ALT_IFNAME, .off = _OUT(ifla_ifname), .cb = nlattr_get_string },
+};
+#undef _IN
+#undef _OUT
+NL_DECLARE_STRICT_PARSER(ifmsg_parser, struct ifinfomsg, check_ifmsg, nlf_p_if, nla_p_if);
+
+/*
+ * Checks whether @ifp satisfies the filters parsed from the request.
+ * A zero index, zero type or NULL name means "no filter on this field".
+ * Returns true when every filter that is set matches.
+ */
+static bool
+match_iface(struct nl_parsed_link *attrs, struct ifnet *ifp)
+{
+	if (attrs->ifi_index != 0 && attrs->ifi_index != ifp->if_index)
+		return (false);
+	/* Compare the requested type (not the index) with the interface type. */
+	if (attrs->ifi_type != 0 && attrs->ifi_type != ifp->if_type)
+		return (false);
+	if (attrs->ifla_ifname != NULL &&
+	    strcmp(attrs->ifla_ifname, if_name(ifp)) != 0)
+		return (false);
+	/* TODO: add group match */
+
+	return (true);
+}
+
+/*
+ * {nlmsg_len=52, nlmsg_type=RTM_GETLINK, nlmsg_flags=NLM_F_REQUEST, nlmsg_seq=1662842818, nlmsg_pid=0},
+ * {ifi_family=AF_PACKET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0},
+ * [
+ * [{nla_len=10, nla_type=IFLA_IFNAME}, "vnet9"],
+ * [{nla_len=8, nla_type=IFLA_EXT_MASK}, RTEXT_FILTER_VF]
+ * ]
+ */
+/*
+ * RTM_GETLINK handler: dumps the interfaces matching the request filters.
+ * @hdr: request header
+ * @nlp: requesting socket
+ * @npt: parser state; replies are written to npt->nw
+ *
+ * A request carrying an interface index is answered directly with that
+ * interface (ESRCH when it does not exist or does not match).  Otherwise
+ * all matching interfaces are collected under the net epoch and dumped
+ * afterwards, as dumping may sleep.
+ *
+ * Returns 0 on success, a parser error, ESRCH or ENOMEM.
+ */
+static int
+rtnl_handle_getlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct epoch_tracker et;
+	struct ifnet *ifp;
+	int error = 0;
+
+	struct nl_parsed_link attrs = {};
+	error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	struct netlink_walkargs wa = {
+		.so = nlp,
+		.nw = npt->nw,
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI,
+		.hdr.nlmsg_type = NL_RTM_NEWLINK,
+	};
+
+	/* Fast track for an interface w/ explicit index match */
+	if (attrs.ifi_index != 0) {
+		NET_EPOCH_ENTER(et);
+		ifp = ifnet_byindex_ref(attrs.ifi_index);
+		NET_EPOCH_EXIT(et);
+		NLP_LOG(LOG_DEBUG3, nlp, "fast track -> searching index %u", attrs.ifi_index);
+		if (ifp != NULL) {
+			if (match_iface(&attrs, ifp)) {
+				if (!dump_iface(wa.nw, ifp, &wa.hdr))
+					error = ENOMEM;
+			} else
+				error = ESRCH;
+			if_rele(ifp);
+		} else
+			error = ESRCH;
+		return (error);
+	}
+
+	/*
+	 * Fetching some link properties requires performing ioctls that may block.
+	 * Address it by saving referenced pointers of the matching links,
+	 * exiting from epoch and going through the list one-by-one.
+	 */
+
+	NL_LOG(LOG_DEBUG2, "Start dump");
+
+	struct ifnet **match_array;
+	int offset = 0, base_count = 16; /* start with 128 bytes */
+	match_array = malloc(base_count * sizeof(*match_array), M_TEMP, M_NOWAIT);
+	if (match_array == NULL) {
+		/* M_NOWAIT allocations may fail; do not walk with a NULL array. */
+		NL_LOG(LOG_DEBUG, "unable to allocate interface match array");
+		return (ENOMEM);
+	}
+
+	NLP_LOG(LOG_DEBUG3, nlp, "MATCHING: index=%u type=%d name=%s",
+	    attrs.ifi_index, attrs.ifi_type, attrs.ifla_ifname);
+	NET_EPOCH_ENTER(et);
+	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+		wa.count++;
+		if (!match_iface(&attrs, ifp))
+			continue;
+		if (offset == base_count) {
+			/*
+			 * Too many matches: double the array first, then fall
+			 * through so the current interface is stored as well.
+			 */
+			struct ifnet **new_array;
+			size_t sz = base_count * sizeof(*match_array);
+
+			new_array = malloc(sz * 2, M_TEMP, M_NOWAIT);
+			if (new_array == NULL) {
+				error = ENOMEM;
+				break;
+			}
+			memcpy(new_array, match_array, sz);
+			free(match_array, M_TEMP);
+			match_array = new_array;
+			base_count *= 2;
+		}
+		/* Skip interfaces that can no longer be referenced. */
+		if (!if_try_ref(ifp))
+			continue;
+		match_array[offset++] = ifp;
+	}
+	NET_EPOCH_EXIT(et);
+
+	NL_LOG(LOG_DEBUG2, "Matched %d interface(s), dumping", offset);
+	for (int i = 0; error == 0 && i < offset; i++) {
+		if (!dump_iface(wa.nw, match_array[i], &wa.hdr))
+			error = ENOMEM;
+		else
+			wa.dumped++;
+	}
+	/* Drop every reference taken above, whether or not it was dumped. */
+	for (int i = 0; i < offset; i++)
+		if_rele(match_array[i]);
+	free(match_array, M_TEMP);
+
+	NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa.count, wa.dumped);
+
+	if (!nlmsg_end_dump(wa.nw, error, &wa.hdr)) {
+		NL_LOG(LOG_DEBUG, "Unable to finalize the dump");
+		return (ENOMEM);
+	}
+
+	return (error);
+}
+
+/*
+ * sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[
+ * {nlmsg_len=60, nlmsg_type=RTM_NEWLINK, nlmsg_flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, nlmsg_seq=1662715618, nlmsg_pid=0},
+ * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0},
+ * {nla_len=11, nla_type=IFLA_IFNAME}, "dummy0"],
+ * [
+ * {nla_len=16, nla_type=IFLA_LINKINFO},
+ * [
+ * {nla_len=9, nla_type=IFLA_INFO_KIND}, "dummy"...
+ * ]
+ * ]
+ */
+
+/*
+ * RTM_DELLINK handler: destroys the cloned interface whose index was
+ * supplied in the request.
+ *
+ * Returns a parser error, ENOENT when no interface has that index, or
+ * the result of if_clone_destroy().
+ */
+static int
+rtnl_handle_dellink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct nl_parsed_link attrs = {};
+	struct epoch_tracker et;
+	struct ifnet *target;
+	int rc;
+
+	rc = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs);
+	if (rc != 0)
+		return (rc);
+
+	/* The lookup needs the epoch; the reference keeps the ifnet afterwards. */
+	NET_EPOCH_ENTER(et);
+	target = ifnet_byindex_ref(attrs.ifi_index);
+	NET_EPOCH_EXIT(et);
+
+	if (target == NULL) {
+		NLP_LOG(LOG_DEBUG, nlp, "unable to find interface %u", attrs.ifi_index);
+		return (ENOENT);
+	}
+	NLP_LOG(LOG_DEBUG3, nlp, "mapped ifindex %u to %s", attrs.ifi_index, if_name(target));
+
+	sx_xlock(&ifnet_detach_sxlock);
+	rc = if_clone_destroy(if_name(target));
+	sx_xunlock(&ifnet_detach_sxlock);
+
+	NLP_LOG(LOG_DEBUG2, nlp, "deleting interface %s returned %d", if_name(target), rc);
+
+	if_rele(target);
+
+	return (rc);
+}
+
+/*
+ * RTM_NEWLINK handler: creates an interface through the cloner whose
+ * name matches IFLA_INFO_KIND.
+ *
+ * Returns a parser error, EPERM when no interface name was supplied,
+ * EINVAL when no kind was supplied, ENOTSUP when no cloner of that kind
+ * is registered, or the result of the cloner's create callback.
+ */
+static int
+rtnl_handle_newlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct nl_cloner *cloner;
+	int error;
+
+	struct nl_parsed_link attrs = {};
+	error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	if (attrs.ifla_ifname == NULL || attrs.ifla_ifname[0] == '\0') {
+		/* Applications like ip(8) verify RTM_NEWLINK existence
+		 * by calling it with empty arguments. Always return "innocent"
+		 * error.
+		 */
+		NLMSG_REPORT_ERR_MSG(npt, "empty IFLA_IFNAME attribute");
+		return (EPERM);
+	}
+
+	if (attrs.ifla_cloner == NULL || attrs.ifla_cloner[0] == '\0') {
+		NLMSG_REPORT_ERR_MSG(npt, "empty IFLA_INFO_KIND attribute");
+		return (EINVAL);
+	}
+
+	/* The create callback runs with the cloner list locked shared. */
+	sx_slock(&rtnl_cloner_lock);
+	SLIST_FOREACH(cloner, &nl_cloners, next) {
+		if (strcmp(attrs.ifla_cloner, cloner->name) == 0) {
+			error = cloner->create_f(&attrs, nlp, npt);
+			sx_sunlock(&rtnl_cloner_lock);
+			return (error);
+		}
+	}
+	sx_sunlock(&rtnl_cloner_lock);
+
+	/* TODO: load cloner module if not exists & privilege permits */
+	NLMSG_REPORT_ERR_MSG(npt, "interface type %s not supported", attrs.ifla_cloner);
+	return (ENOTSUP);
+}
+
+/*
+
+{ifa_family=AF_INET, ifa_prefixlen=8, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_HOST, ifa_index=if_nametoindex("lo")},
+ [
+ {{nla_len=8, nla_type=IFA_ADDRESS}, inet_addr("127.0.0.1")},
+ {{nla_len=8, nla_type=IFA_LOCAL}, inet_addr("127.0.0.1")},
+ {{nla_len=7, nla_type=IFA_LABEL}, "lo"},
+ {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT},
+ {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=3619, tstamp=3619}}]},
+---
+
+{{len=72, type=RTM_NEWADDR, flags=NLM_F_MULTI, seq=1642191126, pid=566735},
+ {ifa_family=AF_INET6, ifa_prefixlen=96, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_UNIVERSE, ifa_index=if_nametoindex("virbr0")},
+ [
+ {{nla_len=20, nla_type=IFA_ADDRESS}, inet_pton(AF_INET6, "2a01:4f8:13a:70c:ffff::1")},
+ {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=4283, tstamp=4283}},
+ {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT}]},
+*/
+
+/*
+ * Maps an interface address to an RT_SCOPE_* value:
+ * loopback addresses are host-scoped, link-local addresses are
+ * link-scoped, everything else (including non-IP families) is universe.
+ */
+static uint8_t
+ifa_get_scope(const struct ifaddr *ifa)
+{
+	const struct sockaddr *sa;
+	uint8_t addr_scope = RT_SCOPE_UNIVERSE;
+
+	sa = ifa->ifa_addr;
+	switch (sa->sa_family) {
+	case AF_INET:
+		{
+			/*
+			 * sin_addr is stored in network byte order, while
+			 * IN_LOOPBACK()/IN_LINKLOCAL() test a host-order value.
+			 */
+			in_addr_t addr;
+			addr = ntohl(((const struct sockaddr_in *)sa)->sin_addr.s_addr);
+			if (IN_LOOPBACK(addr))
+				addr_scope = RT_SCOPE_HOST;
+			else if (IN_LINKLOCAL(addr))
+				addr_scope = RT_SCOPE_LINK;
+			break;
+		}
+	case AF_INET6:
+		{
+			const struct in6_addr *addr;
+			addr = &((const struct sockaddr_in6 *)sa)->sin6_addr;
+			if (IN6_IS_ADDR_LOOPBACK(addr))
+				addr_scope = RT_SCOPE_HOST;
+			else if (IN6_IS_ADDR_LINKLOCAL(addr))
+				addr_scope = RT_SCOPE_LINK;
+			break;
+		}
+	}
+
+	return (addr_scope);
+}
+
+/*
+ * Returns the number of bits set in an IPv6 netmask, i.e. the sum of
+ * the population counts of its four 32-bit words.
+ */
+static uint8_t
+inet6_get_plen(const struct in6_addr *addr)
+{
+	unsigned int bits = 0;
+
+	for (int i = 0; i < 4; i++)
+		bits += bitcount32(addr->s6_addr32[i]);
+
+	return (bits);
+}
+
+/*
+ * Returns the prefix length described by netmask @sa, computed as the
+ * number of bits set in the mask.
+ *
+ * Returns 0 for a NULL mask (the family, and therefore the host prefix
+ * length, cannot be derived from a NULL pointer) and for families other
+ * than AF_INET and AF_INET6.
+ */
+static uint8_t
+get_sa_plen(const struct sockaddr *sa)
+{
+	const struct in6_addr *paddr6;
+	const struct in_addr *paddr;
+
+	/* Must be checked before sa->sa_family is read. */
+	if (sa == NULL)
+		return (0);
+
+	switch (sa->sa_family) {
+	case AF_INET:
+		paddr = &(((const struct sockaddr_in *)sa)->sin_addr);
+		return (bitcount32(paddr->s_addr));
+	case AF_INET6:
+		paddr6 = &(((const struct sockaddr_in6 *)sa)->sin6_addr);
+		return (inet6_get_plen(paddr6));
+	}
+
+	return (0);
+}
+
+
+/*
+ * {'attrs': [('IFA_ADDRESS', '12.0.0.1'),
+ ('IFA_LOCAL', '12.0.0.1'),
+ ('IFA_LABEL', 'eth10'),
+ ('IFA_FLAGS', 128),
+ ('IFA_CACHEINFO', {'ifa_preferred': 4294967295, 'ifa_valid': 4294967295, 'cstamp': 63745746, 'tstamp': 63745746})],
+ */
+/*
+ * Writes one address message for @ifa of interface @ifp.
+ * @nw: message writer
+ * @hdr: template header
+ *
+ * Emits the ifaddrmsg header followed by IFA_ADDRESS, IFA_LOCAL,
+ * IFA_LABEL and IFA_FLAGS.  Returns true when the message was completed;
+ * otherwise the message is aborted and false is returned.
+ */
+static bool
+dump_iface_addr(struct nl_writer *nw, struct ifnet *ifp, struct ifaddr *ifa,
+    const struct nlmsghdr *hdr)
+{
+	struct sockaddr *local = ifa->ifa_addr;
+
+	NL_LOG(LOG_DEBUG3, "dumping ifa %p type %s(%d) for interface %s",
+	    ifa, rib_print_family(local->sa_family), local->sa_family, if_name(ifp));
+
+	if (nlmsg_reply(nw, hdr, sizeof(struct ifaddrmsg))) {
+		struct ifaddrmsg *msg;
+		struct sockaddr *peer;
+		uint32_t flags = 0; // ifa->ifa_flags;
+
+		msg = nlmsg_reserve_object(nw, struct ifaddrmsg);
+		msg->ifa_family = local->sa_family;
+		msg->ifa_prefixlen = get_sa_plen(ifa->ifa_netmask);
+		msg->ifa_flags = 0; // ifa_flags is useless
+		msg->ifa_scope = ifa_get_scope(ifa);
+		msg->ifa_index = ifp->if_index;
+
+		/*
+		 * IFA_ADDRESS carries the destination address when one of the
+		 * same family exists, and the local address otherwise.
+		 */
+		peer = ifa->ifa_dstaddr;
+		if (peer == NULL || peer->sa_family != local->sa_family)
+			peer = local;
+		dump_sa(nw, IFA_ADDRESS, peer);
+		dump_sa(nw, IFA_LOCAL, local);
+		nlattr_add_string(nw, IFA_LABEL, if_name(ifp));
+
+		nlattr_add_u32(nw, IFA_FLAGS, flags);
+
+		if (nlmsg_end(nw))
+			return (true);
+	}
+
+	NL_LOG(LOG_DEBUG, "Failed to dump ifa type %s(%d) for interface %s",
+	    rib_print_family(local->sa_family), local->sa_family, if_name(ifp));
+	nlmsg_abort(nw);
+	return (false);
+}
+
+/*
+ * Address dump handler: walks every address of every interface and
+ * emits an NL_RTM_NEWADDR message for each one that is not AF_LINK and
+ * passes the (optional) family filter.
+ *
+ * Returns 0 on success or ENOMEM when a message could not be written or
+ * the dump could not be finalized.
+ */
+static int
+rtnl_handle_getaddr(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct ifnet *ifp;
+	struct ifaddr *ifa;
+	int error = 0;
+
+	struct netlink_walkargs wa = {
+		.so = nlp,
+		.nw = npt->nw,
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI,
+		.hdr.nlmsg_type = NL_RTM_NEWADDR,
+	};
+
+	NL_LOG(LOG_DEBUG2, "Start dump");
+
+	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+			int family = ifa->ifa_addr->sa_family;
+
+			if (wa.family != 0 && wa.family != family)
+				continue;
+			if (family == AF_LINK)
+				continue;
+			wa.count++;
+			if (!dump_iface_addr(wa.nw, ifp, ifa, &wa.hdr)) {
+				/* The first failure ends the whole walk. */
+				error = ENOMEM;
+				goto done;
+			}
+			wa.dumped++;
+		}
+	}
+
+done:
+	NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa.count, wa.dumped);
+
+	if (!nlmsg_end_dump(wa.nw, error, &wa.hdr)) {
+		NL_LOG(LOG_DEBUG, "Unable to finalize the dump");
+		return (ENOMEM);
+	}
+
+	return (error);
+}
+
+/*
+ * rt_addrmsg event handler: notifies the IPv4/IPv6 address group
+ * listeners about an address being added or removed.
+ * @ifa: address the event refers to
+ * @cmd: RTM_DELETE produces NL_RTM_DELADDR, anything else NL_RTM_NEWADDR
+ */
+static void
+rtnl_handle_ifaddr(void *arg __unused, struct ifaddr *ifa, int cmd)
+{
+	struct nl_writer nw = {};
+	struct nlmsghdr hdr = {};
+	int family = ifa->ifa_addr->sa_family;
+	uint32_t group;
+
+	if (family == AF_INET)
+		group = RTNLGRP_IPV4_IFADDR;
+	else if (family == AF_INET6)
+		group = RTNLGRP_IPV6_IFADDR;
+	else {
+		NL_LOG(LOG_DEBUG2, "ifa notification for unknown AF: %d",
+		    ifa->ifa_addr->sa_family);
+		return;
+	}
+
+	/* Nothing to do when nobody subscribed to the group. */
+	if (!nl_has_listeners(NETLINK_ROUTE, group))
+		return;
+
+	if (!nlmsg_get_group_writer(&nw, NLMSG_LARGE, NETLINK_ROUTE, group)) {
+		NL_LOG(LOG_DEBUG, "error allocating group writer");
+		return;
+	}
+
+	if (cmd == RTM_DELETE)
+		hdr.nlmsg_type = NL_RTM_DELADDR;
+	else
+		hdr.nlmsg_type = NL_RTM_NEWADDR;
+
+	dump_iface_addr(&nw, ifa->ifa_ifp, ifa, &hdr);
+	nlmsg_flush(&nw);
+}
+
+/*
+ * Sends a link notification of type @msg_type describing @ifp to the
+ * RTNLGRP_LINK listeners.  Does nothing when there are no listeners or
+ * when a group writer cannot be obtained.
+ */
+static void
+rtnl_notify_link(struct ifnet *ifp, int msg_type)
+{
+	struct nlmsghdr hdr = { .nlmsg_type = msg_type };
+	struct nl_writer nw = {};
+
+	if (!nl_has_listeners(NETLINK_ROUTE, RTNLGRP_LINK))
+		return;
+
+	if (!nlmsg_get_group_writer(&nw, NLMSG_LARGE, NETLINK_ROUTE, RTNLGRP_LINK)) {
+		NL_LOG(LOG_DEBUG, "error allocating mbuf");
+		return;
+	}
+	dump_iface(&nw, ifp, &hdr);
+	nlmsg_flush(&nw);
+}
+
+/* ifnet_arrival_event handler: announces @ifp with NL_RTM_NEWLINK. */
+static void
+rtnl_handle_ifattach(void *arg, struct ifnet *ifp)
+{
+	rtnl_notify_link(ifp, NL_RTM_NEWLINK);
+}
+
+/* ifnet_departure_event handler: announces @ifp with NL_RTM_DELLINK. */
+static void
+rtnl_handle_ifdetach(void *arg, struct ifnet *ifp)
+{
+	rtnl_notify_link(ifp, NL_RTM_DELLINK);
+}
+
+/*
+ * Message handlers registered by rtnl_ifaces_init().
+ * The link handlers carry RTNL_F_NOEPOCH; the address handlers do not.
+ * NOTE(review): NL_RTM_NEWADDR and NL_RTM_DELADDR are routed to the dump
+ * handler rtnl_handle_getaddr; presumably placeholders until address
+ * modification is implemented -- confirm.
+ */
+static const struct rtnl_cmd_handler cmd_handlers[] = {
+ {
+ .cmd = NL_RTM_GETLINK,
+ .name = "RTM_GETLINK",
+ .cb = &rtnl_handle_getlink,
+ .flags = RTNL_F_NOEPOCH,
+ },
+ {
+ .cmd = NL_RTM_DELLINK,
+ .name = "RTM_DELLINK",
+ .cb = &rtnl_handle_dellink,
+ .priv = PRIV_NET_IFDESTROY,
+ .flags = RTNL_F_NOEPOCH,
+ },
+ {
+ .cmd = NL_RTM_NEWLINK,
+ .name = "RTM_NEWLINK",
+ .cb = &rtnl_handle_newlink,
+ .priv = PRIV_NET_IFCREATE,
+ .flags = RTNL_F_NOEPOCH,
+ },
+ {
+ .cmd = NL_RTM_GETADDR,
+ .name = "RTM_GETADDR",
+ .cb = &rtnl_handle_getaddr,
+ },
+ {
+ .cmd = NL_RTM_NEWADDR,
+ .name = "RTM_NEWADDR",
+ .cb = &rtnl_handle_getaddr,
+ },
+ {
+ .cmd = NL_RTM_DELADDR,
+ .name = "RTM_DELADDR",
+ .cb = &rtnl_handle_getaddr,
+ },
+};
+
+/* Parsers checked by NL_VERIFY_PARSERS() in rtnl_ifaces_init(). */
+static const struct nlhdr_parser *all_parsers[] = { &ifmsg_parser };
+
+/*
+ * Registers @cloner so RTM_NEWLINK requests can find it by name.
+ * The entry is linked into the list as-is, not copied.
+ */
+void
+rtnl_iface_add_cloner(struct nl_cloner *cloner)
+{
+ sx_xlock(&rtnl_cloner_lock);
+ SLIST_INSERT_HEAD(&nl_cloners, cloner, next);
+ sx_xunlock(&rtnl_cloner_lock);
+}
+
+/*
+ * Unlinks @cloner from the list of registered interface cloners.
+ */
+void
+rtnl_iface_del_cloner(struct nl_cloner *cloner)
+{
+
+	sx_xlock(&rtnl_cloner_lock);
+	SLIST_REMOVE(&nl_cloners, cloner, nl_cloner, next);
+	sx_xunlock(&rtnl_cloner_lock);
+}
+
+/*
+ * Sets up the interface part of the route netlink family: subscribes to
+ * interface arrival/departure and address events, verifies the parsers,
+ * registers the interface drivers and finally the message handlers.
+ */
+void
+rtnl_ifaces_init(void)
+{
+ /* Tags are saved so rtnl_ifaces_destroy() can deregister them. */
+ ifattach_event = EVENTHANDLER_REGISTER(
+ ifnet_arrival_event, rtnl_handle_ifattach, NULL,
+ EVENTHANDLER_PRI_ANY);
+ ifdetach_event = EVENTHANDLER_REGISTER(
+ ifnet_departure_event, rtnl_handle_ifdetach, NULL,
+ EVENTHANDLER_PRI_ANY);
+ ifaddr_event = EVENTHANDLER_REGISTER(
+ rt_addrmsg, rtnl_handle_ifaddr, NULL,
+ EVENTHANDLER_PRI_ANY);
+ NL_VERIFY_PARSERS(all_parsers);
+ rtnl_iface_drivers_register();
+ rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers));
+}
+
+/*
+ * Deregisters the three event handlers installed by rtnl_ifaces_init().
+ */
+void
+rtnl_ifaces_destroy(void)
+{
+ EVENTHANDLER_DEREGISTER(ifnet_arrival_event, ifattach_event);
+ EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_event);
+ EVENTHANDLER_DEREGISTER(rt_addrmsg, ifaddr_event);
+}
diff --git a/sys/netlink/route/iface_drivers.c b/sys/netlink/route/iface_drivers.c
new file mode 100644
index 000000000000..ccc8f2184fa3
--- /dev/null
+++ b/sys/netlink/route/iface_drivers.c
@@ -0,0 +1,165 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/syslog.h>
+#include <sys/socketvar.h>
+
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_media.h>
+#include <net/if_var.h>
+#include <net/if_clone.h>
+#include <net/if_vlan_var.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_route.h>
+#include <netlink/route/route_var.h>
+
+#include <netinet6/scope6_var.h> /* scope deembedding */
+
+#define DEBUG_MOD_NAME nl_iface_drivers
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+/*
+ *
+ * {len=76, type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1662892737, pid=0},
+ * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0},
+ * [
+ * {{nla_len=8, nla_type=IFLA_LINK}, 2},
+ * {{nla_len=12, nla_type=IFLA_IFNAME}, "xvlan22"},
+ * {{nla_len=24, nla_type=IFLA_LINKINFO},
+ * [
+ * {{nla_len=8, nla_type=IFLA_INFO_KIND}, "vlan"...},
+ * {{nla_len=12, nla_type=IFLA_INFO_DATA}, "\x06\x00\x01\x00\x16\x00\x00\x00"}]}]}, iov_len=76}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 76
+ */
+
+/* VLAN parameters parsed from the nested IFLA_INFO_DATA attribute. */
+struct nl_parsed_vlan {
+ uint16_t vlan_id; /* from IFLA_VLAN_ID */
+ uint16_t vlan_proto; /* from IFLA_VLAN_PROTOCOL */
+ struct ifla_vlan_flags vlan_flags; /* from IFLA_VLAN_FLAGS */
+};
+
+/*
+ * Attribute parser for the VLAN-specific data; _OUT() is an offset into
+ * struct nl_parsed_vlan.
+ * NOTE(review): IFLA_VLAN_FLAGS uses nlattr_get_nla, which elsewhere in this
+ * change fills a field that is used as a pointer (ifla_idata), while
+ * vlan_flags is an embedded struct -- confirm this is intended.
+ */
+#define _OUT(_field) offsetof(struct nl_parsed_vlan, _field)
+static const struct nlattr_parser nla_p_vlan[] = {
+ { .type = IFLA_VLAN_ID, .off = _OUT(vlan_id), .cb = nlattr_get_uint16 },
+ { .type = IFLA_VLAN_FLAGS, .off = _OUT(vlan_flags), .cb = nlattr_get_nla },
+ { .type = IFLA_VLAN_PROTOCOL, .off = _OUT(vlan_proto), .cb = nlattr_get_uint16 },
+};
+#undef _OUT
+NL_DECLARE_ATTR_PARSER(vlan_parser, nla_p_vlan);
+
+/*
+ * Cloner callback creating a vlan interface.
+ * @lattrs: parsed link attributes of the RTM_NEWLINK request
+ * @nlp: requesting socket
+ * @npt: parser state used for error reporting
+ *
+ * Returns ENOTSUP when no vlan data was supplied or the ethertype is
+ * unsupported, EINVAL for a VLAN id above 4095, ENOENT when the parent
+ * interface does not exist, a parser error, or the result of
+ * if_clone_create().
+ */
+static int
+create_vlan(struct nl_parsed_link *lattrs, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct epoch_tracker et;
+	struct ifnet *ifp;
+	int error;
+
+	/*
+	 * lattrs.ifla_ifname is the new interface name
+	 * lattrs.ifi_index contains parent interface index
+	 * lattrs.ifla_idata contains un-parsed vlan data
+	 */
+
+	struct nl_parsed_vlan attrs = {
+		.vlan_id = 0xFEFE,
+		.vlan_proto = ETHERTYPE_VLAN
+	};
+
+	if (lattrs->ifla_idata == NULL) {
+		NLMSG_REPORT_ERR_MSG(npt, "vlan id is required, guessing not supported");
+		return (ENOTSUP);
+	}
+	/* Log only after the NULL check: the message reads ifla_idata->nla_len. */
+	NLP_LOG(LOG_DEBUG3, nlp, "nested: %p len %d", lattrs->ifla_idata, lattrs->ifla_idata->nla_len);
+
+	error = nl_parse_nested(lattrs->ifla_idata, &vlan_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+	/* The 0xFEFE default is rejected here when IFLA_VLAN_ID is absent. */
+	if (attrs.vlan_id > 4095) {
+		NLMSG_REPORT_ERR_MSG(npt, "Invalid VID: %d", attrs.vlan_id);
+		return (EINVAL);
+	}
+	if (attrs.vlan_proto != ETHERTYPE_VLAN && attrs.vlan_proto != ETHERTYPE_QINQ) {
+		NLMSG_REPORT_ERR_MSG(npt, "Unsupported ethertype: 0x%04X", attrs.vlan_proto);
+		return (ENOTSUP);
+	}
+
+	NET_EPOCH_ENTER(et);
+	ifp = ifnet_byindex_ref(lattrs->ifi_index);
+	NET_EPOCH_EXIT(et);
+	if (ifp == NULL) {
+		NLP_LOG(LOG_DEBUG, nlp, "unable to find parent interface %u",
+		    lattrs->ifi_index);
+		return (ENOENT);
+	}
+
+	/* Waiting till if_clone changes lands */
+/*
+	struct vlanreq params = {
+		.vlr_tag = attrs.vlan_id,
+		.vlr_proto = attrs.vlan_proto,
+	};
+*/
+	int ifname_len = strlen(lattrs->ifla_ifname) + 1;
+	error = if_clone_create(lattrs->ifla_ifname, ifname_len, (char *)NULL);
+
+	NLP_LOG(LOG_DEBUG2, nlp, "clone for %s returned %d", lattrs->ifla_ifname, error);
+
+	if_rele(ifp);
+	return (error);
+}
+
+/* Cloner entry matched against IFLA_INFO_KIND "vlan". */
+static struct nl_cloner vlan_cloner = {
+ .name = "vlan",
+ .create_f = create_vlan,
+
+};
+
+/* Parsers checked by NL_VERIFY_PARSERS() in rtnl_iface_drivers_register(). */
+static const struct nlhdr_parser *all_parsers[] = { &vlan_parser };
+
+/*
+ * Registers the built-in interface cloners (currently only "vlan") and
+ * verifies the parsers declared in this file.
+ */
+void
+rtnl_iface_drivers_register(void)
+{
+ rtnl_iface_add_cloner(&vlan_cloner);
+ NL_VERIFY_PARSERS(all_parsers);
+}
+
+
diff --git a/sys/netlink/route/ifaddrs.h b/sys/netlink/route/ifaddrs.h
new file mode 100644
index 000000000000..e2013cb266d7
--- /dev/null
+++ b/sys/netlink/route/ifaddrs.h
@@ -0,0 +1,90 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Interface address-related (RTM_<NEW|DEL|GET>ADDR) message header and attributes.
+ */
+
+#ifndef _NETLINK_ROUTE_IFADDRS_H_
+#define _NETLINK_ROUTE_IFADDRS_H_
+
+/*
+ * Base header for all of the relevant (RTM_<NEW|DEL|GET>ADDR) messages.
+ * IFA_* attributes follow this fixed-size header.
+ */
+struct ifaddrmsg {
+ uint8_t ifa_family; /* Address family */
+ uint8_t ifa_prefixlen; /* Prefix length */
+ uint8_t ifa_flags; /* Address-specific flags (8 bits wide) */
+ uint8_t ifa_scope; /* Address scope */
+ uint32_t ifa_index; /* Link ifindex */
+};
+
+/* Helpers compiled only when _KERNEL is not defined (i.e. for userland consumers) */
+#ifndef _KERNEL
+#define _NL_IFA_HDRLEN ((int)sizeof(struct ifaddrmsg))
+#define IFA_RTA(_ifa) ((struct rtattr *)(NL_ITEM_DATA(_ifa, _NL_IFA_HDRLEN)))
+#define IFA_PAYLOAD(_hdr) NLMSG_PAYLOAD(_hdr, _NL_IFA_HDRLEN)
+#endif
+
+/* Defined attributes (types of the attributes following struct ifaddrmsg) */
+enum {
+ IFA_UNSPEC,
+ IFA_ADDRESS = 1, /* binary, prefix address (destination for p2p) */
+ IFA_LOCAL = 2, /* binary, interface address */
+ IFA_LABEL = 3, /* not supported */
+ IFA_BROADCAST = 4, /* binary, broadcast ifa */
+ IFA_ANYCAST = 5, /* not supported */
+ IFA_CACHEINFO = 6, /* not supported */
+ IFA_MULTICAST = 7, /* not supported */
+ IFA_FLAGS = 8, /* not supported */
+ IFA_RT_PRIORITY = 9, /* not supported */
+ IFA_TARGET_NETNSID = 10, /* not supported */
+ __IFA_MAX, /* sentinel: one past the last defined attribute */
+};
+#define IFA_MAX (__IFA_MAX - 1)
+
+/*
+ * IFA_FLAGS attribute flags.
+ * Values above 0x0080 do not fit into the 8-bit ifaddrmsg.ifa_flags field.
+ * IFA_F_TEMPORARY is an alias of IFA_F_SECONDARY.
+ */
+#define IFA_F_SECONDARY 0x0001
+#define IFA_F_TEMPORARY IFA_F_SECONDARY
+#define IFA_F_NODAD 0x0002
+#define IFA_F_OPTIMISTIC 0x0004
+#define IFA_F_DADFAILED 0x0008
+#define IFA_F_HOMEADDRESS 0x0010
+#define IFA_F_DEPRECATED 0x0020
+#define IFA_F_TENTATIVE 0x0040
+#define IFA_F_PERMANENT 0x0080
+#define IFA_F_MANAGETEMPADDR 0x0100
+#define IFA_F_NOPREFIXROUTE 0x0200
+#define IFA_F_MCAUTOJOIN 0x0400
+#define IFA_F_STABLE_PRIVACY 0x0800
+
+/*
+ * IFA_CACHEINFO value.
+ * NOTE(review): field meanings below are assumed to mirror the Linux
+ * structure of the same name -- confirm before relying on them.
+ */
+struct ifa_cacheinfo {
+ uint32_t ifa_prefered; /* presumably the preferred lifetime */
+ uint32_t ifa_valid; /* presumably the valid lifetime */
+ uint32_t cstamp; /* presumably the creation timestamp */
+ uint32_t tstamp; /* presumably the last-update timestamp */
+};
+
+#endif
diff --git a/sys/netlink/route/interface.h b/sys/netlink/route/interface.h
new file mode 100644
index 000000000000..cae763cc4a58
--- /dev/null
+++ b/sys/netlink/route/interface.h
@@ -0,0 +1,245 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Interface-related (RTM_<NEW|DEL|GET|SET>LINK) message header and attributes.
+ */
+
+#ifndef _NETLINK_ROUTE_INTERFACE_H_
+#define _NETLINK_ROUTE_INTERFACE_H_
+
+/*
+ * Base header for all of the relevant (RTM_<NEW|DEL|GET|SET>LINK) messages.
+ * IFLA_* attributes follow this fixed-size header.
+ */
+struct ifinfomsg {
+ unsigned char ifi_family; /* not used */
+ unsigned char __ifi_pad;
+ unsigned short ifi_type; /* ARPHRD_* */
+ int ifi_index; /* Interface index */
+ unsigned ifi_flags; /* IFF_* flags */
+ unsigned ifi_change; /* IFF_* change mask */
+};
+
+#ifndef _KERNEL
+/* Compatibility helpers (only available when _KERNEL is not defined) */
+#define _IFINFO_HDRLEN ((int)sizeof(struct ifinfomsg))
+#define IFLA_RTA(_ifi) ((struct rtattr *)NL_ITEM_DATA(_ifi, _IFINFO_HDRLEN))
+#define IFLA_PAYLOAD(_ifi) NLMSG_PAYLOAD(_ifi, _IFINFO_HDRLEN)
+#endif
+
+/*
+ * Attribute types that may follow struct ifinfomsg.
+ * Some enumerators are additionally #define'd to themselves so that their
+ * presence can be tested with #ifdef.
+ */
+enum {
+ IFLA_UNSPEC = 0,
+ IFLA_ADDRESS = 1, /* binary: Link-level address (MAC) */
+#define IFLA_ADDRESS IFLA_ADDRESS
+ IFLA_BROADCAST = 2, /* binary: link-level broadcast address */
+#define IFLA_BROADCAST IFLA_BROADCAST
+ IFLA_IFNAME = 3, /* string: Interface name */
+#define IFLA_IFNAME IFLA_IFNAME
+ IFLA_MTU = 4, /* u32: Current interface L3 mtu */
+#define IFLA_MTU IFLA_MTU
+ IFLA_LINK = 5, /* u32: interface index */
+#define IFLA_LINK IFLA_LINK
+ IFLA_QDISC = 6, /* string: Queueing policy (not supported) */
+#define IFLA_QDISC IFLA_QDISC
+ IFLA_STATS = 7, /* Interface counters */
+#define IFLA_STATS IFLA_STATS
+ IFLA_COST = 8, /* not supported */
+#define IFLA_COST IFLA_COST
+ IFLA_PRIORITY = 9, /* not supported */
+#define IFLA_PRIORITY IFLA_PRIORITY
+ IFLA_MASTER = 10, /* u32: parent interface ifindex */
+#define IFLA_MASTER IFLA_MASTER
+ IFLA_WIRELESS = 11, /* not supported */
+#define IFLA_WIRELESS IFLA_WIRELESS
+ IFLA_PROTINFO = 12, /* protocol-specific data */
+#define IFLA_PROTINFO IFLA_PROTINFO
+ IFLA_TXQLEN = 13, /* u32: transmit queue length */
+#define IFLA_TXQLEN IFLA_TXQLEN
+ IFLA_MAP = 14, /* not supported */
+#define IFLA_MAP IFLA_MAP
+ IFLA_WEIGHT = 15, /* not supported */
+#define IFLA_WEIGHT IFLA_WEIGHT
+ IFLA_OPERSTATE = 16, /* u8: ifOperStatus per RFC 2863 */
+#define IFLA_OPERSTATE IFLA_OPERSTATE
+ IFLA_LINKMODE = 17, /* u8: ifmedia (not supported) */
+#define IFLA_LINKMODE IFLA_LINKMODE
+ IFLA_LINKINFO = 18, /* nested: IFLA_INFO_ */
+#define IFLA_LINKINFO IFLA_LINKINFO
+ IFLA_NET_NS_PID = 19, /* u32: vnet id (not supported) */
+#define IFLA_NET_NS_PID IFLA_NET_NS_PID
+ IFLA_IFALIAS = 20, /* not supported */
+#define IFLA_IFALIAS IFLA_IFALIAS
+ IFLA_NUM_VF = 21, /* not supported */
+#define IFLA_NUM_VF IFLA_NUM_VF
+ IFLA_VFINFO_LIST= 22, /* not supported */
+#define IFLA_VFINFO_LIST IFLA_VFINFO_LIST
+ IFLA_STATS64 = 23, /* rtnl_link_stats64: iface stats */
+#define IFLA_STATS64 IFLA_STATS64
+ IFLA_VF_PORTS,
+ IFLA_PORT_SELF,
+ IFLA_AF_SPEC,
+ IFLA_GROUP, /* Group the device belongs to */
+ IFLA_NET_NS_FD,
+ IFLA_EXT_MASK, /* Extended info mask, VFs, etc */
+ IFLA_PROMISCUITY, /* Promiscuity count: > 0 means acts PROMISC */
+#define IFLA_PROMISCUITY IFLA_PROMISCUITY
+ IFLA_NUM_TX_QUEUES,
+ IFLA_NUM_RX_QUEUES,
+ IFLA_CARRIER,
+ IFLA_PHYS_PORT_ID,
+ IFLA_CARRIER_CHANGES,
+ IFLA_PHYS_SWITCH_ID,
+ IFLA_LINK_NETNSID,
+ IFLA_PHYS_PORT_NAME,
+ IFLA_PROTO_DOWN,
+ IFLA_GSO_MAX_SEGS,
+ IFLA_GSO_MAX_SIZE,
+ IFLA_PAD,
+ IFLA_XDP,
+ IFLA_EVENT,
+ IFLA_NEW_NETNSID,
+ IFLA_IF_NETNSID,
+ IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */
+ IFLA_CARRIER_UP_COUNT,
+ IFLA_CARRIER_DOWN_COUNT,
+ IFLA_NEW_IFINDEX,
+ IFLA_MIN_MTU,
+ IFLA_MAX_MTU,
+ IFLA_PROP_LIST,
+ IFLA_ALT_IFNAME, /* Alternative ifname */
+ IFLA_PERM_ADDRESS,
+ IFLA_PROTO_DOWN_REASON,
+ __IFLA_MAX /* sentinel: one past the last defined attribute */
+};
+#define IFLA_MAX (__IFLA_MAX - 1)
+
+/*
+ * Attributes that can be used as filters:
+ * IFLA_IFNAME, IFLA_GROUP, IFLA_ALT_IFNAME
+ * Headers that can be used as filters:
+ * ifi_index, ifi_type
+ */
+
+/*
+ * IFLA_OPERSTATE.
+ * The values below represent the possible
+ * states of ifOperStatus defined by RFC 2863.
+ * IFLA_OPERSTATE is documented above as a u8, so every value fits in one byte.
+ */
+enum {
+ IF_OPER_UNKNOWN = 0, /* status cannot be determined */
+ IF_OPER_NOTPRESENT = 1, /* some (hardware) component not present */
+ IF_OPER_DOWN = 2, /* down */
+ IF_OPER_LOWERLAYERDOWN = 3, /* some lower-level interface is down */
+ IF_OPER_TESTING = 4, /* in some test mode */
+ IF_OPER_DORMANT = 5, /* "up" but waiting for some condition (802.1X) */
+ IF_OPER_UP = 6, /* ready to pass packets */
+};
+
+/* IFLA_STATS: 32-bit interface counters (same field set as rtnl_link_stats64) */
+struct rtnl_link_stats {
+ uint32_t rx_packets; /* total RX packets (IFCOUNTER_IPACKETS) */
+ uint32_t tx_packets; /* total TX packets (IFCOUNTER_OPACKETS) */
+ uint32_t rx_bytes; /* total RX bytes (IFCOUNTER_IBYTES) */
+ uint32_t tx_bytes; /* total TX bytes (IFCOUNTER_OBYTES) */
+ uint32_t rx_errors; /* RX errors (IFCOUNTER_IERRORS) */
+ uint32_t tx_errors; /* TX errors (IFCOUNTER_OERRORS) */
+ uint32_t rx_dropped; /* RX drop (no space in ring/no bufs) (IFCOUNTER_IQDROPS) */
+ uint32_t tx_dropped; /* TX drop (IFCOUNTER_OQDROPS) */
+ uint32_t multicast; /* RX multicast packets (IFCOUNTER_IMCASTS) */
+ uint32_t collisions; /* not supported */
+ uint32_t rx_length_errors; /* not supported */
+ uint32_t rx_over_errors; /* not supported */
+ uint32_t rx_crc_errors; /* not supported */
+ uint32_t rx_frame_errors; /* not supported */
+ uint32_t rx_fifo_errors; /* not supported */
+ uint32_t rx_missed_errors; /* not supported */
+ uint32_t tx_aborted_errors; /* not supported */
+ uint32_t tx_carrier_errors; /* not supported */
+ uint32_t tx_fifo_errors; /* not supported */
+ uint32_t tx_heartbeat_errors; /* not supported */
+ uint32_t tx_window_errors; /* not supported */
+ uint32_t rx_compressed; /* not supported */
+ uint32_t tx_compressed; /* not supported */
+ uint32_t rx_nohandler; /* dropped due to no proto handler (IFCOUNTER_NOPROTO) */
+};
+
+/* IFLA_STATS64: 64-bit interface counters (same field set as rtnl_link_stats) */
+struct rtnl_link_stats64 {
+ uint64_t rx_packets; /* total RX packets (IFCOUNTER_IPACKETS) */
+ uint64_t tx_packets; /* total TX packets (IFCOUNTER_OPACKETS) */
+ uint64_t rx_bytes; /* total RX bytes (IFCOUNTER_IBYTES) */
+ uint64_t tx_bytes; /* total TX bytes (IFCOUNTER_OBYTES) */
+ uint64_t rx_errors; /* RX errors (IFCOUNTER_IERRORS) */
+ uint64_t tx_errors; /* TX errors (IFCOUNTER_OERRORS) */
+ uint64_t rx_dropped; /* RX drop (no space in ring/no bufs) (IFCOUNTER_IQDROPS) */
+ uint64_t tx_dropped; /* TX drop (IFCOUNTER_OQDROPS) */
+ uint64_t multicast; /* RX multicast packets (IFCOUNTER_IMCASTS) */
+ uint64_t collisions; /* not supported */
+ uint64_t rx_length_errors; /* not supported */
+ uint64_t rx_over_errors; /* not supported */
+ uint64_t rx_crc_errors; /* not supported */
+ uint64_t rx_frame_errors; /* not supported */
+ uint64_t rx_fifo_errors; /* not supported */
+ uint64_t rx_missed_errors; /* not supported */
+ uint64_t tx_aborted_errors; /* not supported */
+ uint64_t tx_carrier_errors; /* not supported */
+ uint64_t tx_fifo_errors; /* not supported */
+ uint64_t tx_heartbeat_errors; /* not supported */
+ uint64_t tx_window_errors; /* not supported */
+ uint64_t rx_compressed; /* not supported */
+ uint64_t tx_compressed; /* not supported */
+ uint64_t rx_nohandler; /* dropped due to no proto handler (IFCOUNTER_NOPROTO) */
+};
+
+/* IFLA_LINKINFO child nlattr types (attributes nested inside IFLA_LINKINFO) */
+enum {
+ IFLA_INFO_UNSPEC,
+ IFLA_INFO_KIND = 1, /* string, link type ("vlan") */
+ IFLA_INFO_DATA = 2, /* Per-link-type custom data */
+ IFLA_INFO_XSTATS = 3,
+ IFLA_INFO_SLAVE_KIND = 4,
+ IFLA_INFO_SLAVE_DATA = 5,
+ __IFLA_INFO_MAX, /* sentinel: one past the last defined attribute */
+};
+#define IFLA_INFO_MAX (__IFLA_INFO_MAX - 1)
+
+/* IFLA_INFO_DATA vlan attributes (values are assigned sequentially from 0) */
+enum {
+ IFLA_VLAN_UNSPEC,
+ IFLA_VLAN_ID,
+ IFLA_VLAN_FLAGS,
+ IFLA_VLAN_EGRESS_QOS,
+ IFLA_VLAN_INGRESS_QOS,
+ IFLA_VLAN_PROTOCOL,
+ __IFLA_VLAN_MAX, /* sentinel: one past the last defined attribute */
+};
+
+#define IFLA_VLAN_MAX (__IFLA_VLAN_MAX - 1)
+/*
+ * NOTE(review): presumably the payload of the IFLA_VLAN_FLAGS attribute,
+ * with `mask` selecting which bits of `flags` apply -- confirm against users.
+ */
+struct ifla_vlan_flags {
+ uint32_t flags;
+ uint32_t mask;
+};
+
+#endif
diff --git a/sys/netlink/route/neigh.c b/sys/netlink/route/neigh.c
new file mode 100644
index 000000000000..02ad138240a2
--- /dev/null
+++ b/sys/netlink/route/neigh.c
@@ -0,0 +1,571 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/syslog.h>
+
+#include <net/if.h>
+#include <net/if_llatbl.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_route.h>
+#include <netlink/route/route_var.h>
+
+#include <netinet6/in6_var.h> /* nd6.h requires this */
+#include <netinet6/nd6.h> /* nd6 state machine */
+#include <netinet6/scope6_var.h> /* scope deembedding */
+
+#define DEBUG_MOD_NAME nl_neigh
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+/* Address families whose lltables are visited by dump_llts_iface() */
+static int lle_families[] = { AF_INET, AF_INET6 };
+
+/* Tag returned by EVENTHANDLER_REGISTER(lle_event, ...); used to deregister */
+static eventhandler_tag lle_event_p;
+
+/* State shared between the neighbor dump/notification helpers in this file */
+struct netlink_walkargs {
+ struct nl_writer *nw; /* writer the messages are appended to */
+ struct nlmsghdr hdr; /* template header passed to nlmsg_reply() / nlmsg_end_dump() */
+ struct nlpcb *so; /* requesting socket; set by the GET handler, not read in this file */
+ struct ifnet *ifp; /* interface whose entries are being dumped (source of ndm_ifindex) */
+ int family; /* address family of the table being dumped (source of ndm_family) */
+ int error; /* status handed to nlmsg_end_dump() */
+ int count; /* number of lltables visited */
+ int dumped; /* number of lltables for which dump_llt() reported success */
+};
+
+/*
+ * Maps the family-specific ln_state of @lle to a netlink NUD_* value.
+ *
+ * For AF_INET, entries flagged LLE_STATIC or LLE_IFADDR are always
+ * reported as NUD_REACHABLE. Unknown families or states yield NUD_NONE.
+ */
+static int
+lle_state_to_nl_state(int family, struct llentry *lle)
+{
+	int ln_state = lle->ln_state;
+
+	if (family == AF_INET) {
+		if ((lle->la_flags & (LLE_STATIC | LLE_IFADDR)) != 0)
+			return (NUD_REACHABLE);
+		if (ln_state == 0)	/* ARP_LLINFO_INCOMPLETE */
+			return (NUD_INCOMPLETE);
+		if (ln_state == 1)	/* ARP_LLINFO_REACHABLE */
+			return (NUD_REACHABLE);
+		if (ln_state == 2)	/* ARP_LLINFO_VERIFY */
+			return (NUD_PROBE);
+		return (NUD_NONE);
+	}
+
+	if (family == AF_INET6) {
+		switch (ln_state) {
+		case ND6_LLINFO_INCOMPLETE:
+			return (NUD_INCOMPLETE);
+		case ND6_LLINFO_REACHABLE:
+			return (NUD_REACHABLE);
+		case ND6_LLINFO_STALE:
+			return (NUD_STALE);
+		case ND6_LLINFO_DELAY:
+			return (NUD_DELAY);
+		case ND6_LLINFO_PROBE:
+			return (NUD_PROBE);
+		default:
+			return (NUD_NONE);
+		}
+	}
+
+	return (NUD_NONE);
+}
+
+/*
+ * Builds the netlink NTF_* flag set for @lle:
+ * LLE_IFADDR -> NTF_SELF, LLE_PUB -> NTF_PROXY, LLE_STATIC -> NTF_STICKY,
+ * non-zero ln_router -> NTF_ROUTER.
+ */
+static uint32_t
+lle_flags_to_nl_flags(const struct llentry *lle)
+{
+	uint32_t flags;
+
+	flags = (lle->la_flags & LLE_IFADDR) ? NTF_SELF : 0;
+	flags |= (lle->la_flags & LLE_PUB) ? NTF_PROXY : 0;
+	flags |= (lle->la_flags & LLE_STATIC) ? NTF_STICKY : 0;
+	flags |= (lle->ln_router != 0) ? NTF_ROUTER : 0;
+
+	return (flags);
+}
+
+/*
+ * Appends one neighbor message describing @lle to the writer in
+ * ((struct netlink_walkargs *)arg)->nw.
+ *
+ * The message consists of a struct ndmsg header followed by NDA_DST,
+ * NDA_LLADDR (only when RLLE_VALID is set), NDA_PROBES and NDA_CACHEINFO.
+ * The caller provides locking for @lle (dump_lle() takes the read lock,
+ * rtnl_lle_event() asserts the write lock).
+ *
+ * Returns 0 when nlmsg_end() succeeds; otherwise the partially written
+ * message is aborted and ENOMEM is returned.
+ */
+static int
+dump_lle_locked(struct llentry *lle, void *arg)
+{
+ struct netlink_walkargs *wa = (struct netlink_walkargs *)arg;
+ struct nlmsghdr *hdr = &wa->hdr;
+ struct nl_writer *nw = wa->nw;
+ struct ndmsg *ndm;
+ union {
+ struct in_addr in;
+ struct in6_addr in6;
+ } addr;
+
+ IF_DEBUG_LEVEL(LOG_DEBUG2) {
+ char llebuf[NHOP_PRINT_BUFSIZE];
+ llentry_print_buf_lltable(lle, llebuf, sizeof(llebuf));
+ NL_LOG(LOG_DEBUG2, "dumping %s", llebuf);
+ }
+
+ if (!nlmsg_reply(nw, hdr, sizeof(struct ndmsg)))
+ goto enomem;
+
+ /*
+ * NOTE(review): the result of nlmsg_reserve_object() and of the
+ * nlattr_add*() calls below is not checked -- confirm they cannot
+ * fail once nlmsg_reply() has succeeded.
+ */
+ ndm = nlmsg_reserve_object(nw, struct ndmsg);
+ ndm->ndm_family = wa->family;
+ ndm->ndm_ifindex = wa->ifp->if_index;
+ ndm->ndm_state = lle_state_to_nl_state(wa->family, lle);
+ ndm->ndm_flags = lle_flags_to_nl_flags(lle);
+
+ switch (wa->family) {
+#ifdef INET
+ case AF_INET:
+ addr.in = lle->r_l3addr.addr4;
+ nlattr_add(nw, NDA_DST, 4, &addr);
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ /* Copy the address so the scope can be cleared without touching the entry */
+ addr.in6 = lle->r_l3addr.addr6;
+ in6_clearscope(&addr.in6);
+ nlattr_add(nw, NDA_DST, 16, &addr);
+ break;
+#endif
+ }
+
+ if (lle->r_flags & RLLE_VALID) {
+ /* Has L2 */
+ int addrlen = wa->ifp->if_addrlen;
+ nlattr_add(nw, NDA_LLADDR, addrlen, lle->ll_addr);
+ }
+
+ nlattr_add_u32(nw, NDA_PROBES, lle->la_asked);
+
+ struct nda_cacheinfo *cache;
+ cache = nlmsg_reserve_attr(nw, NDA_CACHEINFO, struct nda_cacheinfo);
+ if (cache == NULL)
+ goto enomem;
+ /* TODO: provide confirmed/updated */
+ /* Only the reference count is filled in; the other fields are left as reserved */
+ cache->ndm_refcnt = lle->lle_refcnt;
+
+ if (nlmsg_end(nw))
+ return (0);
+enomem:
+ NL_LOG(LOG_DEBUG, "unable to dump lle state (ENOMEM)");
+ nlmsg_abort(nw);
+ return (ENOMEM);
+}
+
+/*
+ * lltable_foreach_lle() callback: dumps @lle while holding its read lock.
+ * @llt is unused; @arg is the struct netlink_walkargs passed through to
+ * dump_lle_locked(), whose result is returned unchanged.
+ */
+static int
+dump_lle(struct lltable *llt, struct llentry *lle, void *arg)
+{
+	int rc;
+
+	LLE_RLOCK(lle);
+	rc = dump_lle_locked(lle, arg);
+	LLE_RUNLOCK(lle);
+
+	return (rc);
+}
+
+/*
+ * Dumps every entry of @llt through dump_lle().
+ *
+ * Returns true when the walk completed without the callback reporting an
+ * error, false otherwise. Previously the walk result was discarded and
+ * true was returned unconditionally, which made the caller's failure
+ * handling unreachable.
+ */
+static bool
+dump_llt(struct lltable *llt, struct netlink_walkargs *wa)
+{
+	return (lltable_foreach_lle(llt, dump_lle, wa) == 0);
+}
+
+/*
+ * Dumps the lltables of @ifp for every family listed in lle_families that
+ * matches @family (0 matches all families).
+ *
+ * wa->count is bumped for each table visited and wa->dumped for each table
+ * dumped successfully. Returns 0, or ENOMEM as soon as dump_llt() fails.
+ */
+static int
+dump_llts_iface(struct netlink_walkargs *wa, struct ifnet *ifp, int family)
+{
+	wa->ifp = ifp;
+
+	for (int idx = 0; idx < sizeof(lle_families) / sizeof(int); idx++) {
+		int af = lle_families[idx];
+		struct lltable *table = lltable_get(ifp, af);
+
+		if (table == NULL)
+			continue;
+		if (family != 0 && family != af)
+			continue;
+
+		wa->count++;
+		wa->family = af;
+		if (!dump_llt(table, wa))
+			return (ENOMEM);
+		wa->dumped++;
+	}
+
+	return (0);
+}
+
+/*
+ * Dumps neighbor entries as a multipart (NLM_F_MULTI) reply.
+ *
+ * When @ifp is non-NULL only that interface is dumped, otherwise every
+ * interface on V_ifnet is visited. @family of 0 selects all families.
+ *
+ * The first error reported by dump_llts_iface() stops the iteration and is
+ * stored in wa->error so that nlmsg_end_dump() can report it to the
+ * requester; previously such errors were silently dropped.
+ *
+ * Returns 0, or ENOMEM when the terminating message cannot be added.
+ */
+static int
+dump_llts(struct netlink_walkargs *wa, struct ifnet *ifp, int family)
+{
+	int error = 0;
+
+	NL_LOG(LOG_DEBUG, "Start dump ifp=%s family=%d", ifp ? if_name(ifp) : "NULL", family);
+
+	wa->hdr.nlmsg_flags |= NLM_F_MULTI;
+
+	if (ifp != NULL) {
+		error = dump_llts_iface(wa, ifp, family);
+	} else {
+		CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+			error = dump_llts_iface(wa, ifp, family);
+			if (error != 0)
+				break;
+		}
+	}
+	if (error != 0)
+		wa->error = error;
+
+	NL_LOG(LOG_DEBUG, "End dump, iterated %d dumped %d", wa->count, wa->dumped);
+
+	if (!nlmsg_end_dump(wa->nw, wa->error, &wa->hdr)) {
+		NL_LOG(LOG_DEBUG, "Unable to add new message");
+		return (ENOMEM);
+	}
+
+	return (0);
+}
+
+/*
+ * Looks up the single neighbor entry for @dst on @ifp and dumps it.
+ *
+ * Returns ESRCH when @ifp has no lltable for @family or no entry matches,
+ * EINVAL when the family of @dst differs from @family (the lookup must
+ * only be handed a sockaddr of the table's own family), otherwise the
+ * result of dump_lle().
+ *
+ * For link-local IPv6 destinations the scope id of @ifp is written into
+ * @dst in place before the lookup.
+ */
+static int
+get_lle(struct netlink_walkargs *wa, struct ifnet *ifp, int family, struct sockaddr *dst)
+{
+	struct lltable *llt = lltable_get(ifp, family);
+	if (llt == NULL)
+		return (ESRCH);
+
+	/* Never pass a sockaddr of a different family to the table lookup */
+	if (dst->sa_family != family)
+		return (EINVAL);
+
+#ifdef INET6
+	if (dst->sa_family == AF_INET6) {
+		struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *)dst;
+
+		if (IN6_IS_SCOPE_LINKLOCAL(&dst6->sin6_addr))
+			in6_set_unicast_scopeid(&dst6->sin6_addr, ifp->if_index);
+	}
+#endif
+	struct llentry *lle = lla_lookup(llt, LLE_UNLOCKED, dst);
+	if (lle == NULL)
+		return (ESRCH);
+
+	wa->ifp = ifp;
+	wa->family = family;
+
+	return (dump_lle(llt, lle, wa));
+}
+
+/* Parsed form of an RTM_*NEIGH request, filled in by ndmsg_parser */
+struct nl_parsed_neigh {
+ struct sockaddr *nda_dst; /* NDA_DST: neighbor L3 address */
+ struct ifnet *nda_ifp; /* interface from ndm_ifindex or NDA_IFINDEX */
+ struct nlattr *nda_lladdr; /* NDA_LLADDR: raw attribute, payload follows the header */
+ uint32_t ndm_flags; /* NTF_* from ndm_flags or NDA_FLAGS_EXT */
+ uint16_t ndm_state; /* NUD_* from ndm_state */
+ uint8_t ndm_family; /* address family from ndm_family */
+};
+
+/*
+ * Parser tables for struct ndmsg based messages.
+ * _IN() is an offset into the wire header (struct ndmsg), _OUT() an offset
+ * into the parsed result (struct nl_parsed_neigh).
+ */
+#define _IN(_field) offsetof(struct ndmsg, _field)
+#define _OUT(_field) offsetof(struct nl_parsed_neigh, _field)
+/* Fixed header fields; the 8-bit ndm_flags is widened into the 32-bit output field */
+static struct nlfield_parser nlf_p_neigh[] = {
+ { .off_in = _IN(ndm_family), .off_out = _OUT(ndm_family), .cb = nlf_get_u8 },
+ { .off_in = _IN(ndm_flags), .off_out = _OUT(ndm_flags), .cb = nlf_get_u8_u32 },
+ { .off_in = _IN(ndm_state), .off_out = _OUT(ndm_state), .cb = nlf_get_u16 },
+ { .off_in = _IN(ndm_ifindex), .off_out = _OUT(nda_ifp), .cb = nlf_get_ifpz },
+};
+
+/*
+ * Attributes. NDA_IFINDEX and NDA_FLAGS_EXT write to the same output fields
+ * as the ndm_ifindex and ndm_flags header fields above.
+ */
+static struct nlattr_parser nla_p_neigh[] = {
+ { .type = NDA_DST, .off = _OUT(nda_dst), .cb = nlattr_get_ip },
+ { .type = NDA_LLADDR, .off = _OUT(nda_lladdr), .cb = nlattr_get_nla },
+ { .type = NDA_IFINDEX, .off = _OUT(nda_ifp), .cb = nlattr_get_ifp },
+ { .type = NDA_FLAGS_EXT, .off = _OUT(ndm_flags), .cb = nlattr_get_uint32 },
+};
+#undef _IN
+#undef _OUT
+NL_DECLARE_PARSER(ndmsg_parser, struct ndmsg, nlf_p_neigh, nla_p_neigh);
+
+
+/*
+ * Example RTM_NEWNEIGH request, as shown in a system call trace:
+ *
+ * type=RTM_NEWNEIGH, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1661941473, pid=0},
+ * {ndm_family=AF_INET6, ndm_ifindex=if_nametoindex("enp0s31f6"), ndm_state=NUD_PERMANENT, ndm_flags=0, ndm_type=RTN_UNSPEC},
+ * [
+ * {{nla_len=20, nla_type=NDA_DST}, inet_pton(AF_INET6, "2a01:4f8:13a:70c::3")},
+ * {{nla_len=10, nla_type=NDA_LLADDR}, 20:4e:71:62:ae:f2}]}, iov_len=60}
+ */
+
+/*
+ * RTM_NEWNEIGH handler: creates or replaces a static neighbor entry.
+ *
+ * Requires an interface (ndm_ifindex / NDA_IFINDEX), NDA_DST and
+ * NDA_LLADDR. Only ndm_state == NUD_PERMANENT and the NTF_PROXY /
+ * NTF_STICKY flags are accepted.
+ *
+ * Returns 0 on success, or:
+ *   EINVAL       - missing/mismatching attributes or bad lladdr length
+ *   ENOTSUP      - unsupported state/flags, or neither NLM_F_CREATE nor
+ *                  NLM_F_REPLACE given
+ *   EAFNOSUPPORT - the interface has no lltable for ndm_family
+ *   ENOMEM       - the entry cannot be allocated
+ *   EEXIST       - an entry exists and NLM_F_EXCL is set or NLM_F_REPLACE
+ *                  is not set
+ *   EPERM        - the existing entry is an interface address (LLE_IFADDR),
+ *                  matching what rtnl_handle_delneigh() refuses to remove
+ *   ENOENT       - no entry exists and NLM_F_CREATE is not set
+ */
+static int
+rtnl_handle_newneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	int error;
+
+	struct nl_parsed_neigh attrs = {};
+	error = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	if (attrs.nda_ifp == NULL || attrs.nda_dst == NULL || attrs.nda_lladdr == NULL) {
+		if (attrs.nda_ifp == NULL)
+			NLMSG_REPORT_ERR_MSG(npt, "NDA_IFINDEX / ndm_ifindex not set");
+		if (attrs.nda_dst == NULL)
+			NLMSG_REPORT_ERR_MSG(npt, "NDA_DST not set");
+		if (attrs.nda_lladdr == NULL)
+			NLMSG_REPORT_ERR_MSG(npt, "NDA_LLADDR not set");
+		return (EINVAL);
+	}
+
+	if (attrs.nda_dst->sa_family != attrs.ndm_family) {
+		NLMSG_REPORT_ERR_MSG(npt,
+		    "NDA_DST family (%d) is different from ndm_family (%d)",
+		    attrs.nda_dst->sa_family, attrs.ndm_family);
+		return (EINVAL);
+	}
+
+	int addrlen = attrs.nda_ifp->if_addrlen;
+	if (attrs.nda_lladdr->nla_len != sizeof(struct nlattr) + addrlen) {
+		/* Cast both operands: the difference is a size_t otherwise */
+		NLMSG_REPORT_ERR_MSG(npt,
+		    "NDA_LLADDR address length (%d) is different from expected (%d)",
+		    (int)attrs.nda_lladdr->nla_len - (int)sizeof(struct nlattr), addrlen);
+		return (EINVAL);
+	}
+
+	if (attrs.ndm_state != NUD_PERMANENT) {
+		NLMSG_REPORT_ERR_MSG(npt, "ndm_state %d not supported", attrs.ndm_state);
+		return (ENOTSUP);
+	}
+
+	const uint16_t supported_flags = NTF_PROXY | NTF_STICKY;
+	if ((attrs.ndm_flags & supported_flags) != attrs.ndm_flags) {
+		NLMSG_REPORT_ERR_MSG(npt, "ndm_flags %X not supported",
+		    attrs.ndm_flags &~ supported_flags);
+		return (ENOTSUP);
+	}
+
+	/* Replacement requires new entry creation anyway */
+	if ((hdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_REPLACE)) == 0)
+		return (ENOTSUP);
+
+	struct lltable *llt = lltable_get(attrs.nda_ifp, attrs.ndm_family);
+	if (llt == NULL)
+		return (EAFNOSUPPORT);
+
+	/* The link-level address is the attribute payload right after the nlattr header */
+	uint8_t linkhdr[LLE_MAX_LINKHDR];
+	size_t linkhdrsize = sizeof(linkhdr);
+	int lladdr_off = 0;
+	if (lltable_calc_llheader(attrs.nda_ifp, attrs.ndm_family,
+	    (char *)(attrs.nda_lladdr + 1), linkhdr, &linkhdrsize, &lladdr_off) != 0) {
+		NLMSG_REPORT_ERR_MSG(npt, "unable to calculate lle prepend data");
+		return (EINVAL);
+	}
+
+	int lle_flags = LLE_STATIC | ((attrs.ndm_flags & NTF_PROXY) ? LLE_PUB : 0);
+	struct llentry *lle = lltable_alloc_entry(llt, lle_flags, attrs.nda_dst);
+	if (lle == NULL)
+		return (ENOMEM);
+	lltable_set_entry_addr(attrs.nda_ifp, lle, linkhdr, linkhdrsize, lladdr_off);
+
+	/* llentry created, try to insert or update */
+	IF_AFDATA_WLOCK(attrs.nda_ifp);
+	LLE_WLOCK(lle);
+	struct llentry *lle_tmp = lla_lookup(llt, LLE_EXCLUSIVE, attrs.nda_dst);
+	if (lle_tmp != NULL) {
+		if ((hdr->nlmsg_flags & NLM_F_EXCL) != 0 ||
+		    (hdr->nlmsg_flags & NLM_F_REPLACE) == 0) {
+			error = EEXIST;
+		} else if ((lle_tmp->la_flags & LLE_IFADDR) != 0) {
+			/* Interface addresses must not be replaced */
+			error = EPERM;
+		} else {
+			lltable_unlink_entry(llt, lle_tmp);
+			lltable_link_entry(llt, lle);
+		}
+		if (error != 0) {
+			/*
+			 * The existing entry stays in the table: drop the
+			 * exclusive lock taken by lla_lookup() on every
+			 * failure path.
+			 */
+			LLE_WUNLOCK(lle_tmp);
+			lle_tmp = NULL;
+		}
+	} else {
+		if (hdr->nlmsg_flags & NLM_F_CREATE)
+			lltable_link_entry(llt, lle);
+		else
+			error = ENOENT;
+	}
+	IF_AFDATA_WUNLOCK(attrs.nda_ifp);
+
+	if (error != 0) {
+		/* Throw away the newly allocated (still write-locked) entry */
+		llentry_free(lle);
+		return (error);
+	}
+
+	/* Dispose of the entry that was unlinked by the replace path */
+	if (lle_tmp != NULL)
+		llentry_free(lle_tmp);
+
+	/* XXX: We're inside epoch */
+	EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED);
+	LLE_WUNLOCK(lle);
+
+	return (0);
+}
+
+/*
+ * RTM_DELNEIGH handler: removes the neighbor entry for NDA_DST on the
+ * given interface.
+ *
+ * Returns 0 on success, or:
+ *   EINVAL       - NDA_DST or the interface is missing, or the NDA_DST
+ *                  family differs from ndm_family
+ *   EAFNOSUPPORT - the interface has no lltable for ndm_family
+ *   EPERM        - the entry is an interface address (LLE_IFADDR)
+ *   ENOENT       - no matching entry exists
+ */
+static int
+rtnl_handle_delneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	int error;
+
+	struct nl_parsed_neigh attrs = {};
+	error = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	if (attrs.nda_dst == NULL) {
+		NLMSG_REPORT_ERR_MSG(npt, "NDA_DST not set");
+		return (EINVAL);
+	}
+
+	if (attrs.nda_ifp == NULL) {
+		NLMSG_REPORT_ERR_MSG(npt, "no ifindex provided");
+		return (EINVAL);
+	}
+
+	/*
+	 * Same validation as in rtnl_handle_newneigh(): the table lookup
+	 * below must only see a sockaddr of the table's own family.
+	 */
+	if (attrs.nda_dst->sa_family != attrs.ndm_family) {
+		NLMSG_REPORT_ERR_MSG(npt,
+		    "NDA_DST family (%d) is different from ndm_family (%d)",
+		    attrs.nda_dst->sa_family, attrs.ndm_family);
+		return (EINVAL);
+	}
+
+	struct lltable *llt = lltable_get(attrs.nda_ifp, attrs.ndm_family);
+	if (llt == NULL)
+		return (EAFNOSUPPORT);
+
+	IF_AFDATA_WLOCK(attrs.nda_ifp);
+	struct llentry *lle = lla_lookup(llt, LLE_EXCLUSIVE, attrs.nda_dst);
+	if (lle != NULL) {
+		if ((lle->la_flags & LLE_IFADDR) != 0) {
+			/* Interface addresses stay in the table: just drop the lock */
+			LLE_WUNLOCK(lle);
+			lle = NULL;
+			error = EPERM;
+		} else
+			lltable_unlink_entry(llt, lle);
+	} else
+		error = ENOENT;
+	IF_AFDATA_WUNLOCK(attrs.nda_ifp);
+
+	/* lle is still write-locked here, as rtnl_lle_event() asserts */
+	if (error == 0 && lle != NULL)
+		EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED);
+
+	if (lle != NULL)
+		llentry_free(lle);
+
+	return (error);
+}
+
+/*
+ * RTM_GETNEIGH handler.
+ *
+ * Without NDA_DST the request is a dump of all matching entries (optionally
+ * limited to one interface and/or family); with NDA_DST a single entry is
+ * looked up, which requires an interface. Replies use NL_RTM_NEWNEIGH.
+ */
+static int
+rtnl_handle_getneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct nl_parsed_neigh attrs = {};
+	int rc;
+
+	rc = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &attrs);
+	if (rc != 0)
+		return (rc);
+
+	if (attrs.nda_dst != NULL && attrs.nda_ifp == NULL) {
+		NLMSG_REPORT_ERR_MSG(npt, "has NDA_DST but no ifindex provided");
+		return (EINVAL);
+	}
+
+	struct netlink_walkargs wa = {
+		.so = nlp,
+		.nw = npt->nw,
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags,
+		.hdr.nlmsg_type = NL_RTM_NEWNEIGH,
+	};
+
+	if (attrs.nda_dst != NULL)
+		return (get_lle(&wa, attrs.nda_ifp, attrs.ndm_family, attrs.nda_dst));
+
+	return (dump_llts(&wa, attrs.nda_ifp, attrs.ndm_family));
+}
+
+/*
+ * Neighbor message handlers registered by rtnl_neighs_init().
+ *
+ * RTM_NEWNEIGH modifies the neighbor table and therefore carries the same
+ * PRIV_NET_ROUTE requirement as RTM_DELNEIGH; it previously had no
+ * privilege requirement at all.
+ */
+static const struct rtnl_cmd_handler cmd_handlers[] = {
+	{
+		.cmd = NL_RTM_NEWNEIGH,
+		.name = "RTM_NEWNEIGH",
+		.cb = &rtnl_handle_newneigh,
+		.priv = PRIV_NET_ROUTE,
+	},
+	{
+		.cmd = NL_RTM_DELNEIGH,
+		.name = "RTM_DELNEIGH",
+		.cb = &rtnl_handle_delneigh,
+		.priv = PRIV_NET_ROUTE,
+	},
+	{
+		.cmd = NL_RTM_GETNEIGH,
+		.name = "RTM_GETNEIGH",
+		.cb = &rtnl_handle_getneigh,
+		.priv = PRIV_NET_ROUTE,
+	}
+};
+
+/*
+ * lle_event handler: broadcasts a neighbor change to RTNLGRP_NEIGH.
+ *
+ * LLENTRY_RESOLVED is reported as NL_RTM_NEWNEIGH, every other event as
+ * NL_RTM_DELNEIGH. Entries of families other than AF_INET / AF_INET6 are
+ * ignored. The caller must hold the write lock on @lle.
+ */
+static void
+rtnl_lle_event(void *arg __unused, struct llentry *lle, int evt)
+{
+	LLE_WLOCK_ASSERT(lle);
+
+	struct ifnet *ifp = lltable_get_ifp(lle->lle_tbl);
+	int family = lltable_get_af(lle->lle_tbl);
+
+	if (family != AF_INET && family != AF_INET6)
+		return;
+
+	int msg_type;
+	if (evt == LLENTRY_RESOLVED)
+		msg_type = NL_RTM_NEWNEIGH;
+	else
+		msg_type = NL_RTM_DELNEIGH;
+
+	struct nl_writer nw = {};
+	if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEIGH)) {
+		NL_LOG(LOG_DEBUG, "error allocating group writer");
+		return;
+	}
+
+	struct netlink_walkargs wa = {
+		.hdr.nlmsg_type = msg_type,
+		.nw = &nw,
+		.ifp = ifp,
+		.family = family,
+	};
+
+	/* The dump result is not checked; the writer is flushed either way */
+	dump_lle_locked(lle, &wa);
+	nlmsg_flush(&nw);
+}
+
+/* Parsers defined in this file; handed to NL_VERIFY_PARSERS() at init time. */
+static const struct nlhdr_parser *all_parsers[] = { &ndmsg_parser };
+
+/*
+ * Verifies the parsers, registers the RTM_*NEIGH handlers and subscribes
+ * rtnl_lle_event() to lle_event notifications.
+ * Declared with an explicit (void) parameter list so the definition is a
+ * proper prototype.
+ */
+void
+rtnl_neighs_init(void)
+{
+	NL_VERIFY_PARSERS(all_parsers);
+	rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers));
+	lle_event_p = EVENTHANDLER_REGISTER(lle_event, rtnl_lle_event, NULL,
+	    EVENTHANDLER_PRI_ANY);
+}
+
+/*
+ * Unsubscribes the lle_event handler registered by rtnl_neighs_init().
+ * Declared with an explicit (void) parameter list so the definition is a
+ * proper prototype.
+ */
+void
+rtnl_neighs_destroy(void)
+{
+	EVENTHANDLER_DEREGISTER(lle_event, lle_event_p);
+}
diff --git a/sys/netlink/route/neigh.h b/sys/netlink/route/neigh.h
new file mode 100644
index 000000000000..1ec1b95fdcde
--- /dev/null
+++ b/sys/netlink/route/neigh.h
@@ -0,0 +1,105 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Neighbors-related (RTM_<NEW|DEL|GET>NEIGH) message header and attributes.
+ */
+
+#ifndef _NETLINK_ROUTE_NEIGH_H_
+#define _NETLINK_ROUTE_NEIGH_H_
+
+/*
+ * Base header for all of the relevant (RTM_<NEW|DEL|GET>NEIGH) messages.
+ * NDA_* attributes follow this fixed-size header.
+ */
+struct ndmsg {
+ uint8_t ndm_family; /* address family */
+ uint8_t ndm_pad1;
+ uint16_t ndm_pad2;
+ int32_t ndm_ifindex; /* interface index */
+ uint16_t ndm_state; /* NUD_* state */
+ uint8_t ndm_flags; /* NTF_* flags (8 bits; wider values go in NDA_FLAGS_EXT) */
+ uint8_t ndm_type;
+};
+
+/* Attributes (types of the attributes following struct ndmsg) */
+enum {
+ NDA_UNSPEC,
+ NDA_DST, /* binary: neigh l3 address */
+ NDA_LLADDR, /* binary: neigh link-level address */
+ NDA_CACHEINFO, /* binary, struct nda_cacheinfo */
+ NDA_PROBES, /* XXX */
+ NDA_VLAN, /* upper 802.1Q tag */
+ NDA_PORT, /* not supported */
+ NDA_VNI, /* not supported */
+ NDA_IFINDEX, /* interface index */
+ NDA_MASTER, /* not supported */
+ NDA_LINK_NETNSID, /* not supported */
+ NDA_SRC_VNI, /* not supported */
+ NDA_PROTOCOL, /* XXX */
+ NDA_NH_ID, /* not supported */
+ NDA_FDB_EXT_ATTRS, /* not supported */
+ NDA_FLAGS_EXT, /* u32: ndm_flags */
+ NDA_NDM_STATE_MASK, /* XXX */
+ NDA_NDM_FLAGS_MASK, /* XXX */
+ __NDA_MAX /* sentinel: one past the last defined attribute */
+};
+
+#define NDA_MAX (__NDA_MAX - 1)
+
+
+/* ndm_flags / NDA_FLAGS_EXT */
+#define NTF_USE 0x0001 /* XXX */
+#define NTF_SELF 0x0002 /* local station */
+#define NTF_MASTER 0x0004 /* XXX */
+#define NTF_PROXY 0x0008 /* proxy entry */
+#define NTF_EXT_LEARNED 0x0010 /* not used */
+#define NTF_OFFLOADED 0x0020 /* not used */
+#define NTF_STICKY 0x0040 /* permanent entry */
+#define NTF_ROUTER 0x0080 /* dst indicated itself as a router */
+/* start of NDA_FLAGS_EXT (values that do not fit in the 8-bit ndm_flags) */
+#define NTF_EXT_MANAGED 0x0100 /* not used */
+
+/* ndm_state */
+#define NUD_INCOMPLETE 0x01 /* No lladdr, address resolution in progress */
+#define NUD_REACHABLE 0x02 /* reachable & recently resolved */
+#define NUD_STALE 0x04 /* has lladdr but it's stale */
+#define NUD_DELAY 0x08 /* has lladdr, is stale, probes delayed */
+#define NUD_PROBE 0x10 /* has lladdr, is stale, probes sent */
+#define NUD_FAILED 0x20 /* unused */
+
+/* Dummy states */
+#define NUD_NOARP 0x40 /* not used */
+#define NUD_PERMANENT 0x80 /* not flushed */
+#define NUD_NONE 0x00
+
+/* NDA_CACHEINFO: payload of the NDA_CACHEINFO attribute */
+struct nda_cacheinfo {
+ uint32_t ndm_confirmed; /* seconds since ARP/ND was received from neigh */
+ uint32_t ndm_used; /* seconds since last used (not provided) */
+ uint32_t ndm_updated; /* seconds since state was updated last */
+ uint32_t ndm_refcnt; /* number of references held */
+};
+
+#endif
diff --git a/sys/netlink/route/nexthop.c b/sys/netlink/route/nexthop.c
new file mode 100644
index 000000000000..92555aa8b123
--- /dev/null
+++ b/sys/netlink/route/nexthop.c
@@ -0,0 +1,1000 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+#include <sys/ck.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_utils.h>
+
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <netinet6/scope6_var.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+#include <netlink/netlink_route.h>
+#include <netlink/route/route_var.h>
+
+#define DEBUG_MOD_NAME nl_nhop
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG3);
+
+/*
+ * This file contains the logic to maintain kernel nexthops and
+ * nexthop groups based on the data provided by the user.
+ *
+ * Kernel stores (nearly) all of the routing data in the nexthops,
+ * including the prefix-specific flags (NHF_HOST and NHF_DEFAULT).
+ *
+ * Netlink API provides higher-level abstraction for the user. Each
+ * user-created nexthop may map to multiple kernel nexthops.
+ *
+ * The following variations require separate kernel nexthop to be
+ * created:
+ * * prefix flags (NHF_HOST, NHF_DEFAULT)
+ * * using IPv6 gateway for IPv4 routes
+ * * different fibnum
+ *
+ * These kernel nexthops have the lifetime bound to the lifetime of
+ * the user_nhop object. They are not collected until user requests
+ * to delete the created user_nhop.
+ *
+ */
+/*
+ * Userland-visible nexthop (or nexthop group) record stored in the
+ * per-vnet hash table.  Entries with un_fibfam == 0 are the base
+ * ("master") records created on user request; entries with a non-zero
+ * un_fibfam are fib/family-specific clones chained off the base record
+ * through un_nextchild.
+ */
+struct user_nhop {
+ uint32_t un_idx; /* Userland-provided index */
+ uint32_t un_fibfam; /* fibnum+af(as highest byte) */
+ uint8_t un_protocol; /* protocol that installed the record */
+ struct nhop_object *un_nhop; /* "production" nexthop */
+ struct nhop_object *un_nhop_src; /* nexthop to copy from */
+ struct weightened_nhop *un_nhgrp_src; /* nexthops for nhg */
+ uint32_t un_nhgrp_count; /* number of nexthops */
+ struct user_nhop *un_next; /* next item in hash chain */
+ struct user_nhop *un_nextchild; /* master -> children */
+ struct epoch_context un_epoch_ctx; /* epoch ctl helper */
+};
+
+/* produce hash value for an object */
+#define unhop_hash_obj(_obj) (hash_unhop(_obj))
+/* compare two objects */
+#define unhop_cmp(_one, _two) (cmp_unhop(_one, _two))
+/* next object accessor */
+#define unhop_next(_obj) (_obj)->un_next
+
+CHT_SLIST_DEFINE(unhop, struct user_nhop);
+
+/*
+ * Per-vnet container for the userland nexthop hash table.
+ * un_lock protects un_head: lookups in this file take it in read mode,
+ * insertions, removals and resizes take it in write mode.
+ */
+struct unhop_ctl {
+	struct unhop_head	un_head;
+	struct rmlock		un_lock;
+};
+#define	UN_LOCK_INIT(_ctl)	rm_init(&(_ctl)->un_lock, "unhop_ctl")
+/* Declares the tracker variable that UN_RLOCK()/UN_RUNLOCK() refer to */
+#define	UN_TRACKER		struct rm_priotracker un_tracker
+#define	UN_RLOCK(_ctl)		rm_rlock(&((_ctl)->un_lock), &un_tracker)
+#define	UN_RUNLOCK(_ctl)	rm_runlock(&((_ctl)->un_lock), &un_tracker)
+
+/*
+ * No trailing semicolon in the expansion: the caller provides it, which
+ * keeps the macros usable as a single statement in un-braced if/else.
+ */
+#define	UN_WLOCK(_ctl)		rm_wlock(&(_ctl)->un_lock)
+#define	UN_WUNLOCK(_ctl)	rm_wunlock(&(_ctl)->un_lock)
+
+VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL;
+#define V_un_ctl VNET(un_ctl)
+
+static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size);
+static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b);
+static unsigned int hash_unhop(const struct user_nhop *obj);
+
+static void destroy_unhop(struct user_nhop *unhop);
+static struct nhop_object *clone_unhop(const struct user_nhop *unhop,
+ uint32_t fibnum, int family, int nh_flags);
+
+/*
+ * Hash callback: compare two objects.
+ * Returns non-zero when both the userland index and the fibnum/family
+ * word are equal, zero otherwise.
+ */
+static int
+cmp_unhop(const struct user_nhop *a, const struct user_nhop *b)
+{
+	if (a->un_idx != b->un_idx)
+		return (0);
+	return (a->un_fibfam == b->un_fibfam);
+}
+
+/*
+ * Hash callback: derive the hash value of an object by mixing its
+ * userland index with its fibnum/family word.
+ */
+static unsigned int
+hash_unhop(const struct user_nhop *obj)
+{
+	unsigned int hash = obj->un_idx;
+
+	hash ^= obj->un_fibfam;
+	return (hash);
+}
+
+#define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0)
+
+/*
+ * Factory interface for creating matching kernel nexthops/nexthop groups
+ *
+ * @fibnum: fibnum nexthop will be used in
+ * @family: upper family nexthop will be used in
+ * @uidx: userland nexthop index used to create the nexthop
+ * @nh_flags: desired nexthop prefix flags
+ * @perror: pointer to store error to
+ *
+ * Returns referenced nexthop linked to @fibnum/@family rib on success.
+ * On failure returns NULL and stores a non-zero errno value in @perror.
+ */
+struct nhop_object *
+nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx,
+    int nh_flags, int *perror)
+{
+	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
+	UN_TRACKER;
+
+	if (__predict_false(ctl == NULL)) {
+		/* No table has been set up, so @uidx cannot exist */
+		*perror = ESRCH;
+		return (NULL);
+	}
+
+	struct user_nhop key = {
+		.un_idx = uidx,
+		.un_fibfam = fibnum | ((uint32_t)family) << 24,
+	};
+	struct user_nhop *unhop;
+
+	nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT);
+
+	if (__predict_false(family == 0)) {
+		/* Family 0 in the key would alias the base (template) records */
+		*perror = EINVAL;
+		return (NULL);
+	}
+
+	UN_RLOCK(ctl);
+	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
+	if (unhop != NULL) {
+		struct nhop_object *nh = unhop->un_nhop;
+		/* Reference the nexthop, then release (not re-acquire) the read lock */
+		nhop_ref_any(nh);
+		UN_RUNLOCK(ctl);
+		*perror = 0;
+		return (nh);
+	}
+
+	/*
+	 * Exact nexthop not found. Search for template nexthop to clone from.
+	 */
+	key.un_fibfam = 0;
+	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
+	if (unhop == NULL) {
+		UN_RUNLOCK(ctl);
+		*perror = ESRCH;
+		return (NULL);
+	}
+
+	UN_RUNLOCK(ctl);
+
+	/* Create entry to insert first */
+	struct user_nhop *un_new, *un_tmp;
+	un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
+	if (un_new == NULL) {
+		*perror = ENOMEM;
+		return (NULL);
+	}
+	un_new->un_idx = uidx;
+	un_new->un_fibfam = fibnum | ((uint32_t)family) << 24;
+
+	/* Relying on epoch to protect unhop here */
+	un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags);
+	if (un_new->un_nhop == NULL) {
+		free(un_new, M_NETLINK);
+		*perror = ENOMEM;
+		return (NULL);
+	}
+
+	/* Insert back and report */
+	UN_WLOCK(ctl);
+
+	/* First, find template record once again */
+	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
+	if (unhop == NULL) {
+		/* Someone deleted the nexthop during the call */
+		UN_WUNLOCK(ctl);
+		*perror = ESRCH;
+		destroy_unhop(un_new);
+		return (NULL);
+	}
+
+	/* Second, check the direct match */
+	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp);
+	struct nhop_object *nh;
+	if (un_tmp != NULL) {
+		/* Another thread already created the desired nexthop, use it */
+		nh = un_tmp->un_nhop;
+	} else {
+		/* Finally, insert the new nexthop and link it to the primary */
+		nh = un_new->un_nhop;
+		CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new);
+		un_new->un_nextchild = unhop->un_nextchild;
+		unhop->un_nextchild = un_new;
+		/* Ownership moved to the hash table; nothing to discard below */
+		un_new = NULL;
+		NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh);
+	}
+
+	UN_WUNLOCK(ctl);
+
+	/* Lost the insertion race: drop the clone we prepared */
+	if (un_new != NULL)
+		destroy_unhop(un_new);
+
+	*perror = 0;
+	nhop_ref_any(nh);
+	return (nh);
+}
+
+/*
+ * Looks up the base record for @uidx, i.e. the entry whose un_fibfam
+ * is zero.  Returns NULL when no such entry exists.  The read lock is
+ * dropped before returning.
+ */
+static struct user_nhop *
+nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx)
+{
+	UN_TRACKER;
+	struct user_nhop *found = NULL;
+	struct user_nhop lookup_key = { .un_idx = uidx };
+
+	UN_RLOCK(ctl);
+	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &lookup_key, found);
+	UN_RUNLOCK(ctl);
+
+	return (found);
+}
+
+#define	MAX_STACK_NHOPS	4
+/*
+ * Creates a kernel nexthop (or nexthop group) for @fibnum/@family with
+ * prefix flags @nh_flags, using the template stored in @unhop.
+ *
+ * When the template is a plain nexthop (un_nhop_src is set), it is copied
+ * into a freshly allocated nexthop.  Otherwise the template is treated as
+ * a group: each member is resolved through nl_find_nhop() and the result
+ * is assembled with nhgrp_get_group().
+ *
+ * Returns the resulting object, or NULL on failure.
+ */
+static struct nhop_object *
+clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags)
+{
+	const struct weightened_nhop *wn;
+	struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS];
+	struct nhop_object *nh = NULL;
+	uint32_t num_nhops;
+	int error = 0;
+
+	if (unhop->un_nhop_src != NULL) {
+		IF_DEBUG_LEVEL(LOG_DEBUG2) {
+			char nhbuf[NHOP_PRINT_BUFSIZE];
+			nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf));
+			FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src,
+			    "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum,
+			    family, nh_flags);
+		}
+		nh = nhop_alloc(fibnum, AF_UNSPEC);
+		if (nh == NULL)
+			return (NULL);
+		nhop_copy(nh, unhop->un_nhop_src);
+		/* Check that nexthop gateway is compatible with the new family */
+		if (!nhop_set_upper_family(nh, family)) {
+			nhop_free(nh);
+			return (NULL);
+		}
+		nhop_set_uidx(nh, unhop->un_idx);
+		nhop_set_pxtype_flag(nh, nh_flags);
+		return (nhop_get_nhop(nh, &error));
+	}
+
+	wn = unhop->un_nhgrp_src;
+	num_nhops = unhop->un_nhgrp_count;
+
+	/* Nothing to build a group from; also keeps wn_new[0] below valid */
+	if (wn == NULL || num_nhops == 0)
+		return (NULL);
+
+	if (num_nhops > MAX_STACK_NHOPS) {
+		wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT);
+		if (wn_new == NULL)
+			return (NULL);
+	} else
+		wn_new = wn_base;
+
+	for (uint32_t i = 0; i < num_nhops; i++) {
+		uint32_t uidx = nhop_get_uidx(wn[i].nh);
+		MPASS(uidx != 0);
+		wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error);
+		/* Treat a missing member as a failure even if no error was stored */
+		if (error == 0 && wn_new[i].nh == NULL)
+			error = ESRCH;
+		/*
+		 * NOTE(review): references obtained for earlier members are
+		 * not dropped on this path - confirm whether they should be.
+		 */
+		if (error != 0)
+			break;
+		wn_new[i].weight = wn[i].weight;
+	}
+
+	if (error == 0) {
+		struct rib_head *rh = nhop_get_rh(wn_new[0].nh);
+		struct nhgrp_object *nhg;
+
+		error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg);
+		nh = (struct nhop_object *)nhg;
+	}
+
+	if (wn_new != wn_base)
+		free(wn_new, M_TEMP);
+	return (nh);
+}
+
+/*
+ * Releases everything owned by @unhop and frees the object itself:
+ * the "production" nexthop, the source nexthop and, for groups, the
+ * member array allocated with M_NETLINK in newnhg().
+ */
+static void
+destroy_unhop(struct user_nhop *unhop)
+{
+	if (unhop->un_nhop != NULL)
+		nhop_free_any(unhop->un_nhop);
+	if (unhop->un_nhop_src != NULL)
+		nhop_free_any(unhop->un_nhop_src);
+	/* free(9) ignores NULL, so non-group entries need no special casing */
+	free(unhop->un_nhgrp_src, M_NETLINK);
+	free(unhop, M_NETLINK);
+}
+
+/*
+ * Epoch callback: recovers the user_nhop that embeds @ctx and hands
+ * it over to destroy_unhop().
+ */
+static void
+destroy_unhop_epoch(epoch_context_t ctx)
+{
+	destroy_unhop(__containerof(ctx, struct user_nhop, un_epoch_ctx));
+}
+
+/*
+ * Picks an unused userland index for auto-assignment.
+ * Up to 16 random candidates from the 65536-wide window starting at
+ * 65536 * 4 are probed against the base records; the first free one
+ * is returned.  Returns 0 when every probe hit an existing entry.
+ */
+static uint32_t
+find_spare_uidx(struct unhop_ctl *ctl)
+{
+	UN_TRACKER;
+	struct user_nhop probe = {};
+	struct user_nhop *hit;
+	uint32_t spare = 0;
+	int attempts = 16;
+
+	UN_RLOCK(ctl);
+	/* This should return spare uid with 75% of 65k used in ~99/100 cases */
+	while (attempts-- > 0) {
+		probe.un_idx = (arc4random() % 65536) + 65536 * 4;
+		CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &probe, hit);
+		if (hit != NULL)
+			continue;
+		spare = probe.un_idx;
+		break;
+	}
+	UN_RUNLOCK(ctl);
+
+	return (spare);
+}
+
+
+/*
+ * Actual netlink code
+ */
+/*
+ * Bundle of state used while producing nexthop messages.  In this file
+ * only the nw, hdr and error members are touched: hdr carries the header
+ * template (pid/seq/flags/type) copied into every reply, nw is the writer
+ * replies go to.
+ */
+struct netlink_walkargs {
+ struct nl_writer *nw;
+ struct nlmsghdr hdr;
+ struct nlpcb *so;
+ int family;
+ int error;
+ int count;
+ int dumped;
+};
+/* Jumps to the caller-provided "enomem" label when _v is NULL */
+#define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem
+
+/*
+ * Writes a nexthop message describing the nexthop group @unhop to @nw,
+ * using @hdr as the header template.  The message carries NHA_ID,
+ * NHA_GROUP_TYPE and an NHA_GROUP attribute with one struct nexthop_grp
+ * per member.
+ *
+ * Returns true if the message was finalized, false if it was aborted.
+ */
+static bool
+dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr,
+    struct nl_writer *nw)
+{
+
+	if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
+		goto enomem;
+
+	struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
+	/* Same guard as dump_nhop(): the reservation may fail */
+	if (nhm == NULL)
+		goto enomem;
+	nhm->nh_family = AF_UNSPEC;
+	nhm->nh_scope = 0;
+	nhm->nh_protocol = unhop->un_protocol;
+	nhm->nh_flags = 0;
+
+	nlattr_add_u32(nw, NHA_ID, unhop->un_idx);
+	nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH);
+
+	struct weightened_nhop *wn = unhop->un_nhgrp_src;
+	uint32_t num_nhops = unhop->un_nhgrp_count;
+	/* TODO: a better API? */
+	/* Attribute is built by hand: header followed by the member array */
+	int nla_len = sizeof(struct nlattr);
+	nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp));
+	struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr);
+	if (nla == NULL)
+		goto enomem;
+	nla->nla_type = NHA_GROUP;
+	nla->nla_len = nla_len;
+	for (uint32_t i = 0; i < num_nhops; i++) {
+		struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i];
+		grp->id = nhop_get_uidx(wn[i].nh);
+		grp->weight = wn[i].weight;
+		grp->resvd1 = 0;
+		grp->resvd2 = 0;
+	}
+
+	if (nlmsg_end(nw))
+		return (true);
+enomem:
+	NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory");
+	nlmsg_abort(nw);
+	return (false);
+}
+
+/*
+ * Writes a nexthop message describing the plain nexthop template of
+ * @unhop to @nw, using @hdr as the header template.  Blackhole nexthops
+ * get NHA_BLACKHOLE only; others get NHA_OIF and, for IPv4/IPv6
+ * gateways, NHA_GATEWAY.
+ *
+ * Returns true if the message was finalized, false if it was aborted.
+ */
+static bool
+dump_nhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
+    struct nl_writer *nw)
+{
+	struct nhop_object *src = unhop->un_nhop_src;
+	struct nhmsg *nhm;
+
+	if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
+		goto enomem;
+
+	nhm = nlmsg_reserve_object(nw, struct nhmsg);
+	if (nhm == NULL)
+		goto enomem;
+	nhm->nh_family = nhop_get_neigh_family(src);
+	nhm->nh_scope = 0; // XXX: what's that?
+	nhm->nh_protocol = unhop->un_protocol;
+	nhm->nh_flags = 0;
+
+	nlattr_add_u32(nw, NHA_ID, unhop->un_idx);
+	if ((src->nh_flags & NHF_BLACKHOLE) == 0) {
+		nlattr_add_u32(nw, NHA_OIF, src->nh_ifp->if_index);
+
+		switch (src->gw_sa.sa_family) {
+#ifdef INET
+		case AF_INET:
+			nlattr_add(nw, NHA_GATEWAY, 4, &src->gw4_sa.sin_addr);
+			break;
+#endif
+#ifdef INET6
+		case AF_INET6:
+			{
+				/* Export the address without the embedded scope */
+				struct in6_addr gw6 = src->gw6_sa.sin6_addr;
+				in6_clearscope(&gw6);
+				nlattr_add(nw, NHA_GATEWAY, 16, &gw6);
+				break;
+			}
+#endif
+		}
+	} else
+		nlattr_add_flag(nw, NHA_BLACKHOLE);
+
+	if (nlmsg_end(nw))
+		return (true);
+enomem:
+	nlmsg_abort(nw);
+	return (false);
+}
+
+/*
+ * Dispatches to the group or plain-nexthop dumper depending on whether
+ * @unhop carries a source nexthop.  The dumper's result is discarded.
+ */
+static void
+dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
+    struct nl_writer *nw)
+{
+	if (unhop->un_nhop_src == NULL) {
+		dump_nhgrp(unhop, hdr, nw);
+		return;
+	}
+	dump_nhop(unhop, hdr, nw);
+}
+
+/*
+ * Removes the base record for @uidx together with all of its cloned
+ * children from the hash table, notifies RTNLGRP_NEXTHOP listeners with
+ * an NL_RTM_DELNEXTHOP message and schedules the removed records for
+ * destruction via epoch_call().
+ *
+ * Returns 0 on success, ENOENT if @uidx is not present, or ENOMEM if the
+ * notification writer could not be allocated (the records are still
+ * removed and destroyed in that case).
+ */
+static int
+delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx)
+{
+	struct user_nhop *unhop_ret, *unhop_base, *unhop_chain;
+	int error = 0;
+
+	struct user_nhop key = { .un_idx = uidx };
+
+	UN_WLOCK(ctl);
+
+	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base);
+
+	if (unhop_base != NULL) {
+		CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret);
+		IF_DEBUG_LEVEL(LOG_DEBUG2) {
+			char nhbuf[NHOP_PRINT_BUFSIZE];
+			nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf));
+			FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop,
+			    "removed base nhop %u: %s", uidx, nhbuf);
+		}
+		/* Unlink all child nexthops as well, keeping the chain intact */
+		unhop_chain = unhop_base->un_nextchild;
+		while (unhop_chain != NULL) {
+			CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain,
+			    unhop_ret);
+			MPASS(unhop_chain == unhop_ret);
+			IF_DEBUG_LEVEL(LOG_DEBUG3) {
+				char nhbuf[NHOP_PRINT_BUFSIZE];
+				nhop_print_buf_any(unhop_chain->un_nhop,
+				    nhbuf, sizeof(nhbuf));
+				FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop,
+				    "removed child nhop %u: %s", uidx, nhbuf);
+			}
+			unhop_chain = unhop_chain->un_nextchild;
+		}
+	}
+
+	UN_WUNLOCK(ctl);
+
+	if (unhop_base == NULL) {
+		NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx);
+		return (ENOENT);
+	}
+
+	/* Report nexthop deletion */
+	struct netlink_walkargs wa = {
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags,
+		.hdr.nlmsg_type = NL_RTM_DELNEXTHOP,
+	};
+
+	struct nl_writer nw = {};
+	if (nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
+		dump_unhop(unhop_base, &wa.hdr, &nw);
+		nlmsg_flush(&nw);
+	} else {
+		NL_LOG(LOG_DEBUG, "error allocating message writer");
+		error = ENOMEM;
+	}
+
+	/*
+	 * The records are already unlinked from the hash, so they must be
+	 * destroyed even when the notification could not be sent; otherwise
+	 * nothing would ever free them.
+	 */
+	while (unhop_base != NULL) {
+		unhop_chain = unhop_base->un_nextchild;
+		epoch_call(net_epoch_preempt, destroy_unhop_epoch,
+		    &unhop_base->un_epoch_ctx);
+		unhop_base = unhop_chain;
+	}
+
+	return (error);
+}
+
+/*
+ * Resizes the hash table to @new_size buckets.  A @new_size of 0 means
+ * no resize is needed.  The new bucket array is allocated before the
+ * write lock is taken; allocation failure silently skips the resize.
+ */
+static void
+consider_resize(struct unhop_ctl *ctl, uint32_t new_size)
+{
+	if (new_size == 0)
+		return;
+
+	size_t bytes = CHT_SLIST_GET_RESIZE_SIZE(new_size);
+	void *buckets = malloc(bytes, M_NETLINK, M_NOWAIT | M_ZERO);
+	if (buckets == NULL)
+		return;
+
+	NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size);
+	UN_WLOCK(ctl);
+	{
+		CHT_SLIST_RESIZE(&ctl->un_head, unhop, buckets, new_size);
+	}
+	UN_WUNLOCK(ctl);
+
+	/* presumably the resize macro leaves the old array in "buckets" - TODO confirm */
+	if (buckets != NULL)
+		free(buckets, M_NETLINK);
+}
+
+/*
+ * Allocates and publishes the per-vnet userland nexthop table.
+ * The table is installed with a compare-and-set so that concurrent
+ * callers end up sharing a single instance; the loser tears down its
+ * own copy.
+ *
+ * Returns true when V_un_ctl is set on exit, false on allocation failure.
+ */
+static bool __noinline
+vnet_init_unhops(void)
+{
+	uint32_t num_buckets = 16;
+	size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
+
+	struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK,
+	    M_NOWAIT | M_ZERO);
+	if (ctl == NULL)
+		return (false);
+
+	void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
+	if (ptr == NULL) {
+		free(ctl, M_NETLINK);
+		return (false);
+	}
+	CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets);
+	UN_LOCK_INIT(ctl);
+
+	if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) {
+		/* Lost the race: tear down the initialized lock before freeing */
+		rm_destroy(&ctl->un_lock);
+		free(ptr, M_NETLINK);
+		free(ctl, M_NETLINK);
+	}
+
+	if (atomic_load_ptr(&V_un_ctl) == NULL)
+		return (false);
+
+	NL_LOG(LOG_NOTICE, "UNHOPS init done");
+
+	return (true);
+}
+
+/*
+ * VNET teardown hook: unpublishes the table, waits for the current
+ * epoch readers, destroys every remaining record and releases the
+ * table, its lock and the control structure.
+ */
+static void
+vnet_destroy_unhops(const void *unused __unused)
+{
+	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
+	struct user_nhop *unhop, *tmp;
+
+	if (ctl == NULL)
+		return;
+	V_un_ctl = NULL;
+
+	/* Wait till all unhop users finish their reads */
+	epoch_wait_preempt(net_epoch_preempt);
+
+	UN_WLOCK(ctl);
+	CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) {
+		destroy_unhop(unhop);
+	} CHT_SLIST_FOREACH_SAFE_END;
+	UN_WUNLOCK(ctl);
+
+	/* The lock lives inside ctl: destroy it before the memory goes away */
+	rm_destroy(&ctl->un_lock);
+	free(ctl->un_head.ptr, M_NETLINK);
+	free(ctl, M_NETLINK);
+}
+VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY,
+    vnet_destroy_unhops, NULL);
+
+/*
+ * Attribute callback for NHA_GROUP.
+ * Validates that the payload is a non-empty array of struct nexthop_grp
+ * and stores the pointer to the attribute itself in @target, so the
+ * members can be walked later.
+ *
+ * Returns 0 on success or EINVAL when the payload length is not a
+ * positive multiple of sizeof(struct nexthop_grp).
+ */
+static int
+nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+	/* Verify attribute correctness */
+	int data_len = NLA_DATA_LEN(nla);
+
+	/* Check the sign first so the size_t arithmetic below stays meaningful */
+	if (data_len <= 0 ||
+	    ((size_t)data_len % sizeof(struct nexthop_grp)) != 0) {
+		NL_LOG(LOG_DEBUG, "Invalid length for NHA_GROUP: %d", data_len);
+		return (EINVAL);
+	}
+
+	*((struct nlattr **)target) = nla;
+	return (0);
+}
+
+/*
+ * Parsed form of a nexthop request: nha_* members are filled from the
+ * NHA_* attributes, nh_* members from the fixed struct nhmsg header.
+ */
+struct nl_parsed_nhop {
+ uint32_t nha_id; /* NHA_ID: userland index, 0 if not supplied */
+ uint8_t nha_blackhole; /* NHA_BLACKHOLE flag */
+ uint8_t nha_groups; /* NHA_GROUPS flag */
+ struct ifnet *nha_oif; /* NHA_OIF: transmit interface */
+ struct sockaddr *nha_gw; /* NHA_GATEWAY: gateway address */
+ struct nlattr *nha_group; /* NHA_GROUP: raw attribute with members */
+ uint8_t nh_family; /* nhmsg nh_family */
+ uint8_t nh_protocol; /* nhmsg nh_protocol */
+};
+
+/*
+ * Parser description for nexthop messages: _IN() yields offsets inside
+ * the on-wire struct nhmsg, _OUT() offsets inside struct nl_parsed_nhop.
+ */
+#define _IN(_field) offsetof(struct nhmsg, _field)
+#define _OUT(_field) offsetof(struct nl_parsed_nhop, _field)
+/* Fixed header fields copied into the parsed structure */
+static const struct nlfield_parser nlf_p_nh[] = {
+ { .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 },
+ { .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 },
+};
+
+/* Supported attributes and the callbacks decoding them */
+static const struct nlattr_parser nla_p_nh[] = {
+ { .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 },
+ { .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg },
+ { .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag },
+ { .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp },
+ { .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip },
+ { .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag },
+};
+#undef _IN
+#undef _OUT
+NL_DECLARE_PARSER(nhmsg_parser, struct nhmsg, nlf_p_nh, nla_p_nh);
+
+/*
+ * Tells whether @nh may become a nexthop group member: only nexthops
+ * with NHF_GATEWAY set qualify.
+ */
+static bool
+eligible_nhg(const struct nhop_object *nh)
+{
+	return ((nh->nh_flags & NHF_GATEWAY) != 0);
+}
+
+/*
+ * Fills the group part of @unhop from the NHA_GROUP attribute in @attrs.
+ * Every referenced index must resolve to an existing base record that is
+ * a plain, mpath-eligible nexthop.
+ *
+ * Returns 0 on success; ENOMEM, ESRCH or ENOTSUP on failure, in which
+ * case @unhop is left untouched.
+ */
+static int
+newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
+{
+	struct nexthop_grp *members = NLA_DATA(attrs->nha_group);
+	int nmembers = NLA_DATA_LEN(attrs->nha_group) / sizeof(*members);
+	struct weightened_nhop *wn;
+	int error;
+
+	wn = malloc(sizeof(*wn) * nmembers, M_NETLINK, M_NOWAIT | M_ZERO);
+	if (wn == NULL)
+		return (ENOMEM);
+
+	for (int i = 0; i < nmembers; i++) {
+		struct user_nhop *base = nl_find_base_unhop(ctl, members[i].id);
+
+		if (base == NULL) {
+			NL_LOG(LOG_DEBUG, "unable to find uidx %u", members[i].id);
+			error = ESRCH;
+			goto fail;
+		}
+		if (base->un_nhop_src == NULL) {
+			NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported",
+			    members[i].id);
+			error = ENOTSUP;
+			goto fail;
+		}
+		if (!eligible_nhg(base->un_nhop_src)) {
+			NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible",
+			    members[i].id);
+			error = ENOTSUP;
+			goto fail;
+		}
+		/*
+		 * TODO: consider more rigid eligibility checks:
+		 * restrict nexthops with the same gateway
+		 */
+		wn[i].nh = base->un_nhop_src;
+		wn[i].weight = members[i].weight;
+	}
+
+	unhop->un_nhgrp_src = wn;
+	unhop->un_nhgrp_count = nmembers;
+	return (0);
+
+fail:
+	free(wn, M_NETLINK);
+	return (error);
+}
+
+/*
+ * Builds the source (template) nexthop for @unhop from @attrs.
+ * Non-blackhole nexthops require both NHA_GATEWAY and NHA_OIF and an
+ * interface address matching the gateway on that interface.
+ *
+ * Returns 0 on success and stores the nexthop in unhop->un_nhop_src;
+ * otherwise returns EINVAL, ENOMEM or the nhop_get_unlinked() error.
+ */
+static int
+newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
+{
+	const bool blackhole = (attrs->nha_blackhole != 0);
+	struct ifaddr *src_ifa = NULL;
+	struct nhop_object *nh;
+	int family, error;
+
+	if (!blackhole) {
+		if (attrs->nha_gw == NULL) {
+			NL_LOG(LOG_DEBUG, "missing NHA_GATEWAY");
+			return (EINVAL);
+		}
+		if (attrs->nha_oif == NULL) {
+			NL_LOG(LOG_DEBUG, "missing NHA_OIF");
+			return (EINVAL);
+		}
+		src_ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif);
+		if (src_ifa == NULL) {
+			NL_LOG(LOG_DEBUG, "Unable to determine default source IP");
+			return (EINVAL);
+		}
+	}
+
+	/* The gateway, when present, dictates the family */
+	if (attrs->nha_gw != NULL)
+		family = attrs->nha_gw->sa_family;
+	else
+		family = attrs->nh_family;
+
+	nh = nhop_alloc(RT_DEFAULT_FIB, family);
+	if (nh == NULL) {
+		NL_LOG(LOG_DEBUG, "Unable to allocate nexthop");
+		return (ENOMEM);
+	}
+	nhop_set_uidx(nh, attrs->nha_id);
+
+	if (!blackhole) {
+		nhop_set_gw(nh, attrs->nha_gw, true);
+		nhop_set_transmit_ifp(nh, attrs->nha_oif);
+		nhop_set_src(nh, src_ifa);
+	} else
+		nhop_set_blackhole(nh, NHF_BLACKHOLE);
+
+	error = nhop_get_unlinked(nh);
+	if (error != 0) {
+		NL_LOG(LOG_DEBUG, "unable to finalize nexthop");
+		return (error);
+	}
+
+	IF_DEBUG_LEVEL(LOG_DEBUG2) {
+		char nhbuf[NHOP_PRINT_BUFSIZE];
+		nhop_print_buf(nh, nhbuf, sizeof(nhbuf));
+		NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf);
+	}
+
+	unhop->un_nhop_src = nh;
+	return (0);
+}
+
+/*
+ * NL_RTM_NEWNEXTHOP handler: creates a new base userland nexthop or
+ * nexthop group from the request in @hdr, inserts it into the per-vnet
+ * table and notifies RTNLGRP_NEXTHOP listeners.
+ *
+ * Returns 0 on success or an errno value (ENOMEM, ENOSPC, EEXIST, or
+ * whatever parsing / newnhg() / newnhop() reported).
+ */
+static int
+rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
+ struct nl_pstate *npt)
+{
+ struct user_nhop *unhop;
+ int error;
+
+ /* The table is created lazily on the first request */
+ if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops()))
+ return (ENOMEM);
+ struct unhop_ctl *ctl = V_un_ctl;
+
+ struct nl_parsed_nhop attrs = {};
+ error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class
+ * citizen.
+ */
+ if (attrs.nha_id == 0) {
+ attrs.nha_id = find_spare_uidx(ctl);
+ if (attrs.nha_id == 0) {
+ NL_LOG(LOG_DEBUG, "Unable to get spare uidx");
+ return (ENOSPC);
+ }
+ }
+
+ NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? attrs.nha_oif->if_index : 0);
+
+ unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
+ if (unhop == NULL) {
+ NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop");
+ return (ENOMEM);
+ }
+ unhop->un_idx = attrs.nha_id;
+ unhop->un_protocol = attrs.nh_protocol;
+
+ /* Presence of NHA_GROUP selects group creation over a plain nexthop */
+ if (attrs.nha_group)
+ error = newnhg(ctl, &attrs, unhop);
+ else
+ error = newnhop(&attrs, unhop);
+
+ if (error != 0) {
+ free(unhop, M_NETLINK);
+ return (error);
+ }
+
+ UN_WLOCK(ctl);
+ /* Check if uidx already exists */
+ struct user_nhop *tmp = NULL;
+ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp);
+ if (tmp != NULL) {
+ UN_WUNLOCK(ctl);
+ NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id);
+ destroy_unhop(unhop);
+ return (EEXIST);
+ }
+ CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop);
+ /* Sampled under the lock; the resize itself happens after unlocking */
+ uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head);
+ UN_WUNLOCK(ctl);
+
+ /* Report addition of the new nexthop */
+ struct netlink_walkargs wa = {
+ .hdr.nlmsg_pid = hdr->nlmsg_pid,
+ .hdr.nlmsg_seq = hdr->nlmsg_seq,
+ .hdr.nlmsg_flags = hdr->nlmsg_flags,
+ .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
+ };
+
+ struct nl_writer nw = {};
+ /*
+ * NOTE(review): on this failure the record stays inserted, ENOMEM is
+ * returned and consider_resize() is skipped - confirm this is intended.
+ */
+ if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
+ NL_LOG(LOG_DEBUG, "error allocating message writer");
+ return (ENOMEM);
+ }
+
+ dump_unhop(unhop, &wa.hdr, &nw);
+ nlmsg_flush(&nw);
+
+ consider_resize(ctl, num_buckets_new);
+
+ return (0);
+}
+
+/*
+ * NL_RTM_DELNEXTHOP handler: deletes the userland nexthop whose index
+ * is given in NHA_ID.
+ *
+ * Returns ESRCH when no table exists, the parser error if parsing
+ * fails, EINVAL when NHA_ID is missing, or the delete_unhop() result.
+ */
+static int
+rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
+    struct nl_pstate *npt)
+{
+	struct nl_parsed_nhop attrs = {};
+	struct unhop_ctl *ctl;
+	int error;
+
+	ctl = atomic_load_ptr(&V_un_ctl);
+	if (__predict_false(ctl == NULL))
+		return (ESRCH);
+
+	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	if (attrs.nha_id == 0) {
+		NL_LOG(LOG_DEBUG, "NHA_ID not set");
+		return (EINVAL);
+	}
+
+	return (delete_unhop(ctl, hdr, attrs.nha_id));
+}
+
+/*
+ * Dump filter: checks @unhop against the optional criteria in @attrs.
+ * A criterion that is unset (zero / NULL) matches everything.
+ */
+static bool
+match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
+{
+	/* NHA_ID: exact index match */
+	if (attrs->nha_id != 0 && attrs->nha_id != unhop->un_idx)
+		return (false);
+	/* NHA_GROUPS: groups only */
+	if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL)
+		return (false);
+	/* NHA_OIF: plain nexthops using that interface only */
+	if (attrs->nha_oif == NULL)
+		return (true);
+	if (unhop->un_nhop_src == NULL)
+		return (false);
+	return (unhop->un_nhop_src->nh_ifp == attrs->nha_oif);
+}
+
+/*
+ * NL_RTM_GETNEXTHOP handler.
+ * With NHA_ID set, replies with that single base record (ESRCH if it is
+ * absent).  Otherwise walks the whole table and dumps every base record
+ * accepted by match_unhop() as a multipart reply.
+ */
+static int
+rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
+ struct nl_pstate *npt)
+{
+ struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
+ struct user_nhop *unhop;
+ UN_TRACKER;
+ int error;
+
+ if (__predict_false(ctl == NULL))
+ return (ESRCH);
+
+ struct nl_parsed_nhop attrs = {};
+ error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
+ if (error != 0)
+ return (error);
+
+ /* Replies reuse the request's pid/seq/flags and go to the requester */
+ struct netlink_walkargs wa = {
+ .nw = npt->nw,
+ .hdr.nlmsg_pid = hdr->nlmsg_pid,
+ .hdr.nlmsg_seq = hdr->nlmsg_seq,
+ .hdr.nlmsg_flags = hdr->nlmsg_flags,
+ .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
+ };
+
+ if (attrs.nha_id != 0) {
+ NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id);
+ struct user_nhop key= { .un_idx = attrs.nha_id };
+ UN_RLOCK(ctl);
+ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
+ UN_RUNLOCK(ctl);
+
+ /* The record is dumped after the read lock has been dropped */
+ if (unhop == NULL)
+ return (ESRCH);
+ dump_unhop(unhop, &wa.hdr, wa.nw);
+ return (0);
+ }
+
+ UN_RLOCK(ctl);
+ wa.hdr.nlmsg_flags |= NLM_F_MULTI;
+ CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) {
+ /* Cloned (per-fib/family) children are skipped */
+ if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop))
+ dump_unhop(unhop, &wa.hdr, wa.nw);
+ } CHT_SLIST_FOREACH_END;
+ UN_RUNLOCK(ctl);
+
+ /* wa.error is never assigned in this function, so this branch is always taken */
+ if (wa.error == 0) {
+ if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr))
+ return (ENOMEM);
+ }
+ return (0);
+}
+
+/*
+ * Message handlers registered by rtnl_nexthops_init().
+ * NEW and DEL carry PRIV_NET_ROUTE in .priv; GET leaves .priv unset.
+ */
+static const struct rtnl_cmd_handler cmd_handlers[] = {
+ {
+ .cmd = NL_RTM_NEWNEXTHOP,
+ .name = "RTM_NEWNEXTHOP",
+ .cb = &rtnl_handle_newnhop,
+ .priv = PRIV_NET_ROUTE,
+ },
+ {
+ .cmd = NL_RTM_DELNEXTHOP,
+ .name = "RTM_DELNEXTHOP",
+ .cb = &rtnl_handle_delnhop,
+ .priv = PRIV_NET_ROUTE,
+ },
+ {
+ .cmd = NL_RTM_GETNEXTHOP,
+ .name = "RTM_GETNEXTHOP",
+ .cb = &rtnl_handle_getnhop,
+ }
+};
+
+static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser };
+
+/*
+ * Module entry point for the nexthop part of NETLINK_ROUTE: verifies
+ * the parser tables and registers the nexthop message handlers.
+ */
+void
+rtnl_nexthops_init(void)
+{
+	NL_VERIFY_PARSERS(all_parsers);
+	rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers));
+}
diff --git a/sys/netlink/route/nexthop.h b/sys/netlink/route/nexthop.h
new file mode 100644
index 000000000000..310c3e08fc4b
--- /dev/null
+++ b/sys/netlink/route/nexthop.h
@@ -0,0 +1,102 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * NEXTHOP-related (RTM_<NEW|DEL|GET>NEXTHOP) message header and attributes.
+ */
+
+#ifndef _NETLINK_ROUTE_NEXTHOP_H_
+#define _NETLINK_ROUTE_NEXTHOP_H_
+
+/*
+ * Base header for all of the relevant messages
+ * (RTM_NEWNEXTHOP, RTM_DELNEXTHOP, RTM_GETNEXTHOP).
+ */
+struct nhmsg {
+ unsigned char nh_family; /* transport family */
+ unsigned char nh_scope; /* ignored on RX, filled by kernel */
+ unsigned char nh_protocol; /* Routing protocol that installed nh */
+ unsigned char resvd; /* reserved */
+ unsigned int nh_flags; /* RTNH_F_* flags from route.h */
+};
+
+/* Attribute types that may follow struct nhmsg */
+enum {
+ NHA_UNSPEC,
+ NHA_ID, /* u32: nexthop userland index, auto-assigned if 0 */
+ NHA_GROUP, /* binary: array of struct nexthop_grp */
+ NHA_GROUP_TYPE, /* u16: set to NEXTHOP_GRP_TYPE */
+ NHA_BLACKHOLE, /* flag: nexthop used to blackhole packets */
+ NHA_OIF, /* u32: transmit ifindex */
+ NHA_GATEWAY, /* network: IPv4/IPv6 gateway addr */
+ NHA_ENCAP_TYPE, /* not supported */
+ NHA_ENCAP, /* not supported */
+ NHA_GROUPS, /* flag: match nexthop groups */
+ NHA_MASTER, /* not supported */
+ NHA_FDB, /* not supported */
+ NHA_RES_GROUP, /* not supported */
+ NHA_RES_BUCKET, /* not supported */
+ __NHA_MAX,
+};
+/* Highest valid NHA_* attribute type */
+#define NHA_MAX (__NHA_MAX - 1)
+
+/*
+ * Attributes that can be used as filters:
+ * NHA_ID (nexthop or group), NHA_OIF, NHA_GROUPS,
+ */
+
+/*
+ * NHA_GROUP: array of the following structures.
+ * If the attribute is set, the only other valid attributes are
+ * NHA_ID and NHA_GROUP_TYPE.
+ * NHA_RES_GROUP and NHA_RES_BUCKET are not supported yet.
+ */
+struct nexthop_grp {
+ uint32_t id; /* nexthop userland index */
+ uint8_t weight; /* weight of this nexthop */
+ uint8_t resvd1; /* reserved */
+ uint16_t resvd2; /* reserved */
+};
+
+/* NHA_GROUP_TYPE: u16, one of the values below */
+enum {
+ NEXTHOP_GRP_TYPE_MPATH, /* default nexthop group */
+ NEXTHOP_GRP_TYPE_RES, /* resilient nexthop group */
+ __NEXTHOP_GRP_TYPE_MAX,
+};
+/* Highest valid NEXTHOP_GRP_TYPE_* value */
+#define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1)
+
+
+/* NHA_RES_GROUP: attribute types nested inside a NHA_RES_GROUP attribute */
+enum {
+ NHA_RES_GROUP_UNSPEC,
+ NHA_RES_GROUP_PAD = NHA_RES_GROUP_UNSPEC, /* alias of UNSPEC */
+ NHA_RES_GROUP_BUCKETS,
+ NHA_RES_GROUP_IDLE_TIMER,
+ NHA_RES_GROUP_UNBALANCED_TIMER,
+ NHA_RES_GROUP_UNBALANCED_TIME,
+ __NHA_RES_GROUP_MAX,
+};
+/* Highest valid NHA_RES_GROUP_* value */
+#define NHA_RES_GROUP_MAX (__NHA_RES_GROUP_MAX - 1)
+
+#endif
diff --git a/sys/netlink/route/route.c b/sys/netlink/route/route.c
new file mode 100644
index 000000000000..7573b371155e
--- /dev/null
+++ b/sys/netlink/route/route.c
@@ -0,0 +1,972 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_route.h"
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_route.h>
+#include <netlink/route/route_var.h>
+
+#define DEBUG_MOD_NAME nl_route
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+ /*
+  * Derives the rtm_type value for a route from the flags of its nexthop:
+  * blackhole and reject nexthops get dedicated types, anything else is
+  * reported as unicast.
+  */
+ static unsigned char
+ get_rtm_type(const struct nhop_object *nh)
+ {
+ 	const int flags = nh->nh_flags;
+ 	unsigned char rtm_type = RTN_UNICAST;
+
+ 	/* Use the fact that nhg runtime flags are only NHF_MULTIPATH */
+ 	if ((flags & NHF_BLACKHOLE) != 0)
+ 		rtm_type = RTN_BLACKHOLE;
+ 	else if ((flags & NHF_REJECT) != 0)
+ 		rtm_type = RTN_PROHIBIT;
+
+ 	return (rtm_type);
+ }
+
+ /*
+  * Returns the RTPROT_* value to report for @nh.
+  *
+  * An explicitly stored origin wins: first the group's own origin (for
+  * nexthop groups), then the origin of the nexthop (or of the group's
+  * first member). If neither is set, the protocol is guessed from the
+  * rtsock route flags.
+  */
+ static uint8_t
+ nl_get_rtm_protocol(const struct nhop_object *nh)
+ {
+ 	uint8_t proto;
+
+ 	if (NH_IS_NHGRP(nh)) {
+ 		const struct nhgrp_object *grp = (const struct nhgrp_object *)nh;
+
+ 		proto = nhgrp_get_origin(grp);
+ 		if (proto != RTPROT_UNSPEC)
+ 			return (proto);
+ 		/* Group has no origin: fall back to its first member */
+ 		nh = grp->nhops[0];
+ 	}
+
+ 	proto = nhop_get_origin(nh);
+ 	if (proto != RTPROT_UNSPEC)
+ 		return (proto);
+
+ 	/* TODO: remove guesswork once all kernel users fill in origin */
+ 	const int flags = nhop_get_rtflags(nh);
+
+ 	if ((flags & RTF_PROTO1) != 0)
+ 		proto = RTPROT_ZEBRA;
+ 	else if ((flags & RTF_STATIC) != 0)
+ 		proto = RTPROT_STATIC;
+ 	else
+ 		proto = RTPROT_KERNEL;
+
+ 	return (proto);
+ }
+
+ /*
+  * Translates an rtsock command (RTM_ADD, RTM_CHANGE, RTM_GET, RTM_DELETE)
+  * into the matching netlink route message type.
+  * Returns 0 for any other command.
+  */
+ static int
+ get_rtmsg_type_from_rtsock(int cmd)
+ {
+ 	if (cmd == RTM_ADD || cmd == RTM_CHANGE || cmd == RTM_GET)
+ 		return (NL_RTM_NEWROUTE);
+ 	if (cmd == RTM_DELETE)
+ 		return (NL_RTM_DELROUTE);
+
+ 	return (0);
+ }
+
+/*
+ * fibnum heuristics
+ *
+ * if (dump && rtm_table == 0 && !rta_table) RT_ALL_FIBS
+ * msg rtm_table RTA_TABLE result
+ * RTM_GETROUTE/dump 0 - RT_ALL_FIBS
+ * RTM_GETROUTE/dump 1 - 1
+ * RTM_GETROUTE/get 0 - 0
+ *
+ */
+
+ /*
+  * Picks the nexthop a rib command refers to: the old nexthop for
+  * deletions, the new one for every other command.
+  */
+ static struct nhop_object *
+ rc_get_nhop(const struct rib_cmd_info *rc)
+ {
+ 	if (rc->rc_cmd == RTM_DELETE)
+ 		return (rc->rc_nh_old);
+
+ 	return (rc->rc_nh_new);
+ }
+
+ /*
+  * Appends the gateway attribute of @nh to @nw.
+  *
+  * IPv4 gateways are written as NL_RTA_GATEWAY (4 bytes). IPv6 gateways
+  * are written as NL_RTA_GATEWAY (16 bytes) for IPv6 routes and as
+  * NL_RTA_VIA for IPv4 routes. Nothing is written for AF_LINK (on-link
+  * prefix) or any other neighbor family.
+  */
+ static void
+ dump_rc_nhop_gw(struct nl_writer *nw, const struct nhop_object *nh)
+ {
+ 	const int neigh_family = nhop_get_neigh_family(nh);
+
+ 	if (neigh_family == AF_INET) {
+ 		nlattr_add(nw, NL_RTA_GATEWAY, 4, &nh->gw4_sa.sin_addr);
+ 		return;
+ 	}
+ 	if (neigh_family != AF_INET6)
+ 		return;
+
+ 	switch (nhop_get_upper_family(nh)) {
+ 	case AF_INET6:
+ 		nlattr_add(nw, NL_RTA_GATEWAY, 16, &nh->gw6_sa.sin6_addr);
+ 		break;
+ 	case AF_INET: {
+ 		/* IPv4 over IPv6 */
+ 		char storage[20];
+ 		struct rtvia *rtvia = (struct rtvia *)&storage[0];
+
+ 		rtvia->rtvia_family = AF_INET6;
+ 		memcpy(rtvia->rtvia_addr, &nh->gw6_sa.sin6_addr, 16);
+ 		nlattr_add(nw, NL_RTA_VIA, 17, rtvia);
+ 		break;
+ 	}
+ 	}
+ }
+
+ /*
+  * Appends an NL_RTA_METRICS attribute holding a single nested NL_RTAX_MTU
+  * attribute with the MTU of @nh. Does nothing if the writer cannot
+  * provide the required space.
+  */
+ static void
+ dump_rc_nhop_mtu(struct nl_writer *nw, const struct nhop_object *nh)
+ {
+ 	/* outer header + inner header + u32 payload */
+ 	int total_len = sizeof(struct nlattr) * 2 + sizeof(uint32_t);
+ 	struct nlattr *outer = nlmsg_reserve_data(nw, total_len, struct nlattr);
+
+ 	if (outer == NULL)
+ 		return;
+
+ 	struct nlattr *inner = outer + 1;
+
+ 	outer->nla_type = NL_RTA_METRICS;
+ 	outer->nla_len = total_len;
+
+ 	inner->nla_type = NL_RTAX_MTU;
+ 	inner->nla_len = sizeof(struct nlattr) + sizeof(uint32_t);
+ 	*((uint32_t *)(inner + 1)) = nh->nh_mtu;
+ }
+
+ /*
+ * Appends the attributes describing nexthop group @nhg to @nw:
+ * an optional NL_RTA_NH_ID (only when the group has a userland index),
+ * NL_RTA_RTFLAGS taken from the first member, and a nested
+ * NL_RTA_MULTIPATH attribute containing one struct rtnexthop record per
+ * member. @rtm is not used here.
+ *
+ * Returns early, without closing the nested attribute, if the writer
+ * cannot reserve space for a member record.
+ */
+ static void
+ dump_rc_nhg(struct nl_writer *nw, const struct nhgrp_object *nhg, struct rtmsg *rtm)
+ {
+ uint32_t uidx = nhgrp_get_uidx(nhg);
+ uint32_t num_nhops;
+ const struct weightened_nhop *wn = nhgrp_get_nhops(nhg, &num_nhops);
+ /* Flags of the first member serve as the group-wide baseline */
+ uint32_t base_rtflags = nhop_get_rtflags(wn[0].nh);
+
+ if (uidx != 0)
+ nlattr_add_u32(nw, NL_RTA_NH_ID, uidx);
+
+ nlattr_add_u32(nw, NL_RTA_RTFLAGS, base_rtflags);
+ int off = nlattr_add_nested(nw, NL_RTA_MULTIPATH);
+ if (off == 0)
+ return;
+
+ for (int i = 0; i < num_nhops; i++) {
+ /* Remember where this record starts so its length can be fixed up */
+ int nh_off = nlattr_save_offset(nw);
+ struct rtnexthop *rtnh = nlmsg_reserve_object(nw, struct rtnexthop);
+ if (rtnh == NULL)
+ return;
+ rtnh->rtnh_flags = 0;
+ rtnh->rtnh_ifindex = wn[i].nh->nh_ifp->if_index;
+ rtnh->rtnh_hops = wn[i].weight;
+ dump_rc_nhop_gw(nw, wn[i].nh);
+ uint32_t rtflags = nhop_get_rtflags(wn[i].nh);
+ /* Per-member flags are only emitted when they differ from the baseline */
+ if (rtflags != base_rtflags)
+ nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags);
+ if (rtflags & RTF_FIXEDMTU)
+ dump_rc_nhop_mtu(nw, wn[i].nh);
+ /* Re-derive the record pointer from its saved offset before writing the length */
+ rtnh = nlattr_restore_offset(nw, nh_off, struct rtnexthop);
+ /*
+ * nlattr_add() allocates 4-byte aligned storage, no need to align
+ * length here
+ * */
+ rtnh->rtnh_len = nlattr_save_offset(nw) - nh_off;
+ }
+ nlattr_set_len(nw, off);
+ }
+
+ /*
+  * Appends the nexthop-related attributes of a route to @nw.
+  *
+  * Nexthop groups are delegated to dump_rc_nhg(). For a plain nexthop the
+  * attributes are written in this order: gateway (if any), NL_RTA_NH_ID
+  * (if a userland index is set), NL_RTA_KNH_ID, NL_RTA_RTFLAGS, metrics
+  * (fixed MTU only), NL_RTA_EXPIRES (if an expiration is set), NL_RTA_OIF.
+  */
+ static void
+ dump_rc_nhop(struct nl_writer *nw, const struct nhop_object *nh, struct rtmsg *rtm)
+ {
+ 	if (NH_IS_NHGRP(nh)) {
+ 		dump_rc_nhg(nw, (const struct nhgrp_object *)nh, rtm);
+ 		return;
+ 	}
+
+ 	const uint32_t flags = nhop_get_rtflags(nh);
+
+ 	/*
+ 	 * Resulting attribute sets:
+ 	 * IPv4 over IPv6
+ 	 * ('RTA_VIA', {'family': 10, 'addr': 'fe80::20c:29ff:fe67:2dd'}), ('RTA_OIF', 2),
+ 	 * IPv4 w/ gw
+ 	 * ('RTA_GATEWAY', '172.16.107.131'), ('RTA_OIF', 2)],
+ 	 * Direct route:
+ 	 * ('RTA_OIF', 2)
+ 	 */
+ 	if ((nh->nh_flags & NHF_GATEWAY) != 0)
+ 		dump_rc_nhop_gw(nw, nh);
+
+ 	const uint32_t user_idx = nhop_get_uidx(nh);
+
+ 	if (user_idx != 0)
+ 		nlattr_add_u32(nw, NL_RTA_NH_ID, user_idx);
+ 	nlattr_add_u32(nw, NL_RTA_KNH_ID, nhop_get_idx(nh));
+ 	nlattr_add_u32(nw, NL_RTA_RTFLAGS, flags);
+
+ 	if ((flags & RTF_FIXEDMTU) != 0)
+ 		dump_rc_nhop_mtu(nw, nh);
+
+ 	/* Expiration is reported relative to the current uptime */
+ 	const uint32_t expire = nhop_get_expire(nh);
+
+ 	if (expire > 0)
+ 		nlattr_add_u32(nw, NL_RTA_EXPIRES, expire - time_uptime);
+
+ 	/* The outgoing interface is always reported */
+ 	nlattr_add_u32(nw, NL_RTA_OIF, nh->nh_ifp->if_index);
+ }
+
+/*
+ * Dumps output from a rib command into an rtmsg
+ */
+
+static int
+dump_px(uint32_t fibnum, const struct nlmsghdr *hdr,
+ const struct rtentry *rt, struct route_nhop_data *rnd,
+ struct nl_writer *nw)
+{
+ struct rtmsg *rtm;
+ int error = 0;
+
+ NET_EPOCH_ASSERT();
+
+ if (!nlmsg_reply(nw, hdr, sizeof(struct rtmsg)))
+ goto enomem;
+
+ int family = rt_get_family(rt);
+ int rtm_off = nlattr_save_offset(nw);
+ rtm = nlmsg_reserve_object(nw, struct rtmsg);
+ rtm->rtm_family = family;
+ rtm->rtm_dst_len = 0;
+ rtm->rtm_src_len = 0;
+ rtm->rtm_tos = 0;
+ if (fibnum < 255)
+ rtm->rtm_table = (unsigned char)fibnum;
+ rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+ if (!NH_IS_NHGRP(rnd->rnd_nhop)) {
+ rtm->rtm_protocol = nl_get_rtm_protocol(rnd->rnd_nhop);
+ rtm->rtm_type = get_rtm_type(rnd->rnd_nhop);
+ } else {
+ rtm->rtm_protocol = RTPROT_UNSPEC; /* TODO: protocol from nhg? */
+ rtm->rtm_type = RTN_UNICAST;
+ }
+
+ nlattr_add_u32(nw, NL_RTA_TABLE, fibnum);
+
+ int plen = 0;
+ uint32_t scopeid = 0;
+ switch (family) {
+ case AF_INET:
+ {
+ struct in_addr addr;
+ rt_get_inet_prefix_plen(rt, &addr, &plen, &scopeid);
+ nlattr_add(nw, NL_RTA_DST, 4, &addr);
+ break;
+ }
+ case AF_INET6:
+ {
+ struct in6_addr addr;
+ rt_get_inet6_prefix_plen(rt, &addr, &plen, &scopeid);
+ nlattr_add(nw, NL_RTA_DST, 16, &addr);
+ break;
+ }
+ default:
+ FIB_LOG(LOG_NOTICE, fibnum, family, "unsupported rt family: %d", family);
+ error = EAFNOSUPPORT;
+ goto flush;
+ }
+
+ rtm = nlattr_restore_offset(nw, rtm_off, struct rtmsg);
+ if (plen > 0)
+ rtm->rtm_dst_len = plen;
+ dump_rc_nhop(nw, rnd->rnd_nhop, rtm);
+
+ if (nlmsg_end(nw))
+ return (0);
+enomem:
+ error = ENOMEM;
+flush:
+ nlmsg_abort(nw);
+ return (error);
+}
+
+ /*
+  * Maps an address family to the netlink route notification group.
+  * Returns 0 for families other than AF_INET and AF_INET6.
+  */
+ static int
+ family_to_group(int family)
+ {
+ 	if (family == AF_INET)
+ 		return (RTNLGRP_IPV4_ROUTE);
+ 	if (family == AF_INET6)
+ 		return (RTNLGRP_IPV6_ROUTE);
+
+ 	return (0);
+ }
+
+
+ /*
+  * Announces a completed route change @rc in fib @fibnum.
+  *
+  * If a group writer can be obtained for the route family, @hdr is
+  * rewritten in place (message type and create/replace/excl/append flags
+  * are set according to the rib command) and the route is written with it
+  * and flushed. The rtsock route callback is invoked in all cases.
+  * @nlp is not used here.
+  */
+ static void
+ report_operation(uint32_t fibnum, struct rib_cmd_info *rc,
+     struct nlpcb *nlp, struct nlmsghdr *hdr)
+ {
+ 	struct nl_writer nw;
+ 	const uint32_t group_id = family_to_group(rt_get_family(rc->rc_rt));
+
+ 	if (nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) {
+ 		struct route_nhop_data rnd = {
+ 			.rnd_weight = rc->rc_nh_weight,
+ 			.rnd_nhop = rc_get_nhop(rc),
+ 		};
+
+ 		/* Drop the requester's modifier flags, then set the ones matching the result */
+ 		hdr->nlmsg_flags &= ~(NLM_F_REPLACE | NLM_F_CREATE |
+ 		    NLM_F_EXCL | NLM_F_APPEND);
+
+ 		if (rc->rc_cmd == RTM_ADD) {
+ 			hdr->nlmsg_type = NL_RTM_NEWROUTE;
+ 			hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL;
+ 		} else if (rc->rc_cmd == RTM_CHANGE) {
+ 			hdr->nlmsg_type = NL_RTM_NEWROUTE;
+ 			hdr->nlmsg_flags |= NLM_F_REPLACE;
+ 		} else if (rc->rc_cmd == RTM_DELETE) {
+ 			hdr->nlmsg_type = NL_RTM_DELROUTE;
+ 		}
+
+ 		dump_px(fibnum, hdr, rc->rc_rt, &rnd, &nw);
+ 		nlmsg_flush(&nw);
+ 	}
+
+ 	rtsock_callback_p->route_f(fibnum, rc);
+ }
+
+ /* Parsed form of one struct rtnexthop record from an RTA_MULTIPATH attribute */
+ struct rta_mpath_nh {
+ struct sockaddr *gw; /* from NL_RTA_GATEWAY or NL_RTA_VIA */
+ struct ifnet *ifp; /* from rtnh_ifindex */
+ uint8_t rtnh_flags; /* copy of rtnh_flags */
+ uint8_t rtnh_weight; /* copy of rtnh_hops */
+ };
+
+ /*
+ * Parser description for a single struct rtnexthop record:
+ * _IN() is a field offset in the wire header, _OUT() a field offset in
+ * the struct rta_mpath_nh that receives the parsed value.
+ */
+ #define _IN(_field) offsetof(struct rtnexthop, _field)
+ #define _OUT(_field) offsetof(struct rta_mpath_nh, _field)
+ /* Attributes following the rtnexthop header; both store into gw */
+ const static struct nlattr_parser nla_p_rtnh[] = {
+ { .type = NL_RTA_GATEWAY, .off = _OUT(gw), .cb = nlattr_get_ip },
+ { .type = NL_RTA_VIA, .off = _OUT(gw), .cb = nlattr_get_ipvia },
+ };
+ /* Fixed fields of the rtnexthop header */
+ const static struct nlfield_parser nlf_p_rtnh[] = {
+ { .off_in = _IN(rtnh_flags), .off_out = _OUT(rtnh_flags), .cb = nlf_get_u8 },
+ { .off_in = _IN(rtnh_hops), .off_out = _OUT(rtnh_weight), .cb = nlf_get_u8 },
+ { .off_in = _IN(rtnh_ifindex), .off_out = _OUT(ifp), .cb = nlf_get_ifpz },
+ };
+ #undef _IN
+ #undef _OUT
+ NL_DECLARE_PARSER(mpath_parser, struct rtnexthop, nlf_p_rtnh, nla_p_rtnh);
+
+ /* Parsed RTA_MULTIPATH attribute: a counted array of nexthop records */
+ struct rta_mpath {
+ int num_nhops; /* number of valid entries in nhops[] */
+ struct rta_mpath_nh nhops[0]; /* trailing storage, sized by the allocator */
+ };
+
+ /*
+  * Attribute callback parsing RTA_MULTIPATH: a sequence of struct rtnexthop
+  * records, each followed by its own attributes.
+  *
+  * On success stores a pointer to a struct rta_mpath, allocated from the
+  * parser state @npt, into *@target and returns 0. @arg is not used.
+  *
+  * Returns EINVAL if the attribute is empty, a record length is shorter
+  * than the record header or larger than the remaining data, or the
+  * records do not add up to the attribute length; ENOMEM if the result
+  * cannot be allocated; otherwise the error from parsing a record.
+  */
+ static int
+ nlattr_get_multipath(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+ {
+ 	const int rtnh_hdr_len = (int)sizeof(struct rtnexthop);
+ 	int data_len = (int)nla->nla_len - (int)sizeof(struct nlattr);
+ 	struct rtnexthop *rtnh;
+
+ 	/* Need room for at least one complete record header */
+ 	if (data_len < rtnh_hdr_len) {
+ 		NLMSG_REPORT_ERR_MSG(npt, "invalid RTA_MULTIPATH attr");
+ 		return (EINVAL);
+ 	}
+
+ 	/*
+ 	 * Every accepted record consumes at least rtnh_hdr_len bytes, so
+ 	 * the loop below stores at most max_nhops entries.
+ 	 */
+ 	int max_nhops = data_len / rtnh_hdr_len;
+
+ 	struct rta_mpath *mp = npt_alloc(npt, (max_nhops + 2) * sizeof(struct rta_mpath_nh));
+ 	if (mp == NULL)
+ 		return (ENOMEM);
+ 	mp->num_nhops = 0;
+
+ 	for (rtnh = (struct rtnexthop *)(nla + 1); data_len > 0; ) {
+ 		/* The header itself must fit before its length field is read */
+ 		if (data_len < rtnh_hdr_len) {
+ 			NLMSG_REPORT_ERR_MSG(npt, "invalid RTA_MULTIPATH attr");
+ 			return (EINVAL);
+ 		}
+ 		/*
+ 		 * rtnh_len comes from userland: it must cover the header
+ 		 * (which also guarantees forward progress) and must not
+ 		 * extend past the enclosing attribute.
+ 		 */
+ 		if (rtnh->rtnh_len < rtnh_hdr_len || rtnh->rtnh_len > data_len) {
+ 			NLMSG_REPORT_ERR_MSG(npt, "RTA_MULTIPATH: nexthop %d: bad length",
+ 			    mp->num_nhops);
+ 			return (EINVAL);
+ 		}
+
+ 		struct rta_mpath_nh *mpnh = &mp->nhops[mp->num_nhops++];
+
+ 		int error = nl_parse_header(rtnh, rtnh->rtnh_len, &mpath_parser,
+ 		    npt, mpnh);
+ 		if (error != 0) {
+ 			NLMSG_REPORT_ERR_MSG(npt, "RTA_MULTIPATH: nexhop %d: parse failed",
+ 			    mp->num_nhops - 1);
+ 			return (error);
+ 		}
+
+ 		/* Records are padded to the netlink item alignment */
+ 		int len = NL_ITEM_ALIGN(rtnh->rtnh_len);
+ 		data_len -= len;
+ 		rtnh = (struct rtnexthop *)((char *)rtnh + len);
+ 	}
+ 	if (data_len != 0 || mp->num_nhops == 0) {
+ 		NLMSG_REPORT_ERR_MSG(npt, "invalid RTA_MULTIPATH attr");
+ 		return (EINVAL);
+ 	}
+
+ 	*((struct rta_mpath **)target) = mp;
+ 	return (0);
+ }
+
+
+ /*
+ * Parsed representation of an rtmsg-based request.
+ * Filled by rtm_parser; fields of absent attributes keep the value the
+ * caller initialized the structure with.
+ */
+ struct nl_parsed_route {
+ struct sockaddr *rta_dst; /* NL_RTA_DST */
+ struct sockaddr *rta_gw; /* NL_RTA_GATEWAY or NL_RTA_VIA */
+ struct ifnet *rta_oif; /* NL_RTA_OIF */
+ struct rta_mpath *rta_multipath; /* NL_RTA_MULTIPATH */
+ uint32_t rta_table; /* NL_RTA_TABLE */
+ uint32_t rta_rtflags; /* NL_RTA_RTFLAGS */
+ uint32_t rta_nh_id; /* NL_RTA_NH_ID */
+ uint32_t rtax_mtu; /* NL_RTAX_MTU nested in NL_RTA_METRICS */
+ uint8_t rtm_family; /* rtm_family header field */
+ uint8_t rtm_dst_len; /* rtm_dst_len header field */
+ };
+
+ /*
+ * Parser description for rtmsg-based messages:
+ * _IN() is a field offset in struct rtmsg, _OUT() a field offset in the
+ * struct nl_parsed_route that receives the parsed value.
+ */
+ #define _IN(_field) offsetof(struct rtmsg, _field)
+ #define _OUT(_field) offsetof(struct nl_parsed_route, _field)
+ /* Attributes nested inside NL_RTA_METRICS; only the MTU is extracted */
+ static struct nlattr_parser nla_p_rtmetrics[] = {
+ { .type = NL_RTAX_MTU, .off = _OUT(rtax_mtu), .cb = nlattr_get_uint32 },
+ };
+ NL_DECLARE_ATTR_PARSER(metrics_parser, nla_p_rtmetrics);
+
+ /* Top-level attributes; NL_RTA_GATEWAY and NL_RTA_VIA both store into rta_gw */
+ static const struct nlattr_parser nla_p_rtmsg[] = {
+ { .type = NL_RTA_DST, .off = _OUT(rta_dst), .cb = nlattr_get_ip },
+ { .type = NL_RTA_OIF, .off = _OUT(rta_oif), .cb = nlattr_get_ifp },
+ { .type = NL_RTA_GATEWAY, .off = _OUT(rta_gw), .cb = nlattr_get_ip },
+ { .type = NL_RTA_METRICS, .arg = &metrics_parser, .cb = nlattr_get_nested },
+ { .type = NL_RTA_MULTIPATH, .off = _OUT(rta_multipath), .cb = nlattr_get_multipath },
+ { .type = NL_RTA_RTFLAGS, .off = _OUT(rta_rtflags), .cb = nlattr_get_uint32 },
+ { .type = NL_RTA_TABLE, .off = _OUT(rta_table), .cb = nlattr_get_uint32 },
+ { .type = NL_RTA_VIA, .off = _OUT(rta_gw), .cb = nlattr_get_ipvia },
+ { .type = NL_RTA_NH_ID, .off = _OUT(rta_nh_id), .cb = nlattr_get_uint32 },
+ };
+
+ /* Fixed rtmsg header fields that are copied out */
+ static const struct nlfield_parser nlf_p_rtmsg[] = {
+ {.off_in = _IN(rtm_family), .off_out = _OUT(rtm_family), .cb = nlf_get_u8 },
+ {.off_in = _IN(rtm_dst_len), .off_out = _OUT(rtm_dst_len), .cb = nlf_get_u8 },
+ };
+ #undef _IN
+ #undef _OUT
+ NL_DECLARE_PARSER(rtm_parser, struct rtmsg, nlf_p_rtmsg, nla_p_rtmsg);
+
+ /* State shared between the route dump driver and its per-route callback */
+ struct netlink_walkargs {
+ struct nl_writer *nw; /* destination of the dump */
+ struct route_nhop_data rnd; /* scratch nexthop data of the current route */
+ struct nlmsghdr hdr; /* header template for every dumped message */
+ struct nlpcb *nlp; /* requesting socket */
+ uint32_t fibnum; /* fib currently being dumped */
+ int family; /* family currently being dumped */
+ int error; /* first error hit; once set, further routes are skipped */
+ int count; /* routes iterated in the current table */
+ int dumped; /* routes passed to dump_px() in the current table */
+ int dumped_tables; /* number of tables walked so far */
+ };
+
+ /*
+  * rib_walk() callback: dumps route @rt using the walk state in @_arg.
+  *
+  * Every route is counted, but once an error has been recorded in the
+  * walk state the remaining routes are skipped. Always returns 0; the
+  * outcome is reported through the walk state's error field.
+  */
+ static int
+ dump_rtentry(struct rtentry *rt, void *_arg)
+ {
+ 	struct netlink_walkargs *args = (struct netlink_walkargs *)_arg;
+
+ 	args->count++;
+ 	if (args->error == 0) {
+ 		int rc;
+
+ 		args->dumped++;
+ 		rt_get_rnd(rt, &args->rnd);
+ 		rc = dump_px(args->fibnum, &args->hdr, rt, &args->rnd, args->nw);
+
+ 		IF_DEBUG_LEVEL(LOG_DEBUG3) {
+ 			char rtbuf[INET6_ADDRSTRLEN + 5];
+ 			FIB_LOG(LOG_DEBUG3, args->fibnum, args->family,
+ 			    "Dump %s, offset %u, error %d",
+ 			    rt_print_buf(rt, rtbuf, sizeof(rtbuf)),
+ 			    args->nw->offset, rc);
+ 		}
+ 		args->error = rc;
+ 	}
+
+ 	return (0);
+ }
+
+ /*
+  * Dumps one routing table (@fibnum, @family) by walking it with
+  * dump_rtentry(). Resets the per-table counters in @wa before the walk
+  * and bumps the number of walked tables afterwards.
+  */
+ static void
+ dump_rtable_one(struct netlink_walkargs *wa, uint32_t fibnum, int family)
+ {
+ 	FIB_LOG(LOG_DEBUG2, fibnum, family, "Start dump");
+
+ 	/* Per-table statistics start from scratch */
+ 	wa->dumped = 0;
+ 	wa->count = 0;
+ 	rib_walk(fibnum, family, false, dump_rtentry, wa);
+ 	wa->dumped_tables += 1;
+
+ 	FIB_LOG(LOG_DEBUG2, fibnum, family, "End dump, iterated %d dumped %d",
+ 	    wa->count, wa->dumped);
+ 	NL_LOG(LOG_DEBUG2, "Current offset: %d", wa->nw->offset);
+ }
+
+ /*
+  * Dumps the tables of fib @fibnum: only @family, or every family below
+  * AF_MAX when @family is AF_UNSPEC. Families without a routing table are
+  * skipped; the walk stops at the first error.
+  * Returns the error recorded in @wa.
+  */
+ static int
+ dump_rtable_fib(struct netlink_walkargs *wa, uint32_t fibnum, int family)
+ {
+ 	const bool all_families = (family == AF_UNSPEC);
+ 	const int first_af = all_families ? 0 : family;
+ 	const int last_af = all_families ? (AF_MAX - 1) : family;
+
+ 	wa->fibnum = fibnum;
+
+ 	for (int af = first_af; af <= last_af; af++) {
+ 		if (rt_tables_get_rnh(fibnum, af) == 0)
+ 			continue;
+ 		wa->family = af;
+ 		dump_rtable_one(wa, fibnum, af);
+ 		if (wa->error != 0)
+ 			break;
+ 	}
+
+ 	return (wa->error);
+ }
+
+ /*
+ * Handles a non-dump RTM_GETROUTE request: looks up attrs->rta_dst in the
+ * routing table selected by attrs->rta_table and attrs->rtm_family and
+ * writes the matching route to npt->nw as an NL_RTM_NEWROUTE message
+ * (hdr->nlmsg_type is overwritten). @nlp is not used here.
+ *
+ * Returns EINVAL if no destination was supplied, EAFNOSUPPORT if the
+ * table does not exist, ESRCH if nothing matches and 0 otherwise.
+ * The result of dump_px() is not propagated.
+ */
+ static int
+ handle_rtm_getroute(struct nlpcb *nlp, struct nl_parsed_route *attrs,
+ struct nlmsghdr *hdr, struct nl_pstate *npt)
+ {
+ RIB_RLOCK_TRACKER;
+ struct rib_head *rnh;
+ struct rtentry *rt;
+ uint32_t fibnum = attrs->rta_table;
+ sa_family_t family = attrs->rtm_family;
+
+ if (attrs->rta_dst == NULL) {
+ NLMSG_REPORT_ERR_MSG(npt, "No RTA_DST supplied");
+ return (EINVAL);
+ }
+
+ FIB_LOG(LOG_DEBUG, fibnum, family, "getroute called");
+
+ rnh = rt_tables_get_rnh(fibnum, family);
+ if (rnh == NULL)
+ return (EAFNOSUPPORT);
+
+ /* The lookup and the nexthop snapshot happen under the rib read lock */
+ RIB_RLOCK(rnh);
+
+ rt = (struct rtentry *)rnh->rnh_matchaddr(attrs->rta_dst, &rnh->head);
+ if (rt == NULL) {
+ RIB_RUNLOCK(rnh);
+ return (ESRCH);
+ }
+
+ struct route_nhop_data rnd;
+ rt_get_rnd(rt, &rnd);
+ /* Replace the stored nexthop with the one selected for flow id 0 */
+ rnd.rnd_nhop = nhop_select_func(rnd.rnd_nhop, 0);
+
+ RIB_RUNLOCK(rnh);
+
+ /*
+ * NOTE(review): rt and rnd.rnd_nhop are used below without the rib
+ * lock; presumably the caller runs inside the network epoch (dump_px()
+ * asserts it) - confirm.
+ */
+ IF_DEBUG_LEVEL(LOG_DEBUG2) {
+ char rtbuf[NHOP_PRINT_BUFSIZE] __unused, nhbuf[NHOP_PRINT_BUFSIZE] __unused;
+ FIB_LOG(LOG_DEBUG2, fibnum, family, "getroute completed: got %s for %s",
+ nhop_print_buf_any(rnd.rnd_nhop, nhbuf, sizeof(nhbuf)),
+ rt_print_buf(rt, rtbuf, sizeof(rtbuf)));
+ }
+
+ hdr->nlmsg_type = NL_RTM_NEWROUTE;
+ dump_px(fibnum, hdr, rt, &rnd, npt->nw);
+
+ return (0);
+ }
+
+ /*
+  * Handles an RTM_GETROUTE dump request.
+  *
+  * Dumps the tables of @family (all families for AF_UNSPEC) from fib
+  * @fibnum, or from every fib when @fibnum is RT_TABLE_UNSPEC, as
+  * NL_RTM_NEWROUTE multipart messages into @nw and then finalizes the dump.
+  *
+  * Returns the first error hit while dumping, ESRCH if no table matched
+  * the request, ENOMEM if the dump cannot be finalized and 0 otherwise.
+  */
+ static int
+ handle_rtm_dump(struct nlpcb *nlp, uint32_t fibnum, int family,
+     struct nlmsghdr *hdr, struct nl_writer *nw)
+ {
+ 	struct netlink_walkargs wa = {
+ 		.nlp = nlp,
+ 		.nw = nw,
+ 		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+ 		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+ 		.hdr.nlmsg_type = NL_RTM_NEWROUTE,
+ 		.hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI,
+ 	};
+
+ 	if (fibnum == RT_TABLE_UNSPEC) {
+ 		/* Walk every fib in turn, using the loop index as fib number */
+ 		for (uint32_t i = 0; i < V_rt_numfibs; i++) {
+ 			dump_rtable_fib(&wa, i, family);
+ 			if (wa.error != 0)
+ 				break;
+ 		}
+ 	} else
+ 		dump_rtable_fib(&wa, fibnum, family);
+
+ 	if (wa.error == 0 && wa.dumped_tables == 0) {
+ 		FIB_LOG(LOG_DEBUG, fibnum, family, "incorrect fibnum/family");
+ 		wa.error = ESRCH;
+ 		// How do we propagate it?
+ 	}
+
+ 	if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) {
+ 		NL_LOG(LOG_DEBUG, "Unable to finalize the dump");
+ 		return (ENOMEM);
+ 	}
+
+ 	return (wa.error);
+ }
+
+ /*
+  * Fills in the parts of the freshly allocated nexthop @nh that the
+  * request left out and hands it to nhop_get_nhop().
+  *
+  * The following MUST be filled before that call: nh_ifp, nh_ifa, nh_gw.
+  *  - no gateway: the transmit interface must be set; a direct gateway
+  *    is derived from it
+  *  - gateway but no interface: the interface is derived from the gateway
+  *  - no source address: it is derived from the gateway and interface
+  *
+  * Returns the result of nhop_get_nhop(). If a required part cannot be
+  * derived, @nh is released, *@perror is set to EINVAL and NULL is
+  * returned, so the caller must not touch @nh after this call.
+  *
+  * NOTE(review): the failure path assumes nhop_free() accepts a nexthop
+  * that has not been passed to nhop_get_nhop() yet - confirm against the
+  * nexthop KPI.
+  */
+ static struct nhop_object *
+ finalize_nhop(struct nhop_object *nh, int *perror)
+ {
+ 	if (nh->gw_sa.sa_family == 0) {
+ 		/*
+ 		 * Empty gateway. Can be direct route with RTA_OIF set.
+ 		 */
+ 		if (nh->nh_ifp == NULL) {
+ 			NL_LOG(LOG_DEBUG, "empty gateway and interface, skipping");
+ 			goto fail;
+ 		}
+ 		nhop_set_direct_gw(nh, nh->nh_ifp);
+ 	} else if (nh->nh_ifp == NULL) {
+ 		/* Gateway is set up, we can derive ifp if not set */
+ 		struct ifaddr *ifa = ifa_ifwithnet(&nh->gw_sa, 1, nhop_get_fibnum(nh));
+
+ 		if (ifa == NULL) {
+ 			NL_LOG(LOG_DEBUG, "Unable to determine ifp, skipping");
+ 			goto fail;
+ 		}
+ 		nhop_set_transmit_ifp(nh, ifa->ifa_ifp);
+ 	}
+
+ 	/* Both nh_ifp and gateway are set */
+ 	if (nh->nh_ifa == NULL) {
+ 		struct ifaddr *ifa = ifaof_ifpforaddr(&nh->gw_sa, nh->nh_ifp);
+
+ 		if (ifa == NULL) {
+ 			NL_LOG(LOG_DEBUG, "Unable to determine ifa, skipping");
+ 			goto fail;
+ 		}
+ 		nhop_set_src(nh, ifa);
+ 	}
+
+ 	return (nhop_get_nhop(nh, perror));
+
+ fail:
+ 	/* Do not leak the nexthop allocated by the caller */
+ 	nhop_free(nh);
+ 	*perror = EINVAL;
+ 	return (NULL);
+ }
+
+ /*
+  * Returns the nexthop prefix flag implied by the destination prefix
+  * length of the request: NHF_HOST for a full-length prefix (32 bits for
+  * IPv4, 128 bits for IPv6), NHF_DEFAULT for a zero-length prefix and
+  * 0 otherwise, including for any other address family.
+  */
+ static int
+ get_pxflag(const struct nl_parsed_route *attrs)
+ {
+ 	int host_plen;
+
+ 	switch (attrs->rtm_family) {
+ 	case AF_INET:
+ 		host_plen = 32;
+ 		break;
+ 	case AF_INET6:
+ 		/* An IPv6 host route covers all 128 address bits */
+ 		host_plen = 128;
+ 		break;
+ 	default:
+ 		return (0);
+ 	}
+
+ 	if (attrs->rtm_dst_len == host_plen)
+ 		return (NHF_HOST);
+ 	if (attrs->rtm_dst_len == 0)
+ 		return (NHF_DEFAULT);
+
+ 	return (0);
+ }
+
+ /*
+  * Converts the NLM_F_REPLACE / NLM_F_EXCL / NLM_F_CREATE / NLM_F_APPEND
+  * request flags into the corresponding RTM_F_* route operation flags.
+  * All other bits of @nlm_flags are ignored.
+  */
+ static int
+ get_op_flags(int nlm_flags)
+ {
+ 	int result = 0;
+
+ 	if ((nlm_flags & NLM_F_REPLACE) != 0)
+ 		result |= RTM_F_REPLACE;
+ 	if ((nlm_flags & NLM_F_EXCL) != 0)
+ 		result |= RTM_F_EXCL;
+ 	if ((nlm_flags & NLM_F_CREATE) != 0)
+ 		result |= RTM_F_CREATE;
+ 	if ((nlm_flags & NLM_F_APPEND) != 0)
+ 		result |= RTM_F_APPEND;
+
+ 	return (result);
+ }
+
+ /*
+  * Creates the nexthop for one RTA_MULTIPATH record @mpnh, using the fib,
+  * family and route flags from @attrs. @npt is not used here.
+  *
+  * Stores the resulting nexthop (NULL on failure) in *@pnh.
+  * Returns 0 on success, EINVAL if the record carries no gateway, ENOMEM
+  * if the nexthop cannot be allocated, or the error from finalize_nhop().
+  */
+ static int
+ create_nexthop_one(struct nl_parsed_route *attrs, struct rta_mpath_nh *mpnh,
+     struct nl_pstate *npt, struct nhop_object **pnh)
+ {
+ 	/*
+ 	 * Start from "no error": the return value must be defined even if
+ 	 * the callee only writes the error on failure.
+ 	 */
+ 	int error = 0;
+
+ 	if (mpnh->gw == NULL)
+ 		return (EINVAL);
+
+ 	struct nhop_object *nh = nhop_alloc(attrs->rta_table, attrs->rtm_family);
+ 	if (nh == NULL)
+ 		return (ENOMEM);
+
+ 	nhop_set_gw(nh, mpnh->gw, true);
+ 	if (mpnh->ifp != NULL)
+ 		nhop_set_transmit_ifp(nh, mpnh->ifp);
+ 	nhop_set_rtflags(nh, attrs->rta_rtflags);
+
+ 	*pnh = finalize_nhop(nh, &error);
+
+ 	return (error);
+ }
+
+ /*
+  * Builds the nexthop (or nexthop group) described inline by a route
+  * request.
+  *
+  * With RTA_MULTIPATH present, one nexthop is created per record and the
+  * set is turned into a nexthop group; the per-member references are
+  * dropped afterwards. Otherwise a single nexthop is built from the
+  * gateway, interface, MTU and route flags in @attrs.
+  *
+  * Returns the nexthop or group; on failure *@perror holds the error.
+  */
+ static struct nhop_object *
+ create_nexthop_from_attrs(struct nl_parsed_route *attrs,
+     struct nl_pstate *npt, int *perror)
+ {
+ 	struct nhop_object *nh;
+ 	int error = 0;
+
+ 	if (attrs->rta_multipath != NULL) {
+ 		/* Multipath w/o explicit nexthops */
+ 		int num_nhops = attrs->rta_multipath->num_nhops;
+ 		struct weightened_nhop *wn = npt_alloc(npt, sizeof(*wn) * num_nhops);
+ 		nh = NULL;
+
+ 		/* The scratch array lives in the parser state and may not fit */
+ 		if (wn == NULL) {
+ 			*perror = ENOMEM;
+ 			return (NULL);
+ 		}
+
+ 		for (int i = 0; i < num_nhops; i++) {
+ 			struct rta_mpath_nh *mpnh = &attrs->rta_multipath->nhops[i];
+
+ 			error = create_nexthop_one(attrs, mpnh, npt, &wn[i].nh);
+ 			if (error != 0) {
+ 				/* Release the members created so far */
+ 				for (int j = 0; j < i; j++)
+ 					nhop_free(wn[j].nh);
+ 				break;
+ 			}
+ 			/* A zero weight in the request means "default", i.e. 1 */
+ 			wn[i].weight = mpnh->rtnh_weight > 0 ? mpnh->rtnh_weight : 1;
+ 		}
+ 		if (error == 0) {
+ 			struct rib_head *rh = nhop_get_rh(wn[0].nh);
+
+ 			error = nhgrp_get_group(rh, wn, num_nhops, 0,
+ 			    (struct nhgrp_object **)&nh);
+
+ 			/* Drop the references obtained while creating the members */
+ 			for (int i = 0; i < num_nhops; i++)
+ 				nhop_free(wn[i].nh);
+ 		}
+ 		*perror = error;
+ 	} else {
+ 		nh = nhop_alloc(attrs->rta_table, attrs->rtm_family);
+ 		if (nh == NULL) {
+ 			*perror = ENOMEM;
+ 			return (NULL);
+ 		}
+ 		if (attrs->rta_gw != NULL)
+ 			nhop_set_gw(nh, attrs->rta_gw, true);
+ 		if (attrs->rta_oif != NULL)
+ 			nhop_set_transmit_ifp(nh, attrs->rta_oif);
+ 		if (attrs->rtax_mtu != 0)
+ 			nhop_set_mtu(nh, attrs->rtax_mtu, true);
+ 		if (attrs->rta_rtflags & RTF_BROADCAST)
+ 			nhop_set_broadcast(nh, true);
+ 		if (attrs->rta_rtflags & RTF_BLACKHOLE)
+ 			nhop_set_blackhole(nh, NHF_BLACKHOLE);
+ 		if (attrs->rta_rtflags & RTF_REJECT)
+ 			nhop_set_blackhole(nh, NHF_REJECT);
+ 		nhop_set_rtflags(nh, attrs->rta_rtflags);
+ 		nh = finalize_nhop(nh, perror);
+ 	}
+
+ 	return (nh);
+ }
+
+ /*
+  * RTM_NEWROUTE handler: parses the request, obtains the nexthop (looked
+  * up by NL_RTA_NH_ID when given, otherwise built from the inline
+  * attributes) and adds the route. A successful change is announced via
+  * report_operation().
+  *
+  * Returns 0 on success, EINVAL for a missing destination or an
+  * out-of-range fib number, or the error from parsing, nexthop
+  * lookup/creation or the route addition.
+  */
+ static int
+ rtnl_handle_newroute(struct nlmsghdr *hdr, struct nlpcb *nlp,
+     struct nl_pstate *npt)
+ {
+ 	struct rib_cmd_info rc = {};
+ 	struct nhop_object *nh = NULL;
+ 	int error;
+
+ 	struct nl_parsed_route attrs = {};
+ 	error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs);
+ 	if (error != 0)
+ 		return (error);
+
+ 	/* Check if we have enough data */
+ 	if (attrs.rta_dst == NULL) {
+ 		NL_LOG(LOG_DEBUG, "missing RTA_DST");
+ 		return (EINVAL);
+ 	}
+
+ 	/* The fib number comes from userland: reject tables that do not exist */
+ 	if (attrs.rta_table >= V_rt_numfibs) {
+ 		NLMSG_REPORT_ERR_MSG(npt, "invalid fib");
+ 		return (EINVAL);
+ 	}
+
+ 	if (attrs.rta_nh_id != 0) {
+ 		/* Referenced uindex */
+ 		int pxflag = get_pxflag(&attrs);
+ 		nh = nl_find_nhop(attrs.rta_table, attrs.rtm_family, attrs.rta_nh_id,
+ 		    pxflag, &error);
+ 		if (error != 0)
+ 			return (error);
+ 	} else {
+ 		nh = create_nexthop_from_attrs(&attrs, npt, &error);
+ 		if (error != 0) {
+ 			NL_LOG(LOG_DEBUG, "Error creating nexthop");
+ 			return (error);
+ 		}
+ 	}
+
+ 	/* Groups carry per-member weights, so the route-level weight is 0 for them */
+ 	int weight = NH_IS_NHGRP(nh) ? 0 : RT_DEFAULT_WEIGHT;
+ 	struct route_nhop_data rnd = { .rnd_nhop = nh, .rnd_weight = weight };
+ 	int op_flags = get_op_flags(hdr->nlmsg_flags);
+
+ 	error = rib_add_route_px(attrs.rta_table, attrs.rta_dst, attrs.rtm_dst_len,
+ 	    &rnd, op_flags, &rc);
+ 	if (error == 0)
+ 		report_operation(attrs.rta_table, &rc, nlp, hdr);
+ 	return (error);
+ }
+
+ /*
+  * Route deletion filter: decides whether nexthop @nh of route @rt is
+  * selected by the request in @_data (a struct nl_parsed_route).
+  *
+  * A gateway and/or an outgoing interface given in the request must match
+  * the nexthop; criteria that are absent match anything.
+  * Returns 1 on match and 0 otherwise.
+  */
+ static int
+ path_match_func(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
+ {
+ 	struct nl_parsed_route *wanted = (struct nl_parsed_route *)_data;
+
+ 	if (wanted->rta_gw != NULL) {
+ 		if (!rib_match_gw(rt, nh, wanted->rta_gw))
+ 			return (0);
+ 	}
+
+ 	if (wanted->rta_oif == NULL)
+ 		return (1);
+
+ 	return ((wanted->rta_oif == nh->nh_ifp) ? 1 : 0);
+ }
+
+ /*
+  * RTM_DELROUTE handler: parses the request and deletes the matching
+  * route, using path_match_func() to select the path when a gateway or
+  * interface is given. A successful change is announced via
+  * report_operation().
+  *
+  * Returns 0 on success, ESRCH if no destination was supplied, EINVAL for
+  * an out-of-range fib number, or the error from parsing or the deletion.
+  */
+ static int
+ rtnl_handle_delroute(struct nlmsghdr *hdr, struct nlpcb *nlp,
+     struct nl_pstate *npt)
+ {
+ 	struct rib_cmd_info rc = {};
+ 	int error;
+
+ 	struct nl_parsed_route attrs = {};
+ 	error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs);
+ 	if (error != 0)
+ 		return (error);
+
+ 	if (attrs.rta_dst == NULL) {
+ 		NLMSG_REPORT_ERR_MSG(npt, "RTA_DST is not set");
+ 		return (ESRCH);
+ 	}
+
+ 	/* The fib number comes from userland: reject tables that do not exist */
+ 	if (attrs.rta_table >= V_rt_numfibs) {
+ 		NLMSG_REPORT_ERR_MSG(npt, "invalid fib");
+ 		return (EINVAL);
+ 	}
+
+ 	error = rib_del_route_px(attrs.rta_table, attrs.rta_dst,
+ 	    attrs.rtm_dst_len, path_match_func, &attrs, 0, &rc);
+ 	if (error == 0)
+ 		report_operation(attrs.rta_table, &rc, nlp, hdr);
+ 	return (error);
+ }
+
+ /*
+  * RTM_GETROUTE handler: parses the request and either dumps the routing
+  * tables (NLM_F_DUMP set) or looks up a single destination.
+  *
+  * Returns EINVAL for an out-of-range fib number or address family,
+  * otherwise the parse error or the result of the selected handler.
+  */
+ static int
+ rtnl_handle_getroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+ {
+ 	int error;
+
+ 	struct nl_parsed_route attrs = {};
+ 	error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs);
+ 	if (error != 0)
+ 		return (error);
+
+ 	/*
+ 	 * Both values come from userland and are used to select a routing
+ 	 * table below: reject anything outside the configured range.
+ 	 */
+ 	if (attrs.rta_table >= V_rt_numfibs || attrs.rtm_family > AF_MAX) {
+ 		NLMSG_REPORT_ERR_MSG(npt, "invalid fib");
+ 		return (EINVAL);
+ 	}
+
+ 	if (hdr->nlmsg_flags & NLM_F_DUMP)
+ 		error = handle_rtm_dump(nlp, attrs.rta_table, attrs.rtm_family, hdr, npt->nw);
+ 	else
+ 		error = handle_rtm_getroute(nlp, &attrs, hdr, npt);
+
+ 	return (error);
+ }
+
+ /*
+  * Publishes the route change @rc from fib @fibnum to the netlink route
+  * group of the route's address family.
+  *
+  * The message type and the create/excl/replace flags are derived from
+  * the rib command. Nothing is sent if no group writer can be obtained.
+  */
+ void
+ rtnl_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc)
+ {
+ 	struct nl_writer nw;
+ 	int nlm_flags = 0;
+ 	int family = rt_get_family(rc->rc_rt);
+
+ 	/* XXX: check if there are active listeners first */
+
+ 	/* TODO: consider passing PID/type/seq */
+ 	if (rc->rc_cmd == RTM_ADD)
+ 		nlm_flags = NLM_F_EXCL | NLM_F_CREATE;
+ 	else if (rc->rc_cmd == RTM_CHANGE)
+ 		nlm_flags = NLM_F_REPLACE;
+ 	/* RTM_DELETE and any other command carry no flags */
+
+ 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
+ 		char rtbuf[NHOP_PRINT_BUFSIZE] __unused;
+ 		FIB_LOG(LOG_DEBUG2, fibnum, family,
+ 		    "received event %s for %s / nlm_flags=%X",
+ 		    rib_print_cmd(rc->rc_cmd),
+ 		    rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf)),
+ 		    nlm_flags);
+ 	}
+
+ 	uint32_t group_id = family_to_group(family);
+
+ 	if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) {
+ 		NL_LOG(LOG_DEBUG, "error allocating event buffer");
+ 		return;
+ 	}
+
+ 	/* Header template and nexthop data for the notification */
+ 	struct nlmsghdr hdr = {
+ 		.nlmsg_type = get_rtmsg_type_from_rtsock(rc->rc_cmd),
+ 		.nlmsg_flags = nlm_flags,
+ 	};
+ 	struct route_nhop_data rnd = {
+ 		.rnd_weight = rc->rc_nh_weight,
+ 		.rnd_nhop = rc_get_nhop(rc),
+ 	};
+
+ 	dump_px(fibnum, &hdr, rc->rc_rt, &rnd, &nw);
+ 	nlmsg_flush(&nw);
+ }
+
+ /*
+ * Route message handlers registered by rtnl_routes_init().
+ * The modifying commands name PRIV_NET_ROUTE in .priv; RTM_GETROUTE
+ * leaves .priv unset.
+ */
+ static const struct rtnl_cmd_handler cmd_handlers[] = {
+ {
+ .cmd = NL_RTM_GETROUTE,
+ .name = "RTM_GETROUTE",
+ .cb = &rtnl_handle_getroute,
+ },
+ {
+ .cmd = NL_RTM_DELROUTE,
+ .name = "RTM_DELROUTE",
+ .cb = &rtnl_handle_delroute,
+ .priv = PRIV_NET_ROUTE,
+ },
+ {
+ .cmd = NL_RTM_NEWROUTE,
+ .name = "RTM_NEWROUTE",
+ .cb = &rtnl_handle_newroute,
+ .priv = PRIV_NET_ROUTE,
+ }
+ };
+
+ /* Every parser declared in this file, checked once at initialization */
+ static const struct nlhdr_parser *all_parsers[] = {&mpath_parser, &metrics_parser, &rtm_parser};
+
+ /*
+  * Initializes route message support: verifies the parser tables of this
+  * file and registers the RTM_*ROUTE command handlers.
+  */
+ void
+ rtnl_routes_init(void)
+ {
+ 	NL_VERIFY_PARSERS(all_parsers);
+ 	rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers));
+ }
diff --git a/sys/netlink/route/route.h b/sys/netlink/route/route.h
new file mode 100644
index 000000000000..6e1ef6cbf0c6
--- /dev/null
+++ b/sys/netlink/route/route.h
@@ -0,0 +1,366 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Route-related (RTM_<NEW|DEL|GET>ROUTE) message header and attributes.
+ */
+
+#ifndef _NETLINK_ROUTE_ROUTE_H_
+#define _NETLINK_ROUTE_ROUTE_H_
+
+/*
+ * Base header for all of the relevant messages.
+ * Attributes (struct rtattr) follow the header; userland can locate
+ * the first one with RTM_RTA().
+ */
+struct rtmsg {
+ unsigned char rtm_family; /* address family */
+ unsigned char rtm_dst_len; /* Prefix length */
+ unsigned char rtm_src_len; /* Source prefix length (not used) */
+ unsigned char rtm_tos; /* Type of service (not used) */
+ unsigned char rtm_table; /* rtable id */
+ unsigned char rtm_protocol; /* Routing protocol id (RTPROT_) */
+ unsigned char rtm_scope; /* Route distance (RT_SCOPE_) */
+ unsigned char rtm_type; /* Route type (RTN_) */
+ unsigned rtm_flags; /* Route flags (RTM_F_) */
+};
+
+/*
+ * RFC 3549, 3.1.1, route type (rtm_type field).
+ * Values are assigned sequentially starting from RTN_UNSPEC (0).
+ */
+enum {
+ RTN_UNSPEC,
+ RTN_UNICAST, /* Unicast route */
+ RTN_LOCAL, /* Accept locally (not supported) */
+ RTN_BROADCAST, /* Accept locally as broadcast, send as broadcast */
+ RTN_ANYCAST, /* Accept locally as broadcast, but send as unicast */
+ RTN_MULTICAST, /* Multicast route */
+ RTN_BLACKHOLE, /* Drop traffic towards destination */
+ RTN_UNREACHABLE, /* Destination is unreachable */
+ RTN_PROHIBIT, /* Administratively prohibited */
+ RTN_THROW, /* Not in this table (not supported) */
+ RTN_NAT, /* Translate this address (not supported) */
+ RTN_XRESOLVE, /* Use external resolver (not supported) */
+ __RTN_MAX, /* sentinel: one past the last defined type */
+};
+/* Largest defined route type (equals RTN_XRESOLVE) */
+#define RTN_MAX (__RTN_MAX - 1)
+
+/*
+ * RFC 3549, 3.1.1, protocol (Identifies what/who added the route).
+ * Values larger than RTPROT_STATIC(4) are not interpreted by the
+ * kernel, they are just for user information.
+ *
+ * Every identifier has a distinct value; the numbering follows the
+ * Linux rtnetlink one so that unmodified netlink software reports
+ * the same route origin.
+ */
+#define RTPROT_UNSPEC 0
+#define RTPROT_REDIRECT 1 /* Route installed by ICMP redirect */
+#define RTPROT_KERNEL 2 /* Route installed by kernel */
+#define RTPROT_BOOT 3 /* Route installed during boot */
+#define RTPROT_STATIC 4 /* Route installed by administrator */
+
+#define RTPROT_GATED 8
+#define RTPROT_RA 9
+#define RTPROT_MRT 10
+#define RTPROT_ZEBRA 11
+#define RTPROT_BIRD 12
+#define RTPROT_DNROUTED 13
+#define RTPROT_XORP 14
+#define RTPROT_NTK 15
+#define RTPROT_DHCP 16
+#define RTPROT_MROUTED 17
+#define RTPROT_KEEPALIVED 18
+#define RTPROT_BABEL 42
+#define RTPROT_OPENR 99
+#define RTPROT_BGP 186
+#define RTPROT_ISIS 187
+#define RTPROT_OSPF 188
+#define RTPROT_RIP 189
+#define RTPROT_EIGRP 192
+
+/*
+ * RFC 3549 3.1.1 Route scope (valid distance to destination).
+ *
+ * The values between RT_SCOPE_UNIVERSE(0) and RT_SCOPE_SITE(200)
+ * are available to the user.
+ *
+ * All values fit into the unsigned char rtm_scope field of struct rtmsg.
+ */
+enum rt_scope_t {
+ RT_SCOPE_UNIVERSE = 0,
+ /* User defined values */
+ RT_SCOPE_SITE = 200,
+ RT_SCOPE_LINK = 253,
+ RT_SCOPE_HOST = 254,
+ RT_SCOPE_NOWHERE = 255
+};
+
+/*
+ * RFC 3549 3.1.1 Route flags (rtm_flags).
+ * Is a composition of RTNH_F flags (0x1..0x40 range), RTM_F flags (below)
+ * and per-protocol (IPv4/IPv6) flags.
+ *
+ * The RTM_F_ bits below start at 0x100, so they do not overlap with the
+ * RTNH_F range. None of them is supported by this implementation yet.
+ */
+#define RTM_F_NOTIFY 0x00000100 /* not supported */
+#define RTM_F_CLONED 0x00000200 /* not supported */
+#define RTM_F_EQUALIZE 0x00000400 /* not supported */
+#define RTM_F_PREFIX 0x00000800 /* not supported */
+#define RTM_F_LOOKUP_TABLE 0x00001000 /* not supported */
+#define RTM_F_FIB_MATCH 0x00002000 /* not supported */
+#define RTM_F_OFFLOAD 0x00004000 /* not supported */
+#define RTM_F_TRAP 0x00008000 /* not supported */
+#define RTM_F_OFFLOAD_FAILED 0x20000000 /* not supported */
+
+/* Compatibility handling helpers (userland only) */
+#ifndef _KERNEL
+/* Size of the fixed rtmsg header */
+#define NL_RTM_HDRLEN ((int)sizeof(struct rtmsg))
+/* Pointer to the first attribute, located right after the rtmsg header */
+#define RTM_RTA(_rtm) ((struct rtattr *)((char *)(_rtm) + NL_RTM_HDRLEN))
+/* NLMSG_PAYLOAD() of the message, given the rtmsg header length */
+#define RTM_PAYLOAD(_hdr) NLMSG_PAYLOAD((_hdr), NL_RTM_HDRLEN)
+#endif /* !_KERNEL */
+
+/*
+ * Routing table identifiers.
+ * FreeBSD route table numbering starts from 0, where 0 is a valid default routing table.
+ * Indicating "all tables" via netlink can be done by not including RTA_TABLE attribute
+ * and keeping rtm_table=0 (compatibility) or setting RTA_TABLE value to RT_TABLE_UNSPEC.
+ *
+ * RT_TABLE_UNSPEC does not fit into the unsigned char rtm_table field,
+ * hence it can only be passed in the u32 RTA_TABLE attribute.
+ */
+#define RT_TABLE_MAIN 0 /* RT_DEFAULT_FIB */
+#define RT_TABLE_UNSPEC 0xFFFFFFFF /* RT_ALL_FIBS */
+
+/*
+ * Route attribute types (rta_type field of struct rtattr).
+ *
+ * Two FreeBSD specific attributes deliberately share their value with
+ * an attribute marked as not supported / deprecated:
+ * NL_RTA_KNH_ID (10, same as NL_RTA_PROTOINFO) and
+ * NL_RTA_RTFLAGS (14, same as NL_RTA_MP_ALGO).
+ */
+enum rtattr_type_t {
+ NL_RTA_UNSPEC,
+ NL_RTA_DST = 1, /* binary, IPv4/IPv6 destination */
+ NL_RTA_SRC = 2, /* binary, preferred source address */
+ NL_RTA_IIF = 3, /* not supported */
+ NL_RTA_OIF = 4, /* u32, transmit ifindex */
+ NL_RTA_GATEWAY = 5, /* binary: IPv4/IPv6 gateway */
+ NL_RTA_PRIORITY = 6, /* not supported */
+ NL_RTA_PREFSRC = 7, /* not supported */
+ NL_RTA_METRICS = 8, /* nested, list of NL_RTAX* attrs */
+ NL_RTA_MULTIPATH = 9, /* binary, array of struct rtnexthop */
+ NL_RTA_PROTOINFO = 10, /* not supported / deprecated */
+ NL_RTA_KNH_ID = 10, /* u32, FreeBSD specific, kernel nexthop index */
+ NL_RTA_FLOW = 11, /* not supported */
+ NL_RTA_CACHEINFO = 12, /* not supported */
+ NL_RTA_SESSION = 13, /* not supported / deprecated */
+ NL_RTA_MP_ALGO = 14, /* not supported / deprecated */
+ NL_RTA_RTFLAGS = 14, /* u32, FreeBSD specific, */
+ NL_RTA_TABLE = 15, /* u32, fibnum */
+ NL_RTA_MARK = 16, /* not supported */
+ NL_RTA_MFC_STATS = 17, /* not supported */
+ NL_RTA_VIA = 18, /* binary, struct rtvia */
+ NL_RTA_NEWDST = 19, /* not supported */
+ NL_RTA_PREF = 20, /* not supported */
+ NL_RTA_ENCAP_TYPE = 21, /* not supported */
+ NL_RTA_ENCAP = 22, /* not supported */
+ NL_RTA_EXPIRES = 23, /* u32, seconds till expiration */
+ NL_RTA_PAD = 24, /* not supported */
+ NL_RTA_UID = 25, /* not supported */
+ NL_RTA_TTL_PROPAGATE = 26, /* not supported */
+ NL_RTA_IP_PROTO = 27, /* not supported */
+ NL_RTA_SPORT = 28, /* not supported */
+ NL_RTA_DPORT = 29, /* not supported */
+ NL_RTA_NH_ID = 30, /* u32, nexthop/nexthop group index */
+ __RTA_MAX /* sentinel: one past the last defined attribute */
+};
+/* Largest defined attribute type (equals NL_RTA_NH_ID) */
+#define NL_RTA_MAX (__RTA_MAX - 1)
+
+/*
+ * Attributes that can be used as filters:
+ *
+ */
+
+#ifndef _KERNEL
+/*
+ * RTA_* space has clashes with rtsock namespace.
+ * Use NL_RTA_ prefix in the kernel and map to
+ * RTA_ for userland.
+ *
+ * Every member of enum rtattr_type_t has a matching RTA_ name here,
+ * including the FreeBSD specific RTA_KNH_ID and RTA_RTFLAGS.
+ */
+#define RTA_UNSPEC NL_RTA_UNSPEC
+#define RTA_DST NL_RTA_DST
+#define RTA_SRC NL_RTA_SRC
+#define RTA_IIF NL_RTA_IIF
+#define RTA_OIF NL_RTA_OIF
+#define RTA_GATEWAY NL_RTA_GATEWAY
+#define RTA_PRIORITY NL_RTA_PRIORITY
+#define RTA_PREFSRC NL_RTA_PREFSRC
+#define RTA_METRICS NL_RTA_METRICS
+#define RTA_MULTIPATH NL_RTA_MULTIPATH
+#define RTA_PROTOINFO NL_RTA_PROTOINFO
+#define RTA_KNH_ID NL_RTA_KNH_ID
+#define RTA_FLOW NL_RTA_FLOW
+#define RTA_CACHEINFO NL_RTA_CACHEINFO
+#define RTA_SESSION NL_RTA_SESSION
+#define RTA_MP_ALGO NL_RTA_MP_ALGO
+#define RTA_RTFLAGS NL_RTA_RTFLAGS
+#define RTA_TABLE NL_RTA_TABLE
+#define RTA_MARK NL_RTA_MARK
+#define RTA_MFC_STATS NL_RTA_MFC_STATS
+#define RTA_VIA NL_RTA_VIA
+#define RTA_NEWDST NL_RTA_NEWDST
+#define RTA_PREF NL_RTA_PREF
+#define RTA_ENCAP_TYPE NL_RTA_ENCAP_TYPE
+#define RTA_ENCAP NL_RTA_ENCAP
+#define RTA_EXPIRES NL_RTA_EXPIRES
+#define RTA_PAD NL_RTA_PAD
+#define RTA_UID NL_RTA_UID
+#define RTA_TTL_PROPAGATE NL_RTA_TTL_PROPAGATE
+#define RTA_IP_PROTO NL_RTA_IP_PROTO
+#define RTA_SPORT NL_RTA_SPORT
+#define RTA_DPORT NL_RTA_DPORT
+#define RTA_NH_ID NL_RTA_NH_ID
+#define RTA_MAX NL_RTA_MAX
+#endif /* !_KERNEL */
+
+/* route attribute header */
+struct rtattr {
+ unsigned short rta_len; /* attribute length, header included (see NL_RTA_DATA_LEN) */
+ unsigned short rta_type; /* attribute type, e.g. NL_RTA_ or NL_RTAX_ value */
+};
+
+/* Attribute helpers, thin wrappers over the generic NL_ITEM_ macros */
+#define NL_RTA_ALIGN_SIZE NL_ITEM_ALIGN_SIZE
+#define NL_RTA_ALIGN NL_ITEM_ALIGN
+/* Size of the attribute header */
+#define NL_RTA_HDRLEN ((int)sizeof(struct rtattr))
+/* Payload length: rta_len without the attribute header */
+#define NL_RTA_DATA_LEN(_rta) ((int)((_rta)->rta_len - NL_RTA_HDRLEN))
+/* Payload accessors (mutable and const flavours) */
+#define NL_RTA_DATA(_rta) NL_ITEM_DATA(_rta, NL_RTA_HDRLEN)
+#define NL_RTA_DATA_CONST(_rta) NL_ITEM_DATA_CONST(_rta, NL_RTA_HDRLEN)
+
+/*
+ * Compatibility attribute handling helpers (userland only).
+ * RTA_OK / RTA_NEXT delegate to NL_ITEM_OK / NL_ITEM_ITER, which are
+ * defined outside of this header.
+ */
+#ifndef _KERNEL
+#define RTA_ALIGNTO NL_RTA_ALIGN_SIZE
+#define RTA_ALIGN(_len) NL_RTA_ALIGN(_len)
+#define _RTA_LEN(_rta) ((int)(_rta)->rta_len)
+#define _RTA_ALIGNED_LEN(_rta) RTA_ALIGN(_RTA_LEN(_rta))
+#define RTA_OK(_rta, _len) NL_ITEM_OK(_rta, _len, NL_RTA_HDRLEN, _RTA_LEN)
+#define RTA_NEXT(_rta, _len) NL_ITEM_ITER(_rta, _len, _RTA_ALIGNED_LEN)
+/* Unaligned attribute length for a payload of _len bytes */
+#define RTA_LENGTH(_len) (NL_RTA_HDRLEN + (_len))
+/* Aligned attribute length for a payload of _len bytes */
+#define RTA_SPACE(_len) RTA_ALIGN(RTA_LENGTH(_len))
+#define RTA_DATA(_rta) NL_RTA_DATA(_rta)
+/* Payload length: rta_len without the attribute header */
+#define RTA_PAYLOAD(_rta) ((int)(_RTA_LEN(_rta) - NL_RTA_HDRLEN))
+#endif /* !_KERNEL */
+
+/* RTA attribute headers */
+
+/*
+ * RTA_VIA payload: address family followed by the raw address bytes.
+ * rtvia_addr is a zero-length trailing array; the address length is not
+ * stored here (presumably derived from the attribute length - confirm
+ * in the parser).
+ */
+struct rtvia {
+ sa_family_t rtvia_family;
+ uint8_t rtvia_addr[0];
+};
+
+/*
+ * RTA_METRICS is a nested attribute, consisting of a list of
+ * TLVs with types defined below.
+ *
+ * Each metric has a distinct value; the numbering follows the Linux
+ * RTAX_ one (QUICKACK=15, CC_ALGO=16, FASTOPEN_NO_COOKIE=17).
+ */
+enum {
+ NL_RTAX_UNSPEC,
+ NL_RTAX_LOCK = 1, /* not supported */
+ NL_RTAX_MTU = 2, /* desired path MTU */
+ NL_RTAX_WINDOW = 3, /* not supported */
+ NL_RTAX_RTT = 4, /* not supported */
+ NL_RTAX_RTTVAR = 5, /* not supported */
+ NL_RTAX_SSTHRESH = 6, /* not supported */
+ NL_RTAX_CWND = 7, /* not supported */
+ NL_RTAX_ADVMSS = 8, /* not supported */
+ NL_RTAX_REORDERING = 9, /* not supported */
+ NL_RTAX_HOPLIMIT = 10, /* not supported */
+ NL_RTAX_INITCWND = 11, /* not supported */
+ NL_RTAX_FEATURES = 12, /* not supported */
+ NL_RTAX_RTO_MIN = 13, /* not supported */
+ NL_RTAX_INITRWND = 14, /* not supported */
+ NL_RTAX_QUICKACK = 15, /* not supported */
+ NL_RTAX_CC_ALGO = 16, /* not supported */
+ NL_RTAX_FASTOPEN_NO_COOKIE = 17, /* not supported */
+ __NL_RTAX_MAX /* sentinel: one past the last defined metric */
+};
+/* Largest defined metric type (equals NL_RTAX_FASTOPEN_NO_COOKIE) */
+#define NL_RTAX_MAX (__NL_RTAX_MAX - 1)
+
+/*
+ * Feature bits; presumably the value of the NL_RTAX_FEATURES metric
+ * (inferred from the naming - confirm against users).
+ */
+#define RTAX_FEATURE_ECN (1 << 0)
+#define RTAX_FEATURE_SACK (1 << 1)
+#define RTAX_FEATURE_TIMESTAMP (1 << 2)
+#define RTAX_FEATURE_ALLFRAG (1 << 3)
+
+/* Union of all feature bits defined above */
+#define RTAX_FEATURE_MASK \
+ (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | RTAX_FEATURE_TIMESTAMP | \
+ RTAX_FEATURE_ALLFRAG)
+
+#ifndef _KERNEL
+
+/*
+ * RTAX_* space clashes with rtsock namespace.
+ * Use NL_RTAX_ prefix in the kernel and map to
+ * RTAX_ for userland.
+ * Each name below is a plain alias of the NL_RTAX_ enum value.
+ */
+#define RTAX_UNSPEC NL_RTAX_UNSPEC
+#define RTAX_LOCK NL_RTAX_LOCK
+#define RTAX_MTU NL_RTAX_MTU
+#define RTAX_WINDOW NL_RTAX_WINDOW
+#define RTAX_RTT NL_RTAX_RTT
+#define RTAX_RTTVAR NL_RTAX_RTTVAR
+#define RTAX_SSTHRESH NL_RTAX_SSTHRESH
+#define RTAX_CWND NL_RTAX_CWND
+#define RTAX_ADVMSS NL_RTAX_ADVMSS
+#define RTAX_REORDERING NL_RTAX_REORDERING
+#define RTAX_HOPLIMIT NL_RTAX_HOPLIMIT
+#define RTAX_INITCWND NL_RTAX_INITCWND
+#define RTAX_FEATURES NL_RTAX_FEATURES
+#define RTAX_RTO_MIN NL_RTAX_RTO_MIN
+#define RTAX_INITRWND NL_RTAX_INITRWND
+#define RTAX_QUICKACK NL_RTAX_QUICKACK
+#define RTAX_CC_ALGO NL_RTAX_CC_ALGO
+#define RTAX_FASTOPEN_NO_COOKIE NL_RTAX_FASTOPEN_NO_COOKIE
+#endif /* !_KERNEL */
+
+/*
+ * RTA_MULTIPATH consists of an array of rtnexthop structures.
+ * Each rtnexthop structure contains RTA_GATEWAY or RTA_VIA
+ * attribute following the header.
+ */
+struct rtnexthop {
+ unsigned short rtnh_len; /* entry length; RTNH_NEXT() advances by its aligned value */
+ unsigned char rtnh_flags; /* RTNH_F_ flags */
+ unsigned char rtnh_hops; /* nexthop weight */
+ int rtnh_ifindex; /* presumably the transmit interface index - confirm */
+};
+
+/* rtnh_flags (all values fit into the 0x1..0x40 range) */
+#define RTNH_F_DEAD 0x01 /* not supported */
+#define RTNH_F_PERVASIVE 0x02 /* not supported */
+#define RTNH_F_ONLINK 0x04 /* not supported */
+#define RTNH_F_OFFLOAD 0x08 /* not supported */
+#define RTNH_F_LINKDOWN 0x10 /* not supported */
+#define RTNH_F_UNRESOLVED 0x20 /* not supported */
+#define RTNH_F_TRAP 0x40 /* not supported */
+
+/* Subset of the flags above: DEAD, LINKDOWN, OFFLOAD and TRAP */
+#define RTNH_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN | \
+ RTNH_F_OFFLOAD | RTNH_F_TRAP)
+
+/* Macros to handle nexthops (struct rtnexthop entries of RTA_MULTIPATH) */
+#define RTNH_ALIGNTO NL_ITEM_ALIGN_SIZE
+#define RTNH_ALIGN(_len) NL_ITEM_ALIGN(_len)
+/* Size of the rtnexthop header */
+#define RTNH_HDRLEN ((int)sizeof(struct rtnexthop))
+#define _RTNH_LEN(_nh) ((int)(_nh)->rtnh_len)
+#define _RTNH_ALIGNED_LEN(_nh) RTNH_ALIGN(_RTNH_LEN(_nh))
+#define RTNH_OK(_nh, _len) NL_ITEM_OK(_nh, _len, RTNH_HDRLEN, _RTNH_LEN)
+/* Next entry: advances by the aligned rtnh_len, performs no bounds check */
+#define RTNH_NEXT(_nh) ((struct rtnexthop *)((char *)(_nh) + _RTNH_ALIGNED_LEN(_nh)))
+/* Unaligned / aligned entry length for _len bytes of attributes */
+#define RTNH_LENGTH(_len) (RTNH_HDRLEN + (_len))
+#define RTNH_SPACE(_len) RTNH_ALIGN(RTNH_LENGTH(_len))
+/* First attribute following the rtnexthop header */
+#define RTNH_DATA(_nh) ((struct rtattr *)NL_ITEM_DATA(_nh, RTNH_HDRLEN))
+
+/* Minimal message header carrying only a one-byte family field */
+struct rtgenmsg {
+ unsigned char rtgen_family;
+};
+
+#endif
diff --git a/sys/netlink/route/route_var.h b/sys/netlink/route/route_var.h
new file mode 100644
index 000000000000..7a31a8c896a5
--- /dev/null
+++ b/sys/netlink/route/route_var.h
@@ -0,0 +1,101 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This file contains definitions shared among NETLINK_ROUTE family
+ */
+#ifndef _NETLINK_ROUTE_ROUTE_VAR_H_
+#define _NETLINK_ROUTE_ROUTE_VAR_H_
+
+#include <sys/priv.h> /* values for priv_check */
+
+struct nlmsghdr;
+struct nlpcb;
+struct nl_pstate;
+
+/*
+ * Message handler callback type, stored in struct rtnl_cmd_handler.
+ * Receives the message header, the socket control block and the parser
+ * state. The meaning of the int result is not visible in this header
+ * (presumably 0 or an errno value - confirm in the dispatcher).
+ */
+typedef int rtnl_msg_cb_f(struct nlmsghdr *hdr, struct nlpcb *nlp,
+ struct nl_pstate *npt);
+
+/*
+ * Describes a single message handler; arrays of these are passed to
+ * rtnl_register_messages().
+ */
+struct rtnl_cmd_handler {
+ int cmd; /* message type, e.g. NL_RTM_GETROUTE */
+ const char *name; /* printable message name, e.g. "RTM_GETROUTE" */
+ rtnl_msg_cb_f *cb; /* handler callback */
+ int priv; /* privilege (sys/priv.h value, e.g. PRIV_NET_ROUTE); 0 if unset */
+ int flags; /* RTNL_F_ flags */
+};
+
+/*
+ * Handler flag. NOTE(review): the name suggests the handler is run
+ * outside of the network epoch - confirm in the dispatcher.
+ */
+#define RTNL_F_NOEPOCH 0x01
+
+/* Registers 'count' handlers from the 'handlers' array */
+bool rtnl_register_messages(const struct rtnl_cmd_handler *handlers, int count);
+
+/* route.c */
+struct rib_cmd_info;
+/* Sends a notification describing the route change 'rc' in fib 'fibnum' */
+void rtnl_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc);
+/* Verifies route parsers and registers route message handlers */
+void rtnl_routes_init(void);
+
+/* neigh.c */
+void rtnl_neighs_init(void);
+void rtnl_neighs_destroy(void);
+
+/* iface.c */
+/*
+ * Link attributes passed to the cloner create/modify callbacks.
+ * NOTE(review): the ifla_ / ifi_ prefixes suggest the fields mirror
+ * link attributes and link message header fields respectively -
+ * confirm in iface.c.
+ */
+struct nl_parsed_link {
+ char *ifla_group;
+ char *ifla_ifname;
+ char *ifla_cloner;
+ struct nlattr *ifla_idata;
+ unsigned short ifi_type;
+ int ifi_index;
+ uint32_t ifla_mtu;
+};
+
+/*
+ * Cloner callback types (see struct nl_cloner). Both receive the parsed
+ * link attributes, the socket control block and the parser state.
+ */
+typedef int rtnl_iface_create_f(struct nl_parsed_link *lattrs, struct nlpcb *nlp,
+ struct nl_pstate *npt);
+typedef int rtnl_iface_modify_f(struct nl_parsed_link *lattrs, struct nlpcb *nlp,
+ struct nl_pstate *npt);
+
+/*
+ * Interface cloner descriptor, added/removed with
+ * rtnl_iface_add_cloner() / rtnl_iface_del_cloner().
+ */
+struct nl_cloner {
+ const char *name; /* cloner name */
+ rtnl_iface_create_f *create_f; /* interface creation callback */
+ rtnl_iface_modify_f *modify_f; /* interface modification callback */
+ SLIST_ENTRY(nl_cloner) next; /* singly-linked list linkage */
+};
+
+/* iface.c: module setup/teardown and cloner list management */
+void rtnl_ifaces_init(void);
+void rtnl_ifaces_destroy(void);
+void rtnl_iface_add_cloner(struct nl_cloner *cloner);
+void rtnl_iface_del_cloner(struct nl_cloner *cloner);
+
+/* iface_drivers.c */
+void rtnl_iface_drivers_register(void);
+
+/* nexthop.c */
+void rtnl_nexthops_init(void);
+/*
+ * Nexthop lookup by index 'uidx'. NOTE(review): presumably returns NULL
+ * and stores an error code in *perror on failure - confirm in nexthop.c.
+ */
+struct nhop_object *nl_find_nhop(uint32_t fibnum, int family,
+ uint32_t uidx, int nh_flags, int *perror);
+
+
+#endif