aboutsummaryrefslogtreecommitdiff
path: root/sys/netlink
diff options
context:
space:
mode:
Diffstat (limited to 'sys/netlink')
-rw-r--r--sys/netlink/ktest_netlink_message_writer.c113
-rw-r--r--sys/netlink/ktest_netlink_message_writer.h46
-rw-r--r--sys/netlink/netlink.h263
-rw-r--r--sys/netlink/netlink_bitset.h57
-rw-r--r--sys/netlink/netlink_ctl.h124
-rw-r--r--sys/netlink/netlink_debug.h85
-rw-r--r--sys/netlink/netlink_domain.c1002
-rw-r--r--sys/netlink/netlink_generic.c525
-rw-r--r--sys/netlink/netlink_generic.h114
-rw-r--r--sys/netlink/netlink_glue.c292
-rw-r--r--sys/netlink/netlink_io.c369
-rw-r--r--sys/netlink/netlink_linux.h53
-rw-r--r--sys/netlink/netlink_message_parser.c635
-rw-r--r--sys/netlink/netlink_message_parser.h337
-rw-r--r--sys/netlink/netlink_message_writer.c399
-rw-r--r--sys/netlink/netlink_message_writer.h312
-rw-r--r--sys/netlink/netlink_module.c221
-rw-r--r--sys/netlink/netlink_route.c143
-rw-r--r--sys/netlink/netlink_route.h44
-rw-r--r--sys/netlink/netlink_snl.h1330
-rw-r--r--sys/netlink/netlink_snl_generic.h175
-rw-r--r--sys/netlink/netlink_snl_route.h201
-rw-r--r--sys/netlink/netlink_snl_route_compat.h53
-rw-r--r--sys/netlink/netlink_snl_route_parsers.h392
-rw-r--r--sys/netlink/netlink_sysevent.c205
-rw-r--r--sys/netlink/netlink_sysevent.h49
-rw-r--r--sys/netlink/netlink_var.h181
-rw-r--r--sys/netlink/route/common.h259
-rw-r--r--sys/netlink/route/iface.c1530
-rw-r--r--sys/netlink/route/iface_drivers.c145
-rw-r--r--sys/netlink/route/ifaddrs.h99
-rw-r--r--sys/netlink/route/interface.h266
-rw-r--r--sys/netlink/route/neigh.c601
-rw-r--r--sys/netlink/route/neigh.h111
-rw-r--r--sys/netlink/route/nexthop.c1123
-rw-r--r--sys/netlink/route/nexthop.h113
-rw-r--r--sys/netlink/route/route.h368
-rw-r--r--sys/netlink/route/route_var.h140
-rw-r--r--sys/netlink/route/rt.c1139
39 files changed, 13614 insertions, 0 deletions
diff --git a/sys/netlink/ktest_netlink_message_writer.c b/sys/netlink/ktest_netlink_message_writer.c
new file mode 100644
index 000000000000..805f52197f69
--- /dev/null
+++ b/sys/netlink/ktest_netlink_message_writer.c
@@ -0,0 +1,113 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <tests/ktest.h>
+#include <sys/cdefs.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+#include <netlink/netlink_message_writer.h>
+
+#define KTEST_CALLER
+#include <netlink/ktest_netlink_message_writer.h>
+
+#ifdef INVARIANTS
+/*
+ * Arguments of the writer allocation test.  The fields are filled from
+ * netlink attributes through the nla_p_nlbuf_w parser table.
+ */
+struct test_nlbuf_attrs {
+ uint32_t size; /* buffer size requested from nlmsg_get_buf_wrapper() */
+ uint32_t expected_avail; /* minimum buffer length the test accepts */
+ int waitok; /* forwarded as the waitok argument; parsed as a 32-bit attribute */
+};
+/*
+ * Attribute parser table: netlink attribute types 1, 2 and 3 are stored at
+ * the offsets of the size, expected_avail and waitok fields, each through
+ * the nlattr_get_uint32 callback.
+ */
+#define _OUT(_field) offsetof(struct test_nlbuf_attrs, _field)
+static const struct nlattr_parser nla_p_nlbuf_w[] = {
+ { .type = 1, .off = _OUT(size), .cb = nlattr_get_uint32 },
+ { .type = 2, .off = _OUT(expected_avail), .cb = nlattr_get_uint32 },
+ { .type = 3, .off = _OUT(waitok), .cb = nlattr_get_uint32 },
+};
+#undef _OUT
+NL_DECLARE_ATTR_PARSER(nlbuf_w_parser, nla_p_nlbuf_w);
+
+/*
+ * Allocate the test arguments from ctx->npt, publish them in ctx->arg and
+ * fill them from the nested attribute nla.
+ *
+ * Returns ENOMEM when the allocation fails (ctx->arg is then NULL),
+ * otherwise whatever nl_parse_nested() returns.
+ */
+static int
+test_nlbuf_parser(struct ktest_test_context *ctx, struct nlattr *nla)
+{
+	struct test_nlbuf_attrs *parsed;
+
+	parsed = npt_alloc(ctx->npt, sizeof(*parsed));
+	ctx->arg = parsed;
+	if (parsed == NULL)
+		return (ENOMEM);
+
+	return (nl_parse_nested(nla, &nlbuf_w_parser, ctx->npt, parsed));
+}
+
+/*
+ * Request a writer buffer of attrs->size bytes and verify that the whole
+ * reported buffer length can be reserved and that it is not smaller than
+ * attrs->expected_avail.
+ *
+ * ctx->arg must point to the struct test_nlbuf_attrs filled in by
+ * test_nlbuf_parser().
+ *
+ * Returns 0 on success and EINVAL when the buffer cannot be obtained, the
+ * reservation fails or the buffer is shorter than expected.  The buffer is
+ * released on every path taken after a successful allocation.
+ */
+static int
+test_nlbuf_writer_allocation(struct ktest_test_context *ctx)
+{
+	struct test_nlbuf_attrs *attrs = ctx->arg;
+	struct nl_writer nw = {};
+	u_int alloc_len;
+	int error = 0;
+
+	if (!nlmsg_get_buf_wrapper(&nw, attrs->size, attrs->waitok))
+		return (EINVAL);
+
+	alloc_len = nw.buf->buflen;
+	KTEST_LOG(ctx, "requested %u, allocated %u", attrs->size, alloc_len);
+
+	/* Mark enomem to avoid reallocation */
+	nw.enomem = true;
+
+	if (nlmsg_reserve_data(&nw, alloc_len, void *) == NULL) {
+		KTEST_LOG(ctx, "unable to get %u bytes from the writer",
+		    alloc_len);
+		error = EINVAL;
+	} else if (alloc_len < attrs->expected_avail) {
+		KTEST_LOG(ctx, "alloc_len %u, expected %u",
+		    alloc_len, attrs->expected_avail);
+		error = EINVAL;
+	}
+
+	/* Single release point: the failed-reservation path used to leak. */
+	nl_buf_free(nw.buf);
+
+	return (error);
+}
+#endif
+/*
+ * Tests exported by this module.  The table is empty unless INVARIANTS
+ * is defined.
+ */
+static const struct ktest_test_info tests[] = {
+#ifdef INVARIANTS
+ {
+ .name = "test_nlbuf_writer_allocation",
+ .desc = "test different buffer sizes in the netlink writer",
+ .func = &test_nlbuf_writer_allocation,
+ .parse = &test_nlbuf_parser,
+ },
+#endif
+};
+KTEST_MODULE_DECLARE(ktest_netlink_message_writer, tests);
diff --git a/sys/netlink/ktest_netlink_message_writer.h b/sys/netlink/ktest_netlink_message_writer.h
new file mode 100644
index 000000000000..447593e0e700
--- /dev/null
+++ b/sys/netlink/ktest_netlink_message_writer.h
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NETLINK_KTEST_NETLINK_MESSAGE_WRITER_H_
+#define _NETLINK_KTEST_NETLINK_MESSAGE_WRITER_H_
+
+#if defined(_KERNEL) && defined(INVARIANTS)
+/*
+ * Wrapper that forwards to nlmsg_get_buf().  A file that defines
+ * KTEST_CALLER before including this header gets only the prototype;
+ * any other includer also gets the non-static definition below.
+ */
+bool nlmsg_get_buf_wrapper(struct nl_writer *nw, size_t size, bool waitok);
+
+#ifndef KTEST_CALLER
+
+bool
+nlmsg_get_buf_wrapper(struct nl_writer *nw, size_t size, bool waitok)
+{
+ return (nlmsg_get_buf(nw, size, waitok));
+}
+#endif
+
+#endif
+
+#endif
diff --git a/sys/netlink/netlink.h b/sys/netlink/netlink.h
new file mode 100644
index 000000000000..2395726e7455
--- /dev/null
+++ b/sys/netlink/netlink.h
@@ -0,0 +1,263 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Copyright (C) The Internet Society (2003). All Rights Reserved.
+ *
+ * This document and translations of it may be copied and furnished to
+ * others, and derivative works that comment on or otherwise explain it
+ * or assist in its implementation may be prepared, copied, published
+ * and distributed, in whole or in part, without restriction of any
+ * kind, provided that the above copyright notice and this paragraph are
+ * included on all such copies and derivative works. However, this
+ * document itself may not be modified in any way, such as by removing
+ * the copyright notice or references to the Internet Society or other
+ * Internet organizations, except as needed for the purpose of
+ * developing Internet standards in which case the procedures for
+ * copyrights defined in the Internet Standards process must be
+ * followed, or as required to translate it into languages other than
+ * English.
+ *
+ * The limited permissions granted above are perpetual and will not be
+ * revoked by the Internet Society or its successors or assignees.
+ *
+ * This document and the information contained herein is provided on an
+ * "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
+ * TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
+ * HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+
+ */
+
+/*
+ * This file contains structures and constants for RFC 3549 (Netlink)
+ * protocol. Some values have been taken from Linux implementation.
+ */
+
+#ifndef _NETLINK_NETLINK_H_
+#define _NETLINK_NETLINK_H_
+
+#include <sys/param.h>
+#include <sys/socket.h>
+
+struct sockaddr_nl {
+ uint8_t nl_len; /* sizeof(sockaddr_nl) */
+ sa_family_t nl_family; /* netlink family */
+ uint16_t nl_pad; /* reserved, set to 0 */
+ uint32_t nl_pid; /* desired port ID, 0 for auto-select */
+ uint32_t nl_groups; /* multicast groups mask to bind to */
+};
+
+#define SOL_NETLINK 270
+
+/* Netlink socket options */
+#define NETLINK_ADD_MEMBERSHIP 1 /* Subscribe for the specified group notifications */
+#define NETLINK_DROP_MEMBERSHIP 2 /* Unsubscribe from the specified group */
+#define NETLINK_PKTINFO 3 /* XXX: not supported */
+#define NETLINK_BROADCAST_ERROR 4 /* XXX: not supported */
+#define NETLINK_NO_ENOBUFS 5 /* XXX: not supported */
+#define NETLINK_RX_RING 6 /* XXX: not supported */
+#define NETLINK_TX_RING 7 /* XXX: not supported */
+#define NETLINK_LISTEN_ALL_NSID 8 /* XXX: not supported */
+
+#define NETLINK_LIST_MEMBERSHIPS 9
+#define NETLINK_CAP_ACK 10 /* Send only original message header in the reply */
+#define NETLINK_EXT_ACK 11 /* Ack support for receiving additional TLVs in ack */
+#define NETLINK_GET_STRICT_CHK 12 /* Strict header checking */
+
+#define NETLINK_MSG_INFO 257 /* (FreeBSD-specific) Receive message originator data in cmsg */
+
+/*
+ * RFC 3549, 2.3.2 Netlink Message Header
+ */
+struct nlmsghdr {
+ uint32_t nlmsg_len; /* Length of message including header */
+ uint16_t nlmsg_type; /* Message type identifier */
+ uint16_t nlmsg_flags; /* Flags (NLM_F_) */
+ uint32_t nlmsg_seq; /* Sequence number */
+ uint32_t nlmsg_pid; /* Sending process port ID */
+};
+
+/*
+ * RFC 3549, 2.3.2 standard flag bits (nlmsg_flags)
+ */
+#define NLM_F_REQUEST 0x01 /* Indicates a request to the kernel */
+#define NLM_F_MULTI 0x02 /* Message is part of a group terminated by NLMSG_DONE msg */
+#define NLM_F_ACK 0x04 /* Reply with ack message containing resulting error code */
+#define NLM_F_ECHO 0x08 /* (not supported) Echo this request back */
+#define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */
+#define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */
+
+/*
+ * RFC 3549, 2.3.2 Additional flag bits for GET requests
+ */
+#define NLM_F_ROOT 0x100 /* Return the complete table */
+#define NLM_F_MATCH 0x200 /* Return all entries matching criteria */
+#define NLM_F_ATOMIC 0x400 /* Return an atomic snapshot (ignored) */
+#define NLM_F_DUMP (NLM_F_ROOT | NLM_F_MATCH) /* Both of the above combined */
+
+/*
+ * RFC 3549, 2.3.2 Additional flag bits for NEW requests
+ */
+#define NLM_F_REPLACE 0x100 /* Replace existing matching config object */
+#define NLM_F_EXCL 0x200 /* Don't replace the object if exists */
+#define NLM_F_CREATE 0x400 /* Create if it does not exist */
+#define NLM_F_APPEND 0x800 /* Add to end of list */
+
+/* Modifiers to DELETE requests */
+#define NLM_F_NONREC 0x100 /* Do not delete recursively */
+
+/* Flags for ACK message */
+#define NLM_F_CAPPED 0x100 /* request was capped */
+#define NLM_F_ACK_TLVS 0x200 /* extended ACK TLVs were included */
+
+/*
+ * RFC 3549, 2.3.2 standard message types (nlmsg_type).
+ */
+#define NLMSG_NOOP 0x1 /* Message is ignored. */
+#define NLMSG_ERROR 0x2 /* reply error code reporting */
+#define NLMSG_DONE 0x3 /* Message terminates a multipart message. */
+#define NLMSG_OVERRUN 0x4 /* overrun detected, data is lost */
+
+#define NLMSG_MIN_TYPE 0x10 /* < 0x10: reserved control messages */
+
+/*
+ * Definition of numbers assigned to the netlink subsystems.
+ */
+#define NETLINK_ROUTE 0 /* Routing/device hook */
+#define NETLINK_UNUSED 1 /* not supported */
+#define NETLINK_USERSOCK 2 /* not supported */
+#define NETLINK_FIREWALL 3 /* not supported */
+#define NETLINK_SOCK_DIAG 4 /* not supported */
+#define NETLINK_NFLOG 5 /* not supported */
+#define NETLINK_XFRM 6 /* (not supported) PF_SETKEY */
+#define NETLINK_SELINUX 7 /* not supported */
+#define NETLINK_ISCSI 8 /* not supported */
+#define NETLINK_AUDIT 9 /* not supported */
+#define NETLINK_FIB_LOOKUP 10 /* not supported */
+#define NETLINK_CONNECTOR 11 /* not supported */
+#define NETLINK_NETFILTER 12 /* not supported */
+#define NETLINK_IP6_FW 13 /* not supported */
+#define NETLINK_DNRTMSG 14 /* not supported */
+#define NETLINK_KOBJECT_UEVENT 15 /* not supported */
+#define NETLINK_GENERIC 16 /* Generic netlink (dynamic families) */
+
+/*
+ * RFC 3549, 2.3.2.2 The ACK Netlink Message
+ */
+struct nlmsgerr {
+ int error; /* resulting error code; NOTE(review): sign convention is not visible here */
+ struct nlmsghdr msg; /* presumably the header of the message being acknowledged -- TODO confirm */
+};
+
+enum nlmsgerr_attrs {
+ NLMSGERR_ATTR_UNUSED,
+ NLMSGERR_ATTR_MSG = 1, /* string, error message */
+ NLMSGERR_ATTR_OFFS = 2, /* u32, offset of the invalid attr from nl header */
+ NLMSGERR_ATTR_COOKIE = 3, /* binary, data to pass to userland */
+ NLMSGERR_ATTR_POLICY = 4, /* not supported */
+ __NLMSGERR_ATTR_MAX, /* one past the highest attribute value */
+ NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 /* highest attribute value */
+};
+
+/* FreeBSD-specific debugging info */
+
+enum nlmsginfo_attrs {
+ NLMSGINFO_ATTR_UNUSED,
+ NLMSGINFO_ATTR_PROCESS_ID = 1, /* u32, source process PID */
+ NLMSGINFO_ATTR_PORT_ID = 2, /* u32, source socket nl_pid */
+ NLMSGINFO_ATTR_SEQ_ID = 3, /* u32, source message seq_id */
+};
+
+
+/*
+ * Helpers shared by the message and attribute iteration macros.
+ *
+ * NL_ITEM_ALIGN rounds a length up to NL_ITEM_ALIGN_SIZE.
+ * NL_ITEM_DATA / NL_ITEM_DATA_CONST yield the address _off bytes past _ptr.
+ * NL_ITEM_OK is true when at least _hlen bytes remain in _len and the item
+ * length reported by _LEN_M(_ptr) is at least _hlen and at most _len.
+ * NL_ITEM_NEXT yields the address _LEN_M(_ptr) bytes past _ptr.
+ * NL_ITEM_ITER subtracts _LEN_MACRO(_ptr) from _len, which must therefore
+ * be a modifiable lvalue, and yields the next item.
+ *
+ * Macro arguments may be evaluated more than once.
+ */
+#define NL_ITEM_ALIGN_SIZE sizeof(uint32_t)
+#define NL_ITEM_ALIGN(_len) __align_up(_len, NL_ITEM_ALIGN_SIZE)
+#define NL_ITEM_DATA(_ptr, _off) ((void *)((char *)(_ptr) + (_off)))
+#define NL_ITEM_DATA_CONST(_ptr, _off) ((const void *)((const char *)(_ptr) + (_off)))
+
+#define NL_ITEM_OK(_ptr, _len, _hlen, _LEN_M) \
+ ((_len) >= (_hlen) && _LEN_M(_ptr) >= (_hlen) && _LEN_M(_ptr) <= (_len))
+#define NL_ITEM_NEXT(_ptr, _LEN_M) ((__typeof(_ptr))((char *)(_ptr) + _LEN_M(_ptr)))
+#define NL_ITEM_ITER(_ptr, _len, _LEN_MACRO) \
+ ((_len) -= _LEN_MACRO(_ptr), NL_ITEM_NEXT(_ptr, _LEN_MACRO))
+
+/* part of netlink(3) API */
+#define NLMSG_ALIGNTO NL_ITEM_ALIGN_SIZE
+#define NLMSG_ALIGN(_len) NL_ITEM_ALIGN(_len)
+
+#ifndef _KERNEL
+/*
+ * part of netlink(3) API (userland only)
+ *
+ * NLMSG_LENGTH: header plus _len payload bytes, not aligned.
+ * NLMSG_SPACE: the same, rounded up to NLMSG_ALIGNTO.
+ * NLMSG_DATA: address of the payload following the header.
+ * NLMSG_OK: true when a complete message starts at _hdr within the
+ * remaining _len bytes.
+ * NLMSG_PAYLOAD: nlmsg_len minus NLMSG_SPACE(_len).
+ * NLMSG_NEXT: subtracts the aligned message length from _len and yields
+ * the next header.
+ *
+ * NLMSG_NEXT can drive a signed _len below zero when the last message is
+ * not padded to NLMSG_ALIGNTO.  NLMSG_OK therefore compares _len against
+ * an int header length, so that a negative remainder is rejected instead
+ * of being converted to a huge unsigned value.
+ */
+#define NLMSG_HDRLEN (sizeof(struct nlmsghdr))
+#define NLMSG_LENGTH(_len) ((_len) + NLMSG_HDRLEN)
+#define NLMSG_SPACE(_len) NLMSG_ALIGN(NLMSG_LENGTH(_len))
+#define NLMSG_DATA(_hdr) NL_ITEM_DATA(_hdr, NLMSG_HDRLEN)
+#define _NLMSG_LEN(_hdr) ((_hdr)->nlmsg_len)
+#define _NLMSG_ALIGNED_LEN(_hdr) NLMSG_ALIGN(_NLMSG_LEN(_hdr))
+#define NLMSG_OK(_hdr, _len) \
+ NL_ITEM_OK(_hdr, _len, ((int)NLMSG_HDRLEN), _NLMSG_LEN)
+#define NLMSG_PAYLOAD(_hdr,_len) (_NLMSG_LEN(_hdr) - NLMSG_SPACE((_len)))
+#define NLMSG_NEXT(_hdr, _len) NL_ITEM_ITER(_hdr, _len, _NLMSG_ALIGNED_LEN)
+
+#else
+/* The kernel only needs the header length, rounded up to NLMSG_ALIGNTO */
+#define NLMSG_HDRLEN (NLMSG_ALIGN(sizeof(struct nlmsghdr)))
+#endif
+
+/*
+ * Base netlink attribute TLV header.
+ */
+struct nlattr {
+ uint16_t nla_len; /* Total attribute length, including this header */
+ uint16_t nla_type; /* Attribute type, possibly ORed with NLA_F_* flag bits */
+};
+
+/*
+ *
+ * nla_type field encoding:
+ *
+ *  0                   1
+ *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |N|O|      Attribute type       |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * N - attribute contains other attributes (mostly unused)
+ * O - encoded in network byte order (mostly unused)
+ * Note: N & O are mutually exclusive
+ *
+ * Note: the scope of an attribute type value is normally either the
+ * parent attribute or the message/message group.
+ */
+
+#define NLA_F_NESTED (1 << 15) /* "N" bit: attribute contains other attributes */
+#define NLA_F_NET_BYTEORDER (1 << 14) /* "O" bit: encoded in network byte order */
+#define NLA_TYPE_MASK ~(NLA_F_NESTED | NLA_F_NET_BYTEORDER) /* every bit except the two flags */
+
+#ifndef _KERNEL
+#define NLA_ALIGNTO NL_ITEM_ALIGN_SIZE
+#define NLA_ALIGN(_len) NL_ITEM_ALIGN(_len)
+#define NLA_HDRLEN ((int)sizeof(struct nlattr))
+#endif
+
+#endif
diff --git a/sys/netlink/netlink_bitset.h b/sys/netlink/netlink_bitset.h
new file mode 100644
index 000000000000..9a918bd20997
--- /dev/null
+++ b/sys/netlink/netlink_bitset.h
@@ -0,0 +1,57 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Netlink bitset nested attribute definitions
+ */
+#ifndef _NETLINK_NETLINK_BITSET_H_
+#define _NETLINK_NETLINK_BITSET_H_
+
+#include <netlink/netlink.h>
+
+/* Bitset type nested attributes */
+enum {
+ NLA_BITSET_UNSPEC,
+ NLA_BITSET_NOMASK = 1, /* flag: mask of valid bits not provided */
+ NLA_BITSET_SIZE = 2, /* u32: max valid bit # */
+ NLA_BITSET_BITS = 3, /* nested: array of NLA_BITSET_BIT */
+ NLA_BITSET_VALUE = 4, /* binary: array of bit values */
+ NLA_BITSET_MASK = 5, /* binary: array of valid bits */
+ __NLA_BITSET_MAX, /* one past the highest attribute value */
+};
+#define NLA_BITSET_MAX (__NLA_BITSET_MAX - 1) /* highest attribute value */
+
+enum {
+ NLA_BITSET_BIT_UNSPEC,
+ NLA_BITSET_BIT_INDEX = 1, /* u32: index of the bit */
+ NLA_BITSET_BIT_NAME = 2, /* string: bit description */
+ NLA_BITSET_BIT_VALUE = 3, /* flag: provided if bit is set */
+ __NLA_BITSET_BIT_MAX, /* one past the highest attribute value */
+};
+#define NLA_BITSET_BIT_MAX (__NLA_BITSET_BIT_MAX - 1) /* highest attribute value */
+
+#endif
diff --git a/sys/netlink/netlink_ctl.h b/sys/netlink/netlink_ctl.h
new file mode 100644
index 000000000000..7f43e0f2c25e
--- /dev/null
+++ b/sys/netlink/netlink_ctl.h
@@ -0,0 +1,124 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NETLINK_NETLINK_CTL_H_
+#define _NETLINK_NETLINK_CTL_H_
+
+#ifdef _KERNEL
+/*
+ * This file provides headers for the public KPI of the netlink
+ * subsystem
+ */
+#include <sys/_eventhandler.h>
+
+MALLOC_DECLARE(M_NETLINK);
+
+/*
+ * Macro for handling attribute TLVs
+ *
+ * _roundup2() rounds x up to a multiple of y; the mask arithmetic is
+ * only correct when y is a power of two.
+ */
+#define _roundup2(x, y) (((x)+((y)-1))&(~((y)-1)))
+
+#define NETLINK_ALIGN_SIZE sizeof(uint32_t)
+#define NETLINK_ALIGN(_len) _roundup2(_len, NETLINK_ALIGN_SIZE)
+
+#define NLA_ALIGN_SIZE sizeof(uint32_t)
+#define NLA_ALIGN(_len) _roundup2(_len, NLA_ALIGN_SIZE)
+#define NLA_HDRLEN ((uint16_t)sizeof(struct nlattr))
+#define NLA_DATA_LEN(_nla) ((_nla)->nla_len - NLA_HDRLEN) /* payload length; negative if nla_len < NLA_HDRLEN */
+#define NLA_DATA(_nla) NL_ITEM_DATA(_nla, NLA_HDRLEN) /* address of the payload after the header */
+#define NLA_DATA_CONST(_nla) NL_ITEM_DATA_CONST(_nla, NLA_HDRLEN)
+#define NLA_TYPE(_nla) ((_nla)->nla_type & 0x3FFF) /* low 14 bits: type without bits 14 and 15 */
+
+#ifndef typeof
+#define typeof __typeof
+#endif
+
+/*
+ * NLA_NEXT yields the address of the attribute that follows _attr, i.e.
+ * _attr advanced by its nla_len rounded up with NLA_ALIGN.  The expansion
+ * and its argument are fully parenthesized so that the macro can be used
+ * inside larger expressions.
+ *
+ * NLA_FOREACH walks the attributes stored in the _len bytes at _start.
+ *  - The loop variable is a new declaration scoped to the loop; it shadows
+ *    the caller's _attr, which is used only for its type and is left
+ *    unmodified.  A loop-scoped variable named _end is introduced as well.
+ *  - _len must be a modifiable lvalue: it is decremented by the aligned
+ *    length of every attribute that is stepped over.
+ *  - NOTE(review): the loop condition reads _attr->nla_len as soon as one
+ *    byte remains before _end, and an attribute with nla_len == 0 does not
+ *    advance the iterator.  The loop body is presumably expected to reject
+ *    attributes shorter than NLA_HDRLEN -- confirm against the callers.
+ */
+#define NLA_NEXT(_attr) \
+ ((struct nlattr *)((char *)(_attr) + NLA_ALIGN((_attr)->nla_len)))
+#define _NLA_END(_start, _len) ((char *)(_start) + (_len))
+#define NLA_FOREACH(_attr, _start, _len) \
+ for (typeof(_attr) _end = (typeof(_attr))_NLA_END(_start, _len), _attr = (_start); \
+ ((char *)_attr < (char *)_end) && \
+ ((char *)NLA_NEXT(_attr) <= (char *)_end); \
+ _attr = (_len -= NLA_ALIGN(_attr->nla_len), NLA_NEXT(_attr)))
+
+#include <netlink/netlink_message_writer.h>
+#include <netlink/netlink_message_parser.h>
+
+
+/* Protocol handlers */
+struct nl_pstate;
+typedef int (*nl_handler_f)(struct nlmsghdr *hdr, struct nl_pstate *npt);
+
+bool netlink_register_proto(int proto, const char *proto_name, nl_handler_f handler);
+bool netlink_unregister_proto(int proto);
+
+/* Common helpers */
+bool nlp_has_priv(struct nlpcb *nlp, int priv);
+struct ucred *nlp_get_cred(struct nlpcb *nlp);
+uint32_t nlp_get_pid(const struct nlpcb *nlp);
+bool nlp_unconstrained_vnet(const struct nlpcb *nlp);
+
+/* netlink_generic.c */
+struct genl_cmd {
+ const char *cmd_name; /* command name */
+ nl_handler_f cmd_cb; /* message handler callback for the command */
+ uint32_t cmd_flags; /* NOTE(review): flag values are not defined in this header */
+ uint32_t cmd_priv; /* presumably the privilege required by the command -- TODO confirm */
+ uint32_t cmd_num; /* presumably the numeric command identifier -- TODO confirm */
+};
+
+uint16_t genl_register_family(const char *family_name, size_t hdrsize,
+ uint16_t family_version, uint16_t max_attr_idx);
+void genl_unregister_family(uint16_t family);
+bool genl_register_cmds(uint16_t family, const struct genl_cmd *cmds,
+ u_int count);
+uint32_t genl_register_group(uint16_t family, const char *group_name);
+void genl_unregister_group(uint16_t family, uint32_t group);
+
+typedef void (*genl_family_event_handler_t)(void *arg, const char *family_name,
+ uint16_t family_id, u_int action);
+EVENTHANDLER_DECLARE(genl_family_event, genl_family_event_handler_t);
+
+struct thread;
+#if defined(NETLINK) || defined(NETLINK_MODULE)
+/* Provide optimized calls to the functions inside the same linking unit */
+struct nlpcb *_nl_get_thread_nlp(struct thread *td);
+/* Inline forwarder: calls _nl_get_thread_nlp() directly. */
+static inline struct nlpcb *
+nl_get_thread_nlp(struct thread *td)
+{
+ return (_nl_get_thread_nlp(td));
+}
+
+#else
+/* Provide access to the functions via netlink_glue.c */
+struct nlpcb *nl_get_thread_nlp(struct thread *td);
+
+#endif /* defined(NETLINK) || defined(NETLINK_MODULE) */
+
+#endif
+#endif
diff --git a/sys/netlink/netlink_debug.h b/sys/netlink/netlink_debug.h
new file mode 100644
index 000000000000..db987b26b6d7
--- /dev/null
+++ b/sys/netlink/netlink_debug.h
@@ -0,0 +1,85 @@
+/*-
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NETLINK_NETLINK_DEBUG_H_
+#define _NETLINK_NETLINK_DEBUG_H_
+
+#ifdef _KERNEL
+
+#define _DEBUG_SYSCTL_OID _net_netlink_debug
+#include <net/route/route_debug.h>
+
+SYSCTL_DECL(_net_netlink_debug);
+
+/*
+ * Generic debug
+ * [nl_domain] func_name: debug text
+ */
+#define NL_LOG RT_LOG
+
+/*
+ * Logging for events specific for particular process
+ * Example: [nl_domain] PID 4834 fdump_sa: unsupported family: 45
+ *
+ * NL_RAW_PID_LOG() and NLP_LOG() paste the level argument onto
+ * NL_RAW_PID_LOG_ to select a per-level macro, so _l must be spelled as
+ * a literal LOG_* token.  NLP_LOG() obtains the PID via nlp_get_pid().
+ *
+ * NOTE(review): _NL_RAW_PID_LOG expands to a bare if-block rather than
+ * do { } while (0); used as the body of an unbraced if that has an else
+ * branch, it can capture that else.
+ */
+#define NL_RAW_PID_LOG(_l, _pid, _fmt, ...) \
+ NL_RAW_PID_LOG_##_l(_l, _pid, _fmt, ## __VA_ARGS__)
+#define _NL_RAW_PID_LOG(_l, _pid, _fmt, ...) \
+ if (_DEBUG_PASS_MSG(_l)) { \
+ _output("[" DEBUG_PREFIX_NAME "] PID %u %s: " _fmt "\n", _pid, \
+ __func__, ##__VA_ARGS__); \
+ }
+
+#define NLP_LOG(_l, _nlp, _fmt, ...) \
+ NL_RAW_PID_LOG_##_l(_l, nlp_get_pid(_nlp), _fmt, ## __VA_ARGS__)
+/*
+ * Per-level dispatch targets.  INFO and the three DEBUG levels expand to
+ * nothing when DEBUG_MAX_LEVEL is below them, so their arguments are not
+ * evaluated; NOTICE, ERR and WARNING always map to _NL_RAW_PID_LOG.
+ */
+#if DEBUG_MAX_LEVEL>=LOG_DEBUG3
+#define NL_RAW_PID_LOG_LOG_DEBUG3 _NL_RAW_PID_LOG
+#else
+#define NL_RAW_PID_LOG_LOG_DEBUG3(_l, _pid, _fmt, ...)
+#endif
+#if DEBUG_MAX_LEVEL>=LOG_DEBUG2
+#define NL_RAW_PID_LOG_LOG_DEBUG2 _NL_RAW_PID_LOG
+#else
+#define NL_RAW_PID_LOG_LOG_DEBUG2(_l, _pid, _fmt, ...)
+#endif
+#if DEBUG_MAX_LEVEL>=LOG_DEBUG
+#define NL_RAW_PID_LOG_LOG_DEBUG _NL_RAW_PID_LOG
+#else
+#define NL_RAW_PID_LOG_LOG_DEBUG(_l, _pid, _fmt, ...)
+#endif
+#if DEBUG_MAX_LEVEL>=LOG_INFO
+#define NL_RAW_PID_LOG_LOG_INFO _NL_RAW_PID_LOG
+#else
+#define NL_RAW_PID_LOG_LOG_INFO(_l, _pid, _fmt, ...)
+#endif
+#define NL_RAW_PID_LOG_LOG_NOTICE _NL_RAW_PID_LOG
+#define NL_RAW_PID_LOG_LOG_ERR _NL_RAW_PID_LOG
+#define NL_RAW_PID_LOG_LOG_WARNING _NL_RAW_PID_LOG
+
+#endif /* _KERNEL */
+#endif /* !_NETLINK_NETLINK_DEBUG_H_ */
diff --git a/sys/netlink/netlink_domain.c b/sys/netlink/netlink_domain.c
new file mode 100644
index 000000000000..74b46114716e
--- /dev/null
+++ b/sys/netlink/netlink_domain.c
@@ -0,0 +1,1002 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ * Copyright (c) 2023 Gleb Smirnoff <glebius@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This file contains socket and protocol bindings for netlink.
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/domain.h>
+#include <sys/jail.h>
+#include <sys/mbuf.h>
+#include <sys/osd.h>
+#include <sys/protosw.h>
+#include <sys/proc.h>
+#include <sys/ck.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/priv.h>
+#include <sys/uio.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+
+#define DEBUG_MOD_NAME nl_domain
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_INFO);
+
+/*
+ * Compile-time sanity checks on NLP_MAX_GROUPS, the size of the per-socket
+ * group membership bitset (nl_groups) manipulated below.
+ */
+_Static_assert((NLP_MAX_GROUPS % 64) == 0,
+ "NLP_MAX_GROUPS has to be multiple of 64");
+_Static_assert(NLP_MAX_GROUPS >= 64,
+ "NLP_MAX_GROUPS has to be at least 64");
+
+/*
+ * Wrappers around the rmlock V_nl_ctl.ctl_lock.
+ *
+ * The read side needs a struct rm_priotracker named nl_tracker in the
+ * calling function: declare it with NLCTL_TRACKER before using
+ * NLCTL_RLOCK()/NLCTL_RUNLOCK().
+ */
+#define NLCTL_TRACKER struct rm_priotracker nl_tracker
+#define NLCTL_RLOCK() rm_rlock(&V_nl_ctl.ctl_lock, &nl_tracker)
+#define NLCTL_RUNLOCK() rm_runlock(&V_nl_ctl.ctl_lock, &nl_tracker)
+#define NLCTL_LOCK_ASSERT() rm_assert(&V_nl_ctl.ctl_lock, RA_LOCKED)
+
+/* Write side: exclusive access, no tracker required. */
+#define NLCTL_WLOCK() rm_wlock(&V_nl_ctl.ctl_lock)
+#define NLCTL_WUNLOCK() rm_wunlock(&V_nl_ctl.ctl_lock)
+#define NLCTL_WLOCK_ASSERT() rm_assert(&V_nl_ctl.ctl_lock, RA_WLOCKED)
+
+/* Send buffer size passed to soreserve() when a socket is attached. */
+static u_long nl_sendspace = NLSNDQ;
+SYSCTL_ULONG(_net_netlink, OID_AUTO, sendspace, CTLFLAG_RW, &nl_sendspace, 0,
+ "Default netlink socket send space");
+
+/*
+ * Receive buffer size passed to soreserve() when a socket is attached.
+ * NOTE(review): initialized from NLSNDQ, the same constant as the send
+ * space -- confirm this is intended.
+ */
+static u_long nl_recvspace = NLSNDQ;
+SYSCTL_ULONG(_net_netlink, OID_AUTO, recvspace, CTLFLAG_RW, &nl_recvspace, 0,
+ "Default netlink socket receive space");
+
+/*
+ * nl_maxsockbuf is the limit handed to sbreserve_locked_limit() in
+ * nl_setsbopt().  Writes go through sysctl_handle_nl_maxsockbuf(), which
+ * rejects values below MSIZE + MCLBYTES.
+ */
+extern u_long sb_max_adj;
+static u_long nl_maxsockbuf = 512 * 1024 * 1024; /* 512M, XXX: init based on physmem */
+static int sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS);
+SYSCTL_OID(_net_netlink, OID_AUTO, nl_maxsockbuf,
+ CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, &nl_maxsockbuf, 0,
+ sysctl_handle_nl_maxsockbuf, "LU",
+ "Maximum Netlink socket buffer size");
+
+
+/* Per-thread OSD slot used to attach a struct nlpcb pointer to a thread. */
+static unsigned int osd_slot_id = 0;
+
+/*
+ * Registers the per-thread OSD slot (no destructor, no methods) and
+ * remembers its id in osd_slot_id.
+ */
+void
+nl_osd_register(void)
+{
+ osd_slot_id = osd_register(OSD_THREAD, NULL, NULL);
+}
+
+/* Releases the per-thread OSD slot obtained by nl_osd_register(). */
+void
+nl_osd_unregister(void)
+{
+ osd_deregister(OSD_THREAD, osd_slot_id);
+}
+
+/*
+ * Returns the value stored in the netlink OSD slot of thread @td,
+ * i.e. whatever was last stored there by nl_set_thread_nlp().
+ */
+struct nlpcb *
+_nl_get_thread_nlp(struct thread *td)
+{
+ return (osd_get(OSD_THREAD, &td->td_osd, osd_slot_id));
+}
+
+/*
+ * Stores @nlp in the netlink OSD slot of thread @td.
+ * If the plain osd_set() attempt fails, storage is obtained with
+ * osd_reserve() and the value is stored via osd_set_reserved().
+ */
+void
+nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp)
+{
+	void **reserved;
+
+	NLP_LOG(LOG_DEBUG2, nlp, "Set thread %p nlp to %p (slot %u)", td, nlp, osd_slot_id);
+	if (osd_set(OSD_THREAD, &td->td_osd, osd_slot_id, nlp) != 0) {
+		/* Fast path failed: reserve the slot storage and retry. */
+		reserved = osd_reserve(osd_slot_id);
+		osd_set_reserved(OSD_THREAD, &td->td_osd, osd_slot_id,
+		    reserved, nlp);
+	}
+}
+
+/*
+ * Searches the list of bound ports (V_nl_ctl.ctl_port_head) for a pcb
+ * whose nl_port equals @port_id.  Callers in this file hold
+ * V_nl_ctl.ctl_lock (read or write) around the call.
+ * Returns the matching nlpcb, or NULL if the port is not in use.
+ */
+static struct nlpcb *
+nl_port_lookup(uint32_t port_id)
+{
+	struct nlpcb *cur;
+	struct nlpcb *found = NULL;
+
+	CK_LIST_FOREACH(cur, &V_nl_ctl.ctl_port_head, nl_port_next) {
+		if (cur->nl_port == port_id) {
+			found = cur;
+			break;
+		}
+	}
+	return (found);
+}
+
+/*
+ * Subscribes @nlp to multicast group @group_id by setting its bit in
+ * nl_groups.  Requires the control write lock.  The request is silently
+ * ignored when nlp_unconstrained_vnet() reports false for the socket.
+ */
+static void
+nlp_join_group(struct nlpcb *nlp, unsigned int group_id)
+{
+	MPASS(group_id < NLP_MAX_GROUPS);
+	NLCTL_WLOCK_ASSERT();
+
+	/* TODO: add family handler callback */
+	if (nlp_unconstrained_vnet(nlp))
+		BIT_SET(NLP_MAX_GROUPS, group_id, &nlp->nl_groups);
+}
+
+/*
+ * Unsubscribes @nlp from multicast group @group_id by clearing its bit in
+ * nl_groups.  Requires the control write lock.
+ */
+static void
+nlp_leave_group(struct nlpcb *nlp, unsigned int group_id)
+{
+ MPASS(group_id < NLP_MAX_GROUPS);
+ NLCTL_WLOCK_ASSERT();
+
+ BIT_CLR(NLP_MAX_GROUPS, group_id, &nlp->nl_groups);
+}
+
+/*
+ * Returns true if the bit for @group_id is set in @nlp's nl_groups.
+ * Requires the control lock to be held (read or write).
+ */
+static bool
+nlp_memberof_group(struct nlpcb *nlp, unsigned int group_id)
+{
+ MPASS(group_id < NLP_MAX_GROUPS);
+ NLCTL_LOCK_ASSERT();
+
+ return (BIT_ISSET(NLP_MAX_GROUPS, group_id, &nlp->nl_groups));
+}
+
+/*
+ * Builds the legacy 32-bit group mask for @nlp: bit i of the result is
+ * set when the socket is a member of group i + 1 (groups 1..32).
+ * Requires the control lock to be held (read or write).
+ */
+static uint32_t
+nlp_get_groups_compat(struct nlpcb *nlp)
+{
+	uint32_t groups_mask = 0;
+
+	NLCTL_LOCK_ASSERT();
+
+	for (unsigned int i = 0; i < 32; i++) {
+		/*
+		 * Shift an unsigned 1: shifting a signed int into bit 31
+		 * is undefined behaviour.
+		 */
+		if (nlp_memberof_group(nlp, i + 1))
+			groups_mask |= (uint32_t)1 << i;
+	}
+
+	return (groups_mask);
+}
+
+/*
+ * Allocates (without sleeping) a new nl_buf with the same buflen as @nb and
+ * copies the whole source into it: the struct nl_buf header followed by
+ * buflen bytes.  Returns NULL if the allocation fails.
+ */
+static struct nl_buf *
+nl_buf_copy(struct nl_buf *nb)
+{
+ struct nl_buf *copy;
+
+ copy = nl_buf_alloc(nb->buflen, M_NOWAIT);
+ if (__predict_false(copy == NULL))
+ return (NULL);
+ /* Header fields (including queue linkage) are copied verbatim. */
+ memcpy(copy, nb, sizeof(*nb) + nb->buflen);
+
+ return (copy);
+}
+
+/*
+ * Broadcasts in the writer's buffer.
+ *
+ * Takes ownership of nw->buf and delivers it to every socket that
+ *  - passes the optional privilege check (nw->group.priv),
+ *  - uses protocol nw->group.proto, and
+ *  - is a member of group nw->group.id.
+ * Delivery is deferred by one match: every matching socket except the last
+ * gets a copy of the buffer, and the last one gets the original.  If no
+ * socket matches, the buffer is freed.
+ * Always returns true.
+ */
+bool
+nl_send_group(struct nl_writer *nw)
+{
+ struct nl_buf *nb = nw->buf;
+ struct nlpcb *nlp_last = NULL;
+ struct nlpcb *nlp;
+ NLCTL_TRACKER;
+
+ IF_DEBUG_LEVEL(LOG_DEBUG2) {
+ struct nlmsghdr *hdr = (struct nlmsghdr *)nb->data;
+ NL_LOG(LOG_DEBUG2, "MCAST len %u msg type %d len %u to group %d/%d",
+ nb->datalen, hdr->nlmsg_type, hdr->nlmsg_len,
+ nw->group.proto, nw->group.id);
+ }
+
+ /* The buffer is now owned by this function, not by the writer. */
+ nw->buf = NULL;
+
+ NLCTL_RLOCK();
+ CK_LIST_FOREACH(nlp, &V_nl_ctl.ctl_pcb_head, nl_next) {
+ if ((nw->group.priv == 0 || priv_check_cred(
+ nlp->nl_socket->so_cred, nw->group.priv) == 0) &&
+ nlp->nl_proto == nw->group.proto &&
+ nlp_memberof_group(nlp, nw->group.id)) {
+ /* Flush the previous match with a private copy. */
+ if (nlp_last != NULL) {
+ struct nl_buf *copy;
+
+ copy = nl_buf_copy(nb);
+ if (copy != NULL) {
+ nw->buf = copy;
+ (void)nl_send(nw, nlp_last);
+ } else {
+ /*
+ * No memory for a copy: the message is not
+ * delivered to nlp_last, only its socket's
+ * receiver is woken up.
+ */
+ NLP_LOCK(nlp_last);
+ if (nlp_last->nl_socket != NULL)
+ sorwakeup(nlp_last->nl_socket);
+ NLP_UNLOCK(nlp_last);
+ }
+ }
+ nlp_last = nlp;
+ }
+ }
+ /* The final match (if any) receives the original buffer. */
+ if (nlp_last != NULL) {
+ nw->buf = nb;
+ (void)nl_send(nw, nlp_last);
+ } else
+ nl_buf_free(nb);
+
+ NLCTL_RUNLOCK();
+
+ return (true);
+}
+
+/*
+ * Removes every socket on the pcb list from multicast group @group.
+ * Takes the control write lock for the duration of the walk.
+ */
+void
+nl_clear_group(u_int group)
+{
+	struct nlpcb *pcb;
+
+	NLCTL_WLOCK();
+	CK_LIST_FOREACH(pcb, &V_nl_ctl.ctl_pcb_head, nl_next) {
+		if (!nlp_memberof_group(pcb, group))
+			continue;
+		nlp_leave_group(pcb, group);
+	}
+	NLCTL_WUNLOCK();
+}
+
+/*
+ * Picks a port id for the calling process.
+ *
+ * Returns the current pid if no socket is bound to it yet; otherwise makes
+ * up to 16 attempts at a random port.  If every attempt collides, the pid
+ * is returned anyway and it is up to the caller to detect that the port is
+ * already in use.
+ */
+static uint32_t
+nl_find_port(void)
+{
+	/*
+	 * app can open multiple netlink sockets.
+	 * Start with current pid, if already taken,
+	 * try random numbers in the 256k..256k+64k space,
+	 * avoiding clash with pids.
+	 */
+	if (nl_port_lookup(curproc->p_pid) == NULL)
+		return (curproc->p_pid);
+	for (int i = 0; i < 16; i++) {
+		uint32_t nl_port = (arc4random() % 65536) + 65536 * 4;
+		if (nl_port_lookup(nl_port) == NULL)
+			return (nl_port);
+		/* No trailing newline: the log macros append one. */
+		NL_LOG(LOG_DEBUG3, "tried %u", nl_port);
+	}
+	return (curproc->p_pid);
+}
+
+/*
+ * Binds @nlp to the port and groups requested in @snl.
+ * Callers in this file hold both the control write lock and the pcb lock.
+ *
+ * - Already bound socket: the requested port must equal the current one,
+ *   otherwise EINVAL is returned.
+ * - Unbound socket: a zero nl_pid is replaced by nl_find_port() (and
+ *   written back to @snl); EADDRINUSE is returned if the port is taken,
+ *   otherwise the pcb is linked into the port list.
+ * On success the membership of groups 1..32 is set to match the
+ * snl->nl_groups bitmask exactly and 0 is returned.
+ */
+static int
+nl_bind_locked(struct nlpcb *nlp, struct sockaddr_nl *snl)
+{
+	if (nlp->nl_bound) {
+		if (nlp->nl_port != snl->nl_pid) {
+			/* Port ids are unsigned 32-bit values: print as %u. */
+			NL_LOG(LOG_DEBUG,
+			    "bind() failed: program pid %u "
+			    "is different from provided pid %u",
+			    nlp->nl_port, snl->nl_pid);
+			return (EINVAL); // XXX: better error
+		}
+	} else {
+		if (snl->nl_pid == 0)
+			snl->nl_pid = nl_find_port();
+		if (nl_port_lookup(snl->nl_pid) != NULL)
+			return (EADDRINUSE);
+		nlp->nl_port = snl->nl_pid;
+		nlp->nl_bound = true;
+		CK_LIST_INSERT_HEAD(&V_nl_ctl.ctl_port_head, nlp, nl_port_next);
+	}
+	for (int i = 0; i < 32; i++) {
+		if (snl->nl_groups & ((uint32_t)1 << i))
+			nlp_join_group(nlp, i + 1);
+		else
+			nlp_leave_group(nlp, i + 1);
+	}
+
+	return (0);
+}
+
+/*
+ * pr_attach handler: creates the netlink pcb for a new socket.
+ *
+ * Fails with EAFNOSUPPORT while the module is unloading and with the
+ * error from nl_verify_proto() for an unacceptable @proto.  Otherwise
+ * reserves socket buffer space, allocates and initializes the nlpcb,
+ * starts a dedicated single-thread taskqueue for the socket, links the pcb
+ * into the global pcb list and marks the socket connected.
+ */
+static int
+nl_attach(struct socket *so, int proto, struct thread *td)
+{
+ struct nlpcb *nlp;
+ int error;
+
+ if (__predict_false(netlink_unloading != 0))
+ return (EAFNOSUPPORT);
+
+ error = nl_verify_proto(proto);
+ if (error != 0)
+ return (error);
+
+ bool is_linux = SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX;
+ NL_LOG(LOG_DEBUG2, "socket %p, %sPID %d: attaching socket to %s",
+ so, is_linux ? "(linux) " : "", curproc->p_pid,
+ nl_get_proto_name(proto));
+
+ nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO);
+ error = soreserve(so, nl_sendspace, nl_recvspace);
+ if (error != 0) {
+ free(nlp, M_PCB);
+ return (error);
+ }
+ TAILQ_INIT(&so->so_rcv.nl_queue);
+ TAILQ_INIT(&so->so_snd.nl_queue);
+ so->so_pcb = nlp;
+ nlp->nl_socket = so;
+ nlp->nl_proto = proto;
+ nlp->nl_process_id = curproc->p_pid;
+ nlp->nl_linux = is_linux;
+ nlp->nl_unconstrained_vnet = !jailed_without_vnet(so->so_cred);
+ nlp->nl_need_thread_setup = true;
+ NLP_LOCK_INIT(nlp);
+ refcount_init(&nlp->nl_refcount, 1);
+
+ /* One worker thread per socket runs nl_taskqueue_handler(). */
+ nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK,
+ taskqueue_thread_enqueue, &nlp->nl_taskqueue);
+ TASK_INIT(&nlp->nl_task, 0, nl_taskqueue_handler, nlp);
+ taskqueue_start_threads(&nlp->nl_taskqueue, 1, PWAIT,
+ "netlink_socket (PID %u)", nlp->nl_process_id);
+
+ /* Publish the pcb: from here on it is visible to group broadcasts. */
+ NLCTL_WLOCK();
+ CK_LIST_INSERT_HEAD(&V_nl_ctl.ctl_pcb_head, nlp, nl_next);
+ NLCTL_WUNLOCK();
+
+ soisconnected(so);
+
+ return (0);
+}
+
+/*
+ * pr_bind handler.  Rejects addresses whose nl_len is not
+ * sizeof(struct sockaddr_nl) with EINVAL; otherwise performs the bind via
+ * nl_bind_locked() with the control write lock and the pcb lock held and
+ * returns its result.
+ */
+static int
+nl_bind(struct socket *so, struct sockaddr *sa, struct thread *td)
+{
+	struct sockaddr_nl *snl;
+	struct nlpcb *nlp;
+	int error;
+
+	nlp = sotonlpcb(so);
+	snl = (struct sockaddr_nl *)sa;
+
+	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+	if (snl->nl_len != sizeof(*snl)) {
+		NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so);
+		return (EINVAL);
+	}
+
+	NLCTL_WLOCK();
+	NLP_LOCK(nlp);
+	error = nl_bind_locked(nlp, snl);
+	NLP_UNLOCK(nlp);
+	NLCTL_WUNLOCK();
+
+	NL_LOG(LOG_DEBUG2, "socket %p, bind() to %u, groups %u, error %d", so,
+	    snl->nl_pid, snl->nl_groups, error);
+
+	return (error);
+}
+
+
+/*
+ * Binds @nlp to @port_id while keeping its current membership in groups
+ * 1..32: the existing membership is read back as a compat mask and passed
+ * to nl_bind_locked() together with the port.
+ * Returns the result of nl_bind_locked().
+ */
+static int
+nl_assign_port(struct nlpcb *nlp, uint32_t port_id)
+{
+	struct sockaddr_nl addr = { .nl_pid = port_id };
+	int rc;
+
+	NLCTL_WLOCK();
+	NLP_LOCK(nlp);
+	addr.nl_groups = nlp_get_groups_compat(nlp);
+	rc = nl_bind_locked(nlp, &addr);
+	NLP_UNLOCK(nlp);
+	NLCTL_WUNLOCK();
+
+	NL_LOG(LOG_DEBUG3, "socket %p, port assign: %d, error: %d", nlp->nl_socket, port_id, rc);
+	return (rc);
+}
+
+/*
+ * nl_autobind_port binds an unused port id to @nlp.
+ * @nlp: pcb data for the netlink socket
+ * @candidate_id: first id to consider
+ *
+ * Up to 10 consecutive ids starting at @candidate_id are probed.  An id
+ * that looks free is handed to nl_assign_port(); any result other than
+ * EADDRINUSE (success included) ends the search.
+ * Returns 0 on success, EADDRINUSE if all candidates were taken, or the
+ * error reported by nl_assign_port().
+ */
+static int
+nl_autobind_port(struct nlpcb *nlp, uint32_t candidate_id)
+{
+	NLCTL_TRACKER;
+	uint32_t port_id = candidate_id;
+	int error = EADDRINUSE;
+	bool taken;
+
+	for (int attempt = 0; attempt < 10; attempt++, port_id++) {
+		NL_LOG(LOG_DEBUG3, "socket %p, trying to assign port %d", nlp->nl_socket, port_id);
+		NLCTL_RLOCK();
+		taken = (nl_port_lookup(port_id) != 0);
+		NLCTL_RUNLOCK();
+		if (taken)
+			continue;
+		/* The port may be grabbed between unlock and assign. */
+		error = nl_assign_port(nlp, port_id);
+		if (error != EADDRINUSE)
+			break;
+	}
+	NL_LOG(LOG_DEBUG3, "socket %p, autobind to %d, error: %d", nlp->nl_socket, port_id, error);
+	return (error);
+}
+
+/*
+ * pr_connect handler.  Rejects addresses whose nl_len is not
+ * sizeof(struct sockaddr_nl) with EINVAL.  An unbound socket is first
+ * auto-bound, starting with the pid of the calling process as the port
+ * candidate; a failure of that step is returned to the caller.  On success
+ * the socket is marked connected and 0 is returned.
+ */
+static int
+nl_connect(struct socket *so, struct sockaddr *sa, struct thread *td)
+{
+	struct sockaddr_nl *snl = (struct sockaddr_nl *)sa;
+	struct nlpcb *nlp;
+
+	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+	if (snl->nl_len != sizeof(*snl)) {
+		/* This is the connect path: do not report it as bind(). */
+		NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring connect()", so);
+		return (EINVAL);
+	}
+
+	nlp = sotonlpcb(so);
+	if (!nlp->nl_bound) {
+		int error = nl_autobind_port(nlp, td->td_proc->p_pid);
+		if (error != 0) {
+			NL_LOG(LOG_DEBUG, "socket %p, nl_autobind() failed: %d", so, error);
+			return (error);
+		}
+	}
+	/* XXX: Handle socket flags & multicast */
+	soisconnected(so);
+
+	NL_LOG(LOG_DEBUG2, "socket %p, connect to %u", so, snl->nl_pid);
+
+	return (0);
+}
+
+/*
+ * Epoch callback scheduled by nl_close(): recovers the nlpcb from its
+ * embedded epoch context, destroys its lock and frees the memory.
+ */
+static void
+destroy_nlpcb_epoch(epoch_context_t ctx)
+{
+	struct nlpcb *nlp = __containerof(ctx, struct nlpcb, nl_epoch_ctx);
+
+	NLP_LOCK_DESTROY(nlp);
+	free(nlp, M_PCB);
+}
+
+/*
+ * pr_close handler: tears down the netlink pcb of @so.
+ *
+ * Drains and frees the per-socket taskqueue, unlinks the pcb from the port
+ * list (if bound) and from the pcb list, detaches it from the socket,
+ * frees all nl_bufs still queued on both socket buffers and schedules the
+ * pcb itself for deferred destruction via NET_EPOCH_CALL().
+ */
+static void
+nl_close(struct socket *so)
+{
+ MPASS(sotonlpcb(so) != NULL);
+ struct nlpcb *nlp;
+ struct nl_buf *nb;
+
+ NL_LOG(LOG_DEBUG2, "detaching socket %p, PID %d", so, curproc->p_pid);
+ nlp = sotonlpcb(so);
+
+ /*
+ * Snapshot the bound state under the pcb lock; it decides below
+ * whether the pcb has to be unlinked from the port list.
+ * NOTE(review): nothing here marks the pcb inactive, so new work is
+ * not prevented by this step itself -- confirm that is handled
+ * elsewhere.
+ */
+ NLP_LOCK(nlp);
+ bool was_bound = nlp->nl_bound;
+ NLP_UNLOCK(nlp);
+
+ /* Wait till all scheduled work has been completed */
+ taskqueue_drain_all(nlp->nl_taskqueue);
+ taskqueue_free(nlp->nl_taskqueue);
+
+ NLCTL_WLOCK();
+ NLP_LOCK(nlp);
+ if (was_bound) {
+ CK_LIST_REMOVE(nlp, nl_port_next);
+ NL_LOG(LOG_DEBUG3, "socket %p, unlinking bound pid %u", so, nlp->nl_port);
+ }
+ CK_LIST_REMOVE(nlp, nl_next);
+ nlp->nl_socket = NULL;
+ NLP_UNLOCK(nlp);
+ NLCTL_WUNLOCK();
+
+ so->so_pcb = NULL;
+
+ /* Release whatever is still queued in either direction. */
+ while ((nb = TAILQ_FIRST(&so->so_snd.nl_queue)) != NULL) {
+ TAILQ_REMOVE(&so->so_snd.nl_queue, nb, tailq);
+ nl_buf_free(nb);
+ }
+ while ((nb = TAILQ_FIRST(&so->so_rcv.nl_queue)) != NULL) {
+ TAILQ_REMOVE(&so->so_rcv.nl_queue, nb, tailq);
+ nl_buf_free(nb);
+ }
+
+ NL_LOG(LOG_DEBUG3, "socket %p, detached", so);
+
+ /* XXX: is delayed free needed? */
+ NET_EPOCH_CALL(destroy_nlpcb_epoch, &nlp->nl_epoch_ctx);
+}
+
+/*
+ * pr_disconnect handler.  Netlink sockets cannot be disconnected:
+ * always returns ENOTCONN.
+ */
+static int
+nl_disconnect(struct socket *so)
+{
+ NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+ MPASS(sotonlpcb(so) != NULL);
+ return (ENOTCONN);
+}
+
+/*
+ * pr_sockaddr handler: fills @sa with the local netlink address of @so.
+ * Only nl_len, nl_family and nl_pid (the bound port) are set; the
+ * remaining fields are zero.  Always returns 0.
+ */
+static int
+nl_sockaddr(struct socket *so, struct sockaddr *sa)
+{
+
+ *(struct sockaddr_nl *)sa = (struct sockaddr_nl ){
+ /* TODO: set other fields */
+ .nl_len = sizeof(struct sockaddr_nl),
+ .nl_family = AF_NETLINK,
+ .nl_pid = sotonlpcb(so)->nl_port,
+ };
+
+ return (0);
+}
+
+/*
+ * pr_sosend handler: queues one userland message for processing by the
+ * kernel.
+ *
+ * Only uio-based sends are supported (@m must be NULL).  The request is
+ * rejected with
+ *  - EINVAL     if control data is supplied (it is freed),
+ *  - EOPNOTSUPP if MSG_OOB is set,
+ *  - ENOBUFS    if the data is shorter than a struct nlmsghdr,
+ *  - EMSGSIZE   if the data exceeds the send buffer high watermark.
+ * Otherwise the data is copied into a freshly allocated nl_buf, appended
+ * to the send queue (sleeping for space unless the socket or the call is
+ * non-blocking, in which case EWOULDBLOCK is returned) and the per-socket
+ * taskqueue is scheduled.
+ */
+static int
+nl_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
+ struct mbuf *m, struct mbuf *control, int flags, struct thread *td)
+{
+ struct nlpcb *nlp = sotonlpcb(so);
+ struct sockbuf *sb = &so->so_snd;
+ struct nl_buf *nb;
+ size_t len;
+ int error;
+
+ MPASS(m == NULL && uio != NULL);
+
+ if (__predict_false(control != NULL)) {
+ m_freem(control);
+ return (EINVAL);
+ }
+
+ if (__predict_false(flags & MSG_OOB)) /* XXXGL: or just ignore? */
+ return (EOPNOTSUPP);
+
+ if (__predict_false(uio->uio_resid < sizeof(struct nlmsghdr)))
+ return (ENOBUFS); /* XXXGL: any better error? */
+
+ if (__predict_false(uio->uio_resid > sb->sb_hiwat))
+ return (EMSGSIZE);
+
+ error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
+ if (error)
+ return (error);
+
+ /*
+ * The buffer holds the 8-byte aligned message plus scratch space;
+ * sockets of Linux ABI processes get room for the message once more.
+ */
+ len = roundup2(uio->uio_resid, 8) + SCRATCH_BUFFER_SIZE;
+ if (nlp->nl_linux)
+ len += roundup2(uio->uio_resid, 8);
+ nb = nl_buf_alloc(len, M_WAITOK);
+ nb->datalen = uio->uio_resid;
+ error = uiomove(&nb->data[0], uio->uio_resid, uio);
+ if (__predict_false(error))
+ goto out;
+
+ NL_LOG(LOG_DEBUG2, "sending message to kernel %u bytes", nb->datalen);
+
+ SOCK_SENDBUF_LOCK(so);
+restart:
+ if (sb->sb_hiwat - sb->sb_ccc >= nb->datalen) {
+ TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq);
+ sb->sb_acc += nb->datalen;
+ sb->sb_ccc += nb->datalen;
+ /* Ownership moved to the queue; nb == NULL signals success. */
+ nb = NULL;
+ } else if ((so->so_state & SS_NBIO) ||
+ (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
+ SOCK_SENDBUF_UNLOCK(so);
+ error = EWOULDBLOCK;
+ goto out;
+ } else {
+ if ((error = sbwait(so, SO_SND)) != 0) {
+ SOCK_SENDBUF_UNLOCK(so);
+ goto out;
+ } else
+ goto restart;
+ }
+ SOCK_SENDBUF_UNLOCK(so);
+
+ if (nb == NULL) {
+ NL_LOG(LOG_DEBUG3, "success");
+ NLP_LOCK(nlp);
+ nl_schedule_taskqueue(nlp);
+ NLP_UNLOCK(nlp);
+ }
+
+out:
+ SOCK_IO_SEND_UNLOCK(so);
+ /* A buffer that was never queued is still ours to free. */
+ if (nb != NULL) {
+ NL_LOG(LOG_DEBUG3, "failure, error %d", error);
+ nl_buf_free(nb);
+ }
+ return (error);
+}
+
+/*
+ * Create control data for recvmsg(2) on Netlink socket.
+ *
+ * The payload is two netlink attributes, each carrying a uint32_t: the
+ * process id recorded at attach time (NLMSGINFO_ATTR_PROCESS_ID) and the
+ * bound port (NLMSGINFO_ATTR_PORT_ID).  It is wrapped into a control mbuf
+ * of type NETLINK_MSG_INFO at level SOL_NETLINK, allocated with M_WAITOK.
+ */
+static struct mbuf *
+nl_createcontrol(struct nlpcb *nlp)
+{
+ struct {
+ struct nlattr nla;
+ uint32_t val;
+ } data[] = {
+ {
+ .nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t),
+ .nla.nla_type = NLMSGINFO_ATTR_PROCESS_ID,
+ .val = nlp->nl_process_id,
+ },
+ {
+ .nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t),
+ .nla.nla_type = NLMSGINFO_ATTR_PORT_ID,
+ .val = nlp->nl_port,
+ },
+ };
+
+ return (sbcreatecontrol(data, sizeof(data), NETLINK_MSG_INFO,
+ SOL_NETLINK, M_WAITOK));
+}
+
+/*
+ * pr_soreceive handler: copies queued netlink messages to userland.
+ *
+ * Only uio-based receives are supported (@mp must be NULL).  If requested,
+ * *psa is set to a copy of an all-kernel source address (nl_pid 0) and,
+ * when NLF_MSG_INFO is enabled on the socket, *controlp receives the
+ * control data built by nl_createcontrol().  Blocks until at least one
+ * nl_buf is queued unless the socket or the call is non-blocking
+ * (EWOULDBLOCK).  MSG_PEEK leaves the queue untouched; MSG_TRUNC is
+ * reported in *flagsp when a message had to be cut to fit.
+ */
+static int
+nl_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
+ struct mbuf **mp, struct mbuf **controlp, int *flagsp)
+{
+ static const struct sockaddr_nl nl_empty_src = {
+ .nl_len = sizeof(struct sockaddr_nl),
+ .nl_family = PF_NETLINK,
+ .nl_pid = 0 /* comes from the kernel */
+ };
+ struct sockbuf *sb = &so->so_rcv;
+ struct nlpcb *nlp = sotonlpcb(so);
+ struct nl_buf *first, *last, *nb, *next;
+ struct nlmsghdr *hdr;
+ int flags, error;
+ u_int len, overflow, partoff, partlen, msgrcv, datalen;
+ bool nonblock, trunc, peek;
+
+ MPASS(mp == NULL && uio != NULL);
+
+ NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+
+ if (psa != NULL)
+ *psa = sodupsockaddr((const struct sockaddr *)&nl_empty_src,
+ M_WAITOK);
+
+ if (controlp != NULL && (nlp->nl_flags & NLF_MSG_INFO))
+ *controlp = nl_createcontrol(nlp);
+
+ /*
+ * MSG_TRUNC on input is remembered separately in 'trunc' and removed
+ * from 'flags', which collects the flags reported back to the caller.
+ */
+ flags = flagsp != NULL ? *flagsp & ~MSG_TRUNC : 0;
+ trunc = flagsp != NULL ? *flagsp & MSG_TRUNC : false;
+ nonblock = (so->so_state & SS_NBIO) ||
+ (flags & (MSG_DONTWAIT | MSG_NBIO));
+ peek = flags & MSG_PEEK;
+
+ error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
+ if (__predict_false(error))
+ return (error);
+
+ len = 0;
+ overflow = 0;
+ msgrcv = 0;
+ datalen = 0;
+
+ SOCK_RECVBUF_LOCK(so);
+ while ((first = TAILQ_FIRST(&sb->nl_queue)) == NULL) {
+ if (nonblock) {
+ SOCK_RECVBUF_UNLOCK(so);
+ SOCK_IO_RECV_UNLOCK(so);
+ return (EWOULDBLOCK);
+ }
+ error = sbwait(so, SO_RCV);
+ if (error) {
+ SOCK_RECVBUF_UNLOCK(so);
+ SOCK_IO_RECV_UNLOCK(so);
+ return (error);
+ }
+ }
+
+ /*
+ * Netlink socket buffer consists of a queue of nl_bufs, but for the
+ * userland there should be no boundaries. However, there are Netlink
+ * messages, that shouldn't be split. Internal invariant is that a
+ * message never spans two nl_bufs.
+ * If a large userland buffer is provided, we would traverse the queue
+ * until either queue end is reached or the buffer is fulfilled. If
+ * an application provides a buffer that isn't able to fit a single
+ * message, we would truncate it and lose its tail. This is the only
+ * condition where we would lose data. If buffer is able to fit at
+ * least one message, we would return it and won't truncate the next.
+ *
+ * We use same code for normal and MSG_PEEK case. At first queue pass
+ * we scan nl_bufs and count length. In case we can read entire buffer
+ * at one write everything is trivial. In case we can not, we save
+ * pointer to the last (or partial) nl_buf and in the !peek case we
+ * split the queue into two pieces. We can safely drop the queue lock,
+ * as kernel would only append nl_bufs to the end of the queue, and
+ * we are the exclusive owner of queue beginning due to sleepable lock.
+ * At the second pass we copy data out and in !peek case free nl_bufs.
+ */
+ TAILQ_FOREACH(nb, &sb->nl_queue, tailq) {
+ u_int offset;
+
+ MPASS(nb->offset < nb->datalen);
+ offset = nb->offset;
+ while (offset < nb->datalen) {
+ hdr = (struct nlmsghdr *)&nb->data[offset];
+ MPASS(nb->offset + hdr->nlmsg_len <= nb->datalen);
+ if (uio->uio_resid < len + hdr->nlmsg_len) {
+ /* This message does not fit into the user buffer. */
+ overflow = len + hdr->nlmsg_len -
+ uio->uio_resid;
+ partoff = nb->offset;
+ if (offset > partoff) {
+ /* Earlier messages of this nl_buf do fit. */
+ partlen = offset - partoff;
+ if (!peek) {
+ nb->offset = offset;
+ datalen += partlen;
+ }
+ } else if (len == 0 && uio->uio_resid > 0) {
+ /* Not even the first message fits: truncate it. */
+ flags |= MSG_TRUNC;
+ partlen = uio->uio_resid;
+ if (peek)
+ goto nospace;
+ datalen += hdr->nlmsg_len;
+ if (nb->offset + hdr->nlmsg_len ==
+ nb->datalen) {
+ /*
+ * Avoid leaving empty nb.
+ * Process last nb normally.
+ * Trust uiomove() to care
+ * about negative uio_resid.
+ */
+ nb = TAILQ_NEXT(nb, tailq);
+ overflow = 0;
+ partlen = 0;
+ } else
+ nb->offset += hdr->nlmsg_len;
+ msgrcv++;
+ } else
+ partlen = 0;
+ goto nospace;
+ }
+ len += hdr->nlmsg_len;
+ offset += hdr->nlmsg_len;
+ MPASS(offset <= nb->buflen);
+ msgrcv++;
+ }
+ MPASS(offset == nb->datalen);
+ datalen += nb->datalen - nb->offset;
+ }
+nospace:
+ /*
+ * 'last' is the first nl_buf that stays on the queue, or NULL if the
+ * whole queue is consumed.
+ */
+ last = nb;
+ if (!peek) {
+ if (last == NULL)
+ TAILQ_INIT(&sb->nl_queue);
+ else {
+ /* XXXGL: create TAILQ_SPLIT */
+ TAILQ_FIRST(&sb->nl_queue) = last;
+ last->tailq.tqe_prev = &TAILQ_FIRST(&sb->nl_queue);
+ }
+ MPASS(sb->sb_acc >= datalen);
+ sb->sb_acc -= datalen;
+ sb->sb_ccc -= datalen;
+ }
+ SOCK_RECVBUF_UNLOCK(so);
+
+ /* Second pass: copy out every fully consumed nl_buf. */
+ for (nb = first; nb != last; nb = next) {
+ next = TAILQ_NEXT(nb, tailq);
+ if (__predict_true(error == 0))
+ error = uiomove(&nb->data[nb->offset],
+ (int)(nb->datalen - nb->offset), uio);
+ if (!peek)
+ nl_buf_free(nb);
+ }
+ /* Here nb == last; copy the partially consumed part of it, if any. */
+ if (last != NULL && partlen > 0 && __predict_true(error == 0))
+ error = uiomove(&nb->data[partoff], (int)partlen, uio);
+
+ if (trunc && overflow > 0) {
+ uio->uio_resid -= overflow;
+ MPASS(uio->uio_resid < 0);
+ } else
+ MPASS(uio->uio_resid >= 0);
+
+ if (uio->uio_td)
+ uio->uio_td->td_ru.ru_msgrcv += msgrcv;
+
+ if (flagsp != NULL)
+ *flagsp |= flags;
+
+ SOCK_IO_RECV_UNLOCK(so);
+
+ nl_on_transmit(sotonlpcb(so));
+
+ return (error);
+}
+
+/*
+ * Translates a boolean netlink socket option name into the corresponding
+ * NLF_* pcb flag.  Returns 0 for any other option name.
+ */
+static int
+nl_getoptflag(int sopt_name)
+{
+	int flag;
+
+	switch (sopt_name) {
+	case NETLINK_CAP_ACK:
+		flag = NLF_CAP_ACK;
+		break;
+	case NETLINK_EXT_ACK:
+		flag = NLF_EXT_ACK;
+		break;
+	case NETLINK_GET_STRICT_CHK:
+		flag = NLF_STRICT;
+		break;
+	case NETLINK_MSG_INFO:
+		flag = NLF_MSG_INFO;
+		break;
+	default:
+		flag = 0;
+		break;
+	}
+
+	return (flag);
+}
+
+/*
+ * pr_ctloutput handler for netlink-level socket options.
+ *
+ * SOPT_SET:
+ *  - NETLINK_ADD_MEMBERSHIP / NETLINK_DROP_MEMBERSHIP: join or leave the
+ *    group given as an int; ERANGE unless 0 < group < NLP_MAX_GROUPS.
+ *  - NETLINK_CAP_ACK, NETLINK_EXT_ACK, NETLINK_GET_STRICT_CHK,
+ *    NETLINK_MSG_INFO: set or clear the matching NLF_* flag depending on
+ *    whether the int value is non-zero.  NETLINK_MSG_INFO is refused with
+ *    EINVAL on sockets of Linux ABI processes.
+ * SOPT_GET:
+ *  - NETLINK_LIST_MEMBERSHIPS: returns the 32-bit compat group mask.
+ *  - the four boolean options above: returns 0 or 1.
+ * Anything else yields ENOPROTOOPT.
+ */
+static int
+nl_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+ struct nlpcb *nlp = sotonlpcb(so);
+ uint32_t flag;
+ int optval, error = 0;
+ NLCTL_TRACKER;
+
+ NL_LOG(LOG_DEBUG2, "%ssockopt(%p, %d)", (sopt->sopt_dir) ? "set" : "get",
+ so, sopt->sopt_name);
+
+ switch (sopt->sopt_dir) {
+ case SOPT_SET:
+ switch (sopt->sopt_name) {
+ case NETLINK_ADD_MEMBERSHIP:
+ case NETLINK_DROP_MEMBERSHIP:
+ error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
+ if (error != 0)
+ break;
+ if (optval <= 0 || optval >= NLP_MAX_GROUPS) {
+ error = ERANGE;
+ break;
+ }
+ NL_LOG(LOG_DEBUG2, "ADD/DEL group %d", (uint32_t)optval);
+
+ NLCTL_WLOCK();
+ if (sopt->sopt_name == NETLINK_ADD_MEMBERSHIP)
+ nlp_join_group(nlp, optval);
+ else
+ nlp_leave_group(nlp, optval);
+ NLCTL_WUNLOCK();
+ break;
+ case NETLINK_CAP_ACK:
+ case NETLINK_EXT_ACK:
+ case NETLINK_GET_STRICT_CHK:
+ case NETLINK_MSG_INFO:
+ error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
+ if (error != 0)
+ break;
+
+ flag = nl_getoptflag(sopt->sopt_name);
+
+ if ((flag == NLF_MSG_INFO) && nlp->nl_linux) {
+ error = EINVAL;
+ break;
+ }
+
+ /* nl_flags is modified under the control write lock. */
+ NLCTL_WLOCK();
+ if (optval != 0)
+ nlp->nl_flags |= flag;
+ else
+ nlp->nl_flags &= ~flag;
+ NLCTL_WUNLOCK();
+ break;
+ default:
+ error = ENOPROTOOPT;
+ }
+ break;
+ case SOPT_GET:
+ switch (sopt->sopt_name) {
+ case NETLINK_LIST_MEMBERSHIPS:
+ NLCTL_RLOCK();
+ optval = nlp_get_groups_compat(nlp);
+ NLCTL_RUNLOCK();
+ error = sooptcopyout(sopt, &optval, sizeof(optval));
+ break;
+ case NETLINK_CAP_ACK:
+ case NETLINK_EXT_ACK:
+ case NETLINK_GET_STRICT_CHK:
+ case NETLINK_MSG_INFO:
+ NLCTL_RLOCK();
+ optval = (nlp->nl_flags & nl_getoptflag(sopt->sopt_name)) != 0;
+ NLCTL_RUNLOCK();
+ error = sooptcopyout(sopt, &optval, sizeof(optval));
+ break;
+ default:
+ error = ENOPROTOOPT;
+ }
+ break;
+ default:
+ error = ENOPROTOOPT;
+ }
+
+ return (error);
+}
+
+/*
+ * Sysctl handler for nl_maxsockbuf.  Reads report the current value;
+ * writes are accepted only if the new value is at least MSIZE + MCLBYTES,
+ * otherwise EINVAL is returned and the limit is left unchanged.
+ */
+static int
+sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS)
+{
+	u_long new_max = nl_maxsockbuf;
+	int error;
+
+	error = sysctl_handle_long(oidp, &new_max, arg2, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	if (new_max < MSIZE + MCLBYTES)
+		return (EINVAL);
+
+	nl_maxsockbuf = new_max;
+	return (0);
+}
+
+/*
+ * pr_setsbopt handler.  Everything except SO_RCVBUF is delegated to the
+ * generic sbsetopt().
+ *
+ * For SO_RCVBUF the receive buffer may be grown beyond the global
+ * sb_max_adj limit, up to nl_maxsockbuf, provided the caller holds
+ * PRIV_NET_ROUTE.
+ * Returns EINVAL for sizes below 1, EPERM when an oversized request is
+ * made without the privilege, ENOBUFS when the reservation fails and 0 on
+ * success.
+ */
+static int
+nl_setsbopt(struct socket *so, struct sockopt *sopt)
+{
+	int error, optval;
+	bool result;
+
+	if (sopt->sopt_name != SO_RCVBUF)
+		return (sbsetopt(so, sopt));
+
+	/* Allow to override max buffer size in certain conditions */
+
+	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
+	if (error != 0)
+		return (error);
+	NL_LOG(LOG_DEBUG2, "socket %p, PID %d, SO_RCVBUF=%d", so, curproc->p_pid, optval);
+
+	/*
+	 * A buffer size below 1 makes no sense.  Reject it here: optval is
+	 * a signed int and would otherwise be converted to a huge unsigned
+	 * value by the comparison and the reservation below.
+	 */
+	if (optval < 1)
+		return (EINVAL);
+
+	if ((u_long)optval > sb_max_adj) {
+		if (priv_check(curthread, PRIV_NET_ROUTE) != 0)
+			return (EPERM);
+	}
+
+	SOCK_RECVBUF_LOCK(so);
+	result = sbreserve_locked_limit(so, SO_RCV, optval, nl_maxsockbuf, curthread);
+	SOCK_RECVBUF_UNLOCK(so);
+
+	return (result ? 0 : ENOBUFS);
+}
+
+/*
+ * Protocol switch fields shared by both netlink socket types: every
+ * handler defined in this file is wired in here.
+ */
+#define NETLINK_PROTOSW \
+ .pr_flags = PR_ATOMIC | PR_ADDR | PR_SOCKBUF, \
+ .pr_ctloutput = nl_ctloutput, \
+ .pr_setsbopt = nl_setsbopt, \
+ .pr_attach = nl_attach, \
+ .pr_bind = nl_bind, \
+ .pr_connect = nl_connect, \
+ .pr_disconnect = nl_disconnect, \
+ .pr_sosend = nl_sosend, \
+ .pr_soreceive = nl_soreceive, \
+ .pr_sockaddr = nl_sockaddr, \
+ .pr_close = nl_close
+
+/* SOCK_RAW and SOCK_DGRAM sockets behave identically. */
+static struct protosw netlink_raw_sw = {
+ .pr_type = SOCK_RAW,
+ NETLINK_PROTOSW
+};
+
+static struct protosw netlink_dgram_sw = {
+ .pr_type = SOCK_DGRAM,
+ NETLINK_PROTOSW
+};
+
+/* The PF_NETLINK domain, exposing the two protocol switches above. */
+static struct domain netlinkdomain = {
+ .dom_family = PF_NETLINK,
+ .dom_name = "netlink",
+ .dom_flags = DOMF_UNLOADABLE,
+ .dom_nprotosw = 2,
+ .dom_protosw = { &netlink_raw_sw, &netlink_dgram_sw },
+};
+
+DOMAIN_SET(netlink);
diff --git a/sys/netlink/netlink_generic.c b/sys/netlink/netlink_generic.c
new file mode 100644
index 000000000000..00f47e60f013
--- /dev/null
+++ b/sys/netlink/netlink_generic.c
@@ -0,0 +1,525 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/ck.h>
+#include <sys/epoch.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/jail.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/priv.h>
+#include <sys/socket.h>
+#include <sys/sx.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_generic.h>
+#include <netlink/netlink_var.h>
+
+#define DEBUG_MOD_NAME nl_generic
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_INFO);
+
+static int nlctrl_handle_getfamily(struct nlmsghdr *, struct nl_pstate *);
+
+/*
+ * Command table of the control family, indexed by command number.
+ * Only CTRL_CMD_GETFAMILY has a handler; the lower slots stay zeroed.
+ */
+static struct genl_cmd nlctrl_cmds[] = {
+ [CTRL_CMD_GETFAMILY] = {
+ .cmd_num = CTRL_CMD_GETFAMILY,
+ .cmd_name = "GETFAMILY",
+ .cmd_cb = nlctrl_handle_getfamily,
+ .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP |
+ GENL_CMD_CAP_HASPOL,
+ },
+};
+
+/*
+ * Table of generic netlink families.  A slot is in use when its
+ * family_name is non-NULL; slot i corresponds to family ID
+ * i + GENL_MIN_ID (see genl_family_id()).  The control family is
+ * statically placed at CTRL_FAMILY_ID.
+ */
+static struct genl_family {
+ const char *family_name;
+ uint16_t family_hdrsize;
+ uint16_t family_version;
+ uint16_t family_attr_max;
+ uint16_t family_cmd_size; /* number of slots in family_cmds[] */
+ uint16_t family_num_groups; /* groups currently owned by the family */
+ struct genl_cmd *family_cmds; /* indexed by command number */
+} families[MAX_FAMILIES] = {
+ [CTRL_FAMILY_ID] = {
+ .family_name = CTRL_FAMILY_NAME,
+ .family_hdrsize = 0,
+ .family_version = 2,
+ .family_attr_max = CTRL_ATTR_MAX,
+ .family_cmd_size = CTRL_CMD_GETFAMILY + 1,
+ .family_cmds = nlctrl_cmds,
+ .family_num_groups = 1,
+ },
+};
+
+/*
+ * Table of multicast groups.  A slot is free when group_family is NULL;
+ * slot i corresponds to group ID i + MIN_GROUP_NUM.  The control
+ * family's group is statically placed at CTRL_GROUP_ID.
+ */
+static struct genl_group {
+ struct genl_family *group_family; /* owning family, NULL if unused */
+ const char *group_name;
+} groups[MAX_GROUPS] = {
+ [CTRL_GROUP_ID] = {
+ .group_family = &families[CTRL_FAMILY_ID],
+ .group_name = CTRL_GROUP_NAME,
+ },
+};
+
+/*
+ * Translates a generic netlink family ID into its slot in families[].
+ *
+ * Slot 0 holds family GENL_MIN_ID.  The caller must pass the ID of a
+ * registered family; this is only asserted (KASSERT), not handled.
+ */
+static inline struct genl_family *
+genl_family(uint16_t family_id)
+{
+	struct genl_family *gf;
+
+	/*
+	 * Check the lower bound explicitly: family_id - GENL_MIN_ID is
+	 * computed as an int and goes negative for IDs below GENL_MIN_ID
+	 * (e.g. the 0 returned by a failed registration), which an
+	 * upper-bound-only comparison may not reject.  Validate the index
+	 * before forming the pointer.
+	 */
+	KASSERT(family_id >= GENL_MIN_ID &&
+	    family_id - GENL_MIN_ID < MAX_FAMILIES,
+	    ("family %u does not exist", family_id));
+	gf = &families[family_id - GENL_MIN_ID];
+	KASSERT(gf->family_name != NULL,
+	    ("family %u does not exist", family_id));
+	return (gf);
+}
+
+/*
+ * Inverse of genl_family(): derives the family ID from the position of
+ * gf inside families[].  gf must point into that array.
+ */
+static inline uint16_t
+genl_family_id(const struct genl_family *gf)
+{
+	uint16_t slot;
+
+	MPASS(gf >= &families[0] && gf < &families[MAX_FAMILIES]);
+	slot = (uint16_t)(gf - &families[0]);
+	return (slot + GENL_MIN_ID);
+}
+
+/*
+ * Handler called by netlink subsystem when matching netlink message is received.
+ *
+ * Validates the message length, maps nlmsg_type to a registered family,
+ * looks up the command from the generic header, checks the command's
+ * privilege (if any) and invokes its callback.
+ *
+ * Returns EINVAL for a message too short to hold the generic header,
+ * ENOTSUP for an unknown family or command, EPERM when the privilege
+ * check fails, otherwise the callback's return value.
+ */
+static int
+genl_handle_message(struct nlmsghdr *hdr, struct nl_pstate *npt)
+{
+	struct nlpcb *nlp = npt->nlp;
+	struct genl_family *gf;
+	struct genlmsghdr *ghdr;
+	struct genl_cmd *cmd;
+	uint16_t family_id;
+
+	if (__predict_false(hdr->nlmsg_len < sizeof(struct nlmsghdr) +
+	    GENL_HDRLEN)) {
+		NLP_LOG(LOG_DEBUG, nlp, "invalid message size: %d",
+		    hdr->nlmsg_len);
+		return (EINVAL);
+	}
+
+	/*
+	 * family_id is a uint16_t, so message types below GENL_MIN_ID wrap
+	 * to large values and are rejected by the range check.  The table
+	 * is only indexed once the index is known to be in range.
+	 */
+	family_id = hdr->nlmsg_type - GENL_MIN_ID;
+	if (__predict_false(family_id >= MAX_FAMILIES ||
+	    families[family_id].family_name == NULL)) {
+		NLP_LOG(LOG_DEBUG, nlp, "invalid message type: %d",
+		    hdr->nlmsg_type);
+		return (ENOTSUP);
+	}
+	gf = &families[family_id];
+
+	/* The generic header immediately follows the netlink header. */
+	ghdr = (struct genlmsghdr *)(hdr + 1);
+	if (ghdr->cmd >= gf->family_cmd_size ||
+	    gf->family_cmds[ghdr->cmd].cmd_cb == NULL) {
+		NLP_LOG(LOG_DEBUG, nlp, "family %s: invalid cmd %d",
+		    gf->family_name, ghdr->cmd);
+		return (ENOTSUP);
+	}
+	cmd = &gf->family_cmds[ghdr->cmd];
+
+	/* cmd_priv == 0 means the command needs no privilege. */
+	if (cmd->cmd_priv != 0 && !nlp_has_priv(nlp, cmd->cmd_priv)) {
+		NLP_LOG(LOG_DEBUG, nlp, "family %s: cmd %d priv_check() failed",
+		    gf->family_name, ghdr->cmd);
+		return (EPERM);
+	}
+
+	NLP_LOG(LOG_DEBUG2, nlp, "received family %s cmd %s(%d) len %d",
+	    gf->family_name, cmd->cmd_name, ghdr->cmd, hdr->nlmsg_len);
+
+	return (cmd->cmd_cb(hdr, npt));
+}
+
+/*
+ * Returns the flags advertised for a command: its cmd_flags, with
+ * GENL_ADMIN_PERM added when the command has a privilege requirement.
+ */
+static uint32_t
+get_cmd_flags(const struct genl_cmd *cmd)
+{
+	const uint32_t base = cmd->cmd_flags;
+
+	return (cmd->cmd_priv != 0 ? (base | GENL_ADMIN_PERM) : base);
+}
+
+/*
+ * Writes one message describing family gf into writer nw, as a reply
+ * to hdr.  The generic header's cmd is copied from ghdr.
+ *
+ * The message carries the family name/ID/version/header size/max attr,
+ * a nested CTRL_ATTR_OPS list of commands that have a callback, and a
+ * nested CTRL_ATTR_MCAST_GROUPS list of the groups owned by gf.
+ *
+ * Returns 0 on success; on any failure the message is aborted and
+ * ENOMEM is returned.
+ */
+static int
+dump_family(struct nlmsghdr *hdr, struct genlmsghdr *ghdr,
+ const struct genl_family *gf, struct nl_writer *nw)
+{
+ if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr)))
+ goto enomem;
+
+ struct genlmsghdr *ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr);
+ ghdr_new->cmd = ghdr->cmd;
+ ghdr_new->version = gf->family_version;
+ ghdr_new->reserved = 0;
+
+ /*
+ * Return values of the nlattr_add_*() calls are not checked here;
+ * NOTE(review): presumably a failed add is reported by nlmsg_end() - confirm.
+ */
+ nlattr_add_string(nw, CTRL_ATTR_FAMILY_NAME, gf->family_name);
+ nlattr_add_u16(nw, CTRL_ATTR_FAMILY_ID, genl_family_id(gf));
+ nlattr_add_u32(nw, CTRL_ATTR_VERSION, gf->family_version);
+ nlattr_add_u32(nw, CTRL_ATTR_HDRSIZE, gf->family_hdrsize);
+ nlattr_add_u32(nw, CTRL_ATTR_MAXATTR, gf->family_attr_max);
+
+ if (gf->family_cmd_size > 0) {
+ /* A zero offset from nlattr_add_nested() is treated as failure. */
+ int off = nlattr_add_nested(nw, CTRL_ATTR_OPS);
+ if (off == 0)
+ goto enomem;
+ for (int i = 0, cnt=0; i < gf->family_cmd_size; i++) {
+ struct genl_cmd *cmd = &gf->family_cmds[i];
+ /* Skip unpopulated slots of the sparse command table. */
+ if (cmd->cmd_cb == NULL)
+ continue;
+ /* Nested entries are numbered 1, 2, ... in emission order. */
+ int cmd_off = nlattr_add_nested(nw, ++cnt);
+ if (cmd_off == 0)
+ goto enomem;
+
+ nlattr_add_u32(nw, CTRL_ATTR_OP_ID, cmd->cmd_num);
+ nlattr_add_u32(nw, CTRL_ATTR_OP_FLAGS, get_cmd_flags(cmd));
+ nlattr_set_len(nw, cmd_off);
+ }
+ nlattr_set_len(nw, off);
+ }
+ if (gf->family_num_groups > 0) {
+ int off = nlattr_add_nested(nw, CTRL_ATTR_MCAST_GROUPS);
+ if (off == 0)
+ goto enomem;
+ /* Scan the whole group table for entries owned by this family. */
+ for (u_int i = 0, cnt = 0; i < MAX_GROUPS; i++) {
+ struct genl_group *gg = &groups[i];
+
+ if (gg->group_family != gf)
+ continue;
+
+ int cmd_off = nlattr_add_nested(nw, ++cnt);
+ if (cmd_off == 0)
+ goto enomem;
+ nlattr_add_u32(nw, CTRL_ATTR_MCAST_GRP_ID, i + MIN_GROUP_NUM);
+ nlattr_add_string(nw, CTRL_ATTR_MCAST_GRP_NAME, gg->group_name);
+ nlattr_set_len(nw, cmd_off);
+ }
+ nlattr_set_len(nw, off);
+ }
+ if (nlmsg_end(nw))
+ return (0);
+enomem:
+ NL_LOG(LOG_DEBUG, "unable to dump family %s state (ENOMEM)", gf->family_name);
+ nlmsg_abort(nw);
+ return (ENOMEM);
+}
+
+/*
+ * Parsed form of a CTRL_CMD_GETFAMILY request: the optional family
+ * name/ID filters taken from attributes and the version taken from
+ * the generic header.
+ */
+struct nl_parsed_family {
+ char *family_name;
+ uint16_t family_id;
+ uint8_t version;
+};
+
+/* _IN(): offset inside the incoming header; _OUT(): offset inside the parsed struct. */
+#define _IN(_field) offsetof(struct genlmsghdr, _field)
+#define _OUT(_field) offsetof(struct nl_parsed_family, _field)
+/* Fixed header fields copied into struct nl_parsed_family. */
+static const struct nlfield_parser nlf_p_generic[] = {
+ { .off_in = _IN(version), .off_out = _OUT(version), .cb = nlf_get_u8 },
+};
+
+/* Attributes recognized in the request. */
+static struct nlattr_parser nla_p_generic[] = {
+ { .type = CTRL_ATTR_FAMILY_ID , .off = _OUT(family_id), .cb = nlattr_get_uint16 },
+ { .type = CTRL_ATTR_FAMILY_NAME , .off = _OUT(family_name), .cb = nlattr_get_string },
+};
+#undef _IN
+#undef _OUT
+NL_DECLARE_PARSER(genl_parser, struct genlmsghdr, nlf_p_generic, nla_p_generic);
+
+/*
+ * CTRL_CMD_GETFAMILY handler.
+ *
+ * When the request carries a family ID and/or name, replies with the
+ * first registered family matching every supplied filter, or returns
+ * ENOENT.  Without filters, marks the reply NLM_F_MULTI and dumps all
+ * registered families, then finalizes the dump.
+ *
+ * Replies use CTRL_CMD_NEWFAMILY as the generic command.
+ */
+static int
+nlctrl_handle_getfamily(struct nlmsghdr *hdr, struct nl_pstate *npt)
+{
+	struct nl_parsed_family attrs = {};
+	struct genlmsghdr ghdr = {
+		.cmd = CTRL_CMD_NEWFAMILY,
+	};
+	int error;
+
+	error = nl_parse_nlmsg(hdr, &genl_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	const bool by_id = (attrs.family_id != 0);
+	const bool by_name = (attrs.family_name != NULL);
+
+	if (by_id || by_name) {
+		/* Filtered request: answer with the first match only. */
+		for (u_int i = 0; i < MAX_FAMILIES; i++) {
+			struct genl_family *gf = &families[i];
+
+			if (gf->family_name == NULL)
+				continue;
+			if (by_id && attrs.family_id != genl_family_id(gf))
+				continue;
+			if (by_name &&
+			    strcmp(attrs.family_name, gf->family_name) != 0)
+				continue;
+			return (dump_family(hdr, &ghdr, gf, npt->nw));
+		}
+		return (ENOENT);
+	}
+
+	/* Unfiltered request: dump every family, stopping on first error. */
+	hdr->nlmsg_flags |= NLM_F_MULTI;
+	for (u_int i = 0; error == 0 && i < MAX_FAMILIES; i++) {
+		if (families[i].family_name == NULL)
+			continue;
+		error = dump_family(hdr, &ghdr, &families[i], npt->nw);
+	}
+
+	if (!nlmsg_end_dump(npt->nw, error, hdr)) {
+		NL_LOG(LOG_DEBUG, "Unable to finalize the dump");
+		return (ENOMEM);
+	}
+
+	return (error);
+}
+
+/*
+ * genl_family_event handler: sends a description of the family to the
+ * control group (CTRL_GROUP_ID), using cmd as the generic command.
+ * Failure to set up the group writer is logged and otherwise ignored;
+ * the dump_family() result is not checked.
+ */
+static void
+nlctrl_notify(void *arg __unused, const char *family_name __unused,
+    uint16_t family_id, u_int cmd)
+{
+	struct nl_writer nw;
+	struct genlmsghdr ghdr = { .cmd = cmd };
+	struct nlmsghdr hdr = {.nlmsg_type = NETLINK_GENERIC };
+	struct genl_family *gf = genl_family(family_id);
+
+	if (nl_writer_group(&nw, NLMSG_SMALL, NETLINK_GENERIC, CTRL_GROUP_ID,
+	    0, false)) {
+		dump_family(&hdr, &ghdr, gf, &nw);
+		nlmsg_flush(&nw);
+		return;
+	}
+
+	NL_LOG(LOG_DEBUG, "error allocating group writer");
+}
+
+/* Parsers checked by NL_VERIFY_PARSERS() at load time. */
+static const struct nlhdr_parser *all_parsers[] = { &genl_parser };
+/* Tag of the genl_family_event handler, kept for deregistration. */
+static eventhandler_tag family_event_tag;
+
+/*
+ * SYSINIT hook: verifies the parsers, hooks nlctrl_notify() to
+ * genl_family_event and registers the NETLINK_GENERIC message handler.
+ */
+static void
+genl_load_all(void *u __unused)
+{
+ NL_VERIFY_PARSERS(all_parsers);
+ family_event_tag = EVENTHANDLER_REGISTER(genl_family_event,
+ nlctrl_notify, NULL, EVENTHANDLER_PRI_ANY);
+ netlink_register_proto(NETLINK_GENERIC, "NETLINK_GENERIC",
+ genl_handle_message);
+}
+SYSINIT(genl_load_all, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, genl_load_all, NULL);
+
+/*
+ * SYSUNINIT hook: undoes genl_load_all() by unregistering the protocol
+ * handler and the event handler, then calls NET_EPOCH_WAIT().
+ */
+static void
+genl_unload(void *u __unused)
+{
+ netlink_unregister_proto(NETLINK_GENERIC);
+ EVENTHANDLER_DEREGISTER(genl_family_event, family_event_tag);
+ NET_EPOCH_WAIT();
+}
+SYSUNINIT(genl_unload, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, genl_unload, NULL);
+
+/*
+ * Public KPI for NETLINK_GENERIC families/groups registration logic below.
+ */
+
+/*
+ * sx lock taken (exclusively) by the registration functions below
+ * around their updates of families[] and groups[].
+ */
+static struct sx sx_lock;
+SX_SYSINIT(genl_lock, &sx_lock, "genetlink lock");
+#define GENL_LOCK() sx_xlock(&sx_lock)
+#define GENL_UNLOCK() sx_xunlock(&sx_lock)
+#define GENL_ASSERT_LOCKED() sx_assert(&sx_lock, SA_LOCKED)
+#define GENL_ASSERT_XLOCKED() sx_assert(&sx_lock, SA_XLOCKED)
+
+/*
+ * Registers a new generic netlink family.
+ *
+ * family_name is stored by pointer (not copied), so it must stay valid
+ * while the family is registered.  hdrsize, family_version and
+ * max_attr_idx are recorded in the family entry.
+ *
+ * Returns the allocated family ID, or 0 if a family with the same name
+ * already exists or no free slot is left.  On success a
+ * CTRL_CMD_NEWFAMILY genl_family_event is raised after the lock has
+ * been dropped.
+ */
+uint16_t
+genl_register_family(const char *family_name, size_t hdrsize,
+    uint16_t family_version, uint16_t max_attr_idx)
+{
+	struct genl_family *gf = NULL;
+	uint16_t family_id;
+
+	MPASS(family_name != NULL);
+
+	GENL_LOCK();
+	for (u_int i = 0; i < MAX_FAMILIES; i++) {
+		if (families[i].family_name != NULL &&
+		    strcmp(families[i].family_name, family_name) == 0) {
+			/* Duplicate name: release the lock before failing. */
+			GENL_UNLOCK();
+			return (0);
+		}
+	}
+
+	/* Microoptimization: index 0 is reserved for the control family. */
+	for (u_int i = 1; i < MAX_FAMILIES; i++) {
+		if (families[i].family_name == NULL) {
+			gf = &families[i];
+			break;
+		}
+	}
+	KASSERT(gf, ("%s: maximum of %u generic netlink families allocated",
+	    __func__, MAX_FAMILIES));
+	if (gf == NULL) {
+		/* Table full and assertions compiled out: fail gracefully. */
+		GENL_UNLOCK();
+		return (0);
+	}
+
+	*gf = (struct genl_family) {
+	    .family_name = family_name,
+	    .family_version = family_version,
+	    .family_hdrsize = hdrsize,
+	    .family_attr_max = max_attr_idx,
+	};
+	family_id = genl_family_id(gf);
+	GENL_UNLOCK();
+
+	/*
+	 * Use the caller's name pointer rather than re-reading the table
+	 * entry, which is no longer protected once the lock is released.
+	 */
+	NL_LOG(LOG_DEBUG2, "Registered family %s id %d", family_name,
+	    family_id);
+	EVENTHANDLER_INVOKE(genl_family_event, family_name, family_id,
+	    CTRL_CMD_NEWFAMILY);
+
+	return (family_id);
+}
+
+/*
+ * Unregisters a family: raises a CTRL_CMD_DELFAMILY genl_family_event,
+ * releases every group slot owned by the family, frees its command
+ * table and zeroes the family slot.  All of this runs under GENL_LOCK.
+ */
+void
+genl_unregister_family(uint16_t family_id)
+{
+	struct genl_family *gf;
+
+	GENL_LOCK();
+	gf = genl_family(family_id);
+
+	/* Notify while the entry is still populated. */
+	EVENTHANDLER_INVOKE(genl_family_event, gf->family_name,
+	    family_id, CTRL_CMD_DELFAMILY);
+
+	for (u_int idx = 0; idx < MAX_GROUPS; idx++) {
+		if (groups[idx].group_family != gf ||
+		    groups[idx].group_name == NULL)
+			continue;
+		groups[idx].group_family = NULL;
+		groups[idx].group_name = NULL;
+	}
+
+	if (gf->family_cmds != NULL)
+		free(gf->family_cmds, M_NETLINK);
+	bzero(gf, sizeof(*gf));
+	GENL_UNLOCK();
+}
+
+/*
+ * Adds count commands from cmds[] to the family's command table.
+ *
+ * The table is indexed by cmd_num and is grown (reallocated, zero
+ * filled) when any new cmd_num does not fit.  Each entry is copied by
+ * value.  Every command must have a callback and must not already be
+ * registered; both are asserted only.
+ *
+ * Always returns true.
+ */
+bool
+genl_register_cmds(uint16_t family_id, const struct genl_cmd *cmds,
+    u_int count)
+{
+	struct genl_family *gf;
+	uint16_t cmd_size;
+
+	GENL_LOCK();
+	gf = genl_family(family_id);
+
+	/* Work out how many slots are needed to hold the highest cmd_num. */
+	cmd_size = gf->family_cmd_size;
+	for (u_int i = 0; i < count; i++) {
+		MPASS(cmds[i].cmd_cb != NULL);
+		if (cmds[i].cmd_num >= cmd_size)
+			cmd_size = cmds[i].cmd_num + 1;
+	}
+
+	if (cmd_size > gf->family_cmd_size) {
+		struct genl_cmd *old_cmds = gf->family_cmds;
+		struct genl_cmd *new_cmds;
+
+		/* need to realloc */
+		new_cmds = malloc(cmd_size * sizeof(*new_cmds), M_NETLINK,
+		    M_WAITOK | M_ZERO);
+
+		/*
+		 * On the first registration there is no old table; do not
+		 * hand a NULL source to memcpy(), even for zero bytes.
+		 */
+		if (old_cmds != NULL)
+			memcpy(new_cmds, old_cmds,
+			    gf->family_cmd_size * sizeof(*new_cmds));
+		gf->family_cmds = new_cmds;
+		gf->family_cmd_size = cmd_size;
+		free(old_cmds, M_NETLINK);
+	}
+
+	for (u_int i = 0; i < count; i++) {
+		const struct genl_cmd *cmd = &cmds[i];
+
+		MPASS(gf->family_cmds[cmd->cmd_num].cmd_cb == NULL);
+		gf->family_cmds[cmd->cmd_num] = cmds[i];
+		NL_LOG(LOG_DEBUG2, "Adding cmd %s(%d) to family %s",
+		    cmd->cmd_name, cmd->cmd_num, gf->family_name);
+	}
+	GENL_UNLOCK();
+	return (true);
+}
+
+/*
+ * Registers a multicast group for a family.
+ *
+ * group_name is stored by pointer.  Returns the new group ID
+ * (slot index + MIN_GROUP_NUM), or 0 when the family already owns a
+ * group with that name or no free slot exists.
+ */
+uint32_t
+genl_register_group(uint16_t family_id, const char *group_name)
+{
+	struct genl_family *gf;
+	uint32_t group_id = 0;
+
+	MPASS(group_name != NULL);
+
+	GENL_LOCK();
+	gf = genl_family(family_id);
+
+	/* Refuse a second group of the same name within the family. */
+	for (u_int i = 0; i < MAX_GROUPS; i++) {
+		if (groups[i].group_family != gf)
+			continue;
+		if (strcmp(groups[i].group_name, group_name) == 0) {
+			GENL_UNLOCK();
+			return (0);
+		}
+	}
+
+	/* Microoptimization: index 0 is reserved for the control family */
+	for (u_int i = 1; i < MAX_GROUPS; i++) {
+		if (groups[i].group_family != NULL)
+			continue;
+		gf->family_num_groups++;
+		groups[i].group_family = gf;
+		groups[i].group_name = group_name;
+		group_id = i + MIN_GROUP_NUM;
+		break;
+	}
+	GENL_UNLOCK();
+
+	return (group_id);
+}
+
+/*
+ * Releases a multicast group previously returned by
+ * genl_register_group() for the given family.  nl_clear_group() is
+ * called for the group ID before the table slot is freed.
+ */
+void
+genl_unregister_group(uint16_t family_id, uint32_t group_id)
+{
+	struct genl_family *gf;
+	struct genl_group *gg;
+	uint32_t slot;
+
+	MPASS(group_id > MIN_GROUP_NUM &&
+	    group_id < MIN_GROUP_NUM + MAX_GROUPS);
+
+	nl_clear_group(group_id);
+
+	/* Convert the public ID back into an index of groups[]. */
+	slot = group_id - MIN_GROUP_NUM;
+
+	GENL_LOCK();
+	gf = genl_family(family_id);
+	gg = &groups[slot];
+
+	MPASS(gg->group_family == gf);
+	MPASS(gf->family_num_groups > 0);
+
+	gg->group_name = NULL;
+	gg->group_family = NULL;
+	gf->family_num_groups--;
+	GENL_UNLOCK();
+}
diff --git a/sys/netlink/netlink_generic.h b/sys/netlink/netlink_generic.h
new file mode 100644
index 000000000000..fbd4ae785cbe
--- /dev/null
+++ b/sys/netlink/netlink_generic.h
@@ -0,0 +1,114 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Generic netlink message header and attributes
+ */
+#ifndef _NETLINK_NETLINK_GENERIC_H_
+#define _NETLINK_NETLINK_GENERIC_H_
+
+#include <netlink/netlink.h>
+
+/*
+ * Base header for all of the relevant messages.
+ * GENL_HDRLEN is the size of this header rounded up with NL_ITEM_ALIGN().
+ */
+struct genlmsghdr {
+ uint8_t cmd; /* CTRL_CMD_ */
+ uint8_t version; /* ABI version for the cmd */
+ uint16_t reserved; /* reserved: set to 0 */
+};
+#define GENL_HDRLEN NL_ITEM_ALIGN(sizeof(struct genlmsghdr))
+
+/* Dynamic family number range, inclusive */
+#define GENL_MIN_ID NLMSG_MIN_TYPE
+#define GENL_MAX_ID 1023
+
+/* Pre-defined family numbers: the control family uses the first ID of the range */
+#define GENL_ID_CTRL GENL_MIN_ID
+
+/* Available commands (values of genlmsghdr.cmd for the control family) */
+enum {
+ CTRL_CMD_UNSPEC = 0,
+ CTRL_CMD_NEWFAMILY = 1,
+ CTRL_CMD_DELFAMILY = 2,
+ CTRL_CMD_GETFAMILY = 3, /* lists all (or matching) genetlink families */
+ CTRL_CMD_NEWOPS = 4,
+ CTRL_CMD_DELOPS = 5,
+ CTRL_CMD_GETOPS = 6,
+ CTRL_CMD_NEWMCAST_GRP = 7,
+ CTRL_CMD_DELMCAST_GRP = 8,
+ CTRL_CMD_GETMCAST_GRP = 9,
+ CTRL_CMD_GETPOLICY = 10,
+ __CTRL_CMD_MAX,
+};
+/* Highest valid command number */
+#define CTRL_CMD_MAX (__CTRL_CMD_MAX - 1)
+
+/* Generic attributes */
+enum {
+ CTRL_ATTR_UNSPEC,
+ CTRL_ATTR_FAMILY_ID = 1, /* u16, dynamically-assigned ID */
+ CTRL_ATTR_FAMILY_NAME = 2, /* string, family name */
+ CTRL_ATTR_VERSION = 3, /* u32, command version */
+ CTRL_ATTR_HDRSIZE = 4, /* u32, family header size */
+ CTRL_ATTR_MAXATTR = 5, /* u32, maximum family attr # */
+ CTRL_ATTR_OPS = 6, /* nested, available operations */
+ CTRL_ATTR_MCAST_GROUPS = 7, /* nested, CTRL_ATTR_MCAST_GRP_* entries */
+ CTRL_ATTR_POLICY = 8,
+ CTRL_ATTR_OP_POLICY = 9,
+ CTRL_ATTR_OP = 10,
+ __CTRL_ATTR_MAX,
+};
+/* Highest valid attribute number */
+#define CTRL_ATTR_MAX (__CTRL_ATTR_MAX - 1)
+
+#define GENL_NAMSIZ 16 /* max family name length including \0 */
+
+/* CTRL_ATTR_OPS attributes: contents of each nested operation entry */
+enum {
+ CTRL_ATTR_OP_UNSPEC,
+ CTRL_ATTR_OP_ID = 1, /* u32, operation # */
+ CTRL_ATTR_OP_FLAGS = 2, /* u32, flags-based op description */
+ __CTRL_ATTR_OP_MAX,
+};
+/* Highest valid CTRL_ATTR_OP_* number */
+#define CTRL_ATTR_OP_MAX (__CTRL_ATTR_OP_MAX - 1)
+
+/* CTRL_ATTR_OP_FLAGS values (bit flags, may be OR-ed together) */
+#define GENL_ADMIN_PERM 0x0001 /* Requires elevated permissions */
+#define GENL_CMD_CAP_DO 0x0002 /* Operation is a modification request */
+#define GENL_CMD_CAP_DUMP 0x0004 /* Operation is a get/dump request */
+#define GENL_CMD_CAP_HASPOL 0x0008 /* Operation has a validation policy */
+#define GENL_UNS_ADMIN_PERM 0x0010 /* NOTE(review): meaning not documented here - confirm */
+
+/* CTRL_ATTR_MCAST_GROUPS attributes: contents of each nested group entry */
+enum {
+ CTRL_ATTR_MCAST_GRP_UNSPEC,
+ CTRL_ATTR_MCAST_GRP_NAME, /* string, group name */
+ CTRL_ATTR_MCAST_GRP_ID, /* u32, dynamically-assigned group id */
+ __CTRL_ATTR_MCAST_GRP_MAX,
+};
+/* Highest valid CTRL_ATTR_MCAST_GRP_* number */
+#define CTRL_ATTR_MCAST_GRP_MAX (__CTRL_ATTR_MCAST_GRP_MAX - 1)
+
+
+#endif
+
diff --git a/sys/netlink/netlink_glue.c b/sys/netlink/netlink_glue.c
new file mode 100644
index 000000000000..4b593fd9657b
--- /dev/null
+++ b/sys/netlink/netlink_glue.c
@@ -0,0 +1,292 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/domain.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/proc.h>
+#include <sys/ck.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/priv.h> /* priv_check */
+
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+#include <netlink/route/route_var.h>
+
+/*
+ * Standard bits: built-in the kernel.
+ * Declares the net.netlink and net.netlink.debug sysctl nodes and the
+ * M_NETLINK malloc type used by the netlink code.
+ */
+SYSCTL_NODE(_net, OID_AUTO, netlink, CTLFLAG_RD, 0,
+ "RFC3549 Netlink network state socket family");
+SYSCTL_NODE(_net_netlink, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "Netlink per-subsystem debug levels");
+
+MALLOC_DEFINE(M_NETLINK, "netlink", "Memory used for netlink packets");
+
+/* Netlink-related callbacks needed to glue rtsock, netlink and linuxolator */
+
+/* No-op route event callback used by the default bridge. */
+static void
+ignore_route_event(uint32_t fibnum, const struct rib_cmd_info *rc)
+{
+}
+
+/* No-op interface message callback used by the default bridge. */
+static void
+ignore_ifmsg_event(struct ifnet *ifp, int if_flags_mask)
+{
+}
+
+/* Default bridge whose callbacks do nothing. */
+static struct rtbridge ignore_cb = {
+ .route_f = ignore_route_event,
+ .ifmsg_f = ignore_ifmsg_event,
+};
+
+void *linux_netlink_p = NULL; /* Callback pointer for Linux translator functions */
+/* Both bridge pointers start out at the no-op bridge. */
+struct rtbridge *rtsock_callback_p = &ignore_cb;
+struct rtbridge *netlink_callback_p = &ignore_cb;
+
+
+/*
+ * nlp accessors.
+ * TODO: move to a separate file once the number grows.
+ */
+/*
+ * Returns true when the credentials of the socket behind nlp pass
+ * priv_check_cred() for priv.
+ */
+bool
+nlp_has_priv(struct nlpcb *nlp, int priv)
+{
+	struct ucred *cred = nlp->nl_socket->so_cred;
+
+	return (priv_check_cred(cred, priv) == 0);
+}
+
+/* Returns the credentials stored in the socket behind nlp. */
+struct ucred *
+nlp_get_cred(struct nlpcb *nlp)
+{
+	struct socket *so = nlp->nl_socket;
+
+	return (so->so_cred);
+}
+
+/* Returns the nl_process_id recorded in nlp. */
+uint32_t
+nlp_get_pid(const struct nlpcb *nlp)
+{
+ return (nlp->nl_process_id);
+}
+
+/* Returns the nl_unconstrained_vnet flag recorded in nlp. */
+bool
+nlp_unconstrained_vnet(const struct nlpcb *nlp)
+{
+ return (nlp->nl_unconstrained_vnet);
+}
+
+#ifndef NETLINK
+/*
+ * Stub implementations for the loadable functions.
+ * They are collected in nl_stub and used until nl_set_functions()
+ * installs another function table.
+ */
+
+/* Writer setup stub: returns whatever get_stub_writer() returns. */
+static bool
+nl_writer_unicast_stub(struct nl_writer *nw, size_t size, struct nlpcb *nlp,
+ bool waitok)
+{
+ return (get_stub_writer(nw));
+}
+
+/* Writer setup stub: returns whatever get_stub_writer() returns. */
+static bool
+nl_writer_group_stub(struct nl_writer *nw, size_t size, uint16_t protocol,
+ uint16_t group_id, int priv, bool waitok)
+{
+ return (get_stub_writer(nw));
+}
+
+/* Always reports failure. */
+static bool
+nlmsg_flush_stub(struct nl_writer *nw __unused)
+{
+ return (false);
+}
+
+/* Does nothing. */
+static void
+nlmsg_ignore_limit_stub(struct nl_writer *nw __unused)
+{
+}
+
+/* Always reports failure. */
+static bool
+nlmsg_refill_buffer_stub(struct nl_writer *nw __unused,
+ size_t required_len __unused)
+{
+ return (false);
+}
+
+/* Always reports failure. */
+static bool
+nlmsg_add_stub(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type,
+ uint16_t flags, uint32_t len)
+{
+ return (false);
+}
+
+/* Always reports failure. */
+static bool
+nlmsg_end_stub(struct nl_writer *nw __unused)
+{
+ return (false);
+}
+
+/* Does nothing. */
+static void
+nlmsg_abort_stub(struct nl_writer *nw __unused)
+{
+}
+
+/* Always reports failure. */
+static bool
+nlmsg_end_dump_stub(struct nl_writer *nw, int error, struct nlmsghdr *hdr)
+{
+ return (false);
+}
+
+/* Always fails with ENOTSUP. */
+static int
+nl_modify_ifp_generic_stub(struct ifnet *ifp __unused,
+ struct nl_parsed_link *lattrs __unused, const struct nlattr_bmask *bm __unused,
+ struct nl_pstate *npt __unused)
+{
+ return (ENOTSUP);
+}
+
+/* Does nothing. */
+static void
+nl_store_ifp_cookie_stub(struct nl_pstate *npt __unused, struct ifnet *ifp __unused)
+{
+}
+
+/* Always returns NULL. */
+static struct nlpcb *
+nl_get_thread_nlp_stub(struct thread *td __unused)
+{
+ return (NULL);
+}
+
+/*
+ * Function table made of the stubs above; it is the default target of
+ * the wrapper functions until nl_set_functions() replaces it.
+ */
+static const struct nl_function_wrapper nl_stub = {
+	.nlmsg_add = nlmsg_add_stub,
+	.nlmsg_refill_buffer = nlmsg_refill_buffer_stub,
+	.nlmsg_flush = nlmsg_flush_stub,
+	.nlmsg_end = nlmsg_end_stub,
+	.nlmsg_abort = nlmsg_abort_stub,
+	.nlmsg_ignore_limit = nlmsg_ignore_limit_stub,
+	.nl_writer_unicast = nl_writer_unicast_stub,
+	.nl_writer_group = nl_writer_group_stub,
+	.nlmsg_end_dump = nlmsg_end_dump_stub,
+	.nl_modify_ifp_generic = nl_modify_ifp_generic_stub,
+	.nl_store_ifp_cookie = nl_store_ifp_cookie_stub,
+	.nl_get_thread_nlp = nl_get_thread_nlp_stub,
+};
+
+/*
+ * If the kernel is compiled with netlink as a module,
+ * provide a way to introduce non-stub functions.
+ * _nl is the table every wrapper below dispatches through.
+ */
+static const struct nl_function_wrapper *_nl = &nl_stub;
+
+/*
+ * Installs nl as the active function table; passing NULL switches back
+ * to the stub table.
+ */
+void
+nl_set_functions(const struct nl_function_wrapper *nl)
+{
+	if (nl == NULL)
+		nl = &nl_stub;
+	_nl = nl;
+}
+
+/*
+ * Function wrappers.
+ * Each one forwards its arguments unchanged to the same-named member
+ * of the currently installed table (_nl) and returns its result.
+ */
+bool
+nl_writer_unicast(struct nl_writer *nw, size_t size, struct nlpcb *nlp,
+ bool waitok)
+{
+ return (_nl->nl_writer_unicast(nw, size, nlp, waitok));
+}
+
+bool
+nl_writer_group(struct nl_writer *nw, size_t size, uint16_t protocol,
+ uint16_t group_id, int priv, bool waitok)
+{
+ return (_nl->nl_writer_group(nw, size, protocol, group_id, priv,
+ waitok));
+}
+
+bool
+nlmsg_flush(struct nl_writer *nw)
+{
+ return (_nl->nlmsg_flush(nw));
+}
+
+void nlmsg_ignore_limit(struct nl_writer *nw)
+{
+ _nl->nlmsg_ignore_limit(nw);
+}
+
+bool
+nlmsg_refill_buffer(struct nl_writer *nw, size_t required_len)
+{
+ return (_nl->nlmsg_refill_buffer(nw, required_len));
+}
+
+bool
+nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type,
+ uint16_t flags, uint32_t len)
+{
+ return (_nl->nlmsg_add(nw, portid, seq, type, flags, len));
+}
+
+bool
+nlmsg_end(struct nl_writer *nw)
+{
+ return (_nl->nlmsg_end(nw));
+}
+
+void
+nlmsg_abort(struct nl_writer *nw)
+{
+ _nl->nlmsg_abort(nw);
+}
+
+bool
+nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr)
+{
+ return (_nl->nlmsg_end_dump(nw, error, hdr));
+}
+
+int
+nl_modify_ifp_generic(struct ifnet *ifp, struct nl_parsed_link *lattrs,
+ const struct nlattr_bmask *bm , struct nl_pstate *npt)
+{
+ return (_nl->nl_modify_ifp_generic(ifp, lattrs, bm, npt));
+}
+
+/*
+ * Forwards to the nl_store_ifp_cookie member of the currently
+ * installed function table.  The callee returns nothing, so the call
+ * is a plain statement rather than a returned expression.
+ */
+void
+nl_store_ifp_cookie(struct nl_pstate *npt, struct ifnet *ifp)
+{
+	_nl->nl_store_ifp_cookie(npt, ifp);
+}
+
+/* Forwards to the nl_get_thread_nlp member of the currently installed function table. */
+struct nlpcb *
+nl_get_thread_nlp(struct thread *td)
+{
+ return (_nl->nl_get_thread_nlp(td));
+}
+
+#endif /* !NETLINK */
+
diff --git a/sys/netlink/netlink_io.c b/sys/netlink/netlink_io.c
new file mode 100644
index 000000000000..e7908d6f3a44
--- /dev/null
+++ b/sys/netlink/netlink_io.c
@@ -0,0 +1,369 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/ck.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/syslog.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_linux.h>
+#include <netlink/netlink_var.h>
+
+#define DEBUG_MOD_NAME nl_io
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_INFO);
+
+/*
+ * The logic below provides a p2p interface for receiving and
+ * sending netlink data between the kernel and userland.
+ */
+
+static bool nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp);
+
+/*
+ * Allocates an nl_buf with len bytes of data space following the
+ * header, using the given malloc flags.  The buffer starts out empty
+ * (datalen and offset are 0, buflen is len).
+ *
+ * Returns NULL when the allocation fails.
+ */
+struct nl_buf *
+nl_buf_alloc(size_t len, int mflag)
+{
+	struct nl_buf *nb;
+
+	KASSERT(len > 0 && len <= UINT_MAX, ("%s: invalid length %zu",
+	    __func__, len));
+
+	nb = malloc(sizeof(struct nl_buf) + len, M_NETLINK, mflag);
+	if (__predict_false(nb == NULL))
+		return (NULL);
+
+	nb->buflen = len;
+	nb->offset = 0;
+	nb->datalen = 0;
+
+	return (nb);
+}
+
+/* Releases a buffer obtained from nl_buf_alloc() back to M_NETLINK. */
+void
+nl_buf_free(struct nl_buf *nb)
+{
+
+ free(nb, M_NETLINK);
+}
+
+/*
+ * Enqueues the per-socket task unless it is already marked pending.
+ * nl_task_pending is set here and cleared again by
+ * nl_process_received_one().
+ */
+void
+nl_schedule_taskqueue(struct nlpcb *nlp)
+{
+	if (nlp->nl_task_pending) {
+		NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped");
+		return;
+	}
+
+	nlp->nl_task_pending = true;
+	taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
+	NL_LOG(LOG_DEBUG3, "taskqueue scheduled");
+}
+
+/*
+ * Processes the buffers queued on the socket's send buffer.
+ *
+ * Clears nl_task_pending, gives up immediately if the receive buffer
+ * is already full, then dequeues buffers one at a time and hands each
+ * to nl_process_nbuf() with the send-buffer lock dropped.  A buffer
+ * for which nl_process_nbuf() returns true is accounted out of the
+ * send buffer and freed; otherwise it is put back at the head of the
+ * queue and the loop stops.
+ *
+ * Returns the result of the last nl_process_nbuf() call, or false if
+ * nothing was processed.
+ */
+static bool
+nl_process_received_one(struct nlpcb *nlp)
+{
+ struct socket *so = nlp->nl_socket;
+ struct sockbuf *sb;
+ struct nl_buf *nb;
+ bool reschedule = false;
+
+ NLP_LOCK(nlp);
+ nlp->nl_task_pending = false;
+ NLP_UNLOCK(nlp);
+
+ /*
+ * Do not process queued up requests if there is no space to queue
+ * replies.
+ */
+ sb = &so->so_rcv;
+ SOCK_RECVBUF_LOCK(so);
+ if (sb->sb_hiwat <= sb->sb_ccc) {
+ SOCK_RECVBUF_UNLOCK(so);
+ NL_LOG(LOG_DEBUG3, "socket %p stuck", so);
+ return (false);
+ }
+ SOCK_RECVBUF_UNLOCK(so);
+
+ sb = &so->so_snd;
+ SOCK_SENDBUF_LOCK(so);
+ while ((nb = TAILQ_FIRST(&sb->nl_queue)) != NULL) {
+ TAILQ_REMOVE(&sb->nl_queue, nb, tailq);
+ /* The buffer is processed without the send-buffer lock held. */
+ SOCK_SENDBUF_UNLOCK(so);
+ reschedule = nl_process_nbuf(nb, nlp);
+ SOCK_SENDBUF_LOCK(so);
+ if (reschedule) {
+ sb->sb_acc -= nb->datalen;
+ sb->sb_ccc -= nb->datalen;
+ /* XXXGL: potentially can reduce lock&unlock count. */
+ /* NOTE(review): sowwakeup_locked() apparently releases the lock, hence the re-lock below - confirm. */
+ sowwakeup_locked(so);
+ nl_buf_free(nb);
+ SOCK_SENDBUF_LOCK(so);
+ } else {
+ /* Not consumed: keep it first in line for the next run. */
+ TAILQ_INSERT_HEAD(&sb->nl_queue, nb, tailq);
+ break;
+ }
+ }
+ SOCK_SENDBUF_UNLOCK(so);
+
+ return (reschedule);
+}
+
+/*
+ * Task body: performs the one-time thread setup for nlp if it is
+ * still pending, then keeps calling nl_process_received_one() until
+ * it returns false.
+ */
+static void
+nl_process_received(struct nlpcb *nlp)
+{
+	bool more;
+
+	NL_LOG(LOG_DEBUG3, "taskqueue called");
+
+	if (__predict_false(nlp->nl_need_thread_setup)) {
+		nl_set_thread_nlp(curthread, nlp);
+		NLP_LOCK(nlp);
+		nlp->nl_need_thread_setup = false;
+		NLP_UNLOCK(nlp);
+	}
+
+	do {
+		more = nl_process_received_one(nlp);
+	} while (more);
+}
+
+/*
+ * Called after some data have been read from the socket.
+ *
+ * Under the nlp lock: if drops were recorded (and the socket is still
+ * attached), logs and resets the drop counters; then schedules the
+ * processing task.
+ */
+void
+nl_on_transmit(struct nlpcb *nlp)
+{
+	struct socket *so;
+
+	NLP_LOCK(nlp);
+
+	so = nlp->nl_socket;
+	if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) {
+		const unsigned long lost_bytes = nlp->nl_dropped_bytes;
+		const unsigned long lost_msgs = nlp->nl_dropped_messages;
+		struct sockbuf *sb = &so->so_rcv;
+
+		nlp->nl_dropped_bytes = 0;
+		nlp->nl_dropped_messages = 0;
+
+		NLP_LOG(LOG_DEBUG, nlp,
+		    "socket RX overflowed, %lu messages (%lu bytes) dropped. "
+		    "bytes: [%u/%u]", lost_msgs, lost_bytes,
+		    sb->sb_ccc, sb->sb_hiwat);
+		/* TODO: send netlink message */
+	}
+
+	nl_schedule_taskqueue(nlp);
+	NLP_UNLOCK(nlp);
+}
+
+/*
+ * Taskqueue entry point.  _arg is the nlpcb; processing runs between
+ * CURVNET_SET()/CURVNET_RESTORE() using the socket's so_vnet.
+ * The pending count is not used.
+ */
+void
+nl_taskqueue_handler(void *_arg, int pending)
+{
+ struct nlpcb *nlp = (struct nlpcb *)_arg;
+
+ CURVNET_SET(nlp->nl_socket->so_vnet);
+ nl_process_received(nlp);
+ CURVNET_RESTORE();
+}
+
+/*
+ * Tries to send current data buffer from writer.
+ *
+ * The writer's buffer is detached (nw->buf becomes NULL) in every case:
+ * it is either queued on the socket's receive buffer or freed.
+ *
+ * Returns true on success.
+ * If no queue overruns happened, wakes up socket owner.
+ * Returns false when the Linux translation fails or when the receive
+ * buffer is full (unless nw->ignore_limit is set); in the latter case
+ * the drop counters of nlp are updated.
+ */
+bool
+nl_send(struct nl_writer *nw, struct nlpcb *nlp)
+{
+ struct socket *so = nlp->nl_socket;
+ struct sockbuf *sb = &so->so_rcv;
+ struct nl_buf *nb;
+
+ MPASS(nw->hdr == NULL);
+ MPASS(nw->buf != NULL);
+ MPASS(nw->buf->datalen > 0);
+
+ IF_DEBUG_LEVEL(LOG_DEBUG2) {
+ struct nlmsghdr *hdr = (struct nlmsghdr *)nw->buf->data;
+ NLP_LOG(LOG_DEBUG2, nlp,
+ "TX len %u msgs %u msg type %d first hdrlen %u",
+ nw->buf->datalen, nw->num_messages, hdr->nlmsg_type,
+ hdr->nlmsg_len);
+ }
+
+ /* For Linux sockets, translate the messages first; drop the buffer on failure. */
+ if (nlp->nl_linux && linux_netlink_p != NULL &&
+ __predict_false(!linux_netlink_p->msgs_to_linux(nw, nlp))) {
+ nl_buf_free(nw->buf);
+ nw->buf = NULL;
+ return (false);
+ }
+
+ /* Take ownership of the buffer away from the writer. */
+ nb = nw->buf;
+ nw->buf = NULL;
+
+ SOCK_RECVBUF_LOCK(so);
+ if (!nw->ignore_limit && __predict_false(sb->sb_hiwat <= sb->sb_ccc)) {
+ /* Receive buffer is full: account the drop and discard the data. */
+ SOCK_RECVBUF_UNLOCK(so);
+ NLP_LOCK(nlp);
+ nlp->nl_dropped_bytes += nb->datalen;
+ nlp->nl_dropped_messages += nw->num_messages;
+ NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
+ (unsigned long)nlp->nl_dropped_messages, nw->num_messages,
+ (unsigned long)nlp->nl_dropped_bytes, nb->datalen);
+ NLP_UNLOCK(nlp);
+ nl_buf_free(nb);
+ return (false);
+ } else {
+ bool full;
+
+ TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq);
+ sb->sb_acc += nb->datalen;
+ sb->sb_ccc += nb->datalen;
+ /* Sample fullness while the receive-buffer lock is still held. */
+ full = sb->sb_hiwat <= sb->sb_ccc;
+ /* NOTE(review): no explicit unlock follows, so sorwakeup_locked() presumably drops the lock - confirm. */
+ sorwakeup_locked(so);
+ if (full) {
+ NLP_LOCK(nlp);
+ nlp->nl_tx_blocked = true;
+ NLP_UNLOCK(nlp);
+ }
+ return (true);
+ }
+}
+
+/*
+ * Validates and dispatches a single netlink message.
+ *
+ * hdr points at the message header and remaining_length is the number of
+ * bytes left in the buffer starting at hdr.  Returns EINVAL when nlmsg_len
+ * exceeds remaining_length or is smaller than the header itself; returns 0
+ * otherwise.  Handler failures are not propagated through the return
+ * value: they are reported to the sender via nlmsg_ack().
+ */
+static int
+nl_receive_message(struct nlmsghdr *hdr, int remaining_length,
+ struct nlpcb *nlp, struct nl_pstate *npt)
+{
+ nl_handler_f handler = nl_handlers[nlp->nl_proto].cb;
+ int error = 0;
+
+ NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
+ hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq,
+ hdr->nlmsg_pid);
+
+ if (__predict_false(hdr->nlmsg_len > remaining_length)) {
+ NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d",
+ hdr->nlmsg_len, remaining_length);
+ return (EINVAL);
+ } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) {
+ NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len);
+ return (EINVAL);
+ }
+ /* Stamp each message with sender pid */
+ hdr->nlmsg_pid = nlp->nl_port;
+
+ npt->hdr = hdr;
+
+ /* Only requests with a non-control message type reach the handler. */
+ if (hdr->nlmsg_flags & NLM_F_REQUEST &&
+ hdr->nlmsg_type >= NLMSG_MIN_TYPE) {
+ NL_LOG(LOG_DEBUG2, "handling message with msg type: %d",
+ hdr->nlmsg_type);
+ if (nlp->nl_linux) {
+ MPASS(linux_netlink_p != NULL);
+ /* hdr is passed by reference and may be replaced by the shim. */
+ error = linux_netlink_p->msg_from_linux(nlp->nl_proto,
+ &hdr, npt);
+ if (error)
+ goto ack;
+ }
+ error = handler(hdr, npt);
+ NL_LOG(LOG_DEBUG2, "retcode: %d", error);
+ }
+ack:
+ /*
+ * Acknowledge when the sender asked for it, or on any error other
+ * than EINTR, unless the writer has acks suppressed.
+ */
+ if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) {
+ if (!npt->nw->suppress_ack) {
+ NL_LOG(LOG_DEBUG3, "ack");
+ nlmsg_ack(nlp, error, hdr, npt);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Returns the parser state to its per-message defaults: error reporting
+ * fields and the current header pointer are reset, ack suppression on the
+ * attached writer is lifted and the scratch buffer is wiped.
+ */
+static void
+npt_clear(struct nl_pstate *npt)
+{
+	/* Error reporting state. */
+	npt->error = 0;
+	npt->err_off = 0;
+	npt->err_msg = NULL;
+	npt->cookie = NULL;
+
+	/* Per-message context. */
+	npt->hdr = NULL;
+	npt->nw->suppress_ack = false;
+
+	/* Scratch memory handed out through npt_alloc(). */
+	lb_clear(&npt->lb);
+}
+
+/*
+ * Processes an incoming packet, which can contain multiple netlink messages
+ *
+ * Messages are parsed starting at nb->offset, which is advanced past each
+ * message handled.  Replies go through a unicast writer that ignores the
+ * receive buffer limit.  The unused tail of nb (past the data, rounded up
+ * to 8 bytes) serves as the per-message scratch buffer.
+ *
+ * Returns false when nl_tx_blocked was set (the flag is cleared before
+ * returning); returns true otherwise, including when the reply writer could
+ * not be allocated.
+ */
+static bool
+nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp)
+{
+ struct nl_writer nw;
+ struct nlmsghdr *hdr;
+ int error;
+
+ NL_LOG(LOG_DEBUG3, "RX netlink buf %p on %p", nb, nlp->nl_socket);
+
+ if (!nl_writer_unicast(&nw, NLMSG_SMALL, nlp, false)) {
+ NL_LOG(LOG_DEBUG, "error allocating socket writer");
+ return (true);
+ }
+
+ nlmsg_ignore_limit(&nw);
+
+ struct nl_pstate npt = {
+ .nlp = nlp,
+ .lb.base = &nb->data[roundup2(nb->datalen, 8)],
+ .lb.size = nb->buflen - roundup2(nb->datalen, 8),
+ .nw = &nw,
+ .strict = nlp->nl_flags & NLF_STRICT,
+ };
+
+ /* Continue while at least a full message header remains. */
+ for (; nb->offset + sizeof(struct nlmsghdr) <= nb->datalen;) {
+ hdr = (struct nlmsghdr *)&nb->data[nb->offset];
+ /* Save length prior to calling handler */
+ int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
+ NL_LOG(LOG_DEBUG3, "parsing offset %d/%d",
+ nb->offset, nb->datalen);
+ npt_clear(&npt);
+ error = nl_receive_message(hdr, nb->datalen - nb->offset, nlp,
+ &npt);
+ nb->offset += msglen;
+ /* Stop on a malformed message or once the reply path is blocked. */
+ if (__predict_false(error != 0 || nlp->nl_tx_blocked))
+ break;
+ }
+ NL_LOG(LOG_DEBUG3, "packet parsing done");
+ nlmsg_flush(&nw);
+
+ if (nlp->nl_tx_blocked) {
+ NLP_LOCK(nlp);
+ nlp->nl_tx_blocked = false;
+ NLP_UNLOCK(nlp);
+ return (false);
+ } else
+ return (true);
+}
diff --git a/sys/netlink/netlink_linux.h b/sys/netlink/netlink_linux.h
new file mode 100644
index 000000000000..d4c451d470b2
--- /dev/null
+++ b/sys/netlink/netlink_linux.h
@@ -0,0 +1,53 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NETLINK_LINUX_VAR_H_
+#define _NETLINK_LINUX_VAR_H_
+#ifdef _KERNEL
+
+/*
+ * The file contains headers for the bridge interface between
+ * linux[_common] module and the netlink module
+ */
+struct nlpcb;
+struct nl_pstate;
+struct nl_writer;
+
+/*
+ * Callback applied to a writer's pending buffer before it is queued on a
+ * Linux-ABI socket; a false return makes the sender drop the buffer.
+ */
+typedef bool msgs_to_linux_cb_t(struct nl_writer *nw, struct nlpcb *nlp);
+/*
+ * Callback applied to a request received on a Linux-ABI socket before it
+ * is handed to the protocol handler.  The header is passed by reference;
+ * a non-zero return is treated as an error code for the request.
+ */
+typedef int msg_from_linux_cb_t(int netlink_family, struct nlmsghdr **hdr,
+ struct nl_pstate *npt);
+
+/* Set of translation callbacks published through linux_netlink_p. */
+struct linux_netlink_provider {
+ msgs_to_linux_cb_t *msgs_to_linux;
+ msg_from_linux_cb_t *msg_from_linux;
+
+};
+
+extern struct linux_netlink_provider *linux_netlink_p;
+
+#endif
+#endif
diff --git a/sys/netlink/netlink_message_parser.c b/sys/netlink/netlink_message_parser.c
new file mode 100644
index 000000000000..4c41235efaac
--- /dev/null
+++ b/sys/netlink/netlink_message_parser.c
@@ -0,0 +1,635 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+#include <sys/stdarg.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+
+#include <net/route/route_ctl.h>
+#include <netinet/in.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+#include <netlink/netlink_route.h>
+
+#define DEBUG_MOD_NAME nl_parser
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_INFO);
+
+/*
+ * Formats an error description into the per-message scratch buffer and
+ * records it in npt->err_msg.
+ *
+ * Only the first description is kept: returns false when a message is
+ * already recorded or when scratch space cannot be allocated, true once
+ * the new message has been stored.  Output is limited to NL_MAX_ERROR_BUF
+ * bytes including the terminator.
+ */
+bool
+nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...)
+{
+	va_list args;
+	char *msg;
+
+	if (npt->err_msg != NULL)
+		return (false);
+
+	msg = npt_alloc(npt, NL_MAX_ERROR_BUF);
+	if (msg == NULL)
+		return (false);
+
+	va_start(args, fmt);
+	vsnprintf(msg, NL_MAX_ERROR_BUF, fmt, args);
+	va_end(args);
+	npt->err_msg = msg;
+
+	return (true);
+}
+
+/*
+ * Records the offset (relative to the message header) at which parsing
+ * failed.  The first recorded non-zero offset wins; returns true when off
+ * was stored and false when an offset had already been set.
+ */
+bool
+nlmsg_report_err_offset(struct nl_pstate *npt, uint32_t off)
+{
+	if (npt->err_off == 0) {
+		npt->err_off = off;
+		return (true);
+	}
+	return (false);
+}
+
+/*
+ * Attaches a cookie attribute to the parser state.  The attribute is stored
+ * by pointer, not copied; it must be of type NLMSGERR_ATTR_COOKIE and at
+ * least header-sized (both are asserted).
+ */
+void
+nlmsg_report_cookie(struct nl_pstate *npt, struct nlattr *nla)
+{
+ MPASS(nla->nla_type == NLMSGERR_ATTR_COOKIE);
+ MPASS(nla->nla_len >= sizeof(struct nlattr));
+ npt->cookie = nla;
+}
+
+/*
+ * Builds a NLMSGERR_ATTR_COOKIE attribute carrying a 32-bit value in the
+ * per-message scratch buffer and attaches it to the parser state.
+ *
+ * If the scratch buffer has no room left, no cookie is reported; the
+ * allocation result used to be dereferenced unconditionally.
+ */
+void
+nlmsg_report_cookie_u32(struct nl_pstate *npt, uint32_t val)
+{
+	struct nlattr *nla;
+
+	nla = npt_alloc(npt, sizeof(*nla) + sizeof(uint32_t));
+	if (__predict_false(nla == NULL)) {
+		/* npt_alloc() returns NULL once the scratch space is used up. */
+		NL_LOG(LOG_DEBUG, "unable to allocate cookie attribute");
+		return;
+	}
+
+	nla->nla_type = NLMSGERR_ATTR_COOKIE;
+	nla->nla_len = sizeof(*nla) + sizeof(uint32_t);
+	/* The payload starts right after the attribute header. */
+	memcpy(nla + 1, &val, sizeof(uint32_t));
+	nlmsg_report_cookie(npt, nla);
+}
+
+/*
+ * Binary search for the attribute parser whose type equals key.
+ *
+ * ps must hold pslen entries sorted by ascending type.  Returns the
+ * matching entry or NULL when there is none.  An empty table is handled
+ * explicitly: without the guard, ps[0] and ps[pslen - 1] would be read out
+ * of bounds.
+ */
+static const struct nlattr_parser *
+search_states(const struct nlattr_parser *ps, u_int pslen, int key)
+{
+	int left_i, right_i;
+
+	if (pslen == 0)
+		return (NULL);
+
+	left_i = 0;
+	right_i = pslen - 1;
+
+	/* Quick reject for keys outside the [first, last] type range. */
+	if (key < ps[0].type || key > ps[pslen - 1].type)
+		return (NULL);
+
+	/* Narrow the window until at most two candidates remain. */
+	while (left_i + 1 < right_i) {
+		int mid_i = (left_i + right_i) / 2;
+		if (key < ps[mid_i].type)
+			right_i = mid_i;
+		else if (key > ps[mid_i].type)
+			left_i = mid_i + 1;
+		else
+			return (&ps[mid_i]);
+	}
+	if (ps[left_i].type == key)
+		return (&ps[left_i]);
+	else if (ps[right_i].type == key)
+		return (&ps[right_i]);
+	return (NULL);
+}
+
+/*
+ * Walks len bytes of attributes starting at nla_head and, for every
+ * attribute type present in the parser table ps (pslen entries), invokes
+ * the matching callback with a pointer to the field at offset s->off
+ * inside target.  Attributes without a table entry are ignored.
+ *
+ * Returns 0 on success.  Returns EINVAL for an attribute shorter than its
+ * own header, or the first non-zero callback result; in both cases the
+ * offset of the offending attribute relative to npt->hdr is recorded.
+ */
+int
+nl_parse_attrs_raw(struct nlattr *nla_head, uint16_t len,
+ const struct nlattr_parser *ps, u_int pslen, struct nl_pstate *npt,
+ void *target)
+{
+ const struct nlattr_parser *s;
+ struct nlattr *nla;
+ uint16_t orig_len, off;
+ int error = 0;
+
+ NL_LOG(LOG_DEBUG3, "parse %p remaining_len %d", nla_head, len);
+ orig_len = len;
+ NLA_FOREACH(nla, nla_head, len) {
+ NL_LOG(LOG_DEBUG3, ">> parsing %p attr_type %u len %u (rem %u)",
+ nla, nla->nla_type, nla->nla_len, len);
+ if (nla->nla_len < sizeof(struct nlattr)) {
+ NLMSG_REPORT_ERR_MSG(npt,
+ "Invalid attr %p type %u len: %u",
+ nla, nla->nla_type, nla->nla_len);
+ off = (char *)nla - (char *)npt->hdr;
+ nlmsg_report_err_offset(npt, off);
+ return (EINVAL);
+ }
+
+ /* Flag bits are masked off the type before the table lookup. */
+ s = search_states(ps, pslen, nla->nla_type & NLA_TYPE_MASK);
+ if (s != NULL) {
+ void *ptr;
+
+ ptr = (void *)((char *)target + s->off);
+ error = s->cb(nla, npt, s->arg, ptr);
+ if (error != 0) {
+ off = (char *)nla - (char *)npt->hdr;
+ nlmsg_report_err_offset(npt, off);
+ NL_LOG(LOG_DEBUG3,
+ "parse failed at offset %u", off);
+ return (error);
+ }
+ } else {
+ /* Ignore non-specified attributes */
+ NL_LOG(LOG_DEBUG3, "ignoring attr %u", nla->nla_type);
+ }
+ }
+ /*
+ * Debug aid only: dump what looks like a trailing attribute header.
+ * NOTE(review): this relies on how NLA_FOREACH treats len - confirm
+ * against the macro definition.
+ */
+ if (len >= sizeof(struct nlattr)) {
+ nla = (struct nlattr *)((char *)nla_head + (orig_len - len));
+ NL_LOG(LOG_DEBUG3, " >>> end %p attr_type %u len %u", nla,
+ nla->nla_type, nla->nla_len);
+ }
+ NL_LOG(LOG_DEBUG3, "end parse: %p remaining_len %u", nla, len);
+
+ return (0);
+}
+
+/*
+ * Fills bm with one bit per attribute type found in the len bytes of
+ * attributes starting at nla_head.  The mask is zeroed first.  Scanning
+ * stops at the first attribute shorter than its own header; types that do
+ * not fit into the mask are logged and skipped.
+ */
+void
+nl_get_attrs_bmask_raw(struct nlattr *nla_head, uint32_t len,
+    struct nlattr_bmask *bm)
+{
+	struct nlattr *nla = NULL;
+	uint16_t nla_type;
+
+	BIT_ZERO(NL_ATTR_BMASK_SIZE, bm);
+
+	NLA_FOREACH(nla, nla_head, len) {
+		if (nla->nla_len < sizeof(struct nlattr))
+			return;
+
+		nla_type = nla->nla_type & NLA_TYPE_MASK;
+		if (nla_type >= NL_ATTR_BMASK_SIZE) {
+			NL_LOG(LOG_DEBUG2,
+			    "Skipping type %u in the mask: too short",
+			    nla_type);
+		} else {
+			BIT_SET(NL_ATTR_BMASK_SIZE, nla_type, bm);
+		}
+	}
+}
+
+/*
+ * Tests whether attribute type nla_type is set in the mask bm.
+ * nla_type must be below NL_ATTR_BMASK_SIZE (asserted).
+ */
+bool
+nl_has_attr(const struct nlattr_bmask *bm, uint16_t nla_type)
+{
+ MPASS(nla_type < NL_ATTR_BMASK_SIZE);
+
+ return (BIT_ISSET(NL_ATTR_BMASK_SIZE, nla_type, bm));
+}
+
+/*
+ * Attribute parser for presence-only flags: the attribute must carry no
+ * payload.  On success the uint8_t at target is set to 1; otherwise an
+ * error message is reported and EINVAL returned.
+ */
+int
+nlattr_get_flag(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+    void *target)
+{
+	uint8_t *flag = target;
+
+	if (__predict_true(NLA_DATA_LEN(nla) == 0)) {
+		*flag = 1;
+		return (0);
+	}
+
+	NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not a flag",
+	    nla->nla_type, NLA_DATA_LEN(nla));
+	return (EINVAL);
+}
+
+/*
+ * Builds an AF_INET sockaddr in the per-message scratch buffer from the
+ * IPv4 address bytes at rta_data.  Returns the sockaddr, or NULL with
+ * *perror set to ENOBUFS when scratch space is exhausted.
+ */
+static struct sockaddr *
+parse_rta_ip4(void *rta_data, struct nl_pstate *npt, int *perror)
+{
+	struct sockaddr *sa;
+	struct sockaddr_in *sin;
+
+	sa = npt_alloc_sockaddr(npt, sizeof(struct sockaddr_in));
+	if (__predict_false(sa == NULL)) {
+		*perror = ENOBUFS;
+		return (NULL);
+	}
+
+	sin = (struct sockaddr_in *)sa;
+	sin->sin_family = AF_INET;
+	sin->sin_len = sizeof(struct sockaddr_in);
+	memcpy(&sin->sin_addr, rta_data, sizeof(struct in_addr));
+
+	return (sa);
+}
+
+/*
+ * Builds an AF_INET6 sockaddr in the per-message scratch buffer from the
+ * IPv6 address bytes at rta_data.  Returns the sockaddr, or NULL with
+ * *perror set to ENOBUFS when scratch space is exhausted.
+ */
+static struct sockaddr *
+parse_rta_ip6(void *rta_data, struct nl_pstate *npt, int *perror)
+{
+	struct sockaddr *sa;
+	struct sockaddr_in6 *sin6;
+
+	sa = npt_alloc_sockaddr(npt, sizeof(struct sockaddr_in6));
+	if (__predict_false(sa == NULL)) {
+		*perror = ENOBUFS;
+		return (NULL);
+	}
+
+	sin6 = (struct sockaddr_in6 *)sa;
+	sin6->sin6_family = AF_INET6;
+	sin6->sin6_len = sizeof(struct sockaddr_in6);
+	memcpy(&sin6->sin6_addr, rta_data, sizeof(struct in6_addr));
+
+	return (sa);
+}
+
+/*
+ * Converts an address attribute into a sockaddr, choosing the family from
+ * the payload length: in_addr-sized payloads become AF_INET, in6_addr-sized
+ * ones AF_INET6.  Any other length reports an error, sets *perror to
+ * ENOTSUP and returns NULL.
+ */
+static struct sockaddr *
+parse_rta_ip(struct rtattr *rta, struct nl_pstate *npt, int *perror)
+{
+	void *rta_data = NL_RTA_DATA(rta);
+	int rta_len = NL_RTA_DATA_LEN(rta);
+
+	switch (rta_len) {
+	case sizeof(struct in_addr):
+		return (parse_rta_ip4(rta_data, npt, perror));
+	case sizeof(struct in6_addr):
+		return (parse_rta_ip6(rta_data, npt, perror));
+	default:
+		break;
+	}
+
+	NLMSG_REPORT_ERR_MSG(npt, "unknown IP len: %d for rta type %d",
+	    rta_len, rta->rta_type);
+	*perror = ENOTSUP;
+	return (NULL);
+}
+
+/*
+ * Attribute parser for IPv4/IPv6 addresses.  Stores the sockaddr produced
+ * by parse_rta_ip() (NULL on failure) in the struct sockaddr pointer at
+ * target and returns the error it reported, 0 on success.
+ */
+int
+nlattr_get_ip(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+    void *target)
+{
+	struct sockaddr **psa = target;
+	int error = 0;
+
+	*psa = parse_rta_ip((struct rtattr *)nla, npt, &error);
+
+	return (error);
+}
+
+/*
+ * Converts an RTA_VIA-style attribute (struct rtvia: address family
+ * followed by the raw address) into a sockaddr allocated from the scratch
+ * buffer.
+ *
+ * Returns the sockaddr on success.  On failure returns NULL and sets
+ * *perror: EINVAL for an undersized attribute or truncated address,
+ * ENOTSUP for an address family other than AF_INET/AF_INET6, or whatever
+ * the per-family helper reports.
+ */
+static struct sockaddr *
+parse_rta_via(struct rtattr *rta, struct nl_pstate *npt, int *perror)
+{
+	struct rtvia *via = NL_RTA_DATA(rta);
+	int data_len = NL_RTA_DATA_LEN(rta);
+
+	/*
+	 * The whole comparison belongs inside the branch hint; it used to
+	 * wrap only data_len.  Compare as signed so that a negative length
+	 * cannot pass as a huge unsigned value.
+	 */
+	if (__predict_false(data_len < (int)sizeof(struct rtvia))) {
+		NLMSG_REPORT_ERR_MSG(npt, "undersized RTA_VIA(%d) attr: len %d",
+		    rta->rta_type, data_len);
+		*perror = EINVAL;
+		return (NULL);
+	}
+	/* From here on data_len counts only the address bytes. */
+	data_len -= offsetof(struct rtvia, rtvia_addr);
+
+	switch (via->rtvia_family) {
+	case AF_INET:
+		if (__predict_false(data_len < sizeof(struct in_addr))) {
+			*perror = EINVAL;
+			return (NULL);
+		}
+		return (parse_rta_ip4(via->rtvia_addr, npt, perror));
+	case AF_INET6:
+		if (__predict_false(data_len < sizeof(struct in6_addr))) {
+			*perror = EINVAL;
+			return (NULL);
+		}
+		return (parse_rta_ip6(via->rtvia_addr, npt, perror));
+	default:
+		*perror = ENOTSUP;
+		return (NULL);
+	}
+}
+
+/*
+ * Attribute parser for "via" gateways.  Stores the sockaddr produced by
+ * parse_rta_via() (NULL on failure) in the struct sockaddr pointer at
+ * target and returns the error it reported, 0 on success.
+ */
+int
+nlattr_get_ipvia(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+    void *target)
+{
+	struct sockaddr **psa = target;
+	int error = 0;
+
+	*psa = parse_rta_via((struct rtattr *)nla, npt, &error);
+
+	return (error);
+}
+
+/*
+ * Attribute parser for bool payloads.  Copies the value to the bool at
+ * target; a payload of any other size reports an error and yields EINVAL.
+ */
+int
+nlattr_get_bool(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+    void *target)
+{
+	const bool *src;
+
+	if (__predict_false(NLA_DATA_LEN(nla) != sizeof(bool))) {
+		NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not bool",
+		    nla->nla_type, NLA_DATA_LEN(nla));
+		return (EINVAL);
+	}
+
+	src = (const bool *)NL_RTA_DATA_CONST(nla);
+	*((bool *)target) = *src;
+
+	return (0);
+}
+
+/*
+ * Attribute parser for 8-bit unsigned payloads.  Copies the value to the
+ * uint8_t at target; a payload of any other size reports an error and
+ * yields EINVAL.
+ */
+int
+nlattr_get_uint8(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+    void *target)
+{
+	const uint8_t *src;
+
+	if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint8_t))) {
+		NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint8",
+		    nla->nla_type, NLA_DATA_LEN(nla));
+		return (EINVAL);
+	}
+
+	src = (const uint8_t *)NL_RTA_DATA_CONST(nla);
+	*((uint8_t *)target) = *src;
+
+	return (0);
+}
+
+/*
+ * Attribute parser for 16-bit unsigned payloads.  Copies the value to the
+ * uint16_t at target; a payload of any other size reports an error and
+ * yields EINVAL.
+ */
+int
+nlattr_get_uint16(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+    void *target)
+{
+	const uint16_t *src;
+
+	if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint16_t))) {
+		NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint16",
+		    nla->nla_type, NLA_DATA_LEN(nla));
+		return (EINVAL);
+	}
+
+	src = (const uint16_t *)NL_RTA_DATA_CONST(nla);
+	*((uint16_t *)target) = *src;
+
+	return (0);
+}
+
+/*
+ * Attribute parser for 32-bit unsigned payloads.  Copies the value to the
+ * uint32_t at target; a payload of any other size reports an error and
+ * yields EINVAL.
+ */
+int
+nlattr_get_uint32(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+    void *target)
+{
+	const uint32_t *src;
+
+	if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint32_t))) {
+		NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint32",
+		    nla->nla_type, NLA_DATA_LEN(nla));
+		return (EINVAL);
+	}
+
+	src = (const uint32_t *)NL_RTA_DATA_CONST(nla);
+	*((uint32_t *)target) = *src;
+
+	return (0);
+}
+
+/*
+ * Attribute parser for 64-bit unsigned payloads.  The value is copied
+ * byte-wise to target; a payload of any other size reports an error and
+ * yields EINVAL.
+ */
+int
+nlattr_get_uint64(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+    void *target)
+{
+	if (__predict_true(NLA_DATA_LEN(nla) == sizeof(uint64_t))) {
+		memcpy(target, NL_RTA_DATA_CONST(nla), sizeof(uint64_t));
+		return (0);
+	}
+
+	NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint64",
+	    nla->nla_type, NLA_DATA_LEN(nla));
+	return (EINVAL);
+}
+
+/*
+ * Attribute parser for raw IPv4 addresses.  Copies in_addr_t-sized payload
+ * bytes to target; a payload of any other size reports an error and yields
+ * EINVAL.
+ */
+int
+nlattr_get_in_addr(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+    void *target)
+{
+	if (__predict_true(NLA_DATA_LEN(nla) == sizeof(in_addr_t))) {
+		memcpy(target, NLA_DATA_CONST(nla), sizeof(in_addr_t));
+		return (0);
+	}
+
+	NLMSG_REPORT_ERR_MSG(npt,
+	    "nla type %d size(%u) is not in_addr_t",
+	    nla->nla_type, NLA_DATA_LEN(nla));
+	return (EINVAL);
+}
+
+/*
+ * Attribute parser for raw IPv6 addresses.  Copies struct in6_addr-sized
+ * payload bytes to target; a payload of any other size reports an error
+ * and yields EINVAL.
+ */
+int
+nlattr_get_in6_addr(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+    void *target)
+{
+	if (__predict_true(NLA_DATA_LEN(nla) == sizeof(struct in6_addr))) {
+		memcpy(target, NLA_DATA_CONST(nla), sizeof(struct in6_addr));
+		return (0);
+	}
+
+	NLMSG_REPORT_ERR_MSG(npt,
+	    "nla type %d size(%u) is not struct in6_addr",
+	    nla->nla_type, NLA_DATA_LEN(nla));
+	return (EINVAL);
+}
+
+/*
+ * Shared implementation of the interface-index attribute parsers.
+ *
+ * The payload must be a 32-bit interface index.  When zero_ok is set an
+ * index of 0 stores NULL at target and succeeds without a lookup.
+ * Otherwise the index is resolved with ifnet_byindex() and the ifnet
+ * pointer stored at target.  Returns EINVAL for a wrongly sized payload
+ * and ENOENT when the lookup fails.
+ */
+static int
+nlattr_get_ifp_internal(struct nlattr *nla, struct nl_pstate *npt,
+    void *target, bool zero_ok)
+{
+	struct ifnet **pifp = target;
+	struct ifnet *ifp;
+	u_int ifindex;
+
+	if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint32_t))) {
+		NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint32",
+		    nla->nla_type, NLA_DATA_LEN(nla));
+		return (EINVAL);
+	}
+	ifindex = *((const u_int *)NLA_DATA_CONST(nla));
+
+	if (zero_ok && ifindex == 0) {
+		*pifp = NULL;
+		return (0);
+	}
+
+	NET_EPOCH_ASSERT();
+
+	ifp = ifnet_byindex(ifindex);
+	if (__predict_false(ifp == NULL)) {
+		NLMSG_REPORT_ERR_MSG(npt, "nla type %d: ifindex %u invalid",
+		    nla->nla_type, ifindex);
+		return (ENOENT);
+	}
+	NL_LOG(LOG_DEBUG3, "nla type %d: ifindex %u -> %s", nla->nla_type,
+	    ifindex, if_name(ifp));
+	*pifp = ifp;
+
+	return (0);
+}
+
+/*
+ * Attribute parser resolving an interface index into a struct ifnet
+ * pointer stored at target.  Index 0 gets no special treatment: it is
+ * looked up like any other index.  arg is unused.
+ */
+int
+nlattr_get_ifp(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+ void *target)
+{
+ return (nlattr_get_ifp_internal(nla, npt, target, false));
+}
+
+/*
+ * Same as nlattr_get_ifp(), except that interface index 0 is accepted and
+ * stored as a NULL ifnet pointer.  arg is unused.
+ */
+int
+nlattr_get_ifpz(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+ void *target)
+{
+ return (nlattr_get_ifp_internal(nla, npt, target, true));
+}
+
+/*
+ * Attribute parser copying a string into a fixed-size character array.
+ * arg carries the size of the array at target.  The payload must contain a
+ * terminator and, including it, fit into the array; otherwise an error is
+ * reported and EINVAL returned.
+ */
+int
+nlattr_get_chara(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+    void *target)
+{
+	char *src = (char *)NLA_DATA(nla);
+	int maxlen = NLA_DATA_LEN(nla);
+	int target_size = (size_t)arg;
+	int len = strnlen(src, maxlen);
+
+	if (__predict_false(len >= maxlen) ||
+	    __predict_false(len >= target_size)) {
+		NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not "
+		    "NULL-terminated or longer than %u",
+		    nla->nla_type, maxlen, target_size);
+		return (EINVAL);
+	}
+
+	/* len < target_size, so the copy is terminated and zero-padded. */
+	strncpy((char *)target, src, target_size);
+
+	return (0);
+}
+
+/*
+ * Attribute parser for terminated strings.  Nothing is copied: on success
+ * the char pointer at target is pointed at the payload inside the message.
+ * A payload without a terminator reports an error and yields EINVAL.
+ */
+int
+nlattr_get_string(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+    void *target)
+{
+	char *str = (char *)NLA_DATA(nla);
+	int maxlen = NLA_DATA_LEN(nla);
+
+	if (__predict_false(strnlen(str, maxlen) >= maxlen)) {
+		NLMSG_REPORT_ERR_MSG(npt,
+		    "nla type %d size(%u) is not NULL-terminated",
+		    nla->nla_type, maxlen);
+		return (EINVAL);
+	}
+
+	*((char **)target) = str;
+
+	return (0);
+}
+
+/*
+ * Attribute parser for strings that may lack a terminator.  The payload is
+ * copied into the per-message scratch buffer with a terminator appended,
+ * and the copy's address is stored in the char pointer at target.  Returns
+ * ENOMEM when scratch space is exhausted.
+ */
+int
+nlattr_get_stringn(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+    void *target)
+{
+	int datalen = NLA_DATA_LEN(nla);
+	char *copy;
+
+	copy = npt_alloc(npt, datalen + 1);
+	if (copy == NULL)
+		return (ENOMEM);
+
+	memcpy(copy, NLA_DATA(nla), datalen);
+	copy[datalen] = '\0';
+	*((char **)target) = copy;
+
+	return (0);
+}
+
+/*
+ * Attribute parser for fixed-size opaque data.  arg carries the exact
+ * payload size expected; matching payloads are copied to target, anything
+ * else yields EINVAL.
+ */
+int
+nlattr_get_bytes(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+    void *target)
+{
+	size_t want = (size_t)arg;
+
+	if (NLA_DATA_LEN(nla) == want) {
+		memcpy(target, NLA_DATA(nla), want);
+		return (0);
+	}
+
+	return (EINVAL);
+}
+
+/*
+ * Attribute parser that performs no decoding: the address of the attribute
+ * itself is stored in the struct nlattr pointer at target.  Always
+ * succeeds.
+ */
+int
+nlattr_get_nla(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+    void *target)
+{
+	struct nlattr **pnla = target;
+
+	NL_LOG(LOG_DEBUG3, "STORING %p len %d", nla, nla->nla_len);
+	*pnla = nla;
+
+	return (0);
+}
+
+/*
+ * Attribute parser for nested attributes.  arg is the nlhdr_parser
+ * describing the nested layout; the payload is parsed with it directly
+ * into target, which is expected to point at the start of the output
+ * structure.
+ */
+int
+nlattr_get_nested(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+    void *target)
+{
+	const struct nlhdr_parser *nested = arg;
+	int error;
+
+	error = nl_parse_header(NLA_DATA(nla), NLA_DATA_LEN(nla), nested, npt,
+	    target);
+
+	return (error);
+}
+
+/*
+ * Variant of nlattr_get_nested() for output structures held by pointer:
+ * target is the address of a pointer, and the nested payload is parsed
+ * into the structure that pointer refers to.
+ */
+int
+nlattr_get_nested_ptr(struct nlattr *nla, struct nl_pstate *npt,
+    const void *arg, void *target)
+{
+	const struct nlhdr_parser *nested = arg;
+	void *dst = *(void **)target;
+
+	return (nl_parse_header(NLA_DATA(nla), NLA_DATA_LEN(nla), nested, npt,
+	    dst));
+}
+
+/*
+ * Header field parser: reads an interface index from src, resolves it with
+ * ifnet_byindex() and stores the ifnet pointer at target.  Returns ENOENT
+ * (leaving target untouched) when the lookup fails.
+ */
+int
+nlf_get_ifp(void *src, struct nl_pstate *npt, void *target)
+{
+	struct ifnet **pifp = target;
+	struct ifnet *ifp;
+	u_int ifindex;
+
+	NET_EPOCH_ASSERT();
+
+	ifindex = *((const u_int *)src);
+	ifp = ifnet_byindex(ifindex);
+	if (ifp != NULL) {
+		*pifp = ifp;
+		return (0);
+	}
+
+	NL_LOG(LOG_DEBUG, "ifindex %u invalid", ifindex);
+	return (ENOENT);
+}
+
+/*
+ * Header field parser like nlf_get_ifp(), except that a failed lookup is
+ * an error only for a non-zero index.  For index 0 the lookup result is
+ * stored at target as is.
+ */
+int
+nlf_get_ifpz(void *src, struct nl_pstate *npt, void *target)
+{
+	struct ifnet **pifp = target;
+	struct ifnet *ifp;
+	u_int ifindex;
+
+	NET_EPOCH_ASSERT();
+
+	ifindex = *((const u_int *)src);
+	ifp = ifnet_byindex(ifindex);
+	if (ifp == NULL && ifindex != 0) {
+		NL_LOG(LOG_DEBUG, "ifindex %u invalid", ifindex);
+		return (ENOENT);
+	}
+	*pifp = ifp;
+
+	return (0);
+}
+
+/* Header field parser: copies one uint8_t from src to target. */
+int
+nlf_get_u8(void *src, struct nl_pstate *npt, void *target)
+{
+	*((uint8_t *)target) = *((const uint8_t *)src);
+	return (0);
+}
+
+/*
+ * Header field parser: reads a uint8_t from src and stores it, widened,
+ * into the uint32_t at target.
+ */
+int
+nlf_get_u8_u32(void *src, struct nl_pstate *npt, void *target)
+{
+	const uint8_t *in = src;
+	uint32_t *out = target;
+
+	*out = *in;
+
+	return (0);
+}
+
+/* Header field parser: copies one uint16_t from src to target. */
+int
+nlf_get_u16(void *src, struct nl_pstate *npt, void *target)
+{
+	const uint16_t *in = src;
+	uint16_t *out = target;
+
+	*out = *in;
+
+	return (0);
+}
+
+/* Header field parser: copies one uint32_t from src to target. */
+int
+nlf_get_u32(void *src, struct nl_pstate *npt, void *target)
+{
+	const uint32_t *in = src;
+	uint32_t *out = target;
+
+	*out = *in;
+
+	return (0);
+}
diff --git a/sys/netlink/netlink_message_parser.h b/sys/netlink/netlink_message_parser.h
new file mode 100644
index 000000000000..720317ed74f3
--- /dev/null
+++ b/sys/netlink/netlink_message_parser.h
@@ -0,0 +1,337 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NETLINK_NETLINK_MESSAGE_PARSER_H_
+#define _NETLINK_NETLINK_MESSAGE_PARSER_H_
+
+#ifdef _KERNEL
+
+#include <sys/bitset.h>
+
+/*
+ * It is not meant to be included directly
+ */
+
+/* Parsing state */
+/*
+ * Bump allocator over caller-provided memory: lb_alloc() carves aligned
+ * chunks off the front by advancing offset, lb_clear() wipes the memory
+ * and rewinds.  There is no per-chunk free.
+ */
+struct linear_buffer {
+ char *base; /* Base allocated memory pointer */
+ uint32_t offset; /* Currently used offset */
+ uint32_t size; /* Total buffer size */
+} __aligned(_Alignof(__max_align_t));
+
+/*
+ * Allocates len bytes, rounded up to the maximum fundamental alignment,
+ * from the linear buffer.  Returns a pointer to the chunk, or NULL when
+ * the request does not fit.
+ *
+ * Negative lengths are rejected, and the bounds check is done against the
+ * remaining space: the previous "offset + len > size" form could wrap
+ * around and accept such requests.
+ */
+static inline void *
+lb_alloc(struct linear_buffer *lb, int len)
+{
+	uint32_t alloc_len;
+	void *data;
+
+	if (len < 0)
+		return (NULL);
+	alloc_len = roundup2((uint32_t)len, _Alignof(__max_align_t));
+
+	/* Subtraction is safe once offset <= size has been established. */
+	if (lb->offset > lb->size || alloc_len > lb->size - lb->offset)
+		return (NULL);
+
+	data = (void *)(lb->base + lb->offset);
+	lb->offset += alloc_len;
+	return (data);
+}
+
+/*
+ * Releases every allocation at once: rewinds the buffer to empty and
+ * zeroes its entire backing memory.
+ */
+static inline void
+lb_clear(struct linear_buffer *lb)
+{
+	lb->offset = 0;
+	memset(lb->base, 0, lb->size);
+}
+
+#define NL_MAX_ERROR_BUF 128
+#define SCRATCH_BUFFER_SIZE (1024 + NL_MAX_ERROR_BUF)
+/*
+ * Parser state passed to every field/attribute parser while a message is
+ * being processed.  It carries the scratch allocator, the reply writer and
+ * the error details (message, offset, cookie) collected along the way.
+ */
+struct nl_pstate {
+ struct linear_buffer lb; /* Per-message scratch buffer */
+ struct nlpcb *nlp; /* Originator socket */
+ struct nl_writer *nw; /* Message writer to use */
+ struct nlmsghdr *hdr; /* Current parsed message header */
+ uint32_t err_off; /* error offset from hdr start */
+ int error; /* last operation error */
+ char *err_msg; /* Description of last error */
+ struct nlattr *cookie; /* NLA to return to the userspace */
+ bool strict; /* Strict parsing required */
+};
+
+/*
+ * Allocates len bytes from the parser state's scratch buffer.  Returns
+ * NULL when the buffer cannot satisfy the request, so callers must check
+ * the result.
+ */
+static inline void *
+npt_alloc(struct nl_pstate *npt, int len)
+{
+ return (lb_alloc(&npt->lb, len));
+}
+/* npt_alloc() with the result cast to struct sockaddr *. */
+#define npt_alloc_sockaddr(_npt, _len) \
+ ((struct sockaddr *)(npt_alloc((_npt), (_len))))
+
+/*
+ * Header field parser callback: hdr points at the source field, target at
+ * the destination field.  A non-zero return aborts header parsing and is
+ * passed on to the caller.
+ */
+typedef int parse_field_f(void *hdr, struct nl_pstate *npt, void *target);
+/* Describes how one fixed header field is copied into the target. */
+struct nlfield_parser {
+ uint16_t off_in; /* field offset in the family header */
+ uint16_t off_out; /* field offset in the target structure */
+ parse_field_f *cb; /* parser function to call */
+};
+/* Empty field table for parsers that extract no header fields. */
+static const struct nlfield_parser nlf_p_empty[] = {};
+
+int nlf_get_ifp(void *src, struct nl_pstate *npt, void *target);
+int nlf_get_ifpz(void *src, struct nl_pstate *npt, void *target);
+int nlf_get_u8(void *src, struct nl_pstate *npt, void *target);
+int nlf_get_u16(void *src, struct nl_pstate *npt, void *target);
+int nlf_get_u32(void *src, struct nl_pstate *npt, void *target);
+int nlf_get_u8_u32(void *src, struct nl_pstate *npt, void *target);
+
+struct nlattr_parser;
+/*
+ * Attribute parser callback: attr is the attribute being parsed, arg the
+ * per-entry argument from the parser table, target the destination field.
+ * A non-zero return aborts attribute parsing and is passed on to the
+ * caller.
+ */
+typedef int parse_attr_f(struct nlattr *attr, struct nl_pstate *npt,
+ const void *arg, void *target);
+/*
+ * One parser table entry.  Lookup is a binary search on type, so tables
+ * must be sorted by ascending type.
+ */
+struct nlattr_parser {
+ uint16_t type; /* Attribute type */
+ uint16_t off; /* field offset in the target structure */
+ parse_attr_f *cb; /* parser function to call */
+ const void *arg; /* opaque argument handed to cb */
+};
+
+/*
+ * Optional hooks of a message parser.  Both return false to reject the
+ * message, which makes nl_parse_header() return EINVAL.
+ */
+typedef bool strict_parser_f(void *hdr, struct nl_pstate *npt);
+typedef bool post_parser_f(void *parsed_attrs, struct nl_pstate *npt);
+
+/*
+ * Complete description of how to parse one message or nested attribute
+ * payload: the fixed header size, the header field parsers and the
+ * attribute parsers, plus the optional validation hooks.
+ */
+struct nlhdr_parser {
+ u_int nl_hdr_off; /* aligned netlink header size */
+ u_int out_hdr_off; /* target header size */
+ u_int fp_size; /* number of entries in fp */
+ u_int np_size; /* number of entries in np */
+ const struct nlfield_parser *fp; /* array of header field parsers */
+ const struct nlattr_parser *np; /* array of attribute parsers */
+ strict_parser_f *sp; /* Pre-parse strict validation function */
+ post_parser_f *post_parse; /* run after the attributes parsed cleanly */
+};
+
+/*
+ * Declares a static parser _name for messages with family header type _t,
+ * field parser array _fp and attribute parser array _np.  _sp and _pp are
+ * the optional strict-validation and post-parse hooks.  _fp and _np must
+ * be real arrays, since their sizes are taken with nitems().
+ */
+#define NL_DECLARE_PARSER_EXT(_name, _t, _sp, _fp, _np, _pp) \
+static const struct nlhdr_parser _name = { \
+ .nl_hdr_off = sizeof(_t), \
+ .fp = &((_fp)[0]), \
+ .np = &((_np)[0]), \
+ .fp_size = nitems(_fp), \
+ .np_size = nitems(_np), \
+ .sp = _sp, \
+ .post_parse = _pp, \
+}
+
+/* Parser without validation hooks. */
+#define NL_DECLARE_PARSER(_name, _t, _fp, _np) \
+ NL_DECLARE_PARSER_EXT(_name, _t, NULL, _fp, _np, NULL)
+
+/* Parser with a strict-mode validation hook only. */
+#define NL_DECLARE_STRICT_PARSER(_name, _t, _sp, _fp, _np) \
+ NL_DECLARE_PARSER_EXT(_name, _t, _sp, _fp, _np, NULL)
+
+/* Parser that additionally records the size of the output type _o. */
+#define NL_DECLARE_ARR_PARSER(_name, _t, _o, _fp, _np) \
+static const struct nlhdr_parser _name = { \
+ .nl_hdr_off = sizeof(_t), \
+ .out_hdr_off = sizeof(_o), \
+ .fp = &((_fp)[0]), \
+ .np = &((_np)[0]), \
+ .fp_size = nitems(_fp), \
+ .np_size = nitems(_np), \
+}
+
+/* Attribute-only parser (no fixed header) with a post-parse hook. */
+#define NL_DECLARE_ATTR_PARSER_EXT(_name, _np, _pp) \
+static const struct nlhdr_parser _name = { \
+ .np = &((_np)[0]), \
+ .np_size = nitems(_np), \
+ .post_parse = (_pp) \
+}
+
+/* Attribute-only parser without hooks. */
+#define NL_DECLARE_ATTR_PARSER(_name, _np) \
+ NL_DECLARE_ATTR_PARSER_EXT(_name, _np, NULL)
+
+#define NL_ATTR_BMASK_SIZE 128
+BITSET_DEFINE(nlattr_bmask, NL_ATTR_BMASK_SIZE);
+
+void nl_get_attrs_bmask_raw(struct nlattr *nla_head, uint32_t len,
+ struct nlattr_bmask *bm);
+bool nl_has_attr(const struct nlattr_bmask *bm, uint16_t nla_type);
+
+int nl_parse_attrs_raw(struct nlattr *nla_head, uint16_t len,
+ const struct nlattr_parser *ps, u_int pslen, struct nl_pstate *npt,
+ void *target);
+
+int nlattr_get_flag(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_ip(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_bool(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_uint8(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_uint16(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_uint32(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_uint64(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_in_addr(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_in6_addr(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_ifp(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_ifpz(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_ipvia(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_chara(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_string(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_stringn(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_bytes(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_nla(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_nested(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+int nlattr_get_nested_ptr(struct nlattr *nla, struct nl_pstate *npt,
+ const void *arg, void *target);
+
+bool nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...)
+ __printflike(2, 3);
+
+/*
+ * Records an error description in the parser state and also logs it at
+ * debug level against the originating socket.
+ *
+ * NOTE(review): the expansion is a bare { } block rather than
+ * do { } while (0), so "if (x) NLMSG_REPORT_ERR_MSG(...); else ..." will
+ * not compile.  Use it only as a full statement inside braces.
+ */
+#define NLMSG_REPORT_ERR_MSG(_npt, _fmt, ...) { \
+ nlmsg_report_err_msg(_npt, _fmt, ## __VA_ARGS__); \
+ NLP_LOG(LOG_DEBUG, (_npt)->nlp, _fmt, ## __VA_ARGS__); \
+}
+
+bool nlmsg_report_err_offset(struct nl_pstate *npt, uint32_t off);
+
+void nlmsg_report_cookie(struct nl_pstate *npt, struct nlattr *nla);
+void nlmsg_report_cookie_u32(struct nl_pstate *npt, uint32_t val);
+
+/*
+ * Have it inline so compiler can optimize field accesses into
+ * the list of direct function calls without iteration.
+ *
+ * Parses len bytes at hdr (a family header followed by attributes) into
+ * target according to parser: runs the strict validation hook when strict
+ * parsing is on, then the header field parsers, then the attribute
+ * parsers, and finally the post-parse hook if the attributes parsed
+ * cleanly.  Returns 0 on success or an errno value.
+ */
+static inline int
+nl_parse_header(void *hdr, uint32_t len, const struct nlhdr_parser *parser,
+ struct nl_pstate *npt, void *target)
+{
+ int error;
+
+ if (__predict_false(len < parser->nl_hdr_off)) {
+ void *tmp_hdr;
+
+ if (npt->strict) {
+ nlmsg_report_err_msg(npt,
+ "header too short: expected %d, got %d",
+ parser->nl_hdr_off, len);
+ return (EINVAL);
+ }
+
+ /*
+ * Compatibility with older applications:
+ * pretend there's a full header.
+ */
+ tmp_hdr = npt_alloc(npt, parser->nl_hdr_off);
+ if (tmp_hdr == NULL)
+ return (EINVAL);
+ /* Only the bytes actually supplied by the sender are copied. */
+ memcpy(tmp_hdr, hdr, len);
+ hdr = tmp_hdr;
+ len = parser->nl_hdr_off;
+ }
+
+ if (npt->strict && parser->sp != NULL && !parser->sp(hdr, npt))
+ return (EINVAL);
+
+ /* Extract fields first */
+ for (u_int i = 0; i < parser->fp_size; i++) {
+ const struct nlfield_parser *fp = &parser->fp[i];
+ void *src = (char *)hdr + fp->off_in;
+ void *dst = (char *)target + fp->off_out;
+
+ error = fp->cb(src, npt, dst);
+ if (error != 0)
+ return (error);
+ }
+
+ /* Attributes start right after the fixed family header. */
+ error = nl_parse_attrs_raw(
+ (struct nlattr *)((char *)hdr + parser->nl_hdr_off),
+ len - parser->nl_hdr_off, parser->np, parser->np_size, npt, target);
+
+ if (parser->post_parse != NULL && error == 0) {
+ if (!parser->post_parse(target, npt))
+ return (EINVAL);
+ }
+
+ return (error);
+}
+
+/*
+ * Parses the payload of nla as a plain attribute list using only the
+ * attribute table of parser; the header field parsers and hooks of parser
+ * are not used.
+ */
+static inline int
+nl_parse_nested(struct nlattr *nla, const struct nlhdr_parser *parser,
+ struct nl_pstate *npt, void *target)
+{
+ return (nl_parse_attrs_raw((struct nlattr *)NLA_DATA(nla),
+ NLA_DATA_LEN(nla), parser->np, parser->np_size, npt, target));
+}
+
+/*
+ * Debug-only sanity check (compiled in under INVARIANTS): asserts that the
+ * attribute table of each of the count parsers is strictly ascending by
+ * attribute type, descending into the parsers referenced by nested
+ * attribute entries.
+ */
+static inline void
+nl_verify_parsers(const struct nlhdr_parser **parser, int count)
+{
+#ifdef INVARIANTS
+	for (int i = 0; i < count; i++) {
+		const struct nlhdr_parser *p = parser[i];
+		int attr_type = 0;
+
+		for (int j = 0; j < p->np_size; j++) {
+			const struct nlattr_parser *entry = &p->np[j];
+
+			MPASS(p->np[j].type > attr_type);
+			attr_type = entry->type;
+
+			/* Only nested entries carry a parser in arg. */
+			if (entry->cb != nlattr_get_nested &&
+			    entry->cb != nlattr_get_nested_ptr)
+				continue;
+
+			const struct nlhdr_parser *nested =
+			    (const struct nlhdr_parser *)entry->arg;
+			nl_verify_parsers(&nested, 1);
+		}
+	}
+#endif
+}
+void nl_verify_parsers(const struct nlhdr_parser **parser, int count);
+#define NL_VERIFY_PARSERS(_p) nl_verify_parsers((_p), nitems(_p))
+
+/*
+ * Parses a complete netlink message: everything following the nlmsghdr
+ * (nlmsg_len minus the header size) is handed to nl_parse_header().
+ * NOTE(review): assumes nlmsg_len >= sizeof(*hdr) was checked by the
+ * caller; the subtraction wraps otherwise.
+ */
+static inline int
+nl_parse_nlmsg(struct nlmsghdr *hdr, const struct nlhdr_parser *parser,
+ struct nl_pstate *npt, void *target)
+{
+ return (nl_parse_header(hdr + 1, hdr->nlmsg_len - sizeof(*hdr), parser,
+ npt, target));
+}
+
+/*
+ * Computes the bitmask of attribute types present in a netlink message
+ * whose family header is described by parser.
+ *
+ * A message too short to hold the family header is treated as carrying no
+ * attributes.  The length used to be computed by unchecked subtraction,
+ * which wrapped to a huge value for such messages and let the scan run
+ * past the end of the message.
+ */
+static inline void
+nl_get_attrs_bmask_nlmsg(struct nlmsghdr *hdr,
+    const struct nlhdr_parser *parser, struct nlattr_bmask *bm)
+{
+	size_t hdrs_len = sizeof(*hdr) + parser->nl_hdr_off;
+	uint32_t attrs_len = 0;
+
+	if (hdr->nlmsg_len > hdrs_len)
+		attrs_len = hdr->nlmsg_len - hdrs_len;
+
+	/* With a zero length the callee still zeroes the mask. */
+	nl_get_attrs_bmask_raw(
+	    (struct nlattr *)((char *)(hdr + 1) + parser->nl_hdr_off),
+	    attrs_len, bm);
+}
+
+#endif
+#endif
diff --git a/sys/netlink/netlink_message_writer.c b/sys/netlink/netlink_message_writer.c
new file mode 100644
index 000000000000..8c5b3ec14058
--- /dev/null
+++ b/sys/netlink/netlink_message_writer.c
@@ -0,0 +1,399 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/syslog.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_linux.h>
+#include <netlink/netlink_var.h>
+
+#define DEBUG_MOD_NAME nl_writer
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_INFO);
+
+/*
+ * Allocates the initial @len-byte buffer for writer @nw and resets the
+ * per-buffer writer state (hdr, num_messages, enomem).  @waitok selects
+ * M_WAITOK vs M_NOWAIT; the chosen flag is remembered in nw->malloc_flag.
+ * The writer must not already own a buffer.
+ * Returns false if the allocation fails.
+ */
+static bool
+nlmsg_get_buf(struct nl_writer *nw, size_t len, bool waitok)
+{
+ const int mflag = waitok ? M_WAITOK : M_NOWAIT;
+
+ MPASS(nw->buf == NULL);
+
+ NL_LOG(LOG_DEBUG3, "Setting up nw %p len %zu %s", nw, len,
+ waitok ? "wait" : "nowait");
+
+ nw->buf = nl_buf_alloc(len, mflag);
+ if (__predict_false(nw->buf == NULL))
+ return (false);
+ nw->hdr = NULL;
+ nw->malloc_flag = mflag;
+ nw->num_messages = 0;
+ nw->enomem = false;
+
+ return (true);
+}
+
+/*
+ * Flush callback installed by _nl_writer_unicast(): passes the writer to
+ * nl_send() together with the socket stored in nw->nlp and returns its
+ * result.
+ */
+static bool
+nl_send_one(struct nl_writer *nw)
+{
+
+ return (nl_send(nw, nw->nlp));
+}
+
+/*
+ * Initializes @nw as a writer whose flush callback is nl_send_one() with
+ * @nlp as the target, then allocates a @size-byte buffer for it.
+ * All other writer fields are reset by the compound-literal assignment.
+ * Returns false if the buffer cannot be allocated.
+ */
+bool
+_nl_writer_unicast(struct nl_writer *nw, size_t size, struct nlpcb *nlp,
+ bool waitok)
+{
+ *nw = (struct nl_writer){
+ .nlp = nlp,
+ .cb = nl_send_one,
+ };
+
+ return (nlmsg_get_buf(nw, size, waitok));
+}
+
+/*
+ * Initializes @nw as a writer whose flush callback is nl_send_group(),
+ * recording @protocol, @group_id and @priv in nw->group, then allocates a
+ * @size-byte buffer for it.
+ * All other writer fields are reset by the compound-literal assignment.
+ * Returns false if the buffer cannot be allocated.
+ */
+bool
+_nl_writer_group(struct nl_writer *nw, size_t size, uint16_t protocol,
+ uint16_t group_id, int priv, bool waitok)
+{
+ *nw = (struct nl_writer){
+ .group.proto = protocol,
+ .group.id = group_id,
+ .group.priv = priv,
+ .cb = nl_send_group,
+ };
+
+ return (nlmsg_get_buf(nw, size, waitok));
+}
+
+/*
+ * Sets the ignore_limit flag on @nw (documented in struct nl_writer as
+ * "ignores RCVBUF limit").  The flag is only stored here.
+ */
+void
+_nlmsg_ignore_limit(struct nl_writer *nw)
+{
+ nw->ignore_limit = true;
+}
+
+/*
+ * Hands the completed messages held by @nw to the writer callback.
+ * A message that was started but never finished is dropped first.  If
+ * nothing remains to be sent, the buffer is freed and true is returned;
+ * otherwise the callback's result is returned.
+ */
+bool
+_nlmsg_flush(struct nl_writer *nw)
+{
+	struct nl_buf *nb = nw->buf;
+	bool sent;
+
+	if (__predict_false(nw->hdr != NULL)) {
+		/* Cut the buffer at the start of the unfinished message. */
+		int completed_len = (char *)nw->hdr - nb->data;
+
+		nb->datalen = completed_len;
+		nw->hdr = NULL;
+	}
+
+	if (nb->datalen == 0) {
+		/* Nothing to send: release the storage right away. */
+		MPASS(nw->num_messages == 0);
+		nl_buf_free(nb);
+		nw->buf = NULL;
+		return (true);
+	}
+
+	sent = nw->cb(nw);
+	nw->num_messages = 0;
+
+	if (!sent) {
+		NL_LOG(LOG_DEBUG, "nw %p flush with %p() failed", nw, nw->cb);
+	}
+
+	return (sent);
+}
+
+/*
+ * Flushes previous data and allocates new underlying storage
+ * sufficient for holding at least @required_len bytes in addition to the
+ * currently unfinished message (if any), which is carried over to the start
+ * of the new storage and stays pointed to by nw->hdr.
+ *
+ * Return true on success.  On allocation failure nw->enomem is set, the
+ * existing buffer is left untouched and false is returned; once enomem is
+ * set every further call fails immediately.
+ */
+bool
+_nlmsg_refill_buffer(struct nl_writer *nw, size_t required_len)
+{
+	struct nl_buf *new;
+	size_t completed_len, new_len, last_len;
+
+	MPASS(nw->buf != NULL);
+
+	if (nw->enomem)
+		return (false);
+
+	NL_LOG(LOG_DEBUG3, "no space at offset %u/%u (want %zu), trying to "
+	    "reclaim", nw->buf->datalen, nw->buf->buflen, required_len);
+
+	/* Split the buffer into completed messages and the unfinished tail. */
+	completed_len = (nw->hdr != NULL) ?
+	    (char *)nw->hdr - nw->buf->data : nw->buf->datalen;
+	last_len = nw->buf->datalen - completed_len;
+
+	/* Calculate new buffer size and allocate it. */
+	if (completed_len > 0 && required_len < NLMBUFSIZE) {
+		/* We already ran out of space, use largest effective size. */
+		new_len = max(nw->buf->buflen, NLMBUFSIZE);
+	} else {
+		if (nw->buf->buflen < NLMBUFSIZE)
+			/* XXXGL: does this happen? */
+			new_len = NLMBUFSIZE;
+		else
+			new_len = nw->buf->buflen * 2;
+	}
+	/*
+	 * Callers write @required_len bytes right behind the carried-over
+	 * tail without re-checking the bounds, so the new storage has to
+	 * fit both of them, not just @required_len.
+	 */
+	while (new_len < last_len + required_len)
+		new_len *= 2;
+
+	new = nl_buf_alloc(new_len, nw->malloc_flag | M_ZERO);
+	if (__predict_false(new == NULL)) {
+		nw->enomem = true;
+		NL_LOG(LOG_DEBUG, "getting new buf failed, setting ENOMEM");
+		return (false);
+	}
+
+	/* Copy last (unfinished) header to the new storage. */
+	if (last_len > 0) {
+		memcpy(new->data, nw->hdr, last_len);
+		new->datalen = last_len;
+	}
+
+	NL_LOG(LOG_DEBUG2, "completed: %zu bytes, copied: %zu bytes",
+	    completed_len, last_len);
+
+	if (completed_len > 0) {
+		/* The flush callback is expected to consume the old buffer. */
+		nlmsg_flush(nw);
+		MPASS(nw->buf == NULL);
+	} else
+		nl_buf_free(nw->buf);
+	nw->buf = new;
+	nw->hdr = (last_len > 0) ? (struct nlmsghdr *)new->data : NULL;
+	NL_LOG(LOG_DEBUG2, "switched buffer: used %u/%u bytes",
+	    new->datalen, new->buflen);
+
+	return (true);
+}
+
+/*
+ * Starts a new message in @nw: makes sure there is room for the header plus
+ * @len payload bytes (refilling the buffer if needed), writes the nlmsghdr
+ * fields and advances the write position past the header only.
+ * No other message may be in progress (nw->hdr must be NULL).
+ * Returns false if the buffer could not be refilled.
+ */
+bool
+_nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type,
+ uint16_t flags, uint32_t len)
+{
+ struct nl_buf *nb = nw->buf;
+ struct nlmsghdr *hdr;
+ size_t required_len;
+
+ MPASS(nw->hdr == NULL);
+
+ required_len = NETLINK_ALIGN(len + sizeof(struct nlmsghdr));
+ if (__predict_false(nb->datalen + required_len > nb->buflen)) {
+ if (!nlmsg_refill_buffer(nw, required_len))
+ return (false);
+ /* The refill replaced nw->buf; pick up the new storage. */
+ nb = nw->buf;
+ }
+
+ hdr = (struct nlmsghdr *)(&nb->data[nb->datalen]);
+
+ /* Provisional length; _nlmsg_end() stores the final value. */
+ hdr->nlmsg_len = len;
+ hdr->nlmsg_type = type;
+ hdr->nlmsg_flags = flags;
+ hdr->nlmsg_seq = seq;
+ hdr->nlmsg_pid = portid;
+
+ nw->hdr = hdr;
+ nb->datalen += sizeof(struct nlmsghdr);
+
+ return (true);
+}
+
+/*
+ * Completes the message currently being written: stores its final length
+ * (distance from the header to the current write position) in nlmsg_len,
+ * clears nw->hdr and bumps the message counter.
+ * If an allocation failure was recorded (nw->enomem), the message is
+ * aborted instead and false is returned.
+ */
+bool
+_nlmsg_end(struct nl_writer *nw)
+{
+ struct nl_buf *nb = nw->buf;
+
+ MPASS(nw->hdr != NULL);
+
+ if (nw->enomem) {
+ NL_LOG(LOG_DEBUG, "ENOMEM when dumping message");
+ nlmsg_abort(nw);
+ return (false);
+ }
+
+ nw->hdr->nlmsg_len = nb->data + nb->datalen - (char *)nw->hdr;
+ NL_LOG(LOG_DEBUG2, "wrote msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
+ nw->hdr->nlmsg_len, nw->hdr->nlmsg_type, nw->hdr->nlmsg_flags,
+ nw->hdr->nlmsg_seq, nw->hdr->nlmsg_pid);
+ nw->hdr = NULL;
+ nw->num_messages++;
+ return (true);
+}
+
+/*
+ * Drops the message currently being written, if any, by rewinding the
+ * buffer's write position to the start of that message.
+ */
+void
+_nlmsg_abort(struct nl_writer *nw)
+{
+	if (nw->hdr == NULL)
+		return;
+
+	nw->buf->datalen = (char *)nw->hdr - nw->buf->data;
+	nw->hdr = NULL;
+}
+
+/*
+ * Writes an NLMSG_ERROR reply for request @hdr into the writer npt->nw.
+ * The reply carries @error plus either just the request header ("capped")
+ * or the whole request, followed by optional attributes taken from @npt
+ * (error message, error offset, cookie).
+ * On failure to add or finish the message it is logged and aborted; nothing
+ * is reported to the caller.
+ */
+void
+nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *hdr,
+ struct nl_pstate *npt)
+{
+ struct nlmsgerr *errmsg;
+ int payload_len;
+ uint32_t flags = nlp->nl_flags;
+ struct nl_writer *nw = npt->nw;
+ bool cap_ack;
+
+ payload_len = sizeof(struct nlmsgerr);
+
+ /*
+ * The only case when we send the full message in the
+ * reply is when there is an error and NETLINK_CAP_ACK
+ * is not set.
+ */
+ cap_ack = (error == 0) || (flags & NLF_CAP_ACK);
+ if (!cap_ack)
+ payload_len += hdr->nlmsg_len - sizeof(struct nlmsghdr);
+ payload_len = NETLINK_ALIGN(payload_len);
+
+ uint16_t nl_flags = cap_ack ? NLM_F_CAPPED : 0;
+ /* Extended-ack attributes are advertised only if the socket opted in. */
+ if ((npt->err_msg || npt->err_off) && nlp->nl_flags & NLF_EXT_ACK)
+ nl_flags |= NLM_F_ACK_TLVS;
+
+ NL_LOG(LOG_DEBUG3, "acknowledging message type %d seq %d",
+ hdr->nlmsg_type, hdr->nlmsg_seq);
+
+ if (!nlmsg_add(nw, nlp->nl_port, hdr->nlmsg_seq, NLMSG_ERROR, nl_flags, payload_len))
+ goto enomem;
+
+ /* nlmsg_add() above already made room for payload_len bytes. */
+ errmsg = nlmsg_reserve_data(nw, payload_len, struct nlmsgerr);
+ errmsg->error = error;
+ /* In case of error copy the whole message, else just the header */
+ memcpy(&errmsg->msg, hdr, cap_ack ? sizeof(*hdr) : hdr->nlmsg_len);
+
+ if (npt->err_msg != NULL && nlp->nl_flags & NLF_EXT_ACK)
+ nlattr_add_string(nw, NLMSGERR_ATTR_MSG, npt->err_msg);
+ if (npt->err_off != 0 && nlp->nl_flags & NLF_EXT_ACK)
+ nlattr_add_u32(nw, NLMSGERR_ATTR_OFFS, npt->err_off);
+ if (npt->cookie != NULL)
+ nlattr_add_raw(nw, npt->cookie);
+
+ if (nlmsg_end(nw))
+ return;
+enomem:
+ NLP_LOG(LOG_DEBUG, nlp, "error allocating ack data for message %d seq %u",
+ hdr->nlmsg_type, hdr->nlmsg_seq);
+ nlmsg_abort(nw);
+}
+
+/*
+ * Terminates a dump: appends an NLMSG_DONE message whose payload is the int
+ * @error, addressed with the pid and sequence number of the request @hdr,
+ * and sets nw->suppress_ack.
+ * Returns true only if the DONE message was actually completed; false if it
+ * could not be added or if finishing it failed (e.g. an earlier allocation
+ * failure recorded in the writer).
+ */
+bool
+_nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr)
+{
+	int *perror;
+	bool completed;
+
+	if (!nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, NLMSG_DONE, 0, sizeof(int))) {
+		NL_LOG(LOG_DEBUG, "Error finalizing table dump");
+		return (false);
+	}
+	/* Save operation result */
+	perror = nlmsg_reserve_object(nw, int);
+	/* datalen is unsigned; log it with %u like the rest of the file. */
+	NL_LOG(LOG_DEBUG2, "record error=%d at off %u (%p)", error,
+	    nw->buf->datalen, perror);
+	*perror = error;
+	/* Do not claim success if the message got aborted in nlmsg_end(). */
+	completed = nlmsg_end(nw);
+	nw->suppress_ack = true;
+
+	return (completed);
+}
+
+/*
+ * KPI functions.
+ */
+
+/*
+ * Returns the current write position expressed as a byte offset from the
+ * start of the message being written (nw->hdr).  Reads through nw->hdr, so
+ * a message must be in progress.
+ */
+u_int
+nlattr_save_offset(const struct nl_writer *nw)
+{
+ return (nw->buf->datalen - ((char *)nw->hdr - nw->buf->data));
+}
+
+/*
+ * Reserves @sz bytes (rounded up with NETLINK_ALIGN) at the current write
+ * position, zeroes them and advances the write position.
+ * May refill the buffer, in which case nw->buf (and nw->hdr) change.
+ * Returns a pointer to the reserved area, or NULL if the refill failed.
+ */
+void *
+nlmsg_reserve_data_raw(struct nl_writer *nw, size_t sz)
+{
+ struct nl_buf *nb = nw->buf;
+ void *data;
+
+ sz = NETLINK_ALIGN(sz);
+ if (__predict_false(nb->datalen + sz > nb->buflen)) {
+ if (!nlmsg_refill_buffer(nw, sz))
+ return (NULL);
+ /* The refill replaced nw->buf; pick up the new storage. */
+ nb = nw->buf;
+ }
+
+ data = &nb->data[nb->datalen];
+ bzero(data, sz);
+ nb->datalen += sz;
+
+ return (data);
+}
+
+/*
+ * Appends an attribute of type @attr_type with an @attr_len-byte payload
+ * copied from @data.  nla_len is set to the unpadded size (header plus
+ * payload) while the write position advances by the NLA_ALIGN-ed size.
+ * @data is not read when @attr_len is 0.
+ * Returns false if the buffer had to be refilled and that failed.
+ */
+bool
+nlattr_add(struct nl_writer *nw, uint16_t attr_type, uint16_t attr_len,
+ const void *data)
+{
+ struct nl_buf *nb = nw->buf;
+ struct nlattr *nla;
+ size_t required_len;
+
+ /* nla_len is 16 bits wide and also has to cover the header. */
+ KASSERT(attr_len <= UINT16_MAX - sizeof(struct nlattr),
+ ("%s: invalid attribute length %u", __func__, attr_len));
+
+ required_len = NLA_ALIGN(attr_len + sizeof(struct nlattr));
+ if (__predict_false(nb->datalen + required_len > nb->buflen)) {
+ if (!nlmsg_refill_buffer(nw, required_len))
+ return (false);
+ /* The refill replaced nw->buf; pick up the new storage. */
+ nb = nw->buf;
+ }
+
+ nla = (struct nlattr *)(&nb->data[nb->datalen]);
+
+ nla->nla_len = attr_len + sizeof(struct nlattr);
+ nla->nla_type = attr_type;
+ if (attr_len > 0) {
+ if ((attr_len % 4) != 0) {
+ /*
+ * Zero the last 4 bytes of the aligned attribute before the
+ * payload is copied in, so bytes past the payload stay zero.
+ */
+ /* clear padding bytes */
+ bzero((char *)nla + required_len - 4, 4);
+ }
+ memcpy((nla + 1), data, attr_len);
+ }
+ nb->datalen += required_len;
+ return (true);
+}
+
+#include <netlink/ktest_netlink_message_writer.h>
diff --git a/sys/netlink/netlink_message_writer.h b/sys/netlink/netlink_message_writer.h
new file mode 100644
index 000000000000..83f925e8d93d
--- /dev/null
+++ b/sys/netlink/netlink_message_writer.h
@@ -0,0 +1,312 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NETLINK_NETLINK_MESSAGE_WRITER_H_
+#define _NETLINK_NETLINK_MESSAGE_WRITER_H_
+
+#ifdef _KERNEL
+
+#include <netinet/in.h>
+
+/*
+ * It is not meant to be included directly
+ */
+
+struct nl_buf;
+struct nl_writer;
+typedef bool nl_writer_cb(struct nl_writer *nw);
+
+/*
+ * State of a netlink message writer: the buffer being filled, the message
+ * currently in progress and the callback used to flush completed messages.
+ * The anonymous union holds the flush target: nlp is set by
+ * _nl_writer_unicast(), group by _nl_writer_group().
+ */
+struct nl_writer {
+ struct nl_buf *buf; /* Underlying storage pointer */
+ struct nlmsghdr *hdr; /* Pointer to the currently-filled msg */
+ nl_writer_cb *cb; /* Callback to flush data */
+ union {
+ struct nlpcb *nlp;
+ struct {
+ uint16_t proto;
+ uint16_t id;
+ int priv;
+ } group;
+ };
+ u_int num_messages; /* Number of messages in the buffer */
+ int malloc_flag; /* M_WAITOK or M_NOWAIT */
+ bool ignore_limit; /* If true, ignores RCVBUF limit */
+ bool enomem; /* True if ENOMEM occurred */
+ bool suppress_ack; /* If true, don't send NLMSG_ERR */
+};
+
+#define NLMSG_SMALL 128
+#define NLMSG_LARGE 2048
+
+/* Message and attribute writing */
+#if defined(NETLINK) || defined(NETLINK_MODULE)
+/* Provide optimized calls to the functions inside the same linking unit */
+
+bool _nl_writer_unicast(struct nl_writer *, size_t, struct nlpcb *nlp, bool);
+bool _nl_writer_group(struct nl_writer *, size_t, uint16_t, uint16_t, int,
+ bool);
+bool _nlmsg_flush(struct nl_writer *nw);
+void _nlmsg_ignore_limit(struct nl_writer *nw);
+
+bool _nlmsg_refill_buffer(struct nl_writer *nw, size_t required_len);
+bool _nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq,
+ uint16_t type, uint16_t flags, uint32_t len);
+bool _nlmsg_end(struct nl_writer *nw);
+void _nlmsg_abort(struct nl_writer *nw);
+
+bool _nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr);
+
+
+/*
+ * Inline pass-through wrappers: each one forwards its arguments unchanged
+ * to the underscore-prefixed implementation declared above and returns its
+ * result (if any).
+ */
+
+/* Forwards to _nl_writer_unicast(). */
+static inline bool
+nl_writer_unicast(struct nl_writer *nw, size_t size, struct nlpcb *nlp,
+ bool waitok)
+{
+ return (_nl_writer_unicast(nw, size, nlp, waitok));
+}
+
+/* Forwards to _nl_writer_group(). */
+static inline bool
+nl_writer_group(struct nl_writer *nw, size_t size, uint16_t proto,
+ uint16_t group_id, int priv, bool waitok)
+{
+ return (_nl_writer_group(nw, size, proto, group_id, priv, waitok));
+}
+
+/* Forwards to _nlmsg_flush(). */
+static inline bool
+nlmsg_flush(struct nl_writer *nw)
+{
+ return (_nlmsg_flush(nw));
+}
+
+/* Forwards to _nlmsg_ignore_limit(). */
+static inline void
+nlmsg_ignore_limit(struct nl_writer *nw)
+{
+ _nlmsg_ignore_limit(nw);
+}
+
+/* Forwards to _nlmsg_refill_buffer(). */
+static inline bool
+nlmsg_refill_buffer(struct nl_writer *nw, size_t required_size)
+{
+ return (_nlmsg_refill_buffer(nw, required_size));
+}
+
+/* Forwards to _nlmsg_add(). */
+static inline bool
+nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type,
+ uint16_t flags, uint32_t len)
+{
+ return (_nlmsg_add(nw, portid, seq, type, flags, len));
+}
+
+/* Forwards to _nlmsg_end(). */
+static inline bool
+nlmsg_end(struct nl_writer *nw)
+{
+ return (_nlmsg_end(nw));
+}
+
+/*
+ * Forwards to _nlmsg_abort(): discards the message currently being
+ * composed in @nw, if any.
+ */
+static inline void
+nlmsg_abort(struct nl_writer *nw)
+{
+	/* A void function must not return an expression; just call through. */
+	_nlmsg_abort(nw);
+}
+
+/* Forwards to _nlmsg_end_dump() and returns its result. */
+static inline bool
+nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr)
+{
+ return (_nlmsg_end_dump(nw, error, hdr));
+}
+
+#else
+/* Provide access to the functions via netlink_glue.c */
+
+bool nl_writer_unicast(struct nl_writer *, size_t, struct nlpcb *, bool waitok);
+bool nl_writer_group(struct nl_writer *, size_t, uint16_t, uint16_t, int,
+ bool waitok);
+bool nlmsg_flush(struct nl_writer *nw);
+void nlmsg_ignore_limit(struct nl_writer *nw);
+
+bool nlmsg_refill_buffer(struct nl_writer *nw, size_t required_size);
+bool nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq,
+ uint16_t type, uint16_t flags, uint32_t len);
+bool nlmsg_end(struct nl_writer *nw);
+void nlmsg_abort(struct nl_writer *nw);
+
+bool nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr);
+
+#endif /* defined(NETLINK) || defined(NETLINK_MODULE) */
+
+/*
+ * Starts a new message that reuses the pid, sequence number, type and flags
+ * of @hdr, with room for @payload_len payload bytes.
+ * Returns the nlmsg_add() result.
+ */
+static inline bool
+nlmsg_reply(struct nl_writer *nw, const struct nlmsghdr *hdr, int payload_len)
+{
+ return (nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, hdr->nlmsg_type,
+ hdr->nlmsg_flags, payload_len));
+}
+
+/*
+ * KPI similar to mtodo():
+ * current (uncompleted) header is guaranteed to be contiguous,
+ * but can be reallocated, thus pointers may need to be readjusted.
+ */
+u_int nlattr_save_offset(const struct nl_writer *nw);
+
+/*
+ * Converts a byte offset relative to the current message header (as
+ * produced by nlattr_save_offset()) back into a pointer.
+ */
+static inline void *
+_nlattr_restore_offset(const struct nl_writer *nw, int off)
+{
+ return ((void *)((char *)nw->hdr + off));
+}
+#define nlattr_restore_offset(_ns, _off, _t) ((_t *)_nlattr_restore_offset(_ns, _off))
+
+/*
+ * Sets nla_len of the attribute located at message offset @off to the
+ * number of bytes written since that offset, i.e. up to the current write
+ * position.
+ */
+static inline void
+nlattr_set_len(const struct nl_writer *nw, int off)
+{
+ struct nlattr *nla = nlattr_restore_offset(nw, off, struct nlattr);
+ nla->nla_len = nlattr_save_offset(nw) - off;
+}
+
+void *nlmsg_reserve_data_raw(struct nl_writer *nw, size_t sz);
+#define nlmsg_reserve_object(_ns, _t) ((_t *)nlmsg_reserve_data_raw(_ns, sizeof(_t)))
+#define nlmsg_reserve_data(_ns, _sz, _t) ((_t *)nlmsg_reserve_data_raw(_ns, _sz))
+
+/*
+ * Reserves an attribute header of type @nla_type at the current write
+ * position and returns its offset within the message, or 0 if the space
+ * could not be reserved.  nla_len is not filled in here.
+ * NOTE(review): presumably the caller passes the returned offset to
+ * nlattr_set_len() once the nested payload is written - confirm at callers.
+ */
+static inline int
+nlattr_add_nested(struct nl_writer *nw, uint16_t nla_type)
+{
+ /* Take the offset first: reserving may move the message to a new buffer. */
+ int off = nlattr_save_offset(nw);
+ struct nlattr *nla = nlmsg_reserve_data(nw, sizeof(struct nlattr), struct nlattr);
+ if (__predict_false(nla == NULL))
+ return (0);
+ nla->nla_type = nla_type;
+ return (off);
+}
+
+/*
+ * Reserves a zeroed attribute of type @nla_type with @sz payload bytes and
+ * returns a pointer to the payload, or NULL if the space could not be
+ * reserved.  nla_len is set to @sz plus the attribute header size.
+ */
+static inline void *
+_nlmsg_reserve_attr(struct nl_writer *nw, uint16_t nla_type, uint16_t sz)
+{
+ sz += sizeof(struct nlattr);
+
+ struct nlattr *nla = nlmsg_reserve_data(nw, sz, struct nlattr);
+ if (__predict_false(nla == NULL))
+ return (NULL);
+ nla->nla_type = nla_type;
+ nla->nla_len = sz;
+
+ return ((void *)(nla + 1));
+}
+#define nlmsg_reserve_attr(_ns, _at, _t) ((_t *)_nlmsg_reserve_attr(_ns, _at, NLA_ALIGN(sizeof(_t))))
+
+bool nlattr_add(struct nl_writer *nw, uint16_t attr_type, uint16_t attr_len,
+ const void *data);
+
+/*
+ * Appends a copy of the existing attribute @nla_src: same type, payload
+ * taken from right after its header, payload length derived from nla_len.
+ * Returns the nlattr_add() result.
+ */
+static inline bool
+nlattr_add_raw(struct nl_writer *nw, const struct nlattr *nla_src)
+{
+ /* nla_len includes the header, so it can never be smaller than it. */
+ MPASS(nla_src->nla_len >= sizeof(struct nlattr));
+
+ return (nlattr_add(nw, nla_src->nla_type,
+ nla_src->nla_len - sizeof(struct nlattr),
+ (const void *)(nla_src + 1)));
+}
+
+/*
+ * Typed convenience wrappers around nlattr_add().  Each one appends an
+ * attribute of type @attrtype whose payload is the in-memory representation
+ * of the given value (sizeof the value's type) and returns the nlattr_add()
+ * result.
+ */
+
+/* Payload: one bool. */
+static inline bool
+nlattr_add_bool(struct nl_writer *nw, uint16_t attrtype, bool value)
+{
+ return (nlattr_add(nw, attrtype, sizeof(bool), &value));
+}
+
+/* Payload: one uint8_t. */
+static inline bool
+nlattr_add_u8(struct nl_writer *nw, uint16_t attrtype, uint8_t value)
+{
+ return (nlattr_add(nw, attrtype, sizeof(uint8_t), &value));
+}
+
+/* Payload: one uint16_t. */
+static inline bool
+nlattr_add_u16(struct nl_writer *nw, uint16_t attrtype, uint16_t value)
+{
+ return (nlattr_add(nw, attrtype, sizeof(uint16_t), &value));
+}
+
+/* Payload: one uint32_t. */
+static inline bool
+nlattr_add_u32(struct nl_writer *nw, uint16_t attrtype, uint32_t value)
+{
+ return (nlattr_add(nw, attrtype, sizeof(uint32_t), &value));
+}
+
+/* Payload: one uint64_t. */
+static inline bool
+nlattr_add_u64(struct nl_writer *nw, uint16_t attrtype, uint64_t value)
+{
+ return (nlattr_add(nw, attrtype, sizeof(uint64_t), &value));
+}
+
+/* Payload: one int8_t. */
+static inline bool
+nlattr_add_s8(struct nl_writer *nw, uint16_t attrtype, int8_t value)
+{
+ return (nlattr_add(nw, attrtype, sizeof(int8_t), &value));
+}
+
+/* Payload: one int16_t. */
+static inline bool
+nlattr_add_s16(struct nl_writer *nw, uint16_t attrtype, int16_t value)
+{
+ return (nlattr_add(nw, attrtype, sizeof(int16_t), &value));
+}
+
+/* Payload: one int32_t. */
+static inline bool
+nlattr_add_s32(struct nl_writer *nw, uint16_t attrtype, int32_t value)
+{
+ return (nlattr_add(nw, attrtype, sizeof(int32_t), &value));
+}
+
+/* Payload: one int64_t. */
+static inline bool
+nlattr_add_s64(struct nl_writer *nw, uint16_t attrtype, int64_t value)
+{
+ return (nlattr_add(nw, attrtype, sizeof(int64_t), &value));
+}
+
+/* No payload: the attribute consists of its header only. */
+static inline bool
+nlattr_add_flag(struct nl_writer *nw, uint16_t attrtype)
+{
+ return (nlattr_add(nw, attrtype, 0, NULL));
+}
+
+/* Payload: the bytes of @str including its terminating NUL. */
+static inline bool
+nlattr_add_string(struct nl_writer *nw, uint16_t attrtype, const char *str)
+{
+ return (nlattr_add(nw, attrtype, strlen(str) + 1, str));
+}
+
+/* Payload: the struct in_addr pointed to by @in. */
+static inline bool
+nlattr_add_in_addr(struct nl_writer *nw, uint16_t attrtype,
+ const struct in_addr *in)
+{
+ return (nlattr_add(nw, attrtype, sizeof(*in), in));
+}
+
+/* Payload: the struct in6_addr pointed to by @in6. */
+static inline bool
+nlattr_add_in6_addr(struct nl_writer *nw, uint16_t attrtype,
+ const struct in6_addr *in6)
+{
+ return (nlattr_add(nw, attrtype, sizeof(*in6), in6));
+}
+#endif
+#endif
diff --git a/sys/netlink/netlink_module.c b/sys/netlink/netlink_module.c
new file mode 100644
index 000000000000..6c3cd90e61ab
--- /dev/null
+++ b/sys/netlink/netlink_module.c
@@ -0,0 +1,221 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/ck.h>
+#include <sys/syslog.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+#include <netlink/route/route_var.h>
+
+#include <machine/atomic.h>
+
+FEATURE(netlink, "Netlink support");
+
+#define DEBUG_MOD_NAME nl_mod
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_INFO);
+
+
+#define NL_MAX_HANDLERS 20
+struct nl_proto_handler _nl_handlers[NL_MAX_HANDLERS];
+struct nl_proto_handler *nl_handlers = _nl_handlers;
+
+CK_LIST_HEAD(nl_control_head, nl_control);
+static struct nl_control_head vnets_head = CK_LIST_HEAD_INITIALIZER();
+
+VNET_DEFINE(struct nl_control, nl_ctl) = {
+ .ctl_port_head = CK_LIST_HEAD_INITIALIZER(),
+ .ctl_pcb_head = CK_LIST_HEAD_INITIALIZER(),
+};
+
+struct mtx nl_global_mtx;
+MTX_SYSINIT(nl_global_mtx, &nl_global_mtx, "global netlink lock", MTX_DEF);
+
+#define NL_GLOBAL_LOCK() mtx_lock(&nl_global_mtx)
+#define NL_GLOBAL_UNLOCK() mtx_unlock(&nl_global_mtx)
+
+int netlink_unloading = 0;
+
+/*
+ * Per-VNET initializer: sets up the VNET's netlink control lock and links
+ * its nl_control into the global vnets_head list under the global mutex.
+ */
+static void
+vnet_nl_init(const void *unused __unused)
+{
+ rm_init(&V_nl_ctl.ctl_lock, "netlink lock");
+
+ NL_GLOBAL_LOCK();
+ CK_LIST_INSERT_HEAD(&vnets_head, &V_nl_ctl, ctl_next);
+ NL_LOG(LOG_DEBUG2, "VNET %p init done, inserted %p into global list",
+ curvnet, &V_nl_ctl);
+ NL_GLOBAL_UNLOCK();
+}
+VNET_SYSINIT(vnet_nl_init, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_nl_init, NULL);
+
+/*
+ * Per-VNET teardown: unlinks the VNET's nl_control from the global list
+ * under the global mutex, then destroys its control lock.
+ */
+static void
+vnet_nl_uninit(const void *unused __unused)
+{
+ /* Assume at the time all of the processes / sockets are dead */
+ NL_GLOBAL_LOCK();
+ NL_LOG(LOG_DEBUG2, "Removing %p from global list", &V_nl_ctl);
+ CK_LIST_REMOVE(&V_nl_ctl, ctl_next);
+ NL_GLOBAL_UNLOCK();
+
+ rm_destroy(&V_nl_ctl.ctl_lock);
+}
+VNET_SYSUNINIT(vnet_nl_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_nl_uninit,
+ NULL);
+
+/*
+ * Checks whether netlink protocol @proto can be used.
+ * Returns EINVAL if @proto is outside the handler table, EPROTONOSUPPORT
+ * if no handler is registered for it, and 0 otherwise.
+ */
+int
+nl_verify_proto(int proto)
+{
+	if (proto < 0 || proto >= NL_MAX_HANDLERS)
+		return (EINVAL);
+	if (nl_handlers[proto].cb == NULL)
+		return (EPROTONOSUPPORT);
+	return (0);
+}
+
+/*
+ * Returns the name stored for netlink protocol @proto, or NULL when @proto
+ * is outside the handler table or no name is currently registered for it.
+ */
+const char *
+nl_get_proto_name(int proto)
+{
+	/* Same range check as nl_verify_proto(): never index out of bounds. */
+	if (proto < 0 || proto >= NL_MAX_HANDLERS)
+		return (NULL);
+	return (nl_handlers[proto].proto_name);
+}
+
+/*
+ * Installs @handler and @proto_name in the handler table slot for @proto,
+ * under the global mutex.  The slot is asserted (KASSERT) to be empty.
+ * @proto_name is stored by pointer, not copied.
+ * Returns false if @proto is outside the table, true otherwise.
+ */
+bool
+netlink_register_proto(int proto, const char *proto_name, nl_handler_f handler)
+{
+ if ((proto < 0) || (proto >= NL_MAX_HANDLERS))
+ return (false);
+ NL_GLOBAL_LOCK();
+ KASSERT((nl_handlers[proto].cb == NULL), ("netlink handler %d is already set", proto));
+ nl_handlers[proto].cb = handler;
+ nl_handlers[proto].proto_name = proto_name;
+ NL_GLOBAL_UNLOCK();
+ NL_LOG(LOG_DEBUG2, "Registered netlink %s(%d) handler", proto_name, proto);
+ return (true);
+}
+
+/*
+ * Clears the handler table slot for @proto (callback and name) under the
+ * global mutex.  The slot is asserted (KASSERT) to be in use.
+ * Returns false if @proto is outside the table, true otherwise.
+ */
+bool
+netlink_unregister_proto(int proto)
+{
+ if ((proto < 0) || (proto >= NL_MAX_HANDLERS))
+ return (false);
+ NL_GLOBAL_LOCK();
+ KASSERT((nl_handlers[proto].cb != NULL), ("netlink handler %d is not set", proto));
+ nl_handlers[proto].cb = NULL;
+ nl_handlers[proto].proto_name = NULL;
+ NL_GLOBAL_UNLOCK();
+ NL_LOG(LOG_DEBUG2, "Unregistered netlink proto %d handler", proto);
+ return (true);
+}
+
+#if !defined(NETLINK) && defined(NETLINK_MODULE)
+/* Non-stub function provider */
+const static struct nl_function_wrapper nl_module = {
+ .nlmsg_add = _nlmsg_add,
+ .nlmsg_refill_buffer = _nlmsg_refill_buffer,
+ .nlmsg_flush = _nlmsg_flush,
+ .nlmsg_end = _nlmsg_end,
+ .nlmsg_abort = _nlmsg_abort,
+ .nl_writer_unicast = _nl_writer_unicast,
+ .nl_writer_group = _nl_writer_group,
+ .nlmsg_end_dump = _nlmsg_end_dump,
+ .nl_modify_ifp_generic = _nl_modify_ifp_generic,
+ .nl_store_ifp_cookie = _nl_store_ifp_cookie,
+ .nl_get_thread_nlp = _nl_get_thread_nlp,
+};
+#endif
+
+/*
+ * Reports whether the module may be unloaded: walks every registered VNET
+ * control block under the global mutex and returns false as soon as one of
+ * them still has entries on its ctl_pcb_head list.
+ */
+static bool
+can_unload(void)
+{
+	struct nl_control *ctl;
+	bool busy = false;
+
+	NL_GLOBAL_LOCK();
+
+	CK_LIST_FOREACH(ctl, &vnets_head, ctl_next) {
+		NL_LOG(LOG_DEBUG2, "Iterating VNET head %p", ctl);
+		if (CK_LIST_EMPTY(&ctl->ctl_pcb_head))
+			continue;
+		NL_LOG(LOG_NOTICE, "non-empty socket list in ctl %p", ctl);
+		busy = true;
+		break;
+	}
+
+	NL_GLOBAL_UNLOCK();
+
+	return (!busy);
+}
+
+/*
+ * Module event handler.
+ * MOD_LOAD: registers the OSD hooks and, when built as a module without
+ * NETLINK in the kernel, installs the nl_module function table.
+ * MOD_UNLOAD: refuses with EBUSY unless can_unload() allows it; otherwise
+ * sets netlink_unloading and undoes the load steps.
+ * Any other event yields EOPNOTSUPP.
+ */
+static int
+netlink_modevent(module_t mod __unused, int what, void *priv __unused)
+{
+ int ret = 0;
+
+ switch (what) {
+ case MOD_LOAD:
+ NL_LOG(LOG_DEBUG2, "Loading");
+ nl_osd_register();
+#if !defined(NETLINK) && defined(NETLINK_MODULE)
+ nl_set_functions(&nl_module);
+#endif
+ break;
+
+ case MOD_UNLOAD:
+ NL_LOG(LOG_DEBUG2, "Unload called");
+ if (can_unload()) {
+ NL_LOG(LOG_WARNING, "unloading");
+ netlink_unloading = 1;
+#if !defined(NETLINK) && defined(NETLINK_MODULE)
+ nl_set_functions(NULL);
+#endif
+ nl_osd_unregister();
+ } else
+ ret = EBUSY;
+ break;
+
+ default:
+ ret = EOPNOTSUPP;
+ break;
+ }
+
+ return (ret);
+}
+static moduledata_t netlink_mod = { "netlink", netlink_modevent, NULL };
+
+DECLARE_MODULE(netlink, netlink_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
+MODULE_VERSION(netlink, 1);
diff --git a/sys/netlink/netlink_route.c b/sys/netlink/netlink_route.c
new file mode 100644
index 000000000000..0123193c204f
--- /dev/null
+++ b/sys/netlink/netlink_route.c
@@ -0,0 +1,143 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/ck.h>
+#include <sys/epoch.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_route.h>
+#include <netlink/route/route_var.h>
+
+#define DEBUG_MOD_NAME nl_route_core
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_INFO);
+
+#define HANDLER_MAX_NUM (NL_RTM_MAX + 10)
+static const struct rtnl_cmd_handler *rtnl_handler[HANDLER_MAX_NUM] = {};
+
+/*
+ * Registers @count command handlers from the @handlers array in the
+ * rtnl_handler table, indexed by each handler's cmd.  Pointers into
+ * @handlers are stored, so the array must outlive the registration.
+ * Returns false, registering nothing, if any cmd is outside the table.
+ */
+bool
+rtnl_register_messages(const struct rtnl_cmd_handler *handlers, int count)
+{
+ /* Validate the whole batch first so a rejected batch changes nothing. */
+ for (int i = 0; i < count; i++) {
+ if (handlers[i].cmd >= HANDLER_MAX_NUM)
+ return (false);
+ MPASS(rtnl_handler[handlers[i].cmd] == NULL);
+ }
+ for (int i = 0; i < count; i++)
+ rtnl_handler[handlers[i].cmd] = &handlers[i];
+ return (true);
+}
+
+/*
+ * Handler called by netlink subsystem when matching netlink message is received
+ */
+/*
+ * Looks up the handler registered for hdr->nlmsg_type, applies the
+ * privilege and jail checks requested by that handler and then runs its
+ * callback, inside the network epoch unless RTNL_F_NOEPOCH is set.
+ * Returns ENOTSUP for an unknown message type, EPERM when a check fails,
+ * otherwise the callback's return value.
+ */
+static int
+rtnl_handle_message(struct nlmsghdr *hdr, struct nl_pstate *npt)
+{
+ const struct rtnl_cmd_handler *cmd;
+ struct epoch_tracker et;
+ struct nlpcb *nlp = npt->nlp;
+ int error = 0;
+
+ if (__predict_false(hdr->nlmsg_type >= HANDLER_MAX_NUM)) {
+ NLMSG_REPORT_ERR_MSG(npt, "unknown message type: %d", hdr->nlmsg_type);
+ return (ENOTSUP);
+ }
+
+ cmd = rtnl_handler[hdr->nlmsg_type];
+ if (__predict_false(cmd == NULL)) {
+ NLMSG_REPORT_ERR_MSG(npt, "unknown message type: %d", hdr->nlmsg_type);
+ return (ENOTSUP);
+ }
+
+ NLP_LOG(LOG_DEBUG2, nlp, "received msg %s(%d) len %d", cmd->name,
+ hdr->nlmsg_type, hdr->nlmsg_len);
+
+ /* A handler with priv == 0 requires no privilege check. */
+ if (cmd->priv != 0 && !nlp_has_priv(nlp, cmd->priv)) {
+ NLP_LOG(LOG_DEBUG2, nlp, "priv %d check failed for msg %s", cmd->priv, cmd->name);
+ return (EPERM);
+ } else if (cmd->priv != 0)
+ NLP_LOG(LOG_DEBUG3, nlp, "priv %d check passed for msg %s", cmd->priv, cmd->name);
+
+ if (!nlp_unconstrained_vnet(nlp) && (cmd->flags & RTNL_F_ALLOW_NONVNET_JAIL) == 0) {
+ NLP_LOG(LOG_DEBUG2, nlp, "jail check failed for msg %s", cmd->name);
+ return (EPERM);
+ }
+
+ bool need_epoch = !(cmd->flags & RTNL_F_NOEPOCH);
+
+ if (need_epoch)
+ NET_EPOCH_ENTER(et);
+ error = cmd->cb(hdr, nlp, npt);
+ if (need_epoch)
+ NET_EPOCH_EXIT(et);
+
+ NLP_LOG(LOG_DEBUG3, nlp, "message %s -> error %d", cmd->name, error);
+
+ return (error);
+}
+
+static struct rtbridge nlbridge = {
+ .route_f = rtnl_handle_route_event,
+ .ifmsg_f = rtnl_handle_ifnet_event,
+};
+static struct rtbridge *nlbridge_orig_p;
+
+/*
+ * SYSINIT hook: saves the current netlink_callback_p and points it at
+ * nlbridge, runs the neigh/iface/nexthop/route init routines and registers
+ * rtnl_handle_message() as the NETLINK_ROUTE protocol handler.
+ */
+static void
+rtnl_load(void *u __unused)
+{
+ NL_LOG(LOG_DEBUG2, "rtnl loading");
+ nlbridge_orig_p = netlink_callback_p;
+ netlink_callback_p = &nlbridge;
+ rtnl_neighs_init();
+ rtnl_ifaces_init();
+ rtnl_nexthops_init();
+ rtnl_routes_init();
+ netlink_register_proto(NETLINK_ROUTE, "NETLINK_ROUTE", rtnl_handle_message);
+}
+SYSINIT(rtnl_load, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtnl_load, NULL);
+
+/*
+ * SYSUNINIT hook: restores the netlink_callback_p saved by rtnl_load(),
+ * unregisters the NETLINK_ROUTE handler, runs the iface/neigh destroy
+ * routines and finally waits for the network epoch to drain.
+ */
+static void
+rtnl_unload(void *u __unused)
+{
+ netlink_callback_p = nlbridge_orig_p;
+ netlink_unregister_proto(NETLINK_ROUTE);
+ rtnl_ifaces_destroy();
+ rtnl_neighs_destroy();
+
+ /* Wait till all consumers read nlbridge data */
+ NET_EPOCH_WAIT();
+}
+SYSUNINIT(rtnl_unload, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtnl_unload, NULL);
diff --git a/sys/netlink/netlink_route.h b/sys/netlink/netlink_route.h
new file mode 100644
index 000000000000..ecdad83312de
--- /dev/null
+++ b/sys/netlink/netlink_route.h
@@ -0,0 +1,44 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _NETLINK_NETLINK_ROUTE_H_
+#define _NETLINK_NETLINK_ROUTE_H_
+
+#include <sys/types.h>
+
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/if_var.h>
+
+#include <netlink/netlink_bitset.h>
+#include <netlink/route/common.h>
+#include <netlink/route/ifaddrs.h>
+#include <netlink/route/interface.h>
+#include <netlink/route/neigh.h>
+#include <netlink/route/route.h>
+#include <netlink/route/nexthop.h>
+
+#endif
diff --git a/sys/netlink/netlink_snl.h b/sys/netlink/netlink_snl.h
new file mode 100644
index 000000000000..586716776bc5
--- /dev/null
+++ b/sys/netlink/netlink_snl.h
@@ -0,0 +1,1330 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _NETLINK_NETLINK_SNL_H_
+#define _NETLINK_NETLINK_SNL_H_
+
+/*
+ * Simple Netlink Library
+ */
+
+#include <sys/param.h>
+#include <sys/socket.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_bitset.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <stdalign.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/* Round x up to the next multiple of y; y must be a power of two. */
+#define	_roundup2(x, y)		(((x)+((y)-1))&(~((y)-1)))
+
+/* Netlink messages are padded to 32-bit boundaries. */
+#define	NETLINK_ALIGN_SIZE	sizeof(uint32_t)
+#define	NETLINK_ALIGN(_len)	_roundup2(_len, NETLINK_ALIGN_SIZE)
+
+/* Attribute (struct nlattr) layout helpers. */
+#define	NLA_ALIGN_SIZE		sizeof(uint32_t)
+#define	NLA_HDRLEN		((int)sizeof(struct nlattr))
+/* Payload length; negative when nla_len is smaller than the header. */
+#define	NLA_DATA_LEN(_nla)	((int)((_nla)->nla_len - NLA_HDRLEN))
+#define	NLA_DATA(_nla)		NL_ITEM_DATA(_nla, NLA_HDRLEN)
+#define	NLA_DATA_CONST(_nla)	NL_ITEM_DATA_CONST(_nla, NLA_HDRLEN)
+
+/* Attribute type with the two top flag bits masked off. */
+#define	NLA_TYPE(_nla)		((_nla)->nla_type & 0x3FFF)
+
+/*
+ * Address of the attribute following _attr.  The argument and the whole
+ * expansion are parenthesized so the macro can be used with arbitrary
+ * pointer expressions and inside larger expressions.
+ */
+#define	NLA_NEXT(_attr)		\
+	((struct nlattr *)(void *)((char *)(_attr) + NLA_ALIGN((_attr)->nla_len)))
+
+#define	_NLA_END(_start, _len)	((char *)(_start) + (_len))
+/*
+ * Iterate over the attributes stored in [_start, _start + _len).
+ * The loop stops as soon as the current attribute (including its padding)
+ * would extend past the end of the region.  The body is still responsible
+ * for rejecting attributes whose nla_len is shorter than the header.
+ */
+#define	NLA_FOREACH(_attr, _start, _len)				\
+	for ((_attr) = (struct nlattr *)(_start);			\
+	    ((char *)(_attr) < _NLA_END(_start, _len)) &&		\
+	    ((char *)NLA_NEXT(_attr) <= _NLA_END(_start, _len));	\
+	    (_attr) = NLA_NEXT(_attr))
+
+/*
+ * Header of a bump ("linear") allocator chunk.  lb_init() places the header
+ * at the start of a single calloc()ed region; the usable bytes follow it.
+ */
+struct linear_buffer {
+ char *base; /* Base allocated memory pointer */
+ uint32_t offset; /* Currently used offset */
+ uint32_t size; /* Total buffer size */
+ struct linear_buffer *next; /* Buffer chaining */
+} __aligned(alignof(__max_align_t));
+
+/*
+ * Allocates a zeroed linear buffer occupying @size bytes in total: the
+ * struct linear_buffer header followed by (@size - header) usable bytes.
+ *
+ * Returns the new buffer, or NULL if @size cannot even hold the header or
+ * the allocation fails.
+ */
+static inline struct linear_buffer *
+lb_init(uint32_t size)
+{
+	struct linear_buffer *lb;
+
+	/*
+	 * A region smaller than the header would be overrun by the stores
+	 * below and would make lb->size wrap around.
+	 */
+	if (size < sizeof(struct linear_buffer))
+		return (NULL);
+
+	lb = (struct linear_buffer *)calloc(1, size);
+	if (lb != NULL) {
+		lb->base = (char *)(lb + 1);
+		lb->size = size - sizeof(*lb);
+	}
+
+	return (lb);
+}
+
+/*
+ * Releases a single linear buffer (header and data are one allocation).
+ * Does not follow lb->next.
+ */
+static inline void
+lb_free(struct linear_buffer *lb)
+{
+ free(lb);
+}
+
+/*
+ * Carves @len bytes (rounded up to max_align_t alignment) out of @lb.
+ * The memory is zeroed because the buffer comes from calloc() and
+ * lb_clear() re-zeroes the used part.
+ *
+ * Returns a pointer to the carved region, or NULL if @len is negative or
+ * the buffer does not have enough room left.
+ */
+static inline char *
+lb_allocz(struct linear_buffer *lb, int len)
+{
+	uint32_t need;
+
+	if (len < 0)
+		return (NULL);
+	need = roundup2((uint32_t)len, alignof(__max_align_t));
+	/* offset never exceeds size, so the subtraction cannot wrap */
+	if (need > lb->size - lb->offset)
+		return (NULL);
+	char *data = (lb->base + lb->offset);
+	lb->offset += need;
+	return (data);
+}
+
+/*
+ * Resets the buffer for reuse: zeroes the bytes handed out so far (so that
+ * later allocations are zero-filled again) and rewinds the offset.
+ */
+static inline void
+lb_clear(struct linear_buffer *lb)
+{
+ memset(lb->base, 0, lb->offset);
+ lb->offset = 0;
+}
+
+/* Per-socket state of the Simple Netlink Library. */
+struct snl_state {
+ int fd; /* netlink socket descriptor */
+ char *buf; /* receive buffer, bufsize bytes */
+ size_t off; /* offset of the next unread message in buf */
+ size_t bufsize; /* size of buf (taken from SO_RCVBUF in snl_init()) */
+ size_t datalen; /* number of valid bytes in buf from the last recvmsg() */
+ uint32_t seq; /* last sequence number handed out by snl_get_seq() */
+ bool init_done; /* true when this state owns fd and must close it */
+ struct linear_buffer *lb; /* head of the scratch allocator chain */
+};
+/* Initial size of the scratch linear buffer (see snl_init()/snl_clone()). */
+#define SCRATCH_BUFFER_SIZE 1024
+/* Initial size of a message writer buffer (see snl_init_writer()). */
+#define SNL_WRITER_BUFFER_SIZE 256
+
+/*
+ * Fixed-header field parser: the callback receives a pointer into the input
+ * header (@hdr) and a pointer into the output structure (@target).
+ */
+typedef void snl_parse_field_f(struct snl_state *ss, void *hdr, void *target);
+struct snl_field_parser {
+ uint16_t off_in; /* field offset in the input header */
+ uint16_t off_out; /* field offset in the target structure */
+ snl_parse_field_f *cb; /* copy/convert function to call */
+};
+static const struct snl_field_parser snl_f_p_empty[] = {};
+
+/*
+ * Attribute parser callback: returns false to abort parsing of the
+ * enclosing message.
+ */
+typedef bool snl_parse_attr_f(struct snl_state *ss, struct nlattr *attr,
+ const void *arg, void *target);
+struct snl_attr_parser {
+ uint16_t type; /* Attribute type */
+ uint16_t off; /* field offset in the target structure */
+ snl_parse_attr_f *cb; /* parser function to call */
+
+ /* Optional parser argument */
+ union {
+ const void *arg;
+ const uint32_t arg_u32;
+ };
+};
+
+/* Callback run on the target once all fields and attributes were parsed. */
+typedef bool snl_parse_post_f(struct snl_state *ss, void *target);
+
+/*
+ * Complete description of how to parse one message or nested container:
+ * a fixed header handled by @fp followed by attributes handled by @np.
+ * The attribute array is binary-searched by find_parser(), so it has to be
+ * sorted by type (see snl_verify_parsers()).
+ */
+struct snl_hdr_parser {
+ uint16_t in_hdr_size; /* Input header size */
+ uint16_t out_size; /* Output structure size */
+ uint16_t fp_size; /* Number of items in field parser */
+ uint16_t np_size; /* Number of items in attribute parser */
+ const struct snl_field_parser *fp; /* array of header field parsers */
+ const struct snl_attr_parser *np; /* array of attribute parsers */
+ snl_parse_post_f *cb_post; /* post-parse callback */
+};
+
+/*
+ * Parser declaration helpers.
+ *
+ * The designated initializers below follow the member order of
+ * struct snl_hdr_parser (in_hdr_size, out_size, fp_size, np_size, fp, np,
+ * cb_post).  C accepts any order, but C++ requires declaration order, and
+ * this header is meant to be usable from both.
+ */
+
+/* Declares a parser with both header fields and attributes. */
+#define	SNL_DECLARE_PARSER_EXT(_name, _sz_h_in, _sz_out, _fp, _np, _cb)	\
+static const struct snl_hdr_parser _name = {				\
+	.in_hdr_size = _sz_h_in,					\
+	.out_size = _sz_out,						\
+	.fp_size = nitems(_fp),						\
+	.np_size = nitems(_np),						\
+	.fp = &((_fp)[0]),						\
+	.np = &((_np)[0]),						\
+	.cb_post = _cb,							\
+}
+
+#define	SNL_DECLARE_PARSER(_name, _t, _fp, _np)				\
+	SNL_DECLARE_PARSER_EXT(_name, sizeof(_t), 0, _fp, _np, NULL)
+
+/* Declares a parser with header fields only (np stays NULL, np_size 0). */
+#define	SNL_DECLARE_FIELD_PARSER_EXT(_name, _sz_h_in, _sz_out, _fp, _cb) \
+static const struct snl_hdr_parser _name = {				\
+	.in_hdr_size = _sz_h_in,					\
+	.out_size = _sz_out,						\
+	.fp_size = nitems(_fp),						\
+	.fp = &((_fp)[0]),						\
+	.cb_post = _cb,							\
+}
+
+#define	SNL_DECLARE_FIELD_PARSER(_name, _t, _fp)			\
+	SNL_DECLARE_FIELD_PARSER_EXT(_name, sizeof(_t), 0, _fp, NULL)
+
+/* Declares a parser with attributes only (no fixed header, no fields). */
+#define	SNL_DECLARE_ATTR_PARSER_EXT(_name, _sz_out, _np, _cb)		\
+static const struct snl_hdr_parser _name = {				\
+	.out_size = _sz_out,						\
+	.np_size = nitems(_np),						\
+	.np = &((_np)[0]),						\
+	.cb_post = _cb,							\
+}
+
+#define	SNL_DECLARE_ATTR_PARSER(_name, _np)				\
+	SNL_DECLARE_ATTR_PARSER_EXT(_name, 0, _np, NULL)
+
+
+
+/*
+ * Allocates @len zeroed bytes from the state's scratch allocator.
+ *
+ * If the current linear buffer is full, a new one is chained in front of
+ * it.  Its total size is the old usable size doubled until it can hold
+ * @len plus the buffer header.  Memory stays valid until snl_clear_lb() or
+ * snl_free().
+ *
+ * Returns NULL if @len is negative, the required size does not fit into
+ * 32 bits, or memory cannot be obtained.
+ */
+static inline void *
+snl_allocz(struct snl_state *ss, int len)
+{
+	void *data;
+
+	/* A negative length would make the sizing loop below spin forever. */
+	if (len < 0)
+		return (NULL);
+
+	data = lb_allocz(ss->lb, len);
+	if (data != NULL)
+		return (data);
+
+	/* Size in 64 bits so that doubling can neither wrap nor stall at 0. */
+	uint64_t need = (uint64_t)len + sizeof(struct linear_buffer);
+	uint64_t size = (uint64_t)ss->lb->size * 2;
+
+	if (size == 0)
+		size = 1;
+	while (size < need)
+		size *= 2;
+	if (size > UINT32_MAX)
+		return (NULL);
+
+	struct linear_buffer *lb = lb_init((uint32_t)size);
+
+	if (lb != NULL) {
+		lb->next = ss->lb;
+		ss->lb = lb;
+		data = lb_allocz(ss->lb, len);
+	}
+
+	return (data);
+}
+
+/*
+ * Recycles the scratch allocator: the head buffer (the most recently
+ * chained one) is cleared and kept, every older buffer behind it is freed.
+ */
+static inline void
+snl_clear_lb(struct snl_state *ss)
+{
+	struct linear_buffer *head = ss->lb;
+	struct linear_buffer *victim, *following;
+
+	lb_clear(head);
+
+	/* Detach the tail of the chain before walking it. */
+	victim = head->next;
+	head->next = NULL;
+
+	/* Remove all linear bufs except the largest one */
+	for (; victim != NULL; victim = following) {
+		following = victim->next;
+		lb_free(victim);
+	}
+}
+
+/*
+ * Releases everything owned by @ss: the socket (only when this state
+ * created it, i.e. init_done is set), the receive buffer and the whole
+ * scratch allocator chain.
+ *
+ * Released members are reset so that calling snl_free() again on the same
+ * state is harmless.  Declared inline like the rest of this header-only
+ * library so that unused copies do not trigger warnings.
+ */
+static inline void
+snl_free(struct snl_state *ss)
+{
+	if (ss->init_done) {
+		close(ss->fd);
+		ss->init_done = false;
+	}
+
+	/* free(NULL) is a no-op, no guard needed */
+	free(ss->buf);
+	ss->buf = NULL;
+
+	if (ss->lb != NULL) {
+		/* drops all but the head buffer, then drop the head too */
+		snl_clear_lb(ss);
+		lb_free(ss->lb);
+		ss->lb = NULL;
+	}
+}
+
+/*
+ * Initializes @ss and opens a netlink socket for @netlink_family.
+ *
+ * Enables NETLINK_EXT_ACK on the socket, sizes the receive buffer after the
+ * socket's SO_RCVBUF and creates the scratch allocator.
+ *
+ * Returns true on success.  On failure everything acquired so far is
+ * released via snl_free() and false is returned.
+ */
+static inline bool
+snl_init(struct snl_state *ss, int netlink_family)
+{
+	int val = 1;
+	int rcvbuf;
+	socklen_t optlen = sizeof(val);
+
+	memset(ss, 0, sizeof(*ss));
+
+	ss->fd = socket(AF_NETLINK, SOCK_RAW, netlink_family);
+	if (ss->fd == -1)
+		return (false);
+	/* From here on snl_free() is responsible for closing the socket. */
+	ss->init_done = true;
+
+	if (setsockopt(ss->fd, SOL_NETLINK, NETLINK_EXT_ACK, &val, optlen) == -1)
+		goto fail;
+
+	if (getsockopt(ss->fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, &optlen) == -1)
+		goto fail;
+
+	ss->bufsize = rcvbuf;
+	ss->buf = (char *)malloc(ss->bufsize);
+	if (ss->buf == NULL)
+		goto fail;
+
+	ss->lb = lb_init(SCRATCH_BUFFER_SIZE);
+	if (ss->lb == NULL)
+		goto fail;
+
+	return (true);
+
+fail:
+	snl_free(ss);
+	return (false);
+}
+
+/*
+ * Makes @ss a lightweight companion of @orig: it shares the socket
+ * descriptor but gets its own scratch allocator and no receive buffer.
+ * init_done stays false, so snl_free() on the clone leaves the shared
+ * socket open.
+ *
+ * Returns false if the scratch allocator cannot be created.
+ */
+static inline bool
+snl_clone(struct snl_state *ss, const struct snl_state *orig)
+{
+	memset(ss, 0, sizeof(*ss));
+	ss->fd = orig->fd;
+	ss->init_done = false;
+
+	ss->lb = lb_init(SCRATCH_BUFFER_SIZE);
+	return (ss->lb != NULL);
+}
+
+/*
+ * Sends @sz raw bytes from @data on the netlink socket.
+ * Returns true only if send() reports that all @sz bytes were sent.
+ */
+static inline bool
+snl_send(struct snl_state *ss, void *data, int sz)
+{
+ return (send(ss->fd, data, sz, 0) == sz);
+}
+
+/*
+ * Sends one netlink message.  The number of bytes sent is nlmsg_len rounded
+ * up by NLMSG_ALIGN(), so the buffer behind @hdr must include the padding.
+ * Returns true only if send() reports that all of those bytes were sent.
+ */
+static inline bool
+snl_send_message(struct snl_state *ss, struct nlmsghdr *hdr)
+{
+ ssize_t sz = NLMSG_ALIGN(hdr->nlmsg_len);
+
+ return (send(ss->fd, hdr, sz, 0) == sz);
+}
+
+/*
+ * Returns the next sequence number for this state (pre-incremented, so the
+ * first value after snl_init() is 1).
+ */
+static inline uint32_t
+snl_get_seq(struct snl_state *ss)
+{
+ return (++ss->seq);
+}
+
+/*
+ * Per-message metadata extracted from the NETLINK_MSG_INFO control message
+ * by parse_cmsg().
+ *
+ * port_id and seq_id are 32 bits wide: the attribute parser table for this
+ * structure stores them with snl_attr_get_uint32(), which writes a full
+ * uint32_t at the field offset.  With narrower fields that store would
+ * clobber the neighbouring member and run past the end of the structure.
+ */
+struct snl_msg_info {
+	int		cmsg_type;
+	int		cmsg_level;
+	uint32_t	process_id;
+	uint32_t	port_id;
+	uint32_t	seq_id;
+};
+static inline bool parse_cmsg(struct snl_state *ss, const struct msghdr *msg,
+    struct snl_msg_info *attrs);
+
+/*
+ * Returns the next netlink message from the socket, refilling the receive
+ * buffer with recvmsg() when all buffered messages were consumed.  On a
+ * refill the NETLINK_MSG_INFO control data (if any) is parsed into @cinfo;
+ * @cinfo is zeroed on every call.
+ *
+ * Returns NULL if recvmsg() fails (other than EINTR, which is retried),
+ * returns 0 bytes, or if the buffered data does not contain a complete,
+ * well-formed message header.  The returned pointer refers to ss->buf and
+ * is valid until the next refill.
+ */
+static inline struct nlmsghdr *
+snl_read_message_dbg(struct snl_state *ss, struct snl_msg_info *cinfo)
+{
+	memset(cinfo, 0, sizeof(*cinfo));
+
+	if (ss->off >= ss->datalen) {
+		struct sockaddr_nl nladdr;
+		char cbuf[64];
+		ssize_t datalen;
+
+		struct iovec iov = {
+			.iov_base = ss->buf,
+			.iov_len = ss->bufsize,
+		};
+		struct msghdr msg = {
+			.msg_name = &nladdr,
+			.msg_namelen = sizeof(nladdr),
+			.msg_iov = &iov,
+			.msg_iovlen = 1,
+			.msg_control = cbuf,
+			.msg_controllen = sizeof(cbuf),
+		};
+		ss->off = 0;
+		ss->datalen = 0;
+		/*
+		 * Retry only on a real EINTR failure.  errno is meaningful
+		 * only when recvmsg() returned -1; a 0 return must not be
+		 * retried based on a stale errno value.
+		 */
+		do {
+			datalen = recvmsg(ss->fd, &msg, 0);
+		} while (datalen == -1 && errno == EINTR);
+		if (datalen <= 0)
+			return (NULL);
+		ss->datalen = datalen;
+		parse_cmsg(ss, &msg, cinfo);
+	}
+
+	size_t avail = ss->datalen - ss->off;
+	struct nlmsghdr *hdr = (struct nlmsghdr *)(void *)&ss->buf[ss->off];
+
+	/* Reject truncated or malformed headers instead of walking off buf. */
+	if (avail < sizeof(*hdr) || hdr->nlmsg_len < sizeof(*hdr) ||
+	    hdr->nlmsg_len > avail) {
+		ss->off = ss->datalen;
+		return (NULL);
+	}
+
+	/* Trailing padding may be absent after the last message: clamp. */
+	size_t step = NLMSG_ALIGN(hdr->nlmsg_len);
+	ss->off += (step < avail) ? step : avail;
+	return (hdr);
+}
+
+
+/*
+ * Returns the next netlink message from the socket, refilling the receive
+ * buffer with recvmsg() when all buffered messages were consumed.
+ *
+ * Returns NULL if recvmsg() fails (other than EINTR, which is retried),
+ * returns 0 bytes, or if the buffered data does not contain a complete,
+ * well-formed message header.  The returned pointer refers to ss->buf and
+ * is valid until the next refill.
+ */
+static inline struct nlmsghdr *
+snl_read_message(struct snl_state *ss)
+{
+	if (ss->off >= ss->datalen) {
+		struct sockaddr_nl nladdr;
+		ssize_t datalen;
+		struct iovec iov = {
+			.iov_base = ss->buf,
+			.iov_len = ss->bufsize,
+		};
+		struct msghdr msg = {
+			.msg_name = &nladdr,
+			.msg_namelen = sizeof(nladdr),
+			.msg_iov = &iov,
+			.msg_iovlen = 1,
+		};
+		ss->off = 0;
+		ss->datalen = 0;
+		/*
+		 * Retry only on a real EINTR failure.  errno is meaningful
+		 * only when recvmsg() returned -1; a 0 return must not be
+		 * retried based on a stale errno value.
+		 */
+		do {
+			datalen = recvmsg(ss->fd, &msg, 0);
+		} while (datalen == -1 && errno == EINTR);
+		if (datalen <= 0)
+			return (NULL);
+		ss->datalen = datalen;
+	}
+
+	size_t avail = ss->datalen - ss->off;
+	struct nlmsghdr *hdr = (struct nlmsghdr *)(void *)&ss->buf[ss->off];
+
+	/* Reject truncated or malformed headers instead of walking off buf. */
+	if (avail < sizeof(*hdr) || hdr->nlmsg_len < sizeof(*hdr) ||
+	    hdr->nlmsg_len > avail) {
+		ss->off = ss->datalen;
+		return (NULL);
+	}
+
+	/* Trailing padding may be absent after the last message: clamp. */
+	size_t step = NLMSG_ALIGN(hdr->nlmsg_len);
+	ss->off += (step < avail) ? step : avail;
+	return (hdr);
+}
+
+/*
+ * Reads messages until one carrying sequence number @nlmsg_seq shows up and
+ * returns it; messages with other sequence numbers are skipped.
+ * Returns NULL as soon as reading fails.
+ */
+static inline struct nlmsghdr *
+snl_read_reply(struct snl_state *ss, uint32_t nlmsg_seq)
+{
+	for (;;) {
+		struct nlmsghdr *msg = snl_read_message(ss);
+
+		/* Stop on read failure or on the first matching message. */
+		if (msg == NULL || msg->nlmsg_seq == nlmsg_seq)
+			return (msg);
+	}
+}
+
+/*
+ * Checks that attributes are sorted by attribute type.
+ *
+ * For each of the @count parsers in @parser, asserts that the attribute
+ * types in its np array are strictly increasing and greater than zero;
+ * find_parser() relies on that ordering for its binary search.
+ * The check is an assert(), so it compiles to nothing under NDEBUG.
+ */
+static inline void
+snl_verify_parsers(const struct snl_hdr_parser **parser, int count)
+{
+ for (int i = 0; i < count; i++) {
+ const struct snl_hdr_parser *p = parser[i];
+ int attr_type = 0;
+ for (int j = 0; j < p->np_size; j++) {
+ assert(p->np[j].type > attr_type);
+ attr_type = p->np[j].type;
+ }
+ }
+}
+/* Convenience wrapper for an actual array of parser pointers. */
+#define SNL_VERIFY_PARSERS(_p) snl_verify_parsers((_p), nitems(_p))
+
+/*
+ * Binary-searches the attribute parser array @ps (@pslen entries, sorted by
+ * ascending type) for the entry handling attribute type @key.
+ *
+ * Returns the matching entry or NULL if there is none.  An empty or missing
+ * array (as produced by field-only parsers, whose np is NULL) yields NULL
+ * instead of reading ps[0] / ps[-1].
+ */
+static inline const struct snl_attr_parser *
+find_parser(const struct snl_attr_parser *ps, int pslen, int key)
+{
+	int left_i = 0, right_i = pslen - 1;
+
+	if (ps == NULL || pslen <= 0)
+		return (NULL);
+
+	/* Quick reject for keys outside the covered range. */
+	if (key < ps[0].type || key > ps[pslen - 1].type)
+		return (NULL);
+
+	while (left_i + 1 < right_i) {
+		int mid_i = (left_i + right_i) / 2;
+		if (key < ps[mid_i].type)
+			right_i = mid_i;
+		else if (key > ps[mid_i].type)
+			left_i = mid_i + 1;
+		else
+			return (&ps[mid_i]);
+	}
+	/* At most two candidates are left. */
+	if (ps[left_i].type == key)
+		return (&ps[left_i]);
+	else if (ps[right_i].type == key)
+		return (&ps[right_i]);
+	return (NULL);
+}
+
+/*
+ * Walks the attributes in [@nla_head, @nla_head + @len) and, for every
+ * attribute type that has an entry in @ps, invokes that entry's callback
+ * with a pointer to the corresponding field inside @target.
+ * Attributes without a parser are silently skipped.
+ *
+ * Returns false if an attribute is shorter than its own header or if any
+ * callback fails; true otherwise.
+ */
+static inline bool
+snl_parse_attrs_raw(struct snl_state *ss, struct nlattr *nla_head, int len,
+    const struct snl_attr_parser *ps, int pslen, void *target)
+{
+	struct nlattr *cur;
+
+	NLA_FOREACH(cur, nla_head, len) {
+		const struct snl_attr_parser *handler;
+
+		if (cur->nla_len < sizeof(struct nlattr))
+			return (false);
+
+		handler = find_parser(ps, pslen, cur->nla_type & NLA_TYPE_MASK);
+		if (handler == NULL)
+			continue;
+
+		if (!handler->cb(ss, cur, handler->arg,
+		    (void *)((char *)target + handler->off)))
+			return (false);
+	}
+	return (true);
+}
+
+/*
+ * Parses the attributes of a complete netlink message.  The attributes are
+ * taken to start after the netlink header plus the family header of
+ * @hdrlen bytes (rounded up with NETLINK_ALIGN) and to run up to
+ * hdr->nlmsg_len.
+ * Returns the result of snl_parse_attrs_raw().
+ */
+static inline bool
+snl_parse_attrs(struct snl_state *ss, struct nlmsghdr *hdr, int hdrlen,
+ const struct snl_attr_parser *ps, int pslen, void *target)
+{
+ int off = NLMSG_HDRLEN + NETLINK_ALIGN(hdrlen);
+ int len = hdr->nlmsg_len - off;
+ struct nlattr *nla_head = (struct nlattr *)(void *)((char *)hdr + off);
+
+ return (snl_parse_attrs_raw(ss, nla_head, len, ps, pslen, target));
+}
+
+/*
+ * Runs every field parser in @ps (@pslen entries): each callback gets the
+ * address of its input field inside @hdr and of its output field inside
+ * @target.  @hdrlen is not used.
+ */
+static inline void
+snl_parse_fields(struct snl_state *ss, struct nlmsghdr *hdr, int hdrlen __unused,
+    const struct snl_field_parser *ps, int pslen, void *target)
+{
+	char *in_base = (char *)hdr;
+	char *out_base = (char *)target;
+
+	for (int idx = 0; idx < pslen; idx++)
+		ps[idx].cb(ss, in_base + ps[idx].off_in,
+		    out_base + ps[idx].off_out);
+}
+
+/*
+ * Parses a block of @len bytes at @hdr according to @parser into @target:
+ * first the fixed-header fields, then the attributes that follow
+ * parser->in_hdr_size bytes of header, and finally the optional post-parse
+ * callback (only when attribute parsing succeeded).
+ *
+ * Returns false if attribute parsing or the post-parse callback fails.
+ */
+static inline bool
+snl_parse_header(struct snl_state *ss, void *hdr, int len,
+ const struct snl_hdr_parser *parser, void *target)
+{
+ struct nlattr *nla_head;
+
+ /* Extract fields first (if any) */
+ snl_parse_fields(ss, (struct nlmsghdr *)hdr, parser->in_hdr_size,
+ parser->fp, parser->fp_size, target);
+
+ /* Attributes start right behind the fixed header. */
+ nla_head = (struct nlattr *)(void *)((char *)hdr + parser->in_hdr_size);
+ bool result = snl_parse_attrs_raw(ss, nla_head, len - parser->in_hdr_size,
+ parser->np, parser->np_size, target);
+
+ if (result && parser->cb_post != NULL)
+ result = parser->cb_post(ss, target);
+
+ return (result);
+}
+
+/*
+ * Parses the payload of a netlink message (everything after the
+ * struct nlmsghdr) with @parser into @target.
+ */
+static inline bool
+snl_parse_nlmsg(struct snl_state *ss, struct nlmsghdr *hdr,
+ const struct snl_hdr_parser *parser, void *target)
+{
+ return (snl_parse_header(ss, hdr + 1, hdr->nlmsg_len - sizeof(*hdr), parser, target));
+}
+
+/*
+ * Flag attribute: carries no payload, its presence is the value.
+ * Stores 1 into the uint8_t at @target; fails if the attribute has data.
+ */
+static inline bool
+snl_attr_get_flag(struct snl_state *ss __unused, struct nlattr *nla, const void *arg __unused,
+    void *target)
+{
+	uint8_t *flag = (uint8_t *)target;
+
+	if (NLA_DATA_LEN(nla) != 0)
+		return (false);
+
+	*flag = 1;
+	return (true);
+}
+
+/*
+ * Copies a fixed-size binary attribute into @target.
+ * @arg is not a pointer here: it carries the expected payload size as an
+ * integer.  Fails unless the payload length matches it exactly.
+ */
+static inline bool
+snl_attr_get_bytes(struct snl_state *ss __unused, struct nlattr *nla, const void *arg,
+ void *target)
+{
+ if ((size_t)NLA_DATA_LEN(nla) != (size_t)arg)
+ return (false);
+
+ memcpy(target, NLA_DATA_CONST(nla), (size_t)arg);
+
+ return (true);
+}
+
+/*
+ * Scalar attribute getters.  Each one checks that the payload has exactly
+ * the size of the scalar type, stores the value at @target and returns
+ * true; any other payload size makes it return false and leaves @target
+ * untouched.
+ */
+
+/* bool payload -> bool at @target */
+static inline bool
+snl_attr_get_bool(struct snl_state *ss __unused, struct nlattr *nla,
+    const void *arg __unused, void *target)
+{
+	if (NLA_DATA_LEN(nla) != sizeof(bool))
+		return (false);
+
+	const bool *src = (const bool *)NLA_DATA_CONST(nla);
+	*((bool *)target) = *src;
+	return (true);
+}
+
+/* uint8_t payload -> uint8_t at @target */
+static inline bool
+snl_attr_get_uint8(struct snl_state *ss __unused, struct nlattr *nla,
+    const void *arg __unused, void *target)
+{
+	if (NLA_DATA_LEN(nla) != sizeof(uint8_t))
+		return (false);
+
+	const uint8_t *src = (const uint8_t *)NLA_DATA_CONST(nla);
+	*((uint8_t *)target) = *src;
+	return (true);
+}
+
+/* uint16_t payload -> uint16_t at @target */
+static inline bool
+snl_attr_get_uint16(struct snl_state *ss __unused, struct nlattr *nla,
+    const void *arg __unused, void *target)
+{
+	if (NLA_DATA_LEN(nla) != sizeof(uint16_t))
+		return (false);
+
+	const uint16_t *src = (const uint16_t *)NLA_DATA_CONST(nla);
+	*((uint16_t *)target) = *src;
+	return (true);
+}
+
+/* uint32_t payload -> uint32_t at @target */
+static inline bool
+snl_attr_get_uint32(struct snl_state *ss __unused, struct nlattr *nla,
+    const void *arg __unused, void *target)
+{
+	if (NLA_DATA_LEN(nla) != sizeof(uint32_t))
+		return (false);
+
+	const uint32_t *src = (const uint32_t *)NLA_DATA_CONST(nla);
+	*((uint32_t *)target) = *src;
+	return (true);
+}
+
+/* uint64_t payload -> uint64_t at @target, copied bytewise */
+static inline bool
+snl_attr_get_uint64(struct snl_state *ss __unused, struct nlattr *nla,
+    const void *arg __unused, void *target)
+{
+	if (NLA_DATA_LEN(nla) != sizeof(uint64_t))
+		return (false);
+
+	memcpy(target, NLA_DATA_CONST(nla), sizeof(uint64_t));
+	return (true);
+}
+
+/*
+ * Signed scalar getters.  Each simply forwards to the unsigned getter of
+ * the same width, which copies the payload unchanged.
+ */
+static inline bool
+snl_attr_get_int8(struct snl_state *ss, struct nlattr *nla, const void *arg,
+ void *target)
+{
+ return (snl_attr_get_uint8(ss, nla, arg, target));
+}
+
+static inline bool
+snl_attr_get_int16(struct snl_state *ss, struct nlattr *nla, const void *arg,
+ void *target)
+{
+ return (snl_attr_get_uint16(ss, nla, arg, target));
+}
+
+static inline bool
+snl_attr_get_int32(struct snl_state *ss, struct nlattr *nla, const void *arg,
+ void *target)
+{
+ return (snl_attr_get_uint32(ss, nla, arg, target));
+}
+
+static inline bool
+snl_attr_get_int64(struct snl_state *ss, struct nlattr *nla, const void *arg,
+ void *target)
+{
+ return (snl_attr_get_uint64(ss, nla, arg, target));
+}
+
+/*
+ * NUL-terminated string attribute, returned by reference: on success the
+ * char pointer at @target points into the message itself, so it is only
+ * valid while the message is.
+ * Fails if the payload contains no NUL byte.
+ */
+static inline bool
+snl_attr_get_string(struct snl_state *ss __unused, struct nlattr *nla,
+    const void *arg __unused, void *target)
+{
+	size_t maxlen = NLA_DATA_LEN(nla);
+
+	/* strnlen() == maxlen means no terminator inside the payload */
+	if (strnlen((char *)NLA_DATA(nla), maxlen) >= maxlen)
+		return (false);
+
+	*((char **)target) = (char *)NLA_DATA(nla);
+	return (true);
+}
+
+/*
+ * String attribute that need not be NUL-terminated: the payload is copied
+ * into scratch memory with a terminator appended, and the copy's address is
+ * stored in the char pointer at @target.
+ * Fails only if scratch memory cannot be allocated.
+ */
+static inline bool
+snl_attr_get_stringn(struct snl_state *ss, struct nlattr *nla,
+    const void *arg __unused, void *target)
+{
+	int payload_len = NLA_DATA_LEN(nla);
+	char *copy;
+
+	copy = (char *)snl_allocz(ss, payload_len + 1);
+	if (copy == NULL)
+		return (false);
+
+	copy[payload_len] = '\0';
+	memcpy(copy, NLA_DATA(nla), payload_len);
+
+	*((char **)target) = copy;
+	return (true);
+}
+
+/*
+ * NUL-terminated string attribute copied into a caller-provided character
+ * array at @target.  @arg is not a pointer here: it carries the size of
+ * that array as an integer, and the copy is truncated to fit (strlcpy()).
+ * Fails if the payload contains no NUL byte.
+ */
+static inline bool
+snl_attr_copy_string(struct snl_state *ss, struct nlattr *nla,
+    const void *arg, void *target)
+{
+	char *src;
+
+	if (!snl_attr_get_string(ss, nla, NULL, &src))
+		return (false);
+
+	strlcpy((char *)target, src, (size_t)arg);
+	return (true);
+}
+
+/*
+ * NUL-terminated string attribute duplicated into scratch memory: on
+ * success the char pointer at @target points to the copy, which lives until
+ * the scratch allocator is cleared.
+ * Fails if the payload contains no NUL byte or memory cannot be allocated.
+ *
+ * @ss is used for the allocation, so it is not annotated __unused.
+ */
+static inline bool
+snl_attr_dup_string(struct snl_state *ss, struct nlattr *nla,
+    const void *arg __unused, void *target)
+{
+	size_t maxlen = NLA_DATA_LEN(nla);
+
+	/* strnlen() == maxlen means no terminator inside the payload */
+	if (strnlen((char *)NLA_DATA(nla), maxlen) >= maxlen)
+		return (false);
+
+	/* maxlen includes the terminator found above */
+	char *buf = (char *)snl_allocz(ss, maxlen);
+	if (buf == NULL)
+		return (false);
+	memcpy(buf, NLA_DATA(nla), maxlen);
+	*((char **)target) = buf;
+	return (true);
+}
+
+/*
+ * Nested attribute: parses the payload of @nla with the header parser
+ * passed in @arg (a const struct snl_hdr_parser *), writing into @target.
+ */
+static inline bool
+snl_attr_get_nested(struct snl_state *ss, struct nlattr *nla, const void *arg, void *target)
+{
+ const struct snl_hdr_parser *p = (const struct snl_hdr_parser *)arg;
+
+ /* Assumes target points to the beginning of the structure */
+ return (snl_parse_header(ss, NLA_DATA(nla), NLA_DATA_LEN(nla), p, target));
+}
+
+/* Result of snl_attr_get_parray*(): an array of pointers to parsed items. */
+struct snl_parray {
+ uint32_t count; /* number of valid entries in items */
+ void **items; /* scratch-allocated array of item pointers */
+};
+
+/*
+ * Parses a container attribute holding an array of unknown length.
+ *
+ * Every element of @container_nla is parsed with the header parser given in
+ * @arg into a freshly allocated item of p->out_size bytes; pointers to the
+ * items are collected in the struct snl_parray at @target.  The pointer
+ * array starts with room for @start_size entries and doubles as needed.
+ * All memory comes from the scratch allocator.
+ *
+ * Returns false if the parser has no output size, an element fails to
+ * parse, or memory cannot be allocated.
+ */
+static inline bool
+snl_attr_get_parray_sz(struct snl_state *ss, struct nlattr *container_nla,
+    uint32_t start_size, const void *arg, void *target)
+{
+	const struct snl_hdr_parser *p = (const struct snl_hdr_parser *)arg;
+	struct snl_parray *array = (struct snl_parray *)target;
+	struct nlattr *nla;
+	uint32_t count = 0, size = start_size;
+
+	if (p->out_size == 0)
+		return (false);
+
+	/* A zero capacity could never grow by doubling. */
+	if (size == 0)
+		size = 1;
+
+	array->items = (void **)snl_allocz(ss, size * sizeof(void *));
+	if (array->items == NULL)
+		return (false);
+
+	/*
+	 * If the provided parser is an attribute parser, assume that each
+	 *  nla in the container nla is the container nla itself and parse
+	 *  the contents of this nla.
+	 * Otherwise, run the parser on raw data, assuming the header of this
+	 * data has u16 field with total size in the beginning.
+	 */
+	uint32_t data_off = 0;
+
+	if (p->in_hdr_size == 0)
+		data_off = sizeof(struct nlattr);
+
+	NLA_FOREACH(nla, NLA_DATA(container_nla), NLA_DATA_LEN(container_nla)) {
+		void *item = snl_allocz(ss, p->out_size);
+
+		if (item == NULL)
+			return (false);
+
+		void *data = (char *)(void *)nla + data_off;
+		int data_len = nla->nla_len - data_off;
+
+		if (!(snl_parse_header(ss, data, data_len, p, item)))
+			return (false);
+
+		if (count == size) {
+			uint32_t new_size = size * 2;
+			void **new_array = (void **)snl_allocz(ss, new_size *sizeof(void *));
+
+			/* Allocation failure must not reach the memcpy below. */
+			if (new_array == NULL)
+				return (false);
+			memcpy(new_array, array->items, size * sizeof(void *));
+			array->items = new_array;
+			size = new_size;
+		}
+		array->items[count++] = item;
+	}
+	array->count = count;
+
+	return (true);
+}
+
+/*
+ * Parses and stores the unknown-size array.
+ * Assumes each array item is a container and the NLAs in the container are parsable
+ * by the parser provided in @arg.
+ * Assumes @target is struct snl_parray
+ *
+ * Thin wrapper around snl_attr_get_parray_sz() with an initial capacity of
+ * 8 entries.
+ */
+static inline bool
+snl_attr_get_parray(struct snl_state *ss, struct nlattr *nla, const void *arg, void *target)
+{
+ return (snl_attr_get_parray_sz(ss, nla, 8, arg, target));
+}
+
+/*
+ * Stores a pointer to the attribute itself (header included) in the
+ * struct nlattr pointer at @target.  Nothing is copied, so the pointer is
+ * only valid while the message is.  Always succeeds.
+ */
+static inline bool
+snl_attr_get_nla(struct snl_state *ss __unused, struct nlattr *nla,
+ const void *arg __unused, void *target)
+{
+ *((struct nlattr **)target) = nla;
+ return (true);
+}
+
+/*
+ * Duplicates the whole attribute (header and payload, nla_len bytes) into
+ * scratch memory and stores the copy's address in the pointer at @target.
+ * Fails only if memory cannot be allocated.
+ */
+static inline bool
+snl_attr_dup_nla(struct snl_state *ss, struct nlattr *nla,
+    const void *arg __unused, void *target)
+{
+	void *copy = snl_allocz(ss, nla->nla_len);
+
+	if (copy == NULL)
+		return (false);
+
+	memcpy(copy, nla, nla->nla_len);
+	*((void **)target) = copy;
+	return (true);
+}
+
+/*
+ * Copies the attribute payload into scratch memory and stores the copy's
+ * address in the pointer at @target.  Fails only if memory cannot be
+ * allocated.
+ * NOTE(review): despite the "copy" name, @target receives a pointer (not
+ * the bytes) and the body is identical to snl_attr_dup_struct() - confirm
+ * whether a copy into a caller-provided structure was intended.
+ */
+static inline bool
+snl_attr_copy_struct(struct snl_state *ss, struct nlattr *nla,
+ const void *arg __unused, void *target)
+{
+ void *ptr = snl_allocz(ss, NLA_DATA_LEN(nla));
+
+ if (ptr != NULL) {
+ memcpy(ptr, NLA_DATA(nla), NLA_DATA_LEN(nla));
+ *((void **)target) = ptr;
+ return (true);
+ }
+ return (false);
+}
+
+/*
+ * Duplicates the attribute payload (without the attribute header) into
+ * scratch memory and stores the copy's address in the pointer at @target.
+ * Fails only if memory cannot be allocated.
+ */
+static inline bool
+snl_attr_dup_struct(struct snl_state *ss, struct nlattr *nla,
+    const void *arg __unused, void *target)
+{
+	void *copy = snl_allocz(ss, NLA_DATA_LEN(nla));
+
+	if (copy == NULL)
+		return (false);
+
+	memcpy(copy, NLA_DATA(nla), NLA_DATA_LEN(nla));
+	*((void **)target) = copy;
+	return (true);
+}
+
+/* One entry of the verbose (bit-by-bit) bitset representation. */
+struct snl_attr_bit {
+ uint32_t bit_index;
+ char *bit_name; /* scratch-allocated copy, see snl_attr_dup_string() */
+ /*
+  * NOTE(review): filled by snl_attr_get_flag(), which stores a single
+  * uint8_t at this offset - treat the field as zero/non-zero only.
+  */
+ int bit_value;
+};
+
+/*
+ * Array of parsed bits.  Filled through snl_attr_get_parray(), which writes
+ * its result as a struct snl_parray, so the two members must keep the same
+ * layout as { count, items }.
+ */
+struct snl_attr_bits {
+ uint32_t num_bits;
+ struct snl_attr_bit **bits;
+};
+
+#define _OUT(_field) offsetof(struct snl_attr_bit, _field)
+static const struct snl_attr_parser _nla_p_bit[] = {
+ { .type = NLA_BITSET_BIT_INDEX, .off = _OUT(bit_index), .cb = snl_attr_get_uint32 },
+ { .type = NLA_BITSET_BIT_NAME, .off = _OUT(bit_name), .cb = snl_attr_dup_string },
+ { .type = NLA_BITSET_BIT_VALUE, .off = _OUT(bit_value), .cb = snl_attr_get_flag },
+};
+#undef _OUT
+SNL_DECLARE_ATTR_PARSER_EXT(_nla_bit_parser, sizeof(struct snl_attr_bit), _nla_p_bit, NULL);
+
+/*
+ * Parsed bitset attribute.
+ *
+ * While the attributes are being parsed, nla_bitset_mask and
+ * nla_bitset_value temporarily hold scratch copies of the whole attribute
+ * (header included).  The post-parse callback validates their length
+ * against nla_bitset_size and then repoints both members at the payload.
+ */
+struct snl_attr_bitset {
+	uint32_t		nla_bitset_size;
+	uint32_t		*nla_bitset_mask;
+	uint32_t		*nla_bitset_value;
+	struct snl_attr_bits	bits;
+};
+
+/*
+ * Each attribute is stored in the member of the same name: the VALUE
+ * attribute goes to nla_bitset_value and the MASK attribute to
+ * nla_bitset_mask.
+ */
+#define _OUT(_field)	offsetof(struct snl_attr_bitset, _field)
+static const struct snl_attr_parser _nla_p_bitset[] = {
+	{ .type = NLA_BITSET_SIZE, .off = _OUT(nla_bitset_size), .cb = snl_attr_get_uint32 },
+	{ .type = NLA_BITSET_BITS, .off = _OUT(bits), .cb = snl_attr_get_parray, .arg = &_nla_bit_parser },
+	{ .type = NLA_BITSET_VALUE, .off = _OUT(nla_bitset_value), .cb = snl_attr_dup_nla },
+	{ .type = NLA_BITSET_MASK, .off = _OUT(nla_bitset_mask), .cb = snl_attr_dup_nla },
+};
+
+/*
+ * Post-parse callback: checks that the mask and value payloads (when
+ * present) are exactly as long as nla_bitset_size rounded up to whole
+ * 32-bit words, then converts the stored attribute pointers into pointers
+ * to the payload words.
+ */
+static inline bool
+_cb_p_bitset(struct snl_state *ss __unused, void *_target)
+{
+	struct snl_attr_bitset *target = (struct snl_attr_bitset *)_target;
+
+	uint32_t sz_bytes = _roundup2(target->nla_bitset_size, 32) / 8;
+
+	if (target->nla_bitset_mask != NULL) {
+		struct nlattr *nla = (struct nlattr *)target->nla_bitset_mask;
+		uint32_t data_len = NLA_DATA_LEN(nla);
+
+		if (data_len != sz_bytes || _roundup2(data_len, 4) != data_len)
+			return (false);
+		target->nla_bitset_mask = (uint32_t *)NLA_DATA(nla);
+	}
+
+	if (target->nla_bitset_value != NULL) {
+		struct nlattr *nla = (struct nlattr *)target->nla_bitset_value;
+		uint32_t data_len = NLA_DATA_LEN(nla);
+
+		if (data_len != sz_bytes || _roundup2(data_len, 4) != data_len)
+			return (false);
+		target->nla_bitset_value = (uint32_t *)NLA_DATA(nla);
+	}
+	return (true);
+}
+#undef _OUT
+SNL_DECLARE_ATTR_PARSER_EXT(_nla_bitset_parser,
+    sizeof(struct snl_attr_bitset),
+    _nla_p_bitset, _cb_p_bitset);
+
+/*
+ * Parses the compact bitset representation.
+ *
+ * @_target must point to a struct snl_attr_bitset.  Succeeds only when the
+ * nested attributes parse cleanly and both the mask and the value words are
+ * present.
+ */
+static inline bool
+snl_attr_get_bitset_c(struct snl_state *ss, struct nlattr *nla,
+    const void *arg __unused, void *_target)
+{
+	struct snl_attr_bitset *bitset = (struct snl_attr_bitset *)_target;
+
+	/* Assumes target points to the beginning of the structure */
+	if (!snl_parse_header(ss, NLA_DATA(nla), NLA_DATA_LEN(nla),
+	    &_nla_bitset_parser, _target))
+		return (false);
+
+	return (bitset->nla_bitset_mask != NULL &&
+	    bitset->nla_bitset_value != NULL);
+}
+
+/*
+ * Fixed-header field getters (snl_parse_field_f implementations).
+ * @src points at the field inside the input header, @target at the field
+ * inside the output structure.
+ */
+
+/* Copies one uint8_t. */
+static inline void
+snl_field_get_uint8(struct snl_state *ss __unused, void *src, void *target)
+{
+ *((uint8_t *)target) = *((uint8_t *)src);
+}
+
+/* Copies one uint16_t. */
+static inline void
+snl_field_get_uint16(struct snl_state *ss __unused, void *src, void *target)
+{
+ *((uint16_t *)target) = *((uint16_t *)src);
+}
+
+/* Copies one uint32_t. */
+static inline void
+snl_field_get_uint32(struct snl_state *ss __unused, void *src, void *target)
+{
+ *((uint32_t *)target) = *((uint32_t *)src);
+}
+
+/* Stores the address of the input field itself; nothing is copied. */
+static inline void
+snl_field_get_ptr(struct snl_state *ss __unused, void *src, void *target)
+{
+ *((void **)target) = src;
+}
+
+/* Parsed form of an NLMSG_ERROR / NLMSG_DONE reply. */
+struct snl_errmsg_data {
+ struct nlmsghdr *orig_hdr; /* points at the echoed request header inside the reply */
+ int error; /* error field of struct nlmsgerr */
+ uint32_t error_offs; /* NLMSGERR_ATTR_OFFS value */
+ char *error_str; /* NLMSGERR_ATTR_MSG text, points into the reply */
+ struct nlattr *cookie; /* NLMSGERR_ATTR_COOKIE attribute, points into the reply */
+};
+
+#define _IN(_field) offsetof(struct nlmsgerr, _field)
+#define _OUT(_field) offsetof(struct snl_errmsg_data, _field)
+/* Extended-ack attributes of an error message. */
+static const struct snl_attr_parser nla_p_errmsg[] = {
+ { .type = NLMSGERR_ATTR_MSG, .off = _OUT(error_str), .cb = snl_attr_get_string },
+ { .type = NLMSGERR_ATTR_OFFS, .off = _OUT(error_offs), .cb = snl_attr_get_uint32 },
+ { .type = NLMSGERR_ATTR_COOKIE, .off = _OUT(cookie), .cb = snl_attr_get_nla },
+};
+
+/* Fixed struct nlmsgerr fields: the error code and the echoed header. */
+static const struct snl_field_parser nlf_p_errmsg[] = {
+ { .off_in = _IN(error), .off_out = _OUT(error), .cb = snl_field_get_uint32 },
+ { .off_in = _IN(msg), .off_out = _OUT(orig_hdr), .cb = snl_field_get_ptr },
+};
+#undef _IN
+#undef _OUT
+SNL_DECLARE_PARSER(snl_errmsg_parser, struct nlmsgerr, nlf_p_errmsg, nla_p_errmsg);
+
+#define _IN(_field) offsetof(struct nlmsgerr, _field)
+#define _OUT(_field) offsetof(struct snl_errmsg_data, _field)
+/* For NLMSG_DONE only the leading error field is extracted. */
+static const struct snl_field_parser nlf_p_donemsg[] = {
+ { .off_in = _IN(error), .off_out = _OUT(error), .cb = snl_field_get_uint32 },
+};
+#undef _IN
+#undef _OUT
+SNL_DECLARE_FIELD_PARSER(snl_donemsg_parser, struct nlmsgerr, nlf_p_donemsg);
+
+/*
+ * Parses an NLMSG_ERROR message into @e.
+ *
+ * When NLM_F_CAPPED is set the generic parser is used, i.e. the attributes
+ * are taken to start right after a bare struct nlmsgerr.  Otherwise the
+ * attributes are located after the error code plus the echoed request,
+ * whose (aligned) length is read from errmsg->msg.nlmsg_len.
+ *
+ * Returns false if attribute parsing fails.
+ */
+static inline bool
+snl_parse_errmsg(struct snl_state *ss, struct nlmsghdr *hdr, struct snl_errmsg_data *e)
+{
+ if ((hdr->nlmsg_flags & NLM_F_CAPPED) != 0)
+ return (snl_parse_nlmsg(ss, hdr, &snl_errmsg_parser, e));
+
+ const struct snl_hdr_parser *ps = &snl_errmsg_parser;
+ struct nlmsgerr *errmsg = (struct nlmsgerr *)(hdr + 1);
+ /* error code + full echoed request message */
+ int hdrlen = sizeof(int) + NLMSG_ALIGN(errmsg->msg.nlmsg_len);
+ struct nlattr *attr_head = (struct nlattr *)(void *)((char *)errmsg + hdrlen);
+ int attr_len = hdr->nlmsg_len - sizeof(struct nlmsghdr) - hdrlen;
+
+ snl_parse_fields(ss, (struct nlmsghdr *)errmsg, hdrlen, ps->fp, ps->fp_size, e);
+ return (snl_parse_attrs_raw(ss, attr_head, attr_len, ps->np, ps->np_size, e));
+}
+
+/*
+ * Waits for the reply to request @nlmsg_seq and reports whether it was a
+ * success acknowledgement.
+ *
+ * Returns true only for an NLMSG_ERROR reply that parses correctly and
+ * carries error code 0.  @e receives the parsed error data; e->error is set
+ * to EINVAL when nothing could be read or the reply could not be parsed.
+ * A reply of any other message type yields false with @e left untouched.
+ */
+static inline bool
+snl_read_reply_code(struct snl_state *ss, uint32_t nlmsg_seq, struct snl_errmsg_data *e)
+{
+	struct nlmsghdr *reply = snl_read_reply(ss, nlmsg_seq);
+
+	if (reply == NULL) {
+		e->error = EINVAL;
+		return (false);
+	}
+
+	if (reply->nlmsg_type != NLMSG_ERROR)
+		return (false);
+
+	if (!snl_parse_errmsg(ss, reply, e))
+		e->error = EINVAL;
+
+	return (e->error == 0);
+}
+
+/*
+ * Attribute parser for the NETLINK_MSG_INFO control message, filling
+ * struct snl_msg_info.
+ * NOTE(review): all three entries use snl_attr_get_uint32(), which stores a
+ * full uint32_t at the given offset; every destination field therefore has
+ * to be at least 32 bits wide.
+ */
+#define _OUT(_field) offsetof(struct snl_msg_info, _field)
+static const struct snl_attr_parser _nla_p_cinfo[] = {
+ { .type = NLMSGINFO_ATTR_PROCESS_ID, .off = _OUT(process_id), .cb = snl_attr_get_uint32 },
+ { .type = NLMSGINFO_ATTR_PORT_ID, .off = _OUT(port_id), .cb = snl_attr_get_uint32 },
+ { .type = NLMSGINFO_ATTR_SEQ_ID, .off = _OUT(seq_id), .cb = snl_attr_get_uint32 },
+};
+#undef _OUT
+SNL_DECLARE_ATTR_PARSER(snl_msg_info_parser, _nla_p_cinfo);
+
+/*
+ * Looks through the control messages of @msg for the first one with level
+ * SOL_NETLINK and type NETLINK_MSG_INFO and parses its attributes into
+ * @attrs.
+ *
+ * Returns the parse result for that control message, or false if there is
+ * none.  The cmsg_type/cmsg_level members of @attrs are not written here.
+ */
+static inline bool
+parse_cmsg(struct snl_state *ss, const struct msghdr *msg, struct snl_msg_info *attrs)
+{
+ for (struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL;
+ cmsg = CMSG_NXTHDR(msg, cmsg)) {
+ if (cmsg->cmsg_level != SOL_NETLINK || cmsg->cmsg_type != NETLINK_MSG_INFO)
+ continue;
+
+ void *data = CMSG_DATA(cmsg);
+ /* payload length = total cmsg length minus the header in front of data */
+ int len = cmsg->cmsg_len - ((char *)data - (char *)cmsg);
+ const struct snl_hdr_parser *ps = &snl_msg_info_parser;
+
+ return (snl_parse_attrs_raw(ss, (struct nlattr *)data, len, ps->np, ps->np_size, attrs));
+ }
+
+ return (false);
+}
+
+/*
+ * Assumes e is zeroed
+ *
+ * Reads the next part of a multi-part reply to request @nlmsg_seq.
+ * Returns the message while data messages keep arriving.  Returns NULL when
+ * the sequence ends: on NLMSG_DONE or NLMSG_ERROR (both parsed into @e) or
+ * when nothing could be read or parsed (e->error set to EINVAL).
+ */
+static inline struct nlmsghdr *
+snl_read_reply_multi(struct snl_state *ss, uint32_t nlmsg_seq, struct snl_errmsg_data *e)
+{
+	struct nlmsghdr *reply = snl_read_reply(ss, nlmsg_seq);
+
+	if (reply == NULL) {
+		e->error = EINVAL;
+		return (NULL);
+	}
+
+	switch (reply->nlmsg_type) {
+	case NLMSG_ERROR:
+		if (!snl_parse_errmsg(ss, reply, e))
+			e->error = EINVAL;
+		return (NULL);
+	case NLMSG_DONE:
+		snl_parse_nlmsg(ss, reply, &snl_donemsg_parser, e);
+		return (NULL);
+	default:
+		return (reply);
+	}
+}
+
+
+/* writer logic */
+/*
+ * State of a message being composed.  The buffer lives in the scratch
+ * allocator of @ss and may move when it grows (see
+ * snl_realloc_msg_buffer()).
+ */
+struct snl_writer {
+ char *base; /* start of the message buffer */
+ uint32_t offset; /* number of bytes written so far */
+ uint32_t size; /* capacity of the buffer */
+ struct nlmsghdr *hdr; /* header of the message being built, inside the buffer */
+ struct snl_state *ss; /* owning state, provides the allocator */
+ bool error; /* set once an allocation has failed */
+};
+
+/*
+ * Prepares @nw for composing a message using the scratch allocator of @ss.
+ *
+ * If the initial SNL_WRITER_BUFFER_SIZE-byte buffer cannot be allocated,
+ * the writer is left with no buffer (size 0) and its error flag set, so
+ * that later operations on it fail instead of writing through a NULL
+ * base pointer.
+ */
+static inline void
+snl_init_writer(struct snl_state *ss, struct snl_writer *nw)
+{
+	nw->size = SNL_WRITER_BUFFER_SIZE;
+	nw->base = (char *)snl_allocz(ss, nw->size);
+	if (nw->base == NULL) {
+		nw->error = true;
+		nw->size = 0;
+	} else {
+		/* Only a successfully allocated writer starts error-free. */
+		nw->error = false;
+	}
+
+	nw->offset = 0;
+	nw->hdr = NULL;
+	nw->ss = ss;
+}
+
+/*
+ * Grows the writer buffer so that at least @sz more bytes fit.
+ *
+ * The capacity is doubled until it covers the current capacity plus @sz.
+ * A new block is taken from the scratch allocator and the bytes written so
+ * far are moved into it.  The block actually returned by the allocator is
+ * used, so the writer never aliases other scratch allocations.  nw->hdr is
+ * rebased onto the new block.  Pointers previously obtained from the writer
+ * become stale.
+ *
+ * Returns false (and latches nw->error) if the writer is already in the
+ * error state, the new size is not representable, or allocation fails.
+ */
+static inline bool
+snl_realloc_msg_buffer(struct snl_writer *nw, size_t sz)
+{
+	/* Check first: a failed writer has size 0 and must not be sized. */
+	if (nw->error)
+		return (false);
+
+	if (sz > (size_t)INT32_MAX) {
+		nw->error = true;
+		return (false);
+	}
+
+	/* 64-bit arithmetic so that doubling cannot wrap or stall at 0. */
+	uint64_t need = (uint64_t)nw->size + sz;
+	uint64_t new_size = (uint64_t)nw->size * 2;
+
+	if (new_size == 0)
+		new_size = SNL_WRITER_BUFFER_SIZE;
+	while (new_size < need)
+		new_size *= 2;
+
+	/* snl_allocz() takes an int length */
+	if (new_size > (uint64_t)INT32_MAX) {
+		nw->error = true;
+		return (false);
+	}
+
+	char *new_base = (char *)snl_allocz(nw->ss, (int)new_size);
+	if (new_base == NULL) {
+		nw->error = true;
+		return (false);
+	}
+
+	if (nw->offset > 0)
+		memcpy(new_base, nw->base, nw->offset);
+	if (nw->hdr != NULL) {
+		ptrdiff_t hdr_off = (char *)(nw->hdr) - nw->base;
+
+		nw->hdr = (struct nlmsghdr *)(void *)(new_base + hdr_off);
+	}
+	nw->base = new_base;
+	nw->size = (uint32_t)new_size;
+
+	return (true);
+}
+
+/*
+ * Reserves @sz bytes (rounded up with NETLINK_ALIGN) at the current write
+ * position and advances the position past them, growing the buffer first
+ * if needed.
+ *
+ * Returns a pointer to the reserved bytes or NULL if the buffer could not
+ * be grown.  The pointer may be invalidated by a later reservation that
+ * grows the buffer.
+ */
+static inline void *
+snl_reserve_msg_data_raw(struct snl_writer *nw, size_t sz)
+{
+ sz = NETLINK_ALIGN(sz);
+
+ if (__predict_false(nw->offset + sz > nw->size)) {
+ if (!snl_realloc_msg_buffer(nw, sz))
+ return (NULL);
+ }
+
+ void *data_ptr = &nw->base[nw->offset];
+ nw->offset += sz;
+
+ return (data_ptr);
+}
+/* Typed wrappers: reserve room for one object of type _t / _sz bytes as _t. */
+#define snl_reserve_msg_object(_ns, _t) ((_t *)snl_reserve_msg_data_raw(_ns, sizeof(_t)))
+#define snl_reserve_msg_data(_ns, _sz, _t) ((_t *)snl_reserve_msg_data_raw(_ns, _sz))
+
+/*
+ * Reserves an attribute of type @nla_type with @sz payload bytes and fills
+ * in its header. Returns the attribute header, or NULL if the space cannot
+ * be reserved.
+ */
+static inline struct nlattr *
+snl_reserve_msg_attr_raw(struct snl_writer *nw, uint16_t nla_type, uint16_t sz)
+{
+	const uint16_t total_len = sz + sizeof(struct nlattr);
+	struct nlattr *attr = snl_reserve_msg_data(nw, total_len, struct nlattr);
+
+	if (__predict_false(attr == NULL))
+		return (NULL);
+
+	attr->nla_len = total_len;
+	attr->nla_type = nla_type;
+	return (attr);
+}
+#define snl_reserve_msg_attr(_ns, _at, _t) \
+	((_t *)(snl_reserve_msg_attr_raw(_ns, _at, sizeof(_t)) + 1))
+
+/*
+ * Appends an attribute of type @attr_type with @attr_len payload bytes
+ * copied from @data, growing the buffer when needed.
+ * Returns false if the buffer cannot grow.
+ */
+static inline bool
+snl_add_msg_attr(struct snl_writer *nw, int attr_type, int attr_len, const void *data)
+{
+	int total_len = NLA_ALIGN(attr_len + sizeof(struct nlattr));
+
+	if (__predict_false(nw->offset + total_len > nw->size) &&
+	    !snl_realloc_msg_buffer(nw, total_len))
+		return (false);
+
+	char *dst = &nw->base[nw->offset];
+	struct nlattr *nla = (struct nlattr *)(void *)dst;
+
+	nla->nla_len = attr_len + sizeof(struct nlattr);
+	nla->nla_type = attr_type;
+	if (attr_len > 0) {
+		/* Zero the last word first, so the padding after the copy is clean. */
+		if ((attr_len % 4) != 0)
+			bzero(dst + total_len - 4, 4);
+		memcpy(dst + sizeof(struct nlattr), data, attr_len);
+	}
+	nw->offset += total_len;
+	return (true);
+}
+
+/* Appends a copy of the already formed attribute @nla_src. */
+static inline bool
+snl_add_msg_attr_raw(struct snl_writer *nw, const struct nlattr *nla_src)
+{
+	const void *payload = (const void *)(nla_src + 1);
+	int payload_len = nla_src->nla_len - sizeof(struct nlattr);
+
+	assert(payload_len >= 0);
+
+	return (snl_add_msg_attr(nw, nla_src->nla_type, payload_len, payload));
+}
+
+/* Appends attribute @attrtype with a bool payload. */
+static inline bool
+snl_add_msg_attr_bool(struct snl_writer *nw, int attrtype, bool value)
+{
+	const void *payload = &value;
+
+	return (snl_add_msg_attr(nw, attrtype, sizeof(value), payload));
+}
+
+/* Appends attribute @attrtype with a uint8_t payload. */
+static inline bool
+snl_add_msg_attr_u8(struct snl_writer *nw, int attrtype, uint8_t value)
+{
+	const void *payload = &value;
+
+	return (snl_add_msg_attr(nw, attrtype, sizeof(value), payload));
+}
+
+/* Appends attribute @attrtype with a uint16_t payload. */
+static inline bool
+snl_add_msg_attr_u16(struct snl_writer *nw, int attrtype, uint16_t value)
+{
+	const void *payload = &value;
+
+	return (snl_add_msg_attr(nw, attrtype, sizeof(value), payload));
+}
+
+/* Appends attribute @attrtype with a uint32_t payload. */
+static inline bool
+snl_add_msg_attr_u32(struct snl_writer *nw, int attrtype, uint32_t value)
+{
+	const void *payload = &value;
+
+	return (snl_add_msg_attr(nw, attrtype, sizeof(value), payload));
+}
+
+/* Appends attribute @attrtype with a uint64_t payload. */
+static inline bool
+snl_add_msg_attr_u64(struct snl_writer *nw, int attrtype, uint64_t value)
+{
+	const void *payload = &value;
+
+	return (snl_add_msg_attr(nw, attrtype, sizeof(value), payload));
+}
+
+/* Appends attribute @attrtype with an int8_t payload. */
+static inline bool
+snl_add_msg_attr_s8(struct snl_writer *nw, int attrtype, int8_t value)
+{
+	const void *payload = &value;
+
+	return (snl_add_msg_attr(nw, attrtype, sizeof(value), payload));
+}
+
+/* Appends attribute @attrtype with an int16_t payload. */
+static inline bool
+snl_add_msg_attr_s16(struct snl_writer *nw, int attrtype, int16_t value)
+{
+	const void *payload = &value;
+
+	return (snl_add_msg_attr(nw, attrtype, sizeof(value), payload));
+}
+
+/* Appends attribute @attrtype with an int32_t payload. */
+static inline bool
+snl_add_msg_attr_s32(struct snl_writer *nw, int attrtype, int32_t value)
+{
+	const void *payload = &value;
+
+	return (snl_add_msg_attr(nw, attrtype, sizeof(value), payload));
+}
+
+/* Appends attribute @attrtype with an int64_t payload. */
+static inline bool
+snl_add_msg_attr_s64(struct snl_writer *nw, int attrtype, int64_t value)
+{
+	const void *payload = &value;
+
+	return (snl_add_msg_attr(nw, attrtype, sizeof(value), payload));
+}
+
+/* Appends attribute @attrtype without any payload. */
+static inline bool
+snl_add_msg_attr_flag(struct snl_writer *nw, int attrtype)
+{
+	const int no_payload = 0;
+
+	return (snl_add_msg_attr(nw, attrtype, no_payload, NULL));
+}
+
+/* Appends attribute @attrtype holding @str including its terminating NUL. */
+static inline bool
+snl_add_msg_attr_string(struct snl_writer *nw, int attrtype, const char *str)
+{
+	size_t len_with_nul = strlen(str) + 1;
+
+	return (snl_add_msg_attr(nw, attrtype, len_with_nul, str));
+}
+
+
+/* Returns the current write position relative to the start of nw->hdr. */
+static inline int
+snl_get_msg_offset(const struct snl_writer *nw)
+{
+	const char *msg_start = (const char *)nw->hdr;
+
+	return (nw->offset - (msg_start - nw->base));
+}
+
+/* Converts an offset obtained via snl_get_msg_offset() back to a pointer. */
+static inline void *
+_snl_restore_msg_offset(const struct snl_writer *nw, int off)
+{
+	char *msg_start = (char *)nw->hdr;
+
+	return ((void *)(msg_start + off));
+}
+#define snl_restore_msg_offset(_ns, _off, _t) ((_t *)_snl_restore_msg_offset(_ns, _off))
+
+/*
+ * Starts a nested attribute of type @attrtype.
+ * Returns the message-relative offset of its header, to be handed to
+ * snl_end_attr_nested(), or 0 if the header cannot be reserved.
+ */
+static inline int
+snl_add_msg_attr_nested(struct snl_writer *nw, int attrtype)
+{
+	const int start_off = snl_get_msg_offset(nw);
+	struct nlattr *container;
+
+	container = snl_reserve_msg_data(nw, sizeof(struct nlattr), struct nlattr);
+	if (__predict_false(container == NULL))
+		return (0);
+
+	container->nla_type = attrtype;
+	return (start_off);
+}
+
+/*
+ * Closes the nested attribute whose header is at message offset @off by
+ * storing its final length. Does nothing if the writer is in error state.
+ */
+static inline void
+snl_end_attr_nested(const struct snl_writer *nw, int off)
+{
+	struct nlattr *container;
+
+	if (nw->error)
+		return;
+
+	container = snl_restore_msg_offset(nw, off, struct nlattr);
+	container->nla_len = NETLINK_ALIGN(snl_get_msg_offset(nw) - off);
+}
+
+/*
+ * Starts a new request message of type @nlmsg_type with the
+ * NLM_F_REQUEST | NLM_F_ACK flags and makes it the current message.
+ * Returns the message header or NULL if it cannot be reserved.
+ */
+static inline struct nlmsghdr *
+snl_create_msg_request(struct snl_writer *nw, int nlmsg_type)
+{
+	struct nlmsghdr *hdr;
+
+	assert(nw->hdr == NULL);
+
+	hdr = snl_reserve_msg_object(nw, struct nlmsghdr);
+	if (__predict_false(hdr == NULL))
+		return (NULL);
+
+	hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	hdr->nlmsg_type = nlmsg_type;
+	nw->hdr = hdr;
+	return (hdr);
+}
+
+/*
+ * Discards the message being built: rewinds the write position to the
+ * start of the message and clears nw->hdr. No-op if there is no current
+ * message.
+ * Declared inline like the other helpers of this header.
+ */
+static inline void
+snl_abort_msg(struct snl_writer *nw)
+{
+	if (nw->hdr != NULL) {
+		int offset = (char *)(&nw->base[nw->offset]) - (char *)(nw->hdr);
+
+		nw->offset -= offset;
+		nw->hdr = NULL;
+	}
+}
+
+/*
+ * Completes the current message: stores its length and a sequence number
+ * obtained from snl_get_seq(), and detaches it from the writer.
+ * If the writer is in error state the message is aborted instead.
+ * Returns the finished header or NULL if there is no message.
+ */
+static inline struct nlmsghdr *
+snl_finalize_msg(struct snl_writer *nw)
+{
+	struct nlmsghdr *done;
+	int msg_len;
+
+	if (nw->error)
+		snl_abort_msg(nw);
+
+	done = nw->hdr;
+	if (done == NULL)
+		return (NULL);
+
+	msg_len = (char *)(&nw->base[nw->offset]) - (char *)done;
+	done->nlmsg_len = msg_len;
+	done->nlmsg_seq = snl_get_seq(nw->ss);
+	nw->hdr = NULL;
+
+	return (done);
+}
+
+/*
+ * Sends everything written so far via snl_send() and rewinds the writer to
+ * the start of its buffer. No message may be under construction.
+ */
+static inline bool
+snl_send_msgs(struct snl_writer *nw)
+{
+	const int pending = nw->offset;
+
+	assert(nw->hdr == NULL);
+
+	/* Rewind first; the length to send was saved above. */
+	nw->offset = 0;
+	return (snl_send(nw->ss, nw->base, pending));
+}
+
+#endif
diff --git a/sys/netlink/netlink_snl_generic.h b/sys/netlink/netlink_snl_generic.h
new file mode 100644
index 000000000000..10e98a0266e0
--- /dev/null
+++ b/sys/netlink/netlink_snl_generic.h
@@ -0,0 +1,175 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _NETLINK_NETLINK_SNL_GENERIC_H_
+#define _NETLINK_NETLINK_SNL_GENERIC_H_
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_generic.h>
+#include <netlink/netlink_snl.h>
+
+/* Genetlink helpers */
+/*
+ * Starts a generic netlink request for family @genl_family with command
+ * @genl_cmd: a netlink header followed by a genlmsghdr.
+ * Returns the netlink header of the new current message, or NULL if the
+ * space cannot be reserved (in which case no message is left current).
+ */
+static inline struct nlmsghdr *
+snl_create_genl_msg_request(struct snl_writer *nw, uint16_t genl_family,
+    uint8_t genl_cmd)
+{
+	struct nlmsghdr *hdr;
+	struct genlmsghdr *ghdr;
+
+	assert(nw->hdr == NULL);
+
+	hdr = snl_reserve_msg_object(nw, struct nlmsghdr);
+	if (__predict_false(hdr == NULL))
+		return (NULL);
+	hdr->nlmsg_type = genl_family;
+	hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	/*
+	 * Register the header before reserving more space: if the buffer is
+	 * reallocated, only nw->hdr is re-pointed, the local copy goes stale.
+	 */
+	nw->hdr = hdr;
+
+	ghdr = snl_reserve_msg_object(nw, struct genlmsghdr);
+	if (__predict_false(ghdr == NULL)) {
+		/* Roll back the partially written message. */
+		snl_abort_msg(nw);
+		return (NULL);
+	}
+	ghdr->cmd = genl_cmd;
+	/* The buffer may hold bytes of an earlier message: clear the rest. */
+	ghdr->version = 0;
+	ghdr->reserved = 0;
+
+	return (nw->hdr);
+}
+
+/* Empty field parser list: no genlmsghdr fields are extracted. */
+static struct snl_field_parser snl_fp_genl[] = {};
+
+/*
+ * Declares message parser _name for messages with a struct genlmsghdr
+ * header, using the attribute parser array _np and no field parsers.
+ */
+#define SNL_DECLARE_GENL_PARSER(_name, _np) SNL_DECLARE_PARSER(_name,\
+ struct genlmsghdr, snl_fp_genl, _np)
+
+/* One multicast group of a generic netlink family, as parsed by _nla_p_getmc. */
+struct _snl_genl_ctrl_mcast_group {
+ uint32_t mcast_grp_id; /* CTRL_ATTR_MCAST_GRP_ID */
+ const char *mcast_grp_name; /* CTRL_ATTR_MCAST_GRP_NAME */
+};
+
+/* Array of parsed multicast groups (target of CTRL_ATTR_MCAST_GROUPS). */
+struct _snl_genl_ctrl_mcast_groups {
+ uint32_t num_groups; /* number of entries in groups[] */
+ struct _snl_genl_ctrl_mcast_group **groups;
+};
+
+/* Attribute parsers filling a struct _snl_genl_ctrl_mcast_group. */
+#define _OUT(_field) offsetof(struct _snl_genl_ctrl_mcast_group, _field)
+static struct snl_attr_parser _nla_p_getmc[] = {
+ {
+ .type = CTRL_ATTR_MCAST_GRP_NAME,
+ .off = _OUT(mcast_grp_name),
+ .cb = snl_attr_get_string,
+ },
+ {
+ .type = CTRL_ATTR_MCAST_GRP_ID,
+ .off = _OUT(mcast_grp_id),
+ .cb = snl_attr_get_uint32,
+ },
+};
+#undef _OUT
+/* Attribute-only parser; the output object has the size of one group entry. */
+SNL_DECLARE_ATTR_PARSER_EXT(_genl_ctrl_mc_parser,
+ sizeof(struct _snl_genl_ctrl_mcast_group), _nla_p_getmc, NULL);
+
+/* Parsed reply to CTRL_CMD_GETFAMILY (filled by _nla_p_getfam). */
+struct _getfamily_attrs {
+ uint16_t family_id; /* CTRL_ATTR_FAMILY_ID */
+ const char *family_name; /* CTRL_ATTR_FAMILY_NAME */
+ struct _snl_genl_ctrl_mcast_groups mcast_groups; /* CTRL_ATTR_MCAST_GROUPS */
+};
+
+/* Attribute parsers filling a struct _getfamily_attrs. */
+/* _IN is defined for symmetry with other parsers but is not used below. */
+#define _IN(_field) offsetof(struct genlmsghdr, _field)
+#define _OUT(_field) offsetof(struct _getfamily_attrs, _field)
+static struct snl_attr_parser _nla_p_getfam[] = {
+ {
+ .type = CTRL_ATTR_FAMILY_ID,
+ .off = _OUT(family_id),
+ .cb = snl_attr_get_uint16,
+ },
+ {
+ .type = CTRL_ATTR_FAMILY_NAME,
+ .off = _OUT(family_name),
+ .cb = snl_attr_get_string,
+ },
+ {
+ /* Each nested group is parsed with _genl_ctrl_mc_parser. */
+ .type = CTRL_ATTR_MCAST_GROUPS,
+ .off = _OUT(mcast_groups),
+ .cb = snl_attr_get_parray,
+ .arg = &_genl_ctrl_mc_parser,
+ },
+};
+#undef _IN
+#undef _OUT
+SNL_DECLARE_GENL_PARSER(_genl_ctrl_getfam_parser, _nla_p_getfam);
+
+/*
+ * Sends a CTRL_CMD_GETFAMILY request for @family_name and parses the reply
+ * into @attrs (zeroed first).
+ * Returns true only if the request was sent and a non-error reply was
+ * parsed successfully.
+ */
+static bool
+_snl_get_genl_family_info(struct snl_state *ss, const char *family_name,
+    struct _getfamily_attrs *attrs)
+{
+	struct snl_writer nw;
+	struct nlmsghdr *req, *reply;
+
+	memset(attrs, 0, sizeof(*attrs));
+
+	snl_init_writer(ss, &nw);
+	snl_create_genl_msg_request(&nw, GENL_ID_CTRL, CTRL_CMD_GETFAMILY);
+	snl_add_msg_attr_string(&nw, CTRL_ATTR_FAMILY_NAME, family_name);
+
+	req = snl_finalize_msg(&nw);
+	if (req == NULL)
+		return (false);
+	if (!snl_send_message(ss, req))
+		return (false);
+
+	reply = snl_read_reply(ss, req->nlmsg_seq);
+	if (reply == NULL || reply->nlmsg_type == NLMSG_ERROR)
+		return (false);
+
+	if (!snl_parse_nlmsg(ss, reply, &_genl_ctrl_getfam_parser, attrs))
+		return (false);
+	return (true);
+}
+
+/*
+ * Resolves the generic netlink family @family_name.
+ * Returns the parsed family id or 0 if the lookup fails.
+ */
+static inline uint16_t
+snl_get_genl_family(struct snl_state *ss, const char *family_name)
+{
+	struct _getfamily_attrs attrs = {};
+	bool found;
+
+	found = _snl_get_genl_family_info(ss, family_name, &attrs);
+	if (__predict_false(!found))
+		return (0);
+
+	return (attrs.family_id);
+}
+
+/*
+ * Looks up the multicast group @group_name of family @family_name.
+ * If the family is found and @family_id is not NULL, the family id is
+ * stored there (even when the group itself is not found).
+ * Returns the group id, or 0 if the family lookup fails, the family id is 0
+ * or no group with that name is reported.
+ */
+static inline uint16_t
+snl_get_genl_mcast_group(struct snl_state *ss, const char *family_name,
+    const char *group_name, uint16_t *family_id)
+{
+	struct _getfamily_attrs attrs = {};
+
+	if (__predict_false(!_snl_get_genl_family_info(ss, family_name,
+	    &attrs)))
+		return (0);
+	if (attrs.family_id == 0)
+		return (0);
+	if (family_id != NULL)
+		*family_id = attrs.family_id;
+	for (u_int i = 0; i < attrs.mcast_groups.num_groups; i++) {
+		const struct _snl_genl_ctrl_mcast_group *grp =
+		    attrs.mcast_groups.groups[i];
+
+		/*
+		 * The name is only filled in when the name attribute is
+		 * present: never hand a NULL pointer to strcmp().
+		 */
+		if (grp == NULL || grp->mcast_grp_name == NULL)
+			continue;
+		if (strcmp(grp->mcast_grp_name, group_name) == 0)
+			return (grp->mcast_grp_id);
+	}
+	return (0);
+}
+
+#endif
diff --git a/sys/netlink/netlink_snl_route.h b/sys/netlink/netlink_snl_route.h
new file mode 100644
index 000000000000..62055b2db417
--- /dev/null
+++ b/sys/netlink/netlink_snl_route.h
@@ -0,0 +1,201 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _NETLINK_NETLINK_SNL_ROUTE_H_
+#define _NETLINK_NETLINK_SNL_ROUTE_H_
+
+#include <netlink/netlink_snl.h>
+#include <netlink/netlink_route.h>
+#include <netinet/in.h>
+
+/*
+ * Simple Netlink Library - NETLINK_ROUTE helpers
+ */
+
+/*
+ * Builds a sockaddr_in, allocated from @ss, around the IPv4 address at
+ * @rta_data. On allocation failure stores ENOBUFS in @perror and returns
+ * NULL.
+ */
+static inline struct sockaddr *
+parse_rta_ip4(struct snl_state *ss, void *rta_data, int *perror)
+{
+	struct sockaddr_in *sin;
+
+	sin = (struct sockaddr_in *)snl_allocz(ss, sizeof(struct sockaddr_in));
+	if (sin != NULL) {
+		sin->sin_family = AF_INET;
+		sin->sin_len = sizeof(struct sockaddr_in);
+		memcpy(&sin->sin_addr, rta_data, sizeof(struct in_addr));
+	} else
+		*perror = ENOBUFS;
+
+	return ((struct sockaddr *)sin);
+}
+
+/*
+ * Builds a sockaddr_in6, allocated from @ss, around the IPv6 address at
+ * @rta_data. On allocation failure stores ENOBUFS in @perror and returns
+ * NULL.
+ */
+static inline struct sockaddr *
+parse_rta_ip6(struct snl_state *ss, void *rta_data, int *perror)
+{
+	struct sockaddr_in6 *sin6;
+
+	sin6 = (struct sockaddr_in6 *)snl_allocz(ss, sizeof(struct sockaddr_in6));
+	if (sin6 != NULL) {
+		sin6->sin6_family = AF_INET6;
+		sin6->sin6_len = sizeof(struct sockaddr_in6);
+		memcpy(&sin6->sin6_addr, rta_data, sizeof(struct in6_addr));
+	} else
+		*perror = ENOBUFS;
+
+	return ((struct sockaddr *)sin6);
+}
+
+/*
+ * Converts an address attribute into a newly allocated sockaddr, choosing
+ * IPv4 or IPv6 by the payload length.
+ * For any other length stores ENOTSUP in @perror and returns NULL.
+ */
+static inline struct sockaddr *
+parse_rta_ip(struct snl_state *ss, struct rtattr *rta, int *perror)
+{
+	void *rta_data = NL_RTA_DATA(rta);
+	int rta_len = NL_RTA_DATA_LEN(rta);
+
+	if (rta_len == sizeof(struct in_addr))
+		return (parse_rta_ip4(ss, rta_data, perror));
+	if (rta_len == sizeof(struct in6_addr))
+		return (parse_rta_ip6(ss, rta_data, perror));
+
+	*perror = ENOTSUP;
+	return (NULL);
+}
+
+/*
+ * Attribute callback: stores a sockaddr built from the address in @nla
+ * into the (struct sockaddr *) at @target.
+ * Returns false if the address cannot be converted.
+ */
+static inline bool
+snl_attr_get_ip(struct snl_state *ss, struct nlattr *nla,
+    const void *arg __unused, void *target)
+{
+	int error = 0;
+	struct sockaddr *sa;
+
+	sa = parse_rta_ip(ss, (struct rtattr *)nla, &error);
+	if (error != 0)
+		return (false);
+
+	*((struct sockaddr **)target) = sa;
+	return (true);
+}
+
+/*
+ * Converts a struct rtvia attribute payload (address family followed by
+ * the address) into a newly allocated sockaddr.
+ * On failure returns NULL and stores in @perror: EINVAL if the payload is
+ * too short for the family field or for the address of the given family,
+ * ENOTSUP for an unknown family, or the error of the IPv4/IPv6 helper.
+ */
+static inline struct sockaddr *
+parse_rta_via(struct snl_state *ss, struct rtattr *rta, int *perror)
+{
+	struct rtvia *via = (struct rtvia *)NL_RTA_DATA(rta);
+	int data_len = NL_RTA_DATA_LEN(rta);
+	int addr_len;
+
+	/* The family field must be fully present before it is read. */
+	if (data_len < (int)offsetof(struct rtvia, rtvia_addr)) {
+		*perror = EINVAL;
+		return (NULL);
+	}
+	addr_len = data_len - (int)offsetof(struct rtvia, rtvia_addr);
+
+	switch (via->rtvia_family) {
+	case AF_INET:
+		if (addr_len < (int)sizeof(struct in_addr))
+			break;
+		return (parse_rta_ip4(ss, via->rtvia_addr, perror));
+	case AF_INET6:
+		if (addr_len < (int)sizeof(struct in6_addr))
+			break;
+		return (parse_rta_ip6(ss, via->rtvia_addr, perror));
+	default:
+		*perror = ENOTSUP;
+		return (NULL);
+	}
+
+	/* Known family, but the address is truncated. */
+	*perror = EINVAL;
+	return (NULL);
+}
+
+/*
+ * Attribute callback: stores a sockaddr built from the rtvia payload of
+ * @nla into the (struct sockaddr *) at @target.
+ * Returns false if the payload cannot be converted.
+ */
+static inline bool
+snl_attr_get_ipvia(struct snl_state *ss, struct nlattr *nla,
+    const void *arg __unused, void *target)
+{
+	int error = 0;
+	struct sockaddr *sa;
+
+	sa = parse_rta_via(ss, (struct rtattr *)nla, &error);
+	if (error != 0)
+		return (false);
+
+	*((struct sockaddr **)target) = sa;
+	return (true);
+}
+
+/* Appends attribute @attrtype carrying the 4-byte IPv4 address @addr. */
+static inline bool
+snl_add_msg_attr_ip4(struct snl_writer *nw, int attrtype, const struct in_addr *addr)
+{
+	const int ip4_len = 4;
+
+	return (snl_add_msg_attr(nw, attrtype, ip4_len, addr));
+}
+
+/* Appends attribute @attrtype carrying the 16-byte IPv6 address @addr. */
+static inline bool
+snl_add_msg_attr_ip6(struct snl_writer *nw, int attrtype, const struct in6_addr *addr)
+{
+	const int ip6_len = 16;
+
+	return (snl_add_msg_attr(nw, attrtype, ip6_len, addr));
+}
+
+/*
+ * Appends attribute @attrtype carrying the raw address of @sa.
+ * Only AF_INET and AF_INET6 are handled; returns false for other families.
+ */
+static inline bool
+snl_add_msg_attr_ip(struct snl_writer *nw, int attrtype, const struct sockaddr *sa)
+{
+	if (sa->sa_family == AF_INET) {
+		const struct sockaddr_in *sin =
+		    (const struct sockaddr_in *)(const void *)sa;
+
+		return (snl_add_msg_attr(nw, attrtype, 4, &sin->sin_addr));
+	}
+	if (sa->sa_family == AF_INET6) {
+		const struct sockaddr_in6 *sin6 =
+		    (const struct sockaddr_in6 *)(const void *)sa;
+
+		return (snl_add_msg_attr(nw, attrtype, 16, &sin6->sin6_addr));
+	}
+
+	return (false);
+}
+
+/*
+ * Appends attribute @attrtype whose payload is one address family byte
+ * followed by the raw address of @sa.
+ * Only AF_INET and AF_INET6 are handled; returns false for other families.
+ */
+static inline bool
+snl_add_msg_attr_ipvia(struct snl_writer *nw, int attrtype, const struct sockaddr *sa)
+{
+	char buf[17];
+	const void *addr;
+	int addr_len;
+
+	switch (sa->sa_family) {
+	case AF_INET:
+		addr = &((const struct sockaddr_in *)(const void *)sa)->sin_addr;
+		addr_len = 4;
+		break;
+	case AF_INET6:
+		addr = &((const struct sockaddr_in6 *)(const void *)sa)->sin6_addr;
+		addr_len = 16;
+		break;
+	default:
+		return (false);
+	}
+
+	buf[0] = sa->sa_family;
+	memcpy(&buf[1], addr, addr_len);
+	return (snl_add_msg_attr(nw, attrtype, addr_len + 1, buf));
+}
+
+/*
+ * Attribute callback: copies an IPv4 address into the struct in_addr at
+ * @target. Returns false unless the payload is exactly that size.
+ */
+static inline bool
+snl_attr_get_in_addr(struct snl_state *ss __unused, struct nlattr *nla,
+    const void *arg __unused, void *target)
+{
+	if (NLA_DATA_LEN(nla) == sizeof(struct in_addr)) {
+		memcpy(target, NLA_DATA_CONST(nla), sizeof(struct in_addr));
+		return (true);
+	}
+
+	return (false);
+}
+
+/*
+ * Attribute callback: copies an IPv6 address into the struct in6_addr at
+ * @target. Returns false unless the payload is exactly that size.
+ */
+static inline bool
+snl_attr_get_in6_addr(struct snl_state *ss __unused, struct nlattr *nla,
+    const void *arg __unused, void *target)
+{
+	if (NLA_DATA_LEN(nla) == sizeof(struct in6_addr)) {
+		memcpy(target, NLA_DATA_CONST(nla), sizeof(struct in6_addr));
+		return (true);
+	}
+
+	return (false);
+}
+
+
+#endif
diff --git a/sys/netlink/netlink_snl_route_compat.h b/sys/netlink/netlink_snl_route_compat.h
new file mode 100644
index 000000000000..87c65f1adcda
--- /dev/null
+++ b/sys/netlink/netlink_snl_route_compat.h
@@ -0,0 +1,53 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _NETLINK_NETLINK_SNL_ROUTE_COMPAT_H_
+#define _NETLINK_NETLINK_SNL_ROUTE_COMPAT_H_
+
+#include <sys/socket.h>
+#include <sys/types.h>
+
+/*
+ * This file contains netlink-compatible definitions from the
+ * net/route.h header.
+ */
+#define NETLINK_COMPAT
+
+#include <net/route.h>
+
+/*
+ * Message type constants under an RTSOCK_ prefix.
+ * NOTE(review): presumably these mirror the RTM_* values of net/route.h,
+ * renamed to avoid clashing with the netlink RTM_* names - confirm.
+ */
+#define RTSOCK_RTM_ADD 0x1
+#define RTSOCK_RTM_DELETE 0x2
+#define RTSOCK_RTM_CHANGE 0x3
+#define RTSOCK_RTM_GET 0x4
+#define RTSOCK_RTM_NEWADDR 0xc
+#define RTSOCK_RTM_DELADDR 0xd
+#define RTSOCK_RTM_IFINFO 0xe
+#define RTSOCK_RTM_NEWMADDR 0xf
+#define RTSOCK_RTM_DELMADDR 0x10
+#define RTSOCK_RTM_IFANNOUNCE 0x11
+#define RTSOCK_RTM_IEEE80211 0x12
+
+#endif
diff --git a/sys/netlink/netlink_snl_route_parsers.h b/sys/netlink/netlink_snl_route_parsers.h
new file mode 100644
index 000000000000..6b7a8188180d
--- /dev/null
+++ b/sys/netlink/netlink_snl_route_parsers.h
@@ -0,0 +1,392 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _NETLINK_NETLINK_SNL_ROUTE_PARSERS_H_
+#define _NETLINK_NETLINK_SNL_ROUTE_PARSERS_H_
+
+#include <netlink/netlink_snl.h>
+#include <netlink/netlink_snl_route.h>
+#include <netlink/route/nexthop.h>
+
+/* TODO: this file should be generated automatically */
+
+/*
+ * Stores @ifindex as the scope id of @sa if it is an IPv6 link-local
+ * address. Any other address (or NULL) is left untouched.
+ */
+static inline void
+finalize_sockaddr(struct sockaddr *sa, uint32_t ifindex)
+{
+	struct sockaddr_in6 *sin6;
+
+	if (sa == NULL || sa->sa_family != AF_INET6)
+		return;
+
+	sin6 = (struct sockaddr_in6 *)(void *)sa;
+	if (!IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
+		return;
+
+	sin6->sin6_scope_id = ifindex;
+}
+
+/* RTM_<NEW|DEL|GET>ROUTE message parser */
+
+/* One nexthop of a multipath route (output of _mpath_nh_parser). */
+struct rta_mpath_nh {
+ struct sockaddr *gw; /* NL_RTA_GATEWAY or NL_RTA_VIA */
+ uint32_t ifindex; /* rtnh_ifindex */
+ uint8_t rtnh_flags;
+ uint8_t rtnh_weight; /* taken from rtnh_hops */
+ uint32_t rtax_mtu; /* NL_RTAX_MTU inside NL_RTA_METRICS */
+ uint32_t rta_rtflags; /* NL_RTA_RTFLAGS */
+};
+
+/* Parser tables mapping a struct rtnexthop and its attributes to struct rta_mpath_nh. */
+#define _IN(_field) offsetof(struct rtnexthop, _field)
+#define _OUT(_field) offsetof(struct rta_mpath_nh, _field)
+/* Nested NL_RTA_METRICS attributes. */
+static const struct snl_attr_parser _nla_p_mp_nh_metrics[] = {
+ { .type = NL_RTAX_MTU, .off = _OUT(rtax_mtu), .cb = snl_attr_get_uint32 },
+};
+SNL_DECLARE_ATTR_PARSER(_metrics_mp_nh_parser, _nla_p_mp_nh_metrics);
+
+/* Top-level nexthop attributes; GATEWAY and VIA both fill gw. */
+static const struct snl_attr_parser _nla_p_mp_nh[] = {
+ { .type = NL_RTA_GATEWAY, .off = _OUT(gw), .cb = snl_attr_get_ip },
+ { .type = NL_RTA_METRICS, .arg = &_metrics_mp_nh_parser, .cb = snl_attr_get_nested },
+ { .type = NL_RTA_RTFLAGS, .off = _OUT(rta_rtflags), .cb = snl_attr_get_uint32 },
+ { .type = NL_RTA_VIA, .off = _OUT(gw), .cb = snl_attr_get_ipvia },
+};
+
+/* Fixed struct rtnexthop fields. */
+static const struct snl_field_parser _fp_p_mp_nh[] = {
+ { .off_in = _IN(rtnh_flags), .off_out = _OUT(rtnh_flags), .cb = snl_field_get_uint8 },
+ { .off_in = _IN(rtnh_hops), .off_out = _OUT(rtnh_weight), .cb = snl_field_get_uint8 },
+ { .off_in = _IN(rtnh_ifindex), .off_out = _OUT(ifindex), .cb = snl_field_get_uint32 },
+};
+
+/* Post-parse hook: scopes a link-local gateway to the nexthop interface. */
+static inline bool
+_cb_p_mp_nh(struct snl_state *ss __unused, void *_target)
+{
+	struct rta_mpath_nh *nh = (struct rta_mpath_nh *)_target;
+
+	finalize_sockaddr(nh->gw, nh->ifindex);
+
+	return (true);
+}
+#undef _IN
+#undef _OUT
+SNL_DECLARE_PARSER_EXT(_mpath_nh_parser, sizeof(struct rtnexthop),
+    sizeof(struct rta_mpath_nh), _fp_p_mp_nh, _nla_p_mp_nh,
+    _cb_p_mp_nh);
+
+/* Parsed NL_RTA_MULTIPATH attribute: an array of nexthop pointers. */
+struct rta_mpath {
+ uint32_t num_nhops; /* number of entries in nhops[] */
+ struct rta_mpath_nh **nhops;
+};
+
+/*
+ * Attribute callback for NL_RTA_MULTIPATH: parses the nexthop list into the
+ * array at @target. The initial array size is the smallest power-of-two
+ * multiple of 4 that is not below payload length / sizeof(struct rtnexthop).
+ */
+static bool
+nlattr_get_multipath(struct snl_state *ss, struct nlattr *nla,
+    const void *arg __unused, void *target)
+{
+	uint32_t start_size;
+
+	for (start_size = 4;
+	    start_size < NLA_DATA_LEN(nla) / sizeof(struct rtnexthop);
+	    start_size *= 2)
+		;
+
+	return (snl_attr_get_parray_sz(ss, nla, start_size, &_mpath_nh_parser, target));
+}
+
+/* Output of snl_rtm_route_parser. */
+struct snl_parsed_route {
+ struct sockaddr *rta_dst; /* NL_RTA_DST */
+ struct sockaddr *rta_gw; /* NL_RTA_GATEWAY or NL_RTA_VIA */
+ struct nlattr *rta_metrics; /* NOTE(review): not filled by the parser tables below */
+ struct rta_mpath rta_multipath; /* NL_RTA_MULTIPATH */
+ uint32_t rta_expires; /* NOTE(review): not filled by the parser tables below; see rta_expire */
+ uint32_t rta_oif; /* NL_RTA_OIF */
+ uint32_t rta_expire; /* NL_RTA_EXPIRES */
+ uint32_t rta_table; /* NL_RTA_TABLE */
+ uint32_t rta_knh_id; /* NL_RTA_KNH_ID */
+ uint32_t rta_nh_id; /* NL_RTA_NH_ID */
+ uint32_t rta_rtflags; /* NL_RTA_RTFLAGS */
+ uint32_t rtax_mtu; /* NL_RTAX_MTU inside NL_RTA_METRICS */
+ uint32_t rtax_weight; /* NL_RTA_WEIGHT */
+ uint8_t rtm_family;
+ uint8_t rtm_type;
+ uint8_t rtm_protocol;
+ uint8_t rtm_dst_len;
+};
+
+/* Parser tables mapping a struct rtmsg and its attributes to struct snl_parsed_route. */
+#define _IN(_field) offsetof(struct rtmsg, _field)
+#define _OUT(_field) offsetof(struct snl_parsed_route, _field)
+/* Nested NL_RTA_METRICS attributes. */
+static const struct snl_attr_parser _nla_p_rtmetrics[] = {
+ { .type = NL_RTAX_MTU, .off = _OUT(rtax_mtu), .cb = snl_attr_get_uint32 },
+};
+SNL_DECLARE_ATTR_PARSER(_metrics_parser, _nla_p_rtmetrics);
+
+/* Top-level route attributes; GATEWAY and VIA both fill rta_gw. */
+static const struct snl_attr_parser _nla_p_route[] = {
+ { .type = NL_RTA_DST, .off = _OUT(rta_dst), .cb = snl_attr_get_ip },
+ { .type = NL_RTA_OIF, .off = _OUT(rta_oif), .cb = snl_attr_get_uint32 },
+ { .type = NL_RTA_GATEWAY, .off = _OUT(rta_gw), .cb = snl_attr_get_ip },
+ { .type = NL_RTA_METRICS, .arg = &_metrics_parser, .cb = snl_attr_get_nested },
+ { .type = NL_RTA_MULTIPATH, .off = _OUT(rta_multipath), .cb = nlattr_get_multipath },
+ { .type = NL_RTA_KNH_ID, .off = _OUT(rta_knh_id), .cb = snl_attr_get_uint32 },
+ { .type = NL_RTA_WEIGHT, .off = _OUT(rtax_weight), .cb = snl_attr_get_uint32 },
+ { .type = NL_RTA_RTFLAGS, .off = _OUT(rta_rtflags), .cb = snl_attr_get_uint32 },
+ { .type = NL_RTA_TABLE, .off = _OUT(rta_table), .cb = snl_attr_get_uint32 },
+ { .type = NL_RTA_VIA, .off = _OUT(rta_gw), .cb = snl_attr_get_ipvia },
+ { .type = NL_RTA_EXPIRES, .off = _OUT(rta_expire), .cb = snl_attr_get_uint32 },
+ { .type = NL_RTA_NH_ID, .off = _OUT(rta_nh_id), .cb = snl_attr_get_uint32 },
+};
+
+/* Fixed struct rtmsg fields. */
+static const struct snl_field_parser _fp_p_route[] = {
+ {.off_in = _IN(rtm_family), .off_out = _OUT(rtm_family), .cb = snl_field_get_uint8 },
+ {.off_in = _IN(rtm_type), .off_out = _OUT(rtm_type), .cb = snl_field_get_uint8 },
+ {.off_in = _IN(rtm_protocol), .off_out = _OUT(rtm_protocol), .cb = snl_field_get_uint8 },
+ {.off_in = _IN(rtm_dst_len), .off_out = _OUT(rtm_dst_len), .cb = snl_field_get_uint8 },
+};
+
+/*
+ * Post-parse hook: scopes link-local destination and gateway addresses to
+ * the outgoing interface of the route.
+ */
+static inline bool
+_cb_p_route(struct snl_state *ss __unused, void *_target)
+{
+	struct snl_parsed_route *rt = (struct snl_parsed_route *)_target;
+	const uint32_t oif = rt->rta_oif;
+
+	finalize_sockaddr(rt->rta_dst, oif);
+	finalize_sockaddr(rt->rta_gw, oif);
+
+	return (true);
+}
+#undef _IN
+#undef _OUT
+SNL_DECLARE_PARSER_EXT(snl_rtm_route_parser, sizeof(struct rtmsg),
+    sizeof(struct snl_parsed_route), _fp_p_route, _nla_p_route,
+    _cb_p_route);
+
+/* RTM_<NEW|DEL|GET>LINK message parser */
+/* Output of snl_rtm_link_parser: ifi_* come from struct ifinfomsg, the rest from attributes. */
+struct snl_parsed_link {
+ uint32_t ifi_index;
+ uint32_t ifi_flags;
+ uint32_t ifi_change;
+ uint16_t ifi_type;
+ uint8_t ifla_operstate; /* IFLA_OPERSTATE */
+ uint8_t ifla_carrier; /* IFLA_CARRIER */
+ uint32_t ifla_mtu; /* IFLA_MTU */
+ char *ifla_ifname; /* IFLA_IFNAME */
+ struct nlattr *ifla_address; /* IFLA_ADDRESS, stored as a whole attribute */
+ struct nlattr *ifla_broadcast; /* IFLA_BROADCAST, stored as a whole attribute */
+ char *ifla_ifalias; /* IFLA_IFALIAS */
+ uint32_t ifla_promiscuity; /* IFLA_PROMISCUITY */
+ struct rtnl_link_stats64 *ifla_stats64; /* IFLA_STATS64 */
+ struct nlattr *iflaf_orig_hwaddr; /* IFLAF_ORIG_HWADDR inside IFLA_FREEBSD */
+ struct snl_attr_bitset iflaf_caps; /* IFLAF_CAPS inside IFLA_FREEBSD */
+};
+
+/* Parser tables mapping a struct ifinfomsg and its attributes to struct snl_parsed_link. */
+#define _IN(_field) offsetof(struct ifinfomsg, _field)
+#define _OUT(_field) offsetof(struct snl_parsed_link, _field)
+/* Attributes nested inside IFLA_FREEBSD. */
+static const struct snl_attr_parser _nla_p_link_fbsd[] = {
+ { .type = IFLAF_ORIG_HWADDR, .off = _OUT(iflaf_orig_hwaddr), .cb = snl_attr_dup_nla },
+ { .type = IFLAF_CAPS, .off = _OUT(iflaf_caps), .cb = snl_attr_get_bitset_c },
+};
+SNL_DECLARE_ATTR_PARSER(_link_fbsd_parser, _nla_p_link_fbsd);
+
+/* Top-level link attributes. */
+static const struct snl_attr_parser _nla_p_link[] = {
+ { .type = IFLA_ADDRESS, .off = _OUT(ifla_address), .cb = snl_attr_dup_nla },
+ { .type = IFLA_BROADCAST, .off = _OUT(ifla_broadcast), .cb = snl_attr_dup_nla },
+ { .type = IFLA_IFNAME, .off = _OUT(ifla_ifname), .cb = snl_attr_dup_string },
+ { .type = IFLA_MTU, .off = _OUT(ifla_mtu), .cb = snl_attr_get_uint32 },
+ { .type = IFLA_OPERSTATE, .off = _OUT(ifla_operstate), .cb = snl_attr_get_uint8 },
+ { .type = IFLA_IFALIAS, .off = _OUT(ifla_ifalias), .cb = snl_attr_dup_string },
+ { .type = IFLA_STATS64, .off = _OUT(ifla_stats64), .cb = snl_attr_dup_struct },
+ { .type = IFLA_PROMISCUITY, .off = _OUT(ifla_promiscuity), .cb = snl_attr_get_uint32 },
+ { .type = IFLA_CARRIER, .off = _OUT(ifla_carrier), .cb = snl_attr_get_uint8 },
+ { .type = IFLA_FREEBSD, .arg = &_link_fbsd_parser, .cb = snl_attr_get_nested },
+};
+/* Fixed struct ifinfomsg fields. */
+static const struct snl_field_parser _fp_p_link[] = {
+ {.off_in = _IN(ifi_index), .off_out = _OUT(ifi_index), .cb = snl_field_get_uint32 },
+ {.off_in = _IN(ifi_flags), .off_out = _OUT(ifi_flags), .cb = snl_field_get_uint32 },
+ {.off_in = _IN(ifi_change), .off_out = _OUT(ifi_change), .cb = snl_field_get_uint32 },
+ {.off_in = _IN(ifi_type), .off_out = _OUT(ifi_type), .cb = snl_field_get_uint16 },
+};
+#undef _IN
+#undef _OUT
+SNL_DECLARE_PARSER(snl_rtm_link_parser, struct ifinfomsg, _fp_p_link, _nla_p_link);
+
+/* Output of snl_rtm_link_parser_simple: a reduced set of link properties. */
+struct snl_parsed_link_simple {
+ uint32_t ifi_index;
+ uint32_t ifla_mtu; /* IFLA_MTU */
+ uint16_t ifi_type;
+ uint32_t ifi_flags;
+ char *ifla_ifname; /* IFLA_IFNAME */
+};
+
+/* Parser tables mapping a struct ifinfomsg and its attributes to struct snl_parsed_link_simple. */
+#define _IN(_field) offsetof(struct ifinfomsg, _field)
+#define _OUT(_field) offsetof(struct snl_parsed_link_simple, _field)
+static struct snl_attr_parser _nla_p_link_s[] = {
+ { .type = IFLA_IFNAME, .off = _OUT(ifla_ifname), .cb = snl_attr_dup_string },
+ { .type = IFLA_MTU, .off = _OUT(ifla_mtu), .cb = snl_attr_get_uint32 },
+};
+static struct snl_field_parser _fp_p_link_s[] = {
+ {.off_in = _IN(ifi_index), .off_out = _OUT(ifi_index), .cb = snl_field_get_uint32 },
+ {.off_in = _IN(ifi_type), .off_out = _OUT(ifi_type), .cb = snl_field_get_uint16 },
+ {.off_in = _IN(ifi_flags), .off_out = _OUT(ifi_flags), .cb = snl_field_get_uint32 },
+};
+#undef _IN
+#undef _OUT
+SNL_DECLARE_PARSER(snl_rtm_link_parser_simple, struct ifinfomsg, _fp_p_link_s, _nla_p_link_s);
+
+/* Output of snl_rtm_neigh_parser. */
+struct snl_parsed_neigh {
+ uint8_t ndm_family;
+ uint8_t ndm_flags;
+ uint16_t ndm_state;
+ uint32_t nda_ifindex; /* written from ndm_ifindex and from NDA_IFINDEX */
+ uint32_t nda_probes; /* NDA_PROBES */
+ uint32_t ndaf_next_ts; /* NDAF_NEXT_STATE_TS inside NDA_FREEBSD */
+ struct sockaddr *nda_dst; /* NDA_DST */
+ struct nlattr *nda_lladdr; /* NDA_LLADDR, stored as a whole attribute */
+};
+
+/* Parser tables mapping a struct ndmsg and its attributes to struct snl_parsed_neigh. */
+#define _IN(_field) offsetof(struct ndmsg, _field)
+#define _OUT(_field) offsetof(struct snl_parsed_neigh, _field)
+/* Attributes nested inside NDA_FREEBSD. */
+static const struct snl_attr_parser _nla_p_neigh_fbsd[] = {
+ { .type = NDAF_NEXT_STATE_TS, .off = _OUT(ndaf_next_ts), .cb = snl_attr_get_uint32 },
+};
+SNL_DECLARE_ATTR_PARSER(_neigh_fbsd_parser, _nla_p_neigh_fbsd);
+
+/* Top-level neighbour attributes. */
+static struct snl_attr_parser _nla_p_neigh_s[] = {
+ { .type = NDA_DST, .off = _OUT(nda_dst), .cb = snl_attr_get_ip },
+ { .type = NDA_LLADDR , .off = _OUT(nda_lladdr), .cb = snl_attr_dup_nla },
+ { .type = NDA_PROBES, .off = _OUT(nda_probes), .cb = snl_attr_get_uint32 },
+ { .type = NDA_IFINDEX, .off = _OUT(nda_ifindex), .cb = snl_attr_get_uint32 },
+ { .type = NDA_FREEBSD, .arg = &_neigh_fbsd_parser, .cb = snl_attr_get_nested },
+};
+/* Fixed struct ndmsg fields; ndm_ifindex lands in the same slot as NDA_IFINDEX. */
+static struct snl_field_parser _fp_p_neigh_s[] = {
+ {.off_in = _IN(ndm_family), .off_out = _OUT(ndm_family), .cb = snl_field_get_uint8 },
+ {.off_in = _IN(ndm_flags), .off_out = _OUT(ndm_flags), .cb = snl_field_get_uint8 },
+ {.off_in = _IN(ndm_state), .off_out = _OUT(ndm_state), .cb = snl_field_get_uint16 },
+ {.off_in = _IN(ndm_ifindex), .off_out = _OUT(nda_ifindex), .cb = snl_field_get_uint32 },
+};
+
+/* Post-parse hook: scopes a link-local neighbour address to its interface. */
+static inline bool
+_cb_p_neigh(struct snl_state *ss __unused, void *_target)
+{
+	struct snl_parsed_neigh *neigh = (struct snl_parsed_neigh *)_target;
+
+	finalize_sockaddr(neigh->nda_dst, neigh->nda_ifindex);
+
+	return (true);
+}
+#undef _IN
+#undef _OUT
+SNL_DECLARE_PARSER_EXT(snl_rtm_neigh_parser, sizeof(struct ndmsg),
+    sizeof(struct snl_parsed_neigh), _fp_p_neigh_s, _nla_p_neigh_s,
+    _cb_p_neigh);
+
+/* Output of snl_rtm_addr_parser. */
+struct snl_parsed_addr {
+ uint8_t ifa_family;
+ uint8_t ifa_prefixlen;
+ uint32_t ifa_index;
+ struct sockaddr *ifa_local; /* IFA_LOCAL */
+ struct sockaddr *ifa_address; /* IFA_ADDRESS */
+ struct sockaddr *ifa_broadcast; /* IFA_BROADCAST */
+ char *ifa_label; /* IFA_LABEL */
+ struct ifa_cacheinfo *ifa_cacheinfo; /* IFA_CACHEINFO */
+ uint32_t ifaf_vhid; /* IFAF_VHID inside IFA_FREEBSD */
+ uint32_t ifaf_flags; /* IFAF_FLAGS inside IFA_FREEBSD */
+};
+
+/* Parser tables mapping a struct ifaddrmsg and its attributes to struct snl_parsed_addr. */
+#define _IN(_field) offsetof(struct ifaddrmsg, _field)
+#define _OUT(_field) offsetof(struct snl_parsed_addr, _field)
+/* Attributes nested inside IFA_FREEBSD. */
+static const struct snl_attr_parser _nla_p_addr_fbsd[] = {
+ { .type = IFAF_VHID, .off = _OUT(ifaf_vhid), .cb = snl_attr_get_uint32 },
+ { .type = IFAF_FLAGS, .off = _OUT(ifaf_flags), .cb = snl_attr_get_uint32 },
+};
+SNL_DECLARE_ATTR_PARSER(_addr_fbsd_parser, _nla_p_addr_fbsd);
+
+/* Top-level address attributes. */
+static const struct snl_attr_parser _nla_p_addr_s[] = {
+ { .type = IFA_ADDRESS, .off = _OUT(ifa_address), .cb = snl_attr_get_ip },
+ { .type = IFA_LOCAL, .off = _OUT(ifa_local), .cb = snl_attr_get_ip },
+ { .type = IFA_LABEL, .off = _OUT(ifa_label), .cb = snl_attr_dup_string },
+ { .type = IFA_BROADCAST, .off = _OUT(ifa_broadcast), .cb = snl_attr_get_ip },
+ { .type = IFA_CACHEINFO, .off = _OUT(ifa_cacheinfo), .cb = snl_attr_dup_struct },
+ { .type = IFA_FREEBSD, .arg = &_addr_fbsd_parser, .cb = snl_attr_get_nested },
+};
+/* Fixed struct ifaddrmsg fields. */
+static const struct snl_field_parser _fp_p_addr_s[] = {
+ {.off_in = _IN(ifa_family), .off_out = _OUT(ifa_family), .cb = snl_field_get_uint8 },
+ {.off_in = _IN(ifa_prefixlen), .off_out = _OUT(ifa_prefixlen), .cb = snl_field_get_uint8 },
+ {.off_in = _IN(ifa_index), .off_out = _OUT(ifa_index), .cb = snl_field_get_uint32 },
+};
+
+/*
+ * Post-parse hook: scopes link-local ifa_address and ifa_local to the
+ * interface the address belongs to.
+ */
+static inline bool
+_cb_p_addr(struct snl_state *ss __unused, void *_target)
+{
+	struct snl_parsed_addr *addr = (struct snl_parsed_addr *)_target;
+	const uint32_t ifindex = addr->ifa_index;
+
+	finalize_sockaddr(addr->ifa_address, ifindex);
+	finalize_sockaddr(addr->ifa_local, ifindex);
+
+	return (true);
+}
+#undef _IN
+#undef _OUT
+SNL_DECLARE_PARSER_EXT(snl_rtm_addr_parser, sizeof(struct ifaddrmsg),
+    sizeof(struct snl_parsed_addr), _fp_p_addr_s, _nla_p_addr_s,
+    _cb_p_addr);
+
+/* Output of snl_nhmsg_parser. */
+struct snl_parsed_nhop {
+ uint32_t nha_id; /* NHA_ID */
+ uint8_t nha_blackhole; /* NHA_BLACKHOLE flag */
+ uint8_t nha_groups; /* NOTE(review): not filled by the parser tables below */
+ uint8_t nhaf_knhops; /* NHAF_KNHOPS flag inside NHA_FREEBSD */
+ uint8_t nhaf_family; /* NOTE(review): not filled by the parser tables below */
+ uint32_t nha_oif; /* NHA_OIF */
+ struct sockaddr *nha_gw; /* NHA_GATEWAY */
+ uint8_t nh_family;
+ uint8_t nh_protocol;
+ uint32_t nhaf_table; /* NHAF_TABLE inside NHA_FREEBSD */
+ uint32_t nhaf_kid; /* NHAF_KID inside NHA_FREEBSD */
+ uint32_t nhaf_aif; /* NHAF_AIF inside NHA_FREEBSD */
+};
+
+/* Parser tables mapping a struct nhmsg and its attributes to struct snl_parsed_nhop. */
+#define _IN(_field) offsetof(struct nhmsg, _field)
+#define _OUT(_field) offsetof(struct snl_parsed_nhop, _field)
+/* Attributes nested inside NHA_FREEBSD. */
+static struct snl_attr_parser _nla_p_nh_fbsd[] = {
+ { .type = NHAF_KNHOPS, .off = _OUT(nhaf_knhops), .cb = snl_attr_get_flag },
+ { .type = NHAF_TABLE, .off = _OUT(nhaf_table), .cb = snl_attr_get_uint32 },
+ { .type = NHAF_KID, .off = _OUT(nhaf_kid), .cb = snl_attr_get_uint32 },
+ { .type = NHAF_AIF, .off = _OUT(nhaf_aif), .cb = snl_attr_get_uint32 },
+};
+SNL_DECLARE_ATTR_PARSER(_nh_fbsd_parser, _nla_p_nh_fbsd);
+
+/* Fixed struct nhmsg fields. */
+static const struct snl_field_parser _fp_p_nh[] = {
+ { .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = snl_field_get_uint8 },
+ { .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = snl_field_get_uint8 },
+};
+
+/* Top-level nexthop attributes. */
+static const struct snl_attr_parser _nla_p_nh[] = {
+ { .type = NHA_ID, .off = _OUT(nha_id), .cb = snl_attr_get_uint32 },
+ { .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = snl_attr_get_flag },
+ { .type = NHA_OIF, .off = _OUT(nha_oif), .cb = snl_attr_get_uint32 },
+ { .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = snl_attr_get_ip },
+ { .type = NHA_FREEBSD, .arg = &_nh_fbsd_parser, .cb = snl_attr_get_nested },
+};
+
+/* Post-parse hook: scopes a link-local gateway to the nexthop's outgoing interface. */
+static inline bool
+_cb_p_nh(struct snl_state *ss __unused, void *_target)
+{
+	struct snl_parsed_nhop *nhop = (struct snl_parsed_nhop *)_target;
+
+	finalize_sockaddr(nhop->nha_gw, nhop->nha_oif);
+
+	return (true);
+}
+#undef _IN
+#undef _OUT
+SNL_DECLARE_PARSER_EXT(snl_nhmsg_parser, sizeof(struct nhmsg),
+    sizeof(struct snl_parsed_nhop), _fp_p_nh, _nla_p_nh, _cb_p_nh);
+
+#endif
diff --git a/sys/netlink/netlink_sysevent.c b/sys/netlink/netlink_sysevent.c
new file mode 100644
index 000000000000..09e7e50a7409
--- /dev/null
+++ b/sys/netlink/netlink_sysevent.c
@@ -0,0 +1,205 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Baptiste Daroussin <bapt@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/devctl.h>
+#include <sys/errno.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <net/vnet.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_generic.h>
+#include <netlink/netlink_sysevent.h>
+
+#define DEBUG_MOD_NAME nl_sysevent
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_INFO);
+
+MALLOC_DEFINE(M_NLSE, "nlsysevent", "Memory used for Netlink sysevent");
+#define NLSE_FAMILY_NAME "nlsysevent"
+static uint16_t ctrl_family_id;
+
+#define MAX_SYSEVENT_GROUPS 64
+/*
+ * Table of known event groups, one per devctl system name.
+ * A slot whose name is NULL is unused; lookups and the unload path stop at
+ * the first such slot.
+ */
+static struct sysevent_group {
+ char *name; /* copy of the system name, allocated from M_NLSE */
+ uint32_t id; /* value returned by genl_register_group() for this name */
+} sysevent_groups[MAX_SYSEVENT_GROUPS] = {};
+
+/*
+ * System names for which an event group is registered up front by
+ * nlsysevent_load(). Any other system name gets its group created on
+ * first use by sysevent_get_group().
+ */
+static const char *devctl_systems[] = {
+ "ACPI",
+ "AEON",
+ "CAM",
+ "CARP",
+ "coretemp",
+ "DEVFS",
+ "device",
+ "ETHERNET",
+ "GEOM",
+ "HYPERV_NIC_VF",
+ "IFNET",
+ "INFINIBAND",
+ "KERNEL",
+ "nvme",
+ "PMU",
+ "RCTL",
+ "USB",
+ "VFS",
+ "VT",
+ "ZFS",
+};
+
+/*
+ * Builds one NLSE_CMD_NEWEVENT generic netlink message and sends it through
+ * a group writer opened for the group id of @se.
+ *
+ * @se: event group the message is sent to; its name becomes NLSE_ATTR_SYSTEM
+ * @subsystem: value of NLSE_ATTR_SUBSYSTEM
+ * @type: value of NLSE_ATTR_TYPE
+ * @data: value of NLSE_ATTR_DATA; the attribute is omitted when NULL
+ *
+ * Failures are logged at debug level and the event is dropped.
+ */
+static void
+sysevent_write(struct sysevent_group *se, const char *subsystem, const char *type,
+    const char *data)
+{
+	struct nlmsghdr hdr = { .nlmsg_type = ctrl_family_id };
+	struct genlmsghdr *ghdr;
+	struct nl_writer nw;
+
+	if (!nl_writer_group(&nw, NLMSG_LARGE, NETLINK_GENERIC, se->id, 0,
+	    false)) {
+		NL_LOG(LOG_DEBUG, "error allocating group writer");
+		return;
+	}
+
+	if (!nlmsg_reply(&nw, &hdr, sizeof(struct genlmsghdr))) {
+		NL_LOG(LOG_DEBUG, "unable to start the message");
+		goto drop;
+	}
+
+	ghdr = nlmsg_reserve_object(&nw, struct genlmsghdr);
+	if (ghdr == NULL) {
+		NL_LOG(LOG_DEBUG, "unable to allocate memory");
+		goto drop;
+	}
+	ghdr->version = 0;
+	ghdr->cmd = NLSE_CMD_NEWEVENT;
+	ghdr->reserved = 0;
+	nlattr_add_string(&nw, NLSE_ATTR_SYSTEM, se->name);
+	nlattr_add_string(&nw, NLSE_ATTR_SUBSYSTEM, subsystem);
+	nlattr_add_string(&nw, NLSE_ATTR_TYPE, type);
+	if (data != NULL)
+		nlattr_add_string(&nw, NLSE_ATTR_DATA, data);
+	nlmsg_end(&nw);
+	nlmsg_flush(&nw);
+	return;
+
+drop:
+	/*
+	 * The writer was set up successfully, so it must not simply be
+	 * abandoned: discard the unfinished message and flush the writer.
+	 * NOTE(review): relies on nlmsg_flush() releasing the writer's
+	 * storage when no complete message is queued - confirm against the
+	 * message writer implementation.
+	 */
+	nlmsg_abort(&nw);
+	nlmsg_flush(&nw);
+}
+
+/*
+ * Fills slot @index of sysevent_groups with a copy of @name and registers a
+ * group of that name under ctrl_family_id. An out-of-range @index is only
+ * logged.
+ */
+static void
+sysevent_new_group(size_t index, const char *name)
+{
+	struct sysevent_group *slot;
+
+	if (index < MAX_SYSEVENT_GROUPS) {
+		slot = &sysevent_groups[index];
+		slot->name = strdup(name, M_NLSE);
+		slot->id = genl_register_group(ctrl_family_id, slot->name);
+		return;
+	}
+
+	NL_LOG(LOG_WARNING, "impossible to add the event %s, "
+	    "too many event groups\n", name);
+}
+
+/*
+ * Returns the group entry for @system. The table is scanned front to back;
+ * reaching an unused slot before a match creates the group in that slot.
+ * Returns NULL when every slot is taken by another name.
+ */
+static struct sysevent_group *
+sysevent_get_group(const char *system)
+{
+	struct sysevent_group *grp;
+	size_t slot;
+
+	for (slot = 0; slot < MAX_SYSEVENT_GROUPS; slot++) {
+		grp = &sysevent_groups[slot];
+		/* Occupied by a different system: keep looking. */
+		if (grp->name != NULL && strcmp(grp->name, system) != 0)
+			continue;
+		if (grp->name == NULL)
+			sysevent_new_group(slot, system);
+		return (grp);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Function installed as the devctl notify hook by nlsysevent_load():
+ * resolves the group for @system and writes the event with vnet0 set as
+ * the current vnet. When no group can be obtained the event is dropped
+ * with a warning.
+ */
+static void
+sysevent_send(const char *system, const char *subsystem, const char *type,
+    const char *data)
+{
+	struct sysevent_group *grp;
+
+	grp = sysevent_get_group(system);
+	if (grp != NULL) {
+		CURVNET_SET(vnet0);
+		sysevent_write(grp, subsystem, type, data);
+		CURVNET_RESTORE();
+		return;
+	}
+
+	NL_LOG(LOG_WARNING, "impossible to add the event %s, "
+	    "too many event groups\n", system);
+}
+
+/*
+ * Module load: registers the nlsysevent generic netlink family, creates the
+ * groups listed in devctl_systems and finally installs sysevent_send() as
+ * the devctl notify hook.
+ *
+ * The hook is installed last on purpose: sysevent_send() uses
+ * ctrl_family_id and the sysevent_groups table, so it must not be reachable
+ * before both are set up.
+ */
+static void
+nlsysevent_load(void)
+{
+	ctrl_family_id = genl_register_family(NLSE_FAMILY_NAME, 0, 2, NLSE_ATTR_MAX);
+	for (size_t i = 0; i < nitems(devctl_systems); i++) {
+		if (i >= MAX_SYSEVENT_GROUPS) {
+			NL_LOG(LOG_WARNING, "impossible to add the event %s, too many events\n", devctl_systems[i]);
+			continue;
+		}
+		sysevent_new_group(i, devctl_systems[i]);
+	}
+	devctl_set_notify_hook(sysevent_send);
+}
+
+/*
+ * Module unload: removes the devctl notify hook, unregisters the family and
+ * frees the group names, stopping at the first unused slot.
+ */
+static void
+nlsysevent_unload(void)
+{
+	size_t slot = 0;
+
+	devctl_unset_notify_hook();
+	genl_unregister_family(ctrl_family_id);
+
+	while (slot < MAX_SYSEVENT_GROUPS &&
+	    sysevent_groups[slot].name != NULL) {
+		free(sysevent_groups[slot].name, M_NLSE);
+		slot++;
+	}
+}
+
+/*
+ * Module event handler: dispatches MOD_LOAD and MOD_UNLOAD and returns 0
+ * for them; any other event yields EOPNOTSUPP.
+ */
+static int
+nlsysevent_loader(module_t mod __unused, int what, void *priv __unused)
+{
+	if (what == MOD_LOAD) {
+		nlsysevent_load();
+		return (0);
+	}
+	if (what == MOD_UNLOAD) {
+		nlsysevent_unload();
+		return (0);
+	}
+
+	return (EOPNOTSUPP);
+}
+static moduledata_t nlsysevent_mod = { "nlsysevent", nlsysevent_loader, NULL};
+
+DECLARE_MODULE(nlsysevent, nlsysevent_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
+MODULE_DEPEND(nlsysevent, netlink, 1, 1, 1);
+MODULE_VERSION(nlsysevent, 1);
diff --git a/sys/netlink/netlink_sysevent.h b/sys/netlink/netlink_sysevent.h
new file mode 100644
index 000000000000..8434a0de078e
--- /dev/null
+++ b/sys/netlink/netlink_sysevent.h
@@ -0,0 +1,49 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Baptiste Daroussin <bapt@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NETLINK_SYSEVENT_H_
+#define _NETLINK_SYSEVENT_H_
+
+/* Attributes carried by nlsysevent messages */
+enum {
+ NLSE_ATTR_UNSPEC = 0,
+ NLSE_ATTR_SYSTEM = 1, /* string reporting the system name */
+ NLSE_ATTR_SUBSYSTEM = 2, /* string reporting the subsystem name */
+ NLSE_ATTR_TYPE = 3, /* string reporting the type of the event */
+ NLSE_ATTR_DATA = 4, /* string reporting the extra data (can be null) */
+ __NLSE_ATTR_MAX, /* one past the highest attribute, see NLSE_ATTR_MAX */
+};
+#define NLSE_ATTR_MAX (__NLSE_ATTR_MAX -1)
+
+/* commands */
+enum {
+ NLSE_CMD_UNSPEC = 0,
+ NLSE_CMD_NEWEVENT = 1, /* message reports a new event */
+ __NLSE_CMD_MAX, /* one past the highest command, see NLSE_CMD_MAX */
+};
+#define NLSE_CMD_MAX (__NLSE_CMD_MAX - 1)
+
+#endif
diff --git a/sys/netlink/netlink_var.h b/sys/netlink/netlink_var.h
new file mode 100644
index 000000000000..23e7395d44c2
--- /dev/null
+++ b/sys/netlink/netlink_var.h
@@ -0,0 +1,181 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _NETLINK_NETLINK_VAR_H_
+#define _NETLINK_NETLINK_VAR_H_
+
+#ifdef _KERNEL
+
+#include <sys/ck.h>
+#include <sys/epoch.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <net/vnet.h>
+
+#define NLSNDQ 65536 /* Default socket sendspace */
+#define NLRCVQ 65536 /* Default socket recvspace */
+
+#define NLMBUFSIZE 2048 /* External storage size for Netlink mbufs */
+
+struct ucred;
+
+/*
+ * Buffer that can be linked into a TAILQ and keeps its payload inline in
+ * the trailing flexible array member.
+ * NOTE(review): presumably buflen is the capacity of data[], datalen the
+ * number of bytes in use and offset a position within them - confirm
+ * against the users of nl_buf_alloc().
+ */
+struct nl_buf {
+ TAILQ_ENTRY(nl_buf) tailq;
+ u_int buflen;
+ u_int datalen;
+ u_int offset;
+ char data[];
+};
+
+#define NLP_MAX_GROUPS 128
+
+BITSET_DEFINE(nl_groups, NLP_MAX_GROUPS);
+/*
+ * Netlink protocol control block. A socket's instance is stored in so_pcb
+ * and obtained with sotonlpcb(); nl_lock is the mutex operated on by the
+ * NLP_LOCK*() macros.
+ */
+struct nlpcb {
+ struct socket *nl_socket;
+ struct nl_groups nl_groups; /* bitset of NLP_MAX_GROUPS bits */
+ uint32_t nl_port;
+ uint32_t nl_flags; /* NLF_* bits */
+ uint32_t nl_process_id;
+ int nl_proto;
+ bool nl_bound;
+ bool nl_task_pending;
+ bool nl_tx_blocked; /* No new requests accepted */
+ bool nl_linux; /* true if running under compat */
+ bool nl_unconstrained_vnet; /* true if running under VNET jail (or without jail) */
+ bool nl_need_thread_setup;
+ struct taskqueue *nl_taskqueue;
+ struct task nl_task;
+ uint64_t nl_dropped_bytes;
+ uint64_t nl_dropped_messages;
+ CK_LIST_ENTRY(nlpcb) nl_next;
+ CK_LIST_ENTRY(nlpcb) nl_port_next;
+ volatile u_int nl_refcount;
+ struct mtx nl_lock;
+ struct epoch_context nl_epoch_ctx;
+};
+#define sotonlpcb(so) ((struct nlpcb *)(so)->so_pcb)
+
+#define NLP_LOCK_INIT(_nlp) mtx_init(&((_nlp)->nl_lock), "nlp mtx", NULL, MTX_DEF)
+#define NLP_LOCK_DESTROY(_nlp) mtx_destroy(&((_nlp)->nl_lock))
+#define NLP_LOCK(_nlp) mtx_lock(&((_nlp)->nl_lock))
+#define NLP_UNLOCK(_nlp) mtx_unlock(&((_nlp)->nl_lock))
+
+#define ALIGNED_NL_SZ(_data) roundup2((((struct nlmsghdr *)(_data))->nlmsg_len), 16)
+
+/* nl_flags */
+#define NLF_CAP_ACK 0x01 /* Do not send message body with errmsg */
+#define NLF_EXT_ACK 0x02 /* Allow including extended TLVs in ack */
+#define NLF_STRICT 0x04 /* Perform strict header checks */
+#define NLF_MSG_INFO 0x08 /* Send caller info along with the notifications */
+
+SYSCTL_DECL(_net_netlink);
+SYSCTL_DECL(_net_netlink_debug);
+
+/*
+ * Netlink control state, declared per-VNET as nl_ctl (V_nl_ctl).
+ * Holds two CK lists of struct nlpcb and an rmlock.
+ * NOTE(review): presumably ctl_port_head links nlpcbs via nl_port_next and
+ * ctl_pcb_head via nl_next - confirm in netlink_domain.c.
+ */
+struct nl_control {
+ CK_LIST_HEAD(nl_pid_head, nlpcb) ctl_port_head;
+ CK_LIST_HEAD(nlpcb_head, nlpcb) ctl_pcb_head;
+ CK_LIST_ENTRY(nl_control) ctl_next;
+ struct rmlock ctl_lock;
+};
+VNET_DECLARE(struct nl_control, nl_ctl);
+#define V_nl_ctl VNET(nl_ctl)
+
+struct sockaddr_nl;
+struct sockaddr;
+struct nlmsghdr;
+
+int nl_verify_proto(int proto);
+const char *nl_get_proto_name(int proto);
+
+extern int netlink_unloading;
+
+/* Handler entry of the nl_handlers table: a callback and a protocol name. */
+struct nl_proto_handler {
+ nl_handler_f cb;
+ const char *proto_name;
+};
+extern struct nl_proto_handler *nl_handlers;
+
+/* netlink_domain.c */
+bool nl_send_group(struct nl_writer *);
+void nl_clear_group(u_int);
+void nl_osd_register(void);
+void nl_osd_unregister(void);
+void nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp);
+
+/* netlink_io.c */
+bool nl_send(struct nl_writer *, struct nlpcb *);
+void nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *nlmsg,
+ struct nl_pstate *npt);
+void nl_on_transmit(struct nlpcb *nlp);
+
+void nl_taskqueue_handler(void *_arg, int pending);
+void nl_schedule_taskqueue(struct nlpcb *nlp);
+void nl_process_receive_locked(struct nlpcb *nlp);
+void nl_set_source_metadata(struct mbuf *m, int num_messages);
+struct nl_buf *nl_buf_alloc(size_t len, int mflag);
+void nl_buf_free(struct nl_buf *nb);
+
+#define MAX_FAMILIES 20
+#define MAX_GROUPS 64
+
+#define MIN_GROUP_NUM 48
+
+#define CTRL_FAMILY_ID 0
+#define CTRL_FAMILY_NAME "nlctrl"
+#define CTRL_GROUP_ID 0
+#define CTRL_GROUP_NAME "notify"
+
+struct ifnet;
+struct nl_parsed_link;
+struct nlattr_bmask;
+struct nl_pstate;
+
+/* Function map */
+/*
+ * Table of function pointers that is handed to nl_set_functions().
+ * NOTE(review): presumably lets callers reach the netlink implementation
+ * indirectly (e.g. when it is provided by a module) - confirm with the
+ * users of nl_set_functions().
+ */
+struct nl_function_wrapper {
+ bool (*nlmsg_add)(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type,
+ uint16_t flags, uint32_t len);
+ bool (*nlmsg_refill_buffer)(struct nl_writer *nw, size_t required_len);
+ bool (*nlmsg_flush)(struct nl_writer *nw);
+ bool (*nlmsg_end)(struct nl_writer *nw);
+ void (*nlmsg_abort)(struct nl_writer *nw);
+ void (*nlmsg_ignore_limit)(struct nl_writer *nw);
+ bool (*nl_writer_unicast)(struct nl_writer *nw, size_t size,
+ struct nlpcb *nlp, bool waitok);
+ bool (*nl_writer_group)(struct nl_writer *nw, size_t size,
+ uint16_t protocol, uint16_t group_id, int priv, bool waitok);
+ bool (*nlmsg_end_dump)(struct nl_writer *nw, int error, struct nlmsghdr *hdr);
+ int (*nl_modify_ifp_generic)(struct ifnet *ifp, struct nl_parsed_link *lattrs,
+ const struct nlattr_bmask *bm, struct nl_pstate *npt);
+ void (*nl_store_ifp_cookie)(struct nl_pstate *npt, struct ifnet *ifp);
+ struct nlpcb * (*nl_get_thread_nlp)(struct thread *td);
+};
+void nl_set_functions(const struct nl_function_wrapper *nl);
+
+
+
+#endif
+#endif
diff --git a/sys/netlink/route/common.h b/sys/netlink/route/common.h
new file mode 100644
index 000000000000..5cd3a5ee3524
--- /dev/null
+++ b/sys/netlink/route/common.h
@@ -0,0 +1,259 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Common defines for all parts of the NETLINK_ROUTE family
+ */
+#ifndef _NETLINK_ROUTE_COMMON_H_
+#define _NETLINK_ROUTE_COMMON_H_
+
+/* Defined NETLINK_ROUTE messages */
+enum {
+ NL_RTM_BASE = 16, /* lowest message type defined here */
+ NL_RTM_NEWLINK = 16, /* creates new interface */
+ NL_RTM_DELLINK = 17, /* deletes matching interface */
+ NL_RTM_GETLINK = 18, /* lists matching interfaces */
+ NL_RTM_SETLINK = 19, /* not supported */
+ NL_RTM_NEWADDR = 20, /* not supported */
+ NL_RTM_DELADDR = 21, /* not supported */
+ NL_RTM_GETADDR = 22, /* lists matching ifaddrs */
+ NL_RTM_NEWROUTE = 24, /* adds or changes a route */
+ NL_RTM_DELROUTE = 25, /* deletes matching route */
+ NL_RTM_GETROUTE = 26, /* lists matching routes */
+ NL_RTM_NEWNEIGH = 28, /* creates new arp/ndp entry */
+ NL_RTM_DELNEIGH = 29, /* deletes matching arp/ndp entry */
+ NL_RTM_GETNEIGH = 30, /* lists matching arp/ndp entry */
+ NL_RTM_NEWRULE = 32, /* not supported */
+ NL_RTM_DELRULE = 33, /* not supported */
+ NL_RTM_GETRULE = 34, /* not supported */
+ NL_RTM_NEWQDISC = 36, /* not supported */
+ NL_RTM_DELQDISC = 37, /* not supported */
+ NL_RTM_GETQDISC = 38, /* not supported */
+ NL_RTM_NEWTCLASS = 40, /* not supported */
+ NL_RTM_DELTCLASS = 41, /* not supported */
+ NL_RTM_GETTCLASS = 42, /* not supported */
+ NL_RTM_NEWTFILTER = 44, /* not supported */
+ NL_RTM_DELTFILTER = 45, /* not supported */
+ NL_RTM_GETTFILTER = 46, /* not supported */
+ NL_RTM_NEWACTION = 48, /* not supported */
+ NL_RTM_DELACTION = 49, /* not supported */
+ NL_RTM_GETACTION = 50, /* not supported */
+ NL_RTM_NEWPREFIX = 52, /* not supported */
+ NL_RTM_GETMULTICAST = 58, /* not supported */
+ NL_RTM_GETANYCAST = 62, /* not supported */
+ NL_RTM_NEWNEIGHTBL = 64, /* not supported */
+ NL_RTM_GETNEIGHTBL = 66, /* not supported */
+ NL_RTM_SETNEIGHTBL = 67, /* not supported */
+ NL_RTM_NEWNDUSEROPT = 68, /* not supported */
+ NL_RTM_NEWADDRLABEL = 72, /* not supported */
+ NL_RTM_DELADDRLABEL = 73, /* not supported */
+ NL_RTM_GETADDRLABEL = 74, /* not supported */
+ NL_RTM_GETDCB = 78, /* not supported */
+ NL_RTM_SETDCB = 79, /* not supported */
+ NL_RTM_NEWNETCONF = 80, /* not supported */
+ NL_RTM_GETNETCONF = 82, /* not supported */
+ NL_RTM_NEWMDB = 84, /* not supported */
+ NL_RTM_DELMDB = 85, /* not supported */
+ NL_RTM_GETMDB = 86, /* not supported */
+ NL_RTM_NEWNSID = 88, /* not supported */
+ NL_RTM_DELNSID = 89, /* not supported */
+ NL_RTM_GETNSID = 90, /* not supported */
+ NL_RTM_NEWSTATS = 92, /* not supported */
+ NL_RTM_GETSTATS = 94, /* not supported */
+ NL_RTM_NEWNEXTHOP = 104, /* creates new user nexthop */
+ NL_RTM_DELNEXTHOP = 105, /* deletes matching nexthop */
+ NL_RTM_GETNEXTHOP = 106, /* lists created user nexthops */
+ __NL_RTM_MAX, /* one past the highest defined message type */
+};
+#define NL_RTM_MAX (((__NL_RTM_MAX + 3) & ~3) - 1)
+
+#ifndef _KERNEL
+/*
+ * RTM_* namespace clashes with BSD rtsock namespace.
+ * Use NL_RTM_ prefix in the kernel and map it to RTM_
+ * for userland.
+ */
+#define RTM_BASE NL_RTM_BASE
+#define RTM_NEWLINK NL_RTM_NEWLINK
+#define RTM_DELLINK NL_RTM_DELLINK
+#define RTM_GETLINK NL_RTM_GETLINK
+#define RTM_SETLINK NL_RTM_SETLINK
+#define RTM_NEWADDR NL_RTM_NEWADDR
+#define RTM_DELADDR NL_RTM_DELADDR
+#define RTM_GETADDR NL_RTM_GETADDR
+#define RTM_NEWROUTE NL_RTM_NEWROUTE
+#define RTM_DELROUTE NL_RTM_DELROUTE
+#define RTM_GETROUTE NL_RTM_GETROUTE
+#define RTM_NEWNEIGH NL_RTM_NEWNEIGH
+#define RTM_DELNEIGH NL_RTM_DELNEIGH
+#define RTM_GETNEIGH NL_RTM_GETNEIGH
+#define RTM_NEWRULE NL_RTM_NEWRULE
+#define RTM_DELRULE NL_RTM_DELRULE
+#define RTM_GETRULE NL_RTM_GETRULE
+#define RTM_NEWQDISC NL_RTM_NEWQDISC
+#define RTM_DELQDISC NL_RTM_DELQDISC
+#define RTM_GETQDISC NL_RTM_GETQDISC
+#define RTM_NEWTCLASS NL_RTM_NEWTCLASS
+#define RTM_DELTCLASS NL_RTM_DELTCLASS
+#define RTM_GETTCLASS NL_RTM_GETTCLASS
+#define RTM_NEWTFILTER NL_RTM_NEWTFILTER
+#define RTM_DELTFILTER NL_RTM_DELTFILTER
+#define RTM_GETTFILTER NL_RTM_GETTFILTER
+#define RTM_NEWACTION NL_RTM_NEWACTION
+#define RTM_DELACTION NL_RTM_DELACTION
+#define RTM_GETACTION NL_RTM_GETACTION
+#define RTM_NEWPREFIX NL_RTM_NEWPREFIX
+#define RTM_GETMULTICAST NL_RTM_GETMULTICAST
+#define RTM_GETANYCAST NL_RTM_GETANYCAST
+#define RTM_NEWNEIGHTBL NL_RTM_NEWNEIGHTBL
+#define RTM_GETNEIGHTBL NL_RTM_GETNEIGHTBL
+#define RTM_SETNEIGHTBL NL_RTM_SETNEIGHTBL
+#define RTM_NEWNDUSEROPT NL_RTM_NEWNDUSEROPT
+#define RTM_NEWADDRLABEL NL_RTM_NEWADDRLABEL
+#define RTM_DELADDRLABEL NL_RTM_DELADDRLABEL
+#define RTM_GETADDRLABEL NL_RTM_GETADDRLABEL
+#define RTM_GETDCB NL_RTM_GETDCB
+#define RTM_SETDCB NL_RTM_SETDCB
+#define RTM_NEWNETCONF NL_RTM_NEWNETCONF
+#define RTM_GETNETCONF NL_RTM_GETNETCONF
+#define RTM_NEWMDB NL_RTM_NEWMDB
+#define RTM_DELMDB NL_RTM_DELMDB
+#define RTM_GETMDB NL_RTM_GETMDB
+#define RTM_NEWNSID NL_RTM_NEWNSID
+#define RTM_DELNSID NL_RTM_DELNSID
+#define RTM_GETNSID NL_RTM_GETNSID
+#define RTM_NEWSTATS NL_RTM_NEWSTATS
+#define RTM_GETSTATS NL_RTM_GETSTATS
+#define RTM_NEWNEXTHOP NL_RTM_NEWNEXTHOP
+#define RTM_DELNEXTHOP NL_RTM_DELNEXTHOP
+#define RTM_GETNEXTHOP NL_RTM_GETNEXTHOP
+#define __RTM_MAX __NL_RTM_MAX
+
+/*
+ * Keep the userland alias identical to NL_RTM_MAX, i.e.
+ * (((__NL_RTM_MAX + 3) & ~3) - 1). Defining it through NL_RTM_MAX also
+ * removes the dependency on roundup2() being visible to userland
+ * consumers of this header.
+ */
+#define RTM_MAX NL_RTM_MAX
+
+/* rtnetlink multicast groups - backwards compatibility for userspace */
+#define RTMGRP_LINK 0x01
+#define RTMGRP_NOTIFY 0x02
+#define RTMGRP_NEIGH 0x04
+#define RTMGRP_TC 0x08
+
+#define RTMGRP_IPV4_IFADDR 0x10
+#define RTMGRP_IPV4_MROUTE 0x20
+#define RTMGRP_IPV4_ROUTE 0x40
+#define RTMGRP_IPV4_RULE 0x80
+
+#define RTMGRP_IPV6_IFADDR 0x100
+#define RTMGRP_IPV6_MROUTE 0x200
+#define RTMGRP_IPV6_ROUTE 0x400
+#define RTMGRP_IPV6_IFINFO 0x800
+
+#define RTMGRP_DECnet_IFADDR 0x1000
+#define RTMGRP_DECnet_ROUTE 0x4000
+
+#define RTMGRP_IPV6_PREFIX 0x20000
+#endif
+
+/* Defined NETLINK_ROUTE multicast groups */
+/*
+ * Group numbers are assigned sequentially starting from RTNLGRP_NONE (0).
+ * Every enumerator except the RTNLGRP_NOP* placeholders is also defined as
+ * a macro expanding to itself, so its presence can be tested with #ifdef.
+ */
+enum rtnetlink_groups {
+ RTNLGRP_NONE,
+#define RTNLGRP_NONE RTNLGRP_NONE
+ RTNLGRP_LINK,
+#define RTNLGRP_LINK RTNLGRP_LINK
+ RTNLGRP_NOTIFY,
+#define RTNLGRP_NOTIFY RTNLGRP_NOTIFY
+ RTNLGRP_NEIGH,
+#define RTNLGRP_NEIGH RTNLGRP_NEIGH
+ RTNLGRP_TC,
+#define RTNLGRP_TC RTNLGRP_TC
+ RTNLGRP_IPV4_IFADDR,
+#define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR
+ RTNLGRP_IPV4_MROUTE,
+#define RTNLGRP_IPV4_MROUTE RTNLGRP_IPV4_MROUTE
+ RTNLGRP_IPV4_ROUTE,
+#define RTNLGRP_IPV4_ROUTE RTNLGRP_IPV4_ROUTE
+ RTNLGRP_IPV4_RULE,
+#define RTNLGRP_IPV4_RULE RTNLGRP_IPV4_RULE
+ RTNLGRP_IPV6_IFADDR,
+#define RTNLGRP_IPV6_IFADDR RTNLGRP_IPV6_IFADDR
+ RTNLGRP_IPV6_MROUTE,
+#define RTNLGRP_IPV6_MROUTE RTNLGRP_IPV6_MROUTE
+ RTNLGRP_IPV6_ROUTE,
+#define RTNLGRP_IPV6_ROUTE RTNLGRP_IPV6_ROUTE
+ RTNLGRP_IPV6_IFINFO,
+#define RTNLGRP_IPV6_IFINFO RTNLGRP_IPV6_IFINFO
+ RTNLGRP_DECnet_IFADDR,
+#define RTNLGRP_DECnet_IFADDR RTNLGRP_DECnet_IFADDR
+ RTNLGRP_NOP2,
+ RTNLGRP_DECnet_ROUTE,
+#define RTNLGRP_DECnet_ROUTE RTNLGRP_DECnet_ROUTE
+ RTNLGRP_DECnet_RULE,
+#define RTNLGRP_DECnet_RULE RTNLGRP_DECnet_RULE
+ RTNLGRP_NOP4,
+ RTNLGRP_IPV6_PREFIX,
+#define RTNLGRP_IPV6_PREFIX RTNLGRP_IPV6_PREFIX
+ RTNLGRP_IPV6_RULE,
+#define RTNLGRP_IPV6_RULE RTNLGRP_IPV6_RULE
+ RTNLGRP_ND_USEROPT,
+#define RTNLGRP_ND_USEROPT RTNLGRP_ND_USEROPT
+ RTNLGRP_PHONET_IFADDR,
+#define RTNLGRP_PHONET_IFADDR RTNLGRP_PHONET_IFADDR
+ RTNLGRP_PHONET_ROUTE,
+#define RTNLGRP_PHONET_ROUTE RTNLGRP_PHONET_ROUTE
+ RTNLGRP_DCB,
+#define RTNLGRP_DCB RTNLGRP_DCB
+ RTNLGRP_IPV4_NETCONF,
+#define RTNLGRP_IPV4_NETCONF RTNLGRP_IPV4_NETCONF
+ RTNLGRP_IPV6_NETCONF,
+#define RTNLGRP_IPV6_NETCONF RTNLGRP_IPV6_NETCONF
+ RTNLGRP_MDB,
+#define RTNLGRP_MDB RTNLGRP_MDB
+ RTNLGRP_MPLS_ROUTE,
+#define RTNLGRP_MPLS_ROUTE RTNLGRP_MPLS_ROUTE
+ RTNLGRP_NSID,
+#define RTNLGRP_NSID RTNLGRP_NSID
+ RTNLGRP_MPLS_NETCONF,
+#define RTNLGRP_MPLS_NETCONF RTNLGRP_MPLS_NETCONF
+ RTNLGRP_IPV4_MROUTE_R,
+#define RTNLGRP_IPV4_MROUTE_R RTNLGRP_IPV4_MROUTE_R
+ RTNLGRP_IPV6_MROUTE_R,
+#define RTNLGRP_IPV6_MROUTE_R RTNLGRP_IPV6_MROUTE_R
+ RTNLGRP_NEXTHOP,
+#define RTNLGRP_NEXTHOP RTNLGRP_NEXTHOP
+ RTNLGRP_BRVLAN,
+#define RTNLGRP_BRVLAN RTNLGRP_BRVLAN
+ __RTNLGRP_MAX
+};
+#define RTNLGRP_MAX (__RTNLGRP_MAX - 1)
+
+
+/* Defined NETLINK_ROUTE virtual multicast address families */
+#define RTNL_FAMILY_IPMR 128 /* Not supported */
+#define RTNL_FAMILY_IP6MR 129 /* Not supported */
+#define RTNL_FAMILY_MAX 129
+
+#endif
+
diff --git a/sys/netlink/route/iface.c b/sys/netlink/route/iface.c
new file mode 100644
index 000000000000..8b871576d0b2
--- /dev/null
+++ b/sys/netlink/route/iface.c
@@ -0,0 +1,1530 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include <sys/types.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/jail.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/syslog.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_media.h>
+#include <net/if_var.h>
+#include <net/if_clone.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <netinet/in_var.h>
+#include <netinet6/in6_var.h>
+#include <netinet6/scope6_var.h> /* scope deembedding */
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_route.h>
+#include <netlink/route/route_var.h>
+
+#define DEBUG_MOD_NAME nl_iface
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_INFO);
+
+/*
+ * Bundle of arguments and running state for a netlink dump.
+ * NOTE(review): presumably passed to per-object dump callbacks, with
+ * count/dumped/error updated as the walk proceeds - confirm in the users
+ * further down this file.
+ */
+struct netlink_walkargs {
+ struct nl_writer *nw;
+ struct nlmsghdr hdr;
+ struct nlpcb *so;
+ struct ucred *cred;
+ uint32_t fibnum;
+ int family;
+ int error;
+ int count;
+ int dumped;
+};
+
+static eventhandler_tag ifdetach_event, ifattach_event, iflink_event, ifaddr_event;
+
+static SLIST_HEAD(, nl_cloner) nl_cloners = SLIST_HEAD_INITIALIZER(nl_cloners);
+
+static struct sx rtnl_cloner_lock;
+SX_SYSINIT(rtnl_cloner_lock, &rtnl_cloner_lock, "rtnl cloner lock");
+
+/* These are external hooks for CARP. */
+extern int (*carp_get_vhid_p)(struct ifaddr *);
+
+/*
+ * RTM_GETLINK request
+ * sendto(3, {{len=32, type=RTM_GETLINK, flags=NLM_F_REQUEST|NLM_F_DUMP, seq=1641940952, pid=0},
+ * {ifi_family=AF_INET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}, 32, 0, NULL, 0) = 32
+ *
+ * Reply:
+ * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_ETHER, ifi_index=if_nametoindex("enp0s31f6"), ifi_flags=IFF_UP|IFF_BROADCAST|IFF_RUNNING|IFF_MULTICAST|IFF_LOWER_UP, ifi_change=0},
+{{nla_len=10, nla_type=IFLA_ADDRESS}, "\xfe\x54\x00\x52\x3e\x90"}
+
+[
+{{nla_len=14, nla_type=IFLA_IFNAME}, "enp0s31f6"},
+{{nla_len=8, nla_type=IFLA_TXQLEN}, 1000},
+{{nla_len=5, nla_type=IFLA_OPERSTATE}, 6},
+{{nla_len=5, nla_type=IFLA_LINKMODE}, 0},
+{{nla_len=8, nla_type=IFLA_MTU}, 1500},
+{{nla_len=8, nla_type=IFLA_MIN_MTU}, 68},
+ {{nla_len=8, nla_type=IFLA_MAX_MTU}, 9000},
+{{nla_len=8, nla_type=IFLA_GROUP}, 0},
+{{nla_len=8, nla_type=IFLA_PROMISCUITY}, 0},
+{{nla_len=8, nla_type=IFLA_NUM_TX_QUEUES}, 1},
+{{nla_len=8, nla_type=IFLA_GSO_MAX_SEGS}, 65535},
+{{nla_len=8, nla_type=IFLA_GSO_MAX_SIZE}, 65536},
+{{nla_len=8, nla_type=IFLA_NUM_RX_QUEUES}, 1},
+{{nla_len=5, nla_type=IFLA_CARRIER}, 1},
+{{nla_len=13, nla_type=IFLA_QDISC}, "fq_codel"},
+{{nla_len=8, nla_type=IFLA_CARRIER_CHANGES}, 2},
+{{nla_len=5, nla_type=IFLA_PROTO_DOWN}, 0},
+{{nla_len=8, nla_type=IFLA_CARRIER_UP_COUNT}, 1},
+{{nla_len=8, nla_type=IFLA_CARRIER_DOWN_COUNT}, 1},
+ */
+
+/* Link state computed by get_operstate() and reported by dump_iface(). */
+struct if_state {
+ uint8_t ifla_operstate; /* IF_OPER_* value sent as IFLA_OPERSTATE */
+ uint8_t ifla_carrier; /* 0 or 1, sent as IFLA_CARRIER */
+};
+
+/*
+ * Derives the link state of @ifp from its SIOCGIFMEDIA answer.
+ * @pstate is only modified when the ioctl succeeds and the active media
+ * type is IFM_ETHER; otherwise the caller's values are kept.
+ */
+static void
+get_operstate_ether(if_t ifp, struct if_state *pstate)
+{
+	struct ifmediareq media = {};
+	int rc;
+
+	rc = if_ioctl(ifp, SIOCGIFMEDIA, (void *)&media);
+	if (rc != 0) {
+		NL_LOG(LOG_DEBUG, "error calling SIOCGIFMEDIA on %s: %d",
+		    if_name(ifp), rc);
+		return;
+	}
+
+	if (IFM_TYPE(media.ifm_active) != IFM_ETHER)
+		return;
+
+	if ((media.ifm_status & IFM_ACTIVE) == 0) {
+		pstate->ifla_operstate = IF_OPER_DOWN;
+		return;
+	}
+
+	/* Active media: carrier is present, monitor mode reports dormant. */
+	pstate->ifla_carrier = 1;
+	if (if_getflags(ifp) & IFF_MONITOR)
+		pstate->ifla_operstate = IF_OPER_DORMANT;
+	else
+		pstate->ifla_operstate = IF_OPER_UP;
+}
+
+/*
+ * Appends an IFLA_STATS64 attribute holding a struct rtnl_link_stats64
+ * filled from the interface counters of @ifp.
+ * Returns false when the attribute space cannot be reserved in @nw.
+ */
+static bool
+get_stats(struct nl_writer *nw, if_t ifp)
+{
+	struct rtnl_link_stats64 *st;
+	struct nlattr *attr;
+	int attr_len;
+
+	attr_len = sizeof(struct nlattr) + sizeof(*st);
+	attr = nlmsg_reserve_data(nw, attr_len, struct nlattr);
+	if (attr == NULL)
+		return (false);
+
+	attr->nla_len = attr_len;
+	attr->nla_type = IFLA_STATS64;
+
+	/* The statistics payload directly follows the attribute header. */
+	st = (struct rtnl_link_stats64 *)(attr + 1);
+
+	st->rx_packets = if_getcounter(ifp, IFCOUNTER_IPACKETS);
+	st->tx_packets = if_getcounter(ifp, IFCOUNTER_OPACKETS);
+	st->rx_bytes = if_getcounter(ifp, IFCOUNTER_IBYTES);
+	st->tx_bytes = if_getcounter(ifp, IFCOUNTER_OBYTES);
+	st->rx_errors = if_getcounter(ifp, IFCOUNTER_IERRORS);
+	st->tx_errors = if_getcounter(ifp, IFCOUNTER_OERRORS);
+	st->rx_dropped = if_getcounter(ifp, IFCOUNTER_IQDROPS);
+	st->tx_dropped = if_getcounter(ifp, IFCOUNTER_OQDROPS);
+	st->multicast = if_getcounter(ifp, IFCOUNTER_IMCASTS);
+	st->rx_nohandler = if_getcounter(ifp, IFCOUNTER_NOPROTO);
+
+	return (true);
+}
+
+/*
+ * Computes operstate and carrier of @ifp into @pstate.
+ * IFT_ETHER and IFT_L2VLAN interfaces are resolved through their media
+ * status; for every other type the administrative IFF_UP flag is mapped
+ * to the operational state.
+ */
+static void
+get_operstate(if_t ifp, struct if_state *pstate)
+{
+	int iftype;
+
+	/* Defaults: unknown state, no carrier. */
+	pstate->ifla_carrier = 0;
+	pstate->ifla_operstate = IF_OPER_UNKNOWN;
+
+	iftype = if_gettype(ifp);
+	if (iftype == IFT_ETHER || iftype == IFT_L2VLAN) {
+		get_operstate_ether(ifp, pstate);
+		return;
+	}
+
+	if ((if_getflags(ifp) & IFF_UP) == 0) {
+		pstate->ifla_operstate = IF_OPER_DOWN;
+		return;
+	}
+	pstate->ifla_operstate = IF_OPER_UP;
+	pstate->ifla_carrier = 1;
+}
+
+/*
+ * Adds an IFLAF_ORIG_HWADDR attribute with the address returned by
+ * if_gethwaddr(); nothing is added when that call fails.
+ */
+static void
+get_hwaddr(struct nl_writer *nw, if_t ifp)
+{
+	struct ifreq req = {};
+
+	if (if_gethwaddr(ifp, &req) != 0)
+		return;
+
+	nlattr_add(nw, IFLAF_ORIG_HWADDR, if_getaddrlen(ifp),
+	    req.ifr_addr.sa_data);
+}
+
+/* Returns the bitwise OR of the interface flags and driver flags of @ifp. */
+static unsigned
+ifp_flags_to_netlink(const if_t ifp)
+{
+	unsigned nl_flags;
+
+	nl_flags = if_getflags(ifp);
+	nl_flags |= if_getdrvflags(ifp);
+
+	return (nl_flags);
+}
+
+#define LLADDR_CONST(s) ((const void *)((s)->sdl_data + (s)->sdl_nlen))
+/*
+ * Adds the raw address stored in @sa as attribute @attr.
+ * A NULL @sa, AF_UNSPEC and unsupported families add nothing and still
+ * count as success; otherwise the result of nlattr_add() is returned.
+ */
+static bool
+dump_sa(struct nl_writer *nw, int attr, const struct sockaddr *sa)
+{
+	const void *payload = NULL;
+	uint32_t payload_len = 0;
+#ifdef INET6
+	struct in6_addr plain6;
+#endif
+
+	/* Nothing to dump; empty SAs are ignored without warning. */
+	if (sa == NULL || sa->sa_family == AF_UNSPEC)
+		return (true);
+
+	switch (sa->sa_family) {
+#ifdef INET
+	case AF_INET: {
+		const struct sockaddr_in *sin = (const struct sockaddr_in *)sa;
+
+		payload = &sin->sin_addr;
+		payload_len = sizeof(struct in_addr);
+		break;
+	}
+#endif
+#ifdef INET6
+	case AF_INET6: {
+		const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sa;
+
+		/*
+		 * The third argument is an output of in6_splitscope() that
+		 * is not needed here; payload_len merely serves as scratch
+		 * space and is overwritten right after.
+		 */
+		in6_splitscope(&sin6->sin6_addr, &plain6, &payload_len);
+		payload_len = sizeof(struct in6_addr);
+		payload = &plain6;
+		break;
+	}
+#endif
+	case AF_LINK: {
+		const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)sa;
+
+		payload = LLADDR_CONST(sdl);
+		payload_len = sdl->sdl_alen;
+		break;
+	}
+	default:
+		NL_LOG(LOG_DEBUG2, "unsupported family: %d, skipping", sa->sa_family);
+		return (true);
+	}
+
+	return (nlattr_add(nw, attr, payload_len, payload));
+}
+
+/*
+ * Adds the nested IFLAF_CAPS attribute, encoding the interface capability
+ * words as NLA_BITSET_SIZE / NLA_BITSET_MASK / NLA_BITSET_VALUE.
+ *
+ * Returns false if the nested attribute could not be started, true
+ * otherwise. The results of the inner nlattr_add*() calls are not checked.
+ */
+static bool
+dump_iface_caps(struct nl_writer *nw, struct ifnet *ifp)
+{
+ int off = nlattr_add_nested(nw, IFLAF_CAPS);
+ uint32_t active_caps[roundup2(IFCAP_B_SIZE, 32) / 32] = {};
+ uint32_t all_caps[roundup2(IFCAP_B_SIZE, 32) / 32] = {};
+
+ /* Both arrays must hold at least the two 32-bit words stored below */
+ MPASS(sizeof(active_caps) >= 8);
+ MPASS(sizeof(all_caps) >= 8);
+
+ if (off == 0)
+ return (false);
+
+ /*
+ * NOTE(review): active_caps is filled from if_getcapabilities*() and
+ * all_caps from if_getcapenable*(), which reads as the reverse of what
+ * the variable names suggest. Confirm against the consumer of
+ * IFLAF_CAPS before changing what is sent as MASK and as VALUE.
+ */
+ active_caps[0] = (uint32_t)if_getcapabilities(ifp);
+ all_caps[0] = (uint32_t)if_getcapenable(ifp);
+ active_caps[1] = (uint32_t)if_getcapabilities2(ifp);
+ all_caps[1] = (uint32_t)if_getcapenable2(ifp);
+
+ nlattr_add_u32(nw, NLA_BITSET_SIZE, IFCAP_B_SIZE);
+ nlattr_add(nw, NLA_BITSET_MASK, sizeof(all_caps), all_caps);
+ nlattr_add(nw, NLA_BITSET_VALUE, sizeof(active_caps), active_caps);
+
+ /* Close the nested attribute started at 'off' */
+ nlattr_set_len(nw, off);
+
+ return (true);
+}
+
+/*
+ * Dumps interface state, properties and metrics.
+ * @nw: message writer
+ * @ifp: target interface
+ * @hdr: template header
+ * @if_flags_mask: changed if_[drv]_flags bitmask
+ *
+ * Returns true if the message was completed with nlmsg_end(). Returns false
+ * after aborting the message when the reply header could not be started or
+ * nlmsg_end() failed.
+ *
+ * This function is called without epoch and MAY sleep.
+ */
+static bool
+dump_iface(struct nl_writer *nw, if_t ifp, const struct nlmsghdr *hdr,
+ int if_flags_mask)
+{
+ struct epoch_tracker et;
+ struct ifinfomsg *ifinfo;
+
+ NL_LOG(LOG_DEBUG3, "dumping interface %s data", if_name(ifp));
+
+ if (!nlmsg_reply(nw, hdr, sizeof(struct ifinfomsg)))
+ goto enomem;
+
+ /* Fixed-size ifinfomsg header */
+ ifinfo = nlmsg_reserve_object(nw, struct ifinfomsg);
+ ifinfo->ifi_family = AF_UNSPEC;
+ ifinfo->__ifi_pad = 0;
+ ifinfo->ifi_type = if_gettype(ifp);
+ ifinfo->ifi_index = if_getindex(ifp);
+ ifinfo->ifi_flags = ifp_flags_to_netlink(ifp);
+ ifinfo->ifi_change = if_flags_mask;
+
+ struct if_state ifs = {};
+ get_operstate(ifp, &ifs);
+
+ /* IFF_LOWER_UP is derived from the operational state */
+ if (ifs.ifla_operstate == IF_OPER_UP)
+ ifinfo->ifi_flags |= IFF_LOWER_UP;
+
+ nlattr_add_string(nw, IFLA_IFNAME, if_name(ifp));
+ nlattr_add_u8(nw, IFLA_OPERSTATE, ifs.ifla_operstate);
+ nlattr_add_u8(nw, IFLA_CARRIER, ifs.ifla_carrier);
+
+/*
+ nlattr_add_u8(nw, IFLA_PROTO_DOWN, val);
+ nlattr_add_u8(nw, IFLA_LINKMODE, val);
+*/
+ if (if_getaddrlen(ifp) != 0) {
+ struct ifaddr *ifa;
+ struct ifa_iter it;
+
+ /* The first address returned by the iterator is used as IFLA_ADDRESS */
+ NET_EPOCH_ENTER(et);
+ ifa = ifa_iter_start(ifp, &it);
+ if (ifa != NULL)
+ dump_sa(nw, IFLA_ADDRESS, ifa->ifa_addr);
+ ifa_iter_finish(&it);
+ NET_EPOCH_EXIT(et);
+ }
+
+ if ((if_getbroadcastaddr(ifp) != NULL)) {
+ nlattr_add(nw, IFLA_BROADCAST, if_getaddrlen(ifp),
+ if_getbroadcastaddr(ifp));
+ }
+
+ nlattr_add_u32(nw, IFLA_MTU, if_getmtu(ifp));
+/*
+ nlattr_add_u32(nw, IFLA_MIN_MTU, 60);
+ nlattr_add_u32(nw, IFLA_MAX_MTU, 9000);
+ nlattr_add_u32(nw, IFLA_GROUP, 0);
+*/
+
+ if (if_getdescr(ifp) != NULL)
+ nlattr_add_string(nw, IFLA_IFALIAS, if_getdescr(ifp));
+
+ /* Store FreeBSD-specific attributes */
+ int off = nlattr_add_nested(nw, IFLA_FREEBSD);
+ if (off != 0) {
+ get_hwaddr(nw, ifp);
+ dump_iface_caps(nw, ifp);
+
+ nlattr_set_len(nw, off);
+ }
+
+ get_stats(nw, ifp);
+
+ /* Reported as 0/1 from IFF_PROMISC, not as a counter */
+ uint32_t val = (if_getflags(ifp) & IFF_PROMISC) != 0;
+ nlattr_add_u32(nw, IFLA_PROMISCUITY, val);
+
+ ifc_dump_ifp_nl(ifp, nw);
+
+ /*
+ * Individual nlattr_add*() results above are not checked; failure is
+ * detected here through nlmsg_end().
+ */
+ if (nlmsg_end(nw))
+ return (true);
+
+enomem:
+ NL_LOG(LOG_DEBUG, "unable to dump interface %s state (ENOMEM)", if_name(ifp));
+ nlmsg_abort(nw);
+ return (false);
+}
+
+/*
+ * Strict-mode header validator for ifinfomsg: the padding, type, flags and
+ * change fields must all be zero. Reports an error message and returns
+ * false otherwise.
+ */
+static bool
+check_ifmsg(void *hdr, struct nl_pstate *npt)
+{
+	const struct ifinfomsg *ifm = hdr;
+	bool all_zero;
+
+	all_zero = (ifm->__ifi_pad == 0 && ifm->ifi_type == 0 &&
+	    ifm->ifi_flags == 0 && ifm->ifi_change == 0);
+	if (all_zero)
+		return (true);
+
+	nlmsg_report_err_msg(npt,
+	    "strict checking: non-zero values in ifinfomsg header");
+	return (false);
+}
+
+/*
+ * Parser tables mapping an RTM_*LINK request onto struct nl_parsed_link.
+ * _IN() is an offset into the wire ifinfomsg header, _OUT() an offset into
+ * the parsed structure.
+ */
+#define _IN(_field) offsetof(struct ifinfomsg, _field)
+#define _OUT(_field) offsetof(struct nl_parsed_link, _field)
+/* Fixed header fields */
+static const struct nlfield_parser nlf_p_if[] = {
+ { .off_in = _IN(ifi_type), .off_out = _OUT(ifi_type), .cb = nlf_get_u16 },
+ { .off_in = _IN(ifi_index), .off_out = _OUT(ifi_index), .cb = nlf_get_u32 },
+ { .off_in = _IN(ifi_flags), .off_out = _OUT(ifi_flags), .cb = nlf_get_u32 },
+ { .off_in = _IN(ifi_change), .off_out = _OUT(ifi_change), .cb = nlf_get_u32 },
+};
+
+/* Attributes nested inside IFLA_LINKINFO */
+static const struct nlattr_parser nla_p_linfo[] = {
+ { .type = IFLA_INFO_KIND, .off = _OUT(ifla_cloner), .cb = nlattr_get_stringn },
+ { .type = IFLA_INFO_DATA, .off = _OUT(ifla_idata), .cb = nlattr_get_nla },
+};
+NL_DECLARE_ATTR_PARSER(linfo_parser, nla_p_linfo);
+
+/*
+ * Top-level attributes. IFLA_IFNAME and IFLA_ALT_IFNAME both store into
+ * ifla_ifname.
+ */
+static const struct nlattr_parser nla_p_if[] = {
+ { .type = IFLA_IFNAME, .off = _OUT(ifla_ifname), .cb = nlattr_get_string },
+ { .type = IFLA_MTU, .off = _OUT(ifla_mtu), .cb = nlattr_get_uint32 },
+ { .type = IFLA_LINK, .off = _OUT(ifla_link), .cb = nlattr_get_uint32 },
+ { .type = IFLA_LINKINFO, .arg = &linfo_parser, .cb = nlattr_get_nested },
+ { .type = IFLA_IFALIAS, .off = _OUT(ifla_ifalias), .cb = nlattr_get_string },
+ { .type = IFLA_GROUP, .off = _OUT(ifla_group), .cb = nlattr_get_string },
+ { .type = IFLA_ALT_IFNAME, .off = _OUT(ifla_ifname), .cb = nlattr_get_string },
+};
+#undef _IN
+#undef _OUT
+/* check_ifmsg() is the header validator used for strict parsing */
+NL_DECLARE_STRICT_PARSER(ifmsg_parser, struct ifinfomsg, check_ifmsg, nlf_p_if, nla_p_if);
+
+/*
+ * Interface filter used by RTM_GETLINK.
+ * @ifp: candidate interface
+ * @_arg: struct nl_parsed_link with the requested criteria
+ *
+ * A zero ifi_index, a zero ifi_type or a NULL ifla_ifname means "do not
+ * filter on this field". Returns true when @ifp satisfies every criterion
+ * that is set.
+ */
+static bool
+match_iface(if_t ifp, void *_arg)
+{
+	struct nl_parsed_link *attrs = (struct nl_parsed_link *)_arg;
+
+	if (attrs->ifi_index != 0 && attrs->ifi_index != if_getindex(ifp))
+		return (false);
+	/* The type filter must be compared with the type, not the index */
+	if (attrs->ifi_type != 0 && attrs->ifi_type != if_gettype(ifp))
+		return (false);
+	if (attrs->ifla_ifname != NULL && strcmp(attrs->ifla_ifname, if_name(ifp)))
+		return (false);
+	/* TODO: add group match */
+
+	return (true);
+}
+
+/*
+ * Per-interface dump callback: writes one link message using the writer
+ * and header template stored in the walk arguments.
+ * Returns 0 on success and ENOMEM if the interface could not be dumped.
+ */
+static int
+dump_cb(if_t ifp, void *_arg)
+{
+	struct netlink_walkargs *wa = _arg;
+
+	return (dump_iface(wa->nw, ifp, &wa->hdr, 0) ? 0 : ENOMEM);
+}
+
+/*
+ * {nlmsg_len=52, nlmsg_type=RTM_GETLINK, nlmsg_flags=NLM_F_REQUEST, nlmsg_seq=1662842818, nlmsg_pid=0},
+ * {ifi_family=AF_PACKET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0},
+ * [
+ * [{nla_len=10, nla_type=IFLA_IFNAME}, "vnet9"],
+ * [{nla_len=8, nla_type=IFLA_EXT_MASK}, RTEXT_FILTER_VF]
+ * ]
+ */
+/*
+ * RTM_GETLINK handler.
+ *
+ * When the request carries an interface index or name, a single interface
+ * is looked up and dumped ("fast track"); ENODEV is returned if it does not
+ * exist or does not match the remaining criteria. Otherwise every interface
+ * accepted by match_iface() is dumped as a multipart reply.
+ */
+static int
+rtnl_handle_getlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+ struct epoch_tracker et;
+ if_t ifp;
+ int error = 0;
+
+ struct nl_parsed_link attrs = {};
+ error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs);
+ if (error != 0)
+ return (error);
+
+ /* Reply header template: echoes pid/seq/flags, type is RTM_NEWLINK */
+ struct netlink_walkargs wa = {
+ .so = nlp,
+ .nw = npt->nw,
+ .hdr.nlmsg_pid = hdr->nlmsg_pid,
+ .hdr.nlmsg_seq = hdr->nlmsg_seq,
+ .hdr.nlmsg_flags = hdr->nlmsg_flags,
+ .hdr.nlmsg_type = NL_RTM_NEWLINK,
+ };
+
+ /* Fast track for an interface w/ explicit name or index match */
+ if ((attrs.ifi_index != 0) || (attrs.ifla_ifname != NULL)) {
+ if (attrs.ifi_index != 0) {
+ NLP_LOG(LOG_DEBUG3, nlp, "fast track -> searching index %u",
+ attrs.ifi_index);
+ NET_EPOCH_ENTER(et);
+ ifp = ifnet_byindex_ref(attrs.ifi_index);
+ NET_EPOCH_EXIT(et);
+ } else {
+ NLP_LOG(LOG_DEBUG3, nlp, "fast track -> searching name %s",
+ attrs.ifla_ifname);
+ ifp = ifunit_ref(attrs.ifla_ifname);
+ }
+
+ if (ifp != NULL) {
+ /* Both lookups return a referenced ifp; released below */
+ if (match_iface(ifp, &attrs)) {
+ if (!dump_iface(wa.nw, ifp, &wa.hdr, 0))
+ error = ENOMEM;
+ } else
+ error = ENODEV;
+ if_rele(ifp);
+ } else
+ error = ENODEV;
+ return (error);
+ }
+
+ /* Always treat non-direct-match as a multipart message */
+ wa.hdr.nlmsg_flags |= NLM_F_MULTI;
+
+ /*
+ * Fetching some link properties requires performing ioctls that may block.
+ * Address it by saving referenced pointers of the matching links,
+ * exiting from epoch and going through the list one-by-one.
+ */
+
+ NL_LOG(LOG_DEBUG2, "Start dump");
+ if_foreach_sleep(match_iface, &attrs, dump_cb, &wa);
+ NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa.count, wa.dumped);
+
+ /* The result of if_foreach_sleep() is not captured; error is still 0 here */
+ if (!nlmsg_end_dump(wa.nw, error, &wa.hdr)) {
+ NL_LOG(LOG_DEBUG, "Unable to finalize the dump");
+ return (ENOMEM);
+ }
+
+ return (error);
+}
+
+/*
+ * sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[
+ * {nlmsg_len=60, nlmsg_type=RTM_NEWLINK, nlmsg_flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, nlmsg_seq=1662715618, nlmsg_pid=0},
+ * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0},
+ * {nla_len=11, nla_type=IFLA_IFNAME}, "dummy0"],
+ * [
+ * {nla_len=16, nla_type=IFLA_LINKINFO},
+ * [
+ * {nla_len=9, nla_type=IFLA_INFO_KIND}, "dummy"...
+ * ]
+ * ]
+ */
+
+/*
+ * RTM_DELLINK handler: resolves ifi_index to an interface and destroys it
+ * through if_clone_destroy() while holding ifnet_detach_sxlock.
+ * Returns ENOENT when the index is unknown, otherwise the result of
+ * if_clone_destroy().
+ */
+static int
+rtnl_handle_dellink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct nl_parsed_link attrs = {};
+	struct epoch_tracker et;
+	if_t target;
+	int rc;
+
+	rc = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs);
+	if (rc != 0)
+		return (rc);
+
+	/* Take a reference so the ifp stays valid for the log lines below */
+	NET_EPOCH_ENTER(et);
+	target = ifnet_byindex_ref(attrs.ifi_index);
+	NET_EPOCH_EXIT(et);
+	if (target == NULL) {
+		NLP_LOG(LOG_DEBUG, nlp, "unable to find interface %u", attrs.ifi_index);
+		return (ENOENT);
+	}
+	NLP_LOG(LOG_DEBUG3, nlp, "mapped ifindex %u to %s", attrs.ifi_index, if_name(target));
+
+	sx_xlock(&ifnet_detach_sxlock);
+	rc = if_clone_destroy(if_name(target));
+	sx_xunlock(&ifnet_detach_sxlock);
+
+	NLP_LOG(LOG_DEBUG2, nlp, "deleting interface %s returned %d", if_name(target), rc);
+
+	if_rele(target);
+	return (rc);
+}
+
+/*
+ * New link:
+ * type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1668185590, pid=0},
+ * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}
+ * [
+ * {{nla_len=8, nla_type=IFLA_MTU}, 123},
+ * {{nla_len=10, nla_type=IFLA_IFNAME}, "vlan1"},
+ * {{nla_len=24, nla_type=IFLA_LINKINFO},
+ * [
+ * {{nla_len=8, nla_type=IFLA_INFO_KIND}, "vlan"...},
+ * {{nla_len=12, nla_type=IFLA_INFO_DATA}, "\x06\x00\x01\x00\x7b\x00\x00\x00"}]}]}
+ *
+ * Update link:
+ * type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=1668185923, pid=0},
+ * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=if_nametoindex("lo"), ifi_flags=0, ifi_change=0},
+ * {{nla_len=8, nla_type=IFLA_MTU}, 123}}
+ *
+ *
+ * Check command availability:
+ * type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=0, pid=0},
+ * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}
+ */
+
+
+/*
+ * Creates a new cloned interface from an RTM_NEWLINK request.
+ * Both IFLA_IFNAME and IFLA_INFO_KIND must be present and non-empty,
+ * otherwise EINVAL is returned. On success the new ifp is stored as the
+ * request cookie. Returns the error recorded by the cloner.
+ */
+static int
+create_link(struct nlmsghdr *hdr, struct nl_parsed_link *lattrs,
+    struct nlattr_bmask *bm, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	const char *ifname = lattrs->ifla_ifname;
+	const char *kind = lattrs->ifla_cloner;
+
+	if (ifname == NULL || ifname[0] == '\0') {
+		NLMSG_REPORT_ERR_MSG(npt, "empty IFLA_IFNAME attribute");
+		return (EINVAL);
+	}
+	if (kind == NULL || kind[0] == '\0') {
+		NLMSG_REPORT_ERR_MSG(npt, "empty IFLA_INFO_KIND attribute");
+		return (EINVAL);
+	}
+
+	struct ifc_data_nl ifd = {
+		.flags = IFC_F_CREATE,
+		.lattrs = lattrs,
+		.bm = bm,
+		.npt = npt,
+	};
+
+	if (ifc_create_ifp_nl(lattrs->ifla_ifname, &ifd)) {
+		if (ifd.error == 0)
+			nl_store_ifp_cookie(npt, ifd.ifp);
+	}
+
+	return (ifd.error);
+}
+
+/*
+ * Modifies an existing interface from an RTM_NEWLINK request without
+ * NLM_F_CREATE. The interface is selected by ifi_index when it is non-zero
+ * and by IFLA_IFNAME otherwise. Returns EPERM for a request naming no
+ * interface at all, ENOENT when the lookup fails, otherwise the result of
+ * the cloner-specific or generic modification routine.
+ */
+static int
+modify_link(struct nlmsghdr *hdr, struct nl_parsed_link *lattrs,
+    struct nlattr_bmask *bm, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct epoch_tracker et;
+	if_t ifp;
+
+	if (lattrs->ifi_index == 0 && lattrs->ifla_ifname == NULL) {
+		/*
+		 * Applications like ip(8) verify RTM_NEWLINK command
+		 * existence by calling it with empty arguments. Always
+		 * return "innocent" error in that case.
+		 */
+		NLMSG_REPORT_ERR_MSG(npt, "empty ifi_index field");
+		return (EPERM);
+	}
+
+	if (lattrs->ifi_index != 0) {
+		/* The index takes precedence over the name */
+		NET_EPOCH_ENTER(et);
+		ifp = ifnet_byindex_ref(lattrs->ifi_index);
+		NET_EPOCH_EXIT(et);
+		if (ifp == NULL) {
+			NLMSG_REPORT_ERR_MSG(npt, "unable to find interface #%u",
+			    lattrs->ifi_index);
+			return (ENOENT);
+		}
+	} else {
+		/* No index, so the check above guarantees a name */
+		ifp = ifunit_ref(lattrs->ifla_ifname);
+		if (ifp == NULL) {
+			NLMSG_REPORT_ERR_MSG(npt, "unable to find interface %s",
+			    lattrs->ifla_ifname);
+			return (ENOENT);
+		}
+	}
+
+	MPASS(ifp != NULL);
+
+	/*
+	 * The request addresses either a cloned interface, which is handled
+	 * by the cloner-specific routine, or a non-cloned (e.g. "physical")
+	 * one, which falls back to the generic routine.
+	 */
+	struct ifc_data_nl ifd = { .lattrs = lattrs, .bm = bm, .npt = npt };
+	if (!ifc_modify_ifp_nl(ifp, &ifd))
+		ifd.error = nl_modify_ifp_generic(ifp, lattrs, bm, npt);
+
+	if_rele(ifp);
+
+	return (ifd.error);
+}
+
+
+/*
+ * RTM_NEWLINK handler: parses the request and dispatches to create_link()
+ * when NLM_F_CREATE is set, to modify_link() otherwise.
+ */
+static int
+rtnl_handle_newlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct nl_parsed_link attrs = {};
+	struct nlattr_bmask bm;
+	int error;
+
+	error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+	nl_get_attrs_bmask_nlmsg(hdr, &ifmsg_parser, &bm);
+
+	if ((hdr->nlmsg_flags & NLM_F_CREATE) == 0)
+		return (modify_link(hdr, &attrs, &bm, nlp, npt));
+
+	return (create_link(hdr, &attrs, &bm, nlp, npt));
+}
+
+/*
+ * Sets the scope of an IPv6 link-local address in @sa to @ifindex.
+ * A NULL @sa, any other family and non-link-local addresses are left
+ * untouched. No-op when the kernel is built without INET6.
+ */
+static void
+set_scope6(struct sockaddr *sa, uint32_t ifindex)
+{
+#ifdef INET6
+	struct sockaddr_in6 *sin6;
+
+	if (sa == NULL || sa->sa_family != AF_INET6)
+		return;
+
+	sin6 = (struct sockaddr_in6 *)sa;
+	if (!IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
+		return;
+
+	in6_set_unicast_scopeid(&sin6->sin6_addr, ifindex);
+#endif
+}
+
+/*
+ * Verifies that an optional address attribute matches the expected family.
+ * A NULL @sa (attribute absent) always passes. On mismatch an error
+ * message naming @attr_name is reported and false is returned.
+ */
+static bool
+check_sa_family(const struct sockaddr *sa, int family, const char *attr_name,
+    struct nl_pstate *npt)
+{
+	if (sa != NULL && sa->sa_family != family) {
+		nlmsg_report_err_msg(npt, "wrong family for %s attribute: %d != %d",
+		    attr_name, family, sa->sa_family);
+		return (false);
+	}
+
+	return (true);
+}
+
+/*
+ * Parsed form of an RTM_*ADDR request, filled by ifa_parser.
+ */
+struct nl_parsed_ifa {
+ /* Copied from the ifaddrmsg header */
+ uint8_t ifa_family;
+ uint8_t ifa_prefixlen;
+ uint8_t ifa_scope;
+ uint32_t ifa_index;
+ /* Set from the 8-bit header field and from the IFA_FLAGS attribute */
+ uint32_t ifa_flags;
+ /* From the nested IFA_FREEBSD attribute (IFAF_VHID, IFAF_FLAGS) */
+ uint32_t ifaf_vhid;
+ uint32_t ifaf_flags;
+ /* IFA_ADDRESS, IFA_LOCAL and IFA_BROADCAST; NULL when not supplied */
+ struct sockaddr *ifa_address;
+ struct sockaddr *ifa_local;
+ struct sockaddr *ifa_broadcast;
+ /* IFA_CACHEINFO payload; NULL when not supplied */
+ struct ifa_cacheinfo *ifa_cacheinfo;
+ /* NOTE(review): not set by any parser entry in this part of the file */
+ struct sockaddr *f_ifa_addr;
+ struct sockaddr *f_ifa_dst;
+};
+
+/*
+ * Attribute callback for IFA_CACHEINFO: checks that the payload is exactly
+ * one struct ifa_cacheinfo and stores a pointer to it in @target.
+ * Returns EINVAL on a size mismatch.
+ */
+static int
+nlattr_get_cinfo(struct nlattr *nla, struct nl_pstate *npt,
+    const void *arg __unused, void *target)
+{
+	struct ifa_cacheinfo **pci = (struct ifa_cacheinfo **)target;
+
+	if (__predict_false(NLA_DATA_LEN(nla) != sizeof(struct ifa_cacheinfo))) {
+		NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not ifa_cacheinfo",
+		    nla->nla_type, NLA_DATA_LEN(nla));
+		return (EINVAL);
+	}
+
+	/* The pointer refers to the attribute payload itself; nothing is copied */
+	*pci = (struct ifa_cacheinfo *)NL_RTA_DATA(nla);
+	return (0);
+}
+
+/*
+ * Parser tables mapping an RTM_*ADDR request onto struct nl_parsed_ifa.
+ * _IN() is an offset into the wire ifaddrmsg header, _OUT() an offset into
+ * the parsed structure.
+ */
+#define _IN(_field) offsetof(struct ifaddrmsg, _field)
+#define _OUT(_field) offsetof(struct nl_parsed_ifa, _field)
+/* Fixed header fields; the 8-bit header ifa_flags is widened to 32 bits */
+static const struct nlfield_parser nlf_p_ifa[] = {
+ { .off_in = _IN(ifa_family), .off_out = _OUT(ifa_family), .cb = nlf_get_u8 },
+ { .off_in = _IN(ifa_prefixlen), .off_out = _OUT(ifa_prefixlen), .cb = nlf_get_u8 },
+ { .off_in = _IN(ifa_scope), .off_out = _OUT(ifa_scope), .cb = nlf_get_u8 },
+ { .off_in = _IN(ifa_flags), .off_out = _OUT(ifa_flags), .cb = nlf_get_u8_u32 },
+ { .off_in = _IN(ifa_index), .off_out = _OUT(ifa_index), .cb = nlf_get_u32 },
+};
+
+/* Attributes nested inside IFA_FREEBSD */
+static const struct nlattr_parser nla_p_ifa_fbsd[] = {
+ { .type = IFAF_VHID, .off = _OUT(ifaf_vhid), .cb = nlattr_get_uint32 },
+ { .type = IFAF_FLAGS, .off = _OUT(ifaf_flags), .cb = nlattr_get_uint32 },
+};
+NL_DECLARE_ATTR_PARSER(ifa_fbsd_parser, nla_p_ifa_fbsd);
+
+/*
+ * Top-level attributes. IFA_FLAGS stores into the same ifa_flags field as
+ * the header flags above.
+ */
+static const struct nlattr_parser nla_p_ifa[] = {
+ { .type = IFA_ADDRESS, .off = _OUT(ifa_address), .cb = nlattr_get_ip },
+ { .type = IFA_LOCAL, .off = _OUT(ifa_local), .cb = nlattr_get_ip },
+ { .type = IFA_BROADCAST, .off = _OUT(ifa_broadcast), .cb = nlattr_get_ip },
+ { .type = IFA_CACHEINFO, .off = _OUT(ifa_cacheinfo), .cb = nlattr_get_cinfo },
+ { .type = IFA_FLAGS, .off = _OUT(ifa_flags), .cb = nlattr_get_uint32 },
+ { .type = IFA_FREEBSD, .arg = &ifa_fbsd_parser, .cb = nlattr_get_nested },
+};
+#undef _IN
+#undef _OUT
+
+/*
+ * Post-parse hook for ifa_parser.
+ *
+ * Rejects the request when IFA_ADDRESS, IFA_LOCAL or IFA_BROADCAST carries
+ * an address whose family differs from the header ifa_family, then applies
+ * the interface index as scope to link-local IPv6 address/local values.
+ * Returns false (with an error message reported) on a family mismatch.
+ */
+static bool
+post_p_ifa(void *_attrs, struct nl_pstate *npt)
+{
+	struct nl_parsed_ifa *attrs = (struct nl_parsed_ifa *)_attrs;
+
+	if (!check_sa_family(attrs->ifa_address, attrs->ifa_family, "IFA_ADDRESS", npt))
+		return (false);
+	if (!check_sa_family(attrs->ifa_local, attrs->ifa_family, "IFA_LOCAL", npt))
+		return (false);
+	/* Report the attribute under its real name, IFA_BROADCAST */
+	if (!check_sa_family(attrs->ifa_broadcast, attrs->ifa_family, "IFA_BROADCAST", npt))
+		return (false);
+
+	set_scope6(attrs->ifa_address, attrs->ifa_index);
+	set_scope6(attrs->ifa_local, attrs->ifa_index);
+
+	return (true);
+}
+
+NL_DECLARE_PARSER_EXT(ifa_parser, struct ifaddrmsg, NULL, nlf_p_ifa, nla_p_ifa, post_p_ifa);
+
+
+/*
+
+{ifa_family=AF_INET, ifa_prefixlen=8, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_HOST, ifa_index=if_nametoindex("lo")},
+ [
+ {{nla_len=8, nla_type=IFA_ADDRESS}, inet_addr("127.0.0.1")},
+ {{nla_len=8, nla_type=IFA_LOCAL}, inet_addr("127.0.0.1")},
+ {{nla_len=7, nla_type=IFA_LABEL}, "lo"},
+ {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT},
+ {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=3619, tstamp=3619}}]},
+---
+
+{{len=72, type=RTM_NEWADDR, flags=NLM_F_MULTI, seq=1642191126, pid=566735},
+ {ifa_family=AF_INET6, ifa_prefixlen=96, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_UNIVERSE, ifa_index=if_nametoindex("virbr0")},
+ [
+ {{nla_len=20, nla_type=IFA_ADDRESS}, inet_pton(AF_INET6, "2a01:4f8:13a:70c:ffff::1")},
+ {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=4283, tstamp=4283}},
+ {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT}]},
+*/
+
+/*
+ * Derives the netlink address scope of @ifa: RT_SCOPE_HOST for loopback
+ * addresses, RT_SCOPE_LINK for link-local ones and RT_SCOPE_UNIVERSE for
+ * everything else, including families other than INET/INET6.
+ */
+static uint8_t
+ifa_get_scope(const struct ifaddr *ifa)
+{
+	const struct sockaddr *sa = ifa->ifa_addr;
+
+	switch (sa->sa_family) {
+#ifdef INET
+	case AF_INET:
+	{
+		/* The classification macros expect host byte order */
+		uint32_t haddr;
+
+		haddr = ntohl(((const struct sockaddr_in *)sa)->sin_addr.s_addr);
+		if (IN_LOOPBACK(haddr))
+			return (RT_SCOPE_HOST);
+		if (IN_LINKLOCAL(haddr))
+			return (RT_SCOPE_LINK);
+		break;
+	}
+#endif
+#ifdef INET6
+	case AF_INET6:
+	{
+		const struct in6_addr *a6;
+
+		a6 = &((const struct sockaddr_in6 *)sa)->sin6_addr;
+		if (IN6_IS_ADDR_LOOPBACK(a6))
+			return (RT_SCOPE_HOST);
+		if (IN6_IS_ADDR_LINKLOCAL(a6))
+			return (RT_SCOPE_LINK);
+		break;
+	}
+#endif
+	}
+
+	return (RT_SCOPE_UNIVERSE);
+}
+
+#ifdef INET6
+/* Returns the number of bits set across the four 32-bit words of @addr. */
+static uint8_t
+inet6_get_plen(const struct in6_addr *addr)
+{
+	int nbits = 0;
+
+	for (int i = 0; i < 4; i++)
+		nbits += bitcount32(addr->s6_addr32[i]);
+
+	return (nbits);
+}
+#endif
+
+/*
+ * Returns the prefix length encoded by netmask @sa, computed as the number
+ * of set bits. Families other than INET/INET6 yield 0.
+ */
+static uint8_t
+get_sa_plen(const struct sockaddr *sa)
+{
+	uint8_t plen = 0;
+
+	switch (sa->sa_family) {
+#ifdef INET
+	case AF_INET:
+		plen = bitcount32(((const struct sockaddr_in *)sa)->sin_addr.s_addr);
+		break;
+#endif
+#ifdef INET6
+	case AF_INET6:
+		plen = inet6_get_plen(&((const struct sockaddr_in6 *)sa)->sin6_addr);
+		break;
+#endif
+	}
+
+	return (plen);
+}
+
+#ifdef INET6
+/*
+ * Translates IN6_IFF_* address flags into netlink IFA_F_* flags.
+ * @flags: IN6_IFF_* bitmask
+ *
+ * An address that is neither autoconfigured nor temporary is reported as
+ * IFA_F_PERMANENT. Returns the IFA_F_* bitmask.
+ */
+static uint32_t
+in6_flags_to_nl(uint32_t flags)
+{
+	uint32_t nl_flags = 0;
+
+	if (flags & IN6_IFF_TEMPORARY)
+		nl_flags |= IFA_F_TEMPORARY;
+	if (flags & IN6_IFF_NODAD)
+		nl_flags |= IFA_F_NODAD;
+	if (flags & IN6_IFF_DEPRECATED)
+		nl_flags |= IFA_F_DEPRECATED;
+	if (flags & IN6_IFF_TENTATIVE)
+		nl_flags |= IFA_F_TENTATIVE;
+	/*
+	 * The two flags below go into the result, not into the input:
+	 * setting them on 'flags' dropped them from the return value and
+	 * polluted the IN6_IFF_* bits tested afterwards.
+	 */
+	if ((flags & (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY)) == 0)
+		nl_flags |= IFA_F_PERMANENT;
+	if (flags & IN6_IFF_DUPLICATED)
+		nl_flags |= IFA_F_DADFAILED;
+	return (nl_flags);
+}
+
+/*
+ * Translates netlink IFA_F_* flags into IN6_IFF_* address flags.
+ * Flags without an entry in the table below are ignored.
+ */
+static uint32_t
+nl_flags_to_in6(uint32_t flags)
+{
+	static const struct {
+		uint32_t	nl_flag;
+		uint32_t	in6_flag;
+	} flag_map[] = {
+		{ IFA_F_TEMPORARY,	IN6_IFF_TEMPORARY },
+		{ IFA_F_NODAD,		IN6_IFF_NODAD },
+		{ IFA_F_DEPRECATED,	IN6_IFF_DEPRECATED },
+		{ IFA_F_TENTATIVE,	IN6_IFF_TENTATIVE },
+		{ IFA_F_DADFAILED,	IN6_IFF_DUPLICATED },
+	};
+	uint32_t in6_flags = 0;
+
+	for (size_t i = 0; i < nitems(flag_map); i++) {
+		if ((flags & flag_map[i].nl_flag) != 0)
+			in6_flags |= flag_map[i].in6_flag;
+	}
+
+	return (in6_flags);
+}
+
+/*
+ * Emits IFA_CACHEINFO for an IPv6 address: preferred/valid lifetimes plus
+ * creation and update timestamps, each multiplied by 1000.
+ */
+static void
+export_cache_info6(struct nl_writer *nw, const struct in6_ifaddr *ia)
+{
+	struct ifa_cacheinfo ci = {};
+
+	ci.ifa_prefered = ia->ia6_lifetime.ia6t_pltime;
+	ci.ifa_valid = ia->ia6_lifetime.ia6t_vltime;
+	ci.cstamp = ia->ia6_createtime * 1000;
+	ci.tstamp = ia->ia6_updatetime * 1000;
+
+	nlattr_add(nw, IFA_CACHEINFO, sizeof(ci), &ci);
+}
+#endif
+
+/*
+ * Emits IFA_CACHEINFO for @ifa when its family provides the data.
+ * Only INET6 addresses are handled.
+ */
+static void
+export_cache_info(struct nl_writer *nw, struct ifaddr *ifa)
+{
+#ifdef INET6
+	if (ifa->ifa_addr->sa_family == AF_INET6)
+		export_cache_info6(nw, (struct in6_ifaddr *)ifa);
+#endif
+}
+
+/*
+ * {'attrs': [('IFA_ADDRESS', '12.0.0.1'),
+ ('IFA_LOCAL', '12.0.0.1'),
+ ('IFA_LABEL', 'eth10'),
+ ('IFA_FLAGS', 128),
+ ('IFA_CACHEINFO', {'ifa_preferred': 4294967295, 'ifa_valid': 4294967295, 'cstamp': 63745746, 'tstamp': 63745746})],
+ */
+/*
+ * Writes one address message describing @ifa of interface @ifp.
+ * @nw: message writer
+ * @hdr: template header (message type, seq, pid, flags)
+ *
+ * Returns true if the message was completed with nlmsg_end(). Returns false
+ * after aborting the message when the reply header could not be started or
+ * nlmsg_end() failed.
+ */
+static bool
+dump_iface_addr(struct nl_writer *nw, if_t ifp, struct ifaddr *ifa,
+ const struct nlmsghdr *hdr)
+{
+ struct ifaddrmsg *ifamsg;
+ struct sockaddr *sa = ifa->ifa_addr;
+ struct sockaddr *sa_dst = ifa->ifa_dstaddr;
+
+ NL_LOG(LOG_DEBUG3, "dumping ifa %p type %s(%d) for interface %s",
+ ifa, rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp));
+
+ if (!nlmsg_reply(nw, hdr, sizeof(struct ifaddrmsg)))
+ goto enomem;
+
+ /* Fixed-size ifaddrmsg header */
+ ifamsg = nlmsg_reserve_object(nw, struct ifaddrmsg);
+ ifamsg->ifa_family = sa->sa_family;
+ ifamsg->ifa_prefixlen = get_sa_plen(ifa->ifa_netmask);
+ ifamsg->ifa_flags = 0; // ifa_flags is useless
+ ifamsg->ifa_scope = ifa_get_scope(ifa);
+ ifamsg->ifa_index = if_getindex(ifp);
+
+ if ((if_getflags(ifp) & IFF_POINTOPOINT) && sa_dst != NULL && sa_dst->sa_family != 0) {
+ /* P2P interface may have IPv6 LL with no dst address */
+ /* With a peer: IFA_ADDRESS is the peer, IFA_LOCAL the own address */
+ dump_sa(nw, IFA_ADDRESS, sa_dst);
+ dump_sa(nw, IFA_LOCAL, sa);
+ } else {
+ dump_sa(nw, IFA_ADDRESS, sa);
+#ifdef INET
+ /*
+ * In most cases, IFA_ADDRESS == IFA_LOCAL
+ * Skip IFA_LOCAL for anything except INET
+ */
+ if (sa->sa_family == AF_INET)
+ dump_sa(nw, IFA_LOCAL, sa);
+#endif
+ }
+ if (if_getflags(ifp) & IFF_BROADCAST)
+ dump_sa(nw, IFA_BROADCAST, ifa->ifa_broadaddr);
+
+ nlattr_add_string(nw, IFA_LABEL, if_name(ifp));
+
+ /* IFA_FLAGS is always emitted; it stays 0 for non-INET6 addresses */
+ uint32_t nl_ifa_flags = 0;
+#ifdef INET6
+ if (sa->sa_family == AF_INET6) {
+ struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
+ nl_ifa_flags = in6_flags_to_nl(ia->ia6_flags);
+ }
+#endif
+ nlattr_add_u32(nw, IFA_FLAGS, nl_ifa_flags);
+
+ export_cache_info(nw, ifa);
+
+ /* Store FreeBSD-specific attributes */
+ int off = nlattr_add_nested(nw, IFA_FREEBSD);
+ if (off != 0) {
+ /* VHID is exported only when the carp hook is available */
+ if (ifa->ifa_carp != NULL && carp_get_vhid_p != NULL) {
+ uint32_t vhid = (uint32_t)(*carp_get_vhid_p)(ifa);
+ nlattr_add_u32(nw, IFAF_VHID, vhid);
+ }
+#ifdef INET6
+ if (sa->sa_family == AF_INET6) {
+ /* Raw, untranslated ia6_flags */
+ uint32_t ifa_flags = ((struct in6_ifaddr *)ifa)->ia6_flags;
+
+ nlattr_add_u32(nw, IFAF_FLAGS, ifa_flags);
+ }
+#endif
+
+ nlattr_set_len(nw, off);
+ }
+
+ if (nlmsg_end(nw))
+ return (true);
+enomem:
+ NL_LOG(LOG_DEBUG, "Failed to dump ifa type %s(%d) for interface %s",
+ rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp));
+ nlmsg_abort(nw);
+ return (false);
+}
+
+/*
+ * Dumps every address of @ifp that passes the walk filters: the requested
+ * family (0 means any), never AF_LINK, and visible to the requesting
+ * credentials according to prison_if().
+ * Updates wa->count / wa->dumped. Returns ENOMEM as soon as one address
+ * cannot be written, 0 otherwise.
+ */
+static int
+dump_iface_addrs(struct netlink_walkargs *wa, if_t ifp)
+{
+	struct ifa_iter it;
+	struct ifaddr *ifa;
+	int error = 0;
+
+	ifa = ifa_iter_start(ifp, &it);
+	while (ifa != NULL) {
+		const struct sockaddr *sa = ifa->ifa_addr;
+		bool wanted;
+
+		wanted = (wa->family == 0 || wa->family == sa->sa_family) &&
+		    sa->sa_family != AF_LINK &&
+		    prison_if(wa->cred, ifa->ifa_addr) == 0;
+		if (wanted) {
+			wa->count++;
+			if (!dump_iface_addr(wa->nw, ifp, ifa, &wa->hdr)) {
+				error = ENOMEM;
+				break;
+			}
+			wa->dumped++;
+		}
+		ifa = ifa_iter_next(&it);
+	}
+	ifa_iter_finish(&it);
+
+	return (error);
+}
+
+/*
+ * RTM_GETADDR handler: dumps the addresses of one interface (when
+ * ifa_index is set) or of all interfaces as a multipart reply, optionally
+ * restricted to the family given in the request header.
+ * Returns ENOENT for an unknown index and ENOMEM when the dump cannot be
+ * written or finalized.
+ */
+static int
+rtnl_handle_getaddr(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct nl_parsed_ifa attrs = {};
+	int error;
+
+	error = nl_parse_nlmsg(hdr, &ifa_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	struct netlink_walkargs wa = {
+		.so = nlp,
+		.nw = npt->nw,
+		.cred = nlp_get_cred(nlp),
+		.family = attrs.ifa_family,
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI,
+		.hdr.nlmsg_type = NL_RTM_NEWADDR,
+	};
+
+	NL_LOG(LOG_DEBUG2, "Start dump");
+
+	if (attrs.ifa_index == 0) {
+		/* No index given: walk every interface until one fails */
+		struct if_iter it;
+		if_t ifp = if_iter_start(&it);
+
+		while (ifp != NULL) {
+			error = dump_iface_addrs(&wa, ifp);
+			if (error != 0)
+				break;
+			ifp = if_iter_next(&it);
+		}
+		if_iter_finish(&it);
+	} else {
+		if_t ifp = ifnet_byindex(attrs.ifa_index);
+
+		if (ifp != NULL)
+			error = dump_iface_addrs(&wa, ifp);
+		else
+			error = ENOENT;
+	}
+
+	NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa.count, wa.dumped);
+
+	if (!nlmsg_end_dump(wa.nw, error, &wa.hdr)) {
+		NL_LOG(LOG_DEBUG, "Unable to finalize the dump");
+		return (ENOMEM);
+	}
+
+	return (error);
+}
+
+#ifdef INET
+/*
+ * Adds an IPv4 address described by @attrs to @ifp via SIOCAIFADDR.
+ *
+ * On point-to-point interfaces both IFA_LOCAL (own address) and
+ * IFA_ADDRESS (peer) are required. Otherwise IFA_LOCAL, or IFA_ADDRESS as
+ * a fallback, is the interface address, and a broadcast address is
+ * generated from the prefix length when the interface is IFF_BROADCAST
+ * and IFA_BROADCAST is missing.
+ *
+ * Returns EINVAL for an invalid prefix length or missing addresses, ENOMEM
+ * if the generated broadcast sockaddr cannot be allocated, otherwise the
+ * result of in_control_ioctl().
+ */
+static int
+handle_newaddr_inet(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs,
+    if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	int plen = attrs->ifa_prefixlen;
+	int if_flags = if_getflags(ifp);
+	struct sockaddr_in *addr, *dst;
+
+	if (plen > 32) {
+		nlmsg_report_err_msg(npt, "invalid ifa_prefixlen");
+		return (EINVAL);
+	}
+
+	/*
+	 * Netmask in network byte order. The shift is done on an unsigned
+	 * value: for plen == 1 a signed "1 << 31" would be undefined.
+	 * plen == 0 is special-cased to avoid a shift by the full type width.
+	 */
+	const uint32_t s_mask = htonl(plen ? ~((1U << (32 - plen)) - 1) : 0);
+
+	if (if_flags & IFF_POINTOPOINT) {
+		/*
+		 * Only P2P IFAs are allowed by the implementation.
+		 */
+		if (attrs->ifa_address == NULL || attrs->ifa_local == NULL) {
+			nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS");
+			return (EINVAL);
+		}
+		addr = (struct sockaddr_in *)attrs->ifa_local;
+		dst = (struct sockaddr_in *)attrs->ifa_address;
+	} else {
+		/*
+		 * Map the Netlink attributes to FreeBSD ifa layout.
+		 * If only IFA_ADDRESS or IFA_LOCAL is set OR
+		 * both are set to the same value => ifa is not p2p
+		 * and the attribute value contains interface address.
+		 *
+		 * Otherwise (both IFA_ADDRESS and IFA_LOCAL are set and
+		 * different), IFA_LOCAL contains an interface address and
+		 * IFA_ADDRESS contains peer address.
+		 */
+		addr = (struct sockaddr_in *)attrs->ifa_local;
+		if (addr == NULL)
+			addr = (struct sockaddr_in *)attrs->ifa_address;
+
+		if (addr == NULL) {
+			nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS");
+			return (EINVAL);
+		}
+
+		/* Generate broadcast address if not set */
+		if ((if_flags & IFF_BROADCAST) && attrs->ifa_broadcast == NULL) {
+			uint32_t s_baddr;
+			struct sockaddr_in *sin_brd;
+
+			if (plen == 31)
+				s_baddr = INADDR_BROADCAST; /* RFC 3021 */
+			else
+				s_baddr = addr->sin_addr.s_addr | ~s_mask;
+
+			sin_brd = (struct sockaddr_in *)npt_alloc(npt, sizeof(*sin_brd));
+			if (sin_brd == NULL)
+				return (ENOMEM);
+			sin_brd->sin_family = AF_INET;
+			sin_brd->sin_len = sizeof(*sin_brd);
+			sin_brd->sin_addr.s_addr = s_baddr;
+			attrs->ifa_broadcast = (struct sockaddr *)sin_brd;
+		}
+		/* For non-P2P interfaces the "destination" slot carries the broadcast */
+		dst = (struct sockaddr_in *)attrs->ifa_broadcast;
+	}
+
+	struct sockaddr_in mask = {
+		.sin_len = sizeof(struct sockaddr_in),
+		.sin_family = AF_INET,
+		.sin_addr.s_addr = s_mask,
+	};
+	struct in_aliasreq req = {
+		.ifra_addr = *addr,
+		.ifra_mask = mask,
+		.ifra_vhid = attrs->ifaf_vhid,
+	};
+	if (dst != NULL)
+		req.ifra_dstaddr = *dst;
+
+	return (in_control_ioctl(SIOCAIFADDR, &req, ifp, nlp_get_cred(nlp)));
+}
+
+/*
+ * Removes an IPv4 address from @ifp via SIOCDIFADDR. The address is taken
+ * from IFA_LOCAL, or from IFA_ADDRESS when IFA_LOCAL is absent.
+ * Returns EINVAL when neither attribute is present.
+ */
+static int
+handle_deladdr_inet(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs,
+    if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct sockaddr *target;
+
+	target = (attrs->ifa_local != NULL) ? attrs->ifa_local : attrs->ifa_address;
+	if (target == NULL) {
+		nlmsg_report_err_msg(npt, "empty IFA_ADDRESS/IFA_LOCAL");
+		return (EINVAL);
+	}
+
+	struct ifreq req = { .ifr_addr = *target };
+
+	return (in_control_ioctl(SIOCDIFADDR, &req, ifp, nlp_get_cred(nlp)));
+}
+#endif
+
+#ifdef INET6
+/*
+ * Adds an IPv6 address described by @attrs to @ifp via SIOCAIFADDR_IN6.
+ *
+ * IFA_LOCAL is the interface address and IFA_ADDRESS the peer. When only
+ * IFA_ADDRESS is given, or both carry the same address, the request is
+ * treated as a plain (non-P2P) address. Lifetimes come from IFA_CACHEINFO
+ * when present and default to 0.
+ *
+ * Returns EINVAL for an invalid prefix length or a missing address,
+ * otherwise the result of in6_control_ioctl().
+ */
+static int
+handle_newaddr_inet6(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs,
+    if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct sockaddr_in6 *local, *peer;
+
+	if (attrs->ifa_prefixlen > 128) {
+		nlmsg_report_err_msg(npt, "invalid ifa_prefixlen");
+		return (EINVAL);
+	}
+
+	/*
+	 * In IPv6 implementation, adding non-P2P address to the P2P interface
+	 * is allowed.
+	 */
+	local = (struct sockaddr_in6 *)(attrs->ifa_local);
+	peer = (struct sockaddr_in6 *)(attrs->ifa_address);
+
+	if (local == NULL) {
+		local = peer;
+		peer = NULL;
+	} else if (peer != NULL &&
+	    IN6_ARE_ADDR_EQUAL(&local->sin6_addr, &peer->sin6_addr)) {
+		/*
+		 * Sometimes Netlink users fill in both attributes
+		 * with the same address. It still means "non-p2p".
+		 */
+		peer = NULL;
+	}
+
+	if (local == NULL) {
+		nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS");
+		return (EINVAL);
+	}
+
+	uint32_t pltime = 0, vltime = 0;
+	if (attrs->ifa_cacheinfo != NULL) {
+		pltime = attrs->ifa_cacheinfo->ifa_prefered;
+		vltime = attrs->ifa_cacheinfo->ifa_valid;
+	}
+
+	struct in6_aliasreq req = {
+		.ifra_addr = *local,
+		.ifra_prefixmask = {
+			.sin6_len = sizeof(struct sockaddr_in6),
+			.sin6_family = AF_INET6,
+		},
+		.ifra_flags = nl_flags_to_in6(attrs->ifa_flags) | attrs->ifaf_flags,
+		.ifra_lifetime = { .ia6t_vltime = vltime, .ia6t_pltime = pltime },
+		.ifra_vhid = attrs->ifaf_vhid,
+	};
+	/* Fill the netmask bits in place */
+	ip6_writemask(&req.ifra_prefixmask.sin6_addr, attrs->ifa_prefixlen);
+	if (peer != NULL)
+		req.ifra_dstaddr = *peer;
+
+	return (in6_control_ioctl(SIOCAIFADDR_IN6, &req, ifp, nlp_get_cred(nlp)));
+}
+
+/*
+ * Removes an IPv6 address from @ifp via SIOCDIFADDR_IN6. The address is
+ * taken from IFA_LOCAL, or from IFA_ADDRESS when IFA_LOCAL is absent.
+ * Returns EINVAL when neither attribute is present.
+ */
+static int
+handle_deladdr_inet6(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs,
+    if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct sockaddr_in6 *target;
+
+	target = (struct sockaddr_in6 *)attrs->ifa_local;
+	if (target == NULL)
+		target = (struct sockaddr_in6 *)(attrs->ifa_address);
+	if (target == NULL) {
+		nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS");
+		return (EINVAL);
+	}
+
+	struct in6_ifreq req = { .ifr_addr = *target };
+
+	return (in6_control_ioctl(SIOCDIFADDR_IN6, &req, ifp, nlp_get_cred(nlp)));
+}
+#endif
+
+
+/*
+ * Shared RTM_NEWADDR / RTM_DELADDR handler.
+ *
+ * Resolves ifa_index to an interface and dispatches to the per-family
+ * add/delete helper, selected by ifa_family and by the message type.
+ * Returns ENOENT for an unknown interface index, EAFNOSUPPORT for a family
+ * not compiled in or not handled, otherwise the helper's result.
+ */
+static int
+rtnl_handle_addr(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+ struct epoch_tracker et;
+ int error;
+
+ struct nl_parsed_ifa attrs = {};
+ error = nl_parse_nlmsg(hdr, &ifa_parser, npt, &attrs);
+ if (error != 0)
+ return (error);
+
+ NET_EPOCH_ENTER(et);
+ if_t ifp = ifnet_byindex_ref(attrs.ifa_index);
+ NET_EPOCH_EXIT(et);
+
+ if (ifp == NULL) {
+ nlmsg_report_err_msg(npt, "Unable to find interface with index %u",
+ attrs.ifa_index);
+ return (ENOENT);
+ }
+ /* Snapshot of the flags before the change; compared again below */
+ int if_flags = if_getflags(ifp);
+
+#if defined(INET) || defined(INET6)
+ bool new = hdr->nlmsg_type == NL_RTM_NEWADDR;
+#endif
+
+ /*
+ * TODO: Properly handle NLM_F_CREATE / NLM_F_EXCL.
+ * The current ioctl-based KPI always does an implicit create-or-replace.
+ * It is not possible to specify fine-grained options.
+ */
+
+ switch (attrs.ifa_family) {
+#ifdef INET
+ case AF_INET:
+ if (new)
+ error = handle_newaddr_inet(hdr, &attrs, ifp, nlp, npt);
+ else
+ error = handle_deladdr_inet(hdr, &attrs, ifp, nlp, npt);
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ if (new)
+ error = handle_newaddr_inet6(hdr, &attrs, ifp, nlp, npt);
+ else
+ error = handle_deladdr_inet6(hdr, &attrs, ifp, nlp, npt);
+ break;
+#endif
+ default:
+ error = EAFNOSUPPORT;
+ }
+
+ /*
+ * If the operation succeeded and IFF_UP was clear before it but is
+ * set now, call if_up().
+ */
+ if (error == 0 && !(if_flags & IFF_UP) && (if_getflags(ifp) & IFF_UP))
+ if_up(ifp);
+
+ if_rele(ifp);
+
+ return (error);
+}
+
+
+/*
+ * rt_addrmsg event handler: sends an RTM_NEWADDR or RTM_DELADDR
+ * notification for @ifa to the IPv4 or IPv6 address group.
+ * Addresses of any other family are ignored.
+ */
+static void
+rtnl_handle_ifaddr(void *arg __unused, struct ifaddr *ifa, int cmd)
+{
+	struct nl_writer nw;
+	uint32_t group;
+
+	switch (ifa->ifa_addr->sa_family) {
+#ifdef INET
+	case AF_INET:
+		group = RTNLGRP_IPV4_IFADDR;
+		break;
+#endif
+#ifdef INET6
+	case AF_INET6:
+		group = RTNLGRP_IPV6_IFADDR;
+		break;
+#endif
+	default:
+		NL_LOG(LOG_DEBUG2, "ifa notification for unknown AF: %d",
+		    ifa->ifa_addr->sa_family);
+		return;
+	}
+
+	if (!nl_writer_group(&nw, NLMSG_LARGE, NETLINK_ROUTE, group, 0,
+	    false)) {
+		NL_LOG(LOG_DEBUG, "error allocating group writer");
+		return;
+	}
+
+	/* Anything other than RTM_DELETE is reported as a new address */
+	struct nlmsghdr hdr = {
+		.nlmsg_type = (cmd == RTM_DELETE) ? NL_RTM_DELADDR : NL_RTM_NEWADDR,
+	};
+
+	dump_iface_addr(&nw, ifa->ifa_ifp, ifa, &hdr);
+	nlmsg_flush(&nw);
+}
+
+/*
+ * Sends a link notification of type @nlmsg_type for @ifp to RTNLGRP_LINK.
+ * @if_flags_mask is reported as the changed-flags mask of the message.
+ */
+static void
+rtnl_handle_ifevent(if_t ifp, int nlmsg_type, int if_flags_mask)
+{
+	struct nl_writer nw;
+	struct nlmsghdr hdr = {};
+
+	hdr.nlmsg_type = nlmsg_type;
+
+	if (nl_writer_group(&nw, NLMSG_LARGE, NETLINK_ROUTE, RTNLGRP_LINK, 0,
+	    false)) {
+		dump_iface(&nw, ifp, &hdr, if_flags_mask);
+		nlmsg_flush(&nw);
+		return;
+	}
+
+	NL_LOG(LOG_DEBUG, "error allocating group writer");
+}
+
+/* ifnet_arrival_event handler: announces @ifp with an RTM_NEWLINK message. */
+static void
+rtnl_handle_ifattach(void *arg, if_t ifp)
+{
+ NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp));
+ rtnl_handle_ifevent(ifp, NL_RTM_NEWLINK, 0);
+}
+
+/* ifnet_departure_event handler: announces removal of @ifp with RTM_DELLINK. */
+static void
+rtnl_handle_ifdetach(void *arg, if_t ifp)
+{
+ NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp));
+ rtnl_handle_ifevent(ifp, NL_RTM_DELLINK, 0);
+}
+
+/*
+ * ifnet_link_event handler: re-announces @ifp with RTM_NEWLINK.
+ * The link_state argument is unused; the state is read again while
+ * the message is built.
+ */
+static void
+rtnl_handle_iflink(void *arg, if_t ifp, int link_state __unused)
+{
+ NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp));
+ rtnl_handle_ifevent(ifp, NL_RTM_NEWLINK, 0);
+}
+
+/*
+ * Non-static entry point: announces @ifp with RTM_NEWLINK, passing
+ * @if_flags_mask through as the changed-flags mask.
+ */
+void
+rtnl_handle_ifnet_event(if_t ifp, int if_flags_mask)
+{
+ NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp));
+ rtnl_handle_ifevent(ifp, NL_RTM_NEWLINK, if_flags_mask);
+}
+
+/*
+ * Link and address message handlers, registered by rtnl_ifaces_init().
+ * RTM_NEWADDR and RTM_DELADDR share rtnl_handle_addr(), which tells them
+ * apart by the message type. The GET handlers carry no .priv entry.
+ */
+static const struct rtnl_cmd_handler cmd_handlers[] = {
+ {
+ .cmd = NL_RTM_GETLINK,
+ .name = "RTM_GETLINK",
+ .cb = &rtnl_handle_getlink,
+ .flags = RTNL_F_NOEPOCH | RTNL_F_ALLOW_NONVNET_JAIL,
+ },
+ {
+ .cmd = NL_RTM_DELLINK,
+ .name = "RTM_DELLINK",
+ .cb = &rtnl_handle_dellink,
+ .priv = PRIV_NET_IFDESTROY,
+ .flags = RTNL_F_NOEPOCH,
+ },
+ {
+ .cmd = NL_RTM_NEWLINK,
+ .name = "RTM_NEWLINK",
+ .cb = &rtnl_handle_newlink,
+ .priv = PRIV_NET_IFCREATE,
+ .flags = RTNL_F_NOEPOCH,
+ },
+ {
+ /* The only handler here without RTNL_F_NOEPOCH */
+ .cmd = NL_RTM_GETADDR,
+ .name = "RTM_GETADDR",
+ .cb = &rtnl_handle_getaddr,
+ .flags = RTNL_F_ALLOW_NONVNET_JAIL,
+ },
+ {
+ .cmd = NL_RTM_NEWADDR,
+ .name = "RTM_NEWADDR",
+ .cb = &rtnl_handle_addr,
+ .priv = PRIV_NET_ADDIFADDR,
+ .flags = RTNL_F_NOEPOCH,
+ },
+ {
+ .cmd = NL_RTM_DELADDR,
+ .name = "RTM_DELADDR",
+ .cb = &rtnl_handle_addr,
+ .priv = PRIV_NET_DELIFADDR,
+ .flags = RTNL_F_NOEPOCH,
+ },
+};
+
+/* Parsers handed to NL_VERIFY_PARSERS() in rtnl_ifaces_init() */
+static const struct nlhdr_parser *all_parsers[] = {
+ &ifmsg_parser, &ifa_parser, &ifa_fbsd_parser,
+};
+
+/* Inserts @cloner at the head of nl_cloners under rtnl_cloner_lock. */
+void
+rtnl_iface_add_cloner(struct nl_cloner *cloner)
+{
+ sx_xlock(&rtnl_cloner_lock);
+ SLIST_INSERT_HEAD(&nl_cloners, cloner, next);
+ sx_xunlock(&rtnl_cloner_lock);
+}
+
+/* Removes @cloner from nl_cloners under rtnl_cloner_lock. */
+void
+rtnl_iface_del_cloner(struct nl_cloner *cloner)
+{
+ sx_xlock(&rtnl_cloner_lock);
+ SLIST_REMOVE(&nl_cloners, cloner, nl_cloner, next);
+ sx_xunlock(&rtnl_cloner_lock);
+}
+
+/*
+ * Subsystem setup: hooks the interface arrival/departure, address and
+ * link-state events, verifies the parser tables and registers the
+ * RTM_*LINK / RTM_*ADDR handlers. The event tags saved here are used by
+ * rtnl_ifaces_destroy().
+ */
+void
+rtnl_ifaces_init(void)
+{
+ ifattach_event = EVENTHANDLER_REGISTER(
+ ifnet_arrival_event, rtnl_handle_ifattach, NULL,
+ EVENTHANDLER_PRI_ANY);
+ ifdetach_event = EVENTHANDLER_REGISTER(
+ ifnet_departure_event, rtnl_handle_ifdetach, NULL,
+ EVENTHANDLER_PRI_ANY);
+ ifaddr_event = EVENTHANDLER_REGISTER(
+ rt_addrmsg, rtnl_handle_ifaddr, NULL,
+ EVENTHANDLER_PRI_ANY);
+ iflink_event = EVENTHANDLER_REGISTER(
+ ifnet_link_event, rtnl_handle_iflink, NULL,
+ EVENTHANDLER_PRI_ANY);
+ NL_VERIFY_PARSERS(all_parsers);
+ rtnl_register_messages(cmd_handlers, nitems(cmd_handlers));
+}
+
+/*
+ * Subsystem teardown: deregisters the four event handlers installed by
+ * rtnl_ifaces_init(). The message handlers are not unregistered here.
+ */
+void
+rtnl_ifaces_destroy(void)
+{
+ EVENTHANDLER_DEREGISTER(ifnet_arrival_event, ifattach_event);
+ EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_event);
+ EVENTHANDLER_DEREGISTER(rt_addrmsg, ifaddr_event);
+ EVENTHANDLER_DEREGISTER(ifnet_link_event, iflink_event);
+}
diff --git a/sys/netlink/route/iface_drivers.c b/sys/netlink/route/iface_drivers.c
new file mode 100644
index 000000000000..4bf913d9c978
--- /dev/null
+++ b/sys/netlink/route/iface_drivers.c
@@ -0,0 +1,145 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/syslog.h>
+#include <sys/socketvar.h>
+
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_media.h>
+#include <net/if_var.h>
+#include <net/if_clone.h>
+#include <net/if_vlan_var.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_route.h>
+#include <netlink/route/route_var.h>
+
+#include <netinet6/scope6_var.h> /* scope deembedding */
+
+#define DEBUG_MOD_NAME nl_iface_drivers
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_INFO);
+
+/*
+ * Generic modification interface handler.
+ * Responsible for changing network stack interface attributes
+ * such as state, mtu or description.
+ *
+ * Requested changes are applied in this order: description
+ * (ifla_ifalias), administrative down (IFF_UP set in ifi_change and
+ * cleared in ifi_flags), MTU (ifla_mtu > 0) and promiscuous mode
+ * (IFF_PROMISC set in ifi_change).  Processing stops at the first
+ * failure; changes applied before it are not rolled back.
+ * The @bm attribute bitmask is not consulted by this handler.
+ *
+ * Returns 0 on success, EPERM when the socket lacks the privilege
+ * needed for the description or MTU change, or the error reported by
+ * the SIOCSIFMTU ioctl or by ifpromisc().  An error message is attached
+ * to the reply via nlmsg_report_err_msg() on every failure.
+ */
+int
+_nl_modify_ifp_generic(struct ifnet *ifp, struct nl_parsed_link *lattrs,
+    const struct nlattr_bmask *bm, struct nl_pstate *npt)
+{
+	int error;
+
+	if (lattrs->ifla_ifalias != NULL) {
+		if (nlp_has_priv(npt->nlp, PRIV_NET_SETIFDESCR)) {
+			/* Copy the description including its terminating NUL */
+			int len = strlen(lattrs->ifla_ifalias) + 1;
+			char *buf = if_allocdescr(len, M_WAITOK);
+
+			memcpy(buf, lattrs->ifla_ifalias, len);
+			if_setdescr(ifp, buf);
+			if_setlastchange(ifp);
+		} else {
+			nlmsg_report_err_msg(npt, "Not enough privileges to set descr");
+			return (EPERM);
+		}
+	}
+
+	if ((lattrs->ifi_change & IFF_UP) && (lattrs->ifi_flags & IFF_UP) == 0) {
+		/* Request to down the interface */
+		if_down(ifp);
+	}
+
+	if (lattrs->ifla_mtu > 0) {
+		if (nlp_has_priv(npt->nlp, PRIV_NET_SETIFMTU)) {
+			struct ifreq ifr = { .ifr_mtu = lattrs->ifla_mtu };
+
+			error = ifhwioctl(SIOCSIFMTU, ifp, (char *)&ifr, curthread);
+			if (error != 0) {
+				/* Do not report success when the MTU was rejected */
+				nlmsg_report_err_msg(npt, "Failed to set mtu");
+				return (error);
+			}
+		} else {
+			nlmsg_report_err_msg(npt, "Not enough privileges to set mtu");
+			return (EPERM);
+		}
+	}
+
+	if (lattrs->ifi_change & IFF_PROMISC) {
+		error = ifpromisc(ifp, lattrs->ifi_flags & IFF_PROMISC);
+		if (error != 0) {
+			nlmsg_report_err_msg(npt, "unable to set promisc");
+			return (error);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Saves the resulting ifindex and ifname to report them
+ * to userland along with the operation result.
+ * NLA format:
+ * NLMSGERR_ATTR_COOKIE(nested)
+ *  IFLA_NEW_IFINDEX(u32)
+ *  IFLA_IFNAME(string)
+ *
+ * The cookie is built in memory obtained from npt_alloc() and passed
+ * to nlmsg_report_cookie().  If the allocation fails no cookie is
+ * stored: the function returns void and cannot report the failure.
+ */
+void
+_nl_store_ifp_cookie(struct nl_pstate *npt, struct ifnet *ifp)
+{
+	int ifname_len = strlen(if_name(ifp));
+	uint32_t ifindex = (uint32_t)if_getindex(ifp);
+
+	/* Three attribute headers (cookie, ifindex, ifname) plus the payloads */
+	int nla_len = sizeof(struct nlattr) * 3 +
+	    sizeof(ifindex) + NL_ITEM_ALIGN(ifname_len + 1);
+	struct nlattr *nla_cookie = npt_alloc(npt, nla_len);
+
+	if (nla_cookie == NULL) {
+		NL_LOG(LOG_DEBUG, "unable to allocate ifp cookie");
+		return;
+	}
+	/* Clear the buffer so the alignment padding never holds stale bytes */
+	memset(nla_cookie, 0, nla_len);
+
+	/* Nested TLV */
+	nla_cookie->nla_len = nla_len;
+	nla_cookie->nla_type = NLMSGERR_ATTR_COOKIE;
+
+	struct nlattr *nla = nla_cookie + 1;
+	nla->nla_len = sizeof(struct nlattr) + sizeof(ifindex);
+	nla->nla_type = IFLA_NEW_IFINDEX;
+	memcpy(NLA_DATA(nla), &ifindex, sizeof(ifindex));
+
+	nla = NLA_NEXT(nla);
+	nla->nla_len = sizeof(struct nlattr) + ifname_len + 1;
+	nla->nla_type = IFLA_IFNAME;
+	strlcpy(NLA_DATA(nla), if_name(ifp), ifname_len + 1);
+
+	nlmsg_report_cookie(npt, nla_cookie);
+}
+
diff --git a/sys/netlink/route/ifaddrs.h b/sys/netlink/route/ifaddrs.h
new file mode 100644
index 000000000000..88d776c3b925
--- /dev/null
+++ b/sys/netlink/route/ifaddrs.h
@@ -0,0 +1,99 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Interface address-related (RTM_<NEW|DEL|GET>ADDR) message header and attributes.
+ */
+
+#ifndef _NETLINK_ROUTE_IFADDRS_H_
+#define _NETLINK_ROUTE_IFADDRS_H_
+
+/*
+ * Base header for all of the relevant messages.
+ * ifa_flags is only 8 bits wide, so IFA_F_* values above 0x0080 do not
+ * fit in it; the IFA_FLAGS attribute carries the flags as u32.
+ */
+struct ifaddrmsg {
+ uint8_t ifa_family; /* Address family */
+ uint8_t ifa_prefixlen; /* Prefix length */
+ uint8_t ifa_flags; /* Address-specific flags */
+ uint8_t ifa_scope; /* Address scope */
+ uint32_t ifa_index; /* Link ifindex */
+};
+
+/*
+ * Userland-only helpers (not defined when _KERNEL is set).
+ * Presumably locate the attributes that follow struct ifaddrmsg and
+ * compute their total length -- confirm against NL_ITEM_DATA/NLMSG_PAYLOAD.
+ */
+#ifndef _KERNEL
+#define _NL_IFA_HDRLEN ((int)sizeof(struct ifaddrmsg))
+#define IFA_RTA(_ifa) ((struct rtattr *)(NL_ITEM_DATA(_ifa, _NL_IFA_HDRLEN)))
+#define IFA_PAYLOAD(_hdr) NLMSG_PAYLOAD(_hdr, _NL_IFA_HDRLEN)
+#endif
+
+/*
+ * Defined attributes (attribute types) of the interface address messages.
+ * IFA_MAX is the highest defined attribute type.
+ */
+enum {
+ IFA_UNSPEC,
+ IFA_ADDRESS = 1, /* binary, prefix address (destination for p2p) */
+ IFA_LOCAL = 2, /* binary, interface address */
+ IFA_LABEL = 3, /* string, interface name */
+ IFA_BROADCAST = 4, /* binary, broadcast ifa */
+ IFA_ANYCAST = 5, /* not supported */
+ IFA_CACHEINFO = 6, /* binary, struct ifa_cacheinfo */
+ IFA_MULTICAST = 7, /* not supported */
+ IFA_FLAGS = 8, /* u32, IFA_F flags */
+ IFA_RT_PRIORITY = 9, /* not supported */
+ IFA_TARGET_NETNSID = 10, /* not supported */
+ IFA_FREEBSD = 11, /* nested, FreeBSD-specific */
+ __IFA_MAX,
+};
+#define IFA_MAX (__IFA_MAX - 1)
+
+/*
+ * FreeBSD-specific address attributes.
+ * NOTE(review): presumably nested inside IFA_FREEBSD -- confirm against the parser.
+ * IFAF_MAX is the highest defined attribute type.
+ */
+enum {
+ IFAF_UNSPEC,
+ IFAF_VHID = 1, /* u32: carp vhid */
+ IFAF_FLAGS = 2, /* u32: FreeBSD-specific ifa flags */
+ __IFAF_MAX,
+};
+#define IFAF_MAX (__IFAF_MAX - 1)
+
+/*
+ * IFA_FLAGS attribute flags.
+ * IFA_F_TEMPORARY is an alias sharing the value of IFA_F_SECONDARY.
+ */
+#define IFA_F_SECONDARY 0x0001
+#define IFA_F_TEMPORARY IFA_F_SECONDARY
+#define IFA_F_NODAD 0x0002
+#define IFA_F_OPTIMISTIC 0x0004
+#define IFA_F_DADFAILED 0x0008
+#define IFA_F_HOMEADDRESS 0x0010
+#define IFA_F_DEPRECATED 0x0020
+#define IFA_F_TENTATIVE 0x0040
+#define IFA_F_PERMANENT 0x0080
+#define IFA_F_MANAGETEMPADDR 0x0100
+#define IFA_F_NOPREFIXROUTE 0x0200
+#define IFA_F_MCAUTOJOIN 0x0400
+#define IFA_F_STABLE_PRIVACY 0x0800
+
+/* Payload of the IFA_CACHEINFO attribute: address lifetimes and timestamps */
+struct ifa_cacheinfo {
+ uint32_t ifa_prefered; /* seconds till the end of the prefix considered preferred */
+ uint32_t ifa_valid; /* seconds till the end of the prefix considered valid */
+ uint32_t cstamp; /* creation time in 1ms intervals from the boot time */
+ uint32_t tstamp; /* update time in 1ms intervals from the boot time */
+};
+
+#endif
diff --git a/sys/netlink/route/interface.h b/sys/netlink/route/interface.h
new file mode 100644
index 000000000000..667bf2c96151
--- /dev/null
+++ b/sys/netlink/route/interface.h
@@ -0,0 +1,266 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Interface-related (RTM_<NEW|DEL|GET|SET>LINK) message header and attributes.
+ */
+
+#ifndef _NETLINK_ROUTE_INTERFACE_H_
+#define _NETLINK_ROUTE_INTERFACE_H_
+
+/* Base header for all of the relevant messages */
+struct ifinfomsg {
+ unsigned char ifi_family; /* not used */
+ unsigned char __ifi_pad;
+ unsigned short ifi_type; /* ARPHRD_* */
+ int ifi_index; /* Interface index */
+ unsigned ifi_flags; /* IFF_* flags */
+ unsigned ifi_change; /* IFF_* change mask */
+};
+
+/* Linux-specific link-level state flag */
+#define IFF_LOWER_UP IFF_NETLINK_1
+
+#ifndef _KERNEL
+/* Compatibility helpers (userland only, not defined when _KERNEL is set) */
+#define _IFINFO_HDRLEN ((int)sizeof(struct ifinfomsg))
+#define IFLA_RTA(_ifi) ((struct rtattr *)NL_ITEM_DATA(_ifi, _IFINFO_HDRLEN))
+#define IFLA_PAYLOAD(_ifi) NLMSG_PAYLOAD(_ifi, _IFINFO_HDRLEN)
+#endif
+
+/*
+ * Attribute types of the interface (link) messages.
+ * Some enumerators are additionally defined as macros expanding to
+ * themselves, presumably so their presence can be tested with #ifdef.
+ * IFLA_MAX is the highest defined attribute type.
+ */
+enum {
+ IFLA_UNSPEC = 0,
+ IFLA_ADDRESS = 1, /* binary: Link-level address (MAC) */
+#define IFLA_ADDRESS IFLA_ADDRESS
+ IFLA_BROADCAST = 2, /* binary: link-level broadcast address */
+#define IFLA_BROADCAST IFLA_BROADCAST
+ IFLA_IFNAME = 3, /* string: Interface name */
+#define IFLA_IFNAME IFLA_IFNAME
+ IFLA_MTU = 4, /* u32: Current interface L3 mtu */
+#define IFLA_MTU IFLA_MTU
+ IFLA_LINK = 5, /* u32: interface index */
+#define IFLA_LINK IFLA_LINK
+ IFLA_QDISC = 6, /* string: Queueing policy (not supported) */
+#define IFLA_QDISC IFLA_QDISC
+ IFLA_STATS = 7, /* Interface counters */
+#define IFLA_STATS IFLA_STATS
+ IFLA_COST = 8, /* not supported */
+#define IFLA_COST IFLA_COST
+ IFLA_PRIORITY = 9, /* not supported */
+#define IFLA_PRIORITY IFLA_PRIORITY
+ IFLA_MASTER = 10, /* u32: parent interface ifindex */
+#define IFLA_MASTER IFLA_MASTER
+ IFLA_WIRELESS = 11, /* not supported */
+#define IFLA_WIRELESS IFLA_WIRELESS
+ IFLA_PROTINFO = 12, /* protocol-specific data */
+#define IFLA_PROTINFO IFLA_PROTINFO
+ IFLA_TXQLEN = 13, /* u32: transmit queue length */
+#define IFLA_TXQLEN IFLA_TXQLEN
+ IFLA_MAP = 14, /* not supported */
+#define IFLA_MAP IFLA_MAP
+ IFLA_WEIGHT = 15, /* not supported */
+#define IFLA_WEIGHT IFLA_WEIGHT
+ IFLA_OPERSTATE = 16, /* u8: ifOperStatus per RFC 2863 */
+#define IFLA_OPERSTATE IFLA_OPERSTATE
+ IFLA_LINKMODE = 17, /* u8: ifmedia (not supported) */
+#define IFLA_LINKMODE IFLA_LINKMODE
+ IFLA_LINKINFO = 18, /* nested: IFLA_INFO_ */
+#define IFLA_LINKINFO IFLA_LINKINFO
+ IFLA_NET_NS_PID = 19, /* u32: vnet id (not supported) */
+#define IFLA_NET_NS_PID IFLA_NET_NS_PID
+ IFLA_IFALIAS = 20, /* string: interface description */
+#define IFLA_IFALIAS IFLA_IFALIAS
+ IFLA_NUM_VF = 21, /* not supported */
+#define IFLA_NUM_VF IFLA_NUM_VF
+ IFLA_VFINFO_LIST= 22, /* not supported */
+#define IFLA_VFINFO_LIST IFLA_VFINFO_LIST
+ IFLA_STATS64 = 23, /* rtnl_link_stats64: iface stats */
+#define IFLA_STATS64 IFLA_STATS64
+ IFLA_VF_PORTS,
+ IFLA_PORT_SELF,
+ IFLA_AF_SPEC,
+ IFLA_GROUP, /* Group the device belongs to */
+ IFLA_NET_NS_FD,
+ IFLA_EXT_MASK, /* Extended info mask, VFs, etc */
+ IFLA_PROMISCUITY, /* Promiscuity count: > 0 means acts PROMISC */
+#define IFLA_PROMISCUITY IFLA_PROMISCUITY
+ IFLA_NUM_TX_QUEUES,
+ IFLA_NUM_RX_QUEUES,
+ IFLA_CARRIER,
+ IFLA_PHYS_PORT_ID,
+ IFLA_CARRIER_CHANGES,
+ IFLA_PHYS_SWITCH_ID,
+ IFLA_LINK_NETNSID,
+ IFLA_PHYS_PORT_NAME,
+ IFLA_PROTO_DOWN,
+ IFLA_GSO_MAX_SEGS,
+ IFLA_GSO_MAX_SIZE,
+ IFLA_PAD,
+ IFLA_XDP,
+ IFLA_EVENT,
+ IFLA_NEW_NETNSID,
+ IFLA_IF_NETNSID,
+ IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */
+ IFLA_CARRIER_UP_COUNT,
+ IFLA_CARRIER_DOWN_COUNT,
+ IFLA_NEW_IFINDEX,
+ IFLA_MIN_MTU,
+ IFLA_MAX_MTU,
+ IFLA_PROP_LIST,
+ IFLA_ALT_IFNAME, /* Alternative ifname */
+ IFLA_PERM_ADDRESS,
+ IFLA_PROTO_DOWN_REASON,
+ IFLA_PARENT_DEV_NAME,
+ IFLA_PARENT_DEV_BUS_NAME,
+ IFLA_GRO_MAX_SIZE,
+ IFLA_TSO_MAX_SEGS,
+ IFLA_ALLMULTI,
+ IFLA_DEVLINK_PORT,
+ IFLA_GSO_IPV4_MAX_SIZE,
+ IFLA_GRO_IPV4_MAX_SIZE,
+ IFLA_FREEBSD,
+ __IFLA_MAX
+};
+#define IFLA_MAX (__IFLA_MAX - 1)
+
+/*
+ * FreeBSD-specific link attributes.
+ * NOTE(review): presumably nested inside IFLA_FREEBSD -- confirm against the parser.
+ * IFLAF_MAX is the highest defined attribute type.
+ */
+enum {
+ IFLAF_UNSPEC = 0,
+ IFLAF_ORIG_IFNAME = 1, /* string, original interface name at creation */
+ IFLAF_ORIG_HWADDR = 2, /* binary, original hardware address */
+ IFLAF_CAPS = 3, /* bitset, interface capabilities */
+ __IFLAF_MAX
+};
+#define IFLAF_MAX (__IFLAF_MAX - 1)
+
+/*
+ * Attributes that can be used as filters:
+ * IFLA_IFNAME, IFLA_GROUP, IFLA_ALT_IFNAME
+ * Headers that can be used as filters:
+ * ifi_index, ifi_type
+ */
+
+/*
+ * IFLA_OPERSTATE.
+ * The values below represent the possible
+ * states of ifOperStatus defined by RFC 2863.
+ * The attribute carries one of these values as u8.
+ */
+enum {
+ IF_OPER_UNKNOWN = 0, /* status can not be determined */
+ IF_OPER_NOTPRESENT = 1, /* some (hardware) component not present */
+ IF_OPER_DOWN = 2, /* down */
+ IF_OPER_LOWERLAYERDOWN = 3, /* some lower-level interface is down */
+ IF_OPER_TESTING = 4, /* in some test mode */
+ IF_OPER_DORMANT = 5, /* "up" but waiting for some condition (802.1X) */
+ IF_OPER_UP = 6, /* ready to pass packets */
+};
+
+/* IFLA_STATS: interface counters, 32-bit fields */
+struct rtnl_link_stats {
+ uint32_t rx_packets; /* total RX packets (IFCOUNTER_IPACKETS) */
+ uint32_t tx_packets; /* total TX packets (IFCOUNTER_OPACKETS) */
+ uint32_t rx_bytes; /* total RX bytes (IFCOUNTER_IBYTES) */
+ uint32_t tx_bytes; /* total TX bytes (IFCOUNTER_OBYTES) */
+ uint32_t rx_errors; /* RX errors (IFCOUNTER_IERRORS) */
+ uint32_t tx_errors; /* TX errors (IFCOUNTER_OERRORS) */
+ uint32_t rx_dropped; /* RX drop (no space in ring/no bufs) (IFCOUNTER_IQDROPS) */
+ uint32_t tx_dropped; /* TX drop (IFCOUNTER_OQDROPS) */
+ uint32_t multicast; /* RX multicast packets (IFCOUNTER_IMCASTS) */
+ uint32_t collisions; /* not supported */
+ uint32_t rx_length_errors; /* not supported */
+ uint32_t rx_over_errors; /* not supported */
+ uint32_t rx_crc_errors; /* not supported */
+ uint32_t rx_frame_errors; /* not supported */
+ uint32_t rx_fifo_errors; /* not supported */
+ uint32_t rx_missed_errors; /* not supported */
+ uint32_t tx_aborted_errors; /* not supported */
+ uint32_t tx_carrier_errors; /* not supported */
+ uint32_t tx_fifo_errors; /* not supported */
+ uint32_t tx_heartbeat_errors; /* not supported */
+ uint32_t tx_window_errors; /* not supported */
+ uint32_t rx_compressed; /* not supported */
+ uint32_t tx_compressed; /* not supported */
+ uint32_t rx_nohandler; /* dropped due to no proto handler (IFCOUNTER_NOPROTO) */
+};
+
+/* IFLA_STATS64: interface counters, 64-bit fields */
+struct rtnl_link_stats64 {
+ uint64_t rx_packets; /* total RX packets (IFCOUNTER_IPACKETS) */
+ uint64_t tx_packets; /* total TX packets (IFCOUNTER_OPACKETS) */
+ uint64_t rx_bytes; /* total RX bytes (IFCOUNTER_IBYTES) */
+ uint64_t tx_bytes; /* total TX bytes (IFCOUNTER_OBYTES) */
+ uint64_t rx_errors; /* RX errors (IFCOUNTER_IERRORS) */
+ uint64_t tx_errors; /* TX errors (IFCOUNTER_OERRORS) */
+ uint64_t rx_dropped; /* RX drop (no space in ring/no bufs) (IFCOUNTER_IQDROPS) */
+ uint64_t tx_dropped; /* TX drop (IFCOUNTER_OQDROPS) */
+ uint64_t multicast; /* RX multicast packets (IFCOUNTER_IMCASTS) */
+ uint64_t collisions; /* not supported */
+ uint64_t rx_length_errors; /* not supported */
+ uint64_t rx_over_errors; /* not supported */
+ uint64_t rx_crc_errors; /* not supported */
+ uint64_t rx_frame_errors; /* not supported */
+ uint64_t rx_fifo_errors; /* not supported */
+ uint64_t rx_missed_errors; /* not supported */
+ uint64_t tx_aborted_errors; /* not supported */
+ uint64_t tx_carrier_errors; /* not supported */
+ uint64_t tx_fifo_errors; /* not supported */
+ uint64_t tx_heartbeat_errors; /* not supported */
+ uint64_t tx_window_errors; /* not supported */
+ uint64_t rx_compressed; /* not supported */
+ uint64_t tx_compressed; /* not supported */
+ uint64_t rx_nohandler; /* dropped due to no proto handler (IFCOUNTER_NOPROTO) */
+};
+
+/*
+ * IFLA_LINKINFO child nlattr types.
+ * IFLA_INFO_MAX is the highest defined child type.
+ */
+enum {
+ IFLA_INFO_UNSPEC,
+ IFLA_INFO_KIND = 1, /* string, link type ("vlan") */
+ IFLA_INFO_DATA = 2, /* Per-link-type custom data */
+ IFLA_INFO_XSTATS = 3,
+ IFLA_INFO_SLAVE_KIND = 4,
+ IFLA_INFO_SLAVE_DATA = 5,
+ __IFLA_INFO_MAX,
+};
+#define IFLA_INFO_MAX (__IFLA_INFO_MAX - 1)
+
+/*
+ * IFLA_INFO_DATA vlan attributes.
+ * IFLA_VLAN_MAX is the highest defined attribute type.
+ */
+enum {
+ IFLA_VLAN_UNSPEC,
+ IFLA_VLAN_ID,
+ IFLA_VLAN_FLAGS,
+ IFLA_VLAN_EGRESS_QOS,
+ IFLA_VLAN_INGRESS_QOS,
+ IFLA_VLAN_PROTOCOL,
+ __IFLA_VLAN_MAX,
+};
+
+#define IFLA_VLAN_MAX (__IFLA_VLAN_MAX - 1)
+/* NOTE(review): presumably the payload of IFLA_VLAN_FLAGS -- confirm against users */
+struct ifla_vlan_flags {
+ uint32_t flags;
+ uint32_t mask;
+};
+
+#endif
diff --git a/sys/netlink/route/neigh.c b/sys/netlink/route/neigh.c
new file mode 100644
index 000000000000..9eaaae263254
--- /dev/null
+++ b/sys/netlink/route/neigh.c
@@ -0,0 +1,601 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include <sys/types.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/syslog.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_private.h>
+#include <net/if_llatbl.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_route.h>
+#include <netlink/route/route_var.h>
+
+#include <netinet6/in6_var.h> /* nd6.h requires this */
+#include <netinet6/nd6.h> /* nd6 state machine */
+#include <netinet6/scope6_var.h> /* scope deembedding */
+
+#define DEBUG_MOD_NAME nl_neigh
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_INFO);
+
+/* Address families whose link-layer tables are walked by dump_llts_iface() */
+static int lle_families[] = { AF_INET, AF_INET6 };
+
+/* Tag of the lle_event handler registered in rtnl_neighs_init() */
+static eventhandler_tag lle_event_p;
+
+/* State shared between the neighbor dump entry points and the per-entry callback */
+struct netlink_walkargs {
+ struct nl_writer *nw; /* writer receiving the generated messages */
+ struct nlmsghdr hdr; /* header template passed to nlmsg_reply() */
+ struct nlpcb *so; /* requesting socket (set by the GETNEIGH handler) */
+ if_t ifp; /* interface whose table is being dumped */
+ int family; /* address family of the table being dumped */
+ int error; /* error code handed to nlmsg_end_dump() */
+ int count; /* number of tables selected for dumping */
+ int dumped; /* number of tables dumped */
+};
+
+/*
+ * Translates the state of a link-layer entry into a netlink NUD_* state.
+ * For AF_INET, static and interface-address entries are always reported
+ * as reachable.  Unknown families and states map to NUD_NONE.
+ */
+static int
+lle_state_to_nl_state(int family, struct llentry *lle)
+{
+	const int ln_state = lle->ln_state;
+
+	if (family == AF_INET) {
+		if ((lle->la_flags & (LLE_STATIC | LLE_IFADDR)) != 0)
+			return (NUD_REACHABLE);
+		if (ln_state == 0)	/* ARP_LLINFO_INCOMPLETE */
+			return (NUD_INCOMPLETE);
+		if (ln_state == 1)	/* ARP_LLINFO_REACHABLE */
+			return (NUD_REACHABLE);
+		if (ln_state == 2)	/* ARP_LLINFO_VERIFY */
+			return (NUD_PROBE);
+		return (NUD_NONE);
+	}
+
+	if (family == AF_INET6) {
+		if (ln_state == ND6_LLINFO_INCOMPLETE)
+			return (NUD_INCOMPLETE);
+		if (ln_state == ND6_LLINFO_REACHABLE)
+			return (NUD_REACHABLE);
+		if (ln_state == ND6_LLINFO_STALE)
+			return (NUD_STALE);
+		if (ln_state == ND6_LLINFO_DELAY)
+			return (NUD_DELAY);
+		if (ln_state == ND6_LLINFO_PROBE)
+			return (NUD_PROBE);
+	}
+
+	return (NUD_NONE);
+}
+
+/*
+ * Builds the netlink NTF_* flag set for a link-layer entry:
+ * LLE_IFADDR -> NTF_SELF, LLE_PUB -> NTF_PROXY, LLE_STATIC -> NTF_STICKY,
+ * and a non-zero ln_router -> NTF_ROUTER.
+ */
+static uint32_t
+lle_flags_to_nl_flags(const struct llentry *lle)
+{
+	uint32_t flags;
+
+	flags = (lle->la_flags & LLE_IFADDR) ? NTF_SELF : 0;
+	flags |= (lle->la_flags & LLE_PUB) ? NTF_PROXY : 0;
+	flags |= (lle->la_flags & LLE_STATIC) ? NTF_STICKY : 0;
+	flags |= (lle->ln_router != 0) ? NTF_ROUTER : 0;
+
+	return (flags);
+}
+
+/*
+ * Returns the timestamp reported as NDAF_NEXT_STATE_TS for the entry,
+ * derived from la_expire, lle_remtime and the time_second/time_uptime
+ * offset.  Entries with la_expire == 0 report 0.
+ */
+static uint32_t
+get_lle_next_ts(const struct llentry *lle)
+{
+	if (lle->la_expire != 0)
+		return (lle->la_expire + lle->lle_remtime / hz + time_second - time_uptime);
+
+	return (0);
+}
+
+/*
+ * Writes a single neighbor message describing @lle to the writer in @arg
+ * (a struct netlink_walkargs).  Callers in this file invoke it with the
+ * entry locked.
+ *
+ * The message consists of a struct ndmsg header followed by NDA_DST,
+ * NDA_LLADDR (only when the entry has RLLE_VALID set), NDA_PROBES,
+ * NDA_CACHEINFO and a nested NDA_FREEBSD attribute.
+ *
+ * Returns 0 when the message was finalized, or ENOMEM after aborting
+ * the partially written message.
+ */
+static int
+dump_lle_locked(struct llentry *lle, void *arg)
+{
+ struct netlink_walkargs *wa = (struct netlink_walkargs *)arg;
+ struct nlmsghdr *hdr = &wa->hdr;
+ struct nl_writer *nw = wa->nw;
+ struct ndmsg *ndm;
+#if defined(INET) || defined(INET6)
+ /* Local copy of the L3 address; the IPv6 one has its scope cleared below */
+ union {
+ struct in_addr in;
+ struct in6_addr in6;
+ } addr;
+#endif
+
+ IF_DEBUG_LEVEL(LOG_DEBUG2) {
+ char llebuf[NHOP_PRINT_BUFSIZE];
+ llentry_print_buf_lltable(lle, llebuf, sizeof(llebuf));
+ NL_LOG(LOG_DEBUG2, "dumping %s", llebuf);
+ }
+
+ /* Start a new message using the header template from the walk args */
+ if (!nlmsg_reply(nw, hdr, sizeof(struct ndmsg)))
+ goto enomem;
+
+ ndm = nlmsg_reserve_object(nw, struct ndmsg);
+ ndm->ndm_family = wa->family;
+ ndm->ndm_ifindex = if_getindex(wa->ifp);
+ ndm->ndm_state = lle_state_to_nl_state(wa->family, lle);
+ ndm->ndm_flags = lle_flags_to_nl_flags(lle);
+
+ switch (wa->family) {
+#ifdef INET
+ case AF_INET:
+ addr.in = lle->r_l3addr.addr4;
+ nlattr_add(nw, NDA_DST, 4, &addr);
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ addr.in6 = lle->r_l3addr.addr6;
+ in6_clearscope(&addr.in6);
+ nlattr_add(nw, NDA_DST, 16, &addr);
+ break;
+#endif
+ }
+
+ if (lle->r_flags & RLLE_VALID) {
+ /* Has L2 */
+ int addrlen = if_getaddrlen(wa->ifp);
+ nlattr_add(nw, NDA_LLADDR, addrlen, lle->ll_addr);
+ }
+
+ nlattr_add_u32(nw, NDA_PROBES, lle->la_asked);
+
+ struct nda_cacheinfo *cache;
+ cache = nlmsg_reserve_attr(nw, NDA_CACHEINFO, struct nda_cacheinfo);
+ if (cache == NULL)
+ goto enomem;
+ /* TODO: provide confirmed/updated */
+ cache->ndm_refcnt = lle->lle_refcnt;
+
+ /* FreeBSD-specific nested attribute; skipped when nesting could not be started */
+ int off = nlattr_add_nested(nw, NDA_FREEBSD);
+ if (off != 0) {
+ nlattr_add_u32(nw, NDAF_NEXT_STATE_TS, get_lle_next_ts(lle));
+
+ nlattr_set_len(nw, off);
+ }
+
+ if (nlmsg_end(nw))
+ return (0);
+ /* nlmsg_end() failed: fall through to the abort path */
+enomem:
+ NL_LOG(LOG_DEBUG, "unable to dump lle state (ENOMEM)");
+ nlmsg_abort(nw);
+ return (ENOMEM);
+}
+
+/*
+ * Table walk callback: read-locks @lle around dump_lle_locked() and
+ * returns its result.  @llt is unused.
+ */
+static int
+dump_lle(struct lltable *llt, struct llentry *lle, void *arg)
+{
+ int error;
+
+ LLE_RLOCK(lle);
+ error = dump_lle_locked(lle, arg);
+ LLE_RUNLOCK(lle);
+ return (error);
+}
+
+/*
+ * Dumps every entry of @llt by walking it with dump_lle().
+ * NOTE(review): the result of lltable_foreach_lle() is discarded and the
+ * function always returns true, so per-entry dump failures are not
+ * reported to the caller -- confirm this is intended.
+ */
+static bool
+dump_llt(struct lltable *llt, struct netlink_walkargs *wa)
+{
+ lltable_foreach_lle(llt, dump_lle, wa);
+
+ return (true);
+}
+
+/*
+ * Dumps the link-layer tables of @ifp for every family listed in
+ * lle_families, restricted to @family unless it is 0.
+ * Updates wa->ifp, wa->family and the count/dumped statistics.
+ * Returns 0, or ENOMEM as soon as dump_llt() reports a failure.
+ */
+static int
+dump_llts_iface(struct netlink_walkargs *wa, if_t ifp, int family)
+{
+	wa->ifp = ifp;
+
+	for (size_t idx = 0; idx < nitems(lle_families); idx++) {
+		const int af = lle_families[idx];
+		struct lltable *table = lltable_get(ifp, af);
+
+		/* Skip families without a table and those not requested */
+		if (table == NULL)
+			continue;
+		if (family != 0 && family != af)
+			continue;
+
+		wa->count++;
+		wa->family = af;
+		if (!dump_llt(table, wa))
+			return (ENOMEM);
+		wa->dumped++;
+	}
+
+	return (0);
+}
+
+/*
+ * Dumps neighbor tables as a multipart (NLM_F_MULTI) reply: those of
+ * @ifp when it is given, otherwise those of every interface returned by
+ * the interface iterator.  The dump is terminated with nlmsg_end_dump().
+ * Returns 0, or ENOMEM when the terminating message cannot be added.
+ */
+static int
+dump_llts(struct netlink_walkargs *wa, if_t ifp, int family)
+{
+	NL_LOG(LOG_DEBUG2, "Start dump ifp=%s family=%d", ifp ? if_name(ifp) : "NULL", family);
+
+	wa->hdr.nlmsg_flags |= NLM_F_MULTI;
+
+	if (ifp == NULL) {
+		struct if_iter it;
+		if_t cur;
+
+		for (cur = if_iter_start(&it); cur != NULL; cur = if_iter_next(&it))
+			dump_llts_iface(wa, cur, family);
+		if_iter_finish(&it);
+	} else
+		dump_llts_iface(wa, ifp, family);
+
+	NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa->count, wa->dumped);
+
+	if (nlmsg_end_dump(wa->nw, wa->error, &wa->hdr))
+		return (0);
+
+	NL_LOG(LOG_DEBUG, "Unable to add new message");
+	return (ENOMEM);
+}
+
+/*
+ * Looks up the entry for @dst in the @family table of @ifp and dumps it.
+ * Returns ESRCH when either the table or the entry does not exist,
+ * otherwise the result of dump_lle().
+ */
+static int
+get_lle(struct netlink_walkargs *wa, if_t ifp, int family, struct sockaddr *dst)
+{
+	struct lltable *table;
+	struct llentry *entry;
+
+	table = lltable_get(ifp, family);
+	if (table == NULL)
+		return (ESRCH);
+
+	entry = lla_lookup(table, LLE_UNLOCKED, dst);
+	if (entry == NULL)
+		return (ESRCH);
+
+	wa->ifp = ifp;
+	wa->family = family;
+
+	return (dump_lle(table, entry, wa));
+}
+
+/*
+ * Sets the scope id of a link-local IPv6 address in @sa to the index
+ * of @ifp.  Does nothing for a NULL @sa or @ifp, for other address
+ * families, or when the kernel is built without INET6.
+ */
+static void
+set_scope6(struct sockaddr *sa, if_t ifp)
+{
+#ifdef INET6
+	struct sockaddr_in6 *sin6;
+
+	if (sa == NULL || ifp == NULL || sa->sa_family != AF_INET6)
+		return;
+
+	sin6 = (struct sockaddr_in6 *)sa;
+	if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
+		in6_set_unicast_scopeid(&sin6->sin6_addr, if_getindex(ifp));
+#endif
+}
+
+/* Parsed representation of a neighbor request, filled in by ndmsg_parser */
+struct nl_parsed_neigh {
+ struct sockaddr *nda_dst; /* NDA_DST: neighbor L3 address */
+ struct ifnet *nda_ifp; /* interface from ndm_ifindex or NDA_IFINDEX */
+ struct nlattr *nda_lladdr; /* NDA_LLADDR: raw attribute with the L2 address */
+ uint32_t ndaf_next_ts; /* NDAF_NEXT_STATE_TS from the NDA_FREEBSD nest */
+ uint32_t ndm_flags; /* flags from ndm_flags or NDA_FLAGS_EXT */
+ uint16_t ndm_state; /* ndm_state header field */
+ uint8_t ndm_family; /* ndm_family header field */
+};
+
+/* _IN: offset in the struct ndmsg header; _OUT: offset in struct nl_parsed_neigh */
+#define _IN(_field) offsetof(struct ndmsg, _field)
+#define _OUT(_field) offsetof(struct nl_parsed_neigh, _field)
+/* Attributes accepted inside the nested NDA_FREEBSD attribute */
+static const struct nlattr_parser nla_p_neigh_fbsd[] = {
+ { .type = NDAF_NEXT_STATE_TS, .off = _OUT(ndaf_next_ts), .cb = nlattr_get_uint32 },
+};
+NL_DECLARE_ATTR_PARSER(neigh_fbsd_parser, nla_p_neigh_fbsd);
+
+/* Header fields copied from struct ndmsg; the 8-bit ndm_flags is stored as u32 */
+static const struct nlfield_parser nlf_p_neigh[] = {
+ { .off_in = _IN(ndm_family), .off_out = _OUT(ndm_family), .cb = nlf_get_u8 },
+ { .off_in = _IN(ndm_flags), .off_out = _OUT(ndm_flags), .cb = nlf_get_u8_u32 },
+ { .off_in = _IN(ndm_state), .off_out = _OUT(ndm_state), .cb = nlf_get_u16 },
+ { .off_in = _IN(ndm_ifindex), .off_out = _OUT(nda_ifp), .cb = nlf_get_ifpz },
+};
+
+/*
+ * Top-level attributes.  NDA_IFINDEX and NDA_FLAGS_EXT write to the same
+ * output fields as the ndm_ifindex and ndm_flags header fields.
+ */
+static const struct nlattr_parser nla_p_neigh[] = {
+ { .type = NDA_DST, .off = _OUT(nda_dst), .cb = nlattr_get_ip },
+ { .type = NDA_LLADDR, .off = _OUT(nda_lladdr), .cb = nlattr_get_nla },
+ { .type = NDA_IFINDEX, .off = _OUT(nda_ifp), .cb = nlattr_get_ifp },
+ { .type = NDA_FLAGS_EXT, .off = _OUT(ndm_flags), .cb = nlattr_get_uint32 },
+ { .type = NDA_FREEBSD, .arg = &neigh_fbsd_parser, .cb = nlattr_get_nested },
+};
+#undef _IN
+#undef _OUT
+
+/*
+ * Post-parse hook of ndmsg_parser: applies the interface scope to the
+ * parsed destination address via set_scope6().  Always succeeds.
+ */
+static bool
+post_p_neigh(void *_attrs, struct nl_pstate *npt __unused)
+{
+	struct nl_parsed_neigh *parsed = _attrs;
+
+	set_scope6(parsed->nda_dst, parsed->nda_ifp);
+
+	return (true);
+}
+NL_DECLARE_PARSER_EXT(ndmsg_parser, struct ndmsg, NULL, nlf_p_neigh, nla_p_neigh, post_p_neigh);
+
+
+/*
+ * RTM_NEWNEIGH handler: creates or replaces a neighbor entry.
+ *
+ * Requires the interface, NDA_DST and NDA_LLADDR to be present, NDA_DST
+ * to match ndm_family and the link-level address length to match the
+ * interface address length.  Only the NTF_PROXY and NTF_STICKY flags are
+ * accepted, and at least one of NLM_F_CREATE / NLM_F_REPLACE must be set.
+ *
+ * Returns 0 on success or an errno value: EINVAL, ENOTSUP, EAFNOSUPPORT,
+ * ENOMEM, EEXIST (entry exists and NLM_F_REPLACE is not set), EPERM
+ * (existing entry has LLE_IFADDR set) or ENOENT (no entry and
+ * NLM_F_CREATE is not set).
+ *
+ * Example request:
+ * type=RTM_NEWNEIGH, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1661941473, pid=0},
+ * {ndm_family=AF_INET6, ndm_ifindex=if_nametoindex("enp0s31f6"), ndm_state=NUD_PERMANENT, ndm_flags=0, ndm_type=RTN_UNSPEC},
+ * [
+ * {{nla_len=20, nla_type=NDA_DST}, inet_pton(AF_INET6, "2a01:4f8:13a:70c::3")},
+ * {{nla_len=10, nla_type=NDA_LLADDR}, 20:4e:71:62:ae:f2}]}, iov_len=60}
+ */
+
+static int
+rtnl_handle_newneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+ int error;
+
+ struct nl_parsed_neigh attrs = {};
+ error = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &attrs);
+ if (error != 0)
+ return (error);
+
+ /* Report every missing mandatory item before failing */
+ if (attrs.nda_ifp == NULL || attrs.nda_dst == NULL || attrs.nda_lladdr == NULL) {
+ if (attrs.nda_ifp == NULL)
+ NLMSG_REPORT_ERR_MSG(npt, "NDA_IFINDEX / ndm_ifindex not set");
+ if (attrs.nda_dst == NULL)
+ NLMSG_REPORT_ERR_MSG(npt, "NDA_DST not set");
+ if (attrs.nda_lladdr == NULL)
+ NLMSG_REPORT_ERR_MSG(npt, "NDA_LLADDR not set");
+ return (EINVAL);
+ }
+
+ if (attrs.nda_dst->sa_family != attrs.ndm_family) {
+ NLMSG_REPORT_ERR_MSG(npt,
+ "NDA_DST family (%d) is different from ndm_family (%d)",
+ attrs.nda_dst->sa_family, attrs.ndm_family);
+ return (EINVAL);
+ }
+
+ /* The attribute payload must be exactly one link-level address */
+ int addrlen = if_getaddrlen(attrs.nda_ifp);
+ if (attrs.nda_lladdr->nla_len != sizeof(struct nlattr) + addrlen) {
+ NLMSG_REPORT_ERR_MSG(npt,
+ "NDA_LLADDR address length (%d) is different from expected (%d)",
+ (int)attrs.nda_lladdr->nla_len - (int)sizeof(struct nlattr), addrlen);
+ return (EINVAL);
+ }
+
+ const uint16_t supported_flags = NTF_PROXY | NTF_STICKY;
+ if ((attrs.ndm_flags & supported_flags) != attrs.ndm_flags) {
+ NLMSG_REPORT_ERR_MSG(npt, "ndm_flags %X not supported",
+ attrs.ndm_flags &~ supported_flags);
+ return (ENOTSUP);
+ }
+
+ /* Replacement requires new entry creation anyway */
+ if ((hdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_REPLACE)) == 0)
+ return (ENOTSUP);
+
+ struct lltable *llt = lltable_get(attrs.nda_ifp, attrs.ndm_family);
+ if (llt == NULL)
+ return (EAFNOSUPPORT);
+
+
+ /* Build the link-layer header from the supplied L2 address */
+ uint8_t linkhdr[LLE_MAX_LINKHDR];
+ size_t linkhdrsize = sizeof(linkhdr);
+ int lladdr_off = 0;
+ if (lltable_calc_llheader(attrs.nda_ifp, attrs.ndm_family,
+ (char *)(attrs.nda_lladdr + 1), linkhdr, &linkhdrsize, &lladdr_off) != 0) {
+ NLMSG_REPORT_ERR_MSG(npt, "unable to calculate lle prepend data");
+ return (EINVAL);
+ }
+
+ /* NTF_PROXY -> LLE_PUB, NTF_STICKY -> LLE_STATIC */
+ int lle_flags = (attrs.ndm_flags & NTF_PROXY) ? LLE_PUB : 0;
+ if (attrs.ndm_flags & NTF_STICKY)
+ lle_flags |= LLE_STATIC;
+ struct llentry *lle = lltable_alloc_entry(llt, lle_flags, attrs.nda_dst);
+ if (lle == NULL)
+ return (ENOMEM);
+ lltable_set_entry_addr(attrs.nda_ifp, lle, linkhdr, linkhdrsize, lladdr_off);
+
+ /*
+ * Sticky entries never expire; otherwise the expiration is derived
+ * from the NDAF_NEXT_STATE_TS value.
+ * NOTE(review): ndaf_next_ts is 0 when the attribute is absent --
+ * confirm the resulting la_expire is intended in that case.
+ */
+ if (attrs.ndm_flags & NTF_STICKY)
+ lle->la_expire = 0;
+ else
+ lle->la_expire = attrs.ndaf_next_ts - time_second + time_uptime;
+
+ /* llentry created, try to insert or update */
+ IF_AFDATA_WLOCK(attrs.nda_ifp);
+ LLE_WLOCK(lle);
+ struct llentry *lle_tmp = lla_lookup(llt, LLE_EXCLUSIVE, attrs.nda_dst);
+ if (lle_tmp != NULL) {
+ /* An entry already exists: replace it only on explicit request */
+ error = EEXIST;
+ if (hdr->nlmsg_flags & NLM_F_REPLACE) {
+ error = EPERM;
+ if ((lle_tmp->la_flags & LLE_IFADDR) == 0) {
+ error = 0; /* success */
+ lltable_unlink_entry(llt, lle_tmp);
+ llentry_free(lle_tmp);
+ lle_tmp = NULL;
+ lltable_link_entry(llt, lle);
+ }
+ }
+ /* The existing entry was kept: release the lock taken by the lookup */
+ if (lle_tmp)
+ LLE_WUNLOCK(lle_tmp);
+ } else {
+ if (hdr->nlmsg_flags & NLM_F_CREATE)
+ lltable_link_entry(llt, lle);
+ else
+ error = ENOENT;
+ }
+ IF_AFDATA_WUNLOCK(attrs.nda_ifp);
+
+ if (error != 0) {
+ /* throw away the newly allocated llentry */
+ llentry_free(lle);
+ return (error);
+ }
+
+ /* XXX: We're inside epoch */
+ EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED);
+ LLE_WUNLOCK(lle);
+ llt->llt_post_resolved(llt, lle);
+
+ return (0);
+}
+
+/*
+ * RTM_DELNEIGH handler: deletes the neighbor entry for NDA_DST from the
+ * table of the requested interface and address family.
+ * Returns a parser error, EINVAL when NDA_DST or the interface is
+ * missing, EAFNOSUPPORT when the interface has no table for the family,
+ * or the result of lltable_delete_addr().
+ */
+static int
+rtnl_handle_delneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct nl_parsed_neigh parsed = {};
+	struct lltable *table;
+	int rc;
+
+	rc = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &parsed);
+	if (rc != 0)
+		return (rc);
+
+	if (parsed.nda_dst == NULL) {
+		NLMSG_REPORT_ERR_MSG(npt, "NDA_DST not set");
+		return (EINVAL);
+	}
+	if (parsed.nda_ifp == NULL) {
+		NLMSG_REPORT_ERR_MSG(npt, "no ifindex provided");
+		return (EINVAL);
+	}
+
+	table = lltable_get(parsed.nda_ifp, parsed.ndm_family);
+	if (table == NULL)
+		return (EAFNOSUPPORT);
+
+	return (lltable_delete_addr(table, 0, parsed.nda_dst));
+}
+
+/*
+ * RTM_GETNEIGH handler.
+ * Without NDA_DST the neighbor tables are dumped (optionally limited to
+ * one interface and/or family); with NDA_DST a single entry is looked
+ * up, which requires an interface.  Replies use the NL_RTM_NEWNEIGH
+ * message type and echo the pid, sequence number and flags of the request.
+ */
+static int
+rtnl_handle_getneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct nl_parsed_neigh parsed = {};
+	int rc;
+
+	rc = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &parsed);
+	if (rc != 0)
+		return (rc);
+
+	if (parsed.nda_ifp == NULL && parsed.nda_dst != NULL) {
+		NLMSG_REPORT_ERR_MSG(npt, "has NDA_DST but no ifindex provided");
+		return (EINVAL);
+	}
+
+	struct netlink_walkargs wa = {
+		.so = nlp,
+		.nw = npt->nw,
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags,
+		.hdr.nlmsg_type = NL_RTM_NEWNEIGH,
+	};
+
+	if (parsed.nda_dst != NULL)
+		return (get_lle(&wa, parsed.nda_ifp, parsed.ndm_family, parsed.nda_dst));
+
+	return (dump_llts(&wa, parsed.nda_ifp, parsed.ndm_family));
+}
+
+/*
+ * Neighbor message handlers registered by rtnl_neighs_init().
+ * The NEW/DEL handlers set PRIV_NET_ROUTE; the GET handler sets no privilege.
+ */
+static const struct rtnl_cmd_handler cmd_handlers[] = {
+ {
+ .cmd = NL_RTM_NEWNEIGH,
+ .name = "RTM_NEWNEIGH",
+ .cb = &rtnl_handle_newneigh,
+ .priv = PRIV_NET_ROUTE,
+ },
+ {
+ .cmd = NL_RTM_DELNEIGH,
+ .name = "RTM_DELNEIGH",
+ .cb = &rtnl_handle_delneigh,
+ .priv = PRIV_NET_ROUTE,
+ },
+ {
+ .cmd = NL_RTM_GETNEIGH,
+ .name = "RTM_GETNEIGH",
+ .cb = &rtnl_handle_getneigh,
+ }
+};
+
+/*
+ * lle_event handler: notifies the RTNLGRP_NEIGH group about a change of
+ * @lle.  LLENTRY_RESOLVED is reported as NL_RTM_NEWNEIGH, every other
+ * event as NL_RTM_DELNEIGH.  Entries of families other than AF_INET and
+ * AF_INET6 are ignored.  The entry must be write-locked by the caller.
+ */
+static void
+rtnl_lle_event(void *arg __unused, struct llentry *lle, int evt)
+{
+	struct nl_writer nw;
+	int msg_type;
+
+	LLE_WLOCK_ASSERT(lle);
+
+	if_t ifp = lltable_get_ifp(lle->lle_tbl);
+	int family = lltable_get_af(lle->lle_tbl);
+
+	if (!(family == AF_INET || family == AF_INET6))
+		return;
+
+	if (evt == LLENTRY_RESOLVED)
+		msg_type = NL_RTM_NEWNEIGH;
+	else
+		msg_type = NL_RTM_DELNEIGH;
+
+	if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEIGH, 0,
+	    false)) {
+		NL_LOG(LOG_DEBUG, "error allocating group writer");
+		return;
+	}
+
+	struct netlink_walkargs wa = {
+		.nw = &nw,
+		.hdr.nlmsg_type = msg_type,
+		.ifp = ifp,
+		.family = family,
+	};
+
+	dump_lle_locked(lle, &wa);
+	nlmsg_flush(&nw);
+}
+
+/* Parsers handed to NL_VERIFY_PARSERS() by rtnl_neighs_init() */
+static const struct nlhdr_parser *all_parsers[] = { &ndmsg_parser, &neigh_fbsd_parser };
+
+/*
+ * Neighbor subsystem initialization: verifies the parsers, registers
+ * the command handlers and subscribes rtnl_lle_event() to lle_event,
+ * saving the tag in lle_event_p.
+ */
+void
+rtnl_neighs_init(void)
+{
+ NL_VERIFY_PARSERS(all_parsers);
+ rtnl_register_messages(cmd_handlers, nitems(cmd_handlers));
+ lle_event_p = EVENTHANDLER_REGISTER(lle_event, rtnl_lle_event, NULL,
+ EVENTHANDLER_PRI_ANY);
+}
+
+/* Neighbor subsystem teardown: deregisters the lle_event handler saved in lle_event_p */
+void
+rtnl_neighs_destroy(void)
+{
+ EVENTHANDLER_DEREGISTER(lle_event, lle_event_p);
+}
diff --git a/sys/netlink/route/neigh.h b/sys/netlink/route/neigh.h
new file mode 100644
index 000000000000..10bc3b93d16a
--- /dev/null
+++ b/sys/netlink/route/neigh.h
@@ -0,0 +1,111 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Neighbors-related (RTM_<NEW|DEL|GET>NEIGH) message header and attributes.
+ */
+
+#ifndef _NETLINK_ROUTE_NEIGH_H_
+#define _NETLINK_ROUTE_NEIGH_H_
+
+/* Base header for all of the relevant messages */
+struct ndmsg {
+ uint8_t ndm_family; /* address family of the neighbor entry */
+ uint8_t ndm_pad1; /* padding */
+ uint16_t ndm_pad2; /* padding */
+ int32_t ndm_ifindex; /* interface index */
+ uint16_t ndm_state; /* NUD_* state bits */
+ uint8_t ndm_flags; /* NTF_* flags */
+ uint8_t ndm_type; /* NOTE(review): meaning not documented here */
+};
+
+/*
+ * Attribute types for the neighbor messages. __NDA_MAX is a sentinel placed
+ * after the last attribute; NDA_MAX is derived from it.
+ */
+enum {
+ NDA_UNSPEC,
+ NDA_DST, /* binary: neigh l3 address */
+ NDA_LLADDR, /* binary: neigh link-level address */
+ NDA_CACHEINFO, /* binary, struct nda_cacheinfo */
+ NDA_PROBES, /* u32: number of probes sent */
+ NDA_VLAN, /* upper 802.1Q tag */
+ NDA_PORT, /* not supported */
+ NDA_VNI, /* not supported */
+ NDA_IFINDEX, /* interface index */
+ NDA_MASTER, /* not supported */
+ NDA_LINK_NETNSID, /* not supported */
+ NDA_SRC_VNI, /* not supported */
+ NDA_PROTOCOL, /* XXX */
+ NDA_NH_ID, /* not supported */
+ NDA_FDB_EXT_ATTRS, /* not supported */
+ NDA_FLAGS_EXT, /* u32: ndm_flags */
+ NDA_NDM_STATE_MASK, /* XXX */
+ NDA_NDM_FLAGS_MASK, /* XXX */
+ NDA_FREEBSD, /* nested: FreeBSD-specific */
+ __NDA_MAX
+};
+
+#define NDA_MAX (__NDA_MAX - 1)
+
+/* FreeBSD-specific attributes; presumably nested inside NDA_FREEBSD (confirm) */
+enum {
+ NDAF_UNSPEC,
+ NDAF_NEXT_STATE_TS, /* (u32) seconds from time_uptime when moving to the next state */
+};
+
+
+/* ndm_flags / NDA_FLAGS_EXT */
+#define NTF_USE 0x0001 /* XXX */
+#define NTF_SELF 0x0002 /* local station */
+#define NTF_MASTER 0x0004 /* XXX */
+#define NTF_PROXY 0x0008 /* proxy entry */
+#define NTF_EXT_LEARNED 0x0010 /* not used */
+#define NTF_OFFLOADED 0x0020 /* not used */
+#define NTF_STICKY 0x0040 /* permanent entry */
+#define NTF_ROUTER 0x0080 /* dst indicated itself as a router */
+/* start of NDA_FLAGS_EXT */
+#define NTF_EXT_MANAGED 0x0100 /* not used */
+
+/* ndm_state */
+#define NUD_INCOMPLETE 0x01 /* No lladdr, address resolution in progress */
+#define NUD_REACHABLE 0x02 /* reachable & recently resolved */
+#define NUD_STALE 0x04 /* has lladdr but it's stale */
+#define NUD_DELAY 0x08 /* has lladdr, is stale, probes delayed */
+#define NUD_PROBE 0x10 /* has lladdr, is stale, probes sent */
+#define NUD_FAILED 0x20 /* unused */
+
+/* Dummy states */
+#define NUD_NOARP 0x40 /* not used */
+#define NUD_PERMANENT 0x80 /* not flushed */
+#define NUD_NONE 0x00
+
+/* Payload of the NDA_CACHEINFO attribute */
+struct nda_cacheinfo {
+ uint32_t ndm_confirmed; /* seconds since ARP/ND was received from neigh */
+ uint32_t ndm_used; /* seconds since last used (not provided) */
+ uint32_t ndm_updated; /* seconds since state was updated last */
+ uint32_t ndm_refcnt; /* number of references held */
+};
+
+#endif
diff --git a/sys/netlink/route/nexthop.c b/sys/netlink/route/nexthop.c
new file mode 100644
index 000000000000..30aa3dd72534
--- /dev/null
+++ b/sys/netlink/route/nexthop.c
@@ -0,0 +1,1123 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_route.h"
+#include <sys/types.h>
+#include <sys/ck.h>
+#include <sys/epoch.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_utils.h>
+
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <netinet6/scope6_var.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_route.h>
+#include <netlink/route/route_var.h>
+
+#define DEBUG_MOD_NAME nl_nhop
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_INFO);
+
+/*
+ * This file contains the logic to maintain kernel nexthops and
+ * nexthop groups based on the data provided by the user.
+ *
+ * Kernel stores (nearly) all of the routing data in the nexthops,
+ * including the prefix-specific flags (NHF_HOST and NHF_DEFAULT).
+ *
+ * Netlink API provides higher-level abstraction for the user. Each
+ * user-created nexthop may map to multiple kernel nexthops.
+ *
+ * The following variations require separate kernel nexthop to be
+ * created:
+ * * prefix flags (NHF_HOST, NHF_DEFAULT)
+ * * using IPv6 gateway for IPv4 routes
+ * * different fibnum
+ *
+ * These kernel nexthops have the lifetime bound to the lifetime of
+ * the user_nhop object. They are not collected until user requests
+ * to delete the created user_nhop.
+ *
+ */
+/*
+ * User-created nexthop or nexthop group, stored in the unhop hash table.
+ * Entries with un_fibfam == 0 are the "master" records created by
+ * RTM_NEWNEXTHOP: they carry either un_nhop_src (plain nexthop) or
+ * un_nhgrp_src/un_nhgrp_count (group). Entries with a non-zero un_fibfam
+ * are clones created by nl_find_nhop(); they carry un_nhop and are chained
+ * to their master through un_nextchild.
+ */
+struct user_nhop {
+ uint32_t un_idx; /* Userland-provided index */
+ uint32_t un_fibfam; /* fibnum+af(as highest byte) */
+ uint8_t un_protocol; /* protocol that installed the record */
+ struct nhop_object *un_nhop; /* "production" nexthop */
+ struct nhop_object *un_nhop_src; /* nexthop to copy from */
+ struct weightened_nhop *un_nhgrp_src; /* nexthops for nhg */
+ uint32_t un_nhgrp_count; /* number of nexthops */
+ struct user_nhop *un_next; /* next item in hash chain */
+ struct user_nhop *un_nextchild; /* master -> children */
+ struct epoch_context un_epoch_ctx; /* epoch ctl helper */
+};
+
+/* produce hash value for an object */
+#define unhop_hash_obj(_obj) (hash_unhop(_obj))
+/* compare two objects */
+#define unhop_cmp(_one, _two) (cmp_unhop(_one, _two))
+/* next object accessor */
+#define unhop_next(_obj) (_obj)->un_next
+
+CHT_SLIST_DEFINE(unhop, struct user_nhop);
+
+/* Per-VNET container of user nexthops: the hash table and the lock guarding it */
+struct unhop_ctl {
+	struct unhop_head un_head;
+	struct rmlock un_lock;
+};
+#define UN_LOCK_INIT(_ctl) rm_init(&(_ctl)->un_lock, "unhop_ctl")
+/* Declares the tracker that UN_RLOCK()/UN_RUNLOCK() expect in scope */
+#define UN_TRACKER struct rm_priotracker un_tracker
+#define UN_RLOCK(_ctl) rm_rlock(&((_ctl)->un_lock), &un_tracker)
+#define UN_RUNLOCK(_ctl) rm_runlock(&((_ctl)->un_lock), &un_tracker)
+
+/*
+ * No trailing semicolon: the caller supplies it, so the macros stay safe
+ * inside unbraced if/else statements.
+ */
+#define UN_WLOCK(_ctl) rm_wlock(&(_ctl)->un_lock)
+#define UN_WUNLOCK(_ctl) rm_wunlock(&(_ctl)->un_lock)
+
+VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL;
+#define V_un_ctl VNET(un_ctl)
+
+static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size);
+static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b);
+static unsigned int hash_unhop(const struct user_nhop *obj);
+
+static void destroy_unhop(struct user_nhop *unhop);
+static struct nhop_object *clone_unhop(const struct user_nhop *unhop,
+ uint32_t fibnum, int family, int nh_flags);
+
+/*
+ * Hash callback: returns non-zero when both objects have the same
+ * userland index and the same fibnum/family key.
+ */
+static int
+cmp_unhop(const struct user_nhop *a, const struct user_nhop *b)
+{
+	if (a->un_idx != b->un_idx)
+		return (0);
+	if (a->un_fibfam != b->un_fibfam)
+		return (0);
+	return (1);
+}
+
+/*
+ * Hash callback: calculate hash of an object
+ */
+static unsigned int
+hash_unhop(const struct user_nhop *obj)
+{
+	unsigned int hash = obj->un_idx;
+
+	/* Mix in the fibnum/family key so clones land in their own slots */
+	hash ^= obj->un_fibfam;
+	return (hash);
+}
+
+#define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0)
+
+/*
+ * Factory interface for creating matching kernel nexthops/nexthop groups
+ *
+ * @fibnum: fibnum nexthop will be used in
+ * @family: upper family nexthop will be used in
+ * @uidx: userland nexthop index used to create the nexthop
+ * @nh_flags: desired nexthop prefix flags
+ * @perror: pointer to store error to
+ *
+ * Returns referenced nexthop linked to @fibnum/@family rib on success.
+ * Returns NULL and stores a non-zero errno in @perror on failure.
+ */
+struct nhop_object *
+nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx,
+    int nh_flags, int *perror)
+{
+	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
+	UN_TRACKER;
+
+	if (__predict_false(ctl == NULL)) {
+		/* No user nexthops were ever created in this VNET */
+		*perror = ESRCH;
+		return (NULL);
+	}
+
+	struct user_nhop key= {
+		.un_idx = uidx,
+		.un_fibfam = fibnum | ((uint32_t)family) << 24,
+	};
+	struct user_nhop *unhop;
+
+	nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT);
+
+	if (__predict_false(family == 0)) {
+		/* family 0 would alias the master record key (un_fibfam == 0) */
+		*perror = EINVAL;
+		return (NULL);
+	}
+
+	UN_RLOCK(ctl);
+	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
+	if (unhop != NULL) {
+		struct nhop_object *nh = unhop->un_nhop;
+		/* Release (not re-acquire) the read lock before returning */
+		UN_RUNLOCK(ctl);
+		*perror = 0;
+		nhop_ref_any(nh);
+		return (nh);
+	}
+
+	/*
+	 * Exact nexthop not found. Search for template nexthop to clone from.
+	 */
+	key.un_fibfam = 0;
+	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
+	if (unhop == NULL) {
+		UN_RUNLOCK(ctl);
+		*perror = ESRCH;
+		return (NULL);
+	}
+
+	UN_RUNLOCK(ctl);
+
+	/* Create entry to insert first */
+	struct user_nhop *un_new, *un_tmp;
+	un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
+	if (un_new == NULL) {
+		*perror = ENOMEM;
+		return (NULL);
+	}
+	un_new->un_idx = uidx;
+	un_new->un_fibfam = fibnum | ((uint32_t)family) << 24;
+
+	/* Relying on epoch to protect unhop here */
+	un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags);
+	if (un_new->un_nhop == NULL) {
+		free(un_new, M_NETLINK);
+		*perror = ENOMEM;
+		return (NULL);
+	}
+
+	/* Insert back and report */
+	UN_WLOCK(ctl);
+
+	/* First, find template record once again */
+	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
+	if (unhop == NULL) {
+		/* Someone deleted the nexthop during the call */
+		UN_WUNLOCK(ctl);
+		*perror = ESRCH;
+		destroy_unhop(un_new);
+		return (NULL);
+	}
+
+	/* Second, check the direct match */
+	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp);
+	struct nhop_object *nh;
+	if (un_tmp != NULL) {
+		/* Another thread already created the desired nexthop, use it */
+		nh = un_tmp->un_nhop;
+	} else {
+		/* Finally, insert the new nexthop and link it to the primary */
+		nh = un_new->un_nhop;
+		CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new);
+		un_new->un_nextchild = unhop->un_nextchild;
+		unhop->un_nextchild = un_new;
+		/* Ownership moved to the hash table; nothing to destroy below */
+		un_new = NULL;
+		NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh);
+	}
+
+	UN_WUNLOCK(ctl);
+
+	if (un_new != NULL)
+		destroy_unhop(un_new);
+
+	*perror = 0;
+	nhop_ref_any(nh);
+	return (nh);
+}
+
+/*
+ * Looks up the master record (un_fibfam == 0) for userland index @uidx.
+ * Returns the record or NULL. The read lock is dropped before returning.
+ */
+static struct user_nhop *
+nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx)
+{
+	struct user_nhop *found = NULL;
+	struct user_nhop lookup = { .un_idx = uidx };
+	UN_TRACKER;
+
+	UN_RLOCK(ctl);
+	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &lookup, found);
+	UN_RUNLOCK(ctl);
+
+	return (found);
+}
+
+#define MAX_STACK_NHOPS 4
+/*
+ * Creates the kernel nexthop (or nexthop group, with ROUTE_MPATH) matching
+ * the template @unhop for the given @fibnum/@family and prefix @nh_flags.
+ *
+ * Returns the resulting object or NULL on failure.
+ */
+static struct nhop_object *
+clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags)
+{
+#ifdef ROUTE_MPATH
+	const struct weightened_nhop *wn;
+	struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS];
+	uint32_t num_nhops, num_resolved;
+#endif
+	struct nhop_object *nh = NULL;
+	int error = 0;
+
+	if (unhop->un_nhop_src != NULL) {
+		IF_DEBUG_LEVEL(LOG_DEBUG2) {
+			char nhbuf[NHOP_PRINT_BUFSIZE];
+			nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf));
+			FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src,
+			    "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum,
+			    family, nh_flags);
+		}
+		nh = nhop_alloc(fibnum, AF_UNSPEC);
+		if (nh == NULL)
+			return (NULL);
+		nhop_copy(nh, unhop->un_nhop_src);
+		/* Check that nexthop gateway is compatible with the new family */
+		if (!nhop_set_upper_family(nh, family)) {
+			nhop_free(nh);
+			return (NULL);
+		}
+		nhop_set_uidx(nh, unhop->un_idx);
+		nhop_set_pxtype_flag(nh, nh_flags);
+		return (nhop_get_nhop(nh, &error));
+	}
+#ifdef ROUTE_MPATH
+	wn = unhop->un_nhgrp_src;
+	num_nhops = unhop->un_nhgrp_count;
+
+	/* A group template without members cannot be cloned */
+	if (wn == NULL || num_nhops == 0)
+		return (NULL);
+
+	if (num_nhops > MAX_STACK_NHOPS) {
+		wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT);
+		if (wn_new == NULL)
+			return (NULL);
+	} else
+		wn_new = wn_base;
+
+	/* Resolve every member to a referenced nexthop for fibnum/family */
+	for (num_resolved = 0; num_resolved < num_nhops; num_resolved++) {
+		uint32_t uidx = nhop_get_uidx(wn[num_resolved].nh);
+		MPASS(uidx != 0);
+		wn_new[num_resolved].nh = nl_find_nhop(fibnum, family, uidx,
+		    nh_flags, &error);
+		if (error == 0 && wn_new[num_resolved].nh == NULL)
+			error = ESRCH;
+		if (error != 0)
+			break;
+		wn_new[num_resolved].weight = wn[num_resolved].weight;
+	}
+
+	if (error == 0) {
+		struct rib_head *rh = nhop_get_rh(wn_new[0].nh);
+		struct nhgrp_object *nhg;
+
+		error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg);
+		nh = (struct nhop_object *)nhg;
+	} else {
+		/* Drop the references taken on the members resolved so far */
+		for (uint32_t i = 0; i < num_resolved; i++)
+			nhop_free_any(wn_new[i].nh);
+	}
+
+	if (wn_new != wn_base)
+		free(wn_new, M_TEMP);
+#endif
+	return (nh);
+}
+
+/*
+ * Releases everything owned by @unhop: the references on its nexthops,
+ * the group member array (set by newnhg() for group records) and the
+ * record itself.
+ */
+static void
+destroy_unhop(struct user_nhop *unhop)
+{
+	if (unhop->un_nhop != NULL)
+		nhop_free_any(unhop->un_nhop);
+	if (unhop->un_nhop_src != NULL)
+		nhop_free_any(unhop->un_nhop_src);
+	/* NULL for non-group records; free(9) ignores NULL */
+	free(unhop->un_nhgrp_src, M_NETLINK);
+	free(unhop, M_NETLINK);
+}
+
+/*
+ * Epoch callback: recovers the user_nhop embedding @ctx and destroys it.
+ */
+static void
+destroy_unhop_epoch(epoch_context_t ctx)
+{
+	destroy_unhop(__containerof(ctx, struct user_nhop, un_epoch_ctx));
+}
+
+/*
+ * Picks a random unused userland index for auto-assignment.
+ * Returns 0 if none of the probed indexes was free.
+ */
+static uint32_t
+find_spare_uidx(struct unhop_ctl *ctl)
+{
+	struct user_nhop *hit, probe = {};
+	uint32_t spare = 0;
+	int attempts_left = 16;
+	UN_TRACKER;
+
+	UN_RLOCK(ctl);
+	/* This should return spare uid with 75% of 65k used in ~99/100 cases */
+	while (spare == 0 && attempts_left-- > 0) {
+		probe.un_idx = (arc4random() % 65536) + 65536 * 4;
+		CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &probe, hit);
+		/* Candidates are never 0, so a non-zero value ends the search */
+		if (hit == NULL)
+			spare = probe.un_idx;
+	}
+	UN_RUNLOCK(ctl);
+
+	return (spare);
+}
+
+
+/*
+ * Actual netlink code
+ */
+/* State shared by the handlers in this file while producing replies */
+struct netlink_walkargs {
+ struct nl_writer *nw; /* writer the messages are emitted to */
+ struct nlmsghdr hdr; /* template header passed to the dump_*() helpers */
+ struct nlpcb *so; /* NOTE(review): not referenced in this file */
+ int family; /* NOTE(review): not referenced in this file */
+ int error; /* only read, by rtnl_handle_getnhop() */
+ int count; /* NOTE(review): not referenced in this file */
+ int dumped; /* NOTE(review): not referenced in this file */
+};
+#define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem
+
+/*
+ * Writes a message describing the nexthop group @unhop to @nw, using @hdr
+ * as the header template: NHA_ID, NHA_GROUP_TYPE and an NHA_GROUP attribute
+ * holding one struct nexthop_grp per member.
+ *
+ * Returns true on success; on failure aborts the message and returns false.
+ */
+static bool
+dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr,
+    struct nl_writer *nw)
+{
+
+	if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
+		goto enomem;
+
+	struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
+	/* The reservation may fail; do not write through a NULL header */
+	ENOMEM_IF_NULL(nhm);
+	nhm->nh_family = AF_UNSPEC;
+	nhm->nh_scope = 0;
+	nhm->nh_protocol = unhop->un_protocol;
+	nhm->nh_flags = 0;
+
+	nlattr_add_u32(nw, NHA_ID, unhop->un_idx);
+	nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH);
+
+	struct weightened_nhop *wn = unhop->un_nhgrp_src;
+	uint32_t num_nhops = unhop->un_nhgrp_count;
+	/* TODO: a better API? */
+	int nla_len = sizeof(struct nlattr);
+	nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp));
+	struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr);
+	if (nla == NULL)
+		goto enomem;
+	nla->nla_type = NHA_GROUP;
+	nla->nla_len = nla_len;
+	/* The member array starts right after the attribute header */
+	struct nexthop_grp *grps = (struct nexthop_grp *)(nla + 1);
+	for (uint32_t i = 0; i < num_nhops; i++) {
+		grps[i].id = nhop_get_uidx(wn[i].nh);
+		grps[i].weight = wn[i].weight;
+		grps[i].resvd1 = 0;
+		grps[i].resvd2 = 0;
+	}
+
+	if (nlmsg_end(nw))
+		return (true);
+enomem:
+	NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory");
+	nlmsg_abort(nw);
+	return (false);
+}
+
+/*
+ * Writes a message describing nexthop @nh to @nw, using @hdr as the header
+ * template. A non-zero @uidx is emitted as NHA_ID; with @uidx == 0 the
+ * kernel index, upper family and fibnum are added to the NHA_FREEBSD nest
+ * instead.
+ *
+ * Returns true on success; on failure aborts the message and returns false.
+ */
+static bool
+dump_nhop(const struct nhop_object *nh, uint32_t uidx, struct nlmsghdr *hdr,
+ struct nl_writer *nw)
+{
+ if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
+ goto enomem;
+
+ struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
+ ENOMEM_IF_NULL(nhm);
+ nhm->nh_family = nhop_get_neigh_family(nh);
+ nhm->nh_scope = 0; // XXX: what's that?
+ nhm->nh_protocol = nhop_get_origin(nh);
+ nhm->nh_flags = 0;
+
+ if (uidx != 0)
+ nlattr_add_u32(nw, NHA_ID, uidx);
+ /* Blackhole nexthops are reported without interface or gateway */
+ if (nh->nh_flags & NHF_BLACKHOLE) {
+ nlattr_add_flag(nw, NHA_BLACKHOLE);
+ goto done;
+ }
+ nlattr_add_u32(nw, NHA_OIF, if_getindex(nh->nh_ifp));
+
+ /* Other gateway families produce no NHA_GATEWAY attribute */
+ switch (nh->gw_sa.sa_family) {
+#ifdef INET
+ case AF_INET:
+ nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr);
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ {
+ /* Work on a copy so the scope can be cleared before export */
+ struct in6_addr addr = nh->gw6_sa.sin6_addr;
+ in6_clearscope(&addr);
+ nlattr_add(nw, NHA_GATEWAY, 16, &addr);
+ break;
+ }
+#endif
+ }
+
+ /* The FreeBSD-specific nest is skipped when it cannot be opened (off == 0) */
+ int off = nlattr_add_nested(nw, NHA_FREEBSD);
+ if (off != 0) {
+ nlattr_add_u32(nw, NHAF_AIF, if_getindex(nh->nh_aifp));
+
+ if (uidx == 0) {
+ nlattr_add_u32(nw, NHAF_KID, nhop_get_idx(nh));
+ nlattr_add_u32(nw, NHAF_FAMILY, nhop_get_upper_family(nh));
+ nlattr_add_u32(nw, NHAF_TABLE, nhop_get_fibnum(nh));
+ }
+
+ nlattr_set_len(nw, off);
+ }
+
+done:
+ if (nlmsg_end(nw))
+ return (true);
+enomem:
+ nlmsg_abort(nw);
+ return (false);
+}
+
+/*
+ * Dumps @unhop either as a plain nexthop (when it has un_nhop_src) or as
+ * a nexthop group. The result of the dump helper is not propagated.
+ */
+static void
+dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
+    struct nl_writer *nw)
+{
+	const struct nhop_object *src = unhop->un_nhop_src;
+
+	if (src == NULL) {
+		dump_nhgrp(unhop, hdr, nw);
+		return;
+	}
+	dump_nhop(src, unhop->un_idx, hdr, nw);
+}
+
+/*
+ * Removes the user nexthop @uidx and all of its clones from the hash,
+ * notifies RTNLGRP_NEXTHOP listeners with NL_RTM_DELNEXTHOP and schedules
+ * the removed records for destruction once the current epoch ends.
+ *
+ * Returns 0 on success, ENOENT if @uidx does not exist, or ENOMEM if the
+ * notification could not be allocated (the nexthop is still removed and
+ * freed in that case).
+ */
+static int
+delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx)
+{
+	struct user_nhop *unhop_ret, *unhop_base, *unhop_chain;
+	struct nl_writer nw;
+	struct user_nhop key = { .un_idx = uidx };
+	int error = 0;
+
+	UN_WLOCK(ctl);
+
+	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base);
+
+	if (unhop_base != NULL) {
+		CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret);
+		/*
+		 * Base records keep their nexthop in un_nhop_src (un_nhop is
+		 * only set on clones); group records have neither.
+		 */
+		IF_DEBUG_LEVEL(LOG_DEBUG2) {
+			if (unhop_base->un_nhop_src != NULL) {
+				char nhbuf[NHOP_PRINT_BUFSIZE];
+				nhop_print_buf_any(unhop_base->un_nhop_src, nhbuf,
+				    sizeof(nhbuf));
+				FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop_src,
+				    "removed base nhop %u: %s", uidx, nhbuf);
+			}
+		}
+		/* Unlink all child nexthops as well, keeping the chain intact */
+		unhop_chain = unhop_base->un_nextchild;
+		while (unhop_chain != NULL) {
+			CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain,
+			    unhop_ret);
+			MPASS(unhop_chain == unhop_ret);
+			IF_DEBUG_LEVEL(LOG_DEBUG3) {
+				char nhbuf[NHOP_PRINT_BUFSIZE];
+				nhop_print_buf_any(unhop_chain->un_nhop,
+				    nhbuf, sizeof(nhbuf));
+				FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop,
+				    "removed child nhop %u: %s", uidx, nhbuf);
+			}
+			unhop_chain = unhop_chain->un_nextchild;
+		}
+	}
+
+	UN_WUNLOCK(ctl);
+
+	if (unhop_base == NULL) {
+		NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx);
+		return (ENOENT);
+	}
+
+	/* Report nexthop deletion */
+	struct netlink_walkargs wa = {
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags,
+		.hdr.nlmsg_type = NL_RTM_DELNEXTHOP,
+	};
+
+	if (nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP,
+	    0, false)) {
+		dump_unhop(unhop_base, &wa.hdr, &nw);
+		nlmsg_flush(&nw);
+	} else {
+		NL_LOG(LOG_DEBUG, "error allocating message writer");
+		error = ENOMEM;
+	}
+
+	/*
+	 * The records are already unlinked from the hash, so they must be
+	 * released even when the notification could not be sent.
+	 */
+	while (unhop_base != NULL) {
+		unhop_chain = unhop_base->un_nextchild;
+		NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx);
+		unhop_base = unhop_chain;
+	}
+
+	return (error);
+}
+
+/*
+ * Resizes the unhop hash to @new_size buckets. A @new_size of 0 means
+ * "no resize needed"; allocation failure silently keeps the current table.
+ */
+static void
+consider_resize(struct unhop_ctl *ctl, uint32_t new_size)
+{
+	if (new_size == 0)
+		return;
+
+	size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size);
+	void *buckets = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
+	if (buckets == NULL)
+		return;
+
+	NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size);
+	UN_WLOCK(ctl);
+	{
+		CHT_SLIST_RESIZE(&ctl->un_head, unhop, buckets, new_size);
+	}
+	UN_WUNLOCK(ctl);
+
+	/*
+	 * Presumably CHT_SLIST_RESIZE leaves the retired bucket array in
+	 * the pointer it was given, so this frees the old table (confirm).
+	 */
+	free(buckets, M_NETLINK);
+}
+
+/*
+ * Lazily allocates the per-VNET unhop control structure and publishes it
+ * in V_un_ctl. Safe against concurrent callers: the loser of the publish
+ * race tears down its own copy and uses the winner's.
+ *
+ * Returns true if V_un_ctl is set on return, false on allocation failure.
+ */
+static bool __noinline
+vnet_init_unhops(void)
+{
+	uint32_t num_buckets = 16;
+	size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
+
+	struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK,
+	    M_NOWAIT | M_ZERO);
+	if (ctl == NULL)
+		return (false);
+
+	void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
+	if (ptr == NULL) {
+		free(ctl, M_NETLINK);
+		return (false);
+	}
+	CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets);
+	UN_LOCK_INIT(ctl);
+
+	if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) {
+		/* Lost the race: undo rm_init() before releasing the memory */
+		rm_destroy(&ctl->un_lock);
+		free(ptr, M_NETLINK);
+		free(ctl, M_NETLINK);
+	}
+
+	if (atomic_load_ptr(&V_un_ctl) == NULL)
+		return (false);
+
+	NL_LOG(LOG_NOTICE, "UNHOPS init done");
+
+	return (true);
+}
+
+/*
+ * VNET teardown: unpublishes the unhop control structure, waits for
+ * epoch readers, destroys every remaining user nexthop and releases the
+ * hash table, the lock and the control structure itself.
+ */
+static void
+vnet_destroy_unhops(const void *unused __unused)
+{
+	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
+	struct user_nhop *unhop, *tmp;
+
+	if (ctl == NULL)
+		return;
+	V_un_ctl = NULL;
+
+	/* Wait till all unhop users finish their reads */
+	NET_EPOCH_WAIT();
+
+	UN_WLOCK(ctl);
+	CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) {
+		destroy_unhop(unhop);
+	} CHT_SLIST_FOREACH_SAFE_END;
+	UN_WUNLOCK(ctl);
+
+	/* Pair rm_init() from vnet_init_unhops() before freeing the memory */
+	rm_destroy(&ctl->un_lock);
+	free(ctl->un_head.ptr, M_NETLINK);
+	free(ctl, M_NETLINK);
+}
+VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY,
+ vnet_destroy_unhops, NULL);
+
+/*
+ * Attribute callback for NHA_GROUP: checks that the payload is a non-empty
+ * whole number of struct nexthop_grp entries and stores the attribute
+ * pointer itself in @target.
+ *
+ * Returns 0 on success or EINVAL on a malformed payload.
+ */
+static int
+nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+	/* Verify attribute correctness */
+	struct nexthop_grp *grp = NLA_DATA(nla);
+	int data_len = NLA_DATA_LEN(nla);
+
+	int count = data_len / sizeof(*grp);
+	if (count == 0 || (count * sizeof(*grp) != (size_t)data_len)) {
+		NL_LOG(LOG_DEBUG, "Invalid length for NHA_GROUP: %d", data_len);
+		return (EINVAL);
+	}
+
+	*((struct nlattr **)target) = nla;
+	return (0);
+}
+
+/*
+ * If @sa is an IPv6 link-local address and @ifp is known, embeds the
+ * interface index of @ifp as the scope of the address. No-op otherwise
+ * and in kernels without INET6.
+ */
+static void
+set_scope6(struct sockaddr *sa, if_t ifp)
+{
+#ifdef INET6
+	if (sa == NULL || ifp == NULL)
+		return;
+	if (sa->sa_family != AF_INET6)
+		return;
+
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;
+
+	if (!IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
+		return;
+	in6_set_unicast_scopeid(&sin6->sin6_addr, if_getindex(ifp));
+#endif
+}
+
+/* Parsed representation of a nexthop request, filled by nhmsg_parser */
+struct nl_parsed_nhop {
+ uint32_t nha_id; /* NHA_ID */
+ uint8_t nha_blackhole; /* NHA_BLACKHOLE flag */
+ uint8_t nha_groups; /* NHA_GROUPS flag */
+ uint8_t nhaf_knhops; /* NHAF_KNHOPS flag */
+ uint8_t nhaf_family; /* NHAF_FAMILY */
+ struct ifnet *nha_oif; /* NHA_OIF, resolved to an interface */
+ struct sockaddr *nha_gw; /* NHA_GATEWAY */
+ struct nlattr *nha_group; /* raw NHA_GROUP attribute */
+ uint8_t nh_family; /* nhmsg nh_family */
+ uint8_t nh_protocol; /* nhmsg nh_protocol */
+ uint32_t nhaf_table; /* NHAF_TABLE */
+ uint32_t nhaf_kid; /* NHAF_KID */
+ uint32_t nhaf_aif; /* NHAF_AIF */
+};
+
+/* Offsets into the wire header (_IN) and into struct nl_parsed_nhop (_OUT) */
+#define _IN(_field) offsetof(struct nhmsg, _field)
+#define _OUT(_field) offsetof(struct nl_parsed_nhop, _field)
+/* Attributes accepted inside the NHA_FREEBSD nest */
+static struct nlattr_parser nla_p_nh_fbsd[] = {
+ { .type = NHAF_KNHOPS, .off = _OUT(nhaf_knhops), .cb = nlattr_get_flag },
+ { .type = NHAF_TABLE, .off = _OUT(nhaf_table), .cb = nlattr_get_uint32 },
+ { .type = NHAF_FAMILY, .off = _OUT(nhaf_family), .cb = nlattr_get_uint8 },
+ { .type = NHAF_KID, .off = _OUT(nhaf_kid), .cb = nlattr_get_uint32 },
+ { .type = NHAF_AIF, .off = _OUT(nhaf_aif), .cb = nlattr_get_uint32 },
+};
+NL_DECLARE_ATTR_PARSER(nh_fbsd_parser, nla_p_nh_fbsd);
+
+/* Fixed header fields copied from struct nhmsg */
+static const struct nlfield_parser nlf_p_nh[] = {
+ { .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 },
+ { .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 },
+};
+
+/* Top-level attributes of the nexthop messages */
+static const struct nlattr_parser nla_p_nh[] = {
+ { .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 },
+ { .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg },
+ { .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag },
+ { .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp },
+ { .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip },
+ { .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag },
+ { .type = NHA_FREEBSD, .arg = &nh_fbsd_parser, .cb = nlattr_get_nested },
+};
+#undef _IN
+#undef _OUT
+
+/*
+ * Post-parse hook of nhmsg_parser: applies the interface scope to a
+ * link-local IPv6 gateway. Always reports success.
+ */
+static bool
+post_p_nh(void *_attrs, struct nl_pstate *npt)
+{
+	struct nl_parsed_nhop *parsed = _attrs;
+
+	set_scope6(parsed->nha_gw, parsed->nha_oif);
+	return (true);
+}
+NL_DECLARE_PARSER_EXT(nhmsg_parser, struct nhmsg, NULL, nlf_p_nh, nla_p_nh, post_p_nh);
+
+/* A nexthop may join a group only if it has the NHF_GATEWAY flag set */
+static bool
+eligible_nhg(const struct nhop_object *nh)
+{
+	return ((nh->nh_flags & NHF_GATEWAY) != 0);
+}
+
+/*
+ * Fills @unhop with the group described by attrs->nha_group. Every member
+ * must be an existing, non-group user nexthop that passes eligible_nhg().
+ *
+ * Returns 0 on success, ENOMEM, ESRCH (unknown member) or ENOTSUP
+ * (nested group or ineligible member).
+ */
+static int
+newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
+{
+	struct nexthop_grp *members = NLA_DATA(attrs->nha_group);
+	int num_members = NLA_DATA_LEN(attrs->nha_group) / sizeof(*members);
+	struct weightened_nhop *wn;
+	int error;
+
+	wn = malloc(sizeof(*wn) * num_members, M_NETLINK, M_NOWAIT | M_ZERO);
+	if (wn == NULL)
+		return (ENOMEM);
+
+	for (int i = 0; i < num_members; i++) {
+		struct user_nhop *member;
+
+		member = nl_find_base_unhop(ctl, members[i].id);
+		if (member == NULL) {
+			NL_LOG(LOG_DEBUG, "unable to find uidx %u", members[i].id);
+			error = ESRCH;
+			goto fail;
+		}
+		if (member->un_nhop_src == NULL) {
+			NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported",
+			    members[i].id);
+			error = ENOTSUP;
+			goto fail;
+		}
+		if (!eligible_nhg(member->un_nhop_src)) {
+			NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible",
+			    members[i].id);
+			error = ENOTSUP;
+			goto fail;
+		}
+		/*
+		 * TODO: consider more rigid eligibility checks:
+		 * restrict nexthops with the same gateway
+		 */
+		wn[i].nh = member->un_nhop_src;
+		wn[i].weight = members[i].weight;
+	}
+
+	/* Hand the member array over to the group record */
+	unhop->un_nhgrp_src = wn;
+	unhop->un_nhgrp_count = num_members;
+	return (0);
+
+fail:
+	free(wn, M_NETLINK);
+	return (error);
+}
+
+/*
+ * Sets nexthop @nh gateway specified by @gw.
+ * If gateway is IPv6 link-local, alters @gw to include scopeid equal to
+ * @ifp ifindex.
+ * Returns 0 on success or errno.
+ */
+int
+nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, if_t ifp,
+    struct nl_pstate *npt)
+{
+#ifdef INET6
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)gw;
+
+	/* Link-local gateways are only meaningful together with an interface */
+	if (gw->sa_family == AF_INET6 &&
+	    IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
+		if (ifp == NULL) {
+			NLMSG_REPORT_ERR_MSG(npt, "interface not set");
+			return (EINVAL);
+		}
+		in6_set_unicast_scopeid(&sin6->sin6_addr, if_getindex(ifp));
+	}
+#endif
+	nhop_set_gw(nh, gw, true);
+	return (0);
+}
+
+/*
+ * Builds the template kernel nexthop for a plain (non-group) user nexthop
+ * from the parsed request @attrs and stores it in unhop->un_nhop_src.
+ * Non-blackhole nexthops require both NHA_GATEWAY and NHA_OIF.
+ *
+ * Returns 0 on success or an errno; request errors are also reported
+ * through @npt.
+ */
+static int
+newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt)
+{
+	const bool blackhole = (attrs->nha_blackhole != 0);
+	struct ifaddr *src_ifa = NULL;
+	struct nhop_object *nh;
+	int family, error;
+
+	if (!blackhole) {
+		if (attrs->nha_gw == NULL) {
+			NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY");
+			return (EINVAL);
+		}
+		if (attrs->nha_oif == NULL) {
+			NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF");
+			return (EINVAL);
+		}
+		src_ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif);
+		if (src_ifa == NULL) {
+			NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP");
+			return (EINVAL);
+		}
+	}
+
+	/* The gateway family wins over the header family when both exist */
+	if (attrs->nha_gw != NULL)
+		family = attrs->nha_gw->sa_family;
+	else
+		family = attrs->nh_family;
+
+	nh = nhop_alloc(RT_DEFAULT_FIB, family);
+	if (nh == NULL) {
+		NL_LOG(LOG_DEBUG, "Unable to allocate nexthop");
+		return (ENOMEM);
+	}
+	nhop_set_uidx(nh, attrs->nha_id);
+	nhop_set_origin(nh, attrs->nh_protocol);
+
+	if (blackhole) {
+		nhop_set_blackhole(nh, NHF_BLACKHOLE);
+	} else {
+		error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt);
+		if (error != 0) {
+			nhop_free(nh);
+			return (error);
+		}
+		nhop_set_transmit_ifp(nh, attrs->nha_oif);
+		nhop_set_src(nh, src_ifa);
+	}
+
+	error = nhop_get_unlinked(nh);
+	if (error != 0) {
+		NL_LOG(LOG_DEBUG, "unable to finalize nexthop");
+		return (error);
+	}
+
+	IF_DEBUG_LEVEL(LOG_DEBUG2) {
+		char nhbuf[NHOP_PRINT_BUFSIZE];
+		nhop_print_buf(nh, nhbuf, sizeof(nhbuf));
+		NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf);
+	}
+
+	unhop->un_nhop_src = nh;
+	return (0);
+}
+
+/*
+ * RTM_NEWNEXTHOP handler: parses the request, creates a master user_nhop
+ * (plain nexthop or group), inserts it into the unhop hash, notifies
+ * RTNLGRP_NEXTHOP listeners and resizes the hash if needed.
+ *
+ * Returns 0 on success or an errno (EEXIST if the index is already used).
+ */
+static int
+rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
+ struct nl_pstate *npt)
+{
+ struct nl_writer nw;
+ struct user_nhop *unhop;
+ int error;
+
+ /* The control structure is created lazily on the first request */
+ if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops()))
+ return (ENOMEM);
+ struct unhop_ctl *ctl = V_un_ctl;
+
+ struct nl_parsed_nhop attrs = {};
+ error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class
+ * citizen.
+ */
+ if (attrs.nha_id == 0) {
+ attrs.nha_id = find_spare_uidx(ctl);
+ if (attrs.nha_id == 0) {
+ NL_LOG(LOG_DEBUG, "Unable to get spare uidx");
+ return (ENOSPC);
+ }
+ }
+
+ NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? if_getindex(attrs.nha_oif) : 0);
+
+ unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
+ if (unhop == NULL) {
+ NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop");
+ return (ENOMEM);
+ }
+ unhop->un_idx = attrs.nha_id;
+ unhop->un_protocol = attrs.nh_protocol;
+
+ /* The presence of NHA_GROUP selects group creation */
+ if (attrs.nha_group)
+ error = newnhg(ctl, &attrs, unhop);
+ else
+ error = newnhop(&attrs, unhop, npt);
+
+ if (error != 0) {
+ free(unhop, M_NETLINK);
+ return (error);
+ }
+
+ UN_WLOCK(ctl);
+ /* Check if uidx already exists */
+ struct user_nhop *tmp = NULL;
+ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp);
+ if (tmp != NULL) {
+ UN_WUNLOCK(ctl);
+ NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id);
+ destroy_unhop(unhop);
+ return (EEXIST);
+ }
+ CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop);
+ /* Sampled under the lock; acted upon by consider_resize() below */
+ uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head);
+ UN_WUNLOCK(ctl);
+
+ /* Report addition of the new nexthop */
+ struct netlink_walkargs wa = {
+ .hdr.nlmsg_pid = hdr->nlmsg_pid,
+ .hdr.nlmsg_seq = hdr->nlmsg_seq,
+ .hdr.nlmsg_flags = hdr->nlmsg_flags,
+ .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
+ };
+
+ /*
+ * NOTE(review): on writer failure ENOMEM is returned although the
+ * nexthop is already inserted, and the resize check is skipped.
+ */
+ if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP,
+ 0, false)) {
+ NL_LOG(LOG_DEBUG, "error allocating message writer");
+ return (ENOMEM);
+ }
+
+ dump_unhop(unhop, &wa.hdr, &nw);
+ nlmsg_flush(&nw);
+
+ consider_resize(ctl, num_buckets_new);
+
+ return (0);
+}
+
+/*
+ * RTM_DELNEXTHOP handler: deletes the user nexthop named by NHA_ID.
+ *
+ * Returns ESRCH if no user nexthops were ever created, EINVAL if NHA_ID
+ * is missing, otherwise the result of delete_unhop().
+ */
+static int
+rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
+    struct nl_pstate *npt)
+{
+	struct nl_parsed_nhop attrs = {};
+	struct unhop_ctl *ctl;
+	int error;
+
+	ctl = atomic_load_ptr(&V_un_ctl);
+	if (__predict_false(ctl == NULL))
+		return (ESRCH);
+
+	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	if (attrs.nha_id == 0) {
+		NL_LOG(LOG_DEBUG, "NHA_ID not set");
+		return (EINVAL);
+	}
+
+	return (delete_unhop(ctl, hdr, attrs.nha_id));
+}
+
+/*
+ * Dump filter: checks @unhop against the optional criteria in @attrs
+ * (index, "groups only", output interface). Unset criteria match anything.
+ */
+static bool
+match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
+{
+	bool id_ok = (attrs->nha_id == 0 || unhop->un_idx == attrs->nha_id);
+	bool group_ok = (attrs->nha_groups == 0 || unhop->un_nhgrp_src != NULL);
+	bool oif_ok = (attrs->nha_oif == NULL ||
+	    (unhop->un_nhop_src != NULL &&
+	    unhop->un_nhop_src->nh_ifp == attrs->nha_oif));
+
+	return (id_ok && group_ok && oif_ok);
+}
+
+/*
+ * RTM_GETNEXTHOP handler. Depending on the request it
+ *  - returns the single user nexthop named by NHA_ID,
+ *  - returns the single kernel nexthop named by NHAF_KID,
+ *  - dumps all kernel nexthops of a table/family (NHAF_KNHOPS), or
+ *  - dumps all master user nexthops matching the filter attributes.
+ * Replies are written to npt->nw as NL_RTM_NEWNEXTHOP messages.
+ */
+static int
+rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
+ struct nl_pstate *npt)
+{
+ struct user_nhop *unhop;
+ UN_TRACKER;
+ int error;
+
+ struct nl_parsed_nhop attrs = {};
+ error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
+ if (error != 0)
+ return (error);
+
+ struct netlink_walkargs wa = {
+ .nw = npt->nw,
+ .hdr.nlmsg_pid = hdr->nlmsg_pid,
+ .hdr.nlmsg_seq = hdr->nlmsg_seq,
+ .hdr.nlmsg_flags = hdr->nlmsg_flags,
+ .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
+ };
+
+ if (attrs.nha_id != 0) {
+ /* Single user nexthop lookup */
+ struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
+ struct user_nhop key = { .un_idx = attrs.nha_id };
+
+ if (__predict_false(ctl == NULL))
+ return (ESRCH);
+
+ NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id);
+ UN_RLOCK(ctl);
+ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
+ UN_RUNLOCK(ctl);
+
+ /* NOTE(review): unhop is dumped after the read lock is dropped */
+ if (unhop == NULL)
+ return (ESRCH);
+ dump_unhop(unhop, &wa.hdr, wa.nw);
+ return (0);
+ } else if (attrs.nhaf_kid != 0) {
+ /* Single kernel nexthop lookup by kernel index */
+ struct nhop_iter iter = {
+ .fibnum = attrs.nhaf_table,
+ .family = attrs.nhaf_family,
+ };
+ int error = ESRCH;
+
+ NL_LOG(LOG_DEBUG2, "START table %u family %d", attrs.nhaf_table, attrs.nhaf_family);
+ for (struct nhop_object *nh = nhops_iter_start(&iter); nh;
+ nh = nhops_iter_next(&iter)) {
+ NL_LOG(LOG_DEBUG3, "get %u", nhop_get_idx(nh));
+ if (nhop_get_idx(nh) == attrs.nhaf_kid) {
+ dump_nhop(nh, 0, &wa.hdr, wa.nw);
+ error = 0;
+ break;
+ }
+ }
+ nhops_iter_stop(&iter);
+ return (error);
+ } else if (attrs.nhaf_knhops) {
+ /* Dump of all kernel nexthops in the given table/family */
+ struct nhop_iter iter = {
+ .fibnum = attrs.nhaf_table,
+ .family = attrs.nhaf_family,
+ };
+
+ NL_LOG(LOG_DEBUG2, "DUMP table %u family %d", attrs.nhaf_table, attrs.nhaf_family);
+ wa.hdr.nlmsg_flags |= NLM_F_MULTI;
+ for (struct nhop_object *nh = nhops_iter_start(&iter); nh;
+ nh = nhops_iter_next(&iter)) {
+ dump_nhop(nh, 0, &wa.hdr, wa.nw);
+ }
+ nhops_iter_stop(&iter);
+ } else {
+ /* Dump of the master user nexthops matching the filter */
+ struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
+
+ if (__predict_false(ctl == NULL))
+ return (ESRCH);
+
+ NL_LOG(LOG_DEBUG2, "DUMP unhops");
+ UN_RLOCK(ctl);
+ wa.hdr.nlmsg_flags |= NLM_F_MULTI;
+ CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) {
+ if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop))
+ dump_unhop(unhop, &wa.hdr, wa.nw);
+ } CHT_SLIST_FOREACH_END;
+ UN_RUNLOCK(ctl);
+ }
+
+ /* wa.error is never assigned in this function, so this always runs */
+ if (wa.error == 0) {
+ if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr))
+ return (ENOMEM);
+ }
+ return (0);
+}
+
+/*
+ * Handlers for the nexthop message types; rtnl_nexthops_init() passes this
+ * table to rtnl_register_messages(). NEWNEXTHOP and DELNEXTHOP set .priv to
+ * PRIV_NET_ROUTE, while GETNEXTHOP leaves .priv unset.
+ */
+static const struct rtnl_cmd_handler cmd_handlers[] = {
+ {
+ .cmd = NL_RTM_NEWNEXTHOP,
+ .name = "RTM_NEWNEXTHOP",
+ .cb = &rtnl_handle_newnhop,
+ .priv = PRIV_NET_ROUTE,
+ },
+ {
+ .cmd = NL_RTM_DELNEXTHOP,
+ .name = "RTM_DELNEXTHOP",
+ .cb = &rtnl_handle_delnhop,
+ .priv = PRIV_NET_ROUTE,
+ },
+ {
+ .cmd = NL_RTM_GETNEXTHOP,
+ .name = "RTM_GETNEXTHOP",
+ .cb = &rtnl_handle_getnhop,
+ }
+};
+
+/* Parsers handed to NL_VERIFY_PARSERS() by rtnl_nexthops_init() */
+static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser, &nh_fbsd_parser };
+
+/*
+ * Sets up the nexthop part of the route netlink code: runs
+ * NL_VERIFY_PARSERS() over all_parsers and registers cmd_handlers.
+ */
+void
+rtnl_nexthops_init(void)
+{
+ NL_VERIFY_PARSERS(all_parsers);
+ rtnl_register_messages(cmd_handlers, nitems(cmd_handlers));
+}
diff --git a/sys/netlink/route/nexthop.h b/sys/netlink/route/nexthop.h
new file mode 100644
index 000000000000..81a1c9ac88f8
--- /dev/null
+++ b/sys/netlink/route/nexthop.h
@@ -0,0 +1,113 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * NEXTHOP-related (RTM_<NEW|DEL|GET>NEXTHOP) message header and attributes.
+ */
+
+#ifndef _NETLINK_ROUTE_NEXTHOP_H_
+#define _NETLINK_ROUTE_NEXTHOP_H_
+
+ /* Base header of the RTM_NEWNEXTHOP, RTM_DELNEXTHOP and RTM_GETNEXTHOP messages */
+ struct nhmsg {
+ unsigned char nh_family; /* transport family */
+ unsigned char nh_scope; /* ignored on RX, filled by kernel */
+ unsigned char nh_protocol; /* Routing protocol that installed nh */
+ unsigned char resvd; /* presumably reserved/padding -- TODO confirm */
+ unsigned int nh_flags; /* RTNH_F_* flags from route.h */
+ };
+
+ /*
+ * Attribute types used in nexthop messages.
+ * __NHA_MAX is a sentinel following the last attribute, so NHA_MAX is
+ * the highest defined attribute type (NHA_FREEBSD).
+ */
+ enum {
+ NHA_UNSPEC,
+ NHA_ID, /* u32: nexthop userland index, auto-assigned if 0 */
+ NHA_GROUP, /* binary: array of struct nexthop_grp */
+ NHA_GROUP_TYPE, /* u16: set to NEXTHOP_GRP_TYPE */
+ NHA_BLACKHOLE, /* flag: nexthop used to blackhole packets */
+ NHA_OIF, /* u32: transmit ifindex */
+ NHA_GATEWAY, /* network: IPv4/IPv6 gateway addr */
+ NHA_ENCAP_TYPE, /* not supported */
+ NHA_ENCAP, /* not supported */
+ NHA_GROUPS, /* flag: match nexthop groups */
+ NHA_MASTER, /* not supported */
+ NHA_FDB, /* not supported */
+ NHA_RES_GROUP, /* not supported */
+ NHA_RES_BUCKET, /* not supported */
+ NHA_FREEBSD, /* nested: FreeBSD-specific attributes */
+ __NHA_MAX,
+ };
+ #define NHA_MAX (__NHA_MAX - 1)
+
+ /*
+ * FreeBSD-specific nexthop attribute types; presumably these are the
+ * attributes nested inside NHA_FREEBSD -- TODO confirm against the parser.
+ * NOTE(review): NHAF_KGOUPS is spelled without the 'R'; it is a public
+ * identifier, so renaming it would affect users of this header.
+ */
+ enum {
+ NHAF_UNSPEC,
+ NHAF_KNHOPS, /* flag: dump kernel nexthops */
+ NHAF_KGOUPS, /* flag: dump kernel nexthop groups */
+ NHAF_TABLE, /* u32: rtable id */
+ NHAF_FAMILY, /* u32: upper family */
+ NHAF_KID, /* u32: kernel nexthop index */
+ NHAF_AIF, /* u32: source interface address */
+ };
+
+/*
+ * Attributes that can be used as filters:
+ * NHA_ID (nexthop or group), NHA_OIF, NHA_GROUPS
+ */
+
+/*
+ * NHA_GROUP: array of the following structures.
+ * If attribute is set, the only other valid attributes are
+ * NHA_ID and NHA_GROUP_TYPE.
+ * NHA_RES_GROUP and NHA_RES_BUCKET are not supported yet
+ */
+ /* One member of a nexthop group, as carried in the NHA_GROUP array */
+ struct nexthop_grp {
+ uint32_t id; /* nexthop userland index */
+ uint8_t weight; /* weight of this nexthop */
+ uint8_t resvd1; /* presumably reserved/padding -- TODO confirm */
+ uint16_t resvd2; /* presumably reserved/padding -- TODO confirm */
+ };
+
+ /*
+ * NHA_GROUP_TYPE: u16
+ * __NEXTHOP_GRP_TYPE_MAX is a sentinel, so NEXTHOP_GRP_TYPE_MAX is the
+ * highest defined group type (NEXTHOP_GRP_TYPE_RES).
+ */
+ enum {
+ NEXTHOP_GRP_TYPE_MPATH, /* default nexthop group */
+ NEXTHOP_GRP_TYPE_RES, /* resilient nexthop group */
+ __NEXTHOP_GRP_TYPE_MAX,
+ };
+ #define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1)
+
+
+ /*
+ * NHA_RES_GROUP
+ * Attribute types for the NHA_RES_GROUP attribute, which the NHA_* list
+ * marks as not supported. NHA_RES_GROUP_PAD aliases NHA_RES_GROUP_UNSPEC.
+ * NHA_RES_GROUP_MAX is the highest defined type.
+ */
+ enum {
+ NHA_RES_GROUP_UNSPEC,
+ NHA_RES_GROUP_PAD = NHA_RES_GROUP_UNSPEC,
+ NHA_RES_GROUP_BUCKETS,
+ NHA_RES_GROUP_IDLE_TIMER,
+ NHA_RES_GROUP_UNBALANCED_TIMER,
+ NHA_RES_GROUP_UNBALANCED_TIME,
+ __NHA_RES_GROUP_MAX,
+ };
+ #define NHA_RES_GROUP_MAX (__NHA_RES_GROUP_MAX - 1)
+
+#endif
diff --git a/sys/netlink/route/route.h b/sys/netlink/route/route.h
new file mode 100644
index 000000000000..60c3a22718a3
--- /dev/null
+++ b/sys/netlink/route/route.h
@@ -0,0 +1,368 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Route-related (RTM_<NEW|DEL|GET>ROUTE) message header and attributes.
+ */
+
+#ifndef _NETLINK_ROUTE_ROUTE_H_
+#define _NETLINK_ROUTE_ROUTE_H_
+
+ /*
+ * Base header of the RTM_NEWROUTE, RTM_DELROUTE and RTM_GETROUTE messages.
+ * rtm_table is a single byte; the NL_RTA_TABLE attribute carries the
+ * table id as a u32.
+ */
+ struct rtmsg {
+ unsigned char rtm_family; /* address family */
+ unsigned char rtm_dst_len; /* Prefix length */
+ unsigned char rtm_src_len; /* Source prefix length (not used) */
+ unsigned char rtm_tos; /* Type of service (not used) */
+ unsigned char rtm_table; /* rtable id */
+ unsigned char rtm_protocol; /* Routing protocol id (RTPROT_) */
+ unsigned char rtm_scope; /* Route distance (RT_SCOPE_) */
+ unsigned char rtm_type; /* Route type (RTN_) */
+ unsigned rtm_flags; /* Route flags (RTM_F_) */
+ };
+
+/*
+ * RFC 3549, 3.1.1, route type (rtm_type field).
+ */
+enum {
+ RTN_UNSPEC,
+ RTN_UNICAST, /* Unicast route */
+ RTN_LOCAL, /* Accept locally (not supported) */
+ RTN_BROADCAST, /* Accept locally as broadcast, send as broadcast */
+ RTN_ANYCAST, /* Accept locally as broadcast, but send as unicast */
+ RTN_MULTICAST, /* Multicast route */
+ RTN_BLACKHOLE, /* Drop traffic towards destination */
+ RTN_UNREACHABLE, /* Destination is unreachable */
+ RTN_PROHIBIT, /* Administratively prohibited */
+ RTN_THROW, /* Not in this table (not supported) */
+ RTN_NAT, /* Translate this address (not supported) */
+ RTN_XRESOLVE, /* Use external resolver (not supported) */
+ __RTN_MAX,
+};
+#define RTN_MAX (__RTN_MAX - 1)
+
+/*
+ * RFC 3549, 3.1.1, protocol (Identifies what/who added the route).
+ * Values larger than RTPROT_STATIC(4) are not interpreted by the
+ * kernel, they are just for user information.
+ */
+#define RTPROT_UNSPEC 0
+#define RTPROT_REDIRECT 1 /* Route installed by ICMP redirect */
+#define RTPROT_KERNEL 2 /* Route installed by kernel */
+#define RTPROT_BOOT 3 /* Route installed during boot */
+#define RTPROT_STATIC 4 /* Route installed by administrator */
+
+#define RTPROT_GATED 8
+#define RTPROT_RA 9
+#define RTPROT_MRT 1
+#define RTPROT_ZEBRA 11
+#define RTPROT_BIRD 12
+#define RTPROT_DNROUTED 13
+#define RTPROT_XORP 14
+#define RTPROT_NTK 15
+#define RTPROT_DHCP 16
+#define RTPROT_MROUTED 17
+#define RTPROT_KEEPALIVED 18
+#define RTPROT_BABEL 42
+#define RTPROT_OPENR 99
+#define RTPROT_BGP 186
+#define RTPROT_ISIS 187
+#define RTPROT_OSPF 188
+#define RTPROT_RIP 189
+#define RTPROT_EIGRP 192
+
+/*
+ * RFC 3549 3.1.1 Route scope (valid distance to destination).
+ *
+ * The values between RT_SCOPE_UNIVERSE(0) and RT_SCOPE_SITE(200)
+ * are available to the user.
+ */
+enum rt_scope_t {
+ RT_SCOPE_UNIVERSE = 0,
+ /* User defined values */
+ RT_SCOPE_SITE = 200,
+ RT_SCOPE_LINK = 253,
+ RT_SCOPE_HOST = 254,
+ RT_SCOPE_NOWHERE = 255
+};
+
+/*
+ * RFC 3549 3.1.1 Route flags (rtm_flags).
+ * Is a composition of RTNH_F flags (0x1..0x40 range), RTM_F flags (below)
+ * and per-protocol (IPv4/IPv6) flags.
+ */
+#define RTM_F_NOTIFY 0x00000100 /* not supported */
+#define RTM_F_CLONED 0x00000200 /* not supported */
+#define RTM_F_EQUALIZE 0x00000400 /* not supported */
+#define RTM_F_PREFIX 0x00000800 /* not supported */
+#define RTM_F_LOOKUP_TABLE 0x00001000 /* not supported */
+#define RTM_F_FIB_MATCH 0x00002000 /* not supported */
+#define RTM_F_OFFLOAD 0x00004000 /* not supported */
+#define RTM_F_TRAP 0x00008000 /* not supported */
+#define RTM_F_OFFLOAD_FAILED 0x20000000 /* not supported */
+
+/* Compatibility handling helpers */
+#ifndef _KERNEL
+#define NL_RTM_HDRLEN ((int)sizeof(struct rtmsg))
+#define RTM_RTA(_rtm) ((struct rtattr *)((char *)(_rtm) + NL_RTM_HDRLEN))
+#define RTM_PAYLOAD(_hdr) NLMSG_PAYLOAD((_hdr), NL_RTM_HDRLEN)
+#endif
+
+/*
+ * Routing table identifiers.
+ * FreeBSD route table numbering starts from 0, where 0 is a valid default
+ * routing table. Indicating "all tables" via netlink can be done by not
+ * including RTA_TABLE attribute and keeping rtm_table=0 (compatibility) or
+ * setting RTA_TABLE value to RT_TABLE_UNSPEC.
+ */
+#define RT_TABLE_MAIN 0 /* RT_DEFAULT_FIB */
+#define RT_TABLE_UNSPEC 0xFFFFFFFF /* RT_ALL_FIBS */
+
+ /*
+ * Route message attribute types.
+ * Three FreeBSD-specific attributes share the numeric value of an
+ * attribute marked "not supported / deprecated":
+ * NL_RTA_KNH_ID == NL_RTA_PROTOINFO (10),
+ * NL_RTA_WEIGHT == NL_RTA_SESSION (13),
+ * NL_RTA_RTFLAGS == NL_RTA_MP_ALGO (14).
+ * __RTA_MAX follows NL_RTA_NH_ID, so NL_RTA_MAX evaluates to 30.
+ */
+ enum rtattr_type_t {
+ NL_RTA_UNSPEC,
+ NL_RTA_DST = 1, /* binary, IPv4/IPv6 destination */
+ NL_RTA_SRC = 2, /* binary, preferred source address */
+ NL_RTA_IIF = 3, /* not supported */
+ NL_RTA_OIF = 4, /* u32, transmit ifindex */
+ NL_RTA_GATEWAY = 5, /* binary: IPv4/IPv6 gateway */
+ NL_RTA_PRIORITY = 6, /* not supported */
+ NL_RTA_PREFSRC = 7, /* not supported */
+ NL_RTA_METRICS = 8, /* nested, list of NL_RTAX* attrs */
+ NL_RTA_MULTIPATH = 9, /* binary, array of struct rtnexthop */
+ NL_RTA_PROTOINFO = 10, /* not supported / deprecated */
+ NL_RTA_KNH_ID = 10, /* u32, FreeBSD specific, kernel nexthop index */
+ NL_RTA_FLOW = 11, /* not supported */
+ NL_RTA_CACHEINFO = 12, /* not supported */
+ NL_RTA_SESSION = 13, /* not supported / deprecated */
+ NL_RTA_WEIGHT = 13, /* u32, FreeBSD specific, path weight */
+ NL_RTA_MP_ALGO = 14, /* not supported / deprecated */
+ NL_RTA_RTFLAGS = 14, /* u32, FreeBSD specific, path flags (RTF_)*/
+ NL_RTA_TABLE = 15, /* u32, fibnum */
+ NL_RTA_MARK = 16, /* not supported */
+ NL_RTA_MFC_STATS = 17, /* not supported */
+ NL_RTA_VIA = 18, /* binary, struct rtvia */
+ NL_RTA_NEWDST = 19, /* not supported */
+ NL_RTA_PREF = 20, /* not supported */
+ NL_RTA_ENCAP_TYPE = 21, /* not supported */
+ NL_RTA_ENCAP = 22, /* not supported */
+ NL_RTA_EXPIRES = 23, /* u32, seconds till expiration */
+ NL_RTA_PAD = 24, /* not supported */
+ NL_RTA_UID = 25, /* not supported */
+ NL_RTA_TTL_PROPAGATE = 26, /* not supported */
+ NL_RTA_IP_PROTO = 27, /* not supported */
+ NL_RTA_SPORT = 28, /* not supported */
+ NL_RTA_DPORT = 29, /* not supported */
+ NL_RTA_NH_ID = 30, /* u32, nexthop/nexthop group index */
+ __RTA_MAX
+ };
+ #define NL_RTA_MAX (__RTA_MAX - 1)
+
+/*
+ * Attributes that can be used as filters:
+ *
+ */
+
+ #ifndef _KERNEL
+ /*
+ * RTA_* space has clashes with rtsock namespace.
+ * Use NL_RTA_ prefix in the kernel and map to
+ * RTA_ for userland.
+ *
+ * NOTE(review): NL_RTA_WEIGHT and NL_RTA_RTFLAGS have no RTA_ alias in
+ * this list -- confirm whether that is intentional.
+ */
+ #define RTA_UNSPEC NL_RTA_UNSPEC
+ #define RTA_DST NL_RTA_DST
+ #define RTA_SRC NL_RTA_SRC
+ #define RTA_IIF NL_RTA_IIF
+ #define RTA_OIF NL_RTA_OIF
+ #define RTA_GATEWAY NL_RTA_GATEWAY
+ #define RTA_PRIORITY NL_RTA_PRIORITY
+ #define RTA_PREFSRC NL_RTA_PREFSRC
+ #define RTA_METRICS NL_RTA_METRICS
+ #define RTA_MULTIPATH NL_RTA_MULTIPATH
+ #define RTA_PROTOINFO NL_RTA_PROTOINFO
+ #define RTA_KNH_ID NL_RTA_KNH_ID
+ #define RTA_FLOW NL_RTA_FLOW
+ #define RTA_CACHEINFO NL_RTA_CACHEINFO
+ #define RTA_SESSION NL_RTA_SESSION
+ #define RTA_MP_ALGO NL_RTA_MP_ALGO
+ #define RTA_TABLE NL_RTA_TABLE
+ #define RTA_MARK NL_RTA_MARK
+ #define RTA_MFC_STATS NL_RTA_MFC_STATS
+ #define RTA_VIA NL_RTA_VIA
+ #define RTA_NEWDST NL_RTA_NEWDST
+ #define RTA_PREF NL_RTA_PREF
+ #define RTA_ENCAP_TYPE NL_RTA_ENCAP_TYPE
+ #define RTA_ENCAP NL_RTA_ENCAP
+ #define RTA_EXPIRES NL_RTA_EXPIRES
+ #define RTA_PAD NL_RTA_PAD
+ #define RTA_UID NL_RTA_UID
+ #define RTA_TTL_PROPAGATE NL_RTA_TTL_PROPAGATE
+ #define RTA_IP_PROTO NL_RTA_IP_PROTO
+ #define RTA_SPORT NL_RTA_SPORT
+ #define RTA_DPORT NL_RTA_DPORT
+ #define RTA_NH_ID NL_RTA_NH_ID
+ #define RTA_MAX NL_RTA_MAX
+ #endif
+
+ /* route attribute header */
+ struct rtattr {
+ unsigned short rta_len; /* attribute length; NL_RTA_DATA_LEN() subtracts the header size from it */
+ unsigned short rta_type; /* attribute type */
+ };
+
+ /*
+ * Kernel-side attribute helpers, built on the generic NL_ITEM_* macros:
+ * NL_RTA_HDRLEN - size of struct rtattr
+ * NL_RTA_DATA_LEN - rta_len minus the header size
+ * NL_RTA_DATA(_CONST) - NL_ITEM_DATA(_CONST) applied with the header size
+ */
+ #define NL_RTA_ALIGN_SIZE NL_ITEM_ALIGN_SIZE
+ #define NL_RTA_ALIGN NL_ITEM_ALIGN
+ #define NL_RTA_HDRLEN ((int)sizeof(struct rtattr))
+ #define NL_RTA_DATA_LEN(_rta) ((int)((_rta)->rta_len - NL_RTA_HDRLEN))
+ #define NL_RTA_DATA(_rta) NL_ITEM_DATA(_rta, NL_RTA_HDRLEN)
+ #define NL_RTA_DATA_CONST(_rta) NL_ITEM_DATA_CONST(_rta, NL_RTA_HDRLEN)
+
+ /*
+ * Compatibility attribute handling helpers (userland only).
+ * RTA_LENGTH(len) is the header size plus len; RTA_SPACE(len) is that
+ * value rounded up by RTA_ALIGN(). RTA_PAYLOAD() is rta_len minus the
+ * header size. RTA_OK()/RTA_NEXT() are thin wrappers over
+ * NL_ITEM_OK()/NL_ITEM_ITER().
+ */
+ #ifndef _KERNEL
+ #define RTA_ALIGNTO NL_RTA_ALIGN_SIZE
+ #define RTA_ALIGN(_len) NL_RTA_ALIGN(_len)
+ #define _RTA_LEN(_rta) ((int)(_rta)->rta_len)
+ #define _RTA_ALIGNED_LEN(_rta) RTA_ALIGN(_RTA_LEN(_rta))
+ #define RTA_OK(_rta, _len) NL_ITEM_OK(_rta, _len, NL_RTA_HDRLEN, _RTA_LEN)
+ #define RTA_NEXT(_rta, _len) NL_ITEM_ITER(_rta, _len, _RTA_ALIGNED_LEN)
+ #define RTA_LENGTH(_len) (NL_RTA_HDRLEN + (_len))
+ #define RTA_SPACE(_len) RTA_ALIGN(RTA_LENGTH(_len))
+ #define RTA_DATA(_rta) NL_RTA_DATA(_rta)
+ #define RTA_PAYLOAD(_rta) ((int)(_RTA_LEN(_rta) - NL_RTA_HDRLEN))
+ #endif
+
+ /* RTA attribute headers */
+
+ /*
+ * RTA_VIA
+ * Address family followed by the raw address bytes. rtvia_addr is a
+ * zero-length trailing array, so sizeof(struct rtvia) does not include
+ * the address; the attribute length determines how many bytes follow.
+ */
+ struct rtvia {
+ sa_family_t rtvia_family;
+ uint8_t rtvia_addr[0];
+ };
+
+/*
+ * RTA_METRICS is a nested attribute, consisting of a list of
+ * TLVs with types defined below.
+ */
+ enum {
+ NL_RTAX_UNSPEC,
+ NL_RTAX_LOCK = 1, /* not supported */
+ NL_RTAX_MTU = 2, /* desired path MTU */
+ NL_RTAX_WINDOW = 3, /* not supported */
+ NL_RTAX_RTT = 4, /* not supported */
+ NL_RTAX_RTTVAR = 5, /* not supported */
+ NL_RTAX_SSTHRESH = 6, /* not supported */
+ NL_RTAX_CWND = 7, /* not supported */
+ NL_RTAX_ADVMSS = 8, /* not supported */
+ NL_RTAX_REORDERING = 9, /* not supported */
+ NL_RTAX_HOPLIMIT = 10, /* not supported */
+ NL_RTAX_INITCWND = 11, /* not supporrted */
+ NL_RTAX_FEATURES = 12, /* not supported */
+ NL_RTAX_RTO_MIN = 13, /* not supported */
+ NL_RTAX_INITRWND = 14, /* not supported */
+ NL_RTAX_QUICKACK = 15, /* not supported */
+ NL_RTAX_CC_ALGO = 16, /* not supported */
+ NL_RTAX_FASTOPEN_NO_COOKIE = 17, /* not supported */
+ __NL_RTAX_MAX
+};
+#define NL_RTAX_MAX (__NL_RTAX_MAX - 1)
+
+ /*
+ * Feature bit flags; presumably the value space of NL_RTAX_FEATURES
+ * (which is marked as not supported) -- TODO confirm.
+ * RTAX_FEATURE_MASK is the union of all four bits.
+ */
+ #define RTAX_FEATURE_ECN (1 << 0)
+ #define RTAX_FEATURE_SACK (1 << 1)
+ #define RTAX_FEATURE_TIMESTAMP (1 << 2)
+ #define RTAX_FEATURE_ALLFRAG (1 << 3)
+
+ #define RTAX_FEATURE_MASK \
+ (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | RTAX_FEATURE_TIMESTAMP | \
+ RTAX_FEATURE_ALLFRAG)
+
+#ifndef _KERNEL
+
+/*
+ * RTAX_* space clashes with rtsock namespace.
+ * Use NL_RTAX_ prefix in the kernel and map to
+ * RTAX_ for userland.
+ */
+#define RTAX_UNSPEC NL_RTAX_UNSPEC
+#define RTAX_LOCK NL_RTAX_LOCK
+#define RTAX_MTU NL_RTAX_MTU
+#define RTAX_WINDOW NL_RTAX_WINDOW
+#define RTAX_RTT NL_RTAX_RTT
+#define RTAX_RTTVAR NL_RTAX_RTTVAR
+#define RTAX_SSTHRESH NL_RTAX_SSTHRESH
+#define RTAX_CWND NL_RTAX_CWND
+#define RTAX_ADVMSS NL_RTAX_ADVMSS
+#define RTAX_REORDERING NL_RTAX_REORDERING
+#define RTAX_HOPLIMIT NL_RTAX_HOPLIMIT
+#define RTAX_INITCWND NL_RTAX_INITCWND
+#define RTAX_FEATURES NL_RTAX_FEATURES
+#define RTAX_RTO_MIN NL_RTAX_RTO_MIN
+#define RTAX_INITRWND NL_RTAX_INITRWND
+#define RTAX_QUICKACK NL_RTAX_QUICKACK
+#define RTAX_CC_ALGO NL_RTAX_CC_ALGO
+#define RTAX_FASTOPEN_NO_COOKIE NL_RTAX_FASTOPEN_NO_COOKIE
+#endif
+
+/*
+ * RTA_MULTIPATH consists of an array of rtnexthop structures.
+ * Each rtnexthop structure contains RTA_GATEWAY or RTA_VIA
+ * attribute following the header.
+ */
+struct rtnexthop {
+ unsigned short rtnh_len;
+ unsigned char rtnh_flags;
+ unsigned char rtnh_hops; /* nexthop weight */
+ int rtnh_ifindex;
+};
+
+ /* rtnh_flags */
+ #define RTNH_F_DEAD 0x01 /* not supported */
+ #define RTNH_F_PERVASIVE 0x02 /* not supported */
+ #define RTNH_F_ONLINK 0x04 /* not supported */
+ #define RTNH_F_OFFLOAD 0x08 /* not supported */
+ #define RTNH_F_LINKDOWN 0x10 /* not supported */
+ #define RTNH_F_UNRESOLVED 0x20 /* not supported */
+ #define RTNH_F_TRAP 0x40 /* not supported */
+
+ #define RTNH_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN | \
+ RTNH_F_OFFLOAD | RTNH_F_TRAP)
+
+ /*
+ * Macros to handle nexthops
+ * RTNH_NEXT() advances by the aligned rtnh_len of the current entry;
+ * RTNH_DATA() yields the first rtattr located after the rtnexthop header.
+ * RTNH_LENGTH(len) is the header size plus len; RTNH_SPACE(len) is that
+ * value rounded up by RTNH_ALIGN().
+ */
+ #define RTNH_ALIGNTO NL_ITEM_ALIGN_SIZE
+ #define RTNH_ALIGN(_len) NL_ITEM_ALIGN(_len)
+ #define RTNH_HDRLEN ((int)sizeof(struct rtnexthop))
+ #define _RTNH_LEN(_nh) ((int)(_nh)->rtnh_len)
+ #define _RTNH_ALIGNED_LEN(_nh) RTNH_ALIGN(_RTNH_LEN(_nh))
+ #define RTNH_OK(_nh, _len) NL_ITEM_OK(_nh, _len, RTNH_HDRLEN, _RTNH_LEN)
+ #define RTNH_NEXT(_nh) ((struct rtnexthop *)((char *)(_nh) + _RTNH_ALIGNED_LEN(_nh)))
+ #define RTNH_LENGTH(_len) (RTNH_HDRLEN + (_len))
+ #define RTNH_SPACE(_len) RTNH_ALIGN(RTNH_LENGTH(_len))
+ #define RTNH_DATA(_nh) ((struct rtattr *)NL_ITEM_DATA(_nh, RTNH_HDRLEN))
+
+ /* Minimal message header that carries a single family byte */
+ struct rtgenmsg {
+ unsigned char rtgen_family; /* presumably an address family (AF_*) -- TODO confirm */
+ };
+
+#endif
diff --git a/sys/netlink/route/route_var.h b/sys/netlink/route/route_var.h
new file mode 100644
index 000000000000..b84b34461e35
--- /dev/null
+++ b/sys/netlink/route/route_var.h
@@ -0,0 +1,140 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This file contains definitions shared among NETLINK_ROUTE family
+ */
+#ifndef _NETLINK_ROUTE_ROUTE_VAR_H_
+#define _NETLINK_ROUTE_ROUTE_VAR_H_
+
+#include <sys/priv.h> /* values for priv_check */
+
+struct nlmsghdr;
+struct nlpcb;
+struct nl_pstate;
+
+ /* Signature of a NETLINK_ROUTE message handler; returns an int status */
+ typedef int rtnl_msg_cb_f(struct nlmsghdr *hdr, struct nlpcb *nlp,
+ struct nl_pstate *npt);
+
+ /* Describes one message type handled by the NETLINK_ROUTE family */
+ struct rtnl_cmd_handler {
+ int cmd; /* message type, e.g. NL_RTM_NEWNEXTHOP */
+ const char *name; /* human-readable message name */
+ rtnl_msg_cb_f *cb; /* handler callback */
+ int priv; /* privilege from <sys/priv.h>; presumably checked via priv_check() before cb runs -- TODO confirm */
+ int flags; /* RTNL_F_* flags below */
+ };
+
+ #define RTNL_F_NOEPOCH 0x01 /* Do not enter epoch when handling command */
+ #define RTNL_F_ALLOW_NONVNET_JAIL 0x02 /* Allow command execution inside non-VNET jail */
+
+ /* Registers 'count' handlers from the 'handlers' array */
+ bool rtnl_register_messages(const struct rtnl_cmd_handler *handlers, int count);
+
+/* route.c */
+struct rib_cmd_info;
+void rtnl_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc);
+void rtnl_routes_init(void);
+
+/* neigh.c */
+void rtnl_neighs_init(void);
+void rtnl_neighs_destroy(void);
+
+ /* iface.c */
+ /*
+ * Parsed representation of a link message.
+ * NOTE(review): judging by the prefixes, ifi_* fields presumably come
+ * from the ifinfomsg header and ifla_* fields from IFLA_* attributes --
+ * confirm against the parser in iface.c.
+ */
+ struct nl_parsed_link {
+ char *ifla_group;
+ char *ifla_ifname;
+ char *ifla_cloner;
+ char *ifla_ifalias;
+ struct nlattr *ifla_idata;
+ unsigned short ifi_type;
+ int ifi_index;
+ uint32_t ifla_link;
+ uint32_t ifla_mtu;
+ uint32_t ifi_flags;
+ uint32_t ifi_change;
+ };
+
+ /*
+ * nl_modify_ifp_generic() and nl_store_ifp_cookie() are provided in two
+ * flavours. When NETLINK or NETLINK_MODULE is defined they are inline
+ * wrappers that call the underscore-prefixed implementations directly;
+ * otherwise they are plain external functions reached via netlink_glue.c.
+ */
+ #if defined(NETLINK) || defined(NETLINK_MODULE)
+ /* Provide optimized calls to the functions inside the same linking unit */
+
+ int _nl_modify_ifp_generic(struct ifnet *ifp, struct nl_parsed_link *lattrs,
+ const struct nlattr_bmask *bm, struct nl_pstate *npt);
+ void _nl_store_ifp_cookie(struct nl_pstate *npt, struct ifnet *ifp);
+
+ /* Forwards all arguments to _nl_modify_ifp_generic() and returns its result */
+ static inline int
+ nl_modify_ifp_generic(struct ifnet *ifp, struct nl_parsed_link *lattrs,
+ const struct nlattr_bmask *bm, struct nl_pstate *npt)
+ {
+ return (_nl_modify_ifp_generic(ifp, lattrs, bm, npt));
+ }
+
+ /* Forwards both arguments to _nl_store_ifp_cookie() */
+ static inline void
+ nl_store_ifp_cookie(struct nl_pstate *npt, struct ifnet *ifp)
+ {
+ _nl_store_ifp_cookie(npt, ifp);
+ }
+ #else
+ /* Provide access to the functions via netlink_glue.c */
+ int nl_modify_ifp_generic(struct ifnet *ifp, struct nl_parsed_link *lattrs,
+ const struct nlattr_bmask *bm, struct nl_pstate *npt);
+ void nl_store_ifp_cookie(struct nl_pstate *npt, struct ifnet *ifp);
+ #endif /* defined(NETLINK) || defined(NETLINK_MODULE) */
+
+
+ /*
+ * Per-cloner interface callbacks: create an interface from parsed link
+ * attributes, modify an existing ifp, and dump cloner-specific data for
+ * an ifp into a writer. create/modify/dump all return an int status.
+ */
+ typedef int rtnl_iface_create_f(struct nl_parsed_link *lattrs,
+ const struct nlattr_bmask *bm, struct nlpcb *nlp, struct nl_pstate *npt);
+ typedef int rtnl_iface_modify_f(struct ifnet *ifp, struct nl_parsed_link *lattrs,
+ const struct nlattr_bmask *bm, struct nlpcb *nlp, struct nl_pstate *npt);
+ typedef int rtnl_iface_dump_f(struct ifnet *ifp, struct nl_writer *nw);
+
+ /*
+ * Named set of interface callbacks, linkable into an SLIST through
+ * 'next'. Presumably registered with rtnl_iface_add_cloner() -- TODO
+ * confirm in iface.c.
+ */
+ struct nl_cloner {
+ const char *name;
+ rtnl_iface_create_f *create_f;
+ rtnl_iface_modify_f *modify_f;
+ rtnl_iface_dump_f *dump_f;
+ SLIST_ENTRY(nl_cloner) next;
+ };
+
+extern struct nl_cloner generic_cloner;
+
+void rtnl_ifaces_init(void);
+void rtnl_ifaces_destroy(void);
+void rtnl_iface_add_cloner(struct nl_cloner *cloner);
+void rtnl_iface_del_cloner(struct nl_cloner *cloner);
+void rtnl_handle_ifnet_event(struct ifnet *ifp, int if_change_mask);
+
+/* iface_drivers.c */
+void rtnl_iface_drivers_register(void);
+
+/* nexthop.c */
+void rtnl_nexthops_init(void);
+struct nhop_object *nl_find_nhop(uint32_t fibnum, int family,
+ uint32_t uidx, int nh_flags, int *perror);
+int nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw,
+ struct ifnet *ifp, struct nl_pstate *npt);
+
+
+#endif
diff --git a/sys/netlink/route/rt.c b/sys/netlink/route/rt.c
new file mode 100644
index 000000000000..dcd19b43105c
--- /dev/null
+++ b/sys/netlink/route/rt.c
@@ -0,0 +1,1139 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_route.h"
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <netinet6/scope6_var.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_route.h>
+#include <netlink/route/route_var.h>
+
+#define DEBUG_MOD_NAME nl_route
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_INFO);
+
+ /*
+ * Derives the netlink route type (RTN_*) from the nexthop flags.
+ * NHF_BLACKHOLE is tested before NHF_REJECT; a nexthop carrying neither
+ * flag is reported as RTN_UNICAST.
+ */
+ static unsigned char
+ get_rtm_type(const struct nhop_object *nh)
+ {
+ 	const int flags = nh->nh_flags;
+
+ 	/* Relies on nhg runtime flags being only NHF_MULTIPATH */
+ 	if ((flags & NHF_BLACKHOLE) != 0)
+ 		return (RTN_BLACKHOLE);
+ 	if ((flags & NHF_REJECT) != 0)
+ 		return (RTN_PROHIBIT);
+ 	return (RTN_UNICAST);
+ }
+
+ /*
+ * Returns the RTPROT_* value to report for a nexthop or nexthop group.
+ * An explicitly recorded origin wins (for a group: the group origin,
+ * then the origin of its first member). Without one, the value is
+ * guessed from the route flags: RTF_PROTO1 -> RTPROT_ZEBRA,
+ * RTF_STATIC -> RTPROT_STATIC, anything else -> RTPROT_KERNEL.
+ */
+ static uint8_t
+ nl_get_rtm_protocol(const struct nhop_object *nh)
+ {
+ 	uint8_t proto;
+ 	int rtflags;
+
+ #ifdef ROUTE_MPATH
+ 	if (NH_IS_NHGRP(nh)) {
+ 		const struct nhgrp_object *grp = (const struct nhgrp_object *)nh;
+
+ 		proto = nhgrp_get_origin(grp);
+ 		if (proto != RTPROT_UNSPEC)
+ 			return (proto);
+ 		/* Group has no origin set: continue with its first nexthop */
+ 		nh = grp->nhops[0];
+ 	}
+ #endif
+ 	proto = nhop_get_origin(nh);
+ 	if (proto != RTPROT_UNSPEC)
+ 		return (proto);
+
+ 	/* TODO: remove guesswork once all kernel users fill in origin */
+ 	rtflags = nhop_get_rtflags(nh);
+ 	if (rtflags & RTF_PROTO1)
+ 		return (RTPROT_ZEBRA);
+ 	return ((rtflags & RTF_STATIC) ? RTPROT_STATIC : RTPROT_KERNEL);
+ }
+
+ /*
+ * Translates an rtsock command (RTM_ADD, RTM_CHANGE, RTM_GET, RTM_DELETE)
+ * into the matching netlink message type. Returns 0 for any other command.
+ */
+ static int
+ get_rtmsg_type_from_rtsock(int cmd)
+ {
+ 	if (cmd == RTM_ADD || cmd == RTM_CHANGE || cmd == RTM_GET)
+ 		return (NL_RTM_NEWROUTE);
+ 	if (cmd == RTM_DELETE)
+ 		return (NL_RTM_DELROUTE);
+ 	return (0);
+ }
+
+/*
+ * fibnum heuristics
+ *
+ * if (dump && rtm_table == 0 && !rta_table) RT_ALL_FIBS
+ * msg rtm_table RTA_TABLE result
+ * RTM_GETROUTE/dump 0 - RT_ALL_FIBS
+ * RTM_GETROUTE/dump 1 - 1
+ * RTM_GETROUTE/get 0 - 0
+ *
+ */
+
+ /*
+ * Picks the nexthop a rib command refers to: the old nexthop for
+ * RTM_DELETE, the new one for every other command.
+ */
+ static struct nhop_object *
+ rc_get_nhop(const struct rib_cmd_info *rc)
+ {
+ 	if (rc->rc_cmd == RTM_DELETE)
+ 		return (rc->rc_nh_old);
+ 	return (rc->rc_nh_new);
+ }
+
+ /*
+ * Writes the gateway attribute of nexthop nh:
+ * - IPv4 neighbour: NL_RTA_GATEWAY with the 4-byte address;
+ * - IPv6 neighbour, IPv6 upper family: NL_RTA_GATEWAY with the 16-byte
+ * address, scope cleared;
+ * - IPv6 neighbour, IPv4 upper family: NL_RTA_VIA carrying a struct rtvia
+ * (family byte + 16-byte address, scope cleared).
+ * Nothing is written for any other combination (e.g. AF_LINK, which is
+ * an onlink prefix).
+ */
+ static void
+ dump_rc_nhop_gw(struct nl_writer *nw, const struct nhop_object *nh)
+ {
+ 	const int neigh_af = nhop_get_neigh_family(nh);
+
+ 	if (neigh_af == AF_INET) {
+ 		nlattr_add(nw, NL_RTA_GATEWAY, 4, &nh->gw4_sa.sin_addr);
+ 		return;
+ 	}
+ #ifdef INET6
+ 	if (neigh_af == AF_INET6) {
+ 		const int upper_af = nhop_get_upper_family(nh);
+ 		struct in6_addr gw6;
+
+ 		if (upper_af != AF_INET6 && upper_af != AF_INET)
+ 			return;
+ 		gw6 = nh->gw6_sa.sin6_addr;
+ 		in6_clearscope(&gw6);
+
+ 		if (upper_af == AF_INET6) {
+ 			nlattr_add(nw, NL_RTA_GATEWAY, 16, &gw6);
+ 		} else {
+ 			/* IPv4 over IPv6 */
+ 			char buf[20];
+ 			struct rtvia *via = (struct rtvia *)&buf[0];
+
+ 			via->rtvia_family = AF_INET6;
+ 			memcpy(via->rtvia_addr, &gw6, 16);
+ 			nlattr_add(nw, NL_RTA_VIA, 17, via);
+ 		}
+ 	}
+ #endif
+ }
+
+ /*
+ * Writes an NL_RTA_METRICS attribute containing a single NL_RTAX_MTU
+ * attribute with the nexthop MTU. Both attribute headers and the u32
+ * payload are reserved in one piece; nothing is written when the
+ * reservation fails.
+ */
+ static void
+ dump_rc_nhop_mtu(struct nl_writer *nw, const struct nhop_object *nh)
+ {
+ 	const int inner_len = sizeof(struct nlattr) + sizeof(uint32_t);
+ 	const int outer_len = sizeof(struct nlattr) + inner_len;
+ 	struct nlattr *outer, *inner;
+
+ 	outer = nlmsg_reserve_data(nw, outer_len, struct nlattr);
+ 	if (outer == NULL)
+ 		return;
+ 	outer->nla_type = NL_RTA_METRICS;
+ 	outer->nla_len = outer_len;
+
+ 	/* The nested attribute starts right after the outer header */
+ 	inner = outer + 1;
+ 	inner->nla_type = NL_RTAX_MTU;
+ 	inner->nla_len = inner_len;
+ 	*((uint32_t *)(inner + 1)) = nh->nh_mtu;
+ }
+
+ #ifdef ROUTE_MPATH
+ /*
+ * Writes the attributes describing nexthop group nhg:
+ * NL_RTA_NH_ID (only when the userland index is non-zero), NL_RTA_KNH_ID,
+ * NL_RTA_RTFLAGS taken from the first group member, and a nested
+ * NL_RTA_MULTIPATH holding one struct rtnexthop per member.
+ * Each member entry carries its gateway, its own NL_RTA_RTFLAGS when they
+ * differ from the first member's, and the MTU metric when RTF_FIXEDMTU
+ * is set.
+ * Returns early when nlattr_add_nested() returns 0 or when an rtnexthop
+ * cannot be reserved; in the latter case the nested attribute length is
+ * not finalized by this function.
+ * The rtm argument is not used.
+ */
+ static void
+ dump_rc_nhg(struct nl_writer *nw, const struct nhgrp_object *nhg, struct rtmsg *rtm)
+ {
+ uint32_t uidx = nhgrp_get_uidx(nhg);
+ uint32_t num_nhops;
+ const struct weightened_nhop *wn = nhgrp_get_nhops(nhg, &num_nhops);
+ uint32_t base_rtflags = nhop_get_rtflags(wn[0].nh);
+
+ if (uidx != 0)
+ nlattr_add_u32(nw, NL_RTA_NH_ID, uidx);
+ nlattr_add_u32(nw, NL_RTA_KNH_ID, nhgrp_get_idx(nhg));
+
+ nlattr_add_u32(nw, NL_RTA_RTFLAGS, base_rtflags);
+ int off = nlattr_add_nested(nw, NL_RTA_MULTIPATH);
+ if (off == 0)
+ return;
+
+ for (int i = 0; i < num_nhops; i++) {
+ /* Remember where this entry starts so its length can be filled in later */
+ int nh_off = nlattr_save_offset(nw);
+ struct rtnexthop *rtnh = nlmsg_reserve_object(nw, struct rtnexthop);
+ if (rtnh == NULL)
+ return;
+ rtnh->rtnh_flags = 0;
+ rtnh->rtnh_ifindex = if_getindex(wn[i].nh->nh_ifp);
+ rtnh->rtnh_hops = wn[i].weight;
+ dump_rc_nhop_gw(nw, wn[i].nh);
+ uint32_t rtflags = nhop_get_rtflags(wn[i].nh);
+ if (rtflags != base_rtflags)
+ nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags);
+ if (rtflags & RTF_FIXEDMTU)
+ dump_rc_nhop_mtu(nw, wn[i].nh);
+ /* Re-fetch the header pointer from the saved offset before writing the length */
+ rtnh = nlattr_restore_offset(nw, nh_off, struct rtnexthop);
+ /*
+ * nlattr_add() allocates 4-byte aligned storage, no need to align
+ * length here
+ * */
+ rtnh->rtnh_len = nlattr_save_offset(nw) - nh_off;
+ }
+ nlattr_set_len(nw, off);
+ }
+ #endif
+
+ /*
+ * Writes the nexthop-related attributes for rnd.
+ * With ROUTE_MPATH, nexthop groups are delegated to dump_rc_nhg().
+ * For a plain nexthop the following is written, in this order:
+ * gateway (only with NHF_GATEWAY), NL_RTA_NH_ID (only when the userland
+ * index is non-zero), NL_RTA_KNH_ID, NL_RTA_RTFLAGS, the MTU metric (only
+ * with RTF_FIXEDMTU), NL_RTA_EXPIRES (only when an expiration is set),
+ * NL_RTA_OIF and NL_RTA_WEIGHT (only when the weight differs from
+ * RT_DEFAULT_WEIGHT).
+ * The rtm argument is only forwarded to dump_rc_nhg().
+ */
+ static void
+ dump_rc_nhop(struct nl_writer *nw, const struct route_nhop_data *rnd, struct rtmsg *rtm)
+ {
+ #ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(rnd->rnd_nhop)) {
+ dump_rc_nhg(nw, rnd->rnd_nhgrp, rtm);
+ return;
+ }
+ #endif
+ const struct nhop_object *nh = rnd->rnd_nhop;
+ uint32_t rtflags = nhop_get_rtflags(nh);
+
+ /*
+ * IPv4 over IPv6
+ * ('RTA_VIA', {'family': 10, 'addr': 'fe80::20c:29ff:fe67:2dd'}), ('RTA_OIF', 2),
+ * IPv4 w/ gw
+ * ('RTA_GATEWAY', '172.16.107.131'), ('RTA_OIF', 2)],
+ * Direct route:
+ * ('RTA_OIF', 2)
+ */
+ if (nh->nh_flags & NHF_GATEWAY)
+ dump_rc_nhop_gw(nw, nh);
+
+ uint32_t uidx = nhop_get_uidx(nh);
+ if (uidx != 0)
+ nlattr_add_u32(nw, NL_RTA_NH_ID, uidx);
+ nlattr_add_u32(nw, NL_RTA_KNH_ID, nhop_get_idx(nh));
+ nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags);
+
+ if (rtflags & RTF_FIXEDMTU)
+ dump_rc_nhop_mtu(nw, nh);
+ uint32_t nh_expire = nhop_get_expire(nh);
+ /* Reported value is the expiration time minus the current time_uptime */
+ if (nh_expire > 0)
+ nlattr_add_u32(nw, NL_RTA_EXPIRES, nh_expire - time_uptime);
+
+ /* In any case, fill outgoing interface */
+ nlattr_add_u32(nw, NL_RTA_OIF, if_getindex(nh->nh_ifp));
+
+ if (rnd->rnd_weight != RT_DEFAULT_WEIGHT)
+ nlattr_add_u32(nw, NL_RTA_WEIGHT, rnd->rnd_weight);
+ }
+
+/*
+ * Dumps output from a rib command into an rtmsg
+ */
+
+static int
+dump_px(uint32_t fibnum, const struct nlmsghdr *hdr,
+ const struct rtentry *rt, struct route_nhop_data *rnd,
+ struct nl_writer *nw)
+{
+ struct rtmsg *rtm;
+ int error = 0;
+
+ NET_EPOCH_ASSERT();
+
+ if (!nlmsg_reply(nw, hdr, sizeof(struct rtmsg)))
+ goto enomem;
+
+ int family = rt_get_family(rt);
+ int rtm_off = nlattr_save_offset(nw);
+ rtm = nlmsg_reserve_object(nw, struct rtmsg);
+ rtm->rtm_family = family;
+ rtm->rtm_dst_len = 0;
+ rtm->rtm_src_len = 0;
+ rtm->rtm_tos = 0;
+ if (fibnum < 255)
+ rtm->rtm_table = (unsigned char)fibnum;
+ rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+ rtm->rtm_protocol = nl_get_rtm_protocol(rnd->rnd_nhop);
+ rtm->rtm_type = get_rtm_type(rnd->rnd_nhop);
+
+ nlattr_add_u32(nw, NL_RTA_TABLE, fibnum);
+
+ int plen = 0;
+#if defined(INET) || defined(INET6)
+ uint32_t scopeid;
+#endif
+ switch (family) {
+#ifdef INET
+ case AF_INET:
+ {
+ struct in_addr addr;
+ rt_get_inet_prefix_plen(rt, &addr, &plen, &scopeid);
+ nlattr_add(nw, NL_RTA_DST, 4, &addr);
+ break;
+ }
+#endif
+#ifdef INET6
+ case AF_INET6:
+ {
+ struct in6_addr addr;
+ rt_get_inet6_prefix_plen(rt, &addr, &plen, &scopeid);
+ nlattr_add(nw, NL_RTA_DST, 16, &addr);
+ break;
+ }
+#endif
+ default:
+ FIB_LOG(LOG_NOTICE, fibnum, family, "unsupported rt family: %d", family);
+ error = EAFNOSUPPORT;
+ goto flush;
+ }
+
+ rtm = nlattr_restore_offset(nw, rtm_off, struct rtmsg);
+ if (plen > 0)
+ rtm->rtm_dst_len = plen;
+ dump_rc_nhop(nw, rnd, rtm);
+
+ if (nlmsg_end(nw))
+ return (0);
+enomem:
+ error = ENOMEM;
+flush:
+ nlmsg_abort(nw);
+ return (error);
+}
+
+ /*
+ * Maps an address family to the route notification group:
+ * AF_INET -> RTNLGRP_IPV4_ROUTE, AF_INET6 -> RTNLGRP_IPV6_ROUTE,
+ * anything else -> 0.
+ */
+ static int
+ family_to_group(int family)
+ {
+ 	if (family == AF_INET)
+ 		return (RTNLGRP_IPV4_ROUTE);
+ 	if (family == AF_INET6)
+ 		return (RTNLGRP_IPV6_ROUTE);
+ 	return (0);
+ }
+
+ /*
+ * Reports the routing table change described by rc.
+ * When a group writer for the route's family can be set up, the caller's
+ * hdr is rewritten in place (NLM_F_REPLACE/CREATE/EXCL/APPEND are cleared,
+ * then the message type and flags are set from rc->rc_cmd) and the prefix
+ * is dumped and flushed to the group. The return value of dump_px() is
+ * not checked.
+ * The rtsock route callback is invoked in every case, whether or not the
+ * netlink notification was written.
+ * The nlp argument is not used.
+ */
+ static void
+ report_operation(uint32_t fibnum, struct rib_cmd_info *rc,
+ struct nlpcb *nlp, struct nlmsghdr *hdr)
+ {
+ struct nl_writer nw;
+ uint32_t group_id = family_to_group(rt_get_family(rc->rc_rt));
+
+ if (nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id, 0,
+ false)) {
+ struct route_nhop_data rnd = {
+ .rnd_nhop = rc_get_nhop(rc),
+ .rnd_weight = rc->rc_nh_weight,
+ };
+ /* Drop the request's modifier flags before reusing hdr for the notification */
+ hdr->nlmsg_flags &= ~(NLM_F_REPLACE | NLM_F_CREATE);
+ hdr->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_APPEND);
+ switch (rc->rc_cmd) {
+ case RTM_ADD:
+ hdr->nlmsg_type = NL_RTM_NEWROUTE;
+ hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL;
+ break;
+ case RTM_CHANGE:
+ hdr->nlmsg_type = NL_RTM_NEWROUTE;
+ hdr->nlmsg_flags |= NLM_F_REPLACE;
+ break;
+ case RTM_DELETE:
+ hdr->nlmsg_type = NL_RTM_DELROUTE;
+ break;
+ }
+ dump_px(fibnum, hdr, rc->rc_rt, &rnd, &nw);
+ nlmsg_flush(&nw);
+ }
+
+ rtsock_callback_p->route_f(fibnum, rc);
+ }
+
+/*
+ * If @sa is an IPv6 link-local address and @ifp is known, embeds the
+ * interface index of @ifp as the address scope id. Does nothing for
+ * NULL arguments, other families, or kernels built without INET6.
+ */
+static void
+set_scope6(struct sockaddr *sa, struct ifnet *ifp)
+{
+#ifdef INET6
+	struct sockaddr_in6 *sin6;
+
+	if (sa == NULL || ifp == NULL || sa->sa_family != AF_INET6)
+		return;
+
+	sin6 = (struct sockaddr_in6 *)sa;
+	if (!IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
+		return;
+
+	in6_set_unicast_scopeid(&sin6->sin6_addr, if_getindex(ifp));
+#endif
+}
+
+/*
+ * One parsed nexthop of an RTA_MULTIPATH attribute, filled in by
+ * mpath_parser from a struct rtnexthop and its nested attributes.
+ */
+struct rta_mpath_nh {
+ struct sockaddr *gw; /* from NL_RTA_GATEWAY or NL_RTA_VIA */
+ struct ifnet *ifp; /* resolved from rtnh_ifindex */
+ uint8_t rtnh_flags; /* copied from rtnh_flags */
+ uint8_t rtnh_weight; /* copied from rtnh_hops */
+};
+
+/*
+ * Parser tables for a single struct rtnexthop: attribute parsers store the
+ * gateway, field parsers copy the fixed header members into
+ * struct rta_mpath_nh.
+ */
+#define	_IN(_field)	offsetof(struct rtnexthop, _field)
+#define	_OUT(_field)	offsetof(struct rta_mpath_nh, _field)
+static const struct nlattr_parser nla_p_rtnh[] = {
+	{
+		.type = NL_RTA_GATEWAY,
+		.off = _OUT(gw),
+		.cb = nlattr_get_ip
+	},
+	{
+		.type = NL_RTA_VIA,
+		.off = _OUT(gw),
+		.cb = nlattr_get_ipvia
+	},
+};
+static const struct nlfield_parser nlf_p_rtnh[] = {
+	{
+		.off_in = _IN(rtnh_flags),
+		.off_out = _OUT(rtnh_flags),
+		.cb = nlf_get_u8
+	},
+	{
+		.off_in = _IN(rtnh_hops),
+		.off_out = _OUT(rtnh_weight),
+		.cb = nlf_get_u8
+	},
+	{
+		.off_in = _IN(rtnh_ifindex),
+		.off_out = _OUT(ifp),
+		.cb = nlf_get_ifpz
+	},
+};
+#undef _IN
+#undef _OUT
+
+/*
+ * Post-parse hook for a multipath nexthop: applies the interface scope to
+ * the parsed gateway (see set_scope6()). Always reports success.
+ */
+static bool
+post_p_rtnh(void *_attrs, struct nl_pstate *npt __unused)
+{
+	struct rta_mpath_nh *mpnh = _attrs;
+
+	set_scope6(mpnh->gw, mpnh->ifp);
+
+	return (true);
+}
+NL_DECLARE_PARSER_EXT(mpath_parser, struct rtnexthop, NULL, nlf_p_rtnh, nla_p_rtnh, post_p_rtnh);
+
+/*
+ * Parsed RTA_MULTIPATH attribute: a nexthop count followed by that many
+ * parsed nexthops. The storage for nhops[] is allocated together with the
+ * header by nlattr_get_multipath().
+ */
+struct rta_mpath {
+	u_int num_nhops;
+	/* C99 flexible array member instead of the GNU zero-length array. */
+	struct rta_mpath_nh nhops[];
+};
+
+/*
+ * Attribute callback parsing RTA_MULTIPATH: walks the sequence of
+ * struct rtnexthop records inside @nla, parses each with mpath_parser and
+ * stores the resulting struct rta_mpath pointer into *@target.
+ *
+ * Returns 0 on success, ENOMEM if parser scratch memory is exhausted,
+ * EINVAL for malformed lengths or an empty nexthop list, or the error
+ * reported by nl_parse_header() for an individual nexthop.
+ */
+static int
+nlattr_get_multipath(struct nlattr *nla, struct nl_pstate *npt,
+    const void *arg, void *target)
+{
+	struct rta_mpath *mp;
+	struct rtnexthop *rtnh;
+	uint16_t data_len, len;
+	u_int max_nhops;
+	int error;
+
+	data_len = nla->nla_len - sizeof(struct nlattr);
+	max_nhops = data_len / sizeof(struct rtnexthop);
+
+	mp = npt_alloc(npt, (max_nhops + 2) * sizeof(struct rta_mpath_nh));
+	if (mp == NULL)
+		return (ENOMEM);
+	mp->num_nhops = 0;
+
+	for (rtnh = (struct rtnexthop *)(nla + 1); data_len > 0; ) {
+		struct rta_mpath_nh *mpnh;
+
+		if (__predict_false(rtnh->rtnh_len <= sizeof(*rtnh) ||
+		    rtnh->rtnh_len > data_len)) {
+			NLMSG_REPORT_ERR_MSG(npt, "%s: bad length %u",
+			    __func__, rtnh->rtnh_len);
+			return (EINVAL);
+		}
+		mpnh = &mp->nhops[mp->num_nhops++];
+		error = nl_parse_header(rtnh, rtnh->rtnh_len, &mpath_parser,
+		    npt, mpnh);
+		if (error != 0) {
+			NLMSG_REPORT_ERR_MSG(npt,
+			    "RTA_MULTIPATH: nexthop %u: parse failed",
+			    mp->num_nhops - 1);
+			return (error);
+		}
+		/*
+		 * The last record may omit its trailing alignment padding.
+		 * Clamp the step so the unsigned remaining length cannot
+		 * wrap around and send the loop past the attribute.
+		 */
+		len = NL_ITEM_ALIGN(rtnh->rtnh_len);
+		if (len > data_len)
+			len = data_len;
+		data_len -= len;
+		rtnh = (struct rtnexthop *)((char *)rtnh + len);
+	}
+	if (data_len != 0 || mp->num_nhops == 0) {
+		NLMSG_REPORT_ERR_MSG(npt, "invalid RTA_MULTIPATH attr");
+		return (EINVAL);
+	}
+
+	*((struct rta_mpath **)target) = mp;
+	return (0);
+}
+
+
+/*
+ * Parsed form of a route request, filled in by rtm_parser. The rta_ and
+ * rtax_ members come from netlink attributes; the rtm_ members are copied
+ * from the fixed struct rtmsg header.
+ */
+struct nl_parsed_route {
+ struct sockaddr *rta_dst; /* NL_RTA_DST */
+ struct sockaddr *rta_gw; /* NL_RTA_GATEWAY or NL_RTA_VIA */
+ struct ifnet *rta_oif; /* NL_RTA_OIF */
+ struct rta_mpath *rta_multipath; /* NL_RTA_MULTIPATH */
+ uint32_t rta_table; /* NL_RTA_TABLE */
+ uint32_t rta_rtflags; /* NL_RTA_RTFLAGS */
+ uint32_t rta_nh_id; /* NL_RTA_NH_ID */
+ uint32_t rta_weight; /* NL_RTA_WEIGHT */
+ uint32_t rtax_mtu; /* NL_RTAX_MTU nested in NL_RTA_METRICS */
+ uint8_t rtm_table;
+ uint8_t rtm_family;
+ uint8_t rtm_dst_len;
+ uint8_t rtm_protocol;
+ uint8_t rtm_type;
+ uint32_t rtm_flags;
+};
+
+/*
+ * Parser tables for route messages. _IN() gives the offset of a field in
+ * the wire struct rtmsg, _OUT() the offset of the destination member in
+ * struct nl_parsed_route.
+ */
+#define _IN(_field) offsetof(struct rtmsg, _field)
+#define _OUT(_field) offsetof(struct nl_parsed_route, _field)
+/* Attributes accepted inside the nested NL_RTA_METRICS attribute. */
+static struct nlattr_parser nla_p_rtmetrics[] = {
+ { .type = NL_RTAX_MTU, .off = _OUT(rtax_mtu), .cb = nlattr_get_uint32 },
+};
+NL_DECLARE_ATTR_PARSER(metrics_parser, nla_p_rtmetrics);
+
+/*
+ * Top-level route attributes. NL_RTA_GATEWAY and NL_RTA_VIA both store
+ * into rta_gw.
+ */
+static const struct nlattr_parser nla_p_rtmsg[] = {
+ { .type = NL_RTA_DST, .off = _OUT(rta_dst), .cb = nlattr_get_ip },
+ { .type = NL_RTA_OIF, .off = _OUT(rta_oif), .cb = nlattr_get_ifp },
+ { .type = NL_RTA_GATEWAY, .off = _OUT(rta_gw), .cb = nlattr_get_ip },
+ { .type = NL_RTA_METRICS, .arg = &metrics_parser, .cb = nlattr_get_nested },
+ { .type = NL_RTA_MULTIPATH, .off = _OUT(rta_multipath), .cb = nlattr_get_multipath },
+ { .type = NL_RTA_WEIGHT, .off = _OUT(rta_weight), .cb = nlattr_get_uint32 },
+ { .type = NL_RTA_RTFLAGS, .off = _OUT(rta_rtflags), .cb = nlattr_get_uint32 },
+ { .type = NL_RTA_TABLE, .off = _OUT(rta_table), .cb = nlattr_get_uint32 },
+ { .type = NL_RTA_VIA, .off = _OUT(rta_gw), .cb = nlattr_get_ipvia },
+ { .type = NL_RTA_NH_ID, .off = _OUT(rta_nh_id), .cb = nlattr_get_uint32 },
+};
+
+/* Fixed struct rtmsg header fields copied into struct nl_parsed_route. */
+static const struct nlfield_parser nlf_p_rtmsg[] = {
+ { .off_in = _IN(rtm_family), .off_out = _OUT(rtm_family), .cb = nlf_get_u8 },
+ { .off_in = _IN(rtm_dst_len), .off_out = _OUT(rtm_dst_len), .cb = nlf_get_u8 },
+ { .off_in = _IN(rtm_protocol), .off_out = _OUT(rtm_protocol), .cb = nlf_get_u8 },
+ { .off_in = _IN(rtm_type), .off_out = _OUT(rtm_type), .cb = nlf_get_u8 },
+ { .off_in = _IN(rtm_table), .off_out = _OUT(rtm_table), .cb = nlf_get_u8 },
+ { .off_in = _IN(rtm_flags), .off_out = _OUT(rtm_flags), .cb = nlf_get_u32 },
+};
+#undef _IN
+#undef _OUT
+
+/*
+ * Post-parse hook for a route message: applies the scope of the output
+ * interface to both the destination and the gateway (see set_scope6()).
+ * Always reports success.
+ */
+static bool
+post_p_rtmsg(void *_attrs, struct nl_pstate *npt __unused)
+{
+	struct nl_parsed_route *parsed = _attrs;
+	struct ifnet *oif = parsed->rta_oif;
+
+	set_scope6(parsed->rta_dst, oif);
+	set_scope6(parsed->rta_gw, oif);
+
+	return (true);
+}
+NL_DECLARE_PARSER_EXT(rtm_parser, struct rtmsg, NULL, nlf_p_rtmsg, nla_p_rtmsg, post_p_rtmsg);
+
+/*
+ * State shared between handle_rtm_dump() and the per-route rib_walk()
+ * callback dump_rtentry().
+ */
+struct netlink_walkargs {
+ struct nl_writer *nw; /* writer receiving the dumped routes */
+ struct route_nhop_data rnd; /* scratch, refilled for every route */
+ struct nlmsghdr hdr; /* header template passed to dump_px() */
+ struct nlpcb *nlp; /* requester; its cred gates route export */
+ uint32_t fibnum; /* fib currently being dumped */
+ int family; /* family currently being dumped */
+ int error; /* first dump_px() error; stops further output */
+ int count; /* routes visited in the current table */
+ int dumped; /* routes in the current table that passed the export check */
+ int dumped_tables; /* number of tables walked so far */
+};
+
+/*
+ * rib_walk() callback: dumps @rt into the writer held in the walk state
+ * @_arg. Routes are counted but not dumped once an error has been
+ * recorded or when the requester's credentials do not allow exporting
+ * them. Always returns 0; failures are recorded in wa->error.
+ */
+static int
+dump_rtentry(struct rtentry *rt, void *_arg)
+{
+	struct netlink_walkargs *wa = _arg;
+
+	wa->count++;
+	if (wa->error != 0 || !rt_is_exportable(rt, nlp_get_cred(wa->nlp)))
+		return (0);
+	wa->dumped++;
+
+	rt_get_rnd(rt, &wa->rnd);
+	wa->error = dump_px(wa->fibnum, &wa->hdr, rt, &wa->rnd, wa->nw);
+
+	IF_DEBUG_LEVEL(LOG_DEBUG3) {
+		char rtbuf[INET6_ADDRSTRLEN + 5];
+		FIB_LOG(LOG_DEBUG3, wa->fibnum, wa->family,
+		    "Dump %s, error %d",
+		    rt_print_buf(rt, rtbuf, sizeof(rtbuf)), wa->error);
+	}
+
+	return (0);
+}
+
+/*
+ * Dumps a single routing table (@fibnum, @family) via rib_walk(),
+ * resetting the per-table counters first and bumping the count of
+ * walked tables afterwards.
+ */
+static void
+dump_rtable_one(struct netlink_walkargs *wa, uint32_t fibnum, int family)
+{
+	/* Per-table statistics start from scratch. */
+	wa->dumped = 0;
+	wa->count = 0;
+	FIB_LOG(LOG_DEBUG2, fibnum, family, "Start dump");
+
+	rib_walk(fibnum, family, false, dump_rtentry, wa);
+	wa->dumped_tables += 1;
+
+	FIB_LOG(LOG_DEBUG2, fibnum, family, "End dump, iterated %d dumped %d",
+	    wa->count, wa->dumped);
+}
+
+/*
+ * Dumps the tables of fib @fibnum: the single table for @family, or every
+ * existing per-family table when @family is AF_UNSPEC (stopping at the
+ * first error). Returns the error recorded in the walk state.
+ */
+static int
+dump_rtable_fib(struct netlink_walkargs *wa, uint32_t fibnum, int family)
+{
+	wa->fibnum = fibnum;
+
+	if (family != AF_UNSPEC) {
+		/* Explicit family: dump just that table, if present. */
+		if (rt_tables_get_rnh(fibnum, family) != NULL) {
+			wa->family = family;
+			dump_rtable_one(wa, fibnum, family);
+		}
+		return (wa->error);
+	}
+
+	for (int af = 0; af < AF_MAX; af++) {
+		if (rt_tables_get_rnh(fibnum, af) == NULL)
+			continue;
+		wa->family = af;
+		dump_rtable_one(wa, fibnum, af);
+		if (wa->error != 0)
+			break;
+	}
+
+	return (wa->error);
+}
+
+/*
+ * Handles a non-dump RTM_GETROUTE: looks up a single route for
+ * attrs->rta_dst in the table (attrs->rta_table, attrs->rtm_family) and
+ * writes it to the request's writer as an NL_RTM_NEWROUTE message.
+ *
+ * With RTM_F_PREFIX set the lookup is by prefix and attrs->rtm_dst_len,
+ * otherwise the table's rnh_matchaddr() callback is used on the address.
+ *
+ * Returns EINVAL without RTA_DST, EAFNOSUPPORT when the table does not
+ * exist, ESRCH when no route is found or it is not exportable to the
+ * requester, and 0 otherwise.
+ */
+static int
+handle_rtm_getroute(struct nlpcb *nlp, struct nl_parsed_route *attrs,
+ struct nlmsghdr *hdr, struct nl_pstate *npt)
+{
+ RIB_RLOCK_TRACKER;
+ struct rib_head *rnh;
+ const struct rtentry *rt;
+ struct route_nhop_data rnd;
+ uint32_t fibnum = attrs->rta_table;
+ sa_family_t family = attrs->rtm_family;
+
+ if (attrs->rta_dst == NULL) {
+ NLMSG_REPORT_ERR_MSG(npt, "No RTA_DST supplied");
+ return (EINVAL);
+ }
+
+ rnh = rt_tables_get_rnh(fibnum, family);
+ if (rnh == NULL)
+ return (EAFNOSUPPORT);
+
+ /* The lookup and the nexthop snapshot happen under the rib read lock. */
+ RIB_RLOCK(rnh);
+
+ struct sockaddr *dst = attrs->rta_dst;
+
+ if (attrs->rtm_flags & RTM_F_PREFIX)
+ rt = rib_lookup_prefix_plen(rnh, dst, attrs->rtm_dst_len, &rnd);
+ else
+ rt = (const struct rtentry *)rnh->rnh_matchaddr(dst, &rnh->head);
+ if (rt == NULL) {
+ RIB_RUNLOCK(rnh);
+ return (ESRCH);
+ }
+
+ rt_get_rnd(rt, &rnd);
+ /* Reduce the route's nexthop data to one nexthop (selector argument 0). */
+ rnd.rnd_nhop = nhop_select_func(rnd.rnd_nhop, 0);
+
+ RIB_RUNLOCK(rnh);
+
+ /*
+ * NOTE(review): rt is still used after the lock is dropped; presumably
+ * the caller runs inside the network epoch (dump_px() asserts it).
+ */
+ if (!rt_is_exportable(rt, nlp_get_cred(nlp)))
+ return (ESRCH);
+
+ IF_DEBUG_LEVEL(LOG_DEBUG2) {
+ char rtbuf[NHOP_PRINT_BUFSIZE] __unused, nhbuf[NHOP_PRINT_BUFSIZE] __unused;
+ FIB_LOG(LOG_DEBUG2, fibnum, family, "getroute completed: got %s for %s",
+ nhop_print_buf_any(rnd.rnd_nhop, nhbuf, sizeof(nhbuf)),
+ rt_print_buf(rt, rtbuf, sizeof(rtbuf)));
+ }
+
+ hdr->nlmsg_type = NL_RTM_NEWROUTE;
+ /* NOTE(review): the dump_px() result is discarded; 0 is returned regardless. */
+ dump_px(fibnum, hdr, rt, &rnd, npt->nw);
+
+ return (0);
+}
+
+/*
+ * Handles RTM_GETROUTE with NLM_F_DUMP: dumps every exportable route of
+ * the requested fib/family as NL_RTM_NEWROUTE multipart messages and
+ * terminates the dump.
+ *
+ * When @fibnum is RT_TABLE_UNSPEC every fib in [0, V_rt_numfibs) is
+ * dumped in turn, stopping at the first error. If no table matched the
+ * request at all, ESRCH is reported.
+ *
+ * Returns ENOMEM if the dump terminator cannot be written, otherwise the
+ * first error recorded during the walk (0 on success).
+ */
+static int
+handle_rtm_dump(struct nlpcb *nlp, uint32_t fibnum, int family,
+    struct nlmsghdr *hdr, struct nl_writer *nw)
+{
+	struct netlink_walkargs wa = {
+		.nlp = nlp,
+		.nw = nw,
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_type = NL_RTM_NEWROUTE,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI,
+	};
+
+	if (fibnum == RT_TABLE_UNSPEC) {
+		for (int i = 0; i < V_rt_numfibs; i++) {
+			/* Walk fib i, not the (unspecified) requested fib. */
+			dump_rtable_fib(&wa, i, family);
+			if (wa.error != 0)
+				break;
+		}
+	} else
+		dump_rtable_fib(&wa, fibnum, family);
+
+	if (wa.error == 0 && wa.dumped_tables == 0) {
+		FIB_LOG(LOG_DEBUG, fibnum, family, "incorrect fibnum/family");
+		wa.error = ESRCH;
+		/* How do we propagate it? */
+	}
+
+	if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) {
+		NL_LOG(LOG_DEBUG, "Unable to finalize the dump");
+		return (ENOMEM);
+	}
+
+	return (wa.error);
+}
+
+/*
+ * Completes a partially filled, not yet finalized nexthop @nh for a route
+ * to @dst and hands it to nhop_get_nhop().
+ *
+ * Missing pieces are derived as follows:
+ *  - no gateway: the nexthop becomes a direct one via its interface;
+ *  - gateway but no interface: the interface is looked up with
+ *    ifa_ifwithroute();
+ *  - no source ifa: chosen with ifaof_ifpforaddr() on the interface.
+ *
+ * Returns the nexthop from nhop_get_nhop(), or NULL with *@perror set to
+ * EINVAL when a required piece cannot be determined. In the latter case
+ * @nh is released here, as no caller frees it after a failure.
+ */
+static struct nhop_object *
+finalize_nhop(struct nhop_object *nh, const struct sockaddr *dst, int *perror)
+{
+	/*
+	 * The following MUST be filled:
+	 *  nh_ifp, nh_ifa, nh_gw
+	 */
+	if (nh->gw_sa.sa_family == 0) {
+		/*
+		 * Empty gateway. Can be direct route with RTA_OIF set.
+		 */
+		if (nh->nh_ifp != NULL)
+			nhop_set_direct_gw(nh, nh->nh_ifp);
+		else {
+			NL_LOG(LOG_DEBUG, "empty gateway and interface, skipping");
+			goto fail;
+		}
+		/* Both nh_ifp and gateway are set */
+	} else {
+		/* Gateway is set up, we can derive ifp if not set */
+		if (nh->nh_ifp == NULL) {
+			uint32_t fibnum = nhop_get_fibnum(nh);
+			uint32_t flags = 0;
+
+			if (nh->nh_flags & NHF_GATEWAY)
+				flags = RTF_GATEWAY;
+			else if (nh->nh_flags & NHF_HOST)
+				flags = RTF_HOST;
+
+			struct ifaddr *ifa = ifa_ifwithroute(flags, dst, &nh->gw_sa, fibnum);
+			if (ifa == NULL) {
+				NL_LOG(LOG_DEBUG, "Unable to determine ifp, skipping");
+				goto fail;
+			}
+			nhop_set_transmit_ifp(nh, ifa->ifa_ifp);
+		}
+	}
+	/* Both nh_ifp and gateway are set */
+	if (nh->nh_ifa == NULL) {
+		const struct sockaddr *gw_sa = &nh->gw_sa;
+
+		if (gw_sa->sa_family != dst->sa_family) {
+			/*
+			 * Use dst as the target for determining the default
+			 * preferred ifa IF
+			 * 1) the gateway is link-level (e.g. direct route)
+			 * 2) the gateway family is different (e.g. IPv4 over IPv6).
+			 */
+			gw_sa = dst;
+		}
+
+		struct ifaddr *ifa = ifaof_ifpforaddr(gw_sa, nh->nh_ifp);
+		if (ifa == NULL) {
+			/* Try link-level ifa. */
+			gw_sa = &nh->gw_sa;
+			ifa = ifaof_ifpforaddr(gw_sa, nh->nh_ifp);
+			if (ifa == NULL) {
+				NL_LOG(LOG_DEBUG, "Unable to determine ifa, skipping");
+				goto fail;
+			}
+		}
+		nhop_set_src(nh, ifa);
+	}
+
+	return (nhop_get_nhop(nh, perror));
+
+fail:
+	/* Do not leak the unfinalized nexthop on validation failure. */
+	nhop_free(nh);
+	*perror = EINVAL;
+	return (NULL);
+}
+
+/*
+ * Classifies the request's prefix: NHF_HOST for a full-length prefix
+ * (/32 for AF_INET, /128 for AF_INET6), NHF_DEFAULT for a zero-length
+ * one, 0 for everything else including other families.
+ */
+static int
+get_pxflag(const struct nl_parsed_route *attrs)
+{
+	int host_plen;
+
+	switch (attrs->rtm_family) {
+	case AF_INET:
+		host_plen = 32;
+		break;
+	case AF_INET6:
+		host_plen = 128;
+		break;
+	default:
+		return (0);
+	}
+
+	if (attrs->rtm_dst_len == host_plen)
+		return (NHF_HOST);
+	if (attrs->rtm_dst_len == 0)
+		return (NHF_DEFAULT);
+	return (0);
+}
+
+/*
+ * Translates the netlink request flags NLM_F_{REPLACE,EXCL,CREATE,APPEND}
+ * into the corresponding RTM_F_* rib operation flags.
+ */
+static int
+get_op_flags(int nlm_flags)
+{
+	int op_flags = 0;
+
+	if (nlm_flags & NLM_F_REPLACE)
+		op_flags |= RTM_F_REPLACE;
+	if (nlm_flags & NLM_F_EXCL)
+		op_flags |= RTM_F_EXCL;
+	if (nlm_flags & NLM_F_CREATE)
+		op_flags |= RTM_F_CREATE;
+	if (nlm_flags & NLM_F_APPEND)
+		op_flags |= RTM_F_APPEND;
+
+	return (op_flags);
+}
+
+#ifdef ROUTE_MPATH
+/*
+ * Builds one nexthop of a multipath route from the parsed nexthop @mpnh
+ * and the route-wide attributes @attrs, storing the result in *@pnh.
+ *
+ * Returns EINVAL if the nexthop has no gateway, ENOMEM if allocation
+ * fails, the error from nl_set_nexthop_gw(), or whatever finalize_nhop()
+ * reports (0 on success).
+ */
+static int
+create_nexthop_one(struct nl_parsed_route *attrs, struct rta_mpath_nh *mpnh,
+    struct nl_pstate *npt, struct nhop_object **pnh)
+{
+	struct nhop_object *nhop;
+	int rc;
+
+	if (mpnh->gw == NULL)
+		return (EINVAL);
+
+	nhop = nhop_alloc(attrs->rta_table, attrs->rtm_family);
+	if (nhop == NULL)
+		return (ENOMEM);
+
+	rc = nl_set_nexthop_gw(nhop, mpnh->gw, mpnh->ifp, npt);
+	if (rc != 0) {
+		nhop_free(nhop);
+		return (rc);
+	}
+
+	if (mpnh->ifp != NULL)
+		nhop_set_transmit_ifp(nhop, mpnh->ifp);
+	nhop_set_pxtype_flag(nhop, get_pxflag(attrs));
+	nhop_set_rtflags(nhop, attrs->rta_rtflags);
+	/* Only protocols above RTPROT_STATIC are recorded as the origin. */
+	if (attrs->rtm_protocol > RTPROT_STATIC)
+		nhop_set_origin(nhop, attrs->rtm_protocol);
+
+	*pnh = finalize_nhop(nhop, attrs->rta_dst, &rc);
+
+	return (rc);
+}
+#endif
+
+/*
+ * Creates the nexthop (or, with RTA_MULTIPATH, the nexthop group) described
+ * by the parsed route request @attrs.
+ *
+ * Multipath requests build one nexthop per parsed entry and combine them
+ * into a group; without ROUTE_MPATH they fail with ENOTSUP. Other requests
+ * build a single nexthop from the gateway, output interface, MTU, flags,
+ * protocol and route type, then finalize it.
+ *
+ * Returns the resulting object, or NULL with *@perror set on failure.
+ */
+static struct nhop_object *
+create_nexthop_from_attrs(struct nl_parsed_route *attrs,
+    struct nl_pstate *npt, int *perror)
+{
+	struct nhop_object *nh = NULL;
+	int error = 0;
+
+	if (attrs->rta_multipath != NULL) {
+#ifdef ROUTE_MPATH
+		/* Multipath w/o explicit nexthops */
+		int num_nhops = attrs->rta_multipath->num_nhops;
+		struct weightened_nhop *wn = npt_alloc(npt, sizeof(*wn) * num_nhops);
+
+		/* Parser scratch memory may be exhausted. */
+		if (wn == NULL) {
+			*perror = ENOMEM;
+			return (NULL);
+		}
+
+		for (int i = 0; i < num_nhops; i++) {
+			struct rta_mpath_nh *mpnh = &attrs->rta_multipath->nhops[i];
+
+			error = create_nexthop_one(attrs, mpnh, npt, &wn[i].nh);
+			if (error != 0) {
+				/* Release the nexthops created so far. */
+				for (int j = 0; j < i; j++)
+					nhop_free(wn[j].nh);
+				break;
+			}
+			wn[i].weight = mpnh->rtnh_weight > 0 ? mpnh->rtnh_weight : 1;
+		}
+		if (error == 0) {
+			struct rib_head *rh = nhop_get_rh(wn[0].nh);
+			struct nhgrp_object *nhg;
+
+			nhg = nhgrp_alloc(rh->rib_fibnum, rh->rib_family,
+			    wn, num_nhops, perror);
+			if (nhg != NULL) {
+				if (attrs->rtm_protocol > RTPROT_STATIC)
+					nhgrp_set_origin(nhg, attrs->rtm_protocol);
+				nhg = nhgrp_get_nhgrp(nhg, perror);
+			}
+			/* The per-nexthop references held here are dropped in all cases. */
+			for (int i = 0; i < num_nhops; i++)
+				nhop_free(wn[i].nh);
+			if (nhg != NULL)
+				return ((struct nhop_object *)nhg);
+			error = *perror;
+		}
+#else
+		error = ENOTSUP;
+#endif
+		*perror = error;
+	} else {
+		nh = nhop_alloc(attrs->rta_table, attrs->rtm_family);
+		if (nh == NULL) {
+			*perror = ENOMEM;
+			return (NULL);
+		}
+		if (attrs->rta_gw != NULL) {
+			*perror = nl_set_nexthop_gw(nh, attrs->rta_gw, attrs->rta_oif, npt);
+			if (*perror != 0) {
+				nhop_free(nh);
+				return (NULL);
+			}
+		}
+		if (attrs->rta_oif != NULL)
+			nhop_set_transmit_ifp(nh, attrs->rta_oif);
+		if (attrs->rtax_mtu != 0)
+			nhop_set_mtu(nh, attrs->rtax_mtu, true);
+		if (attrs->rta_rtflags & RTF_BROADCAST)
+			nhop_set_broadcast(nh, true);
+		if (attrs->rtm_protocol > RTPROT_STATIC)
+			nhop_set_origin(nh, attrs->rtm_protocol);
+		nhop_set_pxtype_flag(nh, get_pxflag(attrs));
+		nhop_set_rtflags(nh, attrs->rta_rtflags);
+
+		switch (attrs->rtm_type) {
+		case RTN_UNICAST:
+			break;
+		case RTN_BLACKHOLE:
+			nhop_set_blackhole(nh, RTF_BLACKHOLE);
+			break;
+		case RTN_PROHIBIT:
+		case RTN_UNREACHABLE:
+			nhop_set_blackhole(nh, RTF_REJECT);
+			break;
+		/* TODO: return ENOTSUP for other types if strict option is set */
+		}
+
+		nh = finalize_nhop(nh, attrs->rta_dst, perror);
+	}
+
+	return (nh);
+}
+
+/*
+ * RTM_NEWROUTE handler: parses the request, obtains a nexthop (either an
+ * existing one referenced by RTA_NH_ID or a new one built from the
+ * attributes), adds the route with rib_add_route_px() and, on success,
+ * reports the change via report_operation().
+ *
+ * Returns 0 on success, EINVAL for a missing RTA_DST or an out-of-range
+ * table/family, or the error from parsing, nexthop lookup/creation or the
+ * rib operation.
+ */
+static int
+rtnl_handle_newroute(struct nlmsghdr *hdr, struct nlpcb *nlp,
+ struct nl_pstate *npt)
+{
+ struct rib_cmd_info rc = {};
+ struct nhop_object *nh = NULL;
+ int error;
+
+ struct nl_parsed_route attrs = {};
+ error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs);
+ if (error != 0)
+ return (error);
+
+ /* Check if we have enough data */
+ if (attrs.rta_dst == NULL) {
+ NL_LOG(LOG_DEBUG, "missing RTA_DST");
+ return (EINVAL);
+ }
+
+ /* pre-2.6.19 Linux API compatibility */
+ /* The 8-bit header table is used only when RTA_TABLE is absent or zero. */
+ if (attrs.rtm_table > 0 && attrs.rta_table == 0)
+ attrs.rta_table = attrs.rtm_table;
+ if (attrs.rta_table >= V_rt_numfibs || attrs.rtm_family > AF_MAX) {
+ NLMSG_REPORT_ERR_MSG(npt, "invalid fib");
+ return (EINVAL);
+ }
+
+ if (attrs.rta_nh_id != 0) {
+ /* Referenced uindex */
+ int pxflag = get_pxflag(&attrs);
+ nh = nl_find_nhop(attrs.rta_table, attrs.rtm_family, attrs.rta_nh_id,
+ pxflag, &error);
+ if (error != 0)
+ return (error);
+ } else {
+ nh = create_nexthop_from_attrs(&attrs, npt, &error);
+ if (error != 0) {
+ NL_LOG(LOG_DEBUG, "Error creating nexthop");
+ return (error);
+ }
+ }
+
+ /* A plain nexthop without an explicit weight gets the default weight. */
+ if (!NH_IS_NHGRP(nh) && attrs.rta_weight == 0)
+ attrs.rta_weight = RT_DEFAULT_WEIGHT;
+ struct route_nhop_data rnd = { .rnd_nhop = nh, .rnd_weight = attrs.rta_weight };
+ int op_flags = get_op_flags(hdr->nlmsg_flags);
+
+ error = rib_add_route_px(attrs.rta_table, attrs.rta_dst, attrs.rtm_dst_len,
+ &rnd, op_flags, &rc);
+ if (error == 0)
+ report_operation(attrs.rta_table, &rc, nlp, hdr);
+ return (error);
+}
+
+/*
+ * Path filter for route deletion: a path matches (returns 1) unless the
+ * request names a gateway that rib_match_gw() rejects or an output
+ * interface different from the nexthop's. Unset criteria match anything.
+ */
+static int
+path_match_func(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
+{
+	struct nl_parsed_route *wanted = _data;
+
+	if (wanted->rta_gw != NULL) {
+		if (!rib_match_gw(rt, nh, wanted->rta_gw))
+			return (0);
+	}
+
+	if (wanted->rta_oif != NULL) {
+		if (wanted->rta_oif != nh->nh_ifp)
+			return (0);
+	}
+
+	return (1);
+}
+
+/*
+ * RTM_DELROUTE handler: parses the request and deletes the matching
+ * route/path with rib_del_route_px(), using path_match_func() as the path
+ * filter. RTF_PINNED in the request's route flags forces the removal.
+ * A successful deletion is reported via report_operation().
+ *
+ * Returns 0 on success, ESRCH without RTA_DST, EINVAL for an out-of-range
+ * table/family, or the parse/rib error.
+ */
+static int
+rtnl_handle_delroute(struct nlmsghdr *hdr, struct nlpcb *nlp,
+    struct nl_pstate *npt)
+{
+	struct nl_parsed_route attrs = {};
+	struct rib_cmd_info rc;
+	int error, op_flags;
+
+	error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	if (attrs.rta_dst == NULL) {
+		NLMSG_REPORT_ERR_MSG(npt, "RTA_DST is not set");
+		return (ESRCH);
+	}
+
+	if (attrs.rta_table >= V_rt_numfibs || attrs.rtm_family > AF_MAX) {
+		NLMSG_REPORT_ERR_MSG(npt, "invalid fib");
+		return (EINVAL);
+	}
+
+	op_flags = 0;
+	if (attrs.rta_rtflags & RTF_PINNED)
+		op_flags = RTM_F_FORCE;
+
+	error = rib_del_route_px(attrs.rta_table, attrs.rta_dst,
+	    attrs.rtm_dst_len, path_match_func, &attrs, op_flags, &rc);
+	if (error != 0)
+		return (error);
+
+	report_operation(attrs.rta_table, &rc, nlp, hdr);
+	return (0);
+}
+
+/*
+ * RTM_GETROUTE handler: parses and validates the request, then either
+ * dumps whole tables (any NLM_F_DUMP bit set) or looks up a single route.
+ * Returns the parse error, EINVAL for an out-of-range table/family, or
+ * the result of the selected sub-handler.
+ */
+static int
+rtnl_handle_getroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct nl_parsed_route attrs = {};
+	int error;
+
+	error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	if (attrs.rta_table >= V_rt_numfibs || attrs.rtm_family > AF_MAX) {
+		NLMSG_REPORT_ERR_MSG(npt, "invalid fib");
+		return (EINVAL);
+	}
+
+	if ((hdr->nlmsg_flags & NLM_F_DUMP) == 0)
+		return (handle_rtm_getroute(nlp, &attrs, hdr, npt));
+
+	return (handle_rtm_dump(nlp, attrs.rta_table, attrs.rtm_family, hdr,
+	    npt->nw));
+}
+
+/*
+ * Broadcasts a routing table change @rc in fib @fibnum to the netlink
+ * route group of the affected route's family.
+ *
+ * The message header is built from scratch: flags are derived from the
+ * rib command, the type comes from get_rtmsg_type_from_rtsock(), and the
+ * remaining header fields stay zero. If the group writer cannot be
+ * set up, the event is dropped after a debug log.
+ */
+void
+rtnl_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc)
+{
+ struct nl_writer nw;
+ int family, nlm_flags = 0;
+
+ family = rt_get_family(rc->rc_rt);
+
+ /* XXX: check if there are active listeners first */
+
+ /* TODO: consider passing PID/type/seq */
+ switch (rc->rc_cmd) {
+ case RTM_ADD:
+ nlm_flags = NLM_F_EXCL | NLM_F_CREATE;
+ break;
+ case RTM_CHANGE:
+ nlm_flags = NLM_F_REPLACE;
+ break;
+ case RTM_DELETE:
+ nlm_flags = 0;
+ break;
+ }
+ IF_DEBUG_LEVEL(LOG_DEBUG2) {
+ char rtbuf[NHOP_PRINT_BUFSIZE] __unused;
+ FIB_LOG(LOG_DEBUG2, fibnum, family,
+ "received event %s for %s / nlm_flags=%X",
+ rib_print_cmd(rc->rc_cmd),
+ rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf)),
+ nlm_flags);
+ }
+
+ struct nlmsghdr hdr = {
+ .nlmsg_flags = nlm_flags,
+ .nlmsg_type = get_rtmsg_type_from_rtsock(rc->rc_cmd),
+ };
+
+ struct route_nhop_data rnd = {
+ .rnd_nhop = rc_get_nhop(rc),
+ .rnd_weight = rc->rc_nh_weight,
+ };
+
+ uint32_t group_id = family_to_group(family);
+ if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id, 0,
+ false)) {
+ NL_LOG(LOG_DEBUG, "error allocating event buffer");
+ return;
+ }
+
+ /* The dump_px() result is not checked; the writer is flushed either way. */
+ dump_px(fibnum, &hdr, rc->rc_rt, &rnd, &nw);
+ nlmsg_flush(&nw);
+}
+
+/*
+ * Route message handlers registered by rtnl_routes_init(). All three
+ * carry RTNL_F_ALLOW_NONVNET_JAIL; the modifying commands (DELROUTE,
+ * NEWROUTE) additionally name PRIV_NET_ROUTE, while GETROUTE sets no
+ * .priv.
+ */
+static const struct rtnl_cmd_handler cmd_handlers[] = {
+ {
+ .cmd = NL_RTM_GETROUTE,
+ .name = "RTM_GETROUTE",
+ .cb = &rtnl_handle_getroute,
+ .flags = RTNL_F_ALLOW_NONVNET_JAIL,
+ },
+ {
+ .cmd = NL_RTM_DELROUTE,
+ .name = "RTM_DELROUTE",
+ .cb = &rtnl_handle_delroute,
+ .priv = PRIV_NET_ROUTE,
+ .flags = RTNL_F_ALLOW_NONVNET_JAIL,
+ },
+ {
+ .cmd = NL_RTM_NEWROUTE,
+ .name = "RTM_NEWROUTE",
+ .cb = &rtnl_handle_newroute,
+ .priv = PRIV_NET_ROUTE,
+ .flags = RTNL_F_ALLOW_NONVNET_JAIL,
+ }
+};
+
+/* Every parser declared in this file; checked by NL_VERIFY_PARSERS() at init. */
+static const struct nlhdr_parser *all_parsers[] = {&mpath_parser, &metrics_parser, &rtm_parser};
+
+/*
+ * Module initialization for route messages: runs NL_VERIFY_PARSERS() over
+ * this file's parsers and registers the RTM_*ROUTE command handlers.
+ */
+void
+rtnl_routes_init(void)
+{
+ NL_VERIFY_PARSERS(all_parsers);
+ rtnl_register_messages(cmd_handlers, nitems(cmd_handlers));
+}