diff options
Diffstat (limited to 'sys/netlink')
39 files changed, 13614 insertions, 0 deletions
diff --git a/sys/netlink/ktest_netlink_message_writer.c b/sys/netlink/ktest_netlink_message_writer.c new file mode 100644 index 000000000000..805f52197f69 --- /dev/null +++ b/sys/netlink/ktest_netlink_message_writer.c @@ -0,0 +1,113 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <tests/ktest.h> +#include <sys/cdefs.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <netlink/netlink.h> +#include <netlink/netlink_ctl.h> +#include <netlink/netlink_var.h> +#include <netlink/netlink_message_writer.h> + +#define KTEST_CALLER +#include <netlink/ktest_netlink_message_writer.h> + +#ifdef INVARIANTS + +struct test_nlbuf_attrs { + uint32_t size; + uint32_t expected_avail; + int waitok; +}; + +#define _OUT(_field) offsetof(struct test_nlbuf_attrs, _field) +static const struct nlattr_parser nla_p_nlbuf_w[] = { + { .type = 1, .off = _OUT(size), .cb = nlattr_get_uint32 }, + { .type = 2, .off = _OUT(expected_avail), .cb = nlattr_get_uint32 }, + { .type = 3, .off = _OUT(waitok), .cb = nlattr_get_uint32 }, +}; +#undef _OUT +NL_DECLARE_ATTR_PARSER(nlbuf_w_parser, nla_p_nlbuf_w); + +static int +test_nlbuf_parser(struct ktest_test_context *ctx, struct nlattr *nla) +{ + struct test_nlbuf_attrs *attrs = npt_alloc(ctx->npt, sizeof(*attrs)); + + ctx->arg = attrs; + if (attrs != NULL) + return (nl_parse_nested(nla, &nlbuf_w_parser, ctx->npt, attrs)); + return (ENOMEM); +} + +static int +test_nlbuf_writer_allocation(struct ktest_test_context *ctx) +{ + struct test_nlbuf_attrs *attrs = ctx->arg; + struct nl_writer nw = {}; + u_int alloc_len; + bool ret; + + ret = nlmsg_get_buf_wrapper(&nw, attrs->size, attrs->waitok); + if (!ret) + return (EINVAL); + + alloc_len = nw.buf->buflen; + KTEST_LOG(ctx, "requested %u, allocated %d", attrs->size, alloc_len); + + /* Mark enomem to avoid reallocation */ + nw.enomem = true; + + if (nlmsg_reserve_data(&nw, alloc_len, void *) == NULL) { + KTEST_LOG(ctx, "unable to get %d bytes from the writer", alloc_len); + return (EINVAL); + } + + nl_buf_free(nw.buf); + + if (alloc_len < attrs->expected_avail) { + KTEST_LOG(ctx, "alloc_len %d, expected %u", + alloc_len, attrs->expected_avail); + return (EINVAL); + } + + return (0); +} +#endif + +static const struct ktest_test_info tests[] = { +#ifdef INVARIANTS + { + 
.name = "test_nlbuf_writer_allocation", + .desc = "test different buffer sizes in the netlink writer", + .func = &test_nlbuf_writer_allocation, + .parse = &test_nlbuf_parser, + }, +#endif +}; +KTEST_MODULE_DECLARE(ktest_netlink_message_writer, tests); diff --git a/sys/netlink/ktest_netlink_message_writer.h b/sys/netlink/ktest_netlink_message_writer.h new file mode 100644 index 000000000000..447593e0e700 --- /dev/null +++ b/sys/netlink/ktest_netlink_message_writer.h @@ -0,0 +1,46 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _NETLINK_KTEST_NETLINK_MESSAGE_WRITER_H_ +#define _NETLINK_KTEST_NETLINK_MESSAGE_WRITER_H_ + +#if defined(_KERNEL) && defined(INVARIANTS) + +bool nlmsg_get_buf_wrapper(struct nl_writer *nw, size_t size, bool waitok); + +#ifndef KTEST_CALLER + +bool +nlmsg_get_buf_wrapper(struct nl_writer *nw, size_t size, bool waitok) +{ + return (nlmsg_get_buf(nw, size, waitok)); +} +#endif + +#endif + +#endif diff --git a/sys/netlink/netlink.h b/sys/netlink/netlink.h new file mode 100644 index 000000000000..2395726e7455 --- /dev/null +++ b/sys/netlink/netlink.h @@ -0,0 +1,263 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (C) The Internet Society (2003). All Rights Reserved. + * + * This document and translations of it may be copied and furnished to + * others, and derivative works that comment on or otherwise explain it + * or assist in its implementation may be prepared, copied, published + * and distributed, in whole or in part, without restriction of any + * kind, provided that the above copyright notice and this paragraph are + * included on all such copies and derivative works. However, this + * document itself may not be modified in any way, such as by removing + * the copyright notice or references to the Internet Society or other + * Internet organizations, except as needed for the purpose of + * developing Internet standards in which case the procedures for + * copyrights defined in the Internet Standards process must be + * followed, or as required to translate it into languages other than + * English. + * + * The limited permissions granted above are perpetual and will not be + * revoked by the Internet Society or its successors or assignees. 
+ * + * This document and the information contained herein is provided on an + * "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING + * TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION + * HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + + */ + +/* + * This file contains structures and constants for RFC 3549 (Netlink) + * protocol. Some values have been taken from Linux implementation. + */ + +#ifndef _NETLINK_NETLINK_H_ +#define _NETLINK_NETLINK_H_ + +#include <sys/param.h> +#include <sys/socket.h> + +struct sockaddr_nl { + uint8_t nl_len; /* sizeof(sockaddr_nl) */ + sa_family_t nl_family; /* netlink family */ + uint16_t nl_pad; /* reserved, set to 0 */ + uint32_t nl_pid; /* desired port ID, 0 for auto-select */ + uint32_t nl_groups; /* multicast groups mask to bind to */ +}; + +#define SOL_NETLINK 270 + +/* Netlink socket options */ +#define NETLINK_ADD_MEMBERSHIP 1 /* Subscribe for the specified group notifications */ +#define NETLINK_DROP_MEMBERSHIP 2 /* Unsubscribe from the specified group */ +#define NETLINK_PKTINFO 3 /* XXX: not supported */ +#define NETLINK_BROADCAST_ERROR 4 /* XXX: not supported */ +#define NETLINK_NO_ENOBUFS 5 /* XXX: not supported */ +#define NETLINK_RX_RING 6 /* XXX: not supported */ +#define NETLINK_TX_RING 7 /* XXX: not supported */ +#define NETLINK_LISTEN_ALL_NSID 8 /* XXX: not supported */ + +#define NETLINK_LIST_MEMBERSHIPS 9 +#define NETLINK_CAP_ACK 10 /* Send only original message header in the reply */ +#define NETLINK_EXT_ACK 11 /* Ack support for receiving additional TLVs in ack */ +#define NETLINK_GET_STRICT_CHK 12 /* Strict header checking */ + +#define NETLINK_MSG_INFO 257 /* (FreeBSD-specific) Receive message originator data in cmsg */ + +/* + * RFC 3549, 2.3.2 Netlink Message Header + */ +struct nlmsghdr { + uint32_t nlmsg_len; /* Length of 
message including header */ + uint16_t nlmsg_type; /* Message type identifier */ + uint16_t nlmsg_flags; /* Flags (NLM_F_) */ + uint32_t nlmsg_seq; /* Sequence number */ + uint32_t nlmsg_pid; /* Sending process port ID */ +}; + +/* + * RFC 3549, 2.3.2 standard flag bits (nlmsg_flags) + */ +#define NLM_F_REQUEST 0x01 /* Indicateds request to kernel */ +#define NLM_F_MULTI 0x02 /* Message is part of a group terminated by NLMSG_DONE msg */ +#define NLM_F_ACK 0x04 /* Reply with ack message containing resulting error code */ +#define NLM_F_ECHO 0x08 /* (not supported) Echo this request back */ +#define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */ +#define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */ + +/* + * RFC 3549, 2.3.2 Additional flag bits for GET requests + */ +#define NLM_F_ROOT 0x100 /* Return the complete table */ +#define NLM_F_MATCH 0x200 /* Return all entries matching criteria */ +#define NLM_F_ATOMIC 0x400 /* Return an atomic snapshot (ignored) */ +#define NLM_F_DUMP (NLM_F_ROOT | NLM_F_MATCH) + +/* + * RFC 3549, 2.3.2 Additional flag bits for NEW requests + */ +#define NLM_F_REPLACE 0x100 /* Replace existing matching config object */ +#define NLM_F_EXCL 0x200 /* Don't replace the object if exists */ +#define NLM_F_CREATE 0x400 /* Create if it does not exist */ +#define NLM_F_APPEND 0x800 /* Add to end of list */ + +/* Modifiers to DELETE requests */ +#define NLM_F_NONREC 0x100 /* Do not delete recursively */ + +/* Flags for ACK message */ +#define NLM_F_CAPPED 0x100 /* request was capped */ +#define NLM_F_ACK_TLVS 0x200 /* extended ACK TVLs were included */ + +/* + * RFC 3549, 2.3.2 standard message types (nlmsg_type). + */ +#define NLMSG_NOOP 0x1 /* Message is ignored. */ +#define NLMSG_ERROR 0x2 /* reply error code reporting */ +#define NLMSG_DONE 0x3 /* Message terminates a multipart message. 
*/ +#define NLMSG_OVERRUN 0x4 /* overrun detected, data is lost */ + +#define NLMSG_MIN_TYPE 0x10 /* < 0x10: reserved control messages */ + +/* + * Defition of numbers assigned to the netlink subsystems. + */ +#define NETLINK_ROUTE 0 /* Routing/device hook */ +#define NETLINK_UNUSED 1 /* not supported */ +#define NETLINK_USERSOCK 2 /* not supported */ +#define NETLINK_FIREWALL 3 /* not supported */ +#define NETLINK_SOCK_DIAG 4 /* not supported */ +#define NETLINK_NFLOG 5 /* not supported */ +#define NETLINK_XFRM 6 /* (not supported) PF_SETKEY */ +#define NETLINK_SELINUX 7 /* not supported */ +#define NETLINK_ISCSI 8 /* not supported */ +#define NETLINK_AUDIT 9 /* not supported */ +#define NETLINK_FIB_LOOKUP 10 /* not supported */ +#define NETLINK_CONNECTOR 11 /* not supported */ +#define NETLINK_NETFILTER 12 /* not supported */ +#define NETLINK_IP6_FW 13 /* not supported */ +#define NETLINK_DNRTMSG 14 /* not supported */ +#define NETLINK_KOBJECT_UEVENT 15 /* not supported */ +#define NETLINK_GENERIC 16 /* Generic netlink (dynamic families) */ + +/* + * RFC 3549, 2.3.2.2 The ACK Netlink Message + */ +struct nlmsgerr { + int error; + struct nlmsghdr msg; +}; + +enum nlmsgerr_attrs { + NLMSGERR_ATTR_UNUSED, + NLMSGERR_ATTR_MSG = 1, /* string, error message */ + NLMSGERR_ATTR_OFFS = 2, /* u32, offset of the invalid attr from nl header */ + NLMSGERR_ATTR_COOKIE = 3, /* binary, data to pass to userland */ + NLMSGERR_ATTR_POLICY = 4, /* not supported */ + __NLMSGERR_ATTR_MAX, + NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 +}; + +/* FreeBSD-specific debugging info */ + +enum nlmsginfo_attrs { + NLMSGINFO_ATTR_UNUSED, + NLMSGINFO_ATTR_PROCESS_ID = 1, /* u32, source process PID */ + NLMSGINFO_ATTR_PORT_ID = 2, /* u32, source socket nl_pid */ + NLMSGINFO_ATTR_SEQ_ID = 3, /* u32, source message seq_id */ +}; + + +#define NL_ITEM_ALIGN_SIZE sizeof(uint32_t) +#define NL_ITEM_ALIGN(_len) __align_up(_len, NL_ITEM_ALIGN_SIZE) +#define NL_ITEM_DATA(_ptr, _off) ((void *)((char 
*)(_ptr) + _off)) +#define NL_ITEM_DATA_CONST(_ptr, _off) ((const void *)((const char *)(_ptr) + _off)) + +#define NL_ITEM_OK(_ptr, _len, _hlen, _LEN_M) \ + ((_len) >= _hlen && _LEN_M(_ptr) >= _hlen && _LEN_M(_ptr) <= (_len)) +#define NL_ITEM_NEXT(_ptr, _LEN_M) ((__typeof(_ptr))((char *)(_ptr) + _LEN_M(_ptr))) +#define NL_ITEM_ITER(_ptr, _len, _LEN_MACRO) \ + ((_len) -= _LEN_MACRO(_ptr), NL_ITEM_NEXT(_ptr, _LEN_MACRO)) + +/* part of netlink(3) API */ +#define NLMSG_ALIGNTO NL_ITEM_ALIGN_SIZE +#define NLMSG_ALIGN(_len) NL_ITEM_ALIGN(_len) + +#ifndef _KERNEL +/* part of netlink(3) API */ +#define NLMSG_HDRLEN (sizeof(struct nlmsghdr)) +#define NLMSG_LENGTH(_len) ((_len) + NLMSG_HDRLEN) +#define NLMSG_SPACE(_len) NLMSG_ALIGN(NLMSG_LENGTH(_len)) +#define NLMSG_DATA(_hdr) NL_ITEM_DATA(_hdr, NLMSG_HDRLEN) +#define _NLMSG_LEN(_hdr) ((_hdr)->nlmsg_len) +#define _NLMSG_ALIGNED_LEN(_hdr) NLMSG_ALIGN(_NLMSG_LEN(_hdr)) +#define NLMSG_OK(_hdr, _len) NL_ITEM_OK(_hdr, _len, NLMSG_HDRLEN, _NLMSG_LEN) +#define NLMSG_PAYLOAD(_hdr,_len) (_NLMSG_LEN(_hdr) - NLMSG_SPACE((_len))) +#define NLMSG_NEXT(_hdr, _len) NL_ITEM_ITER(_hdr, _len, _NLMSG_ALIGNED_LEN) + +#else +#define NLMSG_HDRLEN (NLMSG_ALIGN(sizeof(struct nlmsghdr))) +#endif + +/* + * Base netlink attribute TLV header. + */ +struct nlattr { + uint16_t nla_len; /* Total attribute length */ + uint16_t nla_type; /* Attribute type */ +}; + +/* + * + * nl_type field enconding: + * + * 0 1 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |N|O| Attribute type | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * N - attribute contains other attributes (mostly unused) + * O - encoded in network byte order (mostly unused) + * Note: N & O are mutually exclusive + * + * Note: attribute type value scope normally is either parent attribute + * or the message/message group. 
+ */ + +#define NLA_F_NESTED (1 << 15) +#define NLA_F_NET_BYTEORDER (1 << 14) +#define NLA_TYPE_MASK ~(NLA_F_NESTED | NLA_F_NET_BYTEORDER) + +#ifndef _KERNEL +#define NLA_ALIGNTO NL_ITEM_ALIGN_SIZE +#define NLA_ALIGN(_len) NL_ITEM_ALIGN(_len) +#define NLA_HDRLEN ((int)sizeof(struct nlattr)) +#endif + +#endif diff --git a/sys/netlink/netlink_bitset.h b/sys/netlink/netlink_bitset.h new file mode 100644 index 000000000000..9a918bd20997 --- /dev/null +++ b/sys/netlink/netlink_bitset.h @@ -0,0 +1,57 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * Generic netlink message header and attributes + */ +#ifndef _NETLINK_NETLINK_BITSET_H_ +#define _NETLINK_NETLINK_BITSET_H_ + +#include <netlink/netlink.h> + +/* Bitset type nested attributes */ +enum { + NLA_BITSET_UNSPEC, + NLA_BITSET_NOMASK = 1, /* flag: mask of valid bits not provided */ + NLA_BITSET_SIZE = 2, /* u32: max valid bit # */ + NLA_BITSET_BITS = 3, /* nested: array of NLA_BITSET_BIT */ + NLA_BITSET_VALUE = 4, /* binary: array of bit values */ + NLA_BITSET_MASK = 5, /* binary: array of valid bits */ + __NLA_BITSET_MAX, +}; +#define NLA_BITSET_MAX (__NLA_BITSET_MAX - 1) + +enum { + NLA_BITSET_BIT_UNSPEC, + NLA_BITSET_BIT_INDEX = 1, /* u32: index of the bit */ + NLA_BITSET_BIT_NAME = 2, /* string: bit description */ + NLA_BITSET_BIT_VALUE = 3, /* flag: provided if bit is set */ + __NLA_BITSET_BIT_MAX, +}; +#define NLA_BITSET_BIT_MAX (__NLA_BITSET_BIT_MAX - 1) + +#endif diff --git a/sys/netlink/netlink_ctl.h b/sys/netlink/netlink_ctl.h new file mode 100644 index 000000000000..7f43e0f2c25e --- /dev/null +++ b/sys/netlink/netlink_ctl.h @@ -0,0 +1,124 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _NETLINK_NETLINK_CTL_H_ +#define _NETLINK_NETLINK_CTL_H_ + +#ifdef _KERNEL +/* + * This file provides headers for the public KPI of the netlink + * subsystem + */ +#include <sys/_eventhandler.h> + +MALLOC_DECLARE(M_NETLINK); + +/* + * Macro for handling attribute TLVs + */ +#define _roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) + +#define NETLINK_ALIGN_SIZE sizeof(uint32_t) +#define NETLINK_ALIGN(_len) _roundup2(_len, NETLINK_ALIGN_SIZE) + +#define NLA_ALIGN_SIZE sizeof(uint32_t) +#define NLA_ALIGN(_len) _roundup2(_len, NLA_ALIGN_SIZE) +#define NLA_HDRLEN ((uint16_t)sizeof(struct nlattr)) +#define NLA_DATA_LEN(_nla) ((_nla)->nla_len - NLA_HDRLEN) +#define NLA_DATA(_nla) NL_ITEM_DATA(_nla, NLA_HDRLEN) +#define NLA_DATA_CONST(_nla) NL_ITEM_DATA_CONST(_nla, NLA_HDRLEN) +#define NLA_TYPE(_nla) ((_nla)->nla_type & 0x3FFF) + +#ifndef typeof +#define typeof __typeof +#endif + +#define NLA_NEXT(_attr) (struct nlattr *)((char *)_attr + NLA_ALIGN(_attr->nla_len)) +#define _NLA_END(_start, _len) ((char *)(_start) + (_len)) +#define NLA_FOREACH(_attr, _start, _len) \ + for (typeof(_attr) _end = (typeof(_attr))_NLA_END(_start, _len), _attr = (_start); \ + ((char *)_attr < (char *)_end) && \ + ((char *)NLA_NEXT(_attr) <= (char *)_end); \ + _attr = (_len -= NLA_ALIGN(_attr->nla_len), NLA_NEXT(_attr))) + +#include <netlink/netlink_message_writer.h> +#include <netlink/netlink_message_parser.h> + + +/* Protocol handlers */ +struct nl_pstate; 
+typedef int (*nl_handler_f)(struct nlmsghdr *hdr, struct nl_pstate *npt); + +bool netlink_register_proto(int proto, const char *proto_name, nl_handler_f handler); +bool netlink_unregister_proto(int proto); + +/* Common helpers */ +bool nlp_has_priv(struct nlpcb *nlp, int priv); +struct ucred *nlp_get_cred(struct nlpcb *nlp); +uint32_t nlp_get_pid(const struct nlpcb *nlp); +bool nlp_unconstrained_vnet(const struct nlpcb *nlp); + +/* netlink_generic.c */ +struct genl_cmd { + const char *cmd_name; + nl_handler_f cmd_cb; + uint32_t cmd_flags; + uint32_t cmd_priv; + uint32_t cmd_num; +}; + +uint16_t genl_register_family(const char *family_name, size_t hdrsize, + uint16_t family_version, uint16_t max_attr_idx); +void genl_unregister_family(uint16_t family); +bool genl_register_cmds(uint16_t family, const struct genl_cmd *cmds, + u_int count); +uint32_t genl_register_group(uint16_t family, const char *group_name); +void genl_unregister_group(uint16_t family, uint32_t group); + +typedef void (*genl_family_event_handler_t)(void *arg, const char *family_name, + uint16_t family_id, u_int action); +EVENTHANDLER_DECLARE(genl_family_event, genl_family_event_handler_t); + +struct thread; +#if defined(NETLINK) || defined(NETLINK_MODULE) +/* Provide optimized calls to the functions inside the same linking unit */ +struct nlpcb *_nl_get_thread_nlp(struct thread *td); + +static inline struct nlpcb * +nl_get_thread_nlp(struct thread *td) +{ + return (_nl_get_thread_nlp(td)); +} + +#else +/* Provide access to the functions via netlink_glue.c */ +struct nlpcb *nl_get_thread_nlp(struct thread *td); + +#endif /* defined(NETLINK) || defined(NETLINK_MODULE) */ + +#endif +#endif diff --git a/sys/netlink/netlink_debug.h b/sys/netlink/netlink_debug.h new file mode 100644 index 000000000000..db987b26b6d7 --- /dev/null +++ b/sys/netlink/netlink_debug.h @@ -0,0 +1,85 @@ +/*- + * Copyright (c) 2022 Alexander V. 
Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _NETLINK_NETLINK_DEBUG_H_ +#define _NETLINK_NETLINK_DEBUG_H_ + +#ifdef _KERNEL + +#define _DEBUG_SYSCTL_OID _net_netlink_debug +#include <net/route/route_debug.h> + +SYSCTL_DECL(_net_netlink_debug); + +/* + * Generic debug + * [nl_domain] func_name: debug text + */ +#define NL_LOG RT_LOG + +/* + * Logging for events specific for particular process + * Example: [nl_domain] PID 4834 fdump_sa: unsupported family: 45 + */ +#define NL_RAW_PID_LOG(_l, _pid, _fmt, ...) \ + NL_RAW_PID_LOG_##_l(_l, _pid, _fmt, ## __VA_ARGS__) +#define _NL_RAW_PID_LOG(_l, _pid, _fmt, ...) \ + if (_DEBUG_PASS_MSG(_l)) { \ + _output("[" DEBUG_PREFIX_NAME "] PID %u %s: " _fmt "\n", _pid, \ + __func__, ##__VA_ARGS__); \ + } + +#define NLP_LOG(_l, _nlp, _fmt, ...) \ + NL_RAW_PID_LOG_##_l(_l, nlp_get_pid(_nlp), _fmt, ## __VA_ARGS__) + +#if DEBUG_MAX_LEVEL>=LOG_DEBUG3 +#define NL_RAW_PID_LOG_LOG_DEBUG3 _NL_RAW_PID_LOG +#else +#define NL_RAW_PID_LOG_LOG_DEBUG3(_l, _pid, _fmt, ...) +#endif +#if DEBUG_MAX_LEVEL>=LOG_DEBUG2 +#define NL_RAW_PID_LOG_LOG_DEBUG2 _NL_RAW_PID_LOG +#else +#define NL_RAW_PID_LOG_LOG_DEBUG2(_l, _pid, _fmt, ...) +#endif +#if DEBUG_MAX_LEVEL>=LOG_DEBUG +#define NL_RAW_PID_LOG_LOG_DEBUG _NL_RAW_PID_LOG +#else +#define NL_RAW_PID_LOG_LOG_DEBUG(_l, _pid, _fmt, ...) +#endif +#if DEBUG_MAX_LEVEL>=LOG_INFO +#define NL_RAW_PID_LOG_LOG_INFO _NL_RAW_PID_LOG +#else +#define NL_RAW_PID_LOG_LOG_INFO(_l, _pid, _fmt, ...) +#endif +#define NL_RAW_PID_LOG_LOG_NOTICE _NL_RAW_PID_LOG +#define NL_RAW_PID_LOG_LOG_ERR _NL_RAW_PID_LOG +#define NL_RAW_PID_LOG_LOG_WARNING _NL_RAW_PID_LOG + +#endif /* _KERNEL */ +#endif /* !_NETLINK_NETLINK_DEBUG_H_ */ diff --git a/sys/netlink/netlink_domain.c b/sys/netlink/netlink_domain.c new file mode 100644 index 000000000000..74b46114716e --- /dev/null +++ b/sys/netlink/netlink_domain.c @@ -0,0 +1,1002 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. 
Chernikov <melifaro@FreeBSD.org> + * Copyright (c) 2023 Gleb Smirnoff <glebius@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This file contains socket and protocol bindings for netlink. 
+ */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/rmlock.h> +#include <sys/domain.h> +#include <sys/jail.h> +#include <sys/mbuf.h> +#include <sys/osd.h> +#include <sys/protosw.h> +#include <sys/proc.h> +#include <sys/ck.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysent.h> +#include <sys/syslog.h> +#include <sys/priv.h> +#include <sys/uio.h> + +#include <netlink/netlink.h> +#include <netlink/netlink_ctl.h> +#include <netlink/netlink_var.h> + +#define DEBUG_MOD_NAME nl_domain +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include <netlink/netlink_debug.h> +_DECLARE_DEBUG(LOG_INFO); + +_Static_assert((NLP_MAX_GROUPS % 64) == 0, + "NLP_MAX_GROUPS has to be multiple of 64"); +_Static_assert(NLP_MAX_GROUPS >= 64, + "NLP_MAX_GROUPS has to be at least 64"); + +#define NLCTL_TRACKER struct rm_priotracker nl_tracker +#define NLCTL_RLOCK() rm_rlock(&V_nl_ctl.ctl_lock, &nl_tracker) +#define NLCTL_RUNLOCK() rm_runlock(&V_nl_ctl.ctl_lock, &nl_tracker) +#define NLCTL_LOCK_ASSERT() rm_assert(&V_nl_ctl.ctl_lock, RA_LOCKED) + +#define NLCTL_WLOCK() rm_wlock(&V_nl_ctl.ctl_lock) +#define NLCTL_WUNLOCK() rm_wunlock(&V_nl_ctl.ctl_lock) +#define NLCTL_WLOCK_ASSERT() rm_assert(&V_nl_ctl.ctl_lock, RA_WLOCKED) + +static u_long nl_sendspace = NLSNDQ; +SYSCTL_ULONG(_net_netlink, OID_AUTO, sendspace, CTLFLAG_RW, &nl_sendspace, 0, + "Default netlink socket send space"); + +static u_long nl_recvspace = NLSNDQ; +SYSCTL_ULONG(_net_netlink, OID_AUTO, recvspace, CTLFLAG_RW, &nl_recvspace, 0, + "Default netlink socket receive space"); + +extern u_long sb_max_adj; +static u_long nl_maxsockbuf = 512 * 1024 * 1024; /* 512M, XXX: init based on physmem */ +static int sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS); +SYSCTL_OID(_net_netlink, OID_AUTO, nl_maxsockbuf, + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, &nl_maxsockbuf, 0, + sysctl_handle_nl_maxsockbuf, "LU", + "Maximum Netlink socket buffer size"); + + +static 
unsigned int osd_slot_id = 0; + +void +nl_osd_register(void) +{ + osd_slot_id = osd_register(OSD_THREAD, NULL, NULL); +} + +void +nl_osd_unregister(void) +{ + osd_deregister(OSD_THREAD, osd_slot_id); +} + +struct nlpcb * +_nl_get_thread_nlp(struct thread *td) +{ + return (osd_get(OSD_THREAD, &td->td_osd, osd_slot_id)); +} + +void +nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp) +{ + NLP_LOG(LOG_DEBUG2, nlp, "Set thread %p nlp to %p (slot %u)", td, nlp, osd_slot_id); + if (osd_set(OSD_THREAD, &td->td_osd, osd_slot_id, nlp) == 0) + return; + /* Failed, need to realloc */ + void **rsv = osd_reserve(osd_slot_id); + osd_set_reserved(OSD_THREAD, &td->td_osd, osd_slot_id, rsv, nlp); +} + +/* + * Looks up a nlpcb struct based on the @portid. Need to claim nlsock_mtx. + * Returns nlpcb pointer if present else NULL + */ +static struct nlpcb * +nl_port_lookup(uint32_t port_id) +{ + struct nlpcb *nlp; + + CK_LIST_FOREACH(nlp, &V_nl_ctl.ctl_port_head, nl_port_next) { + if (nlp->nl_port == port_id) + return (nlp); + } + return (NULL); +} + +static void +nlp_join_group(struct nlpcb *nlp, unsigned int group_id) +{ + MPASS(group_id < NLP_MAX_GROUPS); + NLCTL_WLOCK_ASSERT(); + + /* TODO: add family handler callback */ + if (!nlp_unconstrained_vnet(nlp)) + return; + + BIT_SET(NLP_MAX_GROUPS, group_id, &nlp->nl_groups); +} + +static void +nlp_leave_group(struct nlpcb *nlp, unsigned int group_id) +{ + MPASS(group_id < NLP_MAX_GROUPS); + NLCTL_WLOCK_ASSERT(); + + BIT_CLR(NLP_MAX_GROUPS, group_id, &nlp->nl_groups); +} + +static bool +nlp_memberof_group(struct nlpcb *nlp, unsigned int group_id) +{ + MPASS(group_id < NLP_MAX_GROUPS); + NLCTL_LOCK_ASSERT(); + + return (BIT_ISSET(NLP_MAX_GROUPS, group_id, &nlp->nl_groups)); +} + +static uint32_t +nlp_get_groups_compat(struct nlpcb *nlp) +{ + uint32_t groups_mask = 0; + + NLCTL_LOCK_ASSERT(); + + for (int i = 0; i < 32; i++) { + if (nlp_memberof_group(nlp, i + 1)) + groups_mask |= (1 << i); + } + + return (groups_mask); +} + +static 
struct nl_buf * +nl_buf_copy(struct nl_buf *nb) +{ + struct nl_buf *copy; + + copy = nl_buf_alloc(nb->buflen, M_NOWAIT); + if (__predict_false(copy == NULL)) + return (NULL); + memcpy(copy, nb, sizeof(*nb) + nb->buflen); + + return (copy); +} + +/* + * Broadcasts in the writer's buffer. + */ +bool +nl_send_group(struct nl_writer *nw) +{ + struct nl_buf *nb = nw->buf; + struct nlpcb *nlp_last = NULL; + struct nlpcb *nlp; + NLCTL_TRACKER; + + IF_DEBUG_LEVEL(LOG_DEBUG2) { + struct nlmsghdr *hdr = (struct nlmsghdr *)nb->data; + NL_LOG(LOG_DEBUG2, "MCAST len %u msg type %d len %u to group %d/%d", + nb->datalen, hdr->nlmsg_type, hdr->nlmsg_len, + nw->group.proto, nw->group.id); + } + + nw->buf = NULL; + + NLCTL_RLOCK(); + CK_LIST_FOREACH(nlp, &V_nl_ctl.ctl_pcb_head, nl_next) { + if ((nw->group.priv == 0 || priv_check_cred( + nlp->nl_socket->so_cred, nw->group.priv) == 0) && + nlp->nl_proto == nw->group.proto && + nlp_memberof_group(nlp, nw->group.id)) { + if (nlp_last != NULL) { + struct nl_buf *copy; + + copy = nl_buf_copy(nb); + if (copy != NULL) { + nw->buf = copy; + (void)nl_send(nw, nlp_last); + } else { + NLP_LOCK(nlp_last); + if (nlp_last->nl_socket != NULL) + sorwakeup(nlp_last->nl_socket); + NLP_UNLOCK(nlp_last); + } + } + nlp_last = nlp; + } + } + if (nlp_last != NULL) { + nw->buf = nb; + (void)nl_send(nw, nlp_last); + } else + nl_buf_free(nb); + + NLCTL_RUNLOCK(); + + return (true); +} + +void +nl_clear_group(u_int group) +{ + struct nlpcb *nlp; + + NLCTL_WLOCK(); + CK_LIST_FOREACH(nlp, &V_nl_ctl.ctl_pcb_head, nl_next) + if (nlp_memberof_group(nlp, group)) + nlp_leave_group(nlp, group); + NLCTL_WUNLOCK(); +} + +static uint32_t +nl_find_port(void) +{ + /* + * app can open multiple netlink sockets. + * Start with current pid, if already taken, + * try random numbers in 65k..256k+65k space, + * avoiding clash with pids. 
+ */ + if (nl_port_lookup(curproc->p_pid) == NULL) + return (curproc->p_pid); + for (int i = 0; i < 16; i++) { + uint32_t nl_port = (arc4random() % 65536) + 65536 * 4; + if (nl_port_lookup(nl_port) == 0) + return (nl_port); + NL_LOG(LOG_DEBUG3, "tried %u\n", nl_port); + } + return (curproc->p_pid); +} + +static int +nl_bind_locked(struct nlpcb *nlp, struct sockaddr_nl *snl) +{ + if (nlp->nl_bound) { + if (nlp->nl_port != snl->nl_pid) { + NL_LOG(LOG_DEBUG, + "bind() failed: program pid %d " + "is different from provided pid %d", + nlp->nl_port, snl->nl_pid); + return (EINVAL); // XXX: better error + } + } else { + if (snl->nl_pid == 0) + snl->nl_pid = nl_find_port(); + if (nl_port_lookup(snl->nl_pid) != NULL) + return (EADDRINUSE); + nlp->nl_port = snl->nl_pid; + nlp->nl_bound = true; + CK_LIST_INSERT_HEAD(&V_nl_ctl.ctl_port_head, nlp, nl_port_next); + } + for (int i = 0; i < 32; i++) { + if (snl->nl_groups & ((uint32_t)1 << i)) + nlp_join_group(nlp, i + 1); + else + nlp_leave_group(nlp, i + 1); + } + + return (0); +} + +static int +nl_attach(struct socket *so, int proto, struct thread *td) +{ + struct nlpcb *nlp; + int error; + + if (__predict_false(netlink_unloading != 0)) + return (EAFNOSUPPORT); + + error = nl_verify_proto(proto); + if (error != 0) + return (error); + + bool is_linux = SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX; + NL_LOG(LOG_DEBUG2, "socket %p, %sPID %d: attaching socket to %s", + so, is_linux ? 
"(linux) " : "", curproc->p_pid, + nl_get_proto_name(proto)); + + nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO); + error = soreserve(so, nl_sendspace, nl_recvspace); + if (error != 0) { + free(nlp, M_PCB); + return (error); + } + TAILQ_INIT(&so->so_rcv.nl_queue); + TAILQ_INIT(&so->so_snd.nl_queue); + so->so_pcb = nlp; + nlp->nl_socket = so; + nlp->nl_proto = proto; + nlp->nl_process_id = curproc->p_pid; + nlp->nl_linux = is_linux; + nlp->nl_unconstrained_vnet = !jailed_without_vnet(so->so_cred); + nlp->nl_need_thread_setup = true; + NLP_LOCK_INIT(nlp); + refcount_init(&nlp->nl_refcount, 1); + + nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK, + taskqueue_thread_enqueue, &nlp->nl_taskqueue); + TASK_INIT(&nlp->nl_task, 0, nl_taskqueue_handler, nlp); + taskqueue_start_threads(&nlp->nl_taskqueue, 1, PWAIT, + "netlink_socket (PID %u)", nlp->nl_process_id); + + NLCTL_WLOCK(); + CK_LIST_INSERT_HEAD(&V_nl_ctl.ctl_pcb_head, nlp, nl_next); + NLCTL_WUNLOCK(); + + soisconnected(so); + + return (0); +} + +static int +nl_bind(struct socket *so, struct sockaddr *sa, struct thread *td) +{ + struct nlpcb *nlp = sotonlpcb(so); + struct sockaddr_nl *snl = (struct sockaddr_nl *)sa; + int error; + + NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); + if (snl->nl_len != sizeof(*snl)) { + NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so); + return (EINVAL); + } + + + NLCTL_WLOCK(); + NLP_LOCK(nlp); + error = nl_bind_locked(nlp, snl); + NLP_UNLOCK(nlp); + NLCTL_WUNLOCK(); + NL_LOG(LOG_DEBUG2, "socket %p, bind() to %u, groups %u, error %d", so, + snl->nl_pid, snl->nl_groups, error); + + return (error); +} + + +static int +nl_assign_port(struct nlpcb *nlp, uint32_t port_id) +{ + struct sockaddr_nl snl = { + .nl_pid = port_id, + }; + int error; + + NLCTL_WLOCK(); + NLP_LOCK(nlp); + snl.nl_groups = nlp_get_groups_compat(nlp); + error = nl_bind_locked(nlp, &snl); + NLP_UNLOCK(nlp); + NLCTL_WUNLOCK(); + + NL_LOG(LOG_DEBUG3, "socket %p, 
port assign: %d, error: %d", nlp->nl_socket, port_id, error); + return (error); +} + +/* + * nl_autobind_port binds a unused portid to @nlp + * @nlp: pcb data for the netlink socket + * @candidate_id: first id to consider + */ +static int +nl_autobind_port(struct nlpcb *nlp, uint32_t candidate_id) +{ + uint32_t port_id = candidate_id; + NLCTL_TRACKER; + bool exist; + int error = EADDRINUSE; + + for (int i = 0; i < 10; i++) { + NL_LOG(LOG_DEBUG3, "socket %p, trying to assign port %d", nlp->nl_socket, port_id); + NLCTL_RLOCK(); + exist = nl_port_lookup(port_id) != 0; + NLCTL_RUNLOCK(); + if (!exist) { + error = nl_assign_port(nlp, port_id); + if (error != EADDRINUSE) + break; + } + port_id++; + } + NL_LOG(LOG_DEBUG3, "socket %p, autobind to %d, error: %d", nlp->nl_socket, port_id, error); + return (error); +} + +static int +nl_connect(struct socket *so, struct sockaddr *sa, struct thread *td) +{ + struct sockaddr_nl *snl = (struct sockaddr_nl *)sa; + struct nlpcb *nlp; + + NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); + if (snl->nl_len != sizeof(*snl)) { + NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so); + return (EINVAL); + } + + nlp = sotonlpcb(so); + if (!nlp->nl_bound) { + int error = nl_autobind_port(nlp, td->td_proc->p_pid); + if (error != 0) { + NL_LOG(LOG_DEBUG, "socket %p, nl_autobind() failed: %d", so, error); + return (error); + } + } + /* XXX: Handle socket flags & multicast */ + soisconnected(so); + + NL_LOG(LOG_DEBUG2, "socket %p, connect to %u", so, snl->nl_pid); + + return (0); +} + +static void +destroy_nlpcb_epoch(epoch_context_t ctx) +{ + struct nlpcb *nlp; + + nlp = __containerof(ctx, struct nlpcb, nl_epoch_ctx); + + NLP_LOCK_DESTROY(nlp); + free(nlp, M_PCB); +} + +static void +nl_close(struct socket *so) +{ + MPASS(sotonlpcb(so) != NULL); + struct nlpcb *nlp; + struct nl_buf *nb; + + NL_LOG(LOG_DEBUG2, "detaching socket %p, PID %d", so, curproc->p_pid); + nlp = sotonlpcb(so); + + /* Mark as inactive so no new 
work can be enqueued */ + NLP_LOCK(nlp); + bool was_bound = nlp->nl_bound; + NLP_UNLOCK(nlp); + + /* Wait till all scheduled work has been completed */ + taskqueue_drain_all(nlp->nl_taskqueue); + taskqueue_free(nlp->nl_taskqueue); + + NLCTL_WLOCK(); + NLP_LOCK(nlp); + if (was_bound) { + CK_LIST_REMOVE(nlp, nl_port_next); + NL_LOG(LOG_DEBUG3, "socket %p, unlinking bound pid %u", so, nlp->nl_port); + } + CK_LIST_REMOVE(nlp, nl_next); + nlp->nl_socket = NULL; + NLP_UNLOCK(nlp); + NLCTL_WUNLOCK(); + + so->so_pcb = NULL; + + while ((nb = TAILQ_FIRST(&so->so_snd.nl_queue)) != NULL) { + TAILQ_REMOVE(&so->so_snd.nl_queue, nb, tailq); + nl_buf_free(nb); + } + while ((nb = TAILQ_FIRST(&so->so_rcv.nl_queue)) != NULL) { + TAILQ_REMOVE(&so->so_rcv.nl_queue, nb, tailq); + nl_buf_free(nb); + } + + NL_LOG(LOG_DEBUG3, "socket %p, detached", so); + + /* XXX: is delayed free needed? */ + NET_EPOCH_CALL(destroy_nlpcb_epoch, &nlp->nl_epoch_ctx); +} + +static int +nl_disconnect(struct socket *so) +{ + NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); + MPASS(sotonlpcb(so) != NULL); + return (ENOTCONN); +} + +static int +nl_sockaddr(struct socket *so, struct sockaddr *sa) +{ + + *(struct sockaddr_nl *)sa = (struct sockaddr_nl ){ + /* TODO: set other fields */ + .nl_len = sizeof(struct sockaddr_nl), + .nl_family = AF_NETLINK, + .nl_pid = sotonlpcb(so)->nl_port, + }; + + return (0); +} + +static int +nl_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, + struct mbuf *m, struct mbuf *control, int flags, struct thread *td) +{ + struct nlpcb *nlp = sotonlpcb(so); + struct sockbuf *sb = &so->so_snd; + struct nl_buf *nb; + size_t len; + int error; + + MPASS(m == NULL && uio != NULL); + + if (__predict_false(control != NULL)) { + m_freem(control); + return (EINVAL); + } + + if (__predict_false(flags & MSG_OOB)) /* XXXGL: or just ignore? 
*/ + return (EOPNOTSUPP); + + if (__predict_false(uio->uio_resid < sizeof(struct nlmsghdr))) + return (ENOBUFS); /* XXXGL: any better error? */ + + if (__predict_false(uio->uio_resid > sb->sb_hiwat)) + return (EMSGSIZE); + + error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); + if (error) + return (error); + + len = roundup2(uio->uio_resid, 8) + SCRATCH_BUFFER_SIZE; + if (nlp->nl_linux) + len += roundup2(uio->uio_resid, 8); + nb = nl_buf_alloc(len, M_WAITOK); + nb->datalen = uio->uio_resid; + error = uiomove(&nb->data[0], uio->uio_resid, uio); + if (__predict_false(error)) + goto out; + + NL_LOG(LOG_DEBUG2, "sending message to kernel %u bytes", nb->datalen); + + SOCK_SENDBUF_LOCK(so); +restart: + if (sb->sb_hiwat - sb->sb_ccc >= nb->datalen) { + TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq); + sb->sb_acc += nb->datalen; + sb->sb_ccc += nb->datalen; + nb = NULL; + } else if ((so->so_state & SS_NBIO) || + (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { + SOCK_SENDBUF_UNLOCK(so); + error = EWOULDBLOCK; + goto out; + } else { + if ((error = sbwait(so, SO_SND)) != 0) { + SOCK_SENDBUF_UNLOCK(so); + goto out; + } else + goto restart; + } + SOCK_SENDBUF_UNLOCK(so); + + if (nb == NULL) { + NL_LOG(LOG_DEBUG3, "success"); + NLP_LOCK(nlp); + nl_schedule_taskqueue(nlp); + NLP_UNLOCK(nlp); + } + +out: + SOCK_IO_SEND_UNLOCK(so); + if (nb != NULL) { + NL_LOG(LOG_DEBUG3, "failure, error %d", error); + nl_buf_free(nb); + } + return (error); +} + +/* Create control data for recvmsg(2) on Netlink socket. 
*/ +static struct mbuf * +nl_createcontrol(struct nlpcb *nlp) +{ + struct { + struct nlattr nla; + uint32_t val; + } data[] = { + { + .nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t), + .nla.nla_type = NLMSGINFO_ATTR_PROCESS_ID, + .val = nlp->nl_process_id, + }, + { + .nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t), + .nla.nla_type = NLMSGINFO_ATTR_PORT_ID, + .val = nlp->nl_port, + }, + }; + + return (sbcreatecontrol(data, sizeof(data), NETLINK_MSG_INFO, + SOL_NETLINK, M_WAITOK)); +} + +static int +nl_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, + struct mbuf **mp, struct mbuf **controlp, int *flagsp) +{ + static const struct sockaddr_nl nl_empty_src = { + .nl_len = sizeof(struct sockaddr_nl), + .nl_family = PF_NETLINK, + .nl_pid = 0 /* comes from the kernel */ + }; + struct sockbuf *sb = &so->so_rcv; + struct nlpcb *nlp = sotonlpcb(so); + struct nl_buf *first, *last, *nb, *next; + struct nlmsghdr *hdr; + int flags, error; + u_int len, overflow, partoff, partlen, msgrcv, datalen; + bool nonblock, trunc, peek; + + MPASS(mp == NULL && uio != NULL); + + NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); + + if (psa != NULL) + *psa = sodupsockaddr((const struct sockaddr *)&nl_empty_src, + M_WAITOK); + + if (controlp != NULL && (nlp->nl_flags & NLF_MSG_INFO)) + *controlp = nl_createcontrol(nlp); + + flags = flagsp != NULL ? *flagsp & ~MSG_TRUNC : 0; + trunc = flagsp != NULL ? 
*flagsp & MSG_TRUNC : false; + nonblock = (so->so_state & SS_NBIO) || + (flags & (MSG_DONTWAIT | MSG_NBIO)); + peek = flags & MSG_PEEK; + + error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); + if (__predict_false(error)) + return (error); + + len = 0; + overflow = 0; + msgrcv = 0; + datalen = 0; + + SOCK_RECVBUF_LOCK(so); + while ((first = TAILQ_FIRST(&sb->nl_queue)) == NULL) { + if (nonblock) { + SOCK_RECVBUF_UNLOCK(so); + SOCK_IO_RECV_UNLOCK(so); + return (EWOULDBLOCK); + } + error = sbwait(so, SO_RCV); + if (error) { + SOCK_RECVBUF_UNLOCK(so); + SOCK_IO_RECV_UNLOCK(so); + return (error); + } + } + + /* + * Netlink socket buffer consists of a queue of nl_bufs, but for the + * userland there should be no boundaries. However, there are Netlink + * messages, that shouldn't be split. Internal invariant is that a + * message never spans two nl_bufs. + * If a large userland buffer is provided, we would traverse the queue + * until either queue end is reached or the buffer is fulfilled. If + * an application provides a buffer that isn't able to fit a single + * message, we would truncate it and lose its tail. This is the only + * condition where we would lose data. If buffer is able to fit at + * least one message, we would return it and won't truncate the next. + * + * We use same code for normal and MSG_PEEK case. At first queue pass + * we scan nl_bufs and count lenght. In case we can read entire buffer + * at one write everything is trivial. In case we can not, we save + * pointer to the last (or partial) nl_buf and in the !peek case we + * split the queue into two pieces. We can safely drop the queue lock, + * as kernel would only append nl_bufs to the end of the queue, and + * we are the exclusive owner of queue beginning due to sleepable lock. + * At the second pass we copy data out and in !peek case free nl_bufs. 
+ */ + TAILQ_FOREACH(nb, &sb->nl_queue, tailq) { + u_int offset; + + MPASS(nb->offset < nb->datalen); + offset = nb->offset; + while (offset < nb->datalen) { + hdr = (struct nlmsghdr *)&nb->data[offset]; + MPASS(nb->offset + hdr->nlmsg_len <= nb->datalen); + if (uio->uio_resid < len + hdr->nlmsg_len) { + overflow = len + hdr->nlmsg_len - + uio->uio_resid; + partoff = nb->offset; + if (offset > partoff) { + partlen = offset - partoff; + if (!peek) { + nb->offset = offset; + datalen += partlen; + } + } else if (len == 0 && uio->uio_resid > 0) { + flags |= MSG_TRUNC; + partlen = uio->uio_resid; + if (peek) + goto nospace; + datalen += hdr->nlmsg_len; + if (nb->offset + hdr->nlmsg_len == + nb->datalen) { + /* + * Avoid leaving empty nb. + * Process last nb normally. + * Trust uiomove() to care + * about negative uio_resid. + */ + nb = TAILQ_NEXT(nb, tailq); + overflow = 0; + partlen = 0; + } else + nb->offset += hdr->nlmsg_len; + msgrcv++; + } else + partlen = 0; + goto nospace; + } + len += hdr->nlmsg_len; + offset += hdr->nlmsg_len; + MPASS(offset <= nb->buflen); + msgrcv++; + } + MPASS(offset == nb->datalen); + datalen += nb->datalen - nb->offset; + } +nospace: + last = nb; + if (!peek) { + if (last == NULL) + TAILQ_INIT(&sb->nl_queue); + else { + /* XXXGL: create TAILQ_SPLIT */ + TAILQ_FIRST(&sb->nl_queue) = last; + last->tailq.tqe_prev = &TAILQ_FIRST(&sb->nl_queue); + } + MPASS(sb->sb_acc >= datalen); + sb->sb_acc -= datalen; + sb->sb_ccc -= datalen; + } + SOCK_RECVBUF_UNLOCK(so); + + for (nb = first; nb != last; nb = next) { + next = TAILQ_NEXT(nb, tailq); + if (__predict_true(error == 0)) + error = uiomove(&nb->data[nb->offset], + (int)(nb->datalen - nb->offset), uio); + if (!peek) + nl_buf_free(nb); + } + if (last != NULL && partlen > 0 && __predict_true(error == 0)) + error = uiomove(&nb->data[partoff], (int)partlen, uio); + + if (trunc && overflow > 0) { + uio->uio_resid -= overflow; + MPASS(uio->uio_resid < 0); + } else + MPASS(uio->uio_resid >= 0); + + if 
(uio->uio_td) + uio->uio_td->td_ru.ru_msgrcv += msgrcv; + + if (flagsp != NULL) + *flagsp |= flags; + + SOCK_IO_RECV_UNLOCK(so); + + nl_on_transmit(sotonlpcb(so)); + + return (error); +} + +static int +nl_getoptflag(int sopt_name) +{ + switch (sopt_name) { + case NETLINK_CAP_ACK: + return (NLF_CAP_ACK); + case NETLINK_EXT_ACK: + return (NLF_EXT_ACK); + case NETLINK_GET_STRICT_CHK: + return (NLF_STRICT); + case NETLINK_MSG_INFO: + return (NLF_MSG_INFO); + } + + return (0); +} + +static int +nl_ctloutput(struct socket *so, struct sockopt *sopt) +{ + struct nlpcb *nlp = sotonlpcb(so); + uint32_t flag; + int optval, error = 0; + NLCTL_TRACKER; + + NL_LOG(LOG_DEBUG2, "%ssockopt(%p, %d)", (sopt->sopt_dir) ? "set" : "get", + so, sopt->sopt_name); + + switch (sopt->sopt_dir) { + case SOPT_SET: + switch (sopt->sopt_name) { + case NETLINK_ADD_MEMBERSHIP: + case NETLINK_DROP_MEMBERSHIP: + error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); + if (error != 0) + break; + if (optval <= 0 || optval >= NLP_MAX_GROUPS) { + error = ERANGE; + break; + } + NL_LOG(LOG_DEBUG2, "ADD/DEL group %d", (uint32_t)optval); + + NLCTL_WLOCK(); + if (sopt->sopt_name == NETLINK_ADD_MEMBERSHIP) + nlp_join_group(nlp, optval); + else + nlp_leave_group(nlp, optval); + NLCTL_WUNLOCK(); + break; + case NETLINK_CAP_ACK: + case NETLINK_EXT_ACK: + case NETLINK_GET_STRICT_CHK: + case NETLINK_MSG_INFO: + error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); + if (error != 0) + break; + + flag = nl_getoptflag(sopt->sopt_name); + + if ((flag == NLF_MSG_INFO) && nlp->nl_linux) { + error = EINVAL; + break; + } + + NLCTL_WLOCK(); + if (optval != 0) + nlp->nl_flags |= flag; + else + nlp->nl_flags &= ~flag; + NLCTL_WUNLOCK(); + break; + default: + error = ENOPROTOOPT; + } + break; + case SOPT_GET: + switch (sopt->sopt_name) { + case NETLINK_LIST_MEMBERSHIPS: + NLCTL_RLOCK(); + optval = nlp_get_groups_compat(nlp); + NLCTL_RUNLOCK(); + error = sooptcopyout(sopt, &optval, sizeof(optval)); + 
break; + case NETLINK_CAP_ACK: + case NETLINK_EXT_ACK: + case NETLINK_GET_STRICT_CHK: + case NETLINK_MSG_INFO: + NLCTL_RLOCK(); + optval = (nlp->nl_flags & nl_getoptflag(sopt->sopt_name)) != 0; + NLCTL_RUNLOCK(); + error = sooptcopyout(sopt, &optval, sizeof(optval)); + break; + default: + error = ENOPROTOOPT; + } + break; + default: + error = ENOPROTOOPT; + } + + return (error); +} + +static int +sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS) +{ + int error = 0; + u_long tmp_maxsockbuf = nl_maxsockbuf; + + error = sysctl_handle_long(oidp, &tmp_maxsockbuf, arg2, req); + if (error || !req->newptr) + return (error); + if (tmp_maxsockbuf < MSIZE + MCLBYTES) + return (EINVAL); + nl_maxsockbuf = tmp_maxsockbuf; + + return (0); +} + +static int +nl_setsbopt(struct socket *so, struct sockopt *sopt) +{ + int error, optval; + bool result; + + if (sopt->sopt_name != SO_RCVBUF) + return (sbsetopt(so, sopt)); + + /* Allow to override max buffer size in certain conditions */ + + error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); + if (error != 0) + return (error); + NL_LOG(LOG_DEBUG2, "socket %p, PID %d, SO_RCVBUF=%d", so, curproc->p_pid, optval); + if (optval > sb_max_adj) { + if (priv_check(curthread, PRIV_NET_ROUTE) != 0) + return (EPERM); + } + + SOCK_RECVBUF_LOCK(so); + result = sbreserve_locked_limit(so, SO_RCV, optval, nl_maxsockbuf, curthread); + SOCK_RECVBUF_UNLOCK(so); + + return (result ? 
0 : ENOBUFS); +} + +#define NETLINK_PROTOSW \ + .pr_flags = PR_ATOMIC | PR_ADDR | PR_SOCKBUF, \ + .pr_ctloutput = nl_ctloutput, \ + .pr_setsbopt = nl_setsbopt, \ + .pr_attach = nl_attach, \ + .pr_bind = nl_bind, \ + .pr_connect = nl_connect, \ + .pr_disconnect = nl_disconnect, \ + .pr_sosend = nl_sosend, \ + .pr_soreceive = nl_soreceive, \ + .pr_sockaddr = nl_sockaddr, \ + .pr_close = nl_close + +static struct protosw netlink_raw_sw = { + .pr_type = SOCK_RAW, + NETLINK_PROTOSW +}; + +static struct protosw netlink_dgram_sw = { + .pr_type = SOCK_DGRAM, + NETLINK_PROTOSW +}; + +static struct domain netlinkdomain = { + .dom_family = PF_NETLINK, + .dom_name = "netlink", + .dom_flags = DOMF_UNLOADABLE, + .dom_nprotosw = 2, + .dom_protosw = { &netlink_raw_sw, &netlink_dgram_sw }, +}; + +DOMAIN_SET(netlink); diff --git a/sys/netlink/netlink_generic.c b/sys/netlink/netlink_generic.c new file mode 100644 index 000000000000..00f47e60f013 --- /dev/null +++ b/sys/netlink/netlink_generic.c @@ -0,0 +1,525 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/types.h> +#include <sys/ck.h> +#include <sys/epoch.h> +#include <sys/eventhandler.h> +#include <sys/kernel.h> +#include <sys/jail.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/priv.h> +#include <sys/socket.h> +#include <sys/sx.h> + +#include <netlink/netlink.h> +#include <netlink/netlink_ctl.h> +#include <netlink/netlink_generic.h> +#include <netlink/netlink_var.h> + +#define DEBUG_MOD_NAME nl_generic +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include <netlink/netlink_debug.h> +_DECLARE_DEBUG(LOG_INFO); + +static int nlctrl_handle_getfamily(struct nlmsghdr *, struct nl_pstate *); + +static struct genl_cmd nlctrl_cmds[] = { + [CTRL_CMD_GETFAMILY] = { + .cmd_num = CTRL_CMD_GETFAMILY, + .cmd_name = "GETFAMILY", + .cmd_cb = nlctrl_handle_getfamily, + .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | + GENL_CMD_CAP_HASPOL, + }, +}; + +static struct genl_family { + const char *family_name; + uint16_t family_hdrsize; + uint16_t family_version; + uint16_t family_attr_max; + uint16_t family_cmd_size; + uint16_t family_num_groups; + struct genl_cmd *family_cmds; +} families[MAX_FAMILIES] = { + [CTRL_FAMILY_ID] = { + .family_name = CTRL_FAMILY_NAME, + .family_hdrsize = 0, + .family_version = 2, + .family_attr_max = CTRL_ATTR_MAX, + .family_cmd_size = CTRL_CMD_GETFAMILY + 1, + .family_cmds = nlctrl_cmds, + .family_num_groups = 1, + }, +}; + +static struct genl_group { + struct genl_family *group_family; + const char 
*group_name; +} groups[MAX_GROUPS] = { + [CTRL_GROUP_ID] = { + .group_family = &families[CTRL_FAMILY_ID], + .group_name = CTRL_GROUP_NAME, + }, +}; + +static inline struct genl_family * +genl_family(uint16_t family_id) +{ + struct genl_family *gf; + + gf = &families[family_id - GENL_MIN_ID]; + KASSERT(family_id - GENL_MIN_ID < MAX_FAMILIES && + gf->family_name != NULL, ("family %u does not exist", family_id)); + return (gf); +} + +static inline uint16_t +genl_family_id(const struct genl_family *gf) +{ + MPASS(gf >= &families[0] && gf < &families[MAX_FAMILIES]); + return ((uint16_t)(gf - &families[0]) + GENL_MIN_ID); +} + +/* + * Handler called by netlink subsystem when matching netlink message is received + */ +static int +genl_handle_message(struct nlmsghdr *hdr, struct nl_pstate *npt) +{ + struct nlpcb *nlp = npt->nlp; + struct genl_family *gf; + uint16_t family_id; + int error = 0; + + if (__predict_false(hdr->nlmsg_len < sizeof(struct nlmsghdr) + + GENL_HDRLEN)) { + NLP_LOG(LOG_DEBUG, nlp, "invalid message size: %d", + hdr->nlmsg_len); + return (EINVAL); + } + + family_id = hdr->nlmsg_type - GENL_MIN_ID; + gf = &families[family_id]; + if (__predict_false(family_id >= MAX_FAMILIES || + gf->family_name == NULL)) { + NLP_LOG(LOG_DEBUG, nlp, "invalid message type: %d", + hdr->nlmsg_type); + return (ENOTSUP); + } + + struct genlmsghdr *ghdr = (struct genlmsghdr *)(hdr + 1); + + if (ghdr->cmd >= gf->family_cmd_size || gf->family_cmds[ghdr->cmd].cmd_cb == NULL) { + NLP_LOG(LOG_DEBUG, nlp, "family %s: invalid cmd %d", + gf->family_name, ghdr->cmd); + return (ENOTSUP); + } + + struct genl_cmd *cmd = &gf->family_cmds[ghdr->cmd]; + + if (cmd->cmd_priv != 0 && !nlp_has_priv(nlp, cmd->cmd_priv)) { + NLP_LOG(LOG_DEBUG, nlp, "family %s: cmd %d priv_check() failed", + gf->family_name, ghdr->cmd); + return (EPERM); + } + + NLP_LOG(LOG_DEBUG2, nlp, "received family %s cmd %s(%d) len %d", + gf->family_name, cmd->cmd_name, ghdr->cmd, hdr->nlmsg_len); + + error = cmd->cmd_cb(hdr, 
npt); + + return (error); +} + +static uint32_t +get_cmd_flags(const struct genl_cmd *cmd) +{ + uint32_t flags = cmd->cmd_flags; + if (cmd->cmd_priv != 0) + flags |= GENL_ADMIN_PERM; + return (flags); +} + +static int +dump_family(struct nlmsghdr *hdr, struct genlmsghdr *ghdr, + const struct genl_family *gf, struct nl_writer *nw) +{ + if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) + goto enomem; + + struct genlmsghdr *ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); + ghdr_new->cmd = ghdr->cmd; + ghdr_new->version = gf->family_version; + ghdr_new->reserved = 0; + + nlattr_add_string(nw, CTRL_ATTR_FAMILY_NAME, gf->family_name); + nlattr_add_u16(nw, CTRL_ATTR_FAMILY_ID, genl_family_id(gf)); + nlattr_add_u32(nw, CTRL_ATTR_VERSION, gf->family_version); + nlattr_add_u32(nw, CTRL_ATTR_HDRSIZE, gf->family_hdrsize); + nlattr_add_u32(nw, CTRL_ATTR_MAXATTR, gf->family_attr_max); + + if (gf->family_cmd_size > 0) { + int off = nlattr_add_nested(nw, CTRL_ATTR_OPS); + if (off == 0) + goto enomem; + for (int i = 0, cnt=0; i < gf->family_cmd_size; i++) { + struct genl_cmd *cmd = &gf->family_cmds[i]; + if (cmd->cmd_cb == NULL) + continue; + int cmd_off = nlattr_add_nested(nw, ++cnt); + if (cmd_off == 0) + goto enomem; + + nlattr_add_u32(nw, CTRL_ATTR_OP_ID, cmd->cmd_num); + nlattr_add_u32(nw, CTRL_ATTR_OP_FLAGS, get_cmd_flags(cmd)); + nlattr_set_len(nw, cmd_off); + } + nlattr_set_len(nw, off); + } + if (gf->family_num_groups > 0) { + int off = nlattr_add_nested(nw, CTRL_ATTR_MCAST_GROUPS); + if (off == 0) + goto enomem; + for (u_int i = 0, cnt = 0; i < MAX_GROUPS; i++) { + struct genl_group *gg = &groups[i]; + + if (gg->group_family != gf) + continue; + + int cmd_off = nlattr_add_nested(nw, ++cnt); + if (cmd_off == 0) + goto enomem; + nlattr_add_u32(nw, CTRL_ATTR_MCAST_GRP_ID, i + MIN_GROUP_NUM); + nlattr_add_string(nw, CTRL_ATTR_MCAST_GRP_NAME, gg->group_name); + nlattr_set_len(nw, cmd_off); + } + nlattr_set_len(nw, off); + } + if (nlmsg_end(nw)) + return (0); 
+enomem: + NL_LOG(LOG_DEBUG, "unable to dump family %s state (ENOMEM)", gf->family_name); + nlmsg_abort(nw); + return (ENOMEM); +} + +struct nl_parsed_family { + char *family_name; + uint16_t family_id; + uint8_t version; +}; + +#define _IN(_field) offsetof(struct genlmsghdr, _field) +#define _OUT(_field) offsetof(struct nl_parsed_family, _field) +static const struct nlfield_parser nlf_p_generic[] = { + { .off_in = _IN(version), .off_out = _OUT(version), .cb = nlf_get_u8 }, +}; + +static struct nlattr_parser nla_p_generic[] = { + { .type = CTRL_ATTR_FAMILY_ID , .off = _OUT(family_id), .cb = nlattr_get_uint16 }, + { .type = CTRL_ATTR_FAMILY_NAME , .off = _OUT(family_name), .cb = nlattr_get_string }, +}; +#undef _IN +#undef _OUT +NL_DECLARE_PARSER(genl_parser, struct genlmsghdr, nlf_p_generic, nla_p_generic); + +static int +nlctrl_handle_getfamily(struct nlmsghdr *hdr, struct nl_pstate *npt) +{ + int error = 0; + + struct nl_parsed_family attrs = {}; + error = nl_parse_nlmsg(hdr, &genl_parser, npt, &attrs); + if (error != 0) + return (error); + + struct genlmsghdr ghdr = { + .cmd = CTRL_CMD_NEWFAMILY, + }; + + if (attrs.family_id != 0 || attrs.family_name != NULL) { + for (u_int i = 0; i < MAX_FAMILIES; i++) { + struct genl_family *gf = &families[i]; + + if (gf->family_name == NULL) + continue; + if (attrs.family_id != 0 && + attrs.family_id != genl_family_id(gf)) + continue; + if (attrs.family_name != NULL && + strcmp(attrs.family_name, gf->family_name) != 0) + continue; + return (dump_family(hdr, &ghdr, gf, npt->nw)); + } + return (ENOENT); + } + + hdr->nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI; + for (u_int i = 0; i < MAX_FAMILIES; i++) { + struct genl_family *gf = &families[i]; + + if (gf->family_name != NULL) { + error = dump_family(hdr, &ghdr, gf, npt->nw); + if (error != 0) + break; + } + } + + if (!nlmsg_end_dump(npt->nw, error, hdr)) { + NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); + return (ENOMEM); + } + + return (error); +} + +static void 
+nlctrl_notify(void *arg __unused, const char *family_name __unused, + uint16_t family_id, u_int cmd) +{ + struct nlmsghdr hdr = {.nlmsg_type = NETLINK_GENERIC }; + struct genlmsghdr ghdr = { .cmd = cmd }; + struct genl_family *gf; + struct nl_writer nw; + + gf = genl_family(family_id); + if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_GENERIC, CTRL_GROUP_ID, + 0, false)) { + NL_LOG(LOG_DEBUG, "error allocating group writer"); + return; + } + + dump_family(&hdr, &ghdr, gf, &nw); + nlmsg_flush(&nw); +} + +static const struct nlhdr_parser *all_parsers[] = { &genl_parser }; +static eventhandler_tag family_event_tag; + +static void +genl_load_all(void *u __unused) +{ + NL_VERIFY_PARSERS(all_parsers); + family_event_tag = EVENTHANDLER_REGISTER(genl_family_event, + nlctrl_notify, NULL, EVENTHANDLER_PRI_ANY); + netlink_register_proto(NETLINK_GENERIC, "NETLINK_GENERIC", + genl_handle_message); +} +SYSINIT(genl_load_all, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, genl_load_all, NULL); + +static void +genl_unload(void *u __unused) +{ + netlink_unregister_proto(NETLINK_GENERIC); + EVENTHANDLER_DEREGISTER(genl_family_event, family_event_tag); + NET_EPOCH_WAIT(); +} +SYSUNINIT(genl_unload, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, genl_unload, NULL); + +/* + * Public KPI for NETLINK_GENERIC families/groups registration logic below. 
+ */ + +static struct sx sx_lock; +SX_SYSINIT(genl_lock, &sx_lock, "genetlink lock"); +#define GENL_LOCK() sx_xlock(&sx_lock) +#define GENL_UNLOCK() sx_xunlock(&sx_lock) +#define GENL_ASSERT_LOCKED() sx_assert(&sx_lock, SA_LOCKED) +#define GENL_ASSERT_XLOCKED() sx_assert(&sx_lock, SA_XLOCKED) + +uint16_t +genl_register_family(const char *family_name, size_t hdrsize, + uint16_t family_version, uint16_t max_attr_idx) +{ + struct genl_family *gf; + uint16_t family_id; + + MPASS(family_name != NULL); + + GENL_LOCK(); + for (u_int i = 0; i < MAX_FAMILIES; i++) + if (families[i].family_name != NULL && + strcmp(families[i].family_name, family_name) == 0) + return (0); + + /* Microoptimization: index 0 is reserved for the control family. */ + gf = NULL; + for (u_int i = 1; i < MAX_FAMILIES; i++) + if (families[i].family_name == NULL) { + gf = &families[i]; + break; + } + KASSERT(gf, ("%s: maximum of %u generic netlink families allocated", + __func__, MAX_FAMILIES)); + + *gf = (struct genl_family) { + .family_name = family_name, + .family_version = family_version, + .family_hdrsize = hdrsize, + .family_attr_max = max_attr_idx, + }; + family_id = genl_family_id(gf); + GENL_UNLOCK(); + + NL_LOG(LOG_DEBUG2, "Registered family %s id %d", gf->family_name, + family_id); + EVENTHANDLER_INVOKE(genl_family_event, gf->family_name, family_id, + CTRL_CMD_NEWFAMILY); + + return (family_id); +} + +void +genl_unregister_family(uint16_t family_id) +{ + struct genl_family *gf; + + GENL_LOCK(); + gf = genl_family(family_id); + + EVENTHANDLER_INVOKE(genl_family_event, gf->family_name, + family_id, CTRL_CMD_DELFAMILY); + for (u_int i = 0; i < MAX_GROUPS; i++) { + struct genl_group *gg = &groups[i]; + if (gg->group_family == gf && gg->group_name != NULL) { + gg->group_family = NULL; + gg->group_name = NULL; + } + } + if (gf->family_cmds != NULL) + free(gf->family_cmds, M_NETLINK); + bzero(gf, sizeof(*gf)); + GENL_UNLOCK(); +} + +bool +genl_register_cmds(uint16_t family_id, const struct genl_cmd 
*cmds, + u_int count) +{ + struct genl_family *gf; + uint16_t cmd_size; + + GENL_LOCK(); + gf = genl_family(family_id); + + cmd_size = gf->family_cmd_size; + + for (u_int i = 0; i < count; i++) { + MPASS(cmds[i].cmd_cb != NULL); + if (cmds[i].cmd_num >= cmd_size) + cmd_size = cmds[i].cmd_num + 1; + } + + if (cmd_size > gf->family_cmd_size) { + void *old_data; + + /* need to realloc */ + size_t sz = cmd_size * sizeof(struct genl_cmd); + void *data = malloc(sz, M_NETLINK, M_WAITOK | M_ZERO); + + memcpy(data, gf->family_cmds, + gf->family_cmd_size * sizeof(struct genl_cmd)); + old_data = gf->family_cmds; + gf->family_cmds = data; + gf->family_cmd_size = cmd_size; + free(old_data, M_NETLINK); + } + + for (u_int i = 0; i < count; i++) { + const struct genl_cmd *cmd = &cmds[i]; + + MPASS(gf->family_cmds[cmd->cmd_num].cmd_cb == NULL); + gf->family_cmds[cmd->cmd_num] = cmds[i]; + NL_LOG(LOG_DEBUG2, "Adding cmd %s(%d) to family %s", + cmd->cmd_name, cmd->cmd_num, gf->family_name); + } + GENL_UNLOCK(); + return (true); +} + +uint32_t +genl_register_group(uint16_t family_id, const char *group_name) +{ + struct genl_family *gf; + uint32_t group_id = 0; + + MPASS(group_name != NULL); + + GENL_LOCK(); + gf = genl_family(family_id); + + for (u_int i = 0; i < MAX_GROUPS; i++) + if (groups[i].group_family == gf && + strcmp(groups[i].group_name, group_name) == 0) { + GENL_UNLOCK(); + return (0); + } + + /* Microoptimization: index 0 is reserved for the control family */ + for (u_int i = 1; i < MAX_GROUPS; i++) { + struct genl_group *gg = &groups[i]; + if (gg->group_family == NULL) { + gf->family_num_groups++; + gg->group_family = gf; + gg->group_name = group_name; + group_id = i + MIN_GROUP_NUM; + break; + } + } + GENL_UNLOCK(); + + return (group_id); +} + +void +genl_unregister_group(uint16_t family_id, uint32_t group_id) +{ + struct genl_family *gf; + struct genl_group *gg; + + MPASS(group_id > MIN_GROUP_NUM && + group_id < MIN_GROUP_NUM + MAX_GROUPS); + + 
nl_clear_group(group_id); + + group_id -= MIN_GROUP_NUM; + + GENL_LOCK(); + gf = genl_family(family_id); + gg = &groups[group_id]; + + MPASS(gg->group_family == gf); + MPASS(gf->family_num_groups > 0); + + gf->family_num_groups--; + gg->group_family = NULL; + gg->group_name = NULL; + GENL_UNLOCK(); +} diff --git a/sys/netlink/netlink_generic.h b/sys/netlink/netlink_generic.h new file mode 100644 index 000000000000..fbd4ae785cbe --- /dev/null +++ b/sys/netlink/netlink_generic.h @@ -0,0 +1,114 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * Generic netlink message header and attributes + */ +#ifndef _NETLINK_NETLINK_GENERIC_H_ +#define _NETLINK_NETLINK_GENERIC_H_ + +#include <netlink/netlink.h> + +/* Base header for all of the relevant messages */ +struct genlmsghdr { + uint8_t cmd; /* CTRL_CMD_ */ + uint8_t version; /* ABI version for the cmd */ + uint16_t reserved; /* reserved: set to 0 */ +}; +#define GENL_HDRLEN NL_ITEM_ALIGN(sizeof(struct genlmsghdr)) + +/* Dynamic family number range, inclusive */ +#define GENL_MIN_ID NLMSG_MIN_TYPE +#define GENL_MAX_ID 1023 + +/* Pre-defined family numbers */ +#define GENL_ID_CTRL GENL_MIN_ID + +/* Available commands */ +enum { + CTRL_CMD_UNSPEC = 0, + CTRL_CMD_NEWFAMILY = 1, + CTRL_CMD_DELFAMILY = 2, + CTRL_CMD_GETFAMILY = 3, /* lists all (or matching) genetlink families */ + CTRL_CMD_NEWOPS = 4, + CTRL_CMD_DELOPS = 5, + CTRL_CMD_GETOPS = 6, + CTRL_CMD_NEWMCAST_GRP = 7, + CTRL_CMD_DELMCAST_GRP = 8, + CTRL_CMD_GETMCAST_GRP = 9, + CTRL_CMD_GETPOLICY = 10, + __CTRL_CMD_MAX, +}; +#define CTRL_CMD_MAX (__CTRL_CMD_MAX - 1) + +/* Generic attributes */ +enum { + CTRL_ATTR_UNSPEC, + CTRL_ATTR_FAMILY_ID = 1, /* u16, dynamically-assigned ID */ + CTRL_ATTR_FAMILY_NAME = 2, /* string, family name */ + CTRL_ATTR_VERSION = 3, /* u32, command version */ + CTRL_ATTR_HDRSIZE = 4, /* u32, family header size */ + CTRL_ATTR_MAXATTR = 5, /* u32, maximum family attr # */ + CTRL_ATTR_OPS = 6, /* nested, available operations */ + CTRL_ATTR_MCAST_GROUPS = 7, + CTRL_ATTR_POLICY = 8, + CTRL_ATTR_OP_POLICY = 9, + CTRL_ATTR_OP = 10, + __CTRL_ATTR_MAX, +}; +#define CTRL_ATTR_MAX (__CTRL_ATTR_MAX - 1) + +#define GENL_NAMSIZ 16 /* max family name length including \0 */ + +/* CTRL_ATTR_OPS attributes */ +enum { + CTRL_ATTR_OP_UNSPEC, + CTRL_ATTR_OP_ID = 1, /* u32, operation # */ + CTRL_ATTR_OP_FLAGS = 2, /* u32, flags-based op description */ + __CTRL_ATTR_OP_MAX, +}; +#define CTRL_ATTR_OP_MAX (__CTRL_ATTR_OP_MAX - 1) + +/* CTRL_ATTR_OP_FLAGS values */ +#define GENL_ADMIN_PERM 
0x0001 /* Requires elevated permissions */ +#define GENL_CMD_CAP_DO 0x0002 /* Operation is a modification request */ +#define GENL_CMD_CAP_DUMP 0x0004 /* Operation is a get/dump request */ +#define GENL_CMD_CAP_HASPOL 0x0008 /* Operation has a validation policy */ +#define GENL_UNS_ADMIN_PERM 0x0010 + +/* CTRL_ATTR_MCAST_GROUPS attributes */ +enum { + CTRL_ATTR_MCAST_GRP_UNSPEC, + CTRL_ATTR_MCAST_GRP_NAME, /* string, group name */ + CTRL_ATTR_MCAST_GRP_ID, /* u32, dynamically-assigned group id */ + __CTRL_ATTR_MCAST_GRP_MAX, +}; +#define CTRL_ATTR_MCAST_GRP_MAX (__CTRL_ATTR_MCAST_GRP_MAX - 1) + + +#endif + diff --git a/sys/netlink/netlink_glue.c b/sys/netlink/netlink_glue.c new file mode 100644 index 000000000000..4b593fd9657b --- /dev/null +++ b/sys/netlink/netlink_glue.c @@ -0,0 +1,292 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/rmlock.h> +#include <sys/domain.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/proc.h> +#include <sys/ck.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysent.h> +#include <sys/syslog.h> +#include <sys/priv.h> /* priv_check */ + +#include <net/route.h> +#include <net/route/route_ctl.h> + +#include <netlink/netlink.h> +#include <netlink/netlink_ctl.h> +#include <netlink/netlink_var.h> +#include <netlink/route/route_var.h> + +/* Standard bits: built-in the kernel */ +SYSCTL_NODE(_net, OID_AUTO, netlink, CTLFLAG_RD, 0, + "RFC3549 Netlink network state socket family"); +SYSCTL_NODE(_net_netlink, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + "Netlink per-subsystem debug levels"); + +MALLOC_DEFINE(M_NETLINK, "netlink", "Memory used for netlink packets"); + +/* Netlink-related callbacks needed to glue rtsock, netlink and linuxolator */ +static void +ignore_route_event(uint32_t fibnum, const struct rib_cmd_info *rc) +{ +} + +static void +ignore_ifmsg_event(struct ifnet *ifp, int if_flags_mask) +{ +} + +static struct rtbridge ignore_cb = { + .route_f = ignore_route_event, + .ifmsg_f = ignore_ifmsg_event, +}; + +void *linux_netlink_p = NULL; /* Callback pointer for Linux translator functions */ +struct rtbridge *rtsock_callback_p = &ignore_cb; +struct rtbridge *netlink_callback_p = 
&ignore_cb; + + +/* + * nlp accessors. + * TODO: move to a separate file once the number grows. + */ +bool +nlp_has_priv(struct nlpcb *nlp, int priv) +{ + return (priv_check_cred(nlp->nl_socket->so_cred, priv) == 0); +} + +struct ucred * +nlp_get_cred(struct nlpcb *nlp) +{ + return (nlp->nl_socket->so_cred); +} + +uint32_t +nlp_get_pid(const struct nlpcb *nlp) +{ + return (nlp->nl_process_id); +} + +bool +nlp_unconstrained_vnet(const struct nlpcb *nlp) +{ + return (nlp->nl_unconstrained_vnet); +} + +#ifndef NETLINK +/* Stub implementations for the loadable functions */ + +static bool +nl_writer_unicast_stub(struct nl_writer *nw, size_t size, struct nlpcb *nlp, + bool waitok) +{ + return (get_stub_writer(nw)); +} + +static bool +nl_writer_group_stub(struct nl_writer *nw, size_t size, uint16_t protocol, + uint16_t group_id, int priv, bool waitok) +{ + return (get_stub_writer(nw)); +} + +static bool +nlmsg_flush_stub(struct nl_writer *nw __unused) +{ + return (false); +} + +static void +nlmsg_ignore_limit_stub(struct nl_writer *nw __unused) +{ +} + +static bool +nlmsg_refill_buffer_stub(struct nl_writer *nw __unused, + size_t required_len __unused) +{ + return (false); +} + +static bool +nlmsg_add_stub(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, + uint16_t flags, uint32_t len) +{ + return (false); +} + +static bool +nlmsg_end_stub(struct nl_writer *nw __unused) +{ + return (false); +} + +static void +nlmsg_abort_stub(struct nl_writer *nw __unused) +{ +} + +static bool +nlmsg_end_dump_stub(struct nl_writer *nw, int error, struct nlmsghdr *hdr) +{ + return (false); +} + +static int +nl_modify_ifp_generic_stub(struct ifnet *ifp __unused, + struct nl_parsed_link *lattrs __unused, const struct nlattr_bmask *bm __unused, + struct nl_pstate *npt __unused) +{ + return (ENOTSUP); +} + +static void +nl_store_ifp_cookie_stub(struct nl_pstate *npt __unused, struct ifnet *ifp __unused) +{ +} + +static struct nlpcb * +nl_get_thread_nlp_stub(struct thread *td 
/*
 * If the kernel is compiled with netlink as a module,
 * provide a way to introduce non-stub functions.
 * Until nl_set_functions() installs a real table, every wrapper
 * dispatches to the nl_stub implementations.
 */
static const struct nl_function_wrapper *_nl = &nl_stub;
*hdr) +{ + return (_nl->nlmsg_end_dump(nw, error, hdr)); +} + +int +nl_modify_ifp_generic(struct ifnet *ifp, struct nl_parsed_link *lattrs, + const struct nlattr_bmask *bm , struct nl_pstate *npt) +{ + return (_nl->nl_modify_ifp_generic(ifp, lattrs, bm, npt)); +} + +void +nl_store_ifp_cookie(struct nl_pstate *npt, struct ifnet *ifp) +{ + return (_nl->nl_store_ifp_cookie(npt, ifp)); +} + +struct nlpcb * +nl_get_thread_nlp(struct thread *td) +{ + return (_nl->nl_get_thread_nlp(td)); +} + +#endif /* !NETLINK */ + diff --git a/sys/netlink/netlink_io.c b/sys/netlink/netlink_io.c new file mode 100644 index 000000000000..e7908d6f3a44 --- /dev/null +++ b/sys/netlink/netlink_io.c @@ -0,0 +1,369 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/ck.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/syslog.h> + +#include <netlink/netlink.h> +#include <netlink/netlink_ctl.h> +#include <netlink/netlink_linux.h> +#include <netlink/netlink_var.h> + +#define DEBUG_MOD_NAME nl_io +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include <netlink/netlink_debug.h> +_DECLARE_DEBUG(LOG_INFO); + +/* + * The logic below provide a p2p interface for receiving and + * sending netlink data between the kernel and userland. 
+ */ + +static bool nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp); + +struct nl_buf * +nl_buf_alloc(size_t len, int mflag) +{ + struct nl_buf *nb; + + KASSERT(len > 0 && len <= UINT_MAX, ("%s: invalid length %zu", + __func__, len)); + + nb = malloc(sizeof(struct nl_buf) + len, M_NETLINK, mflag); + if (__predict_true(nb != NULL)) { + nb->buflen = len; + nb->datalen = nb->offset = 0; + } + + return (nb); +} + +void +nl_buf_free(struct nl_buf *nb) +{ + + free(nb, M_NETLINK); +} + +void +nl_schedule_taskqueue(struct nlpcb *nlp) +{ + if (!nlp->nl_task_pending) { + nlp->nl_task_pending = true; + taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task); + NL_LOG(LOG_DEBUG3, "taskqueue scheduled"); + } else { + NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped"); + } +} + +static bool +nl_process_received_one(struct nlpcb *nlp) +{ + struct socket *so = nlp->nl_socket; + struct sockbuf *sb; + struct nl_buf *nb; + bool reschedule = false; + + NLP_LOCK(nlp); + nlp->nl_task_pending = false; + NLP_UNLOCK(nlp); + + /* + * Do not process queued up requests if there is no space to queue + * replies. + */ + sb = &so->so_rcv; + SOCK_RECVBUF_LOCK(so); + if (sb->sb_hiwat <= sb->sb_ccc) { + SOCK_RECVBUF_UNLOCK(so); + NL_LOG(LOG_DEBUG3, "socket %p stuck", so); + return (false); + } + SOCK_RECVBUF_UNLOCK(so); + + sb = &so->so_snd; + SOCK_SENDBUF_LOCK(so); + while ((nb = TAILQ_FIRST(&sb->nl_queue)) != NULL) { + TAILQ_REMOVE(&sb->nl_queue, nb, tailq); + SOCK_SENDBUF_UNLOCK(so); + reschedule = nl_process_nbuf(nb, nlp); + SOCK_SENDBUF_LOCK(so); + if (reschedule) { + sb->sb_acc -= nb->datalen; + sb->sb_ccc -= nb->datalen; + /* XXXGL: potentially can reduce lock&unlock count. 
*/ + sowwakeup_locked(so); + nl_buf_free(nb); + SOCK_SENDBUF_LOCK(so); + } else { + TAILQ_INSERT_HEAD(&sb->nl_queue, nb, tailq); + break; + } + } + SOCK_SENDBUF_UNLOCK(so); + + return (reschedule); +} + +static void +nl_process_received(struct nlpcb *nlp) +{ + NL_LOG(LOG_DEBUG3, "taskqueue called"); + + if (__predict_false(nlp->nl_need_thread_setup)) { + nl_set_thread_nlp(curthread, nlp); + NLP_LOCK(nlp); + nlp->nl_need_thread_setup = false; + NLP_UNLOCK(nlp); + } + + while (nl_process_received_one(nlp)) + ; +} + +/* + * Called after some data have been read from the socket. + */ +void +nl_on_transmit(struct nlpcb *nlp) +{ + NLP_LOCK(nlp); + + struct socket *so = nlp->nl_socket; + if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) { + unsigned long dropped_bytes = nlp->nl_dropped_bytes; + unsigned long dropped_messages = nlp->nl_dropped_messages; + nlp->nl_dropped_bytes = 0; + nlp->nl_dropped_messages = 0; + + struct sockbuf *sb = &so->so_rcv; + NLP_LOG(LOG_DEBUG, nlp, + "socket RX overflowed, %lu messages (%lu bytes) dropped. " + "bytes: [%u/%u]", dropped_messages, dropped_bytes, + sb->sb_ccc, sb->sb_hiwat); + /* TODO: send netlink message */ + } + + nl_schedule_taskqueue(nlp); + NLP_UNLOCK(nlp); +} + +void +nl_taskqueue_handler(void *_arg, int pending) +{ + struct nlpcb *nlp = (struct nlpcb *)_arg; + + CURVNET_SET(nlp->nl_socket->so_vnet); + nl_process_received(nlp); + CURVNET_RESTORE(); +} + +/* + * Tries to send current data buffer from writer. + * + * Returns true on success. + * If no queue overrunes happened, wakes up socket owner. 
+ */ +bool +nl_send(struct nl_writer *nw, struct nlpcb *nlp) +{ + struct socket *so = nlp->nl_socket; + struct sockbuf *sb = &so->so_rcv; + struct nl_buf *nb; + + MPASS(nw->hdr == NULL); + MPASS(nw->buf != NULL); + MPASS(nw->buf->datalen > 0); + + IF_DEBUG_LEVEL(LOG_DEBUG2) { + struct nlmsghdr *hdr = (struct nlmsghdr *)nw->buf->data; + NLP_LOG(LOG_DEBUG2, nlp, + "TX len %u msgs %u msg type %d first hdrlen %u", + nw->buf->datalen, nw->num_messages, hdr->nlmsg_type, + hdr->nlmsg_len); + } + + if (nlp->nl_linux && linux_netlink_p != NULL && + __predict_false(!linux_netlink_p->msgs_to_linux(nw, nlp))) { + nl_buf_free(nw->buf); + nw->buf = NULL; + return (false); + } + + nb = nw->buf; + nw->buf = NULL; + + SOCK_RECVBUF_LOCK(so); + if (!nw->ignore_limit && __predict_false(sb->sb_hiwat <= sb->sb_ccc)) { + SOCK_RECVBUF_UNLOCK(so); + NLP_LOCK(nlp); + nlp->nl_dropped_bytes += nb->datalen; + nlp->nl_dropped_messages += nw->num_messages; + NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)", + (unsigned long)nlp->nl_dropped_messages, nw->num_messages, + (unsigned long)nlp->nl_dropped_bytes, nb->datalen); + NLP_UNLOCK(nlp); + nl_buf_free(nb); + return (false); + } else { + bool full; + + TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq); + sb->sb_acc += nb->datalen; + sb->sb_ccc += nb->datalen; + full = sb->sb_hiwat <= sb->sb_ccc; + sorwakeup_locked(so); + if (full) { + NLP_LOCK(nlp); + nlp->nl_tx_blocked = true; + NLP_UNLOCK(nlp); + } + return (true); + } +} + +static int +nl_receive_message(struct nlmsghdr *hdr, int remaining_length, + struct nlpcb *nlp, struct nl_pstate *npt) +{ + nl_handler_f handler = nl_handlers[nlp->nl_proto].cb; + int error = 0; + + NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u", + hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq, + hdr->nlmsg_pid); + + if (__predict_false(hdr->nlmsg_len > remaining_length)) { + NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d", + 
/*
 * Processes an incoming packet, which can contain multiple netlink messages.
 *
 * Replies are written through a unicast writer bound to nlp, with the
 * writer's limit check disabled via nlmsg_ignore_limit().  Messages are
 * consumed from nb->offset onwards until fewer than a full nlmsghdr remains,
 * a message fails, or nlp->nl_tx_blocked becomes set.
 *
 * Returns false when nl_tx_blocked was set (the flag is cleared here under
 * the nlp lock); returns true otherwise, including the case where the
 * reply writer could not be allocated.
 */
static bool
nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp)
{
	struct nl_writer nw;
	struct nlmsghdr *hdr;
	int error;

	NL_LOG(LOG_DEBUG3, "RX netlink buf %p on %p", nb, nlp->nl_socket);

	if (!nl_writer_unicast(&nw, NLMSG_SMALL, nlp, false)) {
		NL_LOG(LOG_DEBUG, "error allocating socket writer");
		return (true);
	}

	nlmsg_ignore_limit(&nw);

	/*
	 * The part of the buffer past the 8-byte-rounded payload is handed
	 * to the parser state as lb; presumably this backs npt_alloc() --
	 * TODO confirm.
	 */
	struct nl_pstate npt = {
		.nlp = nlp,
		.lb.base = &nb->data[roundup2(nb->datalen, 8)],
		.lb.size = nb->buflen - roundup2(nb->datalen, 8),
		.nw = &nw,
		.strict = nlp->nl_flags & NLF_STRICT,
	};

	for (; nb->offset + sizeof(struct nlmsghdr) <= nb->datalen;) {
		hdr = (struct nlmsghdr *)&nb->data[nb->offset];
		/* Save length prior to calling handler */
		int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
		NL_LOG(LOG_DEBUG3, "parsing offset %d/%d",
		    nb->offset, nb->datalen);
		/* Reset per-message parser state before each message. */
		npt_clear(&npt);
		error = nl_receive_message(hdr, nb->datalen - nb->offset, nlp,
		    &npt);
		/* Advance past this message even if it failed. */
		nb->offset += msglen;
		if (__predict_false(error != 0 || nlp->nl_tx_blocked))
			break;
	}
	NL_LOG(LOG_DEBUG3, "packet parsing done");
	nlmsg_flush(&nw);

	if (nlp->nl_tx_blocked) {
		NLP_LOCK(nlp);
		nlp->nl_tx_blocked = false;
		NLP_UNLOCK(nlp);
		return (false);
	} else
		return (true);
}
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _NETLINK_LINUX_VAR_H_ +#define _NETLINK_LINUX_VAR_H_ +#ifdef _KERNEL + +/* + * The file contains headers for the bridge interface between + * linux[_common] module and the netlink module + */ +struct nlpcb; +struct nl_pstate; +struct nl_writer; + +typedef bool msgs_to_linux_cb_t(struct nl_writer *nw, struct nlpcb *nlp); +typedef int msg_from_linux_cb_t(int netlink_family, struct nlmsghdr **hdr, + struct nl_pstate *npt); + +struct linux_netlink_provider { + msgs_to_linux_cb_t *msgs_to_linux; + msg_from_linux_cb_t *msg_from_linux; + +}; + +extern struct linux_netlink_provider *linux_netlink_p; + +#endif +#endif diff --git a/sys/netlink/netlink_message_parser.c b/sys/netlink/netlink_message_parser.c new file mode 100644 index 000000000000..4c41235efaac --- /dev/null +++ b/sys/netlink/netlink_message_parser.c @@ -0,0 +1,635 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +#include "opt_inet.h" +#include "opt_inet6.h" +#include <sys/types.h> +#include <sys/malloc.h> +#include <sys/rmlock.h> +#include <sys/socket.h> +#include <sys/stdarg.h> + +#include <net/if.h> +#include <net/route.h> +#include <net/route/nhop.h> + +#include <net/route/route_ctl.h> +#include <netinet/in.h> +#include <netlink/netlink.h> +#include <netlink/netlink_ctl.h> +#include <netlink/netlink_var.h> +#include <netlink/netlink_route.h> + +#define DEBUG_MOD_NAME nl_parser +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include <netlink/netlink_debug.h> +_DECLARE_DEBUG(LOG_INFO); + +bool +nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...) 
+{ + va_list ap; + + if (npt->err_msg != NULL) + return (false); + char *buf = npt_alloc(npt, NL_MAX_ERROR_BUF); + if (buf == NULL) + return (false); + va_start(ap, fmt); + vsnprintf(buf, NL_MAX_ERROR_BUF, fmt, ap); + va_end(ap); + + npt->err_msg = buf; + return (true); +} + +bool +nlmsg_report_err_offset(struct nl_pstate *npt, uint32_t off) +{ + if (npt->err_off != 0) + return (false); + npt->err_off = off; + return (true); +} + +void +nlmsg_report_cookie(struct nl_pstate *npt, struct nlattr *nla) +{ + MPASS(nla->nla_type == NLMSGERR_ATTR_COOKIE); + MPASS(nla->nla_len >= sizeof(struct nlattr)); + npt->cookie = nla; +} + +void +nlmsg_report_cookie_u32(struct nl_pstate *npt, uint32_t val) +{ + struct nlattr *nla = npt_alloc(npt, sizeof(*nla) + sizeof(uint32_t)); + + nla->nla_type = NLMSGERR_ATTR_COOKIE; + nla->nla_len = sizeof(*nla) + sizeof(uint32_t); + memcpy(nla + 1, &val, sizeof(uint32_t)); + nlmsg_report_cookie(npt, nla); +} + +static const struct nlattr_parser * +search_states(const struct nlattr_parser *ps, u_int pslen, int key) +{ + int left_i = 0, right_i = pslen - 1; + + if (key < ps[0].type || key > ps[pslen - 1].type) + return (NULL); + + while (left_i + 1 < right_i) { + int mid_i = (left_i + right_i) / 2; + if (key < ps[mid_i].type) + right_i = mid_i; + else if (key > ps[mid_i].type) + left_i = mid_i + 1; + else + return (&ps[mid_i]); + } + if (ps[left_i].type == key) + return (&ps[left_i]); + else if (ps[right_i].type == key) + return (&ps[right_i]); + return (NULL); +} + +int +nl_parse_attrs_raw(struct nlattr *nla_head, uint16_t len, + const struct nlattr_parser *ps, u_int pslen, struct nl_pstate *npt, + void *target) +{ + const struct nlattr_parser *s; + struct nlattr *nla; + uint16_t orig_len, off; + int error = 0; + + NL_LOG(LOG_DEBUG3, "parse %p remaining_len %d", nla_head, len); + orig_len = len; + NLA_FOREACH(nla, nla_head, len) { + NL_LOG(LOG_DEBUG3, ">> parsing %p attr_type %u len %u (rem %u)", + nla, nla->nla_type, nla->nla_len, len); + if 
(nla->nla_len < sizeof(struct nlattr)) { + NLMSG_REPORT_ERR_MSG(npt, + "Invalid attr %p type %u len: %u", + nla, nla->nla_type, nla->nla_len); + off = (char *)nla - (char *)npt->hdr; + nlmsg_report_err_offset(npt, off); + return (EINVAL); + } + + s = search_states(ps, pslen, nla->nla_type & NLA_TYPE_MASK); + if (s != NULL) { + void *ptr; + + ptr = (void *)((char *)target + s->off); + error = s->cb(nla, npt, s->arg, ptr); + if (error != 0) { + off = (char *)nla - (char *)npt->hdr; + nlmsg_report_err_offset(npt, off); + NL_LOG(LOG_DEBUG3, + "parse failed at offset %u", off); + return (error); + } + } else { + /* Ignore non-specified attributes */ + NL_LOG(LOG_DEBUG3, "ignoring attr %u", nla->nla_type); + } + } + if (len >= sizeof(struct nlattr)) { + nla = (struct nlattr *)((char *)nla_head + (orig_len - len)); + NL_LOG(LOG_DEBUG3, " >>> end %p attr_type %u len %u", nla, + nla->nla_type, nla->nla_len); + } + NL_LOG(LOG_DEBUG3, "end parse: %p remaining_len %u", nla, len); + + return (0); +} + +void +nl_get_attrs_bmask_raw(struct nlattr *nla_head, uint32_t len, + struct nlattr_bmask *bm) +{ + struct nlattr *nla = NULL; + uint16_t nla_type; + + BIT_ZERO(NL_ATTR_BMASK_SIZE, bm); + + NLA_FOREACH(nla, nla_head, len) { + if (nla->nla_len < sizeof(struct nlattr)) + return; + nla_type = nla->nla_type & NLA_TYPE_MASK; + if (nla_type < NL_ATTR_BMASK_SIZE) + BIT_SET(NL_ATTR_BMASK_SIZE, nla_type, bm); + else + NL_LOG(LOG_DEBUG2, + "Skipping type %u in the mask: too short", + nla_type); + } +} + +bool +nl_has_attr(const struct nlattr_bmask *bm, uint16_t nla_type) +{ + MPASS(nla_type < NL_ATTR_BMASK_SIZE); + + return (BIT_ISSET(NL_ATTR_BMASK_SIZE, nla_type, bm)); +} + +int +nlattr_get_flag(struct nlattr *nla, struct nl_pstate *npt, const void *arg, + void *target) +{ + if (__predict_false(NLA_DATA_LEN(nla) != 0)) { + NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not a flag", + nla->nla_type, NLA_DATA_LEN(nla)); + return (EINVAL); + } + + *((uint8_t *)target) = 1; + return (0); 
+} + +static struct sockaddr * +parse_rta_ip4(void *rta_data, struct nl_pstate *npt, int *perror) +{ + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)npt_alloc_sockaddr(npt, + sizeof(struct sockaddr_in)); + if (__predict_false(sin == NULL)) { + *perror = ENOBUFS; + return (NULL); + } + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_family = AF_INET; + memcpy(&sin->sin_addr, rta_data, sizeof(struct in_addr)); + return ((struct sockaddr *)sin); +} + +static struct sockaddr * +parse_rta_ip6(void *rta_data, struct nl_pstate *npt, int *perror) +{ + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)npt_alloc_sockaddr(npt, + sizeof(struct sockaddr_in6)); + if (__predict_false(sin6 == NULL)) { + *perror = ENOBUFS; + return (NULL); + } + sin6->sin6_len = sizeof(struct sockaddr_in6); + sin6->sin6_family = AF_INET6; + memcpy(&sin6->sin6_addr, rta_data, sizeof(struct in6_addr)); + return ((struct sockaddr *)sin6); +} + +static struct sockaddr * +parse_rta_ip(struct rtattr *rta, struct nl_pstate *npt, int *perror) +{ + void *rta_data = NL_RTA_DATA(rta); + int rta_len = NL_RTA_DATA_LEN(rta); + + if (rta_len == sizeof(struct in_addr)) { + return (parse_rta_ip4(rta_data, npt, perror)); + } else if (rta_len == sizeof(struct in6_addr)) { + return (parse_rta_ip6(rta_data, npt, perror)); + } else { + NLMSG_REPORT_ERR_MSG(npt, "unknown IP len: %d for rta type %d", + rta_len, rta->rta_type); + *perror = ENOTSUP; + return (NULL); + } + return (NULL); +} + +int +nlattr_get_ip(struct nlattr *nla, struct nl_pstate *npt, const void *arg, + void *target) +{ + int error = 0; + + struct sockaddr *sa = parse_rta_ip((struct rtattr *)nla, npt, &error); + + *((struct sockaddr **)target) = sa; + return (error); +} + +static struct sockaddr * +parse_rta_via(struct rtattr *rta, struct nl_pstate *npt, int *perror) +{ + struct rtvia *via = NL_RTA_DATA(rta); + int data_len = NL_RTA_DATA_LEN(rta); + + if (__predict_false(data_len) < sizeof(struct rtvia)) { + 
NLMSG_REPORT_ERR_MSG(npt, "undersized RTA_VIA(%d) attr: len %d", + rta->rta_type, data_len); + *perror = EINVAL; + return (NULL); + } + data_len -= offsetof(struct rtvia, rtvia_addr); + + switch (via->rtvia_family) { + case AF_INET: + if (__predict_false(data_len < sizeof(struct in_addr))) { + *perror = EINVAL; + return (NULL); + } + return (parse_rta_ip4(via->rtvia_addr, npt, perror)); + case AF_INET6: + if (__predict_false(data_len < sizeof(struct in6_addr))) { + *perror = EINVAL; + return (NULL); + } + return (parse_rta_ip6(via->rtvia_addr, npt, perror)); + default: + *perror = ENOTSUP; + return (NULL); + } +} + +int +nlattr_get_ipvia(struct nlattr *nla, struct nl_pstate *npt, const void *arg, + void *target) +{ + int error = 0; + + struct sockaddr *sa = parse_rta_via((struct rtattr *)nla, npt, &error); + + *((struct sockaddr **)target) = sa; + return (error); +} + +int +nlattr_get_bool(struct nlattr *nla, struct nl_pstate *npt, const void *arg, + void *target) +{ + if (__predict_false(NLA_DATA_LEN(nla) != sizeof(bool))) { + NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not bool", + nla->nla_type, NLA_DATA_LEN(nla)); + return (EINVAL); + } + *((bool *)target) = *((const bool *)NL_RTA_DATA_CONST(nla)); + return (0); +} + +int +nlattr_get_uint8(struct nlattr *nla, struct nl_pstate *npt, const void *arg, + void *target) +{ + if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint8_t))) { + NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint8", + nla->nla_type, NLA_DATA_LEN(nla)); + return (EINVAL); + } + *((uint8_t *)target) = *((const uint8_t *)NL_RTA_DATA_CONST(nla)); + return (0); +} + +int +nlattr_get_uint16(struct nlattr *nla, struct nl_pstate *npt, const void *arg, + void *target) +{ + if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint16_t))) { + NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint16", + nla->nla_type, NLA_DATA_LEN(nla)); + return (EINVAL); + } + *((uint16_t *)target) = *((const uint16_t *)NL_RTA_DATA_CONST(nla)); + return 
(0); +} + +int +nlattr_get_uint32(struct nlattr *nla, struct nl_pstate *npt, const void *arg, + void *target) +{ + if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint32_t))) { + NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint32", + nla->nla_type, NLA_DATA_LEN(nla)); + return (EINVAL); + } + *((uint32_t *)target) = *((const uint32_t *)NL_RTA_DATA_CONST(nla)); + return (0); +} + +int +nlattr_get_uint64(struct nlattr *nla, struct nl_pstate *npt, const void *arg, + void *target) +{ + if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint64_t))) { + NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint64", + nla->nla_type, NLA_DATA_LEN(nla)); + return (EINVAL); + } + memcpy(target, NL_RTA_DATA_CONST(nla), sizeof(uint64_t)); + return (0); +} + +int +nlattr_get_in_addr(struct nlattr *nla, struct nl_pstate *npt, const void *arg, + void *target) +{ + if (__predict_false(NLA_DATA_LEN(nla) != sizeof(in_addr_t))) { + NLMSG_REPORT_ERR_MSG(npt, + "nla type %d size(%u) is not in_addr_t", + nla->nla_type, NLA_DATA_LEN(nla)); + return (EINVAL); + } + memcpy(target, NLA_DATA_CONST(nla), sizeof(in_addr_t)); + return (0); +} + +int +nlattr_get_in6_addr(struct nlattr *nla, struct nl_pstate *npt, const void *arg, + void *target) +{ + if (__predict_false(NLA_DATA_LEN(nla) != sizeof(struct in6_addr))) { + NLMSG_REPORT_ERR_MSG(npt, + "nla type %d size(%u) is not struct in6_addr", + nla->nla_type, NLA_DATA_LEN(nla)); + return (EINVAL); + } + memcpy(target, NLA_DATA_CONST(nla), sizeof(struct in6_addr)); + return (0); +} + +static int +nlattr_get_ifp_internal(struct nlattr *nla, struct nl_pstate *npt, + void *target, bool zero_ok) +{ + struct ifnet *ifp; + u_int ifindex; + + if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint32_t))) { + NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint32", + nla->nla_type, NLA_DATA_LEN(nla)); + return (EINVAL); + } + ifindex = *((const u_int *)NLA_DATA_CONST(nla)); + + if (ifindex == 0 && zero_ok) { + *((struct ifnet **)target) = 
NULL; + return (0); + } + + NET_EPOCH_ASSERT(); + + ifp = ifnet_byindex(ifindex); + if (__predict_false(ifp == NULL)) { + NLMSG_REPORT_ERR_MSG(npt, "nla type %d: ifindex %u invalid", + nla->nla_type, ifindex); + return (ENOENT); + } + *((struct ifnet **)target) = ifp; + NL_LOG(LOG_DEBUG3, "nla type %d: ifindex %u -> %s", nla->nla_type, + ifindex, if_name(ifp)); + + return (0); +} + +int +nlattr_get_ifp(struct nlattr *nla, struct nl_pstate *npt, const void *arg, + void *target) +{ + return (nlattr_get_ifp_internal(nla, npt, target, false)); +} + +int +nlattr_get_ifpz(struct nlattr *nla, struct nl_pstate *npt, const void *arg, + void *target) +{ + return (nlattr_get_ifp_internal(nla, npt, target, true)); +} + +int +nlattr_get_chara(struct nlattr *nla, struct nl_pstate *npt, const void *arg, + void *target) +{ + int maxlen = NLA_DATA_LEN(nla); + int target_size = (size_t)arg; + int len = strnlen((char *)NLA_DATA(nla), maxlen); + + if (__predict_false(len >= maxlen) || + __predict_false(len >= target_size)) { + NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not " + "NULL-terminated or longer than %u", + nla->nla_type, maxlen, target_size); + return (EINVAL); + } + + strncpy((char *)target, (char *)NLA_DATA(nla), target_size); + return (0); +} + +int +nlattr_get_string(struct nlattr *nla, struct nl_pstate *npt, const void *arg, + void *target) +{ + int maxlen = NLA_DATA_LEN(nla); + + if (__predict_false(strnlen((char *)NLA_DATA(nla), maxlen) >= maxlen)) { + NLMSG_REPORT_ERR_MSG(npt, + "nla type %d size(%u) is not NULL-terminated", + nla->nla_type, maxlen); + return (EINVAL); + } + + *((char **)target) = (char *)NLA_DATA(nla); + return (0); +} + +int +nlattr_get_stringn(struct nlattr *nla, struct nl_pstate *npt, const void *arg, + void *target) +{ + int maxlen = NLA_DATA_LEN(nla); + + char *buf = npt_alloc(npt, maxlen + 1); + if (buf == NULL) + return (ENOMEM); + buf[maxlen] = '\0'; + memcpy(buf, NLA_DATA(nla), maxlen); + + *((char **)target) = buf; + return (0); +} 
+ +int +nlattr_get_bytes(struct nlattr *nla, struct nl_pstate *npt, const void *arg, + void *target) +{ + size_t size = (size_t)arg; + + if (NLA_DATA_LEN(nla) != size) + return (EINVAL); + + memcpy(target, NLA_DATA(nla), size); + + return (0); +} + +int +nlattr_get_nla(struct nlattr *nla, struct nl_pstate *npt, const void *arg, + void *target) +{ + NL_LOG(LOG_DEBUG3, "STORING %p len %d", nla, nla->nla_len); + *((struct nlattr **)target) = nla; + return (0); +} + +int +nlattr_get_nested(struct nlattr *nla, struct nl_pstate *npt, const void *arg, + void *target) +{ + const struct nlhdr_parser *p = (const struct nlhdr_parser *)arg; + + /* Assumes target points to the beginning of the structure. */ + return (nl_parse_header(NLA_DATA(nla), NLA_DATA_LEN(nla), p, npt, + target)); +} + +int +nlattr_get_nested_ptr(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target) +{ + const struct nlhdr_parser *p = (const struct nlhdr_parser *)arg; + + /* Assumes target points to the beginning of the structure. 
/*
 * Header field parser callback: reads an 8-bit unsigned value from @src and
 * stores it, widened to 32 bits, into the uint32_t at @target.
 *
 * Always returns 0.  @npt is not used.
 */
int
nlf_get_u8_u32(void *src, struct nl_pstate *npt, void *target)
{
	const uint8_t *in = (const uint8_t *)src;
	uint32_t *out = (uint32_t *)target;

	*out = *in;

	return (0);
}
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _NETLINK_NETLINK_MESSAGE_PARSER_H_ +#define _NETLINK_NETLINK_MESSAGE_PARSER_H_ + +#ifdef _KERNEL + +#include <sys/bitset.h> + +/* + * It is not meant to be included directly + */ + +/* Parsing state */ +struct linear_buffer { + char *base; /* Base allocated memory pointer */ + uint32_t offset; /* Currently used offset */ + uint32_t size; /* Total buffer size */ +} __aligned(_Alignof(__max_align_t)); + +static inline void * +lb_alloc(struct linear_buffer *lb, int len) +{ + len = roundup2(len, _Alignof(__max_align_t)); + if (lb->offset + len > lb->size) + return (NULL); + void *data = (void *)(lb->base + lb->offset); + lb->offset += len; + return (data); +} + +static inline void +lb_clear(struct linear_buffer *lb) +{ + memset(lb->base, 0, lb->size); + lb->offset = 0; +} + +#define NL_MAX_ERROR_BUF 128 +#define SCRATCH_BUFFER_SIZE (1024 + NL_MAX_ERROR_BUF) +struct nl_pstate { + struct linear_buffer lb; /* Per-message scratch buffer */ + struct nlpcb *nlp; /* Originator socket */ + struct nl_writer *nw; /* Message writer to use */ + struct nlmsghdr *hdr; /* Current parsed message header */ + uint32_t err_off; /* error offset from hdr start */ + int error; /* last operation error */ + char *err_msg; /* Description of last error */ + struct nlattr *cookie; /* NLA to return to the userspace */ + bool strict; /* Strict parsing required */ +}; + +static inline void * +npt_alloc(struct nl_pstate *npt, int len) +{ + return (lb_alloc(&npt->lb, len)); +} +#define npt_alloc_sockaddr(_npt, _len) \ + ((struct sockaddr *)(npt_alloc((_npt), (_len)))) + +typedef int parse_field_f(void *hdr, struct nl_pstate *npt, void *target); +struct nlfield_parser { + uint16_t off_in; + uint16_t off_out; + parse_field_f *cb; +}; +static const struct nlfield_parser nlf_p_empty[] = {}; + +int nlf_get_ifp(void *src, struct nl_pstate *npt, void *target); +int nlf_get_ifpz(void *src, struct nl_pstate *npt, void *target); +int nlf_get_u8(void *src, struct nl_pstate *npt, void *target); 
+int nlf_get_u16(void *src, struct nl_pstate *npt, void *target); +int nlf_get_u32(void *src, struct nl_pstate *npt, void *target); +int nlf_get_u8_u32(void *src, struct nl_pstate *npt, void *target); + +struct nlattr_parser; +typedef int parse_attr_f(struct nlattr *attr, struct nl_pstate *npt, + const void *arg, void *target); +struct nlattr_parser { + uint16_t type; /* Attribute type */ + uint16_t off; /* field offset in the target structure */ + parse_attr_f *cb; /* parser function to call */ + const void *arg; +}; + +typedef bool strict_parser_f(void *hdr, struct nl_pstate *npt); +typedef bool post_parser_f(void *parsed_attrs, struct nl_pstate *npt); + +struct nlhdr_parser { + u_int nl_hdr_off; /* aligned netlink header size */ + u_int out_hdr_off; /* target header size */ + u_int fp_size; + u_int np_size; + const struct nlfield_parser *fp; /* array of header field parsers */ + const struct nlattr_parser *np; /* array of attribute parsers */ + strict_parser_f *sp; /* Pre-parse strict validation function */ + post_parser_f *post_parse; +}; + +#define NL_DECLARE_PARSER_EXT(_name, _t, _sp, _fp, _np, _pp) \ +static const struct nlhdr_parser _name = { \ + .nl_hdr_off = sizeof(_t), \ + .fp = &((_fp)[0]), \ + .np = &((_np)[0]), \ + .fp_size = nitems(_fp), \ + .np_size = nitems(_np), \ + .sp = _sp, \ + .post_parse = _pp, \ +} + +#define NL_DECLARE_PARSER(_name, _t, _fp, _np) \ + NL_DECLARE_PARSER_EXT(_name, _t, NULL, _fp, _np, NULL) + +#define NL_DECLARE_STRICT_PARSER(_name, _t, _sp, _fp, _np) \ + NL_DECLARE_PARSER_EXT(_name, _t, _sp, _fp, _np, NULL) + +#define NL_DECLARE_ARR_PARSER(_name, _t, _o, _fp, _np) \ +static const struct nlhdr_parser _name = { \ + .nl_hdr_off = sizeof(_t), \ + .out_hdr_off = sizeof(_o), \ + .fp = &((_fp)[0]), \ + .np = &((_np)[0]), \ + .fp_size = nitems(_fp), \ + .np_size = nitems(_np), \ +} + +#define NL_DECLARE_ATTR_PARSER_EXT(_name, _np, _pp) \ +static const struct nlhdr_parser _name = { \ + .np = &((_np)[0]), \ + .np_size = nitems(_np), \ 
+ .post_parse = (_pp) \ +} + +#define NL_DECLARE_ATTR_PARSER(_name, _np) \ + NL_DECLARE_ATTR_PARSER_EXT(_name, _np, NULL) + +#define NL_ATTR_BMASK_SIZE 128 +BITSET_DEFINE(nlattr_bmask, NL_ATTR_BMASK_SIZE); + +void nl_get_attrs_bmask_raw(struct nlattr *nla_head, uint32_t len, + struct nlattr_bmask *bm); +bool nl_has_attr(const struct nlattr_bmask *bm, uint16_t nla_type); + +int nl_parse_attrs_raw(struct nlattr *nla_head, uint16_t len, + const struct nlattr_parser *ps, u_int pslen, struct nl_pstate *npt, + void *target); + +int nlattr_get_flag(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_ip(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_bool(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_uint8(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_uint16(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_uint32(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_uint64(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_in_addr(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_in6_addr(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_ifp(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_ifpz(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_ipvia(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_chara(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_string(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_stringn(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, 
void *target); +int nlattr_get_bytes(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_nla(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_nested(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_nested_ptr(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); + +bool nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...) + __printflike(2, 3); + +#define NLMSG_REPORT_ERR_MSG(_npt, _fmt, ...) { \ + nlmsg_report_err_msg(_npt, _fmt, ## __VA_ARGS__); \ + NLP_LOG(LOG_DEBUG, (_npt)->nlp, _fmt, ## __VA_ARGS__); \ +} + +bool nlmsg_report_err_offset(struct nl_pstate *npt, uint32_t off); + +void nlmsg_report_cookie(struct nl_pstate *npt, struct nlattr *nla); +void nlmsg_report_cookie_u32(struct nl_pstate *npt, uint32_t val); + +/* + * Have it inline so compiler can optimize field accesses into + * the list of direct function calls without iteration. + */ +static inline int +nl_parse_header(void *hdr, uint32_t len, const struct nlhdr_parser *parser, + struct nl_pstate *npt, void *target) +{ + int error; + + if (__predict_false(len < parser->nl_hdr_off)) { + void *tmp_hdr; + + if (npt->strict) { + nlmsg_report_err_msg(npt, + "header too short: expected %d, got %d", + parser->nl_hdr_off, len); + return (EINVAL); + } + + /* + * Compatibility with older applications: + * pretend there's a full header. 
+ */ + tmp_hdr = npt_alloc(npt, parser->nl_hdr_off); + if (tmp_hdr == NULL) + return (EINVAL); + memcpy(tmp_hdr, hdr, len); + hdr = tmp_hdr; + len = parser->nl_hdr_off; + } + + if (npt->strict && parser->sp != NULL && !parser->sp(hdr, npt)) + return (EINVAL); + + /* Extract fields first */ + for (u_int i = 0; i < parser->fp_size; i++) { + const struct nlfield_parser *fp = &parser->fp[i]; + void *src = (char *)hdr + fp->off_in; + void *dst = (char *)target + fp->off_out; + + error = fp->cb(src, npt, dst); + if (error != 0) + return (error); + } + + error = nl_parse_attrs_raw( + (struct nlattr *)((char *)hdr + parser->nl_hdr_off), + len - parser->nl_hdr_off, parser->np, parser->np_size, npt, target); + + if (parser->post_parse != NULL && error == 0) { + if (!parser->post_parse(target, npt)) + return (EINVAL); + } + + return (error); +} + +static inline int +nl_parse_nested(struct nlattr *nla, const struct nlhdr_parser *parser, + struct nl_pstate *npt, void *target) +{ + return (nl_parse_attrs_raw((struct nlattr *)NLA_DATA(nla), + NLA_DATA_LEN(nla), parser->np, parser->np_size, npt, target)); +} + +/* + * Checks that attributes are sorted by attribute type. + */ +static inline void +nl_verify_parsers(const struct nlhdr_parser **parser, int count) +{ +#ifdef INVARIANTS + for (int i = 0; i < count; i++) { + const struct nlhdr_parser *p = parser[i]; + int attr_type = 0; + for (int j = 0; j < p->np_size; j++) { + MPASS(p->np[j].type > attr_type); + attr_type = p->np[j].type; + + /* Recurse into nested objects. 
*/ + if (p->np[j].cb == nlattr_get_nested || + p->np[j].cb == nlattr_get_nested_ptr) { + const struct nlhdr_parser *np = + (const struct nlhdr_parser *)p->np[j].arg; + nl_verify_parsers(&np, 1); + } + } + } +#endif +} +void nl_verify_parsers(const struct nlhdr_parser **parser, int count); +#define NL_VERIFY_PARSERS(_p) nl_verify_parsers((_p), nitems(_p)) + +static inline int +nl_parse_nlmsg(struct nlmsghdr *hdr, const struct nlhdr_parser *parser, + struct nl_pstate *npt, void *target) +{ + return (nl_parse_header(hdr + 1, hdr->nlmsg_len - sizeof(*hdr), parser, + npt, target)); +} + +static inline void +nl_get_attrs_bmask_nlmsg(struct nlmsghdr *hdr, + const struct nlhdr_parser *parser, struct nlattr_bmask *bm) +{ + nl_get_attrs_bmask_raw( + (struct nlattr *)((char *)(hdr + 1) + parser->nl_hdr_off), + hdr->nlmsg_len - sizeof(*hdr) - parser->nl_hdr_off, bm); +} + +#endif +#endif diff --git a/sys/netlink/netlink_message_writer.c b/sys/netlink/netlink_message_writer.c new file mode 100644 index 000000000000..8c5b3ec14058 --- /dev/null +++ b/sys/netlink/netlink_message_writer.c @@ -0,0 +1,399 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/rmlock.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/syslog.h> + +#include <netlink/netlink.h> +#include <netlink/netlink_ctl.h> +#include <netlink/netlink_linux.h> +#include <netlink/netlink_var.h> + +#define DEBUG_MOD_NAME nl_writer +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include <netlink/netlink_debug.h> +_DECLARE_DEBUG(LOG_INFO); + +static bool +nlmsg_get_buf(struct nl_writer *nw, size_t len, bool waitok) +{ + const int mflag = waitok ? M_WAITOK : M_NOWAIT; + + MPASS(nw->buf == NULL); + + NL_LOG(LOG_DEBUG3, "Setting up nw %p len %zu %s", nw, len, + waitok ? 
"wait" : "nowait"); + + nw->buf = nl_buf_alloc(len, mflag); + if (__predict_false(nw->buf == NULL)) + return (false); + nw->hdr = NULL; + nw->malloc_flag = mflag; + nw->num_messages = 0; + nw->enomem = false; + + return (true); +} + +static bool +nl_send_one(struct nl_writer *nw) +{ + + return (nl_send(nw, nw->nlp)); +} + +bool +_nl_writer_unicast(struct nl_writer *nw, size_t size, struct nlpcb *nlp, + bool waitok) +{ + *nw = (struct nl_writer){ + .nlp = nlp, + .cb = nl_send_one, + }; + + return (nlmsg_get_buf(nw, size, waitok)); +} + +bool +_nl_writer_group(struct nl_writer *nw, size_t size, uint16_t protocol, + uint16_t group_id, int priv, bool waitok) +{ + *nw = (struct nl_writer){ + .group.proto = protocol, + .group.id = group_id, + .group.priv = priv, + .cb = nl_send_group, + }; + + return (nlmsg_get_buf(nw, size, waitok)); +} + +void +_nlmsg_ignore_limit(struct nl_writer *nw) +{ + nw->ignore_limit = true; +} + +bool +_nlmsg_flush(struct nl_writer *nw) +{ + bool result; + + if (__predict_false(nw->hdr != NULL)) { + /* Last message has not been completed, skip it. */ + int completed_len = (char *)nw->hdr - nw->buf->data; + /* Send completed messages */ + nw->buf->datalen -= nw->buf->datalen - completed_len; + nw->hdr = NULL; + } + + if (nw->buf->datalen == 0) { + MPASS(nw->num_messages == 0); + nl_buf_free(nw->buf); + nw->buf = NULL; + return (true); + } + + result = nw->cb(nw); + nw->num_messages = 0; + + if (!result) { + NL_LOG(LOG_DEBUG, "nw %p flush with %p() failed", nw, nw->cb); + } + + return (result); +} + +/* + * Flushes previous data and allocates new underlying storage + * sufficient for holding at least @required_len bytes. + * Return true on success. 
+ */ +bool +_nlmsg_refill_buffer(struct nl_writer *nw, size_t required_len) +{ + struct nl_buf *new; + size_t completed_len, new_len, last_len; + + MPASS(nw->buf != NULL); + + if (nw->enomem) + return (false); + + NL_LOG(LOG_DEBUG3, "no space at offset %u/%u (want %zu), trying to " + "reclaim", nw->buf->datalen, nw->buf->buflen, required_len); + + /* Calculate new buffer size and allocate it. */ + completed_len = (nw->hdr != NULL) ? + (char *)nw->hdr - nw->buf->data : nw->buf->datalen; + if (completed_len > 0 && required_len < NLMBUFSIZE) { + /* We already ran out of space, use largest effective size. */ + new_len = max(nw->buf->buflen, NLMBUFSIZE); + } else { + if (nw->buf->buflen < NLMBUFSIZE) + /* XXXGL: does this happen? */ + new_len = NLMBUFSIZE; + else + new_len = nw->buf->buflen * 2; + while (new_len < required_len) + new_len *= 2; + } + + new = nl_buf_alloc(new_len, nw->malloc_flag | M_ZERO); + if (__predict_false(new == NULL)) { + nw->enomem = true; + NL_LOG(LOG_DEBUG, "getting new buf failed, setting ENOMEM"); + return (false); + } + + /* Copy last (unfinished) header to the new storage. */ + last_len = nw->buf->datalen - completed_len; + if (last_len > 0) { + memcpy(new->data, nw->hdr, last_len); + new->datalen = last_len; + } + + NL_LOG(LOG_DEBUG2, "completed: %zu bytes, copied: %zu bytes", + completed_len, last_len); + + if (completed_len > 0) { + nlmsg_flush(nw); + MPASS(nw->buf == NULL); + } else + nl_buf_free(nw->buf); + nw->buf = new; + nw->hdr = (last_len > 0) ? 
(struct nlmsghdr *)new->data : NULL; + NL_LOG(LOG_DEBUG2, "switched buffer: used %u/%u bytes", + new->datalen, new->buflen); + + return (true); +} + +bool +_nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, + uint16_t flags, uint32_t len) +{ + struct nl_buf *nb = nw->buf; + struct nlmsghdr *hdr; + size_t required_len; + + MPASS(nw->hdr == NULL); + + required_len = NETLINK_ALIGN(len + sizeof(struct nlmsghdr)); + if (__predict_false(nb->datalen + required_len > nb->buflen)) { + if (!nlmsg_refill_buffer(nw, required_len)) + return (false); + nb = nw->buf; + } + + hdr = (struct nlmsghdr *)(&nb->data[nb->datalen]); + + hdr->nlmsg_len = len; + hdr->nlmsg_type = type; + hdr->nlmsg_flags = flags; + hdr->nlmsg_seq = seq; + hdr->nlmsg_pid = portid; + + nw->hdr = hdr; + nb->datalen += sizeof(struct nlmsghdr); + + return (true); +} + +bool +_nlmsg_end(struct nl_writer *nw) +{ + struct nl_buf *nb = nw->buf; + + MPASS(nw->hdr != NULL); + + if (nw->enomem) { + NL_LOG(LOG_DEBUG, "ENOMEM when dumping message"); + nlmsg_abort(nw); + return (false); + } + + nw->hdr->nlmsg_len = nb->data + nb->datalen - (char *)nw->hdr; + NL_LOG(LOG_DEBUG2, "wrote msg len: %u type: %d: flags: 0x%X seq: %u pid: %u", + nw->hdr->nlmsg_len, nw->hdr->nlmsg_type, nw->hdr->nlmsg_flags, + nw->hdr->nlmsg_seq, nw->hdr->nlmsg_pid); + nw->hdr = NULL; + nw->num_messages++; + return (true); +} + +void +_nlmsg_abort(struct nl_writer *nw) +{ + struct nl_buf *nb = nw->buf; + + if (nw->hdr != NULL) { + nb->datalen = (char *)nw->hdr - nb->data; + nw->hdr = NULL; + } +} + +void +nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *hdr, + struct nl_pstate *npt) +{ + struct nlmsgerr *errmsg; + int payload_len; + uint32_t flags = nlp->nl_flags; + struct nl_writer *nw = npt->nw; + bool cap_ack; + + payload_len = sizeof(struct nlmsgerr); + + /* + * The only case when we send the full message in the + * reply is when there is an error and NETLINK_CAP_ACK + * is not set. 
+ */ + cap_ack = (error == 0) || (flags & NLF_CAP_ACK); + if (!cap_ack) + payload_len += hdr->nlmsg_len - sizeof(struct nlmsghdr); + payload_len = NETLINK_ALIGN(payload_len); + + uint16_t nl_flags = cap_ack ? NLM_F_CAPPED : 0; + if ((npt->err_msg || npt->err_off) && nlp->nl_flags & NLF_EXT_ACK) + nl_flags |= NLM_F_ACK_TLVS; + + NL_LOG(LOG_DEBUG3, "acknowledging message type %d seq %d", + hdr->nlmsg_type, hdr->nlmsg_seq); + + if (!nlmsg_add(nw, nlp->nl_port, hdr->nlmsg_seq, NLMSG_ERROR, nl_flags, payload_len)) + goto enomem; + + errmsg = nlmsg_reserve_data(nw, payload_len, struct nlmsgerr); + errmsg->error = error; + /* In case of error copy the whole message, else just the header */ + memcpy(&errmsg->msg, hdr, cap_ack ? sizeof(*hdr) : hdr->nlmsg_len); + + if (npt->err_msg != NULL && nlp->nl_flags & NLF_EXT_ACK) + nlattr_add_string(nw, NLMSGERR_ATTR_MSG, npt->err_msg); + if (npt->err_off != 0 && nlp->nl_flags & NLF_EXT_ACK) + nlattr_add_u32(nw, NLMSGERR_ATTR_OFFS, npt->err_off); + if (npt->cookie != NULL) + nlattr_add_raw(nw, npt->cookie); + + if (nlmsg_end(nw)) + return; +enomem: + NLP_LOG(LOG_DEBUG, nlp, "error allocating ack data for message %d seq %u", + hdr->nlmsg_type, hdr->nlmsg_seq); + nlmsg_abort(nw); +} + +bool +_nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr) +{ + if (!nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, NLMSG_DONE, 0, sizeof(int))) { + NL_LOG(LOG_DEBUG, "Error finalizing table dump"); + return (false); + } + /* Save operation result */ + int *perror = nlmsg_reserve_object(nw, int); + NL_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", error, + nw->buf->datalen, perror); + *perror = error; + nlmsg_end(nw); + nw->suppress_ack = true; + + return (true); +} + +/* + * KPI functions. 
+ */ + +u_int +nlattr_save_offset(const struct nl_writer *nw) +{ + return (nw->buf->datalen - ((char *)nw->hdr - nw->buf->data)); +} + +void * +nlmsg_reserve_data_raw(struct nl_writer *nw, size_t sz) +{ + struct nl_buf *nb = nw->buf; + void *data; + + sz = NETLINK_ALIGN(sz); + if (__predict_false(nb->datalen + sz > nb->buflen)) { + if (!nlmsg_refill_buffer(nw, sz)) + return (NULL); + nb = nw->buf; + } + + data = &nb->data[nb->datalen]; + bzero(data, sz); + nb->datalen += sz; + + return (data); +} + +bool +nlattr_add(struct nl_writer *nw, uint16_t attr_type, uint16_t attr_len, + const void *data) +{ + struct nl_buf *nb = nw->buf; + struct nlattr *nla; + size_t required_len; + + KASSERT(attr_len <= UINT16_MAX - sizeof(struct nlattr), + ("%s: invalid attribute length %u", __func__, attr_len)); + + required_len = NLA_ALIGN(attr_len + sizeof(struct nlattr)); + if (__predict_false(nb->datalen + required_len > nb->buflen)) { + if (!nlmsg_refill_buffer(nw, required_len)) + return (false); + nb = nw->buf; + } + + nla = (struct nlattr *)(&nb->data[nb->datalen]); + + nla->nla_len = attr_len + sizeof(struct nlattr); + nla->nla_type = attr_type; + if (attr_len > 0) { + if ((attr_len % 4) != 0) { + /* clear padding bytes */ + bzero((char *)nla + required_len - 4, 4); + } + memcpy((nla + 1), data, attr_len); + } + nb->datalen += required_len; + return (true); +} + +#include <netlink/ktest_netlink_message_writer.h> diff --git a/sys/netlink/netlink_message_writer.h b/sys/netlink/netlink_message_writer.h new file mode 100644 index 000000000000..83f925e8d93d --- /dev/null +++ b/sys/netlink/netlink_message_writer.h @@ -0,0 +1,312 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _NETLINK_NETLINK_MESSAGE_WRITER_H_ +#define _NETLINK_NETLINK_MESSAGE_WRITER_H_ + +#ifdef _KERNEL + +#include <netinet/in.h> + +/* + * It is not meant to be included directly + */ + +struct nl_buf; +struct nl_writer; +typedef bool nl_writer_cb(struct nl_writer *nw); +/* + * State of one message writer: the buffer being filled, the header of the + * message under construction and the callback used to flush the data. + */ +struct nl_writer { + struct nl_buf *buf; /* Underlying storage pointer */ + struct nlmsghdr *hdr; /* Pointer to the currently-filled msg */ + nl_writer_cb *cb; /* Callback to flush data */ + union {/* presumably nlp for nl_writer_unicast() and group for nl_writer_group() - TODO confirm */ + struct nlpcb *nlp; + struct { + uint16_t proto; + uint16_t id; + int priv; + } group; + }; + u_int num_messages; /* Number of messages in the buffer */ + int malloc_flag; /* M_WAITOK or M_NOWAIT */ + bool ignore_limit; /* If true, ignores RCVBUF limit */ + bool enomem; /* True if ENOMEM occurred */ + bool suppress_ack; /* If true, don't send NLMSG_ERR */ +}; + +#define NLMSG_SMALL 128 +#define NLMSG_LARGE 2048 + +/* Message and attribute writing */ +#if defined(NETLINK) || defined(NETLINK_MODULE) +/* Provide optimized calls to the functions inside the same linking unit */ + +bool _nl_writer_unicast(struct nl_writer *, size_t, struct nlpcb *nlp, bool); +bool _nl_writer_group(struct nl_writer *, size_t, uint16_t, uint16_t, int, + bool); +bool _nlmsg_flush(struct nl_writer *nw); +void _nlmsg_ignore_limit(struct nl_writer *nw); + +bool _nlmsg_refill_buffer(struct nl_writer *nw, size_t required_len); +bool _nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, + uint16_t type, uint16_t flags, uint32_t len); +bool _nlmsg_end(struct nl_writer *nw); +void _nlmsg_abort(struct nl_writer *nw); + +bool _nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr); + +/* The inline wrappers below forward directly to the underscore-prefixed implementations. */ +static inline bool +nl_writer_unicast(struct nl_writer *nw, size_t size, struct nlpcb *nlp, + bool waitok) +{ + return (_nl_writer_unicast(nw, size, nlp, waitok)); +} + +static inline bool +nl_writer_group(struct nl_writer *nw, size_t size, uint16_t proto, + uint16_t group_id, int priv, bool waitok) +{ + return
(_nl_writer_group(nw, size, proto, group_id, priv, waitok)); +} + +static inline bool +nlmsg_flush(struct nl_writer *nw) +{ + return (_nlmsg_flush(nw)); +} + +static inline void +nlmsg_ignore_limit(struct nl_writer *nw) +{ + _nlmsg_ignore_limit(nw); +} + +static inline bool +nlmsg_refill_buffer(struct nl_writer *nw, size_t required_size) +{ + return (_nlmsg_refill_buffer(nw, required_size)); +} + +static inline bool +nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, + uint16_t flags, uint32_t len) +{ + return (_nlmsg_add(nw, portid, seq, type, flags, len)); +} + +static inline bool +nlmsg_end(struct nl_writer *nw) +{ + return (_nlmsg_end(nw)); +} + +static inline void +nlmsg_abort(struct nl_writer *nw) +{ + return (_nlmsg_abort(nw)); +} + +static inline bool +nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr) +{ + return (_nlmsg_end_dump(nw, error, hdr)); +} + +#else +/* Provide access to the functions via netlink_glue.c */ + +bool nl_writer_unicast(struct nl_writer *, size_t, struct nlpcb *, bool waitok); +bool nl_writer_group(struct nl_writer *, size_t, uint16_t, uint16_t, int, + bool waitok); +bool nlmsg_flush(struct nl_writer *nw); +void nlmsg_ignore_limit(struct nl_writer *nw); + +bool nlmsg_refill_buffer(struct nl_writer *nw, size_t required_size); +bool nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, + uint16_t type, uint16_t flags, uint32_t len); +bool nlmsg_end(struct nl_writer *nw); +void nlmsg_abort(struct nl_writer *nw); + +bool nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr); + +#endif /* defined(NETLINK) || defined(NETLINK_MODULE) */ + +static inline bool +nlmsg_reply(struct nl_writer *nw, const struct nlmsghdr *hdr, int payload_len) +{ + return (nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, hdr->nlmsg_type, + hdr->nlmsg_flags, payload_len)); +} + +/* + * KPI similar to mtodo(): + * current (uncompleted) header is guaranteed to be contiguous, + * but can be 
reallocated, thus pointers may need to be readjusted. + */ +u_int nlattr_save_offset(const struct nl_writer *nw); +/* Converts an offset taken with nlattr_save_offset() back into a pointer relative to the current nw->hdr. */ +static inline void * +_nlattr_restore_offset(const struct nl_writer *nw, int off) +{ + return ((void *)((char *)nw->hdr + off)); +} +#define nlattr_restore_offset(_ns, _off, _t) ((_t *)_nlattr_restore_offset(_ns, _off)) +/* + * Sets nla_len of the attribute located at offset `off` to the number of + * bytes written since that offset (current offset minus `off`). + */ +static inline void +nlattr_set_len(const struct nl_writer *nw, int off) +{ + struct nlattr *nla = nlattr_restore_offset(nw, off, struct nlattr); + nla->nla_len = nlattr_save_offset(nw) - off; +} + +void *nlmsg_reserve_data_raw(struct nl_writer *nw, size_t sz); +#define nlmsg_reserve_object(_ns, _t) ((_t *)nlmsg_reserve_data_raw(_ns, sizeof(_t))) +#define nlmsg_reserve_data(_ns, _sz, _t) ((_t *)nlmsg_reserve_data_raw(_ns, _sz)) +/* + * Opens a nested attribute: reserves a bare attribute header and sets only + * its type. Returns the offset of that header, or 0 if the reservation + * failed. nla_len is left for nlattr_set_len() to fill in once the nested + * payload has been written. + */ +static inline int +nlattr_add_nested(struct nl_writer *nw, uint16_t nla_type) +{ + int off = nlattr_save_offset(nw); + struct nlattr *nla = nlmsg_reserve_data(nw, sizeof(struct nlattr), struct nlattr); + if (__predict_false(nla == NULL)) + return (0); + nla->nla_type = nla_type; + return (off); +} +/* + * Reserves an attribute with `sz` payload bytes, fills in its type and + * length and returns a pointer to the (zeroed) payload, or NULL on failure. + * NOTE(review): `sz` is uint16_t, so adding the header size wraps for values + * close to UINT16_MAX - callers presumably never pass such sizes; confirm. + */ +static inline void * +_nlmsg_reserve_attr(struct nl_writer *nw, uint16_t nla_type, uint16_t sz) +{ + sz += sizeof(struct nlattr); + + struct nlattr *nla = nlmsg_reserve_data(nw, sz, struct nlattr); + if (__predict_false(nla == NULL)) + return (NULL); + nla->nla_type = nla_type; + nla->nla_len = sz; + + return ((void *)(nla + 1)); +} +#define nlmsg_reserve_attr(_ns, _at, _t) ((_t *)_nlmsg_reserve_attr(_ns, _at, NLA_ALIGN(sizeof(_t)))) + +bool nlattr_add(struct nl_writer *nw, uint16_t attr_type, uint16_t attr_len, + const void *data); +/* Copies an existing attribute (type and payload) into the writer; nla_src must be at least a full attribute header. */ +static inline bool +nlattr_add_raw(struct nl_writer *nw, const struct nlattr *nla_src) +{ + MPASS(nla_src->nla_len >= sizeof(struct nlattr)); + + return (nlattr_add(nw, nla_src->nla_type, + nla_src->nla_len - sizeof(struct nlattr), + (const void *)(nla_src + 1))); +} + +static inline bool +nlattr_add_bool(struct nl_writer *nw, uint16_t attrtype, bool value) +{ + return
(nlattr_add(nw, attrtype, sizeof(bool), &value)); +} +/* + * Typed helpers: each one appends a single attribute whose payload is the + * in-memory bytes of `value` and returns what nlattr_add() returns. + */ +static inline bool +nlattr_add_u8(struct nl_writer *nw, uint16_t attrtype, uint8_t value) +{ + return (nlattr_add(nw, attrtype, sizeof(uint8_t), &value)); +} + +static inline bool +nlattr_add_u16(struct nl_writer *nw, uint16_t attrtype, uint16_t value) +{ + return (nlattr_add(nw, attrtype, sizeof(uint16_t), &value)); +} + +static inline bool +nlattr_add_u32(struct nl_writer *nw, uint16_t attrtype, uint32_t value) +{ + return (nlattr_add(nw, attrtype, sizeof(uint32_t), &value)); +} + +static inline bool +nlattr_add_u64(struct nl_writer *nw, uint16_t attrtype, uint64_t value) +{ + return (nlattr_add(nw, attrtype, sizeof(uint64_t), &value)); +} + +static inline bool +nlattr_add_s8(struct nl_writer *nw, uint16_t attrtype, int8_t value) +{ + return (nlattr_add(nw, attrtype, sizeof(int8_t), &value)); +} + +static inline bool +nlattr_add_s16(struct nl_writer *nw, uint16_t attrtype, int16_t value) +{ + return (nlattr_add(nw, attrtype, sizeof(int16_t), &value)); +} + +static inline bool +nlattr_add_s32(struct nl_writer *nw, uint16_t attrtype, int32_t value) +{ + return (nlattr_add(nw, attrtype, sizeof(int32_t), &value)); +} + +static inline bool +nlattr_add_s64(struct nl_writer *nw, uint16_t attrtype, int64_t value) +{ + return (nlattr_add(nw, attrtype, sizeof(int64_t), &value)); +} +/* Appends an attribute with no payload: only the header is written. */ +static inline bool +nlattr_add_flag(struct nl_writer *nw, uint16_t attrtype) +{ + return (nlattr_add(nw, attrtype, 0, NULL)); +} +/* + * Appends `str` including its terminating NUL byte. + * NOTE(review): the length is passed as uint16_t, so strings of 64KiB or + * more would be truncated by the conversion - confirm callers stay below. + */ +static inline bool +nlattr_add_string(struct nl_writer *nw, uint16_t attrtype, const char *str) +{ + return (nlattr_add(nw, attrtype, strlen(str) + 1, str)); +} +/* Appends the bytes of the IPv4 address pointed to by `in`. */ +static inline bool +nlattr_add_in_addr(struct nl_writer *nw, uint16_t attrtype, + const struct in_addr *in) +{ + return (nlattr_add(nw, attrtype, sizeof(*in), in)); +} +/* Appends the bytes of the IPv6 address pointed to by `in6`. */ +static inline bool +nlattr_add_in6_addr(struct nl_writer *nw, uint16_t attrtype, + const struct in6_addr *in6) +{ + return (nlattr_add(nw, attrtype, sizeof(*in6), in6)); +}
+#endif +#endif diff --git a/sys/netlink/netlink_module.c b/sys/netlink/netlink_module.c new file mode 100644 index 000000000000..6c3cd90e61ab --- /dev/null +++ b/sys/netlink/netlink_module.c @@ -0,0 +1,221 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/module.h> + +#include <sys/lock.h> +#include <sys/rmlock.h> +#include <sys/ck.h> +#include <sys/syslog.h> + +#include <netlink/netlink.h> +#include <netlink/netlink_ctl.h> +#include <netlink/netlink_var.h> +#include <netlink/route/route_var.h> + +#include <machine/atomic.h> + +FEATURE(netlink, "Netlink support"); + +#define DEBUG_MOD_NAME nl_mod +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include <netlink/netlink_debug.h> +_DECLARE_DEBUG(LOG_INFO); + +/* Per-protocol handler table, indexed by netlink protocol number. */ +#define NL_MAX_HANDLERS 20 +struct nl_proto_handler _nl_handlers[NL_MAX_HANDLERS]; +struct nl_proto_handler *nl_handlers = _nl_handlers; +/* List of the per-VNET control blocks; modified under NL_GLOBAL_LOCK(). */ +CK_LIST_HEAD(nl_control_head, nl_control); +static struct nl_control_head vnets_head = CK_LIST_HEAD_INITIALIZER(); + +VNET_DEFINE(struct nl_control, nl_ctl) = { + .ctl_port_head = CK_LIST_HEAD_INITIALIZER(), + .ctl_pcb_head = CK_LIST_HEAD_INITIALIZER(), +}; + +struct mtx nl_global_mtx; +MTX_SYSINIT(nl_global_mtx, &nl_global_mtx, "global netlink lock", MTX_DEF); + +#define NL_GLOBAL_LOCK() mtx_lock(&nl_global_mtx) +#define NL_GLOBAL_UNLOCK() mtx_unlock(&nl_global_mtx) +/* Set to 1 by the MOD_UNLOAD handler once unloading has been allowed. */ +int netlink_unloading = 0; +/* Per-VNET setup: initialise the control-block lock and link the block into vnets_head. */ +static void +vnet_nl_init(const void *unused __unused) +{ + rm_init(&V_nl_ctl.ctl_lock, "netlink lock"); + + NL_GLOBAL_LOCK(); + CK_LIST_INSERT_HEAD(&vnets_head, &V_nl_ctl, ctl_next); + NL_LOG(LOG_DEBUG2, "VNET %p init done, inserted %p into global list", + curvnet, &V_nl_ctl); + NL_GLOBAL_UNLOCK(); +} +VNET_SYSINIT(vnet_nl_init, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_nl_init, NULL); +/* Per-VNET teardown: unlink the control block from vnets_head, then destroy its lock. */ +static void +vnet_nl_uninit(const void *unused __unused) +{ + /* Assume at the time all of the processes / sockets are dead */ + NL_GLOBAL_LOCK(); + NL_LOG(LOG_DEBUG2, "Removing %p from global list", &V_nl_ctl); + CK_LIST_REMOVE(&V_nl_ctl, ctl_next); + NL_GLOBAL_UNLOCK(); + + rm_destroy(&V_nl_ctl.ctl_lock); +} +VNET_SYSUNINIT(vnet_nl_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_nl_uninit, + NULL); + +int
+nl_verify_proto(int proto) +{ + /* Returns EINVAL for an out-of-range number, EPROTONOSUPPORT when no handler is registered, 0 otherwise. */ + if (proto < 0 || proto >= NL_MAX_HANDLERS) { + return (EINVAL); + } + int handler_defined = nl_handlers[proto].cb != NULL; + return (handler_defined ? 0 : EPROTONOSUPPORT); +} + +/* + * Returns the name registered for `proto`, or NULL when none is registered + * or `proto` is outside the handler table. + */ +const char * +nl_get_proto_name(int proto) +{ + /* Same range check as the sibling functions: never index past nl_handlers[] */ + if (proto < 0 || proto >= NL_MAX_HANDLERS) + return (NULL); + return (nl_handlers[proto].proto_name); +} + +/* + * Installs `handler` and `proto_name` for `proto` under the global lock. + * Returns false only when `proto` is out of range; registering over an + * existing handler trips the KASSERT. + */ +bool +netlink_register_proto(int proto, const char *proto_name, nl_handler_f handler) +{ + if ((proto < 0) || (proto >= NL_MAX_HANDLERS)) + return (false); + NL_GLOBAL_LOCK(); + KASSERT((nl_handlers[proto].cb == NULL), ("netlink handler %d is already set", proto)); + nl_handlers[proto].cb = handler; + nl_handlers[proto].proto_name = proto_name; + NL_GLOBAL_UNLOCK(); + NL_LOG(LOG_DEBUG2, "Registered netlink %s(%d) handler", proto_name, proto); + return (true); +} + +/* + * Clears the handler and name for `proto` under the global lock. + * Returns false only when `proto` is out of range; unregistering an empty + * slot trips the KASSERT. + */ +bool +netlink_unregister_proto(int proto) +{ + if ((proto < 0) || (proto >= NL_MAX_HANDLERS)) + return (false); + NL_GLOBAL_LOCK(); + KASSERT((nl_handlers[proto].cb != NULL), ("netlink handler %d is not set", proto)); + nl_handlers[proto].cb = NULL; + nl_handlers[proto].proto_name = NULL; + NL_GLOBAL_UNLOCK(); + NL_LOG(LOG_DEBUG2, "Unregistered netlink proto %d handler", proto); + return (true); +} + +#if !defined(NETLINK) && defined(NETLINK_MODULE) +/* Non-stub function provider */ +static const struct nl_function_wrapper nl_module = { + .nlmsg_add = _nlmsg_add, + .nlmsg_refill_buffer = _nlmsg_refill_buffer, + .nlmsg_flush = _nlmsg_flush, + .nlmsg_end = _nlmsg_end, + .nlmsg_abort = _nlmsg_abort, + .nl_writer_unicast = _nl_writer_unicast, + .nl_writer_group = _nl_writer_group, + .nlmsg_end_dump = _nlmsg_end_dump, + .nl_modify_ifp_generic = _nl_modify_ifp_generic, + .nl_store_ifp_cookie = _nl_store_ifp_cookie, + .nl_get_thread_nlp = _nl_get_thread_nlp, +}; +#endif + +static bool +can_unload(void) +{ + struct nl_control *ctl; + bool result = true; + + NL_GLOBAL_LOCK(); + + CK_LIST_FOREACH(ctl, &vnets_head, ctl_next) { + NL_LOG(LOG_DEBUG2, "Iterating VNET head %p", ctl); + if
(!CK_LIST_EMPTY(&ctl->ctl_pcb_head)) { + NL_LOG(LOG_NOTICE, "non-empty socket list in ctl %p", ctl); + result = false; + break; + } + } + + NL_GLOBAL_UNLOCK(); + + return (result); +} +/* + * Module event handler. + * MOD_LOAD: calls nl_osd_register() and, in the module-only build, installs + * the function table with nl_set_functions(). + * MOD_UNLOAD: refused with EBUSY unless can_unload() agrees; otherwise sets + * netlink_unloading and undoes the load steps. + * Any other event yields EOPNOTSUPP. + */ +static int +netlink_modevent(module_t mod __unused, int what, void *priv __unused) +{ + int ret = 0; + + switch (what) { + case MOD_LOAD: + NL_LOG(LOG_DEBUG2, "Loading"); + nl_osd_register(); +#if !defined(NETLINK) && defined(NETLINK_MODULE) + nl_set_functions(&nl_module); +#endif + break; + + case MOD_UNLOAD: + NL_LOG(LOG_DEBUG2, "Unload called"); + if (can_unload()) { + NL_LOG(LOG_WARNING, "unloading"); + netlink_unloading = 1; +#if !defined(NETLINK) && defined(NETLINK_MODULE) + nl_set_functions(NULL); +#endif + nl_osd_unregister(); + } else + ret = EBUSY; + break; + + default: + ret = EOPNOTSUPP; + break; + } + + return (ret); +} +static moduledata_t netlink_mod = { "netlink", netlink_modevent, NULL }; + +DECLARE_MODULE(netlink, netlink_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(netlink, 1); diff --git a/sys/netlink/netlink_route.c b/sys/netlink/netlink_route.c new file mode 100644 index 000000000000..0123193c204f --- /dev/null +++ b/sys/netlink/netlink_route.c @@ -0,0 +1,143 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/types.h> +#include <sys/ck.h> +#include <sys/epoch.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/socket.h> + +#include <net/route.h> +#include <net/route/route_ctl.h> +#include <netlink/netlink.h> +#include <netlink/netlink_ctl.h> +#include <netlink/netlink_route.h> +#include <netlink/route/route_var.h> + +#define DEBUG_MOD_NAME nl_route_core +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include <netlink/netlink_debug.h> +_DECLARE_DEBUG(LOG_INFO); +/* Dispatch table indexed by nlmsg_type; NULL means no handler is registered. */ +#define HANDLER_MAX_NUM (NL_RTM_MAX + 10) +static const struct rtnl_cmd_handler *rtnl_handler[HANDLER_MAX_NUM] = {}; +/* + * Registers `count` command handlers. + * The first pass validates every entry (cmd must fit in the table and, under + * MPASS, the slot must be empty) so nothing is installed when the function + * returns false; the second pass stores pointers to the caller's entries. + */ +bool +rtnl_register_messages(const struct rtnl_cmd_handler *handlers, int count) +{ + for (int i = 0; i < count; i++) { + if (handlers[i].cmd >= HANDLER_MAX_NUM) + return (false); + MPASS(rtnl_handler[handlers[i].cmd] == NULL); + } + for (int i = 0; i < count; i++) + rtnl_handler[handlers[i].cmd] = &handlers[i]; + return (true); +} + +/* + * Handler called by netlink subsystem when matching netlink message is received + */ +static int +rtnl_handle_message(struct nlmsghdr *hdr, struct nl_pstate *npt) +{ + const struct rtnl_cmd_handler *cmd; + struct epoch_tracker et; + struct nlpcb *nlp =
npt->nlp; + int error = 0; +/* Reject message types outside the dispatch table before indexing it. */ + if (__predict_false(hdr->nlmsg_type >= HANDLER_MAX_NUM)) { + NLMSG_REPORT_ERR_MSG(npt, "unknown message type: %d", hdr->nlmsg_type); + return (ENOTSUP); + } + + cmd = rtnl_handler[hdr->nlmsg_type]; + if (__predict_false(cmd == NULL)) { + NLMSG_REPORT_ERR_MSG(npt, "unknown message type: %d", hdr->nlmsg_type); + return (ENOTSUP); + } + + NLP_LOG(LOG_DEBUG2, nlp, "received msg %s(%d) len %d", cmd->name, + hdr->nlmsg_type, hdr->nlmsg_len); +/* A non-zero cmd->priv means the socket must hold that privilege. */ + if (cmd->priv != 0 && !nlp_has_priv(nlp, cmd->priv)) { + NLP_LOG(LOG_DEBUG2, nlp, "priv %d check failed for msg %s", cmd->priv, cmd->name); + return (EPERM); + } else if (cmd->priv != 0) + NLP_LOG(LOG_DEBUG3, nlp, "priv %d check passed for msg %s", cmd->priv, cmd->name); +/* Commands without RTNL_F_ALLOW_NONVNET_JAIL are refused unless nlp_unconstrained_vnet() holds. */ + if (!nlp_unconstrained_vnet(nlp) && (cmd->flags & RTNL_F_ALLOW_NONVNET_JAIL) == 0) { + NLP_LOG(LOG_DEBUG2, nlp, "jail check failed for msg %s", cmd->name); + return (EPERM); + } +/* The callback runs inside the network epoch unless the command sets RTNL_F_NOEPOCH. */ + bool need_epoch = !(cmd->flags & RTNL_F_NOEPOCH); + + if (need_epoch) + NET_EPOCH_ENTER(et); + error = cmd->cb(hdr, nlp, npt); + if (need_epoch) + NET_EPOCH_EXIT(et); + + NLP_LOG(LOG_DEBUG3, nlp, "message %s -> error %d", cmd->name, error); + + return (error); +} +/* Callbacks installed into netlink_callback_p while rtnetlink is loaded. */ +static struct rtbridge nlbridge = { + .route_f = rtnl_handle_route_event, + .ifmsg_f = rtnl_handle_ifnet_event, +}; +static struct rtbridge *nlbridge_orig_p; +/* + * Load hook: saves the previous netlink_callback_p and installs nlbridge, + * runs the neigh/iface/nexthop/route init routines, then registers + * NETLINK_ROUTE with rtnl_handle_message as its handler. + */ +static void +rtnl_load(void *u __unused) +{ + NL_LOG(LOG_DEBUG2, "rtnl loading"); + nlbridge_orig_p = netlink_callback_p; + netlink_callback_p = &nlbridge; + rtnl_neighs_init(); + rtnl_ifaces_init(); + rtnl_nexthops_init(); + rtnl_routes_init(); + netlink_register_proto(NETLINK_ROUTE, "NETLINK_ROUTE", rtnl_handle_message); +} +SYSINIT(rtnl_load, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtnl_load, NULL); +/* Unload hook: restores the saved callback pointer, unregisters NETLINK_ROUTE and destroys iface/neigh state. */ +static void +rtnl_unload(void *u __unused) +{ + netlink_callback_p = nlbridge_orig_p; + netlink_unregister_proto(NETLINK_ROUTE); + rtnl_ifaces_destroy(); + rtnl_neighs_destroy(); + + /* Wait till all consumers read nlbridge data */
+ NET_EPOCH_WAIT(); +} +SYSUNINIT(rtnl_unload, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtnl_unload, NULL); diff --git a/sys/netlink/netlink_route.h b/sys/netlink/netlink_route.h new file mode 100644 index 000000000000..ecdad83312de --- /dev/null +++ b/sys/netlink/netlink_route.h @@ -0,0 +1,44 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ +#ifndef _NETLINK_NETLINK_ROUTE_H_ +#define _NETLINK_NETLINK_ROUTE_H_ + +#include <sys/types.h> + +#include <net/if.h> +#include <net/if_types.h> +#include <net/if_var.h> + +#include <netlink/netlink_bitset.h> +#include <netlink/route/common.h> +#include <netlink/route/ifaddrs.h> +#include <netlink/route/interface.h> +#include <netlink/route/neigh.h> +#include <netlink/route/route.h> +#include <netlink/route/nexthop.h> + +#endif diff --git a/sys/netlink/netlink_snl.h b/sys/netlink/netlink_snl.h new file mode 100644 index 000000000000..586716776bc5 --- /dev/null +++ b/sys/netlink/netlink_snl.h @@ -0,0 +1,1330 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _NETLINK_NETLINK_SNL_H_ +#define _NETLINK_NETLINK_SNL_H_ + +/* + * Simple Netlink Library + */ + +#include <sys/param.h> +#include <sys/socket.h> + +#include <netlink/netlink.h> +#include <netlink/netlink_bitset.h> + +#include <assert.h> +#include <errno.h> +#include <stdalign.h> +#include <stddef.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +/* Rounds x up to a multiple of y; y must be a power of two. */ +#define _roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) + +#define NETLINK_ALIGN_SIZE sizeof(uint32_t) +#define NETLINK_ALIGN(_len) _roundup2(_len, NETLINK_ALIGN_SIZE) + +#define NLA_ALIGN_SIZE sizeof(uint32_t) +#define NLA_HDRLEN ((int)sizeof(struct nlattr)) +#define NLA_DATA_LEN(_nla) ((int)((_nla)->nla_len - NLA_HDRLEN)) +#define NLA_DATA(_nla) NL_ITEM_DATA(_nla, NLA_HDRLEN) +#define NLA_DATA_CONST(_nla) NL_ITEM_DATA_CONST(_nla, NLA_HDRLEN) + +#define NLA_TYPE(_nla) ((_nla)->nla_type & 0x3FFF) + +/* + * Address of the attribute following _attr (its length rounded up with + * NLA_ALIGN). The argument and the whole expansion are parenthesized so + * the macro is safe with any pointer expression and in any context. + */ +#define NLA_NEXT(_attr) ((struct nlattr *)(void *)((char *)(_attr) + NLA_ALIGN((_attr)->nla_len))) + +/* + * Iterates over the attributes in [_start, _start + _len): stops as soon as + * the current attribute, or the start of the one after it, would lie past + * the end of the region. + */ +#define _NLA_END(_start, _len) ((char *)(_start) + (_len)) +#define NLA_FOREACH(_attr, _start, _len) \ + for (_attr = (struct nlattr *)(_start); \ + ((char *)_attr < _NLA_END(_start, _len)) && \ + ((char *)NLA_NEXT(_attr) <= _NLA_END(_start, _len)); \ + _attr = NLA_NEXT(_attr)) + +struct linear_buffer { + char *base; /* Base allocated memory pointer */ + uint32_t offset; /* Currently used offset */ + uint32_t size; /* Total buffer size */ +
struct linear_buffer *next; /* Buffer chaining */ +} __aligned(alignof(__max_align_t)); + +/* + * Allocates one zeroed chunk of `size` bytes holding the linear_buffer + * header followed by the usable area. Returns NULL if the allocation fails + * or `size` cannot even hold the header. + */ +static inline struct linear_buffer * +lb_init(uint32_t size) +{ + struct linear_buffer *lb; + + /* The header lives inside the chunk: a smaller size would overflow it and wrap lb->size */ + if (size < sizeof(struct linear_buffer)) + return (NULL); + + lb = (struct linear_buffer *)calloc(1, size); + if (lb != NULL) { + lb->base = (char *)(lb + 1); + lb->size = size - sizeof(*lb); + } + + return (lb); +} + +static inline void +lb_free(struct linear_buffer *lb) +{ + free(lb); +} + +/* + * Hands out `len` bytes (rounded up to max_align_t alignment) from the + * buffer. The memory is zero because the chunk comes from calloc() and + * lb_clear() re-zeroes the used part. Returns NULL when `len` is negative + * or the remaining space is too small. + */ +static inline char * +lb_allocz(struct linear_buffer *lb, int len) +{ + size_t need; + + /* A negative length could otherwise move the offset backwards */ + if (len < 0) + return (NULL); + need = roundup2((size_t)len, alignof(__max_align_t)); + /* offset never exceeds size, so the subtraction cannot wrap */ + if (need > lb->size - lb->offset) + return (NULL); + char *data = (lb->base + lb->offset); + lb->offset += need; + return (data); +} + +/* Zeroes the used part of the buffer and marks it empty again. */ +static inline void +lb_clear(struct linear_buffer *lb) +{ + memset(lb->base, 0, lb->offset); + lb->offset = 0; +} + +struct snl_state { + int fd; + char *buf; + size_t off; + size_t bufsize; + size_t datalen; + uint32_t seq; + bool init_done; + struct linear_buffer *lb; +}; +#define SCRATCH_BUFFER_SIZE 1024 +#define SNL_WRITER_BUFFER_SIZE 256 + +typedef void snl_parse_field_f(struct snl_state *ss, void *hdr, void *target); +struct snl_field_parser { + uint16_t off_in; + uint16_t off_out; + snl_parse_field_f *cb; +}; +static const struct snl_field_parser snl_f_p_empty[] = {}; + +typedef bool snl_parse_attr_f(struct snl_state *ss, struct nlattr *attr, + const void *arg, void *target); +struct snl_attr_parser { + uint16_t type; /* Attribute type */ + uint16_t off; /* field offset in the target structure */ + snl_parse_attr_f *cb; /* parser function to call */ + + /* Optional parser argument */ + union { + const void *arg; + const uint32_t arg_u32; + }; +}; + +typedef bool snl_parse_post_f(struct snl_state *ss, void *target); + +struct snl_hdr_parser { + uint16_t in_hdr_size; /* Input header size */ + uint16_t out_size; /* Output structure size */ + uint16_t fp_size; /* Number of items in field parser */ + uint16_t np_size; /* Number of items in attribute parser */ + const struct snl_field_parser *fp; /* array of
header field parsers */ + const struct snl_attr_parser *np; /* array of attribute parsers */ + snl_parse_post_f *cb_post; /* post-parse callback */ +}; +/* Declares a parser with both header-field and attribute tables; sizes are taken with nitems(). */ +#define SNL_DECLARE_PARSER_EXT(_name, _sz_h_in, _sz_out, _fp, _np, _cb) \ +static const struct snl_hdr_parser _name = { \ + .in_hdr_size = _sz_h_in, \ + .out_size = _sz_out, \ + .fp = &((_fp)[0]), \ + .np = &((_np)[0]), \ + .fp_size = nitems(_fp), \ + .np_size = nitems(_np), \ + .cb_post = _cb, \ +} + +#define SNL_DECLARE_PARSER(_name, _t, _fp, _np) \ + SNL_DECLARE_PARSER_EXT(_name, sizeof(_t), 0, _fp, _np, NULL) +/* Field-only variant: np and np_size are left zero-initialized. */ +#define SNL_DECLARE_FIELD_PARSER_EXT(_name, _sz_h_in, _sz_out, _fp, _cb) \ +static const struct snl_hdr_parser _name = { \ + .in_hdr_size = _sz_h_in, \ + .out_size = _sz_out, \ + .fp = &((_fp)[0]), \ + .fp_size = nitems(_fp), \ + .cb_post = _cb, \ +} + +#define SNL_DECLARE_FIELD_PARSER(_name, _t, _fp) \ + SNL_DECLARE_FIELD_PARSER_EXT(_name, sizeof(_t), 0, _fp, NULL) +/* Attribute-only variant: in_hdr_size, fp and fp_size are left zero-initialized. */ +#define SNL_DECLARE_ATTR_PARSER_EXT(_name, _sz_out, _np, _cb) \ +static const struct snl_hdr_parser _name = { \ + .out_size = _sz_out, \ + .np = &((_np)[0]), \ + .np_size = nitems(_np), \ + .cb_post = _cb, \ +} + +#define SNL_DECLARE_ATTR_PARSER(_name, _np) \ + SNL_DECLARE_ATTR_PARSER_EXT(_name, 0, _np, NULL) + +/* + * Allocates `len` bytes from the state's current linear buffer. When that + * buffer is full, a new one is chained in front: its size starts at twice + * the current buffer's size and doubles until it covers `len` plus the + * buffer header. Returns NULL if the new buffer cannot be allocated. + */ +static inline void * +snl_allocz(struct snl_state *ss, int len) +{ + void *data = lb_allocz(ss->lb, len); + + if (data == NULL) { + uint32_t size = ss->lb->size * 2; + + while (size < len + sizeof(struct linear_buffer)) + size *= 2; + + struct linear_buffer *lb = lb_init(size); + + if (lb != NULL) { + lb->next = ss->lb; + ss->lb = lb; + data = lb_allocz(ss->lb, len); + } + } + + return (data); +} +/* Clears the head buffer for reuse and frees every buffer chained behind it. */ +static inline void +snl_clear_lb(struct snl_state *ss) +{ + struct linear_buffer *lb = ss->lb; + + lb_clear(lb); + lb = lb->next; + ss->lb->next = NULL; + /* Remove all linear bufs except the largest one */ + while (lb != NULL) { + struct linear_buffer *lb_next = lb->next; + lb_free(lb); + lb = lb_next; + } +} + +static void
+snl_free(struct snl_state *ss) +{/* Releases what snl_init() set up: the socket (only if init_done), the receive buffer and the scratch buffers. */ + if (ss->init_done) + close(ss->fd); + if (ss->buf != NULL) + free(ss->buf); + if (ss->lb != NULL) { + snl_clear_lb(ss); + lb_free(ss->lb); + } +} +/* + * Zeroes `ss`, opens an AF_NETLINK raw socket for `netlink_family`, enables + * NETLINK_EXT_ACK, sizes the receive buffer from SO_RCVBUF and allocates the + * scratch buffer. On any failure after the socket is open, everything + * acquired so far is released with snl_free() and false is returned. + */ +static inline bool +snl_init(struct snl_state *ss, int netlink_family) +{ + memset(ss, 0, sizeof(*ss)); + + ss->fd = socket(AF_NETLINK, SOCK_RAW, netlink_family); + if (ss->fd == -1) + return (false); + ss->init_done = true; + + int val = 1; + socklen_t optlen = sizeof(val); + if (setsockopt(ss->fd, SOL_NETLINK, NETLINK_EXT_ACK, &val, optlen) == -1) { + snl_free(ss); + return (false); + } + + int rcvbuf; + if (getsockopt(ss->fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, &optlen) == -1) { + snl_free(ss); + return (false); + } + + ss->bufsize = rcvbuf; + ss->buf = (char *)malloc(ss->bufsize); + if (ss->buf == NULL) { + snl_free(ss); + return (false); + } + + ss->lb = lb_init(SCRATCH_BUFFER_SIZE); + if (ss->lb == NULL) { + snl_free(ss); + return (false); + } + + return (true); +} +/* + * Makes `ss` share the socket of `orig` with its own scratch buffer. + * init_done stays false, so snl_free() on the clone does not close the fd; + * no receive buffer is allocated for the clone. + */ +static inline bool +snl_clone(struct snl_state *ss, const struct snl_state *orig) +{ + *ss = (struct snl_state){ + .fd = orig->fd, + .init_done = false, + }; + return ((ss->lb = lb_init(SCRATCH_BUFFER_SIZE)) != NULL); +} +/* Sends `sz` bytes; true only if send() reports that exactly `sz` bytes were written. */ +static inline bool +snl_send(struct snl_state *ss, void *data, int sz) +{ + return (send(ss->fd, data, sz, 0) == sz); +} +/* Sends one message, using its NLMSG_ALIGN-ed nlmsg_len as the byte count. */ +static inline bool +snl_send_message(struct snl_state *ss, struct nlmsghdr *hdr) +{ + ssize_t sz = NLMSG_ALIGN(hdr->nlmsg_len); + + return (send(ss->fd, hdr, sz, 0) == sz); +} +/* Returns the next sequence number (pre-incremented, so the first value is 1 after snl_init()). */ +static inline uint32_t +snl_get_seq(struct snl_state *ss) +{ + return (++ss->seq); +} + +struct snl_msg_info { + int cmsg_type; + int cmsg_level; + uint32_t process_id; + uint8_t port_id; + uint8_t seq_id; +}; +static inline bool parse_cmsg(struct snl_state *ss, const struct msghdr *msg, + struct snl_msg_info *attrs); + +static inline struct nlmsghdr * +snl_read_message_dbg(struct snl_state *ss, struct snl_msg_info *cinfo) +{ + memset(cinfo, 0, sizeof(*cinfo)); + + if (ss->off == ss->datalen) { +
struct sockaddr_nl nladdr; + char cbuf[64]; + + struct iovec iov = { + .iov_base = ss->buf, + .iov_len = ss->bufsize, + }; + struct msghdr msg = { + .msg_name = &nladdr, + .msg_namelen = sizeof(nladdr), + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = cbuf, + .msg_controllen = sizeof(cbuf), + }; + ss->off = 0; + ss->datalen = 0; + for (;;) { + ssize_t datalen = recvmsg(ss->fd, &msg, 0); + if (datalen > 0) { + ss->datalen = datalen; + parse_cmsg(ss, &msg, cinfo); + break; + } else if (errno != EINTR) + return (NULL); + } + } + struct nlmsghdr *hdr = (struct nlmsghdr *)(void *)&ss->buf[ss->off]; + ss->off += NLMSG_ALIGN(hdr->nlmsg_len); + return (hdr); +} + + +static inline struct nlmsghdr * +snl_read_message(struct snl_state *ss) +{ + if (ss->off == ss->datalen) { + struct sockaddr_nl nladdr; + struct iovec iov = { + .iov_base = ss->buf, + .iov_len = ss->bufsize, + }; + struct msghdr msg = { + .msg_name = &nladdr, + .msg_namelen = sizeof(nladdr), + .msg_iov = &iov, + .msg_iovlen = 1, + }; + ss->off = 0; + ss->datalen = 0; + for (;;) { + ssize_t datalen = recvmsg(ss->fd, &msg, 0); + if (datalen > 0) { + ss->datalen = datalen; + break; + } else if (errno != EINTR) + return (NULL); + } + } + struct nlmsghdr *hdr = (struct nlmsghdr *)(void *)&ss->buf[ss->off]; + ss->off += NLMSG_ALIGN(hdr->nlmsg_len); + return (hdr); +} + +static inline struct nlmsghdr * +snl_read_reply(struct snl_state *ss, uint32_t nlmsg_seq) +{ + struct nlmsghdr *hdr; + + while ((hdr = snl_read_message(ss)) != NULL) { + if (hdr->nlmsg_seq == nlmsg_seq) + return (hdr); + } + + return (NULL); +} + +/* + * Checks that attributes are sorted by attribute type. 
+ */ +static inline void +snl_verify_parsers(const struct snl_hdr_parser **parser, int count) +{ + for (int i = 0; i < count; i++) { + const struct snl_hdr_parser *p = parser[i]; + int attr_type = 0; + for (int j = 0; j < p->np_size; j++) { + assert(p->np[j].type > attr_type); + attr_type = p->np[j].type; + } + } +} +#define SNL_VERIFY_PARSERS(_p) snl_verify_parsers((_p), nitems(_p)) + +static const struct snl_attr_parser * +find_parser(const struct snl_attr_parser *ps, int pslen, int key) +{ + int left_i = 0, right_i = pslen - 1; + + if (key < ps[0].type || key > ps[pslen - 1].type) + return (NULL); + + while (left_i + 1 < right_i) { + int mid_i = (left_i + right_i) / 2; + if (key < ps[mid_i].type) + right_i = mid_i; + else if (key > ps[mid_i].type) + left_i = mid_i + 1; + else + return (&ps[mid_i]); + } + if (ps[left_i].type == key) + return (&ps[left_i]); + else if (ps[right_i].type == key) + return (&ps[right_i]); + return (NULL); +} + +static inline bool +snl_parse_attrs_raw(struct snl_state *ss, struct nlattr *nla_head, int len, + const struct snl_attr_parser *ps, int pslen, void *target) +{ + struct nlattr *nla; + + NLA_FOREACH(nla, nla_head, len) { + if (nla->nla_len < sizeof(struct nlattr)) + return (false); + int nla_type = nla->nla_type & NLA_TYPE_MASK; + const struct snl_attr_parser *s = find_parser(ps, pslen, nla_type); + if (s != NULL) { + void *ptr = (void *)((char *)target + s->off); + if (!s->cb(ss, nla, s->arg, ptr)) + return (false); + } + } + return (true); +} + +static inline bool +snl_parse_attrs(struct snl_state *ss, struct nlmsghdr *hdr, int hdrlen, + const struct snl_attr_parser *ps, int pslen, void *target) +{ + int off = NLMSG_HDRLEN + NETLINK_ALIGN(hdrlen); + int len = hdr->nlmsg_len - off; + struct nlattr *nla_head = (struct nlattr *)(void *)((char *)hdr + off); + + return (snl_parse_attrs_raw(ss, nla_head, len, ps, pslen, target)); +} + +static inline void +snl_parse_fields(struct snl_state *ss, struct nlmsghdr *hdr, int hdrlen 
__unused, + const struct snl_field_parser *ps, int pslen, void *target) +{ + for (int i = 0; i < pslen; i++) { + const struct snl_field_parser *fp = &ps[i]; + void *src = (char *)hdr + fp->off_in; + void *dst = (char *)target + fp->off_out; + + fp->cb(ss, src, dst); + } +} + +static inline bool +snl_parse_header(struct snl_state *ss, void *hdr, int len, + const struct snl_hdr_parser *parser, void *target) +{ + struct nlattr *nla_head; + + /* Extract fields first (if any) */ + snl_parse_fields(ss, (struct nlmsghdr *)hdr, parser->in_hdr_size, + parser->fp, parser->fp_size, target); + + nla_head = (struct nlattr *)(void *)((char *)hdr + parser->in_hdr_size); + bool result = snl_parse_attrs_raw(ss, nla_head, len - parser->in_hdr_size, + parser->np, parser->np_size, target); + + if (result && parser->cb_post != NULL) + result = parser->cb_post(ss, target); + + return (result); +} + +static inline bool +snl_parse_nlmsg(struct snl_state *ss, struct nlmsghdr *hdr, + const struct snl_hdr_parser *parser, void *target) +{ + return (snl_parse_header(ss, hdr + 1, hdr->nlmsg_len - sizeof(*hdr), parser, target)); +} + +static inline bool +snl_attr_get_flag(struct snl_state *ss __unused, struct nlattr *nla, const void *arg __unused, + void *target) +{ + if (NLA_DATA_LEN(nla) == 0) { + *((uint8_t *)target) = 1; + return (true); + } + return (false); +} + +static inline bool +snl_attr_get_bytes(struct snl_state *ss __unused, struct nlattr *nla, const void *arg, + void *target) +{ + if ((size_t)NLA_DATA_LEN(nla) != (size_t)arg) + return (false); + + memcpy(target, NLA_DATA_CONST(nla), (size_t)arg); + + return (true); +} + +static inline bool +snl_attr_get_bool(struct snl_state *ss __unused, struct nlattr *nla, + const void *arg __unused, void *target) +{ + if (NLA_DATA_LEN(nla) == sizeof(bool)) { + *((bool *)target) = *((const bool *)NLA_DATA_CONST(nla)); + return (true); + } + return (false); +} + +static inline bool +snl_attr_get_uint8(struct snl_state *ss __unused, struct nlattr 
*nla, + const void *arg __unused, void *target) +{ + if (NLA_DATA_LEN(nla) == sizeof(uint8_t)) { + *((uint8_t *)target) = *((const uint8_t *)NLA_DATA_CONST(nla)); + return (true); + } + return (false); +} + +static inline bool +snl_attr_get_uint16(struct snl_state *ss __unused, struct nlattr *nla, + const void *arg __unused, void *target) +{ + if (NLA_DATA_LEN(nla) == sizeof(uint16_t)) { + *((uint16_t *)target) = *((const uint16_t *)NLA_DATA_CONST(nla)); + return (true); + } + return (false); +} + +static inline bool +snl_attr_get_uint32(struct snl_state *ss __unused, struct nlattr *nla, + const void *arg __unused, void *target) +{ + if (NLA_DATA_LEN(nla) == sizeof(uint32_t)) { + *((uint32_t *)target) = *((const uint32_t *)NLA_DATA_CONST(nla)); + return (true); + } + return (false); +} + +static inline bool +snl_attr_get_uint64(struct snl_state *ss __unused, struct nlattr *nla, + const void *arg __unused, void *target) +{ + if (NLA_DATA_LEN(nla) == sizeof(uint64_t)) { + memcpy(target, NLA_DATA_CONST(nla), sizeof(uint64_t)); + return (true); + } + return (false); +} + +static inline bool +snl_attr_get_int8(struct snl_state *ss, struct nlattr *nla, const void *arg, + void *target) +{ + return (snl_attr_get_uint8(ss, nla, arg, target)); +} + +static inline bool +snl_attr_get_int16(struct snl_state *ss, struct nlattr *nla, const void *arg, + void *target) +{ + return (snl_attr_get_uint16(ss, nla, arg, target)); +} + +static inline bool +snl_attr_get_int32(struct snl_state *ss, struct nlattr *nla, const void *arg, + void *target) +{ + return (snl_attr_get_uint32(ss, nla, arg, target)); +} + +static inline bool +snl_attr_get_int64(struct snl_state *ss, struct nlattr *nla, const void *arg, + void *target) +{ + return (snl_attr_get_uint64(ss, nla, arg, target)); +} + +static inline bool +snl_attr_get_string(struct snl_state *ss __unused, struct nlattr *nla, + const void *arg __unused, void *target) +{ + size_t maxlen = NLA_DATA_LEN(nla); + + if (strnlen((char 
*)NLA_DATA(nla), maxlen) < maxlen) { + *((char **)target) = (char *)NLA_DATA(nla); + return (true); + } + return (false); +} + +static inline bool +snl_attr_get_stringn(struct snl_state *ss, struct nlattr *nla, + const void *arg __unused, void *target) +{ + int maxlen = NLA_DATA_LEN(nla); + + char *buf = (char *)snl_allocz(ss, maxlen + 1); + if (buf == NULL) + return (false); + buf[maxlen] = '\0'; + memcpy(buf, NLA_DATA(nla), maxlen); + + *((char **)target) = buf; + return (true); +} + +static inline bool +snl_attr_copy_string(struct snl_state *ss, struct nlattr *nla, + const void *arg, void *target) +{ + char *tmp; + + if (snl_attr_get_string(ss, nla, NULL, &tmp)) { + strlcpy((char *)target, tmp, (size_t)arg); + return (true); + } + return (false); +} + +static inline bool +snl_attr_dup_string(struct snl_state *ss __unused, struct nlattr *nla, + const void *arg __unused, void *target) +{ + size_t maxlen = NLA_DATA_LEN(nla); + + if (strnlen((char *)NLA_DATA(nla), maxlen) < maxlen) { + char *buf = (char *)snl_allocz(ss, maxlen); + if (buf == NULL) + return (false); + memcpy(buf, NLA_DATA(nla), maxlen); + *((char **)target) = buf; + return (true); + } + return (false); +} + +static inline bool +snl_attr_get_nested(struct snl_state *ss, struct nlattr *nla, const void *arg, void *target) +{ + const struct snl_hdr_parser *p = (const struct snl_hdr_parser *)arg; + + /* Assumes target points to the beginning of the structure */ + return (snl_parse_header(ss, NLA_DATA(nla), NLA_DATA_LEN(nla), p, target)); +} + +struct snl_parray { + uint32_t count; + void **items; +}; + +static inline bool +snl_attr_get_parray_sz(struct snl_state *ss, struct nlattr *container_nla, + uint32_t start_size, const void *arg, void *target) +{ + const struct snl_hdr_parser *p = (const struct snl_hdr_parser *)arg; + struct snl_parray *array = (struct snl_parray *)target; + struct nlattr *nla; + uint32_t count = 0, size = start_size; + + if (p->out_size == 0) + return (false); + + array->items = 
(void **)snl_allocz(ss, size * sizeof(void *)); + if (array->items == NULL) + return (false); + + /* + * If the provided parser is an attribute parser, assume that each + * nla in the container nla is the container nla itself and parse + * the contents of this nla. + * Otherwise, run the parser on raw data, assuming the header of this + * data has u16 field with total size in the beginning. + */ + uint32_t data_off = 0; + + if (p->in_hdr_size == 0) + data_off = sizeof(struct nlattr); + + NLA_FOREACH(nla, NLA_DATA(container_nla), NLA_DATA_LEN(container_nla)) { + void *item = snl_allocz(ss, p->out_size); + + if (item == NULL) + return (false); + + void *data = (char *)(void *)nla + data_off; + int data_len = nla->nla_len - data_off; + + if (!(snl_parse_header(ss, data, data_len, p, item))) + return (false); + + if (count == size) { + uint32_t new_size = size * 2; + void **new_array = (void **)snl_allocz(ss, new_size *sizeof(void *)); + + memcpy(new_array, array->items, size * sizeof(void *)); + array->items = new_array; + size = new_size; + } + array->items[count++] = item; + } + array->count = count; + + return (true); +} + +/* + * Parses and stores the unknown-size array. + * Assumes each array item is a container and the NLAs in the container are parsable + * by the parser provided in @arg. 
+ * Assumes @target is struct snl_parray + */ +static inline bool +snl_attr_get_parray(struct snl_state *ss, struct nlattr *nla, const void *arg, void *target) +{ + return (snl_attr_get_parray_sz(ss, nla, 8, arg, target)); +} + +static inline bool +snl_attr_get_nla(struct snl_state *ss __unused, struct nlattr *nla, + const void *arg __unused, void *target) +{ + *((struct nlattr **)target) = nla; + return (true); +} + +static inline bool +snl_attr_dup_nla(struct snl_state *ss, struct nlattr *nla, + const void *arg __unused, void *target) +{ + void *ptr = snl_allocz(ss, nla->nla_len); + + if (ptr != NULL) { + memcpy(ptr, nla, nla->nla_len); + *((void **)target) = ptr; + return (true); + } + return (false); +} + +static inline bool +snl_attr_copy_struct(struct snl_state *ss, struct nlattr *nla, + const void *arg __unused, void *target) +{ + void *ptr = snl_allocz(ss, NLA_DATA_LEN(nla)); + + if (ptr != NULL) { + memcpy(ptr, NLA_DATA(nla), NLA_DATA_LEN(nla)); + *((void **)target) = ptr; + return (true); + } + return (false); +} + +static inline bool +snl_attr_dup_struct(struct snl_state *ss, struct nlattr *nla, + const void *arg __unused, void *target) +{ + void *ptr = snl_allocz(ss, NLA_DATA_LEN(nla)); + + if (ptr != NULL) { + memcpy(ptr, NLA_DATA(nla), NLA_DATA_LEN(nla)); + *((void **)target) = ptr; + return (true); + } + return (false); +} + +struct snl_attr_bit { + uint32_t bit_index; + char *bit_name; + int bit_value; +}; + +struct snl_attr_bits { + uint32_t num_bits; + struct snl_attr_bit **bits; +}; + +#define _OUT(_field) offsetof(struct snl_attr_bit, _field) +static const struct snl_attr_parser _nla_p_bit[] = { + { .type = NLA_BITSET_BIT_INDEX, .off = _OUT(bit_index), .cb = snl_attr_get_uint32 }, + { .type = NLA_BITSET_BIT_NAME, .off = _OUT(bit_name), .cb = snl_attr_dup_string }, + { .type = NLA_BITSET_BIT_VALUE, .off = _OUT(bit_value), .cb = snl_attr_get_flag }, +}; +#undef _OUT +SNL_DECLARE_ATTR_PARSER_EXT(_nla_bit_parser, sizeof(struct snl_attr_bit), 
_nla_p_bit, NULL); + +struct snl_attr_bitset { + uint32_t nla_bitset_size; + uint32_t *nla_bitset_mask; + uint32_t *nla_bitset_value; + struct snl_attr_bits bits; +}; + +#define _OUT(_field) offsetof(struct snl_attr_bitset, _field) +static const struct snl_attr_parser _nla_p_bitset[] = { + { .type = NLA_BITSET_SIZE, .off = _OUT(nla_bitset_size), .cb = snl_attr_get_uint32 }, + { .type = NLA_BITSET_BITS, .off = _OUT(bits), .cb = snl_attr_get_parray, .arg = &_nla_bit_parser }, + { .type = NLA_BITSET_VALUE, .off = _OUT(nla_bitset_mask), .cb = snl_attr_dup_nla }, + { .type = NLA_BITSET_MASK, .off = _OUT(nla_bitset_value), .cb = snl_attr_dup_nla }, +}; + +static inline bool +_cb_p_bitset(struct snl_state *ss __unused, void *_target) +{ + struct snl_attr_bitset *target = (struct snl_attr_bitset *)_target; + + uint32_t sz_bytes = _roundup2(target->nla_bitset_size, 32) / 8; + + if (target->nla_bitset_mask != NULL) { + struct nlattr *nla = (struct nlattr *)target->nla_bitset_mask; + uint32_t data_len = NLA_DATA_LEN(nla); + + if (data_len != sz_bytes || _roundup2(data_len, 4) != data_len) + return (false); + target->nla_bitset_mask = (uint32_t *)NLA_DATA(nla); + } + + if (target->nla_bitset_value != NULL) { + struct nlattr *nla = (struct nlattr *)target->nla_bitset_value; + uint32_t data_len = NLA_DATA_LEN(nla); + + if (data_len != sz_bytes || _roundup2(data_len, 4) != data_len) + return (false); + target->nla_bitset_value = (uint32_t *)NLA_DATA(nla); + } + return (true); +} +#undef _OUT +SNL_DECLARE_ATTR_PARSER_EXT(_nla_bitset_parser, + sizeof(struct snl_attr_bitset), + _nla_p_bitset, _cb_p_bitset); + +/* + * Parses the compact bitset representation. 
+ */ +static inline bool +snl_attr_get_bitset_c(struct snl_state *ss, struct nlattr *nla, + const void *arg __unused, void *_target) +{ + const struct snl_hdr_parser *p = &_nla_bitset_parser; + struct snl_attr_bitset *target = (struct snl_attr_bitset *)_target; + + /* Assumes target points to the beginning of the structure */ + if (!snl_parse_header(ss, NLA_DATA(nla), NLA_DATA_LEN(nla), p, _target)) + return (false); + if (target->nla_bitset_mask == NULL || target->nla_bitset_value == NULL) + return (false); + return (true); +} + +static inline void +snl_field_get_uint8(struct snl_state *ss __unused, void *src, void *target) +{ + *((uint8_t *)target) = *((uint8_t *)src); +} + +static inline void +snl_field_get_uint16(struct snl_state *ss __unused, void *src, void *target) +{ + *((uint16_t *)target) = *((uint16_t *)src); +} + +static inline void +snl_field_get_uint32(struct snl_state *ss __unused, void *src, void *target) +{ + *((uint32_t *)target) = *((uint32_t *)src); +} + +static inline void +snl_field_get_ptr(struct snl_state *ss __unused, void *src, void *target) +{ + *((void **)target) = src; +} + +struct snl_errmsg_data { + struct nlmsghdr *orig_hdr; + int error; + uint32_t error_offs; + char *error_str; + struct nlattr *cookie; +}; + +#define _IN(_field) offsetof(struct nlmsgerr, _field) +#define _OUT(_field) offsetof(struct snl_errmsg_data, _field) +static const struct snl_attr_parser nla_p_errmsg[] = { + { .type = NLMSGERR_ATTR_MSG, .off = _OUT(error_str), .cb = snl_attr_get_string }, + { .type = NLMSGERR_ATTR_OFFS, .off = _OUT(error_offs), .cb = snl_attr_get_uint32 }, + { .type = NLMSGERR_ATTR_COOKIE, .off = _OUT(cookie), .cb = snl_attr_get_nla }, +}; + +static const struct snl_field_parser nlf_p_errmsg[] = { + { .off_in = _IN(error), .off_out = _OUT(error), .cb = snl_field_get_uint32 }, + { .off_in = _IN(msg), .off_out = _OUT(orig_hdr), .cb = snl_field_get_ptr }, +}; +#undef _IN +#undef _OUT +SNL_DECLARE_PARSER(snl_errmsg_parser, struct nlmsgerr, 
nlf_p_errmsg, nla_p_errmsg); + +#define _IN(_field) offsetof(struct nlmsgerr, _field) +#define _OUT(_field) offsetof(struct snl_errmsg_data, _field) +static const struct snl_field_parser nlf_p_donemsg[] = { + { .off_in = _IN(error), .off_out = _OUT(error), .cb = snl_field_get_uint32 }, +}; +#undef _IN +#undef _OUT +SNL_DECLARE_FIELD_PARSER(snl_donemsg_parser, struct nlmsgerr, nlf_p_donemsg); + +static inline bool +snl_parse_errmsg(struct snl_state *ss, struct nlmsghdr *hdr, struct snl_errmsg_data *e) +{ + if ((hdr->nlmsg_flags & NLM_F_CAPPED) != 0) + return (snl_parse_nlmsg(ss, hdr, &snl_errmsg_parser, e)); + + const struct snl_hdr_parser *ps = &snl_errmsg_parser; + struct nlmsgerr *errmsg = (struct nlmsgerr *)(hdr + 1); + int hdrlen = sizeof(int) + NLMSG_ALIGN(errmsg->msg.nlmsg_len); + struct nlattr *attr_head = (struct nlattr *)(void *)((char *)errmsg + hdrlen); + int attr_len = hdr->nlmsg_len - sizeof(struct nlmsghdr) - hdrlen; + + snl_parse_fields(ss, (struct nlmsghdr *)errmsg, hdrlen, ps->fp, ps->fp_size, e); + return (snl_parse_attrs_raw(ss, attr_head, attr_len, ps->np, ps->np_size, e)); +} + +static inline bool +snl_read_reply_code(struct snl_state *ss, uint32_t nlmsg_seq, struct snl_errmsg_data *e) +{ + struct nlmsghdr *hdr = snl_read_reply(ss, nlmsg_seq); + + if (hdr == NULL) { + e->error = EINVAL; + } else if (hdr->nlmsg_type == NLMSG_ERROR) { + if (!snl_parse_errmsg(ss, hdr, e)) + e->error = EINVAL; + return (e->error == 0); + } + + return (false); +} + +#define _OUT(_field) offsetof(struct snl_msg_info, _field) +static const struct snl_attr_parser _nla_p_cinfo[] = { + { .type = NLMSGINFO_ATTR_PROCESS_ID, .off = _OUT(process_id), .cb = snl_attr_get_uint32 }, + { .type = NLMSGINFO_ATTR_PORT_ID, .off = _OUT(port_id), .cb = snl_attr_get_uint32 }, + { .type = NLMSGINFO_ATTR_SEQ_ID, .off = _OUT(seq_id), .cb = snl_attr_get_uint32 }, +}; +#undef _OUT +SNL_DECLARE_ATTR_PARSER(snl_msg_info_parser, _nla_p_cinfo); + +static inline bool +parse_cmsg(struct snl_state 
*ss, const struct msghdr *msg, struct snl_msg_info *attrs) +{ + for (struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; + cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (cmsg->cmsg_level != SOL_NETLINK || cmsg->cmsg_type != NETLINK_MSG_INFO) + continue; + + void *data = CMSG_DATA(cmsg); + int len = cmsg->cmsg_len - ((char *)data - (char *)cmsg); + const struct snl_hdr_parser *ps = &snl_msg_info_parser; + + return (snl_parse_attrs_raw(ss, (struct nlattr *)data, len, ps->np, ps->np_size, attrs)); + } + + return (false); +} + +/* + * Assumes e is zeroed + */ +static inline struct nlmsghdr * +snl_read_reply_multi(struct snl_state *ss, uint32_t nlmsg_seq, struct snl_errmsg_data *e) +{ + struct nlmsghdr *hdr = snl_read_reply(ss, nlmsg_seq); + + if (hdr == NULL) { + e->error = EINVAL; + } else if (hdr->nlmsg_type == NLMSG_ERROR) { + if (!snl_parse_errmsg(ss, hdr, e)) + e->error = EINVAL; + } else if (hdr->nlmsg_type == NLMSG_DONE) { + snl_parse_nlmsg(ss, hdr, &snl_donemsg_parser, e); + } else + return (hdr); + + return (NULL); +} + + +/* writer logic */ +struct snl_writer { + char *base; + uint32_t offset; + uint32_t size; + struct nlmsghdr *hdr; + struct snl_state *ss; + bool error; +}; + +static inline void +snl_init_writer(struct snl_state *ss, struct snl_writer *nw) +{ + nw->size = SNL_WRITER_BUFFER_SIZE; + nw->base = (char *)snl_allocz(ss, nw->size); + if (nw->base == NULL) { + nw->error = true; + nw->size = 0; + } + + nw->offset = 0; + nw->hdr = NULL; + nw->error = false; + nw->ss = ss; +} + +static inline bool +snl_realloc_msg_buffer(struct snl_writer *nw, size_t sz) +{ + uint32_t new_size = nw->size * 2; + + while (new_size < nw->size + sz) + new_size *= 2; + + if (nw->error) + return (false); + + if (snl_allocz(nw->ss, new_size) == NULL) { + nw->error = true; + return (false); + } + nw->size = new_size; + + void *new_base = nw->ss->lb->base; + if (new_base != nw->base) { + memcpy(new_base, nw->base, nw->offset); + if (nw->hdr != NULL) { + int hdr_off = (char 
*)(nw->hdr) - nw->base; + + nw->hdr = (struct nlmsghdr *) + (void *)((char *)new_base + hdr_off); + } + nw->base = (char *)new_base; + } + + return (true); +} + +static inline void * +snl_reserve_msg_data_raw(struct snl_writer *nw, size_t sz) +{ + sz = NETLINK_ALIGN(sz); + + if (__predict_false(nw->offset + sz > nw->size)) { + if (!snl_realloc_msg_buffer(nw, sz)) + return (NULL); + } + + void *data_ptr = &nw->base[nw->offset]; + nw->offset += sz; + + return (data_ptr); +} +#define snl_reserve_msg_object(_ns, _t) ((_t *)snl_reserve_msg_data_raw(_ns, sizeof(_t))) +#define snl_reserve_msg_data(_ns, _sz, _t) ((_t *)snl_reserve_msg_data_raw(_ns, _sz)) + +static inline struct nlattr * +snl_reserve_msg_attr_raw(struct snl_writer *nw, uint16_t nla_type, uint16_t sz) +{ + struct nlattr *nla; + + sz += sizeof(struct nlattr); + nla = snl_reserve_msg_data(nw, sz, struct nlattr); + if (__predict_false(nla == NULL)) + return (NULL); + nla->nla_type = nla_type; + nla->nla_len = sz; + + return (nla); +} +#define snl_reserve_msg_attr(_ns, _at, _t) \ + ((_t *)(snl_reserve_msg_attr_raw(_ns, _at, sizeof(_t)) + 1)) + +static inline bool +snl_add_msg_attr(struct snl_writer *nw, int attr_type, int attr_len, const void *data) +{ + int required_len = NLA_ALIGN(attr_len + sizeof(struct nlattr)); + + if (__predict_false(nw->offset + required_len > nw->size)) { + if (!snl_realloc_msg_buffer(nw, required_len)) + return (false); + } + + struct nlattr *nla = (struct nlattr *)(void *)(&nw->base[nw->offset]); + + nla->nla_len = attr_len + sizeof(struct nlattr); + nla->nla_type = attr_type; + if (attr_len > 0) { + if ((attr_len % 4) != 0) { + /* clear padding bytes */ + bzero((char *)nla + required_len - 4, 4); + } + memcpy((nla + 1), data, attr_len); + } + nw->offset += required_len; + return (true); +} + +static inline bool +snl_add_msg_attr_raw(struct snl_writer *nw, const struct nlattr *nla_src) +{ + int attr_len = nla_src->nla_len - sizeof(struct nlattr); + + assert(attr_len >= 0); + + return 
(snl_add_msg_attr(nw, nla_src->nla_type, attr_len, (const void *)(nla_src + 1))); +} + +static inline bool +snl_add_msg_attr_bool(struct snl_writer *nw, int attrtype, bool value) +{ + return (snl_add_msg_attr(nw, attrtype, sizeof(bool), &value)); +} + +static inline bool +snl_add_msg_attr_u8(struct snl_writer *nw, int attrtype, uint8_t value) +{ + return (snl_add_msg_attr(nw, attrtype, sizeof(uint8_t), &value)); +} + +static inline bool +snl_add_msg_attr_u16(struct snl_writer *nw, int attrtype, uint16_t value) +{ + return (snl_add_msg_attr(nw, attrtype, sizeof(uint16_t), &value)); +} + +static inline bool +snl_add_msg_attr_u32(struct snl_writer *nw, int attrtype, uint32_t value) +{ + return (snl_add_msg_attr(nw, attrtype, sizeof(uint32_t), &value)); +} + +static inline bool +snl_add_msg_attr_u64(struct snl_writer *nw, int attrtype, uint64_t value) +{ + return (snl_add_msg_attr(nw, attrtype, sizeof(uint64_t), &value)); +} + +static inline bool +snl_add_msg_attr_s8(struct snl_writer *nw, int attrtype, int8_t value) +{ + return (snl_add_msg_attr(nw, attrtype, sizeof(int8_t), &value)); +} + +static inline bool +snl_add_msg_attr_s16(struct snl_writer *nw, int attrtype, int16_t value) +{ + return (snl_add_msg_attr(nw, attrtype, sizeof(int16_t), &value)); +} + +static inline bool +snl_add_msg_attr_s32(struct snl_writer *nw, int attrtype, int32_t value) +{ + return (snl_add_msg_attr(nw, attrtype, sizeof(int32_t), &value)); +} + +static inline bool +snl_add_msg_attr_s64(struct snl_writer *nw, int attrtype, int64_t value) +{ + return (snl_add_msg_attr(nw, attrtype, sizeof(int64_t), &value)); +} + +static inline bool +snl_add_msg_attr_flag(struct snl_writer *nw, int attrtype) +{ + return (snl_add_msg_attr(nw, attrtype, 0, NULL)); +} + +static inline bool +snl_add_msg_attr_string(struct snl_writer *nw, int attrtype, const char *str) +{ + return (snl_add_msg_attr(nw, attrtype, strlen(str) + 1, str)); +} + + +static inline int +snl_get_msg_offset(const struct snl_writer *nw) +{ 
+ return (nw->offset - ((char *)nw->hdr - nw->base)); +} + +static inline void * +_snl_restore_msg_offset(const struct snl_writer *nw, int off) +{ + return ((void *)((char *)nw->hdr + off)); +} +#define snl_restore_msg_offset(_ns, _off, _t) ((_t *)_snl_restore_msg_offset(_ns, _off)) + +static inline int +snl_add_msg_attr_nested(struct snl_writer *nw, int attrtype) +{ + int off = snl_get_msg_offset(nw); + struct nlattr *nla = snl_reserve_msg_data(nw, sizeof(struct nlattr), struct nlattr); + if (__predict_false(nla == NULL)) + return (0); + nla->nla_type = attrtype; + return (off); +} + +static inline void +snl_end_attr_nested(const struct snl_writer *nw, int off) +{ + if (!nw->error) { + struct nlattr *nla = snl_restore_msg_offset(nw, off, struct nlattr); + nla->nla_len = NETLINK_ALIGN(snl_get_msg_offset(nw) - off); + } +} + +static inline struct nlmsghdr * +snl_create_msg_request(struct snl_writer *nw, int nlmsg_type) +{ + struct nlmsghdr *hdr; + + assert(nw->hdr == NULL); + + if (__predict_false((hdr = + snl_reserve_msg_object(nw, struct nlmsghdr)) == NULL)) + return (NULL); + hdr->nlmsg_type = nlmsg_type; + hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + nw->hdr = hdr; + + return (hdr); +} + +static void +snl_abort_msg(struct snl_writer *nw) +{ + if (nw->hdr != NULL) { + int offset = (char *)(&nw->base[nw->offset]) - (char *)(nw->hdr); + + nw->offset -= offset; + nw->hdr = NULL; + } +} + +static inline struct nlmsghdr * +snl_finalize_msg(struct snl_writer *nw) +{ + if (nw->error) + snl_abort_msg(nw); + if (nw->hdr != NULL) { + struct nlmsghdr *hdr = nw->hdr; + + int offset = (char *)(&nw->base[nw->offset]) - (char *)(nw->hdr); + hdr->nlmsg_len = offset; + hdr->nlmsg_seq = snl_get_seq(nw->ss); + nw->hdr = NULL; + + return (hdr); + } + return (NULL); +} + +static inline bool +snl_send_msgs(struct snl_writer *nw) +{ + int offset = nw->offset; + + assert(nw->hdr == NULL); + nw->offset = 0; + + return (snl_send(nw->ss, nw->base, offset)); +} + +#endif diff --git 
a/sys/netlink/netlink_snl_generic.h b/sys/netlink/netlink_snl_generic.h new file mode 100644 index 000000000000..10e98a0266e0 --- /dev/null +++ b/sys/netlink/netlink_snl_generic.h @@ -0,0 +1,175 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ +#ifndef _NETLINK_NETLINK_SNL_GENERIC_H_ +#define _NETLINK_NETLINK_SNL_GENERIC_H_ + +#include <netlink/netlink.h> +#include <netlink/netlink_generic.h> +#include <netlink/netlink_snl.h> + +/* Genetlink helpers */ +static inline struct nlmsghdr * +snl_create_genl_msg_request(struct snl_writer *nw, uint16_t genl_family, + uint8_t genl_cmd) +{ + struct nlmsghdr *hdr; + struct genlmsghdr *ghdr; + + assert(nw->hdr == NULL); + + hdr = snl_reserve_msg_object(nw, struct nlmsghdr); + if (__predict_false(hdr == NULL)) + return (NULL); + hdr->nlmsg_type = genl_family; + hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + ghdr = snl_reserve_msg_object(nw, struct genlmsghdr); + if (__predict_false(ghdr == NULL)) + return (NULL); + ghdr->cmd = genl_cmd; + nw->hdr = hdr; + + return (hdr); +} + +static struct snl_field_parser snl_fp_genl[] = {}; + +#define SNL_DECLARE_GENL_PARSER(_name, _np) SNL_DECLARE_PARSER(_name,\ + struct genlmsghdr, snl_fp_genl, _np) + +struct _snl_genl_ctrl_mcast_group { + uint32_t mcast_grp_id; + const char *mcast_grp_name; +}; + +struct _snl_genl_ctrl_mcast_groups { + uint32_t num_groups; + struct _snl_genl_ctrl_mcast_group **groups; +}; + +#define _OUT(_field) offsetof(struct _snl_genl_ctrl_mcast_group, _field) +static struct snl_attr_parser _nla_p_getmc[] = { + { + .type = CTRL_ATTR_MCAST_GRP_NAME, + .off = _OUT(mcast_grp_name), + .cb = snl_attr_get_string, + }, + { + .type = CTRL_ATTR_MCAST_GRP_ID, + .off = _OUT(mcast_grp_id), + .cb = snl_attr_get_uint32, + }, +}; +#undef _OUT +SNL_DECLARE_ATTR_PARSER_EXT(_genl_ctrl_mc_parser, + sizeof(struct _snl_genl_ctrl_mcast_group), _nla_p_getmc, NULL); + +struct _getfamily_attrs { + uint16_t family_id; + const char *family_name; + struct _snl_genl_ctrl_mcast_groups mcast_groups; +}; + +#define _IN(_field) offsetof(struct genlmsghdr, _field) +#define _OUT(_field) offsetof(struct _getfamily_attrs, _field) +static struct snl_attr_parser _nla_p_getfam[] = { + { + .type = CTRL_ATTR_FAMILY_ID, + .off = 
_OUT(family_id), + .cb = snl_attr_get_uint16, + }, + { + .type = CTRL_ATTR_FAMILY_NAME, + .off = _OUT(family_name), + .cb = snl_attr_get_string, + }, + { + .type = CTRL_ATTR_MCAST_GROUPS, + .off = _OUT(mcast_groups), + .cb = snl_attr_get_parray, + .arg = &_genl_ctrl_mc_parser, + }, +}; +#undef _IN +#undef _OUT +SNL_DECLARE_GENL_PARSER(_genl_ctrl_getfam_parser, _nla_p_getfam); + +static bool +_snl_get_genl_family_info(struct snl_state *ss, const char *family_name, + struct _getfamily_attrs *attrs) +{ + struct snl_writer nw; + struct nlmsghdr *hdr; + + memset(attrs, 0, sizeof(*attrs)); + + snl_init_writer(ss, &nw); + snl_create_genl_msg_request(&nw, GENL_ID_CTRL, CTRL_CMD_GETFAMILY); + snl_add_msg_attr_string(&nw, CTRL_ATTR_FAMILY_NAME, family_name); + if ((hdr = snl_finalize_msg(&nw)) == NULL || !snl_send_message(ss, hdr)) + return (false); + + hdr = snl_read_reply(ss, hdr->nlmsg_seq); + if (hdr != NULL && hdr->nlmsg_type != NLMSG_ERROR) { + if (snl_parse_nlmsg(ss, hdr, &_genl_ctrl_getfam_parser, attrs)) + return (true); + } + + return (false); +} + +static inline uint16_t +snl_get_genl_family(struct snl_state *ss, const char *family_name) +{ + struct _getfamily_attrs attrs = {}; + + if (__predict_false(!_snl_get_genl_family_info(ss, family_name, + &attrs))) + return (0); + return (attrs.family_id); +} + +static inline uint16_t +snl_get_genl_mcast_group(struct snl_state *ss, const char *family_name, + const char *group_name, uint16_t *family_id) +{ + struct _getfamily_attrs attrs = {}; + + if (__predict_false(!_snl_get_genl_family_info(ss, family_name, + &attrs))) + return (0); + if (attrs.family_id == 0) + return (0); + if (family_id != NULL) + *family_id = attrs.family_id; + for (u_int i = 0; i < attrs.mcast_groups.num_groups; i++) + if (strcmp(attrs.mcast_groups.groups[i]->mcast_grp_name, + group_name) == 0) + return (attrs.mcast_groups.groups[i]->mcast_grp_id); + return (0); +} + +#endif diff --git a/sys/netlink/netlink_snl_route.h 
b/sys/netlink/netlink_snl_route.h new file mode 100644 index 000000000000..62055b2db417 --- /dev/null +++ b/sys/netlink/netlink_snl_route.h @@ -0,0 +1,201 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ +#ifndef _NETLINK_NETLINK_SNL_ROUTE_H_ +#define _NETLINK_NETLINK_SNL_ROUTE_H_ + +#include <netlink/netlink_snl.h> +#include <netlink/netlink_route.h> +#include <netinet/in.h> + +/* + * Simple Netlink Library - NETLINK_ROUTE helpers + */ + +static inline struct sockaddr * +parse_rta_ip4(struct snl_state *ss, void *rta_data, int *perror) +{ + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)snl_allocz(ss, sizeof(struct sockaddr_in)); + if (sin == NULL) { + *perror = ENOBUFS; + return (NULL); + } + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_family = AF_INET; + memcpy(&sin->sin_addr, rta_data, sizeof(struct in_addr)); + return ((struct sockaddr *)sin); +} + +static inline struct sockaddr * +parse_rta_ip6(struct snl_state *ss, void *rta_data, int *perror) +{ + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)snl_allocz(ss, sizeof(struct sockaddr_in6)); + if (sin6 == NULL) { + *perror = ENOBUFS; + return (NULL); + } + sin6->sin6_len = sizeof(struct sockaddr_in6); + sin6->sin6_family = AF_INET6; + memcpy(&sin6->sin6_addr, rta_data, sizeof(struct in6_addr)); + return ((struct sockaddr *)sin6); +} + +static inline struct sockaddr * +parse_rta_ip(struct snl_state *ss, struct rtattr *rta, int *perror) +{ + void *rta_data = NL_RTA_DATA(rta); + int rta_len = NL_RTA_DATA_LEN(rta); + + if (rta_len == sizeof(struct in_addr)) { + return (parse_rta_ip4(ss, rta_data, perror)); + } else if (rta_len == sizeof(struct in6_addr)) { + return (parse_rta_ip6(ss, rta_data, perror)); + } else { + *perror = ENOTSUP; + return (NULL); + } + return (NULL); +} + +static inline bool +snl_attr_get_ip(struct snl_state *ss, struct nlattr *nla, + const void *arg __unused, void *target) +{ + int error = 0; + struct sockaddr *sa = parse_rta_ip(ss, (struct rtattr *)nla, &error); + if (error == 0) { + *((struct sockaddr **)target) = sa; + return (true); + } + return (false); +} + +static inline struct sockaddr * +parse_rta_via(struct snl_state *ss, struct rtattr *rta, int 
*perror) +{ + struct rtvia *via = (struct rtvia *)NL_RTA_DATA(rta); + + switch (via->rtvia_family) { + case AF_INET: + return (parse_rta_ip4(ss, via->rtvia_addr, perror)); + case AF_INET6: + return (parse_rta_ip6(ss, via->rtvia_addr, perror)); + default: + *perror = ENOTSUP; + return (NULL); + } +} + +static inline bool +snl_attr_get_ipvia(struct snl_state *ss, struct nlattr *nla, + const void *arg __unused, void *target) +{ + int error = 0; + + struct sockaddr *sa = parse_rta_via(ss, (struct rtattr *)nla, &error); + if (error == 0) { + *((struct sockaddr **)target) = sa; + return (true); + } + return (false); +} + +static inline bool +snl_add_msg_attr_ip4(struct snl_writer *nw, int attrtype, const struct in_addr *addr) +{ + return (snl_add_msg_attr(nw, attrtype, 4, addr)); +} + +static inline bool +snl_add_msg_attr_ip6(struct snl_writer *nw, int attrtype, const struct in6_addr *addr) +{ + return (snl_add_msg_attr(nw, attrtype, 16, addr)); +} + +static inline bool +snl_add_msg_attr_ip(struct snl_writer *nw, int attrtype, const struct sockaddr *sa) +{ + const void *addr; + + switch (sa->sa_family) { + case AF_INET: + addr = &((const struct sockaddr_in *)(const void *)sa)->sin_addr; + return (snl_add_msg_attr(nw, attrtype, 4, addr)); + case AF_INET6: + addr = &((const struct sockaddr_in6 *)(const void *)sa)->sin6_addr; + return (snl_add_msg_attr(nw, attrtype, 16, addr)); + } + + return (false); +} + +static inline bool +snl_add_msg_attr_ipvia(struct snl_writer *nw, int attrtype, const struct sockaddr *sa) +{ + char buf[17]; + + buf[0] = sa->sa_family; + + switch (sa->sa_family) { + case AF_INET: + memcpy(&buf[1], &((const struct sockaddr_in *)(const void *)sa)->sin_addr, 4); + return (snl_add_msg_attr(nw, attrtype, 5, buf)); + case AF_INET6: + memcpy(&buf[1], &((const struct sockaddr_in6 *)(const void *)sa)->sin6_addr, 16); + return (snl_add_msg_attr(nw, attrtype, 17, buf)); + } + + return (false); +} + +static inline bool +snl_attr_get_in_addr(struct snl_state *ss 
__unused, struct nlattr *nla, + const void *arg __unused, void *target) +{ + if (NLA_DATA_LEN(nla) != sizeof(struct in_addr)) + return (false); + + memcpy(target, NLA_DATA_CONST(nla), sizeof(struct in_addr)); + return (true); +} + +static inline bool +snl_attr_get_in6_addr(struct snl_state *ss __unused, struct nlattr *nla, + const void *arg __unused, void *target) +{ + if (NLA_DATA_LEN(nla) != sizeof(struct in6_addr)) + return (false); + + memcpy(target, NLA_DATA_CONST(nla), sizeof(struct in6_addr)); + return (true); +} + + +#endif diff --git a/sys/netlink/netlink_snl_route_compat.h b/sys/netlink/netlink_snl_route_compat.h new file mode 100644 index 000000000000..87c65f1adcda --- /dev/null +++ b/sys/netlink/netlink_snl_route_compat.h @@ -0,0 +1,53 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _NETLINK_NETLINK_SNL_ROUTE_COMPAT_H_ +#define _NETLINK_NETLINK_SNL_ROUTE_COMPAT_H_ + +#include <sys/socket.h> +#include <sys/types.h> + +/* + * This file contains netlink-compatible definitions from the + * net/route.h header. + */ +#define NETLINK_COMPAT + +#include <net/route.h> + +#define RTSOCK_RTM_ADD 0x1 +#define RTSOCK_RTM_DELETE 0x2 +#define RTSOCK_RTM_CHANGE 0x3 +#define RTSOCK_RTM_GET 0x4 +#define RTSOCK_RTM_NEWADDR 0xc +#define RTSOCK_RTM_DELADDR 0xd +#define RTSOCK_RTM_IFINFO 0xe +#define RTSOCK_RTM_NEWMADDR 0xf +#define RTSOCK_RTM_DELMADDR 0x10 +#define RTSOCK_RTM_IFANNOUNCE 0x11 +#define RTSOCK_RTM_IEEE80211 0x12 + +#endif diff --git a/sys/netlink/netlink_snl_route_parsers.h b/sys/netlink/netlink_snl_route_parsers.h new file mode 100644 index 000000000000..6b7a8188180d --- /dev/null +++ b/sys/netlink/netlink_snl_route_parsers.h @@ -0,0 +1,392 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
#ifndef _NETLINK_NETLINK_SNL_ROUTE_PARSERS_H_
#define _NETLINK_NETLINK_SNL_ROUTE_PARSERS_H_

#include <netlink/netlink_snl.h>
#include <netlink/netlink_snl_route.h>
#include <netlink/route/nexthop.h>

/* TODO: this file should be generated automatically */

/*
 * Fills in sin6_scope_id with `ifindex` when `sa` is an IPv6 link-local
 * address.  NULL pointers, other families and non-link-local addresses are
 * left untouched.
 */
static inline void
finalize_sockaddr(struct sockaddr *sa, uint32_t ifindex)
{
	if (sa != NULL && sa->sa_family == AF_INET6) {
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)(void *)sa;

		if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
			sin6->sin6_scope_id = ifindex;
	}
}

/* RTM_<NEW|DEL|GET>ROUTE message parser */

/* One parsed nexthop of a multipath route (input header: struct rtnexthop). */
struct rta_mpath_nh {
	struct sockaddr	*gw;
	uint32_t	ifindex;
	uint8_t		rtnh_flags;
	uint8_t		rtnh_weight;
	uint32_t	rtax_mtu;
	uint32_t	rta_rtflags;
};

#define _IN(_field) offsetof(struct rtnexthop, _field)
#define _OUT(_field) offsetof(struct rta_mpath_nh, _field)
static const struct snl_attr_parser _nla_p_mp_nh_metrics[] = {
	{ .type = NL_RTAX_MTU, .off = _OUT(rtax_mtu), .cb = snl_attr_get_uint32 },
};
SNL_DECLARE_ATTR_PARSER(_metrics_mp_nh_parser, _nla_p_mp_nh_metrics);

/*
 * NL_RTA_GATEWAY and NL_RTA_VIA both store into `gw`.  The NL_RTA_METRICS
 * entry has no .off, so the nested metrics parser's offsets are relative to
 * the same struct rta_mpath_nh.
 */
static const struct snl_attr_parser _nla_p_mp_nh[] = {
	{ .type = NL_RTA_GATEWAY, .off = _OUT(gw), .cb = snl_attr_get_ip },
	{ .type = NL_RTA_METRICS, .arg = &_metrics_mp_nh_parser, .cb = snl_attr_get_nested },
	{ .type = NL_RTA_RTFLAGS, .off = _OUT(rta_rtflags), .cb = snl_attr_get_uint32 },
	{ .type = NL_RTA_VIA, .off = _OUT(gw), .cb = snl_attr_get_ipvia },
};

/* Note: rtnh_hops from the wire header is stored as rtnh_weight. */
static const struct snl_field_parser _fp_p_mp_nh[] = {
	{ .off_in = _IN(rtnh_flags), .off_out = _OUT(rtnh_flags), .cb = snl_field_get_uint8 },
	{ .off_in = _IN(rtnh_hops), .off_out = _OUT(rtnh_weight), .cb = snl_field_get_uint8 },
	{ .off_in = _IN(rtnh_ifindex), .off_out = _OUT(ifindex), .cb = snl_field_get_uint32 },
};

/* Parser callback: scopes a link-local gateway to the nexthop's ifindex. */
static inline bool
_cb_p_mp_nh(struct snl_state *ss __unused, void *_target)
{
	struct rta_mpath_nh *target = (struct rta_mpath_nh *)_target;

	finalize_sockaddr(target->gw, target->ifindex);
	return (true);
}
#undef _IN
#undef _OUT
SNL_DECLARE_PARSER_EXT(_mpath_nh_parser, sizeof(struct rtnexthop),
    sizeof(struct rta_mpath_nh), _fp_p_mp_nh, _nla_p_mp_nh,
    _cb_p_mp_nh);

/* Parsed NL_RTA_MULTIPATH: `num_nhops` pointers in `nhops`. */
struct rta_mpath {
	uint32_t		num_nhops;
	struct rta_mpath_nh	**nhops;
};

/*
 * Attribute callback for NL_RTA_MULTIPATH.  Picks a power-of-two starting
 * size (at least 4) that is not smaller than the payload length divided by
 * sizeof(struct rtnexthop) and hands the parsing to snl_attr_get_parray_sz()
 * with the per-nexthop parser.
 * NOTE(review): start_size is presumably the initial pointer-array
 * capacity -- confirm against snl_attr_get_parray_sz().
 */
static bool
nlattr_get_multipath(struct snl_state *ss, struct nlattr *nla,
    const void *arg __unused, void *target)
{
	uint32_t start_size = 4;

	while (start_size < NLA_DATA_LEN(nla) / sizeof(struct rtnexthop))
		start_size *= 2;

	return (snl_attr_get_parray_sz(ss, nla, start_size, &_mpath_nh_parser, target));
}

/*
 * Output of snl_rtm_route_parser.
 * NOTE(review): NL_RTA_EXPIRES is stored into rta_expire; no entry in the
 * tables below writes rta_expires or rta_metrics.
 */
struct snl_parsed_route {
	struct sockaddr		*rta_dst;
	struct sockaddr		*rta_gw;
	struct nlattr		*rta_metrics;
	struct rta_mpath	rta_multipath;
	uint32_t		rta_expires;
	uint32_t		rta_oif;
	uint32_t		rta_expire;
	uint32_t		rta_table;
	uint32_t		rta_knh_id;
	uint32_t		rta_nh_id;
	uint32_t		rta_rtflags;
	uint32_t		rtax_mtu;
	uint32_t		rtax_weight;
	uint8_t			rtm_family;
	uint8_t			rtm_type;
	uint8_t			rtm_protocol;
	uint8_t			rtm_dst_len;
};

#define _IN(_field) offsetof(struct rtmsg, _field)
#define _OUT(_field) offsetof(struct snl_parsed_route, _field)
static const struct snl_attr_parser _nla_p_rtmetrics[] = {
	{ .type = NL_RTAX_MTU, .off = _OUT(rtax_mtu), .cb = snl_attr_get_uint32 },
};
SNL_DECLARE_ATTR_PARSER(_metrics_parser, _nla_p_rtmetrics);

/* NL_RTA_GATEWAY and NL_RTA_VIA both store into rta_gw. */
static const struct snl_attr_parser _nla_p_route[] = {
	{ .type = NL_RTA_DST, .off = _OUT(rta_dst), .cb = snl_attr_get_ip },
	{ .type = NL_RTA_OIF, .off = _OUT(rta_oif), .cb = snl_attr_get_uint32 },
	{ .type = NL_RTA_GATEWAY, .off = _OUT(rta_gw), .cb = snl_attr_get_ip },
	{ .type = NL_RTA_METRICS, .arg = &_metrics_parser, .cb = snl_attr_get_nested },
	{ .type = NL_RTA_MULTIPATH, .off = _OUT(rta_multipath), .cb = nlattr_get_multipath },
	{ .type = NL_RTA_KNH_ID, .off = _OUT(rta_knh_id), .cb = snl_attr_get_uint32 },
	{ .type = NL_RTA_WEIGHT, .off = _OUT(rtax_weight), .cb = snl_attr_get_uint32 },
	{ .type = NL_RTA_RTFLAGS, .off = _OUT(rta_rtflags), .cb = snl_attr_get_uint32 },
	{ .type = NL_RTA_TABLE, .off = _OUT(rta_table), .cb = snl_attr_get_uint32 },
	{ .type = NL_RTA_VIA, .off = _OUT(rta_gw), .cb = snl_attr_get_ipvia },
	{ .type = NL_RTA_EXPIRES, .off = _OUT(rta_expire), .cb = snl_attr_get_uint32 },
	{ .type = NL_RTA_NH_ID, .off = _OUT(rta_nh_id), .cb = snl_attr_get_uint32 },
};

static const struct snl_field_parser _fp_p_route[] = {
	{.off_in = _IN(rtm_family), .off_out = _OUT(rtm_family), .cb = snl_field_get_uint8 },
	{.off_in = _IN(rtm_type), .off_out = _OUT(rtm_type), .cb = snl_field_get_uint8 },
	{.off_in = _IN(rtm_protocol), .off_out = _OUT(rtm_protocol), .cb = snl_field_get_uint8 },
	{.off_in = _IN(rtm_dst_len), .off_out = _OUT(rtm_dst_len), .cb = snl_field_get_uint8 },
};

/*
 * Parser callback: scopes link-local destination and gateway addresses to
 * the route's output interface.
 */
static inline bool
_cb_p_route(struct snl_state *ss __unused, void *_target)
{
	struct snl_parsed_route *target = (struct snl_parsed_route *)_target;

	finalize_sockaddr(target->rta_dst, target->rta_oif);
	finalize_sockaddr(target->rta_gw, target->rta_oif);
	return (true);
}
#undef _IN
#undef _OUT
SNL_DECLARE_PARSER_EXT(snl_rtm_route_parser, sizeof(struct rtmsg),
    sizeof(struct snl_parsed_route), _fp_p_route, _nla_p_route,
    _cb_p_route);

/* RTM_<NEW|DEL|GET>LINK message parser */
struct snl_parsed_link {
	uint32_t			ifi_index;
	uint32_t			ifi_flags;
	uint32_t			ifi_change;
	uint16_t			ifi_type;
	uint8_t				ifla_operstate;
	uint8_t				ifla_carrier;
	uint32_t			ifla_mtu;
	char				*ifla_ifname;
	struct nlattr			*ifla_address;
	struct nlattr			*ifla_broadcast;
	char				*ifla_ifalias;
	uint32_t			ifla_promiscuity;
	struct rtnl_link_stats64	*ifla_stats64;
	struct nlattr			*iflaf_orig_hwaddr;
	struct snl_attr_bitset		iflaf_caps;
};

#define _IN(_field) offsetof(struct ifinfomsg, _field)
#define _OUT(_field) offsetof(struct snl_parsed_link, _field)
/* Attributes nested inside IFLA_FREEBSD. */
static const struct snl_attr_parser _nla_p_link_fbsd[] = {
	{ .type = IFLAF_ORIG_HWADDR, .off = _OUT(iflaf_orig_hwaddr), .cb = snl_attr_dup_nla },
	{ .type = IFLAF_CAPS, .off = _OUT(iflaf_caps), .cb = snl_attr_get_bitset_c },
};
SNL_DECLARE_ATTR_PARSER(_link_fbsd_parser, _nla_p_link_fbsd);

static const struct snl_attr_parser _nla_p_link[] = {
	{ .type = IFLA_ADDRESS, .off = _OUT(ifla_address), .cb = snl_attr_dup_nla },
	{ .type = IFLA_BROADCAST, .off = _OUT(ifla_broadcast), .cb = snl_attr_dup_nla },
	{ .type = IFLA_IFNAME, .off = _OUT(ifla_ifname), .cb = snl_attr_dup_string },
	{ .type = IFLA_MTU, .off = _OUT(ifla_mtu), .cb = snl_attr_get_uint32 },
	{ .type = IFLA_OPERSTATE, .off = _OUT(ifla_operstate), .cb = snl_attr_get_uint8 },
	{ .type = IFLA_IFALIAS, .off = _OUT(ifla_ifalias), .cb = snl_attr_dup_string },
	{ .type = IFLA_STATS64, .off = _OUT(ifla_stats64), .cb = snl_attr_dup_struct },
	{ .type = IFLA_PROMISCUITY, .off = _OUT(ifla_promiscuity), .cb = snl_attr_get_uint32 },
	{ .type = IFLA_CARRIER, .off = _OUT(ifla_carrier), .cb = snl_attr_get_uint8 },
	{ .type = IFLA_FREEBSD, .arg = &_link_fbsd_parser, .cb = snl_attr_get_nested },
};
static const struct snl_field_parser _fp_p_link[] = {
	{.off_in = _IN(ifi_index), .off_out = _OUT(ifi_index), .cb = snl_field_get_uint32 },
	{.off_in = _IN(ifi_flags), .off_out = _OUT(ifi_flags), .cb = snl_field_get_uint32 },
	{.off_in = _IN(ifi_change), .off_out = _OUT(ifi_change), .cb = snl_field_get_uint32 },
	{.off_in = _IN(ifi_type), .off_out = _OUT(ifi_type), .cb = snl_field_get_uint16 },
};
#undef _IN
#undef _OUT
SNL_DECLARE_PARSER(snl_rtm_link_parser, struct ifinfomsg, _fp_p_link, _nla_p_link);

/* Reduced link description: index, type, flags, MTU and name only. */
struct snl_parsed_link_simple {
	uint32_t		ifi_index;
	uint32_t		ifla_mtu;
	uint16_t		ifi_type;
	uint32_t		ifi_flags;
	char			*ifla_ifname;
};

#define _IN(_field) offsetof(struct ifinfomsg, _field)
#define _OUT(_field) offsetof(struct snl_parsed_link_simple, _field)
static struct snl_attr_parser _nla_p_link_s[] = {
	{ .type = IFLA_IFNAME, .off = _OUT(ifla_ifname), .cb = snl_attr_dup_string },
	{ .type = IFLA_MTU, .off = _OUT(ifla_mtu), .cb = snl_attr_get_uint32 },
};
static struct snl_field_parser _fp_p_link_s[] = {
	{.off_in = _IN(ifi_index), .off_out = _OUT(ifi_index), .cb = snl_field_get_uint32 },
	{.off_in = _IN(ifi_type), .off_out = _OUT(ifi_type), .cb = snl_field_get_uint16 },
	{.off_in = _IN(ifi_flags), .off_out = _OUT(ifi_flags), .cb = snl_field_get_uint32 },
};
#undef _IN
#undef _OUT
SNL_DECLARE_PARSER(snl_rtm_link_parser_simple, struct ifinfomsg, _fp_p_link_s, _nla_p_link_s);

/* Output of snl_rtm_neigh_parser (input header: struct ndmsg). */
struct snl_parsed_neigh {
	uint8_t		ndm_family;
	uint8_t		ndm_flags;
	uint16_t	ndm_state;
	uint32_t	nda_ifindex;
	uint32_t	nda_probes;
	uint32_t	ndaf_next_ts;
	struct sockaddr	*nda_dst;
	struct nlattr	*nda_lladdr;
};

#define _IN(_field) offsetof(struct ndmsg, _field)
#define _OUT(_field) offsetof(struct snl_parsed_neigh, _field)
/* Attributes nested inside NDA_FREEBSD. */
static const struct snl_attr_parser _nla_p_neigh_fbsd[] = {
	{ .type = NDAF_NEXT_STATE_TS, .off = _OUT(ndaf_next_ts), .cb = snl_attr_get_uint32 },
};
SNL_DECLARE_ATTR_PARSER(_neigh_fbsd_parser, _nla_p_neigh_fbsd);

static struct snl_attr_parser _nla_p_neigh_s[] = {
	{ .type = NDA_DST, .off = _OUT(nda_dst), .cb = snl_attr_get_ip },
	{ .type = NDA_LLADDR , .off = _OUT(nda_lladdr), .cb = snl_attr_dup_nla },
	{ .type = NDA_PROBES, .off = _OUT(nda_probes), .cb = snl_attr_get_uint32 },
	{ .type = NDA_IFINDEX, .off = _OUT(nda_ifindex), .cb = snl_attr_get_uint32 },
	{ .type = NDA_FREEBSD, .arg = &_neigh_fbsd_parser, .cb = snl_attr_get_nested },
};
/*
 * Note: the ndm_ifindex header field and the NDA_IFINDEX attribute above
 * both store into nda_ifindex.
 */
static struct snl_field_parser _fp_p_neigh_s[] = {
	{.off_in = _IN(ndm_family), .off_out = _OUT(ndm_family), .cb = snl_field_get_uint8 },
	{.off_in = _IN(ndm_flags), .off_out = _OUT(ndm_flags), .cb = snl_field_get_uint8 },
	{.off_in = _IN(ndm_state), .off_out = _OUT(ndm_state), .cb = snl_field_get_uint16 },
	{.off_in = _IN(ndm_ifindex), .off_out = _OUT(nda_ifindex), .cb = snl_field_get_uint32 },
};

/* Parser callback: scopes a link-local neighbour address to nda_ifindex. */
static inline bool
_cb_p_neigh(struct snl_state *ss __unused, void *_target)
{
	struct snl_parsed_neigh *target = (struct snl_parsed_neigh *)_target;

	finalize_sockaddr(target->nda_dst, target->nda_ifindex);
	return (true);
}
#undef _IN
#undef _OUT
SNL_DECLARE_PARSER_EXT(snl_rtm_neigh_parser, sizeof(struct ndmsg),
    sizeof(struct snl_parsed_neigh), _fp_p_neigh_s, _nla_p_neigh_s,
    _cb_p_neigh);

/* Output of snl_rtm_addr_parser (input header: struct ifaddrmsg). */
struct snl_parsed_addr {
	uint8_t		ifa_family;
	uint8_t		ifa_prefixlen;
	uint32_t	ifa_index;
	struct sockaddr	*ifa_local;
	struct sockaddr	*ifa_address;
	struct sockaddr	*ifa_broadcast;
	char		*ifa_label;
	struct ifa_cacheinfo	*ifa_cacheinfo;
	uint32_t	ifaf_vhid;
	uint32_t	ifaf_flags;
};

#define _IN(_field) offsetof(struct ifaddrmsg, _field)
#define _OUT(_field) offsetof(struct snl_parsed_addr, _field)
/* Attributes nested inside IFA_FREEBSD. */
static const struct snl_attr_parser _nla_p_addr_fbsd[] = {
	{ .type = IFAF_VHID, .off = _OUT(ifaf_vhid), .cb = snl_attr_get_uint32 },
	{ .type = IFAF_FLAGS, .off = _OUT(ifaf_flags), .cb = snl_attr_get_uint32 },
};
SNL_DECLARE_ATTR_PARSER(_addr_fbsd_parser, _nla_p_addr_fbsd);

static const struct snl_attr_parser _nla_p_addr_s[] = {
	{ .type = IFA_ADDRESS, .off = _OUT(ifa_address), .cb = snl_attr_get_ip },
	{ .type = IFA_LOCAL, .off = _OUT(ifa_local), .cb = snl_attr_get_ip },
	{ .type = IFA_LABEL, .off = _OUT(ifa_label), .cb = snl_attr_dup_string },
	{ .type = IFA_BROADCAST, .off = _OUT(ifa_broadcast), .cb = snl_attr_get_ip },
	{ .type = IFA_CACHEINFO, .off = _OUT(ifa_cacheinfo), .cb = snl_attr_dup_struct },
	{ .type = IFA_FREEBSD, .arg = &_addr_fbsd_parser, .cb = snl_attr_get_nested },
};
static const struct snl_field_parser _fp_p_addr_s[] = {
	{.off_in = _IN(ifa_family), .off_out = _OUT(ifa_family), .cb = snl_field_get_uint8 },
	{.off_in = _IN(ifa_prefixlen), .off_out = _OUT(ifa_prefixlen), .cb = snl_field_get_uint8 },
	{.off_in = _IN(ifa_index), .off_out = _OUT(ifa_index), .cb = snl_field_get_uint32 },
};

/*
 * Parser callback: scopes link-local ifa_address and ifa_local to
 * ifa_index.  ifa_broadcast is not passed through finalize_sockaddr().
 */
static inline bool
_cb_p_addr(struct snl_state *ss __unused, void *_target)
{
	struct snl_parsed_addr *target = (struct snl_parsed_addr *)_target;

	finalize_sockaddr(target->ifa_address, target->ifa_index);
	finalize_sockaddr(target->ifa_local, target->ifa_index);
	return (true);
}
#undef _IN
#undef _OUT
SNL_DECLARE_PARSER_EXT(snl_rtm_addr_parser, sizeof(struct ifaddrmsg),
    sizeof(struct snl_parsed_addr), _fp_p_addr_s, _nla_p_addr_s,
    _cb_p_addr);

/*
 * Output of snl_nhmsg_parser (input header: struct nhmsg).
 * NOTE(review): no entry in the tables below writes nha_groups or
 * nhaf_family.
 */
struct snl_parsed_nhop {
	uint32_t	nha_id;
	uint8_t		nha_blackhole;
	uint8_t		nha_groups;
	uint8_t		nhaf_knhops;
	uint8_t		nhaf_family;
	uint32_t	nha_oif;
	struct sockaddr	*nha_gw;
	uint8_t		nh_family;
	uint8_t		nh_protocol;
	uint32_t	nhaf_table;
	uint32_t	nhaf_kid;
	uint32_t	nhaf_aif;
};

#define _IN(_field) offsetof(struct nhmsg, _field)
#define _OUT(_field) offsetof(struct snl_parsed_nhop, _field)
/* Attributes nested inside NHA_FREEBSD. */
static struct snl_attr_parser _nla_p_nh_fbsd[] = {
	{ .type = NHAF_KNHOPS, .off = _OUT(nhaf_knhops), .cb = snl_attr_get_flag },
	{ .type = NHAF_TABLE, .off = _OUT(nhaf_table), .cb = snl_attr_get_uint32 },
	{ .type = NHAF_KID, .off = _OUT(nhaf_kid), .cb = snl_attr_get_uint32 },
	{ .type = NHAF_AIF, .off = _OUT(nhaf_aif), .cb = snl_attr_get_uint32 },
};
SNL_DECLARE_ATTR_PARSER(_nh_fbsd_parser, _nla_p_nh_fbsd);

static const struct snl_field_parser _fp_p_nh[] = {
	{ .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = snl_field_get_uint8 },
	{ .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = snl_field_get_uint8 },
};

static const struct snl_attr_parser _nla_p_nh[] = {
	{ .type = NHA_ID, .off = _OUT(nha_id), .cb = snl_attr_get_uint32 },
	{ .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = snl_attr_get_flag },
	{ .type = NHA_OIF, .off = _OUT(nha_oif), .cb = snl_attr_get_uint32 },
	{ .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = snl_attr_get_ip },
	{ .type = NHA_FREEBSD, .arg = &_nh_fbsd_parser, .cb = snl_attr_get_nested },
};

/* Parser callback: scopes a link-local gateway to the nexthop's nha_oif. */
static inline bool
_cb_p_nh(struct snl_state *ss __unused, void *_target)
{
	struct snl_parsed_nhop *target = (struct snl_parsed_nhop *)_target;

	finalize_sockaddr(target->nha_gw, target->nha_oif);
	return (true);
}
#undef _IN
#undef _OUT
SNL_DECLARE_PARSER_EXT(snl_nhmsg_parser, sizeof(struct nhmsg),
    sizeof(struct snl_parsed_nhop), _fp_p_nh, _nla_p_nh, _cb_p_nh);

#endif
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/devctl.h> +#include <sys/errno.h> +#include <sys/module.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <net/vnet.h> +#include <netlink/netlink.h> +#include <netlink/netlink_ctl.h> +#include <netlink/netlink_generic.h> +#include <netlink/netlink_sysevent.h> + +#define DEBUG_MOD_NAME nl_sysevent +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include <netlink/netlink_debug.h> +_DECLARE_DEBUG(LOG_INFO); + +MALLOC_DEFINE(M_NLSE, "nlsysevent", "Memory used for Netlink sysevent"); +#define NLSE_FAMILY_NAME "nlsysevent" +static uint16_t ctrl_family_id; + +#define MAX_SYSEVENT_GROUPS 64 +static struct sysevent_group { + char *name; + uint32_t id; +} sysevent_groups[MAX_SYSEVENT_GROUPS] = {}; + +static const char *devctl_systems[] = { + "ACPI", + "AEON", + "CAM", + "CARP", + "coretemp", + "DEVFS", + "device", + "ETHERNET", + "GEOM", + "HYPERV_NIC_VF", + "IFNET", + "INFINIBAND", + "KERNEL", + "nvme", + "PMU", + "RCTL", + 
"USB", + "VFS", + "VT", + "ZFS", +}; + +static void +sysevent_write(struct sysevent_group *se, const char *subsystem, const char *type, + const char *data) +{ + struct nl_writer nw; + + if (!nl_writer_group(&nw, NLMSG_LARGE, NETLINK_GENERIC, se->id, 0, + false)) { + NL_LOG(LOG_DEBUG, "error allocating group writer"); + return; + } + struct nlmsghdr hdr = { .nlmsg_type = ctrl_family_id }; + if (!nlmsg_reply(&nw, &hdr, sizeof(struct genlmsghdr))) { + return; + } + + struct genlmsghdr *ghdr = nlmsg_reserve_object(&nw, struct genlmsghdr); + if (ghdr == NULL) { + NL_LOG(LOG_DEBUG, "unable to allocate memory"); + return; + } + ghdr->version = 0; + ghdr->cmd = NLSE_CMD_NEWEVENT; + ghdr->reserved = 0; + nlattr_add_string(&nw, NLSE_ATTR_SYSTEM, se->name); + nlattr_add_string(&nw, NLSE_ATTR_SUBSYSTEM, subsystem); + nlattr_add_string(&nw, NLSE_ATTR_TYPE, type); + if (data != NULL) + nlattr_add_string(&nw, NLSE_ATTR_DATA, data); + nlmsg_end(&nw); + nlmsg_flush(&nw); +} + +static void +sysevent_new_group(size_t index, const char *name) +{ + if (index >= MAX_SYSEVENT_GROUPS) { + NL_LOG(LOG_WARNING, "impossible to add the event %s, " + "too many event groups\n", name); + return; + } + sysevent_groups[index].name = strdup(name, M_NLSE); + sysevent_groups[index].id = genl_register_group(ctrl_family_id, + sysevent_groups[index].name); +} + +static struct sysevent_group * +sysevent_get_group(const char *system) +{ + for (size_t i = 0; i < MAX_SYSEVENT_GROUPS; i++) { + if (sysevent_groups[i].name == NULL) { + sysevent_new_group(i, system); + return (&sysevent_groups[i]); + } + if (strcmp(sysevent_groups[i].name, system) == 0) + return (&sysevent_groups[i]); + } + + return (NULL); +} + +static void +sysevent_send(const char *system, const char *subsystem, const char *type, + const char *data) +{ + struct sysevent_group *se = sysevent_get_group(system); + + if (se == NULL) { + NL_LOG(LOG_WARNING, "impossible to add the event %s, " + "too many event groups\n", system); + return; + } + + 
CURVNET_SET(vnet0); + sysevent_write(se, subsystem, type, data); + CURVNET_RESTORE(); +} + +static void +nlsysevent_load(void) +{ + devctl_set_notify_hook(sysevent_send); + ctrl_family_id = genl_register_family(NLSE_FAMILY_NAME, 0, 2, NLSE_ATTR_MAX); + for (size_t i = 0; i < nitems(devctl_systems); i++) { + if (i >= MAX_SYSEVENT_GROUPS) { + NL_LOG(LOG_WARNING, "impossible to add the event %s, too many events\n", devctl_systems[i]); + continue; + } + sysevent_new_group(i, devctl_systems[i]); + } +} + +static void +nlsysevent_unload(void) +{ + devctl_unset_notify_hook(); + genl_unregister_family(ctrl_family_id); + for (size_t i = 0; i < MAX_SYSEVENT_GROUPS; i++) { + if (sysevent_groups[i].name == NULL) + break; + free(sysevent_groups[i].name, M_NLSE); + } +} + +static int +nlsysevent_loader(module_t mod __unused, int what, void *priv __unused) +{ + int err = 0; + + switch (what) { + case MOD_LOAD: + nlsysevent_load(); + break; + case MOD_UNLOAD: + nlsysevent_unload(); + break; + default: + err = EOPNOTSUPP; + break; + } + return (err); +} +static moduledata_t nlsysevent_mod = { "nlsysevent", nlsysevent_loader, NULL}; + +DECLARE_MODULE(nlsysevent, nlsysevent_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_DEPEND(nlsysevent, netlink, 1, 1, 1); +MODULE_VERSION(nlsysevent, 1); diff --git a/sys/netlink/netlink_sysevent.h b/sys/netlink/netlink_sysevent.h new file mode 100644 index 000000000000..8434a0de078e --- /dev/null +++ b/sys/netlink/netlink_sysevent.h @@ -0,0 +1,49 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Baptiste Daroussin <bapt@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
#ifndef _NETLINK_SYSEVENT_H_
#define _NETLINK_SYSEVENT_H_

/* attributes carried by an NLSE_CMD_NEWEVENT message */
enum {
	NLSE_ATTR_UNSPEC	= 0,
	NLSE_ATTR_SYSTEM	= 1,	/* string reporting the system name */
	NLSE_ATTR_SUBSYSTEM	= 2,	/* string reporting the subsystem name */
	NLSE_ATTR_TYPE		= 3,	/* string reporting the type of the event */
	NLSE_ATTR_DATA		= 4,	/* string reporting the extra data (can be null) */
	__NLSE_ATTR_MAX,
};
/* Highest valid attribute type. */
#define NLSE_ATTR_MAX (__NLSE_ATTR_MAX -1)

/* commands */
enum {
	NLSE_CMD_UNSPEC		= 0,
	NLSE_CMD_NEWEVENT	= 1,
	__NLSE_CMD_MAX,
};
/* Highest valid command. */
#define NLSE_CMD_MAX (__NLSE_CMD_MAX - 1)

#endif
Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ +#ifndef _NETLINK_NETLINK_VAR_H_ +#define _NETLINK_NETLINK_VAR_H_ + +#ifdef _KERNEL + +#include <sys/ck.h> +#include <sys/epoch.h> +#include <sys/sysctl.h> +#include <sys/taskqueue.h> +#include <net/vnet.h> + +#define NLSNDQ 65536 /* Default socket sendspace */ +#define NLRCVQ 65536 /* Default socket recvspace */ + +#define NLMBUFSIZE 2048 /* External storage size for Netlink mbufs */ + +struct ucred; + +struct nl_buf { + TAILQ_ENTRY(nl_buf) tailq; + u_int buflen; + u_int datalen; + u_int offset; + char data[]; +}; + +#define NLP_MAX_GROUPS 128 + +BITSET_DEFINE(nl_groups, NLP_MAX_GROUPS); +struct nlpcb { + struct socket *nl_socket; + struct nl_groups nl_groups; + uint32_t nl_port; + uint32_t nl_flags; + uint32_t nl_process_id; + int nl_proto; + bool nl_bound; + bool nl_task_pending; + bool nl_tx_blocked; /* No new requests accepted */ + bool nl_linux; /* true if running under compat */ + bool nl_unconstrained_vnet; /* true if running under VNET jail (or without jail) */ + bool nl_need_thread_setup; + struct taskqueue *nl_taskqueue; + struct task nl_task; + uint64_t nl_dropped_bytes; + uint64_t nl_dropped_messages; + CK_LIST_ENTRY(nlpcb) nl_next; + CK_LIST_ENTRY(nlpcb) nl_port_next; + volatile u_int nl_refcount; + struct mtx nl_lock; + struct epoch_context nl_epoch_ctx; +}; +#define sotonlpcb(so) ((struct nlpcb *)(so)->so_pcb) + +#define NLP_LOCK_INIT(_nlp) mtx_init(&((_nlp)->nl_lock), "nlp mtx", NULL, MTX_DEF) +#define NLP_LOCK_DESTROY(_nlp) mtx_destroy(&((_nlp)->nl_lock)) +#define NLP_LOCK(_nlp) mtx_lock(&((_nlp)->nl_lock)) +#define NLP_UNLOCK(_nlp) mtx_unlock(&((_nlp)->nl_lock)) + +#define ALIGNED_NL_SZ(_data) roundup2((((struct nlmsghdr *)(_data))->nlmsg_len), 16) + +/* nl_flags */ +#define NLF_CAP_ACK 0x01 /* Do not send message body with errmsg */ +#define NLF_EXT_ACK 0x02 /* Allow including extended TLVs in ack */ +#define NLF_STRICT 0x04 /* Perform strict header checks */ +#define NLF_MSG_INFO 0x08 /* Send caller info along with the notifications */ + 
+SYSCTL_DECL(_net_netlink); +SYSCTL_DECL(_net_netlink_debug); + +struct nl_control { + CK_LIST_HEAD(nl_pid_head, nlpcb) ctl_port_head; + CK_LIST_HEAD(nlpcb_head, nlpcb) ctl_pcb_head; + CK_LIST_ENTRY(nl_control) ctl_next; + struct rmlock ctl_lock; +}; +VNET_DECLARE(struct nl_control, nl_ctl); +#define V_nl_ctl VNET(nl_ctl) + +struct sockaddr_nl; +struct sockaddr; +struct nlmsghdr; + +int nl_verify_proto(int proto); +const char *nl_get_proto_name(int proto); + +extern int netlink_unloading; + +struct nl_proto_handler { + nl_handler_f cb; + const char *proto_name; +}; +extern struct nl_proto_handler *nl_handlers; + +/* netlink_domain.c */ +bool nl_send_group(struct nl_writer *); +void nl_clear_group(u_int); +void nl_osd_register(void); +void nl_osd_unregister(void); +void nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp); + +/* netlink_io.c */ +bool nl_send(struct nl_writer *, struct nlpcb *); +void nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *nlmsg, + struct nl_pstate *npt); +void nl_on_transmit(struct nlpcb *nlp); + +void nl_taskqueue_handler(void *_arg, int pending); +void nl_schedule_taskqueue(struct nlpcb *nlp); +void nl_process_receive_locked(struct nlpcb *nlp); +void nl_set_source_metadata(struct mbuf *m, int num_messages); +struct nl_buf *nl_buf_alloc(size_t len, int mflag); +void nl_buf_free(struct nl_buf *nb); + +#define MAX_FAMILIES 20 +#define MAX_GROUPS 64 + +#define MIN_GROUP_NUM 48 + +#define CTRL_FAMILY_ID 0 +#define CTRL_FAMILY_NAME "nlctrl" +#define CTRL_GROUP_ID 0 +#define CTRL_GROUP_NAME "notify" + +struct ifnet; +struct nl_parsed_link; +struct nlattr_bmask; +struct nl_pstate; + +/* Function map */ +struct nl_function_wrapper { + bool (*nlmsg_add)(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, + uint16_t flags, uint32_t len); + bool (*nlmsg_refill_buffer)(struct nl_writer *nw, size_t required_len); + bool (*nlmsg_flush)(struct nl_writer *nw); + bool (*nlmsg_end)(struct nl_writer *nw); + void 
(*nlmsg_abort)(struct nl_writer *nw); + void (*nlmsg_ignore_limit)(struct nl_writer *nw); + bool (*nl_writer_unicast)(struct nl_writer *nw, size_t size, + struct nlpcb *nlp, bool waitok); + bool (*nl_writer_group)(struct nl_writer *nw, size_t size, + uint16_t protocol, uint16_t group_id, int priv, bool waitok); + bool (*nlmsg_end_dump)(struct nl_writer *nw, int error, struct nlmsghdr *hdr); + int (*nl_modify_ifp_generic)(struct ifnet *ifp, struct nl_parsed_link *lattrs, + const struct nlattr_bmask *bm, struct nl_pstate *npt); + void (*nl_store_ifp_cookie)(struct nl_pstate *npt, struct ifnet *ifp); + struct nlpcb * (*nl_get_thread_nlp)(struct thread *td); +}; +void nl_set_functions(const struct nl_function_wrapper *nl); + + + +#endif +#endif diff --git a/sys/netlink/route/common.h b/sys/netlink/route/common.h new file mode 100644 index 000000000000..5cd3a5ee3524 --- /dev/null +++ b/sys/netlink/route/common.h @@ -0,0 +1,259 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Common defines for all parts of the NETLINK_ROUTE family + */ +#ifndef _NETLINK_ROUTE_COMMON_H_ +#define _NETLINK_ROUTE_COMMON_H_ + +/* Defined NETLINK_ROUTE messages */ +enum { + NL_RTM_BASE = 16, + NL_RTM_NEWLINK = 16, /* creates new interface */ + NL_RTM_DELLINK = 17, /* deletes matching interface */ + NL_RTM_GETLINK = 18, /* lists matching interfaces */ + NL_RTM_SETLINK = 19, /* not supported */ + NL_RTM_NEWADDR = 20, /* not supported */ + NL_RTM_DELADDR = 21, /* not supported */ + NL_RTM_GETADDR = 22, /* lists matching ifaddrs */ + NL_RTM_NEWROUTE = 24, /* adds or changes a route */ + NL_RTM_DELROUTE = 25, /* deletes matching route */ + NL_RTM_GETROUTE = 26, /* lists matching routes */ + NL_RTM_NEWNEIGH = 28, /* creates new arp/ndp entry */ + NL_RTM_DELNEIGH = 29, /* deletes matching arp/ndp entry */ + NL_RTM_GETNEIGH = 30, /* lists matching arp/ndp entry */ + NL_RTM_NEWRULE = 32, /* not supported */ + NL_RTM_DELRULE = 33, /* not supported */ + NL_RTM_GETRULE = 34, /* not supported */ + NL_RTM_NEWQDISC = 36, /* not supported */ + NL_RTM_DELQDISC = 37, /* not supported */ + NL_RTM_GETQDISC = 38, /* not supported */ + NL_RTM_NEWTCLASS = 40, /* not supported */ + NL_RTM_DELTCLASS = 41, /* not supported */ + NL_RTM_GETTCLASS = 42, /* not supported */ + NL_RTM_NEWTFILTER = 44, /* not supported */ + NL_RTM_DELTFILTER = 45, /* not supported */ + NL_RTM_GETTFILTER = 46, /* not supported */ + NL_RTM_NEWACTION = 48, /* not supported 
*/ + NL_RTM_DELACTION = 49, /* not supported */ + NL_RTM_GETACTION = 50, /* not supported */ + NL_RTM_NEWPREFIX = 52, /* not supported */ + NL_RTM_GETMULTICAST = 58, /* not supported */ + NL_RTM_GETANYCAST = 62, /* not supported */ + NL_RTM_NEWNEIGHTBL = 64, /* not supported */ + NL_RTM_GETNEIGHTBL = 66, /* not supported */ + NL_RTM_SETNEIGHTBL = 67, /* not supported */ + NL_RTM_NEWNDUSEROPT = 68, /* not supported */ + NL_RTM_NEWADDRLABEL = 72, /* not supported */ + NL_RTM_DELADDRLABEL = 73, /* not supported */ + NL_RTM_GETADDRLABEL = 74, /* not supported */ + NL_RTM_GETDCB = 78, /* not supported */ + NL_RTM_SETDCB = 79, /* not supported */ + NL_RTM_NEWNETCONF = 80, /* not supported */ + NL_RTM_GETNETCONF = 82, /* not supported */ + NL_RTM_NEWMDB = 84, /* not supported */ + NL_RTM_DELMDB = 85, /* not supported */ + NL_RTM_GETMDB = 86, /* not supported */ + NL_RTM_NEWNSID = 88, /* not supported */ + NL_RTM_DELNSID = 89, /* not supported */ + NL_RTM_GETNSID = 90, /* not supported */ + NL_RTM_NEWSTATS = 92, /* not supported */ + NL_RTM_GETSTATS = 94, /* not supported */ + NL_RTM_NEWNEXTHOP = 104, /* creates new user nexthop */ + NL_RTM_DELNEXTHOP = 105, /* deletes matching nexthop */ + NL_RTM_GETNEXTHOP = 106, /* lists created user nexthops */ + __NL_RTM_MAX, +}; +/* __NL_RTM_MAX rounded up to a multiple of 4, minus one */ +#define NL_RTM_MAX (((__NL_RTM_MAX + 3) & ~3) - 1) + +#ifndef _KERNEL +/* + * RTM_* namespace clashes with BSD rtsock namespace. + * Use NL_RTM_ prefix in the kernel and map it to RTM_ + * for userland. 
+ */ +#define RTM_BASE NL_RTM_BASE +#define RTM_NEWLINK NL_RTM_NEWLINK +#define RTM_DELLINK NL_RTM_DELLINK +#define RTM_GETLINK NL_RTM_GETLINK +#define RTM_SETLINK NL_RTM_SETLINK +#define RTM_NEWADDR NL_RTM_NEWADDR +#define RTM_DELADDR NL_RTM_DELADDR +#define RTM_GETADDR NL_RTM_GETADDR +#define RTM_NEWROUTE NL_RTM_NEWROUTE +#define RTM_DELROUTE NL_RTM_DELROUTE +#define RTM_GETROUTE NL_RTM_GETROUTE +#define RTM_NEWNEIGH NL_RTM_NEWNEIGH +#define RTM_DELNEIGH NL_RTM_DELNEIGH +#define RTM_GETNEIGH NL_RTM_GETNEIGH +#define RTM_NEWRULE NL_RTM_NEWRULE +#define RTM_DELRULE NL_RTM_DELRULE +#define RTM_GETRULE NL_RTM_GETRULE +#define RTM_NEWQDISC NL_RTM_NEWQDISC +#define RTM_DELQDISC NL_RTM_DELQDISC +#define RTM_GETQDISC NL_RTM_GETQDISC +#define RTM_NEWTCLASS NL_RTM_NEWTCLASS +#define RTM_DELTCLASS NL_RTM_DELTCLASS +#define RTM_GETTCLASS NL_RTM_GETTCLASS +#define RTM_NEWTFILTER NL_RTM_NEWTFILTER +#define RTM_DELTFILTER NL_RTM_DELTFILTER +#define RTM_GETTFILTER NL_RTM_GETTFILTER +#define RTM_NEWACTION NL_RTM_NEWACTION +#define RTM_DELACTION NL_RTM_DELACTION +#define RTM_GETACTION NL_RTM_GETACTION +#define RTM_NEWPREFIX NL_RTM_NEWPREFIX +#define RTM_GETMULTICAST NL_RTM_GETMULTICAST +#define RTM_GETANYCAST NL_RTM_GETANYCAST +#define RTM_NEWNEIGHTBL NL_RTM_NEWNEIGHTBL +#define RTM_GETNEIGHTBL NL_RTM_GETNEIGHTBL +#define RTM_SETNEIGHTBL NL_RTM_SETNEIGHTBL +#define RTM_NEWNDUSEROPT NL_RTM_NEWNDUSEROPT +#define RTM_NEWADDRLABEL NL_RTM_NEWADDRLABEL +#define RTM_DELADDRLABEL NL_RTM_DELADDRLABEL +#define RTM_GETADDRLABEL NL_RTM_GETADDRLABEL +#define RTM_GETDCB NL_RTM_GETDCB +#define RTM_SETDCB NL_RTM_SETDCB +#define RTM_NEWNETCONF NL_RTM_NEWNETCONF +#define RTM_GETNETCONF NL_RTM_GETNETCONF +#define RTM_NEWMDB NL_RTM_NEWMDB +#define RTM_DELMDB NL_RTM_DELMDB +#define RTM_GETMDB NL_RTM_GETMDB +#define RTM_NEWNSID NL_RTM_NEWNSID +#define RTM_DELNSID NL_RTM_DELNSID +#define RTM_GETNSID NL_RTM_GETNSID +#define RTM_NEWSTATS NL_RTM_NEWSTATS +#define RTM_GETSTATS NL_RTM_GETSTATS +#define 
RTM_NEWNEXTHOP NL_RTM_NEWNEXTHOP
+#define RTM_DELNEXTHOP NL_RTM_DELNEXTHOP
+#define RTM_GETNEXTHOP NL_RTM_GETNEXTHOP
+#define __RTM_MAX __NL_RTM_MAX
+
+/*
+ * Userland alias of NL_RTM_MAX. It uses the same expression as NL_RTM_MAX
+ * (round __RTM_MAX up to a multiple of 4, then subtract one) so that both
+ * names evaluate to the same value, and so that this userland-only section
+ * does not depend on the roundup2() macro being in scope.
+ */
+#define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1)
+
+/* rtnetlink multicast groups - backwards compatibility for userspace */
+#define RTMGRP_LINK 0x01
+#define RTMGRP_NOTIFY 0x02
+#define RTMGRP_NEIGH 0x04
+#define RTMGRP_TC 0x08
+
+#define RTMGRP_IPV4_IFADDR 0x10
+#define RTMGRP_IPV4_MROUTE 0x20
+#define RTMGRP_IPV4_ROUTE 0x40
+#define RTMGRP_IPV4_RULE 0x80
+
+#define RTMGRP_IPV6_IFADDR 0x100
+#define RTMGRP_IPV6_MROUTE 0x200
+#define RTMGRP_IPV6_ROUTE 0x400
+#define RTMGRP_IPV6_IFINFO 0x800
+
+#define RTMGRP_DECnet_IFADDR 0x1000
+#define RTMGRP_DECnet_ROUTE 0x4000
+
+#define RTMGRP_IPV6_PREFIX 0x20000
+#endif
+
+/* Defined NETLINK_ROUTE multicast groups */
+enum rtnetlink_groups {
+	RTNLGRP_NONE,
+#define RTNLGRP_NONE RTNLGRP_NONE
+	RTNLGRP_LINK,
+#define RTNLGRP_LINK RTNLGRP_LINK
+	RTNLGRP_NOTIFY,
+#define RTNLGRP_NOTIFY RTNLGRP_NOTIFY
+	RTNLGRP_NEIGH,
+#define RTNLGRP_NEIGH RTNLGRP_NEIGH
+	RTNLGRP_TC,
+#define RTNLGRP_TC RTNLGRP_TC
+	RTNLGRP_IPV4_IFADDR,
+#define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR
+	RTNLGRP_IPV4_MROUTE,
+#define RTNLGRP_IPV4_MROUTE RTNLGRP_IPV4_MROUTE
+	RTNLGRP_IPV4_ROUTE,
+#define RTNLGRP_IPV4_ROUTE RTNLGRP_IPV4_ROUTE
+	RTNLGRP_IPV4_RULE,
+#define RTNLGRP_IPV4_RULE RTNLGRP_IPV4_RULE
+	RTNLGRP_IPV6_IFADDR,
+#define RTNLGRP_IPV6_IFADDR RTNLGRP_IPV6_IFADDR
+	RTNLGRP_IPV6_MROUTE,
+#define RTNLGRP_IPV6_MROUTE RTNLGRP_IPV6_MROUTE
+	RTNLGRP_IPV6_ROUTE,
+#define RTNLGRP_IPV6_ROUTE RTNLGRP_IPV6_ROUTE
+	RTNLGRP_IPV6_IFINFO,
+#define RTNLGRP_IPV6_IFINFO RTNLGRP_IPV6_IFINFO
+	RTNLGRP_DECnet_IFADDR,
+#define RTNLGRP_DECnet_IFADDR RTNLGRP_DECnet_IFADDR
+	RTNLGRP_NOP2,
+	RTNLGRP_DECnet_ROUTE,
+#define RTNLGRP_DECnet_ROUTE RTNLGRP_DECnet_ROUTE
+	RTNLGRP_DECnet_RULE,
+#define RTNLGRP_DECnet_RULE RTNLGRP_DECnet_RULE
+	RTNLGRP_NOP4,
+	RTNLGRP_IPV6_PREFIX,
+#define RTNLGRP_IPV6_PREFIX RTNLGRP_IPV6_PREFIX
+	
RTNLGRP_IPV6_RULE, +#define RTNLGRP_IPV6_RULE RTNLGRP_IPV6_RULE + RTNLGRP_ND_USEROPT, +#define RTNLGRP_ND_USEROPT RTNLGRP_ND_USEROPT + RTNLGRP_PHONET_IFADDR, +#define RTNLGRP_PHONET_IFADDR RTNLGRP_PHONET_IFADDR + RTNLGRP_PHONET_ROUTE, +#define RTNLGRP_PHONET_ROUTE RTNLGRP_PHONET_ROUTE + RTNLGRP_DCB, +#define RTNLGRP_DCB RTNLGRP_DCB + RTNLGRP_IPV4_NETCONF, +#define RTNLGRP_IPV4_NETCONF RTNLGRP_IPV4_NETCONF + RTNLGRP_IPV6_NETCONF, +#define RTNLGRP_IPV6_NETCONF RTNLGRP_IPV6_NETCONF + RTNLGRP_MDB, +#define RTNLGRP_MDB RTNLGRP_MDB + RTNLGRP_MPLS_ROUTE, +#define RTNLGRP_MPLS_ROUTE RTNLGRP_MPLS_ROUTE + RTNLGRP_NSID, +#define RTNLGRP_NSID RTNLGRP_NSID + RTNLGRP_MPLS_NETCONF, +#define RTNLGRP_MPLS_NETCONF RTNLGRP_MPLS_NETCONF + RTNLGRP_IPV4_MROUTE_R, +#define RTNLGRP_IPV4_MROUTE_R RTNLGRP_IPV4_MROUTE_R + RTNLGRP_IPV6_MROUTE_R, +#define RTNLGRP_IPV6_MROUTE_R RTNLGRP_IPV6_MROUTE_R + RTNLGRP_NEXTHOP, +#define RTNLGRP_NEXTHOP RTNLGRP_NEXTHOP + RTNLGRP_BRVLAN, +#define RTNLGRP_BRVLAN RTNLGRP_BRVLAN + __RTNLGRP_MAX +}; +#define RTNLGRP_MAX (__RTNLGRP_MAX - 1) + + +/* Defined NETLINK_ROUTE virtual multicast address families */ +#define RTNL_FAMILY_IPMR 128 /* Not supported */ +#define RTNL_FAMILY_IP6MR 129 /* Not supported */ +#define RTNL_FAMILY_MAX 129 + +#endif + diff --git a/sys/netlink/route/iface.c b/sys/netlink/route/iface.c new file mode 100644 index 000000000000..8b871576d0b2 --- /dev/null +++ b/sys/netlink/route/iface.c @@ -0,0 +1,1530 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +#include "opt_inet.h" +#include "opt_inet6.h" +#include <sys/types.h> +#include <sys/eventhandler.h> +#include <sys/kernel.h> +#include <sys/jail.h> +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/syslog.h> + +#include <net/if.h> +#include <net/if_dl.h> +#include <net/if_media.h> +#include <net/if_var.h> +#include <net/if_clone.h> +#include <net/route.h> +#include <net/route/nhop.h> +#include <net/route/route_ctl.h> +#include <netinet/in_var.h> +#include <netinet6/in6_var.h> +#include <netinet6/scope6_var.h> /* scope deembedding */ +#include <netlink/netlink.h> +#include <netlink/netlink_ctl.h> +#include <netlink/netlink_route.h> +#include <netlink/route/route_var.h> + +#define DEBUG_MOD_NAME nl_iface +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include <netlink/netlink_debug.h> +_DECLARE_DEBUG(LOG_INFO); + +struct netlink_walkargs { + struct nl_writer *nw; + struct nlmsghdr hdr; + struct nlpcb *so; + struct ucred *cred; + 
uint32_t fibnum; + int family; + int error; + int count; + int dumped; +}; + +static eventhandler_tag ifdetach_event, ifattach_event, iflink_event, ifaddr_event; + +static SLIST_HEAD(, nl_cloner) nl_cloners = SLIST_HEAD_INITIALIZER(nl_cloners); + +static struct sx rtnl_cloner_lock; +SX_SYSINIT(rtnl_cloner_lock, &rtnl_cloner_lock, "rtnl cloner lock"); + +/* These are external hooks for CARP. */ +extern int (*carp_get_vhid_p)(struct ifaddr *); + +/* + * RTM_GETLINK request + * sendto(3, {{len=32, type=RTM_GETLINK, flags=NLM_F_REQUEST|NLM_F_DUMP, seq=1641940952, pid=0}, + * {ifi_family=AF_INET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}, 32, 0, NULL, 0) = 32 + * + * Reply: + * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_ETHER, ifi_index=if_nametoindex("enp0s31f6"), ifi_flags=IFF_UP|IFF_BROADCAST|IFF_RUNNING|IFF_MULTICAST|IFF_LOWER_UP, ifi_change=0}, +{{nla_len=10, nla_type=IFLA_ADDRESS}, "\xfe\x54\x00\x52\x3e\x90"} + +[ +{{nla_len=14, nla_type=IFLA_IFNAME}, "enp0s31f6"}, +{{nla_len=8, nla_type=IFLA_TXQLEN}, 1000}, +{{nla_len=5, nla_type=IFLA_OPERSTATE}, 6}, +{{nla_len=5, nla_type=IFLA_LINKMODE}, 0}, +{{nla_len=8, nla_type=IFLA_MTU}, 1500}, +{{nla_len=8, nla_type=IFLA_MIN_MTU}, 68}, + {{nla_len=8, nla_type=IFLA_MAX_MTU}, 9000}, +{{nla_len=8, nla_type=IFLA_GROUP}, 0}, +{{nla_len=8, nla_type=IFLA_PROMISCUITY}, 0}, +{{nla_len=8, nla_type=IFLA_NUM_TX_QUEUES}, 1}, +{{nla_len=8, nla_type=IFLA_GSO_MAX_SEGS}, 65535}, +{{nla_len=8, nla_type=IFLA_GSO_MAX_SIZE}, 65536}, +{{nla_len=8, nla_type=IFLA_NUM_RX_QUEUES}, 1}, +{{nla_len=5, nla_type=IFLA_CARRIER}, 1}, +{{nla_len=13, nla_type=IFLA_QDISC}, "fq_codel"}, +{{nla_len=8, nla_type=IFLA_CARRIER_CHANGES}, 2}, +{{nla_len=5, nla_type=IFLA_PROTO_DOWN}, 0}, +{{nla_len=8, nla_type=IFLA_CARRIER_UP_COUNT}, 1}, +{{nla_len=8, nla_type=IFLA_CARRIER_DOWN_COUNT}, 1}, + */ + +struct if_state { + uint8_t ifla_operstate; + uint8_t ifla_carrier; +}; + +static void +get_operstate_ether(if_t ifp, struct if_state *pstate) +{ + struct 
ifmediareq ifmr = {}; + int error; + error = if_ioctl(ifp, SIOCGIFMEDIA, (void *)&ifmr); + + if (error != 0) { + NL_LOG(LOG_DEBUG, "error calling SIOCGIFMEDIA on %s: %d", + if_name(ifp), error); + return; + } + + switch (IFM_TYPE(ifmr.ifm_active)) { + case IFM_ETHER: + if (ifmr.ifm_status & IFM_ACTIVE) { + pstate->ifla_carrier = 1; + if (if_getflags(ifp) & IFF_MONITOR) + pstate->ifla_operstate = IF_OPER_DORMANT; + else + pstate->ifla_operstate = IF_OPER_UP; + } else + pstate->ifla_operstate = IF_OPER_DOWN; + } +} + +static bool +get_stats(struct nl_writer *nw, if_t ifp) +{ + struct rtnl_link_stats64 *stats; + + int nla_len = sizeof(struct nlattr) + sizeof(*stats); + struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); + if (nla == NULL) + return (false); + nla->nla_type = IFLA_STATS64; + nla->nla_len = nla_len; + stats = (struct rtnl_link_stats64 *)(nla + 1); + + stats->rx_packets = if_getcounter(ifp, IFCOUNTER_IPACKETS); + stats->tx_packets = if_getcounter(ifp, IFCOUNTER_OPACKETS); + stats->rx_bytes = if_getcounter(ifp, IFCOUNTER_IBYTES); + stats->tx_bytes = if_getcounter(ifp, IFCOUNTER_OBYTES); + stats->rx_errors = if_getcounter(ifp, IFCOUNTER_IERRORS); + stats->tx_errors = if_getcounter(ifp, IFCOUNTER_OERRORS); + stats->rx_dropped = if_getcounter(ifp, IFCOUNTER_IQDROPS); + stats->tx_dropped = if_getcounter(ifp, IFCOUNTER_OQDROPS); + stats->multicast = if_getcounter(ifp, IFCOUNTER_IMCASTS); + stats->rx_nohandler = if_getcounter(ifp, IFCOUNTER_NOPROTO); + + return (true); +} + +static void +get_operstate(if_t ifp, struct if_state *pstate) +{ + pstate->ifla_operstate = IF_OPER_UNKNOWN; + pstate->ifla_carrier = 0; /* no carrier */ + + switch (if_gettype(ifp)) { + case IFT_ETHER: + case IFT_L2VLAN: + get_operstate_ether(ifp, pstate); + break; + default: + /* Map admin state to the operstate */ + if (if_getflags(ifp) & IFF_UP) { + pstate->ifla_operstate = IF_OPER_UP; + pstate->ifla_carrier = 1; + } else + pstate->ifla_operstate = IF_OPER_DOWN; + break; 
+ } +} + +static void +get_hwaddr(struct nl_writer *nw, if_t ifp) +{ + struct ifreq ifr = {}; + + if (if_gethwaddr(ifp, &ifr) == 0) { + nlattr_add(nw, IFLAF_ORIG_HWADDR, if_getaddrlen(ifp), + ifr.ifr_addr.sa_data); + } +} + +static unsigned +ifp_flags_to_netlink(const if_t ifp) +{ + return (if_getflags(ifp) | if_getdrvflags(ifp)); +} + +#define LLADDR_CONST(s) ((const void *)((s)->sdl_data + (s)->sdl_nlen)) +static bool +dump_sa(struct nl_writer *nw, int attr, const struct sockaddr *sa) +{ + uint32_t addr_len = 0; + const void *addr_data = NULL; +#ifdef INET6 + struct in6_addr addr6; +#endif + + if (sa == NULL) + return (true); + + switch (sa->sa_family) { +#ifdef INET + case AF_INET: + addr_len = sizeof(struct in_addr); + addr_data = &((const struct sockaddr_in *)sa)->sin_addr; + break; +#endif +#ifdef INET6 + case AF_INET6: + in6_splitscope(&((const struct sockaddr_in6 *)sa)->sin6_addr, &addr6, &addr_len); + addr_len = sizeof(struct in6_addr); + addr_data = &addr6; + break; +#endif + case AF_LINK: + addr_len = ((const struct sockaddr_dl *)sa)->sdl_alen; + addr_data = LLADDR_CONST((const struct sockaddr_dl *)sa); + break; + case AF_UNSPEC: + /* Ignore empty SAs without warning */ + return (true); + default: + NL_LOG(LOG_DEBUG2, "unsupported family: %d, skipping", sa->sa_family); + return (true); + } + + return (nlattr_add(nw, attr, addr_len, addr_data)); +} + +static bool +dump_iface_caps(struct nl_writer *nw, struct ifnet *ifp) +{ + int off = nlattr_add_nested(nw, IFLAF_CAPS); + uint32_t active_caps[roundup2(IFCAP_B_SIZE, 32) / 32] = {}; + uint32_t all_caps[roundup2(IFCAP_B_SIZE, 32) / 32] = {}; + + MPASS(sizeof(active_caps) >= 8); + MPASS(sizeof(all_caps) >= 8); + + if (off == 0) + return (false); + + active_caps[0] = (uint32_t)if_getcapabilities(ifp); + all_caps[0] = (uint32_t)if_getcapenable(ifp); + active_caps[1] = (uint32_t)if_getcapabilities2(ifp); + all_caps[1] = (uint32_t)if_getcapenable2(ifp); + + nlattr_add_u32(nw, NLA_BITSET_SIZE, IFCAP_B_SIZE); + 
nlattr_add(nw, NLA_BITSET_MASK, sizeof(all_caps), all_caps); + nlattr_add(nw, NLA_BITSET_VALUE, sizeof(active_caps), active_caps); + + nlattr_set_len(nw, off); + + return (true); +} + +/* + * Dumps interface state, properties and metrics. + * @nw: message writer + * @ifp: target interface + * @hdr: template header + * @if_flags_mask: changed if_[drv]_flags bitmask + * + * This function is called without epoch and MAY sleep. + */ +static bool +dump_iface(struct nl_writer *nw, if_t ifp, const struct nlmsghdr *hdr, + int if_flags_mask) +{ + struct epoch_tracker et; + struct ifinfomsg *ifinfo; + + NL_LOG(LOG_DEBUG3, "dumping interface %s data", if_name(ifp)); + + if (!nlmsg_reply(nw, hdr, sizeof(struct ifinfomsg))) + goto enomem; + + ifinfo = nlmsg_reserve_object(nw, struct ifinfomsg); + ifinfo->ifi_family = AF_UNSPEC; + ifinfo->__ifi_pad = 0; + ifinfo->ifi_type = if_gettype(ifp); + ifinfo->ifi_index = if_getindex(ifp); + ifinfo->ifi_flags = ifp_flags_to_netlink(ifp); + ifinfo->ifi_change = if_flags_mask; + + struct if_state ifs = {}; + get_operstate(ifp, &ifs); + + if (ifs.ifla_operstate == IF_OPER_UP) + ifinfo->ifi_flags |= IFF_LOWER_UP; + + nlattr_add_string(nw, IFLA_IFNAME, if_name(ifp)); + nlattr_add_u8(nw, IFLA_OPERSTATE, ifs.ifla_operstate); + nlattr_add_u8(nw, IFLA_CARRIER, ifs.ifla_carrier); + +/* + nlattr_add_u8(nw, IFLA_PROTO_DOWN, val); + nlattr_add_u8(nw, IFLA_LINKMODE, val); +*/ + if (if_getaddrlen(ifp) != 0) { + struct ifaddr *ifa; + struct ifa_iter it; + + NET_EPOCH_ENTER(et); + ifa = ifa_iter_start(ifp, &it); + if (ifa != NULL) + dump_sa(nw, IFLA_ADDRESS, ifa->ifa_addr); + ifa_iter_finish(&it); + NET_EPOCH_EXIT(et); + } + + if ((if_getbroadcastaddr(ifp) != NULL)) { + nlattr_add(nw, IFLA_BROADCAST, if_getaddrlen(ifp), + if_getbroadcastaddr(ifp)); + } + + nlattr_add_u32(nw, IFLA_MTU, if_getmtu(ifp)); +/* + nlattr_add_u32(nw, IFLA_MIN_MTU, 60); + nlattr_add_u32(nw, IFLA_MAX_MTU, 9000); + nlattr_add_u32(nw, IFLA_GROUP, 0); +*/ + + if (if_getdescr(ifp) != 
NULL) + nlattr_add_string(nw, IFLA_IFALIAS, if_getdescr(ifp)); + + /* Store FreeBSD-specific attributes */ + int off = nlattr_add_nested(nw, IFLA_FREEBSD); + if (off != 0) { + get_hwaddr(nw, ifp); + dump_iface_caps(nw, ifp); + + nlattr_set_len(nw, off); + } + + get_stats(nw, ifp); + + uint32_t val = (if_getflags(ifp) & IFF_PROMISC) != 0; + nlattr_add_u32(nw, IFLA_PROMISCUITY, val); + + ifc_dump_ifp_nl(ifp, nw); + + if (nlmsg_end(nw)) + return (true); + +enomem: + NL_LOG(LOG_DEBUG, "unable to dump interface %s state (ENOMEM)", if_name(ifp)); + nlmsg_abort(nw); + return (false); +} + +static bool +check_ifmsg(void *hdr, struct nl_pstate *npt) +{ + struct ifinfomsg *ifm = hdr; + + if (ifm->__ifi_pad != 0 || ifm->ifi_type != 0 || + ifm->ifi_flags != 0 || ifm->ifi_change != 0) { + nlmsg_report_err_msg(npt, + "strict checking: non-zero values in ifinfomsg header"); + return (false); + } + + return (true); +} + +#define _IN(_field) offsetof(struct ifinfomsg, _field) +#define _OUT(_field) offsetof(struct nl_parsed_link, _field) +static const struct nlfield_parser nlf_p_if[] = { + { .off_in = _IN(ifi_type), .off_out = _OUT(ifi_type), .cb = nlf_get_u16 }, + { .off_in = _IN(ifi_index), .off_out = _OUT(ifi_index), .cb = nlf_get_u32 }, + { .off_in = _IN(ifi_flags), .off_out = _OUT(ifi_flags), .cb = nlf_get_u32 }, + { .off_in = _IN(ifi_change), .off_out = _OUT(ifi_change), .cb = nlf_get_u32 }, +}; + +static const struct nlattr_parser nla_p_linfo[] = { + { .type = IFLA_INFO_KIND, .off = _OUT(ifla_cloner), .cb = nlattr_get_stringn }, + { .type = IFLA_INFO_DATA, .off = _OUT(ifla_idata), .cb = nlattr_get_nla }, +}; +NL_DECLARE_ATTR_PARSER(linfo_parser, nla_p_linfo); + +static const struct nlattr_parser nla_p_if[] = { + { .type = IFLA_IFNAME, .off = _OUT(ifla_ifname), .cb = nlattr_get_string }, + { .type = IFLA_MTU, .off = _OUT(ifla_mtu), .cb = nlattr_get_uint32 }, + { .type = IFLA_LINK, .off = _OUT(ifla_link), .cb = nlattr_get_uint32 }, + { .type = IFLA_LINKINFO, .arg = 
&linfo_parser, .cb = nlattr_get_nested }, + { .type = IFLA_IFALIAS, .off = _OUT(ifla_ifalias), .cb = nlattr_get_string }, + { .type = IFLA_GROUP, .off = _OUT(ifla_group), .cb = nlattr_get_string }, + { .type = IFLA_ALT_IFNAME, .off = _OUT(ifla_ifname), .cb = nlattr_get_string }, +}; +#undef _IN +#undef _OUT +NL_DECLARE_STRICT_PARSER(ifmsg_parser, struct ifinfomsg, check_ifmsg, nlf_p_if, nla_p_if); + +static bool +match_iface(if_t ifp, void *_arg) +{ + struct nl_parsed_link *attrs = (struct nl_parsed_link *)_arg; + + if (attrs->ifi_index != 0 && attrs->ifi_index != if_getindex(ifp)) + return (false); + if (attrs->ifi_type != 0 && attrs->ifi_index != if_gettype(ifp)) + return (false); + if (attrs->ifla_ifname != NULL && strcmp(attrs->ifla_ifname, if_name(ifp))) + return (false); + /* TODO: add group match */ + + return (true); +} + +static int +dump_cb(if_t ifp, void *_arg) +{ + struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg; + if (!dump_iface(wa->nw, ifp, &wa->hdr, 0)) + return (ENOMEM); + return (0); +} + +/* + * {nlmsg_len=52, nlmsg_type=RTM_GETLINK, nlmsg_flags=NLM_F_REQUEST, nlmsg_seq=1662842818, nlmsg_pid=0}, + * {ifi_family=AF_PACKET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}, + * [ + * [{nla_len=10, nla_type=IFLA_IFNAME}, "vnet9"], + * [{nla_len=8, nla_type=IFLA_EXT_MASK}, RTEXT_FILTER_VF] + * ] + */ +static int +rtnl_handle_getlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) +{ + struct epoch_tracker et; + if_t ifp; + int error = 0; + + struct nl_parsed_link attrs = {}; + error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs); + if (error != 0) + return (error); + + struct netlink_walkargs wa = { + .so = nlp, + .nw = npt->nw, + .hdr.nlmsg_pid = hdr->nlmsg_pid, + .hdr.nlmsg_seq = hdr->nlmsg_seq, + .hdr.nlmsg_flags = hdr->nlmsg_flags, + .hdr.nlmsg_type = NL_RTM_NEWLINK, + }; + + /* Fast track for an interface w/ explicit name or index match */ + if ((attrs.ifi_index != 0) || (attrs.ifla_ifname != NULL)) 
{ + if (attrs.ifi_index != 0) { + NLP_LOG(LOG_DEBUG3, nlp, "fast track -> searching index %u", + attrs.ifi_index); + NET_EPOCH_ENTER(et); + ifp = ifnet_byindex_ref(attrs.ifi_index); + NET_EPOCH_EXIT(et); + } else { + NLP_LOG(LOG_DEBUG3, nlp, "fast track -> searching name %s", + attrs.ifla_ifname); + ifp = ifunit_ref(attrs.ifla_ifname); + } + + if (ifp != NULL) { + if (match_iface(ifp, &attrs)) { + if (!dump_iface(wa.nw, ifp, &wa.hdr, 0)) + error = ENOMEM; + } else + error = ENODEV; + if_rele(ifp); + } else + error = ENODEV; + return (error); + } + + /* Always treat non-direct-match as a multipart message */ + wa.hdr.nlmsg_flags |= NLM_F_MULTI; + + /* + * Fetching some link properties require performing ioctl's that may be blocking. + * Address it by saving referenced pointers of the matching links, + * exiting from epoch and going through the list one-by-one. + */ + + NL_LOG(LOG_DEBUG2, "Start dump"); + if_foreach_sleep(match_iface, &attrs, dump_cb, &wa); + NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa.count, wa.dumped); + + if (!nlmsg_end_dump(wa.nw, error, &wa.hdr)) { + NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); + return (ENOMEM); + } + + return (error); +} + +/* + * sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[ + * {nlmsg_len=60, nlmsg_type=RTM_NEWLINK, nlmsg_flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, nlmsg_seq=1662715618, nlmsg_pid=0}, + * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}, + * {nla_len=11, nla_type=IFLA_IFNAME}, "dummy0"], + * [ + * {nla_len=16, nla_type=IFLA_LINKINFO}, + * [ + * {nla_len=9, nla_type=IFLA_INFO_KIND}, "dummy"... 
+ * ]
+ * ]
+ */
+/*
+ * RTM_DELLINK handler: looks the interface up by ifi_index (taking a
+ * reference) and passes its name to if_clone_destroy() while holding
+ * ifnet_detach_sxlock.  Returns the parser error, ENOENT when no interface
+ * has that index, otherwise the if_clone_destroy() result.
+ * NOTE(review): only ifi_index is used for the lookup; a request carrying
+ * just IFLA_IFNAME is not resolved by name here.
+ */
+static int
+rtnl_handle_dellink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct epoch_tracker et;
+	if_t ifp;
+	int error;
+
+	struct nl_parsed_link attrs = {};
+	error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	NET_EPOCH_ENTER(et);
+	ifp = ifnet_byindex_ref(attrs.ifi_index);
+	NET_EPOCH_EXIT(et);
+	if (ifp == NULL) {
+		NLP_LOG(LOG_DEBUG, nlp, "unable to find interface %u", attrs.ifi_index);
+		return (ENOENT);
+	}
+	NLP_LOG(LOG_DEBUG3, nlp, "mapped ifindex %u to %s", attrs.ifi_index, if_name(ifp));
+
+	sx_xlock(&ifnet_detach_sxlock);
+	error = if_clone_destroy(if_name(ifp));
+	sx_xunlock(&ifnet_detach_sxlock);
+
+	NLP_LOG(LOG_DEBUG2, nlp, "deleting interface %s returned %d", if_name(ifp), error);
+
+	if_rele(ifp);	/* drop the reference taken by ifnet_byindex_ref() */
+	return (error);
+}
+
+/*
+ * New link:
+ * type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1668185590, pid=0},
+ * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}
+ * [
+ * {{nla_len=8, nla_type=IFLA_MTU}, 123},
+ * {{nla_len=10, nla_type=IFLA_IFNAME}, "vlan1"},
+ * {{nla_len=24, nla_type=IFLA_LINKINFO},
+ * [
+ * {{nla_len=8, nla_type=IFLA_INFO_KIND}, "vlan"...},
+ * {{nla_len=12, nla_type=IFLA_INFO_DATA}, "\x06\x00\x01\x00\x7b\x00\x00\x00"}]}]}
+ *
+ * Update link:
+ * type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=1668185923, pid=0},
+ * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=if_nametoindex("lo"), ifi_flags=0, ifi_change=0},
+ * {{nla_len=8, nla_type=IFLA_MTU}, 123}}
+ *
+ *
+ * Check command availability:
+ * type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=0, pid=0},
+ * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}
+ */
+
+/*
+ * Creation path of RTM_NEWLINK.  Both the interface name (ifla_ifname) and
+ * the cloner name (ifla_cloner) must be non-empty, otherwise EINVAL is
+ * returned.  When ifc_create_ifp_nl() returns true and leaves ifd.error at
+ * 0, the new interface is stored as the reply cookie.  Returns ifd.error.
+ */
+static int
+create_link(struct nlmsghdr *hdr, struct nl_parsed_link *lattrs,
+    struct nlattr_bmask *bm, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	if (lattrs->ifla_ifname == NULL || strlen(lattrs->ifla_ifname) == 0) {
+		NLMSG_REPORT_ERR_MSG(npt, "empty IFLA_IFNAME attribute");
+		return (EINVAL);
+	}
+	if (lattrs->ifla_cloner == NULL || strlen(lattrs->ifla_cloner) == 0) {
+		NLMSG_REPORT_ERR_MSG(npt, "empty IFLA_INFO_KIND attribute");
+		return (EINVAL);
+	}
+
+	struct ifc_data_nl ifd = {
+		.flags = IFC_F_CREATE,
+		.lattrs = lattrs,
+		.bm = bm,
+		.npt = npt,
+	};
+	if (ifc_create_ifp_nl(lattrs->ifla_ifname, &ifd) && ifd.error == 0)
+		nl_store_ifp_cookie(npt, ifd.ifp);
+
+	return (ifd.error);
+}
+/*
+ * Modification path of RTM_NEWLINK.  The interface is looked up by
+ * ifi_index when it is non-zero, otherwise by ifla_ifname; EPERM is returned
+ * when neither is given and ENOENT when the lookup fails.  The request is
+ * offered to ifc_modify_ifp_nl() first and, if that returns false, applied
+ * with nl_modify_ifp_generic().  Returns ifd.error.
+ */
+static int
+modify_link(struct nlmsghdr *hdr, struct nl_parsed_link *lattrs,
+    struct nlattr_bmask *bm, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	if_t ifp = NULL;
+	struct epoch_tracker et;
+
+	if (lattrs->ifi_index == 0 && lattrs->ifla_ifname == NULL) {
+		/*
+		 * Applications like ip(8) verify RTM_NEWLINK command
+		 * existence by calling it with empty arguments. Always
+		 * return "innocent" error in that case.
+		 */
+		NLMSG_REPORT_ERR_MSG(npt, "empty ifi_index field");
+		return (EPERM);
+	}
+
+	if (lattrs->ifi_index != 0) {
+		NET_EPOCH_ENTER(et);
+		ifp = ifnet_byindex_ref(lattrs->ifi_index);
+		NET_EPOCH_EXIT(et);
+		if (ifp == NULL) {
+			NLMSG_REPORT_ERR_MSG(npt, "unable to find interface #%u",
+			    lattrs->ifi_index);
+			return (ENOENT);
+		}
+	}
+
+	if (ifp == NULL && lattrs->ifla_ifname != NULL) {
+		ifp = ifunit_ref(lattrs->ifla_ifname);
+		if (ifp == NULL) {
+			NLMSG_REPORT_ERR_MSG(npt, "unable to find interface %s",
+			    lattrs->ifla_ifname);
+			return (ENOENT);
+		}
+	}
+
+	MPASS(ifp != NULL);
+
+	/*
+	 * Modification request can address either
+	 * 1) cloned interface, in which case we call the cloner-specific
+	 * modification routine
+	 * or
+	 * 2) non-cloned (e.g. "physical") interface, in which case we call
+	 * generic modification routine
+	 */
+	struct ifc_data_nl ifd = { .lattrs = lattrs, .bm = bm, .npt = npt };
+	if (!ifc_modify_ifp_nl(ifp, &ifd))
+		ifd.error = nl_modify_ifp_generic(ifp, lattrs, bm, npt);
+
+	if_rele(ifp);
+
+	return (ifd.error);
+}
+
+/*
+ * RTM_NEWLINK handler: parses the request, collects the bitmask of the
+ * attributes present in it and dispatches to create_link() when
+ * NLM_F_CREATE is set, to modify_link() otherwise.
+ */
+static int
+rtnl_handle_newlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct nlattr_bmask bm;
+	int error;
+
+	struct nl_parsed_link attrs = {};
+	error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+	nl_get_attrs_bmask_nlmsg(hdr, &ifmsg_parser, &bm);
+
+	if (hdr->nlmsg_flags & NLM_F_CREATE)
+		return (create_link(hdr, &attrs, &bm, nlp, npt));
+	else
+		return (modify_link(hdr, &attrs, &bm, nlp, npt));
+}
+/*
+ * For an IPv6 link-local address, calls in6_set_unicast_scopeid() on it with
+ * ifindex.  Does nothing for a NULL pointer, for other families, for
+ * non-link-local addresses, or when the kernel is built without INET6.
+ */
+static void
+set_scope6(struct sockaddr *sa, uint32_t ifindex)
+{
+#ifdef INET6
+	if (sa != NULL && sa->sa_family == AF_INET6) {
+		struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa;
+
+		if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr))
+			in6_set_unicast_scopeid(&sa6->sin6_addr, ifindex);
+	}
+#endif
+}
+/*
+ * Returns true when sa is absent (NULL) or belongs to 'family'.  Otherwise
+ * reports an error message naming attr_name and returns false.
+ */
+static bool
+check_sa_family(const struct sockaddr *sa, int family, const char *attr_name,
+    struct nl_pstate *npt)
+{
+	if (sa == NULL || sa->sa_family == family)
+		return (true);
+
+	nlmsg_report_err_msg(npt, "wrong family for %s attribute: %d != %d",
+	    attr_name, family, sa->sa_family);
+	return (false);
+}
+/*
+ * Parsed form of an address request: the ifaddrmsg header fields followed
+ * by the values/pointers filled in by the attribute parsers.
+ */
+struct nl_parsed_ifa {
+	uint8_t ifa_family;
+	uint8_t ifa_prefixlen;
+	uint8_t ifa_scope;
+	uint32_t ifa_index;
+	uint32_t ifa_flags;
+	uint32_t ifaf_vhid;
+	uint32_t ifaf_flags;
+	struct sockaddr *ifa_address;
+	struct sockaddr *ifa_local;
+	struct sockaddr *ifa_broadcast;
+	struct ifa_cacheinfo *ifa_cacheinfo;
+	struct sockaddr *f_ifa_addr;
+	struct sockaddr *f_ifa_dst;
+};
+/*
+ * Attribute callback for IFA_CACHEINFO: the payload must be exactly
+ * sizeof(struct ifa_cacheinfo) bytes (EINVAL otherwise); on success a
+ * pointer to the payload is stored through 'target'.
+ */
+static int
+nlattr_get_cinfo(struct nlattr *nla, struct nl_pstate *npt,
+    const void *arg __unused, void *target)
+{
+	if (__predict_false(NLA_DATA_LEN(nla) != sizeof(struct ifa_cacheinfo))) {
+
NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not ifa_cacheinfo", + nla->nla_type, NLA_DATA_LEN(nla)); + return (EINVAL); + } + *((struct ifa_cacheinfo **)target) = (struct ifa_cacheinfo *)NL_RTA_DATA(nla); + return (0); +} + +#define _IN(_field) offsetof(struct ifaddrmsg, _field) +#define _OUT(_field) offsetof(struct nl_parsed_ifa, _field) +static const struct nlfield_parser nlf_p_ifa[] = { + { .off_in = _IN(ifa_family), .off_out = _OUT(ifa_family), .cb = nlf_get_u8 }, + { .off_in = _IN(ifa_prefixlen), .off_out = _OUT(ifa_prefixlen), .cb = nlf_get_u8 }, + { .off_in = _IN(ifa_scope), .off_out = _OUT(ifa_scope), .cb = nlf_get_u8 }, + { .off_in = _IN(ifa_flags), .off_out = _OUT(ifa_flags), .cb = nlf_get_u8_u32 }, + { .off_in = _IN(ifa_index), .off_out = _OUT(ifa_index), .cb = nlf_get_u32 }, +}; + +static const struct nlattr_parser nla_p_ifa_fbsd[] = { + { .type = IFAF_VHID, .off = _OUT(ifaf_vhid), .cb = nlattr_get_uint32 }, + { .type = IFAF_FLAGS, .off = _OUT(ifaf_flags), .cb = nlattr_get_uint32 }, +}; +NL_DECLARE_ATTR_PARSER(ifa_fbsd_parser, nla_p_ifa_fbsd); + +static const struct nlattr_parser nla_p_ifa[] = { + { .type = IFA_ADDRESS, .off = _OUT(ifa_address), .cb = nlattr_get_ip }, + { .type = IFA_LOCAL, .off = _OUT(ifa_local), .cb = nlattr_get_ip }, + { .type = IFA_BROADCAST, .off = _OUT(ifa_broadcast), .cb = nlattr_get_ip }, + { .type = IFA_CACHEINFO, .off = _OUT(ifa_cacheinfo), .cb = nlattr_get_cinfo }, + { .type = IFA_FLAGS, .off = _OUT(ifa_flags), .cb = nlattr_get_uint32 }, + { .type = IFA_FREEBSD, .arg = &ifa_fbsd_parser, .cb = nlattr_get_nested }, +}; +#undef _IN +#undef _OUT + +static bool +post_p_ifa(void *_attrs, struct nl_pstate *npt) +{ + struct nl_parsed_ifa *attrs = (struct nl_parsed_ifa *)_attrs; + + if (!check_sa_family(attrs->ifa_address, attrs->ifa_family, "IFA_ADDRESS", npt)) + return (false); + if (!check_sa_family(attrs->ifa_local, attrs->ifa_family, "IFA_LOCAL", npt)) + return (false); + if (!check_sa_family(attrs->ifa_broadcast, 
attrs->ifa_family, "IFA_BROADADDR", npt)) + return (false); + + set_scope6(attrs->ifa_address, attrs->ifa_index); + set_scope6(attrs->ifa_local, attrs->ifa_index); + + return (true); +} + +NL_DECLARE_PARSER_EXT(ifa_parser, struct ifaddrmsg, NULL, nlf_p_ifa, nla_p_ifa, post_p_ifa); + + +/* + +{ifa_family=AF_INET, ifa_prefixlen=8, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_HOST, ifa_index=if_nametoindex("lo")}, + [ + {{nla_len=8, nla_type=IFA_ADDRESS}, inet_addr("127.0.0.1")}, + {{nla_len=8, nla_type=IFA_LOCAL}, inet_addr("127.0.0.1")}, + {{nla_len=7, nla_type=IFA_LABEL}, "lo"}, + {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT}, + {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=3619, tstamp=3619}}]}, +--- + +{{len=72, type=RTM_NEWADDR, flags=NLM_F_MULTI, seq=1642191126, pid=566735}, + {ifa_family=AF_INET6, ifa_prefixlen=96, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_UNIVERSE, ifa_index=if_nametoindex("virbr0")}, + [ + {{nla_len=20, nla_type=IFA_ADDRESS}, inet_pton(AF_INET6, "2a01:4f8:13a:70c:ffff::1")}, + {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=4283, tstamp=4283}}, + {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT}]}, +*/ + +static uint8_t +ifa_get_scope(const struct ifaddr *ifa) +{ + const struct sockaddr *sa; + uint8_t addr_scope = RT_SCOPE_UNIVERSE; + + sa = ifa->ifa_addr; + switch (sa->sa_family) { +#ifdef INET + case AF_INET: + { + struct in_addr addr; + addr = ((const struct sockaddr_in *)sa)->sin_addr; + if (IN_LOOPBACK(ntohl(addr.s_addr))) + addr_scope = RT_SCOPE_HOST; + else if (IN_LINKLOCAL(ntohl(addr.s_addr))) + addr_scope = RT_SCOPE_LINK; + break; + } +#endif +#ifdef INET6 + case AF_INET6: + { + const struct in6_addr *addr; + addr = &((const struct sockaddr_in6 *)sa)->sin6_addr; + if (IN6_IS_ADDR_LOOPBACK(addr)) + addr_scope = RT_SCOPE_HOST; + else if (IN6_IS_ADDR_LINKLOCAL(addr)) + addr_scope = RT_SCOPE_LINK; + break; + } +#endif + } + + return 
(addr_scope); +} + +#ifdef INET6 +static uint8_t +inet6_get_plen(const struct in6_addr *addr) +{ + + return (bitcount32(addr->s6_addr32[0]) + bitcount32(addr->s6_addr32[1]) + + bitcount32(addr->s6_addr32[2]) + bitcount32(addr->s6_addr32[3])); +} +#endif + +static uint8_t +get_sa_plen(const struct sockaddr *sa) +{ +#ifdef INET + const struct in_addr *paddr; +#endif +#ifdef INET6 + const struct in6_addr *paddr6; +#endif + + switch (sa->sa_family) { +#ifdef INET + case AF_INET: + paddr = &(((const struct sockaddr_in *)sa)->sin_addr); + return bitcount32(paddr->s_addr); +#endif +#ifdef INET6 + case AF_INET6: + paddr6 = &(((const struct sockaddr_in6 *)sa)->sin6_addr); + return inet6_get_plen(paddr6); +#endif + } + + return (0); +} + +#ifdef INET6 +static uint32_t +in6_flags_to_nl(uint32_t flags) +{ + uint32_t nl_flags = 0; + + if (flags & IN6_IFF_TEMPORARY) + nl_flags |= IFA_F_TEMPORARY; + if (flags & IN6_IFF_NODAD) + nl_flags |= IFA_F_NODAD; + if (flags & IN6_IFF_DEPRECATED) + nl_flags |= IFA_F_DEPRECATED; + if (flags & IN6_IFF_TENTATIVE) + nl_flags |= IFA_F_TENTATIVE; + if ((flags & (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY)) == 0) + flags |= IFA_F_PERMANENT; + if (flags & IN6_IFF_DUPLICATED) + flags |= IFA_F_DADFAILED; + return (nl_flags); +} + +static uint32_t +nl_flags_to_in6(uint32_t flags) +{ + uint32_t in6_flags = 0; + + if (flags & IFA_F_TEMPORARY) + in6_flags |= IN6_IFF_TEMPORARY; + if (flags & IFA_F_NODAD) + in6_flags |= IN6_IFF_NODAD; + if (flags & IFA_F_DEPRECATED) + in6_flags |= IN6_IFF_DEPRECATED; + if (flags & IFA_F_TENTATIVE) + in6_flags |= IN6_IFF_TENTATIVE; + if (flags & IFA_F_DADFAILED) + in6_flags |= IN6_IFF_DUPLICATED; + + return (in6_flags); +} + +static void +export_cache_info6(struct nl_writer *nw, const struct in6_ifaddr *ia) +{ + struct ifa_cacheinfo ci = { + .cstamp = ia->ia6_createtime * 1000, + .tstamp = ia->ia6_updatetime * 1000, + .ifa_prefered = ia->ia6_lifetime.ia6t_pltime, + .ifa_valid = ia->ia6_lifetime.ia6t_vltime, + }; + + nlattr_add(nw, 
IFA_CACHEINFO, sizeof(ci), &ci);
+}
+#endif
+/*
+ * Adds address cache information to the message being built.  Only INET6
+ * addresses have a case here; other families add nothing.
+ */
+static void
+export_cache_info(struct nl_writer *nw, struct ifaddr *ifa)
+{
+	switch (ifa->ifa_addr->sa_family) {
+#ifdef INET6
+	case AF_INET6:
+		export_cache_info6(nw, (struct in6_ifaddr *)ifa);
+		break;
+#endif
+	}
+}
+
+/*
+ * Writes one complete address message describing ifa (an address of ifp)
+ * into nw, using hdr as the template for the reply header.  Returns true
+ * when the message was finished; otherwise the partially built message is
+ * aborted and false is returned.
+ *
+ * Example of the resulting attributes:
+ * {'attrs': [('IFA_ADDRESS', '12.0.0.1'),
+ ('IFA_LOCAL', '12.0.0.1'),
+ ('IFA_LABEL', 'eth10'),
+ ('IFA_FLAGS', 128),
+ ('IFA_CACHEINFO', {'ifa_preferred': 4294967295, 'ifa_valid': 4294967295, 'cstamp': 63745746, 'tstamp': 63745746})],
+ */
+static bool
+dump_iface_addr(struct nl_writer *nw, if_t ifp, struct ifaddr *ifa,
+    const struct nlmsghdr *hdr)
+{
+	struct ifaddrmsg *ifamsg;
+	struct sockaddr *sa = ifa->ifa_addr;
+	struct sockaddr *sa_dst = ifa->ifa_dstaddr;
+
+	NL_LOG(LOG_DEBUG3, "dumping ifa %p type %s(%d) for interface %s",
+	    ifa, rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp));
+
+	if (!nlmsg_reply(nw, hdr, sizeof(struct ifaddrmsg)))
+		goto enomem;
+
+	ifamsg = nlmsg_reserve_object(nw, struct ifaddrmsg);
+	ifamsg->ifa_family = sa->sa_family;
+	/* NOTE(review): get_sa_plen() dereferences its argument; assumes ifa_netmask is non-NULL - confirm */
+	ifamsg->ifa_prefixlen = get_sa_plen(ifa->ifa_netmask);
+	ifamsg->ifa_flags = 0; // ifa_flags is useless
+	ifamsg->ifa_scope = ifa_get_scope(ifa);
+	ifamsg->ifa_index = if_getindex(ifp);
+
+	if ((if_getflags(ifp) & IFF_POINTOPOINT) && sa_dst != NULL && sa_dst->sa_family != 0) {
+		/* P2P interface may have IPv6 LL with no dst address */
+		dump_sa(nw, IFA_ADDRESS, sa_dst);
+		dump_sa(nw, IFA_LOCAL, sa);
+	} else {
+		dump_sa(nw, IFA_ADDRESS, sa);
+#ifdef INET
+		/*
+		 * In most cases, IFA_ADDRESS == IFA_LOCAL
+		 * Skip IFA_LOCAL for anything except INET
+		 */
+		if (sa->sa_family == AF_INET)
+			dump_sa(nw, IFA_LOCAL, sa);
+#endif
+	}
+	if (if_getflags(ifp) & IFF_BROADCAST)
+		dump_sa(nw, IFA_BROADCAST, ifa->ifa_broadaddr);
+
+	nlattr_add_string(nw, IFA_LABEL, if_name(ifp));
+
+	uint32_t nl_ifa_flags = 0;	/* stays 0 for non-INET6 addresses */
+#ifdef INET6
+	if (sa->sa_family == AF_INET6) {
+		struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
+		nl_ifa_flags = in6_flags_to_nl(ia->ia6_flags);
+	}
+#endif
+	nlattr_add_u32(nw, IFA_FLAGS, nl_ifa_flags);
+
+	export_cache_info(nw, ifa);
+
+	/* Store FreeBSD-specific attributes */
+	int off = nlattr_add_nested(nw, IFA_FREEBSD);
+	if (off != 0) {
+		if (ifa->ifa_carp != NULL && carp_get_vhid_p != NULL) {
+			uint32_t vhid = (uint32_t)(*carp_get_vhid_p)(ifa);
+			nlattr_add_u32(nw, IFAF_VHID, vhid);
+		}
+#ifdef INET6
+		if (sa->sa_family == AF_INET6) {
+			/* raw (untranslated) ia6_flags value */
+			uint32_t ifa_flags = ((struct in6_ifaddr *)ifa)->ia6_flags;
+
+			nlattr_add_u32(nw, IFAF_FLAGS, ifa_flags);
+		}
+#endif
+
+		nlattr_set_len(nw, off);
+	}
+
+	if (nlmsg_end(nw))
+		return (true);
+enomem:
+	NL_LOG(LOG_DEBUG, "Failed to dump ifa type %s(%d) for interface %s",
+	    rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp));
+	nlmsg_abort(nw);
+	return (false);
+}
+/*
+ * Dumps the addresses of ifp that pass the walk filters: the requested
+ * family (0 = any), never AF_LINK, and only addresses for which prison_if()
+ * returns 0 for the requester's credentials.  wa->count is incremented for
+ * every address attempted and wa->dumped for every address written.
+ * Returns 0, or ENOMEM as soon as one address cannot be written.
+ */
+static int
+dump_iface_addrs(struct netlink_walkargs *wa, if_t ifp)
+{
+	struct ifaddr *ifa;
+	struct ifa_iter it;
+	int error = 0;
+
+	for (ifa = ifa_iter_start(ifp, &it); ifa != NULL; ifa = ifa_iter_next(&it)) {
+		if (wa->family != 0 && wa->family != ifa->ifa_addr->sa_family)
+			continue;
+		if (ifa->ifa_addr->sa_family == AF_LINK)
+			continue;
+		if (prison_if(wa->cred, ifa->ifa_addr) != 0)
+			continue;
+		wa->count++;
+		if (!dump_iface_addr(wa->nw, ifp, ifa, &wa->hdr)) {
+			error = ENOMEM;
+			break;
+		}
+		wa->dumped++;
+	}
+	ifa_iter_finish(&it);
+
+	return (error);
+}
+/*
+ * RTM_GETADDR handler: dumps, as a multipart NL_RTM_NEWADDR reply, the
+ * addresses of one interface (non-zero ifa_index) or of all interfaces,
+ * filtered by ifa_family.
+ */
+static int
+rtnl_handle_getaddr(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	if_t ifp;
+	int error = 0;
+
+	struct nl_parsed_ifa attrs = {};
+	error = nl_parse_nlmsg(hdr, &ifa_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	struct netlink_walkargs wa = {
+		.so = nlp,
+		.nw = npt->nw,
+		.cred = nlp_get_cred(nlp),
+		.family = attrs.ifa_family,
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI,
+		.hdr.nlmsg_type = NL_RTM_NEWADDR,
+	};
+
+	NL_LOG(LOG_DEBUG2, "Start dump");
+
+	if
(attrs.ifa_index != 0) { + ifp = ifnet_byindex(attrs.ifa_index); + if (ifp == NULL) + error = ENOENT; + else + error = dump_iface_addrs(&wa, ifp); + } else { + struct if_iter it; + + for (ifp = if_iter_start(&it); ifp != NULL; ifp = if_iter_next(&it)) { + error = dump_iface_addrs(&wa, ifp); + if (error != 0) + break; + } + if_iter_finish(&it); + } + + NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa.count, wa.dumped); + + if (!nlmsg_end_dump(wa.nw, error, &wa.hdr)) { + NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); + return (ENOMEM); + } + + return (error); +} + +#ifdef INET +static int +handle_newaddr_inet(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs, + if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt) +{ + int plen = attrs->ifa_prefixlen; + int if_flags = if_getflags(ifp); + struct sockaddr_in *addr, *dst; + + if (plen > 32) { + nlmsg_report_err_msg(npt, "invalid ifa_prefixlen"); + return (EINVAL); + }; + + if (if_flags & IFF_POINTOPOINT) { + /* + * Only P2P IFAs are allowed by the implementation. + */ + if (attrs->ifa_address == NULL || attrs->ifa_local == NULL) { + nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS"); + return (EINVAL); + } + addr = (struct sockaddr_in *)attrs->ifa_local; + dst = (struct sockaddr_in *)attrs->ifa_address; + } else { + /* + * Map the Netlink attributes to FreeBSD ifa layout. + * If only IFA_ADDRESS or IFA_LOCAL is set OR + * both are set to the same value => ifa is not p2p + * and the attribute value contains interface address. + * + * Otherwise (both IFA_ADDRESS and IFA_LOCAL are set and + * different), IFA_LOCAL contains an interface address and + * IFA_ADDRESS contains peer address. 
+ */ + addr = (struct sockaddr_in *)attrs->ifa_local; + if (addr == NULL) + addr = (struct sockaddr_in *)attrs->ifa_address; + + if (addr == NULL) { + nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS"); + return (EINVAL); + } + + /* Generate broadcast address if not set */ + if ((if_flags & IFF_BROADCAST) && attrs->ifa_broadcast == NULL) { + uint32_t s_baddr; + struct sockaddr_in *sin_brd; + + if (plen == 31) + s_baddr = INADDR_BROADCAST; /* RFC 3021 */ + else { + uint32_t s_mask; + + s_mask = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0); + s_baddr = addr->sin_addr.s_addr | ~s_mask; + } + + sin_brd = (struct sockaddr_in *)npt_alloc(npt, sizeof(*sin_brd)); + if (sin_brd == NULL) + return (ENOMEM); + sin_brd->sin_family = AF_INET; + sin_brd->sin_len = sizeof(*sin_brd); + sin_brd->sin_addr.s_addr = s_baddr; + attrs->ifa_broadcast = (struct sockaddr *)sin_brd; + } + dst = (struct sockaddr_in *)attrs->ifa_broadcast; + } + + struct sockaddr_in mask = { + .sin_len = sizeof(struct sockaddr_in), + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(plen ? 
~((1 << (32 - plen)) - 1) : 0), + }; + struct in_aliasreq req = { + .ifra_addr = *addr, + .ifra_mask = mask, + .ifra_vhid = attrs->ifaf_vhid, + }; + if (dst != NULL) + req.ifra_dstaddr = *dst; + + return (in_control_ioctl(SIOCAIFADDR, &req, ifp, nlp_get_cred(nlp))); +} + +static int +handle_deladdr_inet(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs, + if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt) +{ + struct sockaddr *addr = attrs->ifa_local; + + if (addr == NULL) + addr = attrs->ifa_address; + + if (addr == NULL) { + nlmsg_report_err_msg(npt, "empty IFA_ADDRESS/IFA_LOCAL"); + return (EINVAL); + } + + struct ifreq req = { .ifr_addr = *addr }; + + return (in_control_ioctl(SIOCDIFADDR, &req, ifp, nlp_get_cred(nlp))); +} +#endif + +#ifdef INET6 +static int +handle_newaddr_inet6(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs, + if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt) +{ + struct sockaddr_in6 *addr, *dst; + + if (attrs->ifa_prefixlen > 128) { + nlmsg_report_err_msg(npt, "invalid ifa_prefixlen"); + return (EINVAL); + } + + /* + * In IPv6 implementation, adding non-P2P address to the P2P interface + * is allowed. + */ + addr = (struct sockaddr_in6 *)(attrs->ifa_local); + dst = (struct sockaddr_in6 *)(attrs->ifa_address); + + if (addr == NULL) { + addr = dst; + dst = NULL; + } else if (dst != NULL) { + if (IN6_ARE_ADDR_EQUAL(&addr->sin6_addr, &dst->sin6_addr)) { + /* + * Sometimes Netlink users fills in both attributes + * with the same address. It still means "non-p2p". 
+			 */
+			dst = NULL;
+		}
+	}
+
+	if (addr == NULL) {
+		nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS");
+		return (EINVAL);
+	}
+
+	/* Translated IFA_F_* flags combined with the raw IFAF_FLAGS value */
+	uint32_t flags = nl_flags_to_in6(attrs->ifa_flags) | attrs->ifaf_flags;
+
+	uint32_t pltime = 0, vltime = 0;
+	if (attrs->ifa_cacheinfo != 0) {	/* IFA_CACHEINFO was supplied */
+		pltime = attrs->ifa_cacheinfo->ifa_prefered;
+		vltime = attrs->ifa_cacheinfo->ifa_valid;
+	}
+
+	struct sockaddr_in6 mask = {
+		.sin6_len = sizeof(struct sockaddr_in6),
+		.sin6_family = AF_INET6,
+	};
+	ip6_writemask(&mask.sin6_addr, attrs->ifa_prefixlen);
+
+	struct in6_aliasreq req = {
+		.ifra_addr = *addr,
+		.ifra_prefixmask = mask,
+		.ifra_flags = flags,
+		.ifra_lifetime = { .ia6t_vltime = vltime, .ia6t_pltime = pltime },
+		.ifra_vhid = attrs->ifaf_vhid,
+	};
+	if (dst != NULL)
+		req.ifra_dstaddr = *dst;
+
+	return (in6_control_ioctl(SIOCAIFADDR_IN6, &req, ifp, nlp_get_cred(nlp)));
+}
+/*
+ * Removes an IPv6 address from ifp via SIOCDIFADDR_IN6.  The address is
+ * taken from IFA_LOCAL, falling back to IFA_ADDRESS; EINVAL is returned when
+ * neither is present.
+ */
+static int
+handle_deladdr_inet6(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs,
+    if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct sockaddr_in6 *addr = (struct sockaddr_in6 *)attrs->ifa_local;
+
+	if (addr == NULL)
+		addr = (struct sockaddr_in6 *)(attrs->ifa_address);
+
+	if (addr == NULL) {
+		nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS");
+		return (EINVAL);
+	}
+
+	struct in6_ifreq req = { .ifr_addr = *addr };
+
+	return (in6_control_ioctl(SIOCDIFADDR_IN6, &req, ifp, nlp_get_cred(nlp)));
+}
+#endif
+
+/*
+ * Common handler for address addition and removal; it is registered for
+ * both NL_RTM_NEWADDR and NL_RTM_DELADDR and tells them apart by
+ * hdr->nlmsg_type.  The interface is looked up by ifa_index (ENOENT if
+ * absent) and the request is dispatched by ifa_family to the INET/INET6
+ * helpers; other families yield EAFNOSUPPORT.
+ */
+static int
+rtnl_handle_addr(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct epoch_tracker et;
+	int error;
+
+	struct nl_parsed_ifa attrs = {};
+	error = nl_parse_nlmsg(hdr, &ifa_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	NET_EPOCH_ENTER(et);
+	if_t ifp = ifnet_byindex_ref(attrs.ifa_index);
+	NET_EPOCH_EXIT(et);
+
+	if (ifp == NULL) {
+		nlmsg_report_err_msg(npt, "Unable to find interface with index %u",
+		    attrs.ifa_index);
+		return (ENOENT);
+	}
+	int if_flags = if_getflags(ifp);	/* flags before the change, compared below */
+
+#if defined(INET) || defined(INET6)
+	bool new = hdr->nlmsg_type == NL_RTM_NEWADDR;
+#endif
+
+	/*
+	 * TODO: Properly handle NLM_F_CREATE / NLM_F_EXCL.
+	 * The current ioctl-based KPI always does an implicit create-or-replace.
+	 * It is not possible to specify fine-grained options.
+	 */
+
+	switch (attrs.ifa_family) {
+#ifdef INET
+	case AF_INET:
+		if (new)
+			error = handle_newaddr_inet(hdr, &attrs, ifp, nlp, npt);
+		else
+			error = handle_deladdr_inet(hdr, &attrs, ifp, nlp, npt);
+		break;
+#endif
+#ifdef INET6
+	case AF_INET6:
+		if (new)
+			error = handle_newaddr_inet6(hdr, &attrs, ifp, nlp, npt);
+		else
+			error = handle_deladdr_inet6(hdr, &attrs, ifp, nlp, npt);
+		break;
+#endif
+	default:
+		error = EAFNOSUPPORT;
+	}
+	/* Call if_up() only when the request itself turned IFF_UP on */
+	if (error == 0 && !(if_flags & IFF_UP) && (if_getflags(ifp) & IFF_UP))
+		if_up(ifp);
+
+	if_rele(ifp);
+
+	return (error);
+}
+
+/*
+ * rt_addrmsg event handler: sends an address notification to the
+ * per-family group (RTNLGRP_IPV4_IFADDR or RTNLGRP_IPV6_IFADDR).  Addresses
+ * of other families are logged and ignored.
+ */
+static void
+rtnl_handle_ifaddr(void *arg __unused, struct ifaddr *ifa, int cmd)
+{
+	struct nlmsghdr hdr = {};
+	struct nl_writer nw;
+	uint32_t group = 0;
+
+	switch (ifa->ifa_addr->sa_family) {
+#ifdef INET
+	case AF_INET:
+		group = RTNLGRP_IPV4_IFADDR;
+		break;
+#endif
+#ifdef INET6
+	case AF_INET6:
+		group = RTNLGRP_IPV6_IFADDR;
+		break;
+#endif
+	default:
+		NL_LOG(LOG_DEBUG2, "ifa notification for unknown AF: %d",
+		    ifa->ifa_addr->sa_family);
+		return;
+	}
+
+	if (!nl_writer_group(&nw, NLMSG_LARGE, NETLINK_ROUTE, group, 0,
+	    false)) {
+		NL_LOG(LOG_DEBUG, "error allocating group writer");
+		return;
+	}
+
+	hdr.nlmsg_type = (cmd == RTM_DELETE) ?
NL_RTM_DELADDR : NL_RTM_NEWADDR;
+
+	dump_iface_addr(&nw, ifa->ifa_ifp, ifa, &hdr);
+	nlmsg_flush(&nw);
+}
+/*
+ * Sends a link message of the given type for ifp to the RTNLGRP_LINK
+ * group.  if_flags_mask is passed through to dump_iface().
+ */
+static void
+rtnl_handle_ifevent(if_t ifp, int nlmsg_type, int if_flags_mask)
+{
+	struct nlmsghdr hdr = { .nlmsg_type = nlmsg_type };
+	struct nl_writer nw;
+
+	if (!nl_writer_group(&nw, NLMSG_LARGE, NETLINK_ROUTE, RTNLGRP_LINK, 0,
+	    false)) {
+		NL_LOG(LOG_DEBUG, "error allocating group writer");
+		return;
+	}
+	dump_iface(&nw, ifp, &hdr, if_flags_mask);
+	nlmsg_flush(&nw);
+}
+/* ifnet_arrival_event handler: announces the interface with NL_RTM_NEWLINK */
+static void
+rtnl_handle_ifattach(void *arg, if_t ifp)
+{
+	NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp));
+	rtnl_handle_ifevent(ifp, NL_RTM_NEWLINK, 0);
+}
+/* ifnet_departure_event handler: announces the removal with NL_RTM_DELLINK */
+static void
+rtnl_handle_ifdetach(void *arg, if_t ifp)
+{
+	NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp));
+	rtnl_handle_ifevent(ifp, NL_RTM_DELLINK, 0);
+}
+/* ifnet_link_event handler: re-announces the interface with NL_RTM_NEWLINK */
+static void
+rtnl_handle_iflink(void *arg, if_t ifp, int link_state __unused)
+{
+	NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp));
+	rtnl_handle_ifevent(ifp, NL_RTM_NEWLINK, 0);
+}
+/*
+ * Non-static entry point: sends NL_RTM_NEWLINK for ifp, forwarding
+ * if_flags_mask to dump_iface().
+ */
+void
+rtnl_handle_ifnet_event(if_t ifp, int if_flags_mask)
+{
+	NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp));
+	rtnl_handle_ifevent(ifp, NL_RTM_NEWLINK, if_flags_mask);
+}
+/*
+ * Message handlers registered by rtnl_ifaces_init().  NEWADDR and DELADDR
+ * share rtnl_handle_addr(); RTM_GETADDR is the only entry without
+ * RTNL_F_NOEPOCH.
+ */
+static const struct rtnl_cmd_handler cmd_handlers[] = {
+	{
+		.cmd = NL_RTM_GETLINK,
+		.name = "RTM_GETLINK",
+		.cb = &rtnl_handle_getlink,
+		.flags = RTNL_F_NOEPOCH | RTNL_F_ALLOW_NONVNET_JAIL,
+	},
+	{
+		.cmd = NL_RTM_DELLINK,
+		.name = "RTM_DELLINK",
+		.cb = &rtnl_handle_dellink,
+		.priv = PRIV_NET_IFDESTROY,
+		.flags = RTNL_F_NOEPOCH,
+	},
+	{
+		.cmd = NL_RTM_NEWLINK,
+		.name = "RTM_NEWLINK",
+		.cb = &rtnl_handle_newlink,
+		.priv = PRIV_NET_IFCREATE,
+		.flags = RTNL_F_NOEPOCH,
+	},
+	{
+		.cmd = NL_RTM_GETADDR,
+		.name = "RTM_GETADDR",
+		.cb = &rtnl_handle_getaddr,
+		.flags = RTNL_F_ALLOW_NONVNET_JAIL,
+	},
+	{
+		.cmd = NL_RTM_NEWADDR,
+		.name = "RTM_NEWADDR",
+		.cb = &rtnl_handle_addr,
+		.priv = PRIV_NET_ADDIFADDR,
+		.flags = RTNL_F_NOEPOCH,
+	},
+	{
+		.cmd = NL_RTM_DELADDR,
+		.name = "RTM_DELADDR",
+		.cb = &rtnl_handle_addr,
+		.priv = PRIV_NET_DELIFADDR,
+		.flags = RTNL_F_NOEPOCH,
+	},
+};
+/* Parsers handed to NL_VERIFY_PARSERS() in rtnl_ifaces_init() */
+static const struct nlhdr_parser *all_parsers[] = {
+	&ifmsg_parser, &ifa_parser, &ifa_fbsd_parser,
+};
+/* Inserts cloner at the head of nl_cloners under rtnl_cloner_lock */
+void
+rtnl_iface_add_cloner(struct nl_cloner *cloner)
+{
+	sx_xlock(&rtnl_cloner_lock);
+	SLIST_INSERT_HEAD(&nl_cloners, cloner, next);
+	sx_xunlock(&rtnl_cloner_lock);
+}
+/* Removes cloner from nl_cloners under rtnl_cloner_lock */
+void
+rtnl_iface_del_cloner(struct nl_cloner *cloner)
+{
+	sx_xlock(&rtnl_cloner_lock);
+	SLIST_REMOVE(&nl_cloners, cloner, nl_cloner, next);
+	sx_xunlock(&rtnl_cloner_lock);
+}
+/*
+ * Registers the interface/address event handlers, verifies the parsers and
+ * registers the message handlers from cmd_handlers.
+ */
+void
+rtnl_ifaces_init(void)
+{
+	ifattach_event = EVENTHANDLER_REGISTER(
+	    ifnet_arrival_event, rtnl_handle_ifattach, NULL,
+	    EVENTHANDLER_PRI_ANY);
+	ifdetach_event = EVENTHANDLER_REGISTER(
+	    ifnet_departure_event, rtnl_handle_ifdetach, NULL,
+	    EVENTHANDLER_PRI_ANY);
+	ifaddr_event = EVENTHANDLER_REGISTER(
+	    rt_addrmsg, rtnl_handle_ifaddr, NULL,
+	    EVENTHANDLER_PRI_ANY);
+	iflink_event = EVENTHANDLER_REGISTER(
+	    ifnet_link_event, rtnl_handle_iflink, NULL,
+	    EVENTHANDLER_PRI_ANY);
+	NL_VERIFY_PARSERS(all_parsers);
+	rtnl_register_messages(cmd_handlers, nitems(cmd_handlers));
+}
+/* Deregisters the four event handlers registered by rtnl_ifaces_init() */
+void
+rtnl_ifaces_destroy(void)
+{
+	EVENTHANDLER_DEREGISTER(ifnet_arrival_event, ifattach_event);
+	EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_event);
+	EVENTHANDLER_DEREGISTER(rt_addrmsg, ifaddr_event);
+	EVENTHANDLER_DEREGISTER(ifnet_link_event, iflink_event);
+}
diff --git a/sys/netlink/route/iface_drivers.c b/sys/netlink/route/iface_drivers.c
new file mode 100644
index 000000000000..4bf913d9c978
--- /dev/null
+++ b/sys/netlink/route/iface_drivers.c
@@ -0,0 +1,145 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +#include "opt_inet.h" +#include "opt_inet6.h" +#include <sys/types.h> +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/syslog.h> +#include <sys/socketvar.h> + +#include <net/ethernet.h> +#include <net/if.h> +#include <net/if_dl.h> +#include <net/if_media.h> +#include <net/if_var.h> +#include <net/if_clone.h> +#include <net/if_vlan_var.h> +#include <net/route.h> +#include <net/route/nhop.h> +#include <net/route/route_ctl.h> +#include <netlink/netlink.h> +#include <netlink/netlink_ctl.h> +#include <netlink/netlink_route.h> +#include <netlink/route/route_var.h> + +#include <netinet6/scope6_var.h> /* scope deembedding */ + +#define DEBUG_MOD_NAME nl_iface_drivers +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include <netlink/netlink_debug.h> +_DECLARE_DEBUG(LOG_INFO); + +/* + * Generic modification interface handler. + * Responsible for changing network stack interface attributes + * such as state, mtu or description. + */ +int +_nl_modify_ifp_generic(struct ifnet *ifp, struct nl_parsed_link *lattrs, + const struct nlattr_bmask *bm, struct nl_pstate *npt) +{ + int error; + + if (lattrs->ifla_ifalias != NULL) { + if (nlp_has_priv(npt->nlp, PRIV_NET_SETIFDESCR)) { + int len = strlen(lattrs->ifla_ifalias) + 1; + char *buf = if_allocdescr(len, M_WAITOK); + + memcpy(buf, lattrs->ifla_ifalias, len); + if_setdescr(ifp, buf); + if_setlastchange(ifp); + } else { + nlmsg_report_err_msg(npt, "Not enough privileges to set descr"); + return (EPERM); + } + } + + if ((lattrs->ifi_change & IFF_UP) && (lattrs->ifi_flags & IFF_UP) == 0) { + /* Request to down the interface */ + if_down(ifp); + } + + if (lattrs->ifla_mtu > 0) { + if (nlp_has_priv(npt->nlp, PRIV_NET_SETIFMTU)) { + struct ifreq ifr = { .ifr_mtu = lattrs->ifla_mtu }; + error = ifhwioctl(SIOCSIFMTU, ifp, (char *)&ifr, curthread); + } else { + nlmsg_report_err_msg(npt, "Not enough privileges to set mtu"); + return (EPERM); + } + } + + if (lattrs->ifi_change & 
IFF_PROMISC) { + error = ifpromisc(ifp, lattrs->ifi_flags & IFF_PROMISC); + if (error != 0) { + nlmsg_report_err_msg(npt, "unable to set promisc"); + return (error); + } + } + + return (0); +} + +/* + * Saves the resulting ifindex and ifname to report them + * to userland along with the operation result. + * NLA format: + * NLMSGERR_ATTR_COOKIE(nested) + * IFLA_NEW_IFINDEX(u32) + * IFLA_IFNAME(string) + */ +void +_nl_store_ifp_cookie(struct nl_pstate *npt, struct ifnet *ifp) +{ + int ifname_len = strlen(if_name(ifp)); + uint32_t ifindex = (uint32_t)if_getindex(ifp); + + int nla_len = sizeof(struct nlattr) * 3 + + sizeof(ifindex) + NL_ITEM_ALIGN(ifname_len + 1); + struct nlattr *nla_cookie = npt_alloc(npt, nla_len); + + /* Nested TLV */ + nla_cookie->nla_len = nla_len; + nla_cookie->nla_type = NLMSGERR_ATTR_COOKIE; + + struct nlattr *nla = nla_cookie + 1; + nla->nla_len = sizeof(struct nlattr) + sizeof(ifindex); + nla->nla_type = IFLA_NEW_IFINDEX; + memcpy(NLA_DATA(nla), &ifindex, sizeof(ifindex)); + + nla = NLA_NEXT(nla); + nla->nla_len = sizeof(struct nlattr) + ifname_len + 1; + nla->nla_type = IFLA_IFNAME; + strlcpy(NLA_DATA(nla), if_name(ifp), ifname_len + 1); + + nlmsg_report_cookie(npt, nla_cookie); +} + diff --git a/sys/netlink/route/ifaddrs.h b/sys/netlink/route/ifaddrs.h new file mode 100644 index 000000000000..88d776c3b925 --- /dev/null +++ b/sys/netlink/route/ifaddrs.h @@ -0,0 +1,99 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Interface address-related (RTM_<NEW|DEL|GET>ADDR) message header and attributes. 
+ */ + +#ifndef _NETLINK_ROUTE_IFADDRS_H_ +#define _NETLINK_ROUTE_IFADDRS_H_ + +/* Base header for all of the relevant messages */ +struct ifaddrmsg { + uint8_t ifa_family; /* Address family */ + uint8_t ifa_prefixlen; /* Prefix length */ + uint8_t ifa_flags; /* Address-specific flags */ + uint8_t ifa_scope; /* Address scope */ + uint32_t ifa_index; /* Link ifindex */ +}; + +#ifndef _KERNEL +#define _NL_IFA_HDRLEN ((int)sizeof(struct ifaddrmsg)) +#define IFA_RTA(_ifa) ((struct rtattr *)(NL_ITEM_DATA(_ifa, _NL_IFA_HDRLEN))) +#define IFA_PAYLOAD(_hdr) NLMSG_PAYLOAD(_hdr, _NL_IFA_HDRLEN) +#endif + +/* Defined attributes */ +enum { + IFA_UNSPEC, + IFA_ADDRESS = 1, /* binary, prefix address (destination for p2p) */ + IFA_LOCAL = 2, /* binary, interface address */ + IFA_LABEL = 3, /* string, interface name */ + IFA_BROADCAST = 4, /* binary, broadcast ifa */ + IFA_ANYCAST = 5, /* not supported */ + IFA_CACHEINFO = 6, /* binary, struct ifa_cacheinfo */ + IFA_MULTICAST = 7, /* not supported */ + IFA_FLAGS = 8, /* u32, IFA_F flags */ + IFA_RT_PRIORITY = 9, /* not supported */ + IFA_TARGET_NETNSID = 10, /* not supported */ + IFA_FREEBSD = 11, /* nested, FreeBSD-specific */ + __IFA_MAX, +}; +#define IFA_MAX (__IFA_MAX - 1) + +enum { + IFAF_UNSPEC, + IFAF_VHID = 1, /* u32: carp vhid */ + IFAF_FLAGS = 2, /* u32: FreeBSD-specific ifa flags */ + __IFAF_MAX, +}; +#define IFAF_MAX (__IFAF_MAX - 1) + +/* IFA_FLAGS attribute flags */ +#define IFA_F_SECONDARY 0x0001 +#define IFA_F_TEMPORARY IFA_F_SECONDARY +#define IFA_F_NODAD 0x0002 +#define IFA_F_OPTIMISTIC 0x0004 +#define IFA_F_DADFAILED 0x0008 +#define IFA_F_HOMEADDRESS 0x0010 +#define IFA_F_DEPRECATED 0x0020 +#define IFA_F_TENTATIVE 0x0040 +#define IFA_F_PERMANENT 0x0080 +#define IFA_F_MANAGETEMPADDR 0x0100 +#define IFA_F_NOPREFIXROUTE 0x0200 +#define IFA_F_MCAUTOJOIN 0x0400 +#define IFA_F_STABLE_PRIVACY 0x0800 + +/* IFA_CACHEINFO value */ +struct ifa_cacheinfo { + uint32_t ifa_prefered; /* seconds till the end of the prefix 
considered preferred */ + uint32_t ifa_valid; /* seconds till the end of the prefix considered valid */ + uint32_t cstamp; /* creation time in 1ms intervals from the boot time */ + uint32_t tstamp; /* update time in 1ms intervals from the boot time */ +}; + +#endif diff --git a/sys/netlink/route/interface.h b/sys/netlink/route/interface.h new file mode 100644 index 000000000000..667bf2c96151 --- /dev/null +++ b/sys/netlink/route/interface.h @@ -0,0 +1,266 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Interface-related (RTM_<NEW|DEL|GET|SET>LINK) message header and attributes. 
+ */ + +#ifndef _NETLINK_ROUTE_INTERFACE_H_ +#define _NETLINK_ROUTE_INTERFACE_H_ + +/* Base header for all of the relevant messages */ +struct ifinfomsg { + unsigned char ifi_family; /* not used */ + unsigned char __ifi_pad; + unsigned short ifi_type; /* ARPHRD_* */ + int ifi_index; /* Inteface index */ + unsigned ifi_flags; /* IFF_* flags */ + unsigned ifi_change; /* IFF_* change mask */ +}; + +/* Linux-specific link-level state flag */ +#define IFF_LOWER_UP IFF_NETLINK_1 + +#ifndef _KERNEL +/* Compatilbility helpers */ +#define _IFINFO_HDRLEN ((int)sizeof(struct ifinfomsg)) +#define IFLA_RTA(_ifi) ((struct rtattr *)NL_ITEM_DATA(_ifi, _IFINFO_HDRLEN)) +#define IFLA_PAYLOAD(_ifi) NLMSG_PAYLOAD(_ifi, _IFINFO_HDRLEN) +#endif + +enum { + IFLA_UNSPEC = 0, + IFLA_ADDRESS = 1, /* binary: Link-level address (MAC) */ +#define IFLA_ADDRESS IFLA_ADDRESS + IFLA_BROADCAST = 2, /* binary: link-level broadcast address */ +#define IFLA_BROADCAST IFLA_BROADCAST + IFLA_IFNAME = 3, /* string: Interface name */ +#define IFLA_IFNAME IFLA_IFNAME + IFLA_MTU = 4, /* u32: Current interface L3 mtu */ +#define IFLA_MTU IFLA_MTU + IFLA_LINK = 5, /* u32: interface index */ +#define IFLA_LINK IFLA_LINK + IFLA_QDISC = 6, /* string: Queing policy (not supported) */ +#define IFLA_QDISC IFLA_QDISC + IFLA_STATS = 7, /* Interface counters */ +#define IFLA_STATS IFLA_STATS + IFLA_COST = 8, /* not supported */ +#define IFLA_COST IFLA_COST + IFLA_PRIORITY = 9, /* not supported */ +#define IFLA_PRIORITY IFLA_PRIORITY + IFLA_MASTER = 10, /* u32: parent interface ifindex */ +#define IFLA_MASTER IFLA_MASTER + IFLA_WIRELESS = 11, /* not supported */ +#define IFLA_WIRELESS IFLA_WIRELESS + IFLA_PROTINFO = 12, /* protocol-specific data */ +#define IFLA_PROTINFO IFLA_PROTINFO + IFLA_TXQLEN = 13, /* u32: transmit queue length */ +#define IFLA_TXQLEN IFLA_TXQLEN + IFLA_MAP = 14, /* not supported */ +#define IFLA_MAP IFLA_MAP + IFLA_WEIGHT = 15, /* not supported */ +#define IFLA_WEIGHT IFLA_WEIGHT + IFLA_OPERSTATE 
= 16, /* u8: ifOperStatus per RFC 2863 */ +#define IFLA_OPERSTATE IFLA_OPERSTATE + IFLA_LINKMODE = 17, /* u8: ifmedia (not supported) */ +#define IFLA_LINKMODE IFLA_LINKMODE + IFLA_LINKINFO = 18, /* nested: IFLA_INFO_ */ +#define IFLA_LINKINFO IFLA_LINKINFO + IFLA_NET_NS_PID = 19, /* u32: vnet id (not supported) */ +#define IFLA_NET_NS_PID IFLA_NET_NS_PID + IFLA_IFALIAS = 20, /* string: interface description */ +#define IFLA_IFALIAS IFLA_IFALIAS + IFLA_NUM_VF = 21, /* not supported */ +#define IFLA_NUM_VF IFLA_NUM_VF + IFLA_VFINFO_LIST= 22, /* not supported */ +#define IFLA_VFINFO_LIST IFLA_VFINFO_LIST + IFLA_STATS64 = 23, /* rtnl_link_stats64: iface stats */ +#define IFLA_STATS64 IFLA_STATS64 + IFLA_VF_PORTS, + IFLA_PORT_SELF, + IFLA_AF_SPEC, + IFLA_GROUP, /* Group the device belongs to */ + IFLA_NET_NS_FD, + IFLA_EXT_MASK, /* Extended info mask, VFs, etc */ + IFLA_PROMISCUITY, /* Promiscuity count: > 0 means acts PROMISC */ +#define IFLA_PROMISCUITY IFLA_PROMISCUITY + IFLA_NUM_TX_QUEUES, + IFLA_NUM_RX_QUEUES, + IFLA_CARRIER, + IFLA_PHYS_PORT_ID, + IFLA_CARRIER_CHANGES, + IFLA_PHYS_SWITCH_ID, + IFLA_LINK_NETNSID, + IFLA_PHYS_PORT_NAME, + IFLA_PROTO_DOWN, + IFLA_GSO_MAX_SEGS, + IFLA_GSO_MAX_SIZE, + IFLA_PAD, + IFLA_XDP, + IFLA_EVENT, + IFLA_NEW_NETNSID, + IFLA_IF_NETNSID, + IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */ + IFLA_CARRIER_UP_COUNT, + IFLA_CARRIER_DOWN_COUNT, + IFLA_NEW_IFINDEX, + IFLA_MIN_MTU, + IFLA_MAX_MTU, + IFLA_PROP_LIST, + IFLA_ALT_IFNAME, /* Alternative ifname */ + IFLA_PERM_ADDRESS, + IFLA_PROTO_DOWN_REASON, + IFLA_PARENT_DEV_NAME, + IFLA_PARENT_DEV_BUS_NAME, + IFLA_GRO_MAX_SIZE, + IFLA_TSO_MAX_SEGS, + IFLA_ALLMULTI, + IFLA_DEVLINK_PORT, + IFLA_GSO_IPV4_MAX_SIZE, + IFLA_GRO_IPV4_MAX_SIZE, + IFLA_FREEBSD, + __IFLA_MAX +}; +#define IFLA_MAX (__IFLA_MAX - 1) + +enum { + IFLAF_UNSPEC = 0, + IFLAF_ORIG_IFNAME = 1, /* string, original interface name at creation */ + IFLAF_ORIG_HWADDR = 2, /* binary, original hardware address */ + IFLAF_CAPS = 
3, /* bitset, interface capabilities */ + __IFLAF_MAX +}; +#define IFLAF_MAX (__IFLAF_MAX - 1) + +/* + * Attributes that can be used as filters: + * IFLA_IFNAME, IFLA_GROUP, IFLA_ALT_IFNAME + * Headers that can be used as filters: + * ifi_index, ifi_type + */ + +/* + * IFLA_OPERSTATE. + * The values below represent the possible + * states of ifOperStatus defined by RFC 2863 + */ +enum { + IF_OPER_UNKNOWN = 0, /* status can not be determined */ + IF_OPER_NOTPRESENT = 1, /* some (hardware) component not present */ + IF_OPER_DOWN = 2, /* down */ + IF_OPER_LOWERLAYERDOWN = 3, /* some lower-level interface is down */ + IF_OPER_TESTING = 4, /* in some test mode */ + IF_OPER_DORMANT = 5, /* "up" but waiting for some condition (802.1X) */ + IF_OPER_UP = 6, /* ready to pass packets */ +}; + +/* IFLA_STATS */ +struct rtnl_link_stats { + uint32_t rx_packets; /* total RX packets (IFCOUNTER_IPACKETS) */ + uint32_t tx_packets; /* total TX packets (IFCOUNTER_OPACKETS) */ + uint32_t rx_bytes; /* total RX bytes (IFCOUNTER_IBYTES) */ + uint32_t tx_bytes; /* total TX bytes (IFCOUNTER_OBYTES) */ + uint32_t rx_errors; /* RX errors (IFCOUNTER_IERRORS) */ + uint32_t tx_errors; /* RX errors (IFCOUNTER_OERRORS) */ + uint32_t rx_dropped; /* RX drop (no space in ring/no bufs) (IFCOUNTER_IQDROPS) */ + uint32_t tx_dropped; /* TX drop (IFCOUNTER_OQDROPS) */ + uint32_t multicast; /* RX multicast packets (IFCOUNTER_IMCASTS) */ + uint32_t collisions; /* not supported */ + uint32_t rx_length_errors; /* not supported */ + uint32_t rx_over_errors; /* not supported */ + uint32_t rx_crc_errors; /* not supported */ + uint32_t rx_frame_errors; /* not supported */ + uint32_t rx_fifo_errors; /* not supported */ + uint32_t rx_missed_errors; /* not supported */ + uint32_t tx_aborted_errors; /* not supported */ + uint32_t tx_carrier_errors; /* not supported */ + uint32_t tx_fifo_errors; /* not supported */ + uint32_t tx_heartbeat_errors; /* not supported */ + uint32_t tx_window_errors; /* not supported */ + 
uint32_t rx_compressed; /* not supported */ + uint32_t tx_compressed; /* not supported */ + uint32_t rx_nohandler; /* dropped due to no proto handler (IFCOUNTER_NOPROTO) */ +}; + +/* IFLA_STATS64 */ +struct rtnl_link_stats64 { + uint64_t rx_packets; /* total RX packets (IFCOUNTER_IPACKETS) */ + uint64_t tx_packets; /* total TX packets (IFCOUNTER_OPACKETS) */ + uint64_t rx_bytes; /* total RX bytes (IFCOUNTER_IBYTES) */ + uint64_t tx_bytes; /* total TX bytes (IFCOUNTER_OBYTES) */ + uint64_t rx_errors; /* RX errors (IFCOUNTER_IERRORS) */ + uint64_t tx_errors; /* RX errors (IFCOUNTER_OERRORS) */ + uint64_t rx_dropped; /* RX drop (no space in ring/no bufs) (IFCOUNTER_IQDROPS) */ + uint64_t tx_dropped; /* TX drop (IFCOUNTER_OQDROPS) */ + uint64_t multicast; /* RX multicast packets (IFCOUNTER_IMCASTS) */ + uint64_t collisions; /* not supported */ + uint64_t rx_length_errors; /* not supported */ + uint64_t rx_over_errors; /* not supported */ + uint64_t rx_crc_errors; /* not supported */ + uint64_t rx_frame_errors; /* not supported */ + uint64_t rx_fifo_errors; /* not supported */ + uint64_t rx_missed_errors; /* not supported */ + uint64_t tx_aborted_errors; /* not supported */ + uint64_t tx_carrier_errors; /* not supported */ + uint64_t tx_fifo_errors; /* not supported */ + uint64_t tx_heartbeat_errors; /* not supported */ + uint64_t tx_window_errors; /* not supported */ + uint64_t rx_compressed; /* not supported */ + uint64_t tx_compressed; /* not supported */ + uint64_t rx_nohandler; /* dropped due to no proto handler (IFCOUNTER_NOPROTO) */ +}; + +/* IFLA_LINKINFO child nlattr types */ +enum { + IFLA_INFO_UNSPEC, + IFLA_INFO_KIND = 1, /* string, link type ("vlan") */ + IFLA_INFO_DATA = 2, /* Per-link-type custom data */ + IFLA_INFO_XSTATS = 3, + IFLA_INFO_SLAVE_KIND = 4, + IFLA_INFO_SLAVE_DATA = 5, + __IFLA_INFO_MAX, +}; +#define IFLA_INFO_MAX (__IFLA_INFO_MAX - 1) + +/* IFLA_INFO_DATA vlan attributes */ +enum { + IFLA_VLAN_UNSPEC, + IFLA_VLAN_ID, + IFLA_VLAN_FLAGS, + 
IFLA_VLAN_EGRESS_QOS, + IFLA_VLAN_INGRESS_QOS, + IFLA_VLAN_PROTOCOL, + __IFLA_VLAN_MAX, +}; + +#define IFLA_VLAN_MAX (__IFLA_VLAN_MAX - 1) +struct ifla_vlan_flags { + uint32_t flags; + uint32_t mask; +}; + +#endif diff --git a/sys/netlink/route/neigh.c b/sys/netlink/route/neigh.c new file mode 100644 index 000000000000..9eaaae263254 --- /dev/null +++ b/sys/netlink/route/neigh.c @@ -0,0 +1,601 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +#include "opt_inet.h" +#include "opt_inet6.h" +#include <sys/types.h> +#include <sys/eventhandler.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/syslog.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_private.h> +#include <net/if_llatbl.h> +#include <netlink/netlink.h> +#include <netlink/netlink_ctl.h> +#include <netlink/netlink_route.h> +#include <netlink/route/route_var.h> + +#include <netinet6/in6_var.h> /* nd6.h requires this */ +#include <netinet6/nd6.h> /* nd6 state machine */ +#include <netinet6/scope6_var.h> /* scope deembedding */ + +#define DEBUG_MOD_NAME nl_neigh +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include <netlink/netlink_debug.h> +_DECLARE_DEBUG(LOG_INFO); + +static int lle_families[] = { AF_INET, AF_INET6 }; + +static eventhandler_tag lle_event_p; + +struct netlink_walkargs { + struct nl_writer *nw; + struct nlmsghdr hdr; + struct nlpcb *so; + if_t ifp; + int family; + int error; + int count; + int dumped; +}; + +static int +lle_state_to_nl_state(int family, struct llentry *lle) +{ + int state = lle->ln_state; + + switch (family) { + case AF_INET: + if (lle->la_flags & (LLE_STATIC | LLE_IFADDR)) + state = 1; + switch (state) { + case 0: /* ARP_LLINFO_INCOMPLETE */ + return (NUD_INCOMPLETE); + case 1: /* ARP_LLINFO_REACHABLE */ + return (NUD_REACHABLE); + case 2: /* ARP_LLINFO_VERIFY */ + return (NUD_PROBE); + } + break; + case AF_INET6: + switch (state) { + case ND6_LLINFO_INCOMPLETE: + return (NUD_INCOMPLETE); + case ND6_LLINFO_REACHABLE: + return (NUD_REACHABLE); + case ND6_LLINFO_STALE: + return (NUD_STALE); + case ND6_LLINFO_DELAY: + return (NUD_DELAY); + case ND6_LLINFO_PROBE: + return (NUD_PROBE); + } + break; + } + + return (NUD_NONE); +} + +static uint32_t +lle_flags_to_nl_flags(const struct llentry *lle) +{ + uint32_t nl_flags = 0; + + if (lle->la_flags & LLE_IFADDR) + nl_flags |= NTF_SELF; + if (lle->la_flags & LLE_PUB) + nl_flags |= 
NTF_PROXY; + if (lle->la_flags & LLE_STATIC) + nl_flags |= NTF_STICKY; + if (lle->ln_router != 0) + nl_flags |= NTF_ROUTER; + + return (nl_flags); +} + +static uint32_t +get_lle_next_ts(const struct llentry *lle) +{ + if (lle->la_expire == 0) + return (0); + return (lle->la_expire + lle->lle_remtime / hz + time_second - time_uptime); +} + +static int +dump_lle_locked(struct llentry *lle, void *arg) +{ + struct netlink_walkargs *wa = (struct netlink_walkargs *)arg; + struct nlmsghdr *hdr = &wa->hdr; + struct nl_writer *nw = wa->nw; + struct ndmsg *ndm; +#if defined(INET) || defined(INET6) + union { + struct in_addr in; + struct in6_addr in6; + } addr; +#endif + + IF_DEBUG_LEVEL(LOG_DEBUG2) { + char llebuf[NHOP_PRINT_BUFSIZE]; + llentry_print_buf_lltable(lle, llebuf, sizeof(llebuf)); + NL_LOG(LOG_DEBUG2, "dumping %s", llebuf); + } + + if (!nlmsg_reply(nw, hdr, sizeof(struct ndmsg))) + goto enomem; + + ndm = nlmsg_reserve_object(nw, struct ndmsg); + ndm->ndm_family = wa->family; + ndm->ndm_ifindex = if_getindex(wa->ifp); + ndm->ndm_state = lle_state_to_nl_state(wa->family, lle); + ndm->ndm_flags = lle_flags_to_nl_flags(lle); + + switch (wa->family) { +#ifdef INET + case AF_INET: + addr.in = lle->r_l3addr.addr4; + nlattr_add(nw, NDA_DST, 4, &addr); + break; +#endif +#ifdef INET6 + case AF_INET6: + addr.in6 = lle->r_l3addr.addr6; + in6_clearscope(&addr.in6); + nlattr_add(nw, NDA_DST, 16, &addr); + break; +#endif + } + + if (lle->r_flags & RLLE_VALID) { + /* Has L2 */ + int addrlen = if_getaddrlen(wa->ifp); + nlattr_add(nw, NDA_LLADDR, addrlen, lle->ll_addr); + } + + nlattr_add_u32(nw, NDA_PROBES, lle->la_asked); + + struct nda_cacheinfo *cache; + cache = nlmsg_reserve_attr(nw, NDA_CACHEINFO, struct nda_cacheinfo); + if (cache == NULL) + goto enomem; + /* TODO: provide confirmed/updated */ + cache->ndm_refcnt = lle->lle_refcnt; + + int off = nlattr_add_nested(nw, NDA_FREEBSD); + if (off != 0) { + nlattr_add_u32(nw, NDAF_NEXT_STATE_TS, get_lle_next_ts(lle)); + + 
nlattr_set_len(nw, off); + } + + if (nlmsg_end(nw)) + return (0); +enomem: + NL_LOG(LOG_DEBUG, "unable to dump lle state (ENOMEM)"); + nlmsg_abort(nw); + return (ENOMEM); +} + +static int +dump_lle(struct lltable *llt, struct llentry *lle, void *arg) +{ + int error; + + LLE_RLOCK(lle); + error = dump_lle_locked(lle, arg); + LLE_RUNLOCK(lle); + return (error); +} + +static bool +dump_llt(struct lltable *llt, struct netlink_walkargs *wa) +{ + lltable_foreach_lle(llt, dump_lle, wa); + + return (true); +} + +static int +dump_llts_iface(struct netlink_walkargs *wa, if_t ifp, int family) +{ + int error = 0; + + wa->ifp = ifp; + for (int i = 0; i < sizeof(lle_families) / sizeof(int); i++) { + int fam = lle_families[i]; + struct lltable *llt = lltable_get(ifp, fam); + if (llt != NULL && (family == 0 || family == fam)) { + wa->count++; + wa->family = fam; + if (!dump_llt(llt, wa)) { + error = ENOMEM; + break; + } + wa->dumped++; + } + } + return (error); +} + +static int +dump_llts(struct netlink_walkargs *wa, if_t ifp, int family) +{ + NL_LOG(LOG_DEBUG2, "Start dump ifp=%s family=%d", ifp ? 
if_name(ifp) : "NULL", family); + + wa->hdr.nlmsg_flags |= NLM_F_MULTI; + + if (ifp != NULL) { + dump_llts_iface(wa, ifp, family); + } else { + struct if_iter it; + + for (ifp = if_iter_start(&it); ifp != NULL; ifp = if_iter_next(&it)) { + dump_llts_iface(wa, ifp, family); + } + if_iter_finish(&it); + } + + NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa->count, wa->dumped); + + if (!nlmsg_end_dump(wa->nw, wa->error, &wa->hdr)) { + NL_LOG(LOG_DEBUG, "Unable to add new message"); + return (ENOMEM); + } + + return (0); +} + +static int +get_lle(struct netlink_walkargs *wa, if_t ifp, int family, struct sockaddr *dst) +{ + struct lltable *llt = lltable_get(ifp, family); + if (llt == NULL) + return (ESRCH); + + struct llentry *lle = lla_lookup(llt, LLE_UNLOCKED, dst); + if (lle == NULL) + return (ESRCH); + + wa->ifp = ifp; + wa->family = family; + + return (dump_lle(llt, lle, wa)); +} + +static void +set_scope6(struct sockaddr *sa, if_t ifp) +{ +#ifdef INET6 + if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) { + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; + + if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) + in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp)); + } +#endif +} + +struct nl_parsed_neigh { + struct sockaddr *nda_dst; + struct ifnet *nda_ifp; + struct nlattr *nda_lladdr; + uint32_t ndaf_next_ts; + uint32_t ndm_flags; + uint16_t ndm_state; + uint8_t ndm_family; +}; + +#define _IN(_field) offsetof(struct ndmsg, _field) +#define _OUT(_field) offsetof(struct nl_parsed_neigh, _field) +static const struct nlattr_parser nla_p_neigh_fbsd[] = { + { .type = NDAF_NEXT_STATE_TS, .off = _OUT(ndaf_next_ts), .cb = nlattr_get_uint32 }, +}; +NL_DECLARE_ATTR_PARSER(neigh_fbsd_parser, nla_p_neigh_fbsd); + +static const struct nlfield_parser nlf_p_neigh[] = { + { .off_in = _IN(ndm_family), .off_out = _OUT(ndm_family), .cb = nlf_get_u8 }, + { .off_in = _IN(ndm_flags), .off_out = _OUT(ndm_flags), .cb = nlf_get_u8_u32 }, + { .off_in = 
_IN(ndm_state), .off_out = _OUT(ndm_state), .cb = nlf_get_u16 }, + { .off_in = _IN(ndm_ifindex), .off_out = _OUT(nda_ifp), .cb = nlf_get_ifpz }, +}; + +static const struct nlattr_parser nla_p_neigh[] = { + { .type = NDA_DST, .off = _OUT(nda_dst), .cb = nlattr_get_ip }, + { .type = NDA_LLADDR, .off = _OUT(nda_lladdr), .cb = nlattr_get_nla }, + { .type = NDA_IFINDEX, .off = _OUT(nda_ifp), .cb = nlattr_get_ifp }, + { .type = NDA_FLAGS_EXT, .off = _OUT(ndm_flags), .cb = nlattr_get_uint32 }, + { .type = NDA_FREEBSD, .arg = &neigh_fbsd_parser, .cb = nlattr_get_nested }, +}; +#undef _IN +#undef _OUT + +static bool +post_p_neigh(void *_attrs, struct nl_pstate *npt __unused) +{ + struct nl_parsed_neigh *attrs = (struct nl_parsed_neigh *)_attrs; + + set_scope6(attrs->nda_dst, attrs->nda_ifp); + return (true); +} +NL_DECLARE_PARSER_EXT(ndmsg_parser, struct ndmsg, NULL, nlf_p_neigh, nla_p_neigh, post_p_neigh); + + +/* + * type=RTM_NEWNEIGH, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1661941473, pid=0}, + * {ndm_family=AF_INET6, ndm_ifindex=if_nametoindex("enp0s31f6"), ndm_state=NUD_PERMANENT, ndm_flags=0, ndm_type=RTN_UNSPEC}, + * [ + * {{nla_len=20, nla_type=NDA_DST}, inet_pton(AF_INET6, "2a01:4f8:13a:70c::3")}, + * {{nla_len=10, nla_type=NDA_LLADDR}, 20:4e:71:62:ae:f2}]}, iov_len=60} + */ + +static int +rtnl_handle_newneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) +{ + int error; + + struct nl_parsed_neigh attrs = {}; + error = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &attrs); + if (error != 0) + return (error); + + if (attrs.nda_ifp == NULL || attrs.nda_dst == NULL || attrs.nda_lladdr == NULL) { + if (attrs.nda_ifp == NULL) + NLMSG_REPORT_ERR_MSG(npt, "NDA_IFINDEX / ndm_ifindex not set"); + if (attrs.nda_dst == NULL) + NLMSG_REPORT_ERR_MSG(npt, "NDA_DST not set"); + if (attrs.nda_lladdr == NULL) + NLMSG_REPORT_ERR_MSG(npt, "NDA_LLADDR not set"); + return (EINVAL); + } + + if (attrs.nda_dst->sa_family != attrs.ndm_family) { + 
NLMSG_REPORT_ERR_MSG(npt, + "NDA_DST family (%d) is different from ndm_family (%d)", + attrs.nda_dst->sa_family, attrs.ndm_family); + return (EINVAL); + } + + int addrlen = if_getaddrlen(attrs.nda_ifp); + if (attrs.nda_lladdr->nla_len != sizeof(struct nlattr) + addrlen) { + NLMSG_REPORT_ERR_MSG(npt, + "NDA_LLADDR address length (%d) is different from expected (%d)", + (int)attrs.nda_lladdr->nla_len - (int)sizeof(struct nlattr), addrlen); + return (EINVAL); + } + + const uint16_t supported_flags = NTF_PROXY | NTF_STICKY; + if ((attrs.ndm_flags & supported_flags) != attrs.ndm_flags) { + NLMSG_REPORT_ERR_MSG(npt, "ndm_flags %X not supported", + attrs.ndm_flags &~ supported_flags); + return (ENOTSUP); + } + + /* Replacement requires new entry creation anyway */ + if ((hdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_REPLACE)) == 0) + return (ENOTSUP); + + struct lltable *llt = lltable_get(attrs.nda_ifp, attrs.ndm_family); + if (llt == NULL) + return (EAFNOSUPPORT); + + + uint8_t linkhdr[LLE_MAX_LINKHDR]; + size_t linkhdrsize = sizeof(linkhdr); + int lladdr_off = 0; + if (lltable_calc_llheader(attrs.nda_ifp, attrs.ndm_family, + (char *)(attrs.nda_lladdr + 1), linkhdr, &linkhdrsize, &lladdr_off) != 0) { + NLMSG_REPORT_ERR_MSG(npt, "unable to calculate lle prepend data"); + return (EINVAL); + } + + int lle_flags = (attrs.ndm_flags & NTF_PROXY) ? 
LLE_PUB : 0; + if (attrs.ndm_flags & NTF_STICKY) + lle_flags |= LLE_STATIC; + struct llentry *lle = lltable_alloc_entry(llt, lle_flags, attrs.nda_dst); + if (lle == NULL) + return (ENOMEM); + lltable_set_entry_addr(attrs.nda_ifp, lle, linkhdr, linkhdrsize, lladdr_off); + + if (attrs.ndm_flags & NTF_STICKY) + lle->la_expire = 0; + else + lle->la_expire = attrs.ndaf_next_ts - time_second + time_uptime; + + /* llentry created, try to insert or update */ + IF_AFDATA_WLOCK(attrs.nda_ifp); + LLE_WLOCK(lle); + struct llentry *lle_tmp = lla_lookup(llt, LLE_EXCLUSIVE, attrs.nda_dst); + if (lle_tmp != NULL) { + error = EEXIST; + if (hdr->nlmsg_flags & NLM_F_REPLACE) { + error = EPERM; + if ((lle_tmp->la_flags & LLE_IFADDR) == 0) { + error = 0; /* success */ + lltable_unlink_entry(llt, lle_tmp); + llentry_free(lle_tmp); + lle_tmp = NULL; + lltable_link_entry(llt, lle); + } + } + if (lle_tmp) + LLE_WUNLOCK(lle_tmp); + } else { + if (hdr->nlmsg_flags & NLM_F_CREATE) + lltable_link_entry(llt, lle); + else + error = ENOENT; + } + IF_AFDATA_WUNLOCK(attrs.nda_ifp); + + if (error != 0) { + /* throw away the newly allocated llentry */ + llentry_free(lle); + return (error); + } + + /* XXX: We're inside epoch */ + EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED); + LLE_WUNLOCK(lle); + llt->llt_post_resolved(llt, lle); + + return (0); +} + +static int +rtnl_handle_delneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) +{ + int error; + + struct nl_parsed_neigh attrs = {}; + error = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &attrs); + if (error != 0) + return (error); + + if (attrs.nda_dst == NULL) { + NLMSG_REPORT_ERR_MSG(npt, "NDA_DST not set"); + return (EINVAL); + } + + if (attrs.nda_ifp == NULL) { + NLMSG_REPORT_ERR_MSG(npt, "no ifindex provided"); + return (EINVAL); + } + + struct lltable *llt = lltable_get(attrs.nda_ifp, attrs.ndm_family); + if (llt == NULL) + return (EAFNOSUPPORT); + + return (lltable_delete_addr(llt, 0, attrs.nda_dst)); +} + +static int 
+rtnl_handle_getneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) +{ + int error; + + struct nl_parsed_neigh attrs = {}; + error = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &attrs); + if (error != 0) + return (error); + + if (attrs.nda_dst != NULL && attrs.nda_ifp == NULL) { + NLMSG_REPORT_ERR_MSG(npt, "has NDA_DST but no ifindex provided"); + return (EINVAL); + } + + struct netlink_walkargs wa = { + .so = nlp, + .nw = npt->nw, + .hdr.nlmsg_pid = hdr->nlmsg_pid, + .hdr.nlmsg_seq = hdr->nlmsg_seq, + .hdr.nlmsg_flags = hdr->nlmsg_flags, + .hdr.nlmsg_type = NL_RTM_NEWNEIGH, + }; + + if (attrs.nda_dst == NULL) + error = dump_llts(&wa, attrs.nda_ifp, attrs.ndm_family); + else + error = get_lle(&wa, attrs.nda_ifp, attrs.ndm_family, attrs.nda_dst); + + return (error); +} + +static const struct rtnl_cmd_handler cmd_handlers[] = { + { + .cmd = NL_RTM_NEWNEIGH, + .name = "RTM_NEWNEIGH", + .cb = &rtnl_handle_newneigh, + .priv = PRIV_NET_ROUTE, + }, + { + .cmd = NL_RTM_DELNEIGH, + .name = "RTM_DELNEIGH", + .cb = &rtnl_handle_delneigh, + .priv = PRIV_NET_ROUTE, + }, + { + .cmd = NL_RTM_GETNEIGH, + .name = "RTM_GETNEIGH", + .cb = &rtnl_handle_getneigh, + } +}; + +static void +rtnl_lle_event(void *arg __unused, struct llentry *lle, int evt) +{ + struct nl_writer nw; + if_t ifp; + int family; + + LLE_WLOCK_ASSERT(lle); + + ifp = lltable_get_ifp(lle->lle_tbl); + family = lltable_get_af(lle->lle_tbl); + + if (family != AF_INET && family != AF_INET6) + return; + + int nlmsgs_type = evt == LLENTRY_RESOLVED ? 
NL_RTM_NEWNEIGH : NL_RTM_DELNEIGH; + + if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEIGH, 0, + false)) { + NL_LOG(LOG_DEBUG, "error allocating group writer"); + return; + } + + struct netlink_walkargs wa = { + .hdr.nlmsg_type = nlmsgs_type, + .nw = &nw, + .ifp = ifp, + .family = family, + }; + + dump_lle_locked(lle, &wa); + nlmsg_flush(&nw); +} + +static const struct nlhdr_parser *all_parsers[] = { &ndmsg_parser, &neigh_fbsd_parser }; + +void +rtnl_neighs_init(void) +{ + NL_VERIFY_PARSERS(all_parsers); + rtnl_register_messages(cmd_handlers, nitems(cmd_handlers)); + lle_event_p = EVENTHANDLER_REGISTER(lle_event, rtnl_lle_event, NULL, + EVENTHANDLER_PRI_ANY); +} + +void +rtnl_neighs_destroy(void) +{ + EVENTHANDLER_DEREGISTER(lle_event, lle_event_p); +} diff --git a/sys/netlink/route/neigh.h b/sys/netlink/route/neigh.h new file mode 100644 index 000000000000..10bc3b93d16a --- /dev/null +++ b/sys/netlink/route/neigh.h @@ -0,0 +1,111 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Neighbors-related (RTM_<NEW|DEL|GET>NEIGH) message header and attributes. + */ + +#ifndef _NETLINK_ROUTE_NEIGH_H_ +#define _NETLINK_ROUTE_NEIGH_H_ + +/* Base header for all of the relevant messages */ +struct ndmsg { + uint8_t ndm_family; + uint8_t ndm_pad1; + uint16_t ndm_pad2; + int32_t ndm_ifindex; + uint16_t ndm_state; + uint8_t ndm_flags; + uint8_t ndm_type; +}; + +/* Attributes */ +enum { + NDA_UNSPEC, + NDA_DST, /* binary: neigh l3 address */ + NDA_LLADDR, /* binary: neigh link-level address */ + NDA_CACHEINFO, /* binary, struct nda_cacheinfo */ + NDA_PROBES, /* u32: number of probes sent */ + NDA_VLAN, /* upper 802.1Q tag */ + NDA_PORT, /* not supported */ + NDA_VNI, /* not supported */ + NDA_IFINDEX, /* interface index */ + NDA_MASTER, /* not supported */ + NDA_LINK_NETNSID, /* not supported */ + NDA_SRC_VNI, /* not supported */ + NDA_PROTOCOL, /* XXX */ + NDA_NH_ID, /* not supported */ + NDA_FDB_EXT_ATTRS, /* not supported */ + NDA_FLAGS_EXT, /* u32: ndm_flags */ + NDA_NDM_STATE_MASK, /* XXX */ + NDA_NDM_FLAGS_MASK, /* XXX */ + NDA_FREEBSD, /* nested: FreeBSD-specific */ + __NDA_MAX +}; + +#define NDA_MAX (__NDA_MAX - 1) + +enum { + NDAF_UNSPEC, + NDAF_NEXT_STATE_TS, /* (u32) seconds from time_uptime when moving to the next state */ +}; + + +/* ndm_flags / NDA_FLAGS_EXT */ +#define NTF_USE 0x0001 /* XXX */ +#define NTF_SELF 0x0002 /* local station */ +#define NTF_MASTER 0x0004 /* XXX */ +#define NTF_PROXY 0x0008 /* 
proxy entry */ +#define NTF_EXT_LEARNED 0x0010 /* not used */ +#define NTF_OFFLOADED 0x0020 /* not used */ +#define NTF_STICKY 0x0040 /* permanent entry */ +#define NTF_ROUTER 0x0080 /* dst indicated itself as a router */ +/* start of NDA_FLAGS_EXT */ +#define NTF_EXT_MANAGED 0x0100 /* not used */ + +/* ndm_state */ +#define NUD_INCOMPLETE 0x01 /* No lladdr, address resolution in progress */ +#define NUD_REACHABLE 0x02 /* reachable & recently resolved */ +#define NUD_STALE 0x04 /* has lladdr but it's stale */ +#define NUD_DELAY 0x08 /* has lladdr, is stale, probes delayed */ +#define NUD_PROBE 0x10 /* has lladdr, is stale, probes sent */ +#define NUD_FAILED 0x20 /* unused */ + +/* Dummy states */ +#define NUD_NOARP 0x40 /* not used */ +#define NUD_PERMANENT 0x80 /* not flushed */ +#define NUD_NONE 0x00 + +/* NDA_CACHEINFO */ +struct nda_cacheinfo { + uint32_t ndm_confirmed; /* seconds since ARP/ND was received from neigh */ + uint32_t ndm_used; /* seconds since last used (not provided) */ + uint32_t ndm_updated; /* seconds since state was updated last */ + uint32_t ndm_refcnt; /* number of references held */ +}; + +#endif diff --git a/sys/netlink/route/nexthop.c b/sys/netlink/route/nexthop.c new file mode 100644 index 000000000000..30aa3dd72534 --- /dev/null +++ b/sys/netlink/route/nexthop.c @@ -0,0 +1,1123 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_route.h" +#include <sys/types.h> +#include <sys/ck.h> +#include <sys/epoch.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/rmlock.h> +#include <sys/socket.h> + +#include <net/if.h> +#include <net/route.h> +#include <net/route/nhop.h> +#include <net/route/nhop_utils.h> + +#include <net/route/route_ctl.h> +#include <net/route/route_var.h> +#include <netinet6/scope6_var.h> +#include <netlink/netlink.h> +#include <netlink/netlink_ctl.h> +#include <netlink/netlink_route.h> +#include <netlink/route/route_var.h> + +#define DEBUG_MOD_NAME nl_nhop +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include <netlink/netlink_debug.h> +_DECLARE_DEBUG(LOG_INFO); + +/* + * This file contains the logic to maintain kernel nexthops and + * nexhop groups based om the data provided by the user. + * + * Kernel stores (nearly) all of the routing data in the nexthops, + * including the prefix-specific flags (NHF_HOST and NHF_DEFAULT). + * + * Netlink API provides higher-level abstraction for the user. Each + * user-created nexthop may map to multiple kernel nexthops. 
+ * + * The following variations require separate kernel nexthop to be + * created: + * * prefix flags (NHF_HOST, NHF_DEFAULT) + * * using IPv6 gateway for IPv4 routes + * * different fibnum + * + * These kernel nexthops have the lifetime bound to the lifetime of + * the user_nhop object. They are not collected until user requests + * to delete the created user_nhop. + * + */ +struct user_nhop { + uint32_t un_idx; /* Userland-provided index */ + uint32_t un_fibfam; /* fibnum+af(as highest byte) */ + uint8_t un_protocol; /* protocol that install the record */ + struct nhop_object *un_nhop; /* "production" nexthop */ + struct nhop_object *un_nhop_src; /* nexthop to copy from */ + struct weightened_nhop *un_nhgrp_src; /* nexthops for nhg */ + uint32_t un_nhgrp_count; /* number of nexthops */ + struct user_nhop *un_next; /* next item in hash chain */ + struct user_nhop *un_nextchild; /* master -> children */ + struct epoch_context un_epoch_ctx; /* epoch ctl helper */ +}; + +/* produce hash value for an object */ +#define unhop_hash_obj(_obj) (hash_unhop(_obj)) +/* compare two objects */ +#define unhop_cmp(_one, _two) (cmp_unhop(_one, _two)) +/* next object accessor */ +#define unhop_next(_obj) (_obj)->un_next + +CHT_SLIST_DEFINE(unhop, struct user_nhop); + +struct unhop_ctl { + struct unhop_head un_head; + struct rmlock un_lock; +}; +#define UN_LOCK_INIT(_ctl) rm_init(&(_ctl)->un_lock, "unhop_ctl") +#define UN_TRACKER struct rm_priotracker un_tracker +#define UN_RLOCK(_ctl) rm_rlock(&((_ctl)->un_lock), &un_tracker) +#define UN_RUNLOCK(_ctl) rm_runlock(&((_ctl)->un_lock), &un_tracker) + +#define UN_WLOCK(_ctl) rm_wlock(&(_ctl)->un_lock); +#define UN_WUNLOCK(_ctl) rm_wunlock(&(_ctl)->un_lock); + +VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL; +#define V_un_ctl VNET(un_ctl) + +static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size); +static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b); +static unsigned int hash_unhop(const 
struct user_nhop *obj); + +static void destroy_unhop(struct user_nhop *unhop); +static struct nhop_object *clone_unhop(const struct user_nhop *unhop, + uint32_t fibnum, int family, int nh_flags); + +static int +cmp_unhop(const struct user_nhop *a, const struct user_nhop *b) +{ + return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam); +} + +/* + * Hash callback: calculate hash of an object + */ +static unsigned int +hash_unhop(const struct user_nhop *obj) +{ + return (obj->un_idx ^ obj->un_fibfam); +} + +#define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0) + +/* + * Factory interface for creating matching kernel nexthops/nexthop groups + * + * @uidx: userland nexhop index used to create the nexthop + * @fibnum: fibnum nexthop will be used in + * @family: upper family nexthop will be used in + * @nh_flags: desired nexthop prefix flags + * @perror: pointer to store error to + * + * Returns referenced nexthop linked to @fibnum/@family rib on success. + */ +struct nhop_object * +nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx, + int nh_flags, int *perror) +{ + struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); + UN_TRACKER; + + if (__predict_false(ctl == NULL)) + return (NULL); + + struct user_nhop key= { + .un_idx = uidx, + .un_fibfam = fibnum | ((uint32_t)family) << 24, + }; + struct user_nhop *unhop; + + nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT); + + if (__predict_false(family == 0)) + return (NULL); + + UN_RLOCK(ctl); + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); + if (unhop != NULL) { + struct nhop_object *nh = unhop->un_nhop; + UN_RLOCK(ctl); + *perror = 0; + nhop_ref_any(nh); + return (nh); + } + + /* + * Exact nexthop not found. Search for template nexthop to clone from. 
+ */ + key.un_fibfam = 0; + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); + if (unhop == NULL) { + UN_RUNLOCK(ctl); + *perror = ESRCH; + return (NULL); + } + + UN_RUNLOCK(ctl); + + /* Create entry to insert first */ + struct user_nhop *un_new, *un_tmp; + un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); + if (un_new == NULL) { + *perror = ENOMEM; + return (NULL); + } + un_new->un_idx = uidx; + un_new->un_fibfam = fibnum | ((uint32_t)family) << 24; + + /* Relying on epoch to protect unhop here */ + un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags); + if (un_new->un_nhop == NULL) { + free(un_new, M_NETLINK); + *perror = ENOMEM; + return (NULL); + } + + /* Insert back and report */ + UN_WLOCK(ctl); + + /* First, find template record once again */ + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); + if (unhop == NULL) { + /* Someone deleted the nexthop during the call */ + UN_WUNLOCK(ctl); + *perror = ESRCH; + destroy_unhop(un_new); + return (NULL); + } + + /* Second, check the direct match */ + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp); + struct nhop_object *nh; + if (un_tmp != NULL) { + /* Another thread already created the desired nextop, use it */ + nh = un_tmp->un_nhop; + } else { + /* Finally, insert the new nexthop and link it to the primary */ + nh = un_new->un_nhop; + CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new); + un_new->un_nextchild = unhop->un_nextchild; + unhop->un_nextchild = un_new; + un_new = NULL; + NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh); + } + + UN_WUNLOCK(ctl); + + if (un_new != NULL) + destroy_unhop(un_new); + + *perror = 0; + nhop_ref_any(nh); + return (nh); +} + +static struct user_nhop * +nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx) +{ + struct user_nhop key= { .un_idx = uidx }; + struct user_nhop *unhop = NULL; + UN_TRACKER; + + UN_RLOCK(ctl); + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); + UN_RUNLOCK(ctl); + + return (unhop); +} 
+ +#define MAX_STACK_NHOPS 4 +static struct nhop_object * +clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags) +{ +#ifdef ROUTE_MPATH + const struct weightened_nhop *wn; + struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS]; + uint32_t num_nhops; +#endif + struct nhop_object *nh = NULL; + int error; + + if (unhop->un_nhop_src != NULL) { + IF_DEBUG_LEVEL(LOG_DEBUG2) { + char nhbuf[NHOP_PRINT_BUFSIZE]; + nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf)); + FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src, + "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum, + family, nh_flags); + } + struct nhop_object *nh; + nh = nhop_alloc(fibnum, AF_UNSPEC); + if (nh == NULL) + return (NULL); + nhop_copy(nh, unhop->un_nhop_src); + /* Check that nexthop gateway is compatible with the new family */ + if (!nhop_set_upper_family(nh, family)) { + nhop_free(nh); + return (NULL); + } + nhop_set_uidx(nh, unhop->un_idx); + nhop_set_pxtype_flag(nh, nh_flags); + return (nhop_get_nhop(nh, &error)); + } +#ifdef ROUTE_MPATH + wn = unhop->un_nhgrp_src; + num_nhops = unhop->un_nhgrp_count; + + if (num_nhops > MAX_STACK_NHOPS) { + wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT); + if (wn_new == NULL) + return (NULL); + } else + wn_new = wn_base; + + for (int i = 0; i < num_nhops; i++) { + uint32_t uidx = nhop_get_uidx(wn[i].nh); + MPASS(uidx != 0); + wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error); + if (error != 0) + break; + wn_new[i].weight = wn[i].weight; + } + + if (error == 0) { + struct rib_head *rh = nhop_get_rh(wn_new[0].nh); + struct nhgrp_object *nhg; + + error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg); + nh = (struct nhop_object *)nhg; + } + + if (wn_new != wn_base) + free(wn_new, M_TEMP); +#endif + return (nh); +} + +static void +destroy_unhop(struct user_nhop *unhop) +{ + if (unhop->un_nhop != NULL) + nhop_free_any(unhop->un_nhop); + if (unhop->un_nhop_src != NULL) + 
nhop_free_any(unhop->un_nhop_src); + free(unhop, M_NETLINK); +} + +static void +destroy_unhop_epoch(epoch_context_t ctx) +{ + struct user_nhop *unhop; + + unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx); + + destroy_unhop(unhop); +} + +static uint32_t +find_spare_uidx(struct unhop_ctl *ctl) +{ + struct user_nhop *unhop, key = {}; + uint32_t uidx = 0; + UN_TRACKER; + + UN_RLOCK(ctl); + /* This should return spare uid with 75% of 65k used in ~99/100 cases */ + for (int i = 0; i < 16; i++) { + key.un_idx = (arc4random() % 65536) + 65536 * 4; + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); + if (unhop == NULL) { + uidx = key.un_idx; + break; + } + } + UN_RUNLOCK(ctl); + + return (uidx); +} + + +/* + * Actual netlink code + */ +struct netlink_walkargs { + struct nl_writer *nw; + struct nlmsghdr hdr; + struct nlpcb *so; + int family; + int error; + int count; + int dumped; +}; +#define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem + +static bool +dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr, + struct nl_writer *nw) +{ + + if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) + goto enomem; + + struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); + nhm->nh_family = AF_UNSPEC; + nhm->nh_scope = 0; + nhm->nh_protocol = unhop->un_protocol; + nhm->nh_flags = 0; + + nlattr_add_u32(nw, NHA_ID, unhop->un_idx); + nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH); + + struct weightened_nhop *wn = unhop->un_nhgrp_src; + uint32_t num_nhops = unhop->un_nhgrp_count; + /* TODO: a better API? 
*/ + int nla_len = sizeof(struct nlattr); + nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp)); + struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); + if (nla == NULL) + goto enomem; + nla->nla_type = NHA_GROUP; + nla->nla_len = nla_len; + for (int i = 0; i < num_nhops; i++) { + struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i]; + grp->id = nhop_get_uidx(wn[i].nh); + grp->weight = wn[i].weight; + grp->resvd1 = 0; + grp->resvd2 = 0; + } + + if (nlmsg_end(nw)) + return (true); +enomem: + NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory"); + nlmsg_abort(nw); + return (false); +} + +static bool +dump_nhop(const struct nhop_object *nh, uint32_t uidx, struct nlmsghdr *hdr, + struct nl_writer *nw) +{ + if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) + goto enomem; + + struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); + ENOMEM_IF_NULL(nhm); + nhm->nh_family = nhop_get_neigh_family(nh); + nhm->nh_scope = 0; // XXX: what's that? 
+ nhm->nh_protocol = nhop_get_origin(nh); + nhm->nh_flags = 0; + + if (uidx != 0) + nlattr_add_u32(nw, NHA_ID, uidx); + if (nh->nh_flags & NHF_BLACKHOLE) { + nlattr_add_flag(nw, NHA_BLACKHOLE); + goto done; + } + nlattr_add_u32(nw, NHA_OIF, if_getindex(nh->nh_ifp)); + + switch (nh->gw_sa.sa_family) { +#ifdef INET + case AF_INET: + nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr); + break; +#endif +#ifdef INET6 + case AF_INET6: + { + struct in6_addr addr = nh->gw6_sa.sin6_addr; + in6_clearscope(&addr); + nlattr_add(nw, NHA_GATEWAY, 16, &addr); + break; + } +#endif + } + + int off = nlattr_add_nested(nw, NHA_FREEBSD); + if (off != 0) { + nlattr_add_u32(nw, NHAF_AIF, if_getindex(nh->nh_aifp)); + + if (uidx == 0) { + nlattr_add_u32(nw, NHAF_KID, nhop_get_idx(nh)); + nlattr_add_u32(nw, NHAF_FAMILY, nhop_get_upper_family(nh)); + nlattr_add_u32(nw, NHAF_TABLE, nhop_get_fibnum(nh)); + } + + nlattr_set_len(nw, off); + } + +done: + if (nlmsg_end(nw)) + return (true); +enomem: + nlmsg_abort(nw); + return (false); +} + +static void +dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr, + struct nl_writer *nw) +{ + if (unhop->un_nhop_src != NULL) + dump_nhop(unhop->un_nhop_src, unhop->un_idx, hdr, nw); + else + dump_nhgrp(unhop, hdr, nw); +} + +static int +delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx) +{ + struct user_nhop *unhop_ret, *unhop_base, *unhop_chain; + struct nl_writer nw; + struct user_nhop key = { .un_idx = uidx }; + + UN_WLOCK(ctl); + + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base); + + if (unhop_base != NULL) { + CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret); + IF_DEBUG_LEVEL(LOG_DEBUG2) { + char nhbuf[NHOP_PRINT_BUFSIZE]; + nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf)); + FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop, + "removed base nhop %u: %s", uidx, nhbuf); + } + /* Unlink all child nexhops as well, keeping the chain intact */ + unhop_chain = unhop_base->un_nextchild; + 
while (unhop_chain != NULL) { + CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain, + unhop_ret); + MPASS(unhop_chain == unhop_ret); + IF_DEBUG_LEVEL(LOG_DEBUG3) { + char nhbuf[NHOP_PRINT_BUFSIZE]; + nhop_print_buf_any(unhop_chain->un_nhop, + nhbuf, sizeof(nhbuf)); + FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop, + "removed child nhop %u: %s", uidx, nhbuf); + } + unhop_chain = unhop_chain->un_nextchild; + } + } + + UN_WUNLOCK(ctl); + + if (unhop_base == NULL) { + NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx); + return (ENOENT); + } + + /* Report nexthop deletion */ + struct netlink_walkargs wa = { + .hdr.nlmsg_pid = hdr->nlmsg_pid, + .hdr.nlmsg_seq = hdr->nlmsg_seq, + .hdr.nlmsg_flags = hdr->nlmsg_flags, + .hdr.nlmsg_type = NL_RTM_DELNEXTHOP, + }; + + if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP, + 0, false)) { + NL_LOG(LOG_DEBUG, "error allocating message writer"); + return (ENOMEM); + } + + dump_unhop(unhop_base, &wa.hdr, &nw); + nlmsg_flush(&nw); + + while (unhop_base != NULL) { + unhop_chain = unhop_base->un_nextchild; + NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx); + unhop_base = unhop_chain; + } + + return (0); +} + +static void +consider_resize(struct unhop_ctl *ctl, uint32_t new_size) +{ + void *new_ptr = NULL; + size_t alloc_size; + + if (new_size == 0) + return; + + if (new_size != 0) { + alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size); + new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); + if (new_ptr == NULL) + return; + } + + NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size); + UN_WLOCK(ctl); + if (new_ptr != NULL) { + CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size); + } + UN_WUNLOCK(ctl); + + + if (new_ptr != NULL) + free(new_ptr, M_NETLINK); +} + +static bool __noinline +vnet_init_unhops(void) +{ + uint32_t num_buckets = 16; + size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); + + struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK, 
+ M_NOWAIT | M_ZERO); + if (ctl == NULL) + return (false); + + void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); + if (ptr == NULL) { + free(ctl, M_NETLINK); + return (false); + } + CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets); + UN_LOCK_INIT(ctl); + + if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) { + free(ptr, M_NETLINK); + free(ctl, M_NETLINK); + } + + if (atomic_load_ptr(&V_un_ctl) == NULL) + return (false); + + NL_LOG(LOG_NOTICE, "UNHOPS init done"); + + return (true); +} + +static void +vnet_destroy_unhops(const void *unused __unused) +{ + struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); + struct user_nhop *unhop, *tmp; + + if (ctl == NULL) + return; + V_un_ctl = NULL; + + /* Wait till all unhop users finish their reads */ + NET_EPOCH_WAIT(); + + UN_WLOCK(ctl); + CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) { + destroy_unhop(unhop); + } CHT_SLIST_FOREACH_SAFE_END; + UN_WUNLOCK(ctl); + + free(ctl->un_head.ptr, M_NETLINK); + free(ctl, M_NETLINK); +} +VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY, + vnet_destroy_unhops, NULL); + +static int +nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) +{ + int error = 0; + + /* Verify attribute correctness */ + struct nexthop_grp *grp = NLA_DATA(nla); + int data_len = NLA_DATA_LEN(nla); + + int count = data_len / sizeof(*grp); + if (count == 0 || (count * sizeof(*grp) != data_len)) { + NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len); + return (EINVAL); + } + + *((struct nlattr **)target) = nla; + return (error); +} + +static void +set_scope6(struct sockaddr *sa, if_t ifp) +{ +#ifdef INET6 + if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) { + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; + + if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) + in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp)); + } +#endif +} + +struct nl_parsed_nhop { + uint32_t nha_id; + uint8_t 
nha_blackhole; + uint8_t nha_groups; + uint8_t nhaf_knhops; + uint8_t nhaf_family; + struct ifnet *nha_oif; + struct sockaddr *nha_gw; + struct nlattr *nha_group; + uint8_t nh_family; + uint8_t nh_protocol; + uint32_t nhaf_table; + uint32_t nhaf_kid; + uint32_t nhaf_aif; +}; + +#define _IN(_field) offsetof(struct nhmsg, _field) +#define _OUT(_field) offsetof(struct nl_parsed_nhop, _field) +static struct nlattr_parser nla_p_nh_fbsd[] = { + { .type = NHAF_KNHOPS, .off = _OUT(nhaf_knhops), .cb = nlattr_get_flag }, + { .type = NHAF_TABLE, .off = _OUT(nhaf_table), .cb = nlattr_get_uint32 }, + { .type = NHAF_FAMILY, .off = _OUT(nhaf_family), .cb = nlattr_get_uint8 }, + { .type = NHAF_KID, .off = _OUT(nhaf_kid), .cb = nlattr_get_uint32 }, + { .type = NHAF_AIF, .off = _OUT(nhaf_aif), .cb = nlattr_get_uint32 }, +}; +NL_DECLARE_ATTR_PARSER(nh_fbsd_parser, nla_p_nh_fbsd); + +static const struct nlfield_parser nlf_p_nh[] = { + { .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 }, + { .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 }, +}; + +static const struct nlattr_parser nla_p_nh[] = { + { .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 }, + { .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg }, + { .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag }, + { .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp }, + { .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip }, + { .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag }, + { .type = NHA_FREEBSD, .arg = &nh_fbsd_parser, .cb = nlattr_get_nested }, +}; +#undef _IN +#undef _OUT + +static bool +post_p_nh(void *_attrs, struct nl_pstate *npt) +{ + struct nl_parsed_nhop *attrs = (struct nl_parsed_nhop *)_attrs; + + set_scope6(attrs->nha_gw, attrs->nha_oif); + return (true); +} +NL_DECLARE_PARSER_EXT(nhmsg_parser, struct nhmsg, NULL, nlf_p_nh, nla_p_nh, post_p_nh); + +static bool +eligible_nhg(const 
struct nhop_object *nh) +{ + return (nh->nh_flags & NHF_GATEWAY); +} + +static int +newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop) +{ + struct nexthop_grp *grp = NLA_DATA(attrs->nha_group); + int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp); + struct weightened_nhop *wn; + + wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO); + if (wn == NULL) + return (ENOMEM); + + for (int i = 0; i < count; i++) { + struct user_nhop *unhop; + unhop = nl_find_base_unhop(ctl, grp[i].id); + if (unhop == NULL) { + NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id); + free(wn, M_NETLINK); + return (ESRCH); + } else if (unhop->un_nhop_src == NULL) { + NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported", + grp[i].id); + free(wn, M_NETLINK); + return (ENOTSUP); + } else if (!eligible_nhg(unhop->un_nhop_src)) { + NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible", + grp[i].id); + free(wn, M_NETLINK); + return (ENOTSUP); + } + /* + * TODO: consider more rigid eligibility checks: + * restrict nexthops with the same gateway + */ + wn[i].nh = unhop->un_nhop_src; + wn[i].weight = grp[i].weight; + } + unhop->un_nhgrp_src = wn; + unhop->un_nhgrp_count = count; + return (0); +} + +/* + * Sets nexthop @nh gateway specified by @gw. + * If gateway is IPv6 link-local, alters @gw to include scopeid equal to + * @ifp ifindex. + * Returns 0 on success or errno. 
+ */ +int +nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, if_t ifp, + struct nl_pstate *npt) +{ +#ifdef INET6 + if (gw->sa_family == AF_INET6) { + struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)gw; + if (IN6_IS_ADDR_LINKLOCAL(&gw6->sin6_addr)) { + if (ifp == NULL) { + NLMSG_REPORT_ERR_MSG(npt, "interface not set"); + return (EINVAL); + } + in6_set_unicast_scopeid(&gw6->sin6_addr, if_getindex(ifp)); + } + } +#endif + nhop_set_gw(nh, gw, true); + return (0); +} + +static int +newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt) +{ + struct ifaddr *ifa = NULL; + struct nhop_object *nh; + int error; + + if (!attrs->nha_blackhole) { + if (attrs->nha_gw == NULL) { + NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY"); + return (EINVAL); + } + if (attrs->nha_oif == NULL) { + NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF"); + return (EINVAL); + } + if (ifa == NULL) + ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif); + if (ifa == NULL) { + NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP"); + return (EINVAL); + } + } + + int family = attrs->nha_gw != NULL ? 
attrs->nha_gw->sa_family : attrs->nh_family; + + nh = nhop_alloc(RT_DEFAULT_FIB, family); + if (nh == NULL) { + NL_LOG(LOG_DEBUG, "Unable to allocate nexthop"); + return (ENOMEM); + } + nhop_set_uidx(nh, attrs->nha_id); + nhop_set_origin(nh, attrs->nh_protocol); + + if (attrs->nha_blackhole) + nhop_set_blackhole(nh, NHF_BLACKHOLE); + else { + error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt); + if (error != 0) { + nhop_free(nh); + return (error); + } + nhop_set_transmit_ifp(nh, attrs->nha_oif); + nhop_set_src(nh, ifa); + } + + error = nhop_get_unlinked(nh); + if (error != 0) { + NL_LOG(LOG_DEBUG, "unable to finalize nexthop"); + return (error); + } + + IF_DEBUG_LEVEL(LOG_DEBUG2) { + char nhbuf[NHOP_PRINT_BUFSIZE]; + nhop_print_buf(nh, nhbuf, sizeof(nhbuf)); + NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf); + } + + unhop->un_nhop_src = nh; + return (0); +} + +static int +rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct nl_pstate *npt) +{ + struct nl_writer nw; + struct user_nhop *unhop; + int error; + + if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops())) + return (ENOMEM); + struct unhop_ctl *ctl = V_un_ctl; + + struct nl_parsed_nhop attrs = {}; + error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); + if (error != 0) + return (error); + + /* + * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class + * citizen. + */ + if (attrs.nha_id == 0) { + attrs.nha_id = find_spare_uidx(ctl); + if (attrs.nha_id == 0) { + NL_LOG(LOG_DEBUG, "Unable to get spare uidx"); + return (ENOSPC); + } + } + + NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? 
if_getindex(attrs.nha_oif) : 0); + + unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); + if (unhop == NULL) { + NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop"); + return (ENOMEM); + } + unhop->un_idx = attrs.nha_id; + unhop->un_protocol = attrs.nh_protocol; + + if (attrs.nha_group) + error = newnhg(ctl, &attrs, unhop); + else + error = newnhop(&attrs, unhop, npt); + + if (error != 0) { + free(unhop, M_NETLINK); + return (error); + } + + UN_WLOCK(ctl); + /* Check if uidx already exists */ + struct user_nhop *tmp = NULL; + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp); + if (tmp != NULL) { + UN_WUNLOCK(ctl); + NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id); + destroy_unhop(unhop); + return (EEXIST); + } + CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop); + uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head); + UN_WUNLOCK(ctl); + + /* Report addition of the next nexhop */ + struct netlink_walkargs wa = { + .hdr.nlmsg_pid = hdr->nlmsg_pid, + .hdr.nlmsg_seq = hdr->nlmsg_seq, + .hdr.nlmsg_flags = hdr->nlmsg_flags, + .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, + }; + + if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP, + 0, false)) { + NL_LOG(LOG_DEBUG, "error allocating message writer"); + return (ENOMEM); + } + + dump_unhop(unhop, &wa.hdr, &nw); + nlmsg_flush(&nw); + + consider_resize(ctl, num_buckets_new); + + return (0); +} + +static int +rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct nl_pstate *npt) +{ + struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); + int error; + + if (__predict_false(ctl == NULL)) + return (ESRCH); + + struct nl_parsed_nhop attrs = {}; + error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); + if (error != 0) + return (error); + + if (attrs.nha_id == 0) { + NL_LOG(LOG_DEBUG, "NHA_ID not set"); + return (EINVAL); + } + + error = delete_unhop(ctl, hdr, attrs.nha_id); + + return (error); +} + +static bool +match_unhop(const struct 
nl_parsed_nhop *attrs, struct user_nhop *unhop) +{ + if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id) + return (false); + if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL) + return (false); + if (attrs->nha_oif != NULL && + (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif)) + return (false); + + return (true); +} + +static int +rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct nl_pstate *npt) +{ + struct user_nhop *unhop; + UN_TRACKER; + int error; + + struct nl_parsed_nhop attrs = {}; + error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); + if (error != 0) + return (error); + + struct netlink_walkargs wa = { + .nw = npt->nw, + .hdr.nlmsg_pid = hdr->nlmsg_pid, + .hdr.nlmsg_seq = hdr->nlmsg_seq, + .hdr.nlmsg_flags = hdr->nlmsg_flags, + .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, + }; + + if (attrs.nha_id != 0) { + struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); + struct user_nhop key = { .un_idx = attrs.nha_id }; + + if (__predict_false(ctl == NULL)) + return (ESRCH); + + NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id); + UN_RLOCK(ctl); + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); + UN_RUNLOCK(ctl); + + if (unhop == NULL) + return (ESRCH); + dump_unhop(unhop, &wa.hdr, wa.nw); + return (0); + } else if (attrs.nhaf_kid != 0) { + struct nhop_iter iter = { + .fibnum = attrs.nhaf_table, + .family = attrs.nhaf_family, + }; + int error = ESRCH; + + NL_LOG(LOG_DEBUG2, "START table %u family %d", attrs.nhaf_table, attrs.nhaf_family); + for (struct nhop_object *nh = nhops_iter_start(&iter); nh; + nh = nhops_iter_next(&iter)) { + NL_LOG(LOG_DEBUG3, "get %u", nhop_get_idx(nh)); + if (nhop_get_idx(nh) == attrs.nhaf_kid) { + dump_nhop(nh, 0, &wa.hdr, wa.nw); + error = 0; + break; + } + } + nhops_iter_stop(&iter); + return (error); + } else if (attrs.nhaf_knhops) { + struct nhop_iter iter = { + .fibnum = attrs.nhaf_table, + .family = attrs.nhaf_family, + }; + + NL_LOG(LOG_DEBUG2, "DUMP table 
%u family %d", attrs.nhaf_table, attrs.nhaf_family); + wa.hdr.nlmsg_flags |= NLM_F_MULTI; + for (struct nhop_object *nh = nhops_iter_start(&iter); nh; + nh = nhops_iter_next(&iter)) { + dump_nhop(nh, 0, &wa.hdr, wa.nw); + } + nhops_iter_stop(&iter); + } else { + struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); + + if (__predict_false(ctl == NULL)) + return (ESRCH); + + NL_LOG(LOG_DEBUG2, "DUMP unhops"); + UN_RLOCK(ctl); + wa.hdr.nlmsg_flags |= NLM_F_MULTI; + CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) { + if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop)) + dump_unhop(unhop, &wa.hdr, wa.nw); + } CHT_SLIST_FOREACH_END; + UN_RUNLOCK(ctl); + } + + if (wa.error == 0) { + if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) + return (ENOMEM); + } + return (0); +} + +static const struct rtnl_cmd_handler cmd_handlers[] = { + { + .cmd = NL_RTM_NEWNEXTHOP, + .name = "RTM_NEWNEXTHOP", + .cb = &rtnl_handle_newnhop, + .priv = PRIV_NET_ROUTE, + }, + { + .cmd = NL_RTM_DELNEXTHOP, + .name = "RTM_DELNEXTHOP", + .cb = &rtnl_handle_delnhop, + .priv = PRIV_NET_ROUTE, + }, + { + .cmd = NL_RTM_GETNEXTHOP, + .name = "RTM_GETNEXTHOP", + .cb = &rtnl_handle_getnhop, + } +}; + +static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser, &nh_fbsd_parser }; + +void +rtnl_nexthops_init(void) +{ + NL_VERIFY_PARSERS(all_parsers); + rtnl_register_messages(cmd_handlers, nitems(cmd_handlers)); +} diff --git a/sys/netlink/route/nexthop.h b/sys/netlink/route/nexthop.h new file mode 100644 index 000000000000..81a1c9ac88f8 --- /dev/null +++ b/sys/netlink/route/nexthop.h @@ -0,0 +1,113 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * NEXTHOP-related (RTM_<NEW|DEL|GET>NEXTHOP) message header and attributes. 
+ */ + +#ifndef _NETLINK_ROUTE_NEXTHOP_H_ +#define _NETLINK_ROUTE_NEXTHOP_H_ + +/* Base header for all of the relevant messages */ +struct nhmsg { + unsigned char nh_family; /* transport family */ + unsigned char nh_scope; /* ignored on RX, filled by kernel */ + unsigned char nh_protocol; /* Routing protocol that installed nh */ + unsigned char resvd; + unsigned int nh_flags; /* RTNH_F_* flags from route.h */ +}; + +enum { + NHA_UNSPEC, + NHA_ID, /* u32: nexthop userland index, auto-assigned if 0 */ + NHA_GROUP, /* binary: array of struct nexthop_grp */ + NHA_GROUP_TYPE, /* u16: set to NEXTHOP_GRP_TYPE */ + NHA_BLACKHOLE, /* flag: nexthop used to blackhole packets */ + NHA_OIF, /* u32: transmit ifindex */ + NHA_GATEWAY, /* network: IPv4/IPv6 gateway addr */ + NHA_ENCAP_TYPE, /* not supported */ + NHA_ENCAP, /* not supported */ + NHA_GROUPS, /* flag: match nexthop groups */ + NHA_MASTER, /* not supported */ + NHA_FDB, /* not supported */ + NHA_RES_GROUP, /* not supported */ + NHA_RES_BUCKET, /* not supported */ + NHA_FREEBSD, /* nested: FreeBSD-specific attributes */ + __NHA_MAX, +}; +#define NHA_MAX (__NHA_MAX - 1) + +enum { + NHAF_UNSPEC, + NHAF_KNHOPS, /* flag: dump kernel nexthops */ + NHAF_KGOUPS, /* flag: dump kernel nexthop groups */ + NHAF_TABLE, /* u32: rtable id */ + NHAF_FAMILY, /* u32: upper family */ + NHAF_KID, /* u32: kernel nexthop index */ + NHAF_AIF, /* u32: source interface address */ +}; + +/* + * Attributes that can be used as filters: + * NHA_ID (nexthop or group), NHA_OIF, NHA_GROUPS + */ + +/* + * NHA_GROUP: array of the following structures. + * If attribute is set, the only other valid attributes are + * NHA_ID and NHA_GROUP_TYPE. 
+ * NHA_RES_GROUP and NHA_RES_BUCKET are not supported yet + */ +struct nexthop_grp { + uint32_t id; /* nexthop userland index */ + uint8_t weight; /* weight of this nexthop */ + uint8_t resvd1; + uint16_t resvd2; +}; + +/* NHA_GROUP_TYPE: u16 */ +enum { + NEXTHOP_GRP_TYPE_MPATH, /* default nexthop group */ + NEXTHOP_GRP_TYPE_RES, /* resilient nexthop group */ + __NEXTHOP_GRP_TYPE_MAX, +}; +#define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1) + + +/* NHA_RES_GROUP */ +enum { + NHA_RES_GROUP_UNSPEC, + NHA_RES_GROUP_PAD = NHA_RES_GROUP_UNSPEC, + NHA_RES_GROUP_BUCKETS, + NHA_RES_GROUP_IDLE_TIMER, + NHA_RES_GROUP_UNBALANCED_TIMER, + NHA_RES_GROUP_UNBALANCED_TIME, + __NHA_RES_GROUP_MAX, +}; +#define NHA_RES_GROUP_MAX (__NHA_RES_GROUP_MAX - 1) + +#endif diff --git a/sys/netlink/route/route.h b/sys/netlink/route/route.h new file mode 100644 index 000000000000..60c3a22718a3 --- /dev/null +++ b/sys/netlink/route/route.h @@ -0,0 +1,368 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Route-related (RTM_<NEW|DEL|GET>ROUTE) message header and attributes. + */ + +#ifndef _NETLINK_ROUTE_ROUTE_H_ +#define _NETLINK_ROUTE_ROUTE_H_ + +/* Base header for all of the relevant messages */ +struct rtmsg { + unsigned char rtm_family; /* address family */ + unsigned char rtm_dst_len; /* Prefix length */ + unsigned char rtm_src_len; /* Source prefix length (not used) */ + unsigned char rtm_tos; /* Type of service (not used) */ + unsigned char rtm_table; /* rtable id */ + unsigned char rtm_protocol; /* Routing protocol id (RTPROT_) */ + unsigned char rtm_scope; /* Route distance (RT_SCOPE_) */ + unsigned char rtm_type; /* Route type (RTN_) */ + unsigned rtm_flags; /* Route flags (RTM_F_) */ +}; + +/* + * RFC 3549, 3.1.1, route type (rtm_type field). 
+ */ +enum { + RTN_UNSPEC, + RTN_UNICAST, /* Unicast route */ + RTN_LOCAL, /* Accept locally (not supported) */ + RTN_BROADCAST, /* Accept locally as broadcast, send as broadcast */ + RTN_ANYCAST, /* Accept locally as broadcast, but send as unicast */ + RTN_MULTICAST, /* Multicast route */ + RTN_BLACKHOLE, /* Drop traffic towards destination */ + RTN_UNREACHABLE, /* Destination is unreachable */ + RTN_PROHIBIT, /* Administratively prohibited */ + RTN_THROW, /* Not in this table (not supported) */ + RTN_NAT, /* Translate this address (not supported) */ + RTN_XRESOLVE, /* Use external resolver (not supported) */ + __RTN_MAX, +}; +#define RTN_MAX (__RTN_MAX - 1) + +/* + * RFC 3549, 3.1.1, protocol (Identifies what/who added the route). + * Values larger than RTPROT_STATIC(4) are not interpreted by the + * kernel, they are just for user information. + */ +#define RTPROT_UNSPEC 0 +#define RTPROT_REDIRECT 1 /* Route installed by ICMP redirect */ +#define RTPROT_KERNEL 2 /* Route installed by kernel */ +#define RTPROT_BOOT 3 /* Route installed during boot */ +#define RTPROT_STATIC 4 /* Route installed by administrator */ + +#define RTPROT_GATED 8 +#define RTPROT_RA 9 +#define RTPROT_MRT 10 /* must stay distinct from RTPROT_REDIRECT (1) */ +#define RTPROT_ZEBRA 11 +#define RTPROT_BIRD 12 +#define RTPROT_DNROUTED 13 +#define RTPROT_XORP 14 +#define RTPROT_NTK 15 +#define RTPROT_DHCP 16 +#define RTPROT_MROUTED 17 +#define RTPROT_KEEPALIVED 18 +#define RTPROT_BABEL 42 +#define RTPROT_OPENR 99 +#define RTPROT_BGP 186 +#define RTPROT_ISIS 187 +#define RTPROT_OSPF 188 +#define RTPROT_RIP 189 +#define RTPROT_EIGRP 192 + +/* + * RFC 3549 3.1.1 Route scope (valid distance to destination). + * + * The values between RT_SCOPE_UNIVERSE(0) and RT_SCOPE_SITE(200) + * are available to the user. + */ +enum rt_scope_t { + RT_SCOPE_UNIVERSE = 0, + /* User defined values */ + RT_SCOPE_SITE = 200, + RT_SCOPE_LINK = 253, + RT_SCOPE_HOST = 254, + RT_SCOPE_NOWHERE = 255 +}; + +/* + * RFC 3549 3.1.1 Route flags (rtm_flags). 
+ * Is a composition of RTNH_F flags (0x1..0x40 range), RTM_F flags (below) + * and per-protocol (IPv4/IPv6) flags. + */ +#define RTM_F_NOTIFY 0x00000100 /* not supported */ +#define RTM_F_CLONED 0x00000200 /* not supported */ +#define RTM_F_EQUALIZE 0x00000400 /* not supported */ +#define RTM_F_PREFIX 0x00000800 /* not supported */ +#define RTM_F_LOOKUP_TABLE 0x00001000 /* not supported */ +#define RTM_F_FIB_MATCH 0x00002000 /* not supported */ +#define RTM_F_OFFLOAD 0x00004000 /* not supported */ +#define RTM_F_TRAP 0x00008000 /* not supported */ +#define RTM_F_OFFLOAD_FAILED 0x20000000 /* not supported */ + +/* Compatibility handling helpers */ +#ifndef _KERNEL +#define NL_RTM_HDRLEN ((int)sizeof(struct rtmsg)) +#define RTM_RTA(_rtm) ((struct rtattr *)((char *)(_rtm) + NL_RTM_HDRLEN)) +#define RTM_PAYLOAD(_hdr) NLMSG_PAYLOAD((_hdr), NL_RTM_HDRLEN) +#endif + +/* + * Routing table identifiers. + * FreeBSD route table numbering starts from 0, where 0 is a valid default + * routing table. Indicating "all tables" via netlink can be done by not + * including RTA_TABLE attribute and keeping rtm_table=0 (compatibility) or + * setting RTA_TABLE value to RT_TABLE_UNSPEC. 
+ */ +#define RT_TABLE_MAIN 0 /* RT_DEFAULT_FIB */ +#define RT_TABLE_UNSPEC 0xFFFFFFFF /* RT_ALL_FIBS */ + +enum rtattr_type_t { + NL_RTA_UNSPEC, + NL_RTA_DST = 1, /* binary, IPv4/IPv6 destination */ + NL_RTA_SRC = 2, /* binary, preferred source address */ + NL_RTA_IIF = 3, /* not supported */ + NL_RTA_OIF = 4, /* u32, transmit ifindex */ + NL_RTA_GATEWAY = 5, /* binary: IPv4/IPv6 gateway */ + NL_RTA_PRIORITY = 6, /* not supported */ + NL_RTA_PREFSRC = 7, /* not supported */ + NL_RTA_METRICS = 8, /* nested, list of NL_RTAX* attrs */ + NL_RTA_MULTIPATH = 9, /* binary, array of struct rtnexthop */ + NL_RTA_PROTOINFO = 10, /* not supported / deprecated */ + NL_RTA_KNH_ID = 10, /* u32, FreeBSD specific, kernel nexthop index */ + NL_RTA_FLOW = 11, /* not supported */ + NL_RTA_CACHEINFO = 12, /* not supported */ + NL_RTA_SESSION = 13, /* not supported / deprecated */ + NL_RTA_WEIGHT = 13, /* u32, FreeBSD specific, path weight */ + NL_RTA_MP_ALGO = 14, /* not supported / deprecated */ + NL_RTA_RTFLAGS = 14, /* u32, FreeBSD specific, path flags (RTF_)*/ + NL_RTA_TABLE = 15, /* u32, fibnum */ + NL_RTA_MARK = 16, /* not supported */ + NL_RTA_MFC_STATS = 17, /* not supported */ + NL_RTA_VIA = 18, /* binary, struct rtvia */ + NL_RTA_NEWDST = 19, /* not supported */ + NL_RTA_PREF = 20, /* not supported */ + NL_RTA_ENCAP_TYPE = 21, /* not supported */ + NL_RTA_ENCAP = 22, /* not supported */ + NL_RTA_EXPIRES = 23, /* u32, seconds till expiration */ + NL_RTA_PAD = 24, /* not supported */ + NL_RTA_UID = 25, /* not supported */ + NL_RTA_TTL_PROPAGATE = 26, /* not supported */ + NL_RTA_IP_PROTO = 27, /* not supported */ + NL_RTA_SPORT = 28, /* not supported */ + NL_RTA_DPORT = 29, /* not supported */ + NL_RTA_NH_ID = 30, /* u32, nexthop/nexthop group index */ + __RTA_MAX +}; +#define NL_RTA_MAX (__RTA_MAX - 1) + +/* + * Attributes that can be used as filters: + * + */ + +#ifndef _KERNEL +/* + * RTA_* space has clashes with rtsock namespace. 
+ * Use NL_RTA_ prefix in the kernel and map to + * RTA_ for userland. + */ +#define RTA_UNSPEC NL_RTA_UNSPEC +#define RTA_DST NL_RTA_DST +#define RTA_SRC NL_RTA_SRC +#define RTA_IIF NL_RTA_IIF +#define RTA_OIF NL_RTA_OIF +#define RTA_GATEWAY NL_RTA_GATEWAY +#define RTA_PRIORITY NL_RTA_PRIORITY +#define RTA_PREFSRC NL_RTA_PREFSRC +#define RTA_METRICS NL_RTA_METRICS +#define RTA_MULTIPATH NL_RTA_MULTIPATH +#define RTA_PROTOINFO NL_RTA_PROTOINFO +#define RTA_KNH_ID NL_RTA_KNH_ID +#define RTA_FLOW NL_RTA_FLOW +#define RTA_CACHEINFO NL_RTA_CACHEINFO +#define RTA_SESSION NL_RTA_SESSION +#define RTA_MP_ALGO NL_RTA_MP_ALGO +#define RTA_TABLE NL_RTA_TABLE +#define RTA_MARK NL_RTA_MARK +#define RTA_MFC_STATS NL_RTA_MFC_STATS +#define RTA_VIA NL_RTA_VIA +#define RTA_NEWDST NL_RTA_NEWDST +#define RTA_PREF NL_RTA_PREF +#define RTA_ENCAP_TYPE NL_RTA_ENCAP_TYPE +#define RTA_ENCAP NL_RTA_ENCAP +#define RTA_EXPIRES NL_RTA_EXPIRES +#define RTA_PAD NL_RTA_PAD +#define RTA_UID NL_RTA_UID +#define RTA_TTL_PROPAGATE NL_RTA_TTL_PROPAGATE +#define RTA_IP_PROTO NL_RTA_IP_PROTO +#define RTA_SPORT NL_RTA_SPORT +#define RTA_DPORT NL_RTA_DPORT +#define RTA_NH_ID NL_RTA_NH_ID +#define RTA_MAX NL_RTA_MAX +#endif + +/* route attribute header */ +struct rtattr { + unsigned short rta_len; + unsigned short rta_type; +}; + +#define NL_RTA_ALIGN_SIZE NL_ITEM_ALIGN_SIZE +#define NL_RTA_ALIGN NL_ITEM_ALIGN +#define NL_RTA_HDRLEN ((int)sizeof(struct rtattr)) +#define NL_RTA_DATA_LEN(_rta) ((int)((_rta)->rta_len - NL_RTA_HDRLEN)) +#define NL_RTA_DATA(_rta) NL_ITEM_DATA(_rta, NL_RTA_HDRLEN) +#define NL_RTA_DATA_CONST(_rta) NL_ITEM_DATA_CONST(_rta, NL_RTA_HDRLEN) + +/* Compatibility attribute handling helpers */ +#ifndef _KERNEL +#define RTA_ALIGNTO NL_RTA_ALIGN_SIZE +#define RTA_ALIGN(_len) NL_RTA_ALIGN(_len) +#define _RTA_LEN(_rta) ((int)(_rta)->rta_len) +#define _RTA_ALIGNED_LEN(_rta) RTA_ALIGN(_RTA_LEN(_rta)) +#define RTA_OK(_rta, _len) NL_ITEM_OK(_rta, _len, NL_RTA_HDRLEN, _RTA_LEN) +#define 
RTA_NEXT(_rta, _len) NL_ITEM_ITER(_rta, _len, _RTA_ALIGNED_LEN) +#define RTA_LENGTH(_len) (NL_RTA_HDRLEN + (_len)) +#define RTA_SPACE(_len) RTA_ALIGN(RTA_LENGTH(_len)) +#define RTA_DATA(_rta) NL_RTA_DATA(_rta) +#define RTA_PAYLOAD(_rta) ((int)(_RTA_LEN(_rta) - NL_RTA_HDRLEN)) +#endif + +/* RTA attribute headers */ + +/* RTA_VIA */ +struct rtvia { + sa_family_t rtvia_family; + uint8_t rtvia_addr[0]; +}; + +/* + * RTA_METRICS is a nested attribute, consisting of a list of + * TLVs with types defined below. + */ + enum { + NL_RTAX_UNSPEC, + NL_RTAX_LOCK = 1, /* not supported */ + NL_RTAX_MTU = 2, /* desired path MTU */ + NL_RTAX_WINDOW = 3, /* not supported */ + NL_RTAX_RTT = 4, /* not supported */ + NL_RTAX_RTTVAR = 5, /* not supported */ + NL_RTAX_SSTHRESH = 6, /* not supported */ + NL_RTAX_CWND = 7, /* not supported */ + NL_RTAX_ADVMSS = 8, /* not supported */ + NL_RTAX_REORDERING = 9, /* not supported */ + NL_RTAX_HOPLIMIT = 10, /* not supported */ + NL_RTAX_INITCWND = 11, /* not supported */ + NL_RTAX_FEATURES = 12, /* not supported */ + NL_RTAX_RTO_MIN = 13, /* not supported */ + NL_RTAX_INITRWND = 14, /* not supported */ + NL_RTAX_QUICKACK = 15, /* not supported */ + NL_RTAX_CC_ALGO = 16, /* not supported */ + NL_RTAX_FASTOPEN_NO_COOKIE = 17, /* not supported */ + __NL_RTAX_MAX +}; +#define NL_RTAX_MAX (__NL_RTAX_MAX - 1) + +#define RTAX_FEATURE_ECN (1 << 0) +#define RTAX_FEATURE_SACK (1 << 1) +#define RTAX_FEATURE_TIMESTAMP (1 << 2) +#define RTAX_FEATURE_ALLFRAG (1 << 3) + +#define RTAX_FEATURE_MASK \ + (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG) + +#ifndef _KERNEL + +/* + * RTAX_* space clashes with rtsock namespace. + * Use NL_RTAX_ prefix in the kernel and map to + * RTAX_ for userland. 
+ */ +#define RTAX_UNSPEC NL_RTAX_UNSPEC +#define RTAX_LOCK NL_RTAX_LOCK +#define RTAX_MTU NL_RTAX_MTU +#define RTAX_WINDOW NL_RTAX_WINDOW +#define RTAX_RTT NL_RTAX_RTT +#define RTAX_RTTVAR NL_RTAX_RTTVAR +#define RTAX_SSTHRESH NL_RTAX_SSTHRESH +#define RTAX_CWND NL_RTAX_CWND +#define RTAX_ADVMSS NL_RTAX_ADVMSS +#define RTAX_REORDERING NL_RTAX_REORDERING +#define RTAX_HOPLIMIT NL_RTAX_HOPLIMIT +#define RTAX_INITCWND NL_RTAX_INITCWND +#define RTAX_FEATURES NL_RTAX_FEATURES +#define RTAX_RTO_MIN NL_RTAX_RTO_MIN +#define RTAX_INITRWND NL_RTAX_INITRWND +#define RTAX_QUICKACK NL_RTAX_QUICKACK +#define RTAX_CC_ALGO NL_RTAX_CC_ALGO +#define RTAX_FASTOPEN_NO_COOKIE NL_RTAX_FASTOPEN_NO_COOKIE +#endif + +/* + * RTA_MULTIPATH consists of an array of rtnexthop structures. + * Each rtnexthop structure contains RTA_GATEWAY or RTA_VIA + * attribute following the header. + */ +struct rtnexthop { + unsigned short rtnh_len; + unsigned char rtnh_flags; + unsigned char rtnh_hops; /* nexthop weight */ + int rtnh_ifindex; +}; + +/* rtnh_flags */ +#define RTNH_F_DEAD 0x01 /* not supported */ +#define RTNH_F_PERVASIVE 0x02 /* not supported */ +#define RTNH_F_ONLINK 0x04 /* not supported */ +#define RTNH_F_OFFLOAD 0x08 /* not supported */ +#define RTNH_F_LINKDOWN 0x10 /* not supported */ +#define RTNH_F_UNRESOLVED 0x20 /* not supported */ +#define RTNH_F_TRAP 0x40 /* not supported */ + +#define RTNH_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN | \ + RTNH_F_OFFLOAD | RTNH_F_TRAP) + +/* Macros to handle nexthops */ +#define RTNH_ALIGNTO NL_ITEM_ALIGN_SIZE +#define RTNH_ALIGN(_len) NL_ITEM_ALIGN(_len) +#define RTNH_HDRLEN ((int)sizeof(struct rtnexthop)) +#define _RTNH_LEN(_nh) ((int)(_nh)->rtnh_len) +#define _RTNH_ALIGNED_LEN(_nh) RTNH_ALIGN(_RTNH_LEN(_nh)) +#define RTNH_OK(_nh, _len) NL_ITEM_OK(_nh, _len, RTNH_HDRLEN, _RTNH_LEN) +#define RTNH_NEXT(_nh) ((struct rtnexthop *)((char *)(_nh) + _RTNH_ALIGNED_LEN(_nh))) +#define RTNH_LENGTH(_len) (RTNH_HDRLEN + (_len)) +#define RTNH_SPACE(_len) 
RTNH_ALIGN(RTNH_LENGTH(_len)) +#define RTNH_DATA(_nh) ((struct rtattr *)NL_ITEM_DATA(_nh, RTNH_HDRLEN)) + +struct rtgenmsg { + unsigned char rtgen_family; +}; + +#endif diff --git a/sys/netlink/route/route_var.h b/sys/netlink/route/route_var.h new file mode 100644 index 000000000000..b84b34461e35 --- /dev/null +++ b/sys/netlink/route/route_var.h @@ -0,0 +1,140 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * This file contains definitions shared among NETLINK_ROUTE family + */ +#ifndef _NETLINK_ROUTE_ROUTE_VAR_H_ +#define _NETLINK_ROUTE_ROUTE_VAR_H_ + +#include <sys/priv.h> /* values for priv_check */ + +struct nlmsghdr; +struct nlpcb; +struct nl_pstate; + +typedef int rtnl_msg_cb_f(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct nl_pstate *npt); + +struct rtnl_cmd_handler { + int cmd; + const char *name; + rtnl_msg_cb_f *cb; + int priv; + int flags; +}; + +#define RTNL_F_NOEPOCH 0x01 /* Do not enter epoch when handling command */ +#define RTNL_F_ALLOW_NONVNET_JAIL 0x02 /* Allow command execution inside non-VNET jail */ + +bool rtnl_register_messages(const struct rtnl_cmd_handler *handlers, int count); + +/* route.c */ +struct rib_cmd_info; +void rtnl_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc); +void rtnl_routes_init(void); + +/* neigh.c */ +void rtnl_neighs_init(void); +void rtnl_neighs_destroy(void); + +/* iface.c */ +struct nl_parsed_link { + char *ifla_group; + char *ifla_ifname; + char *ifla_cloner; + char *ifla_ifalias; + struct nlattr *ifla_idata; + unsigned short ifi_type; + int ifi_index; + uint32_t ifla_link; + uint32_t ifla_mtu; + uint32_t ifi_flags; + uint32_t ifi_change; +}; + +#if defined(NETLINK) || defined(NETLINK_MODULE) +/* Provide optimized calls to the functions inside the same linking unit */ + +int _nl_modify_ifp_generic(struct ifnet *ifp, struct nl_parsed_link *lattrs, + const struct nlattr_bmask *bm, struct nl_pstate *npt); +void _nl_store_ifp_cookie(struct nl_pstate *npt, struct ifnet *ifp); + +static inline int +nl_modify_ifp_generic(struct ifnet *ifp, struct nl_parsed_link *lattrs, + const struct nlattr_bmask *bm, struct nl_pstate *npt) +{ + return (_nl_modify_ifp_generic(ifp, lattrs, bm, npt)); +} + +static inline void +nl_store_ifp_cookie(struct nl_pstate *npt, struct ifnet *ifp) +{ + _nl_store_ifp_cookie(npt, ifp); +} +#else +/* Provide access to the functions via netlink_glue.c */ +int 
nl_modify_ifp_generic(struct ifnet *ifp, struct nl_parsed_link *lattrs, + const struct nlattr_bmask *bm, struct nl_pstate *npt); +void nl_store_ifp_cookie(struct nl_pstate *npt, struct ifnet *ifp); +#endif /* defined(NETLINK) || defined(NETLINK_MODULE) */ + + +typedef int rtnl_iface_create_f(struct nl_parsed_link *lattrs, + const struct nlattr_bmask *bm, struct nlpcb *nlp, struct nl_pstate *npt); +typedef int rtnl_iface_modify_f(struct ifnet *ifp, struct nl_parsed_link *lattrs, + const struct nlattr_bmask *bm, struct nlpcb *nlp, struct nl_pstate *npt); +typedef int rtnl_iface_dump_f(struct ifnet *ifp, struct nl_writer *nw); + +struct nl_cloner { + const char *name; + rtnl_iface_create_f *create_f; + rtnl_iface_modify_f *modify_f; + rtnl_iface_dump_f *dump_f; + SLIST_ENTRY(nl_cloner) next; +}; + +extern struct nl_cloner generic_cloner; + +void rtnl_ifaces_init(void); +void rtnl_ifaces_destroy(void); +void rtnl_iface_add_cloner(struct nl_cloner *cloner); +void rtnl_iface_del_cloner(struct nl_cloner *cloner); +void rtnl_handle_ifnet_event(struct ifnet *ifp, int if_change_mask); + +/* iface_drivers.c */ +void rtnl_iface_drivers_register(void); + +/* nexthop.c */ +void rtnl_nexthops_init(void); +struct nhop_object *nl_find_nhop(uint32_t fibnum, int family, + uint32_t uidx, int nh_flags, int *perror); +int nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, + struct ifnet *ifp, struct nl_pstate *npt); + + +#endif diff --git a/sys/netlink/route/rt.c b/sys/netlink/route/rt.c new file mode 100644 index 000000000000..dcd19b43105c --- /dev/null +++ b/sys/netlink/route/rt.c @@ -0,0 +1,1139 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_route.h" +#include <sys/types.h> +#include <sys/malloc.h> +#include <sys/rmlock.h> +#include <sys/socket.h> + +#include <net/if.h> +#include <net/route.h> +#include <net/route/nhop.h> +#include <net/route/route_ctl.h> +#include <net/route/route_var.h> +#include <netinet6/scope6_var.h> +#include <netlink/netlink.h> +#include <netlink/netlink_ctl.h> +#include <netlink/netlink_route.h> +#include <netlink/route/route_var.h> + +#define DEBUG_MOD_NAME nl_route +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include <netlink/netlink_debug.h> +_DECLARE_DEBUG(LOG_INFO); + +static unsigned char +get_rtm_type(const struct nhop_object *nh) +{ + int nh_flags = nh->nh_flags; + + /* Use the fact that nhg runtime flags are only NHF_MULTIPATH */ + if (nh_flags & NHF_BLACKHOLE) + return (RTN_BLACKHOLE); + else if (nh_flags & NHF_REJECT) + return (RTN_PROHIBIT); + return (RTN_UNICAST); +} + +static uint8_t +nl_get_rtm_protocol(const struct nhop_object *nh) +{ +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh)) { + const struct nhgrp_object *nhg = (const struct nhgrp_object *)nh; + uint8_t origin = nhgrp_get_origin(nhg); + if (origin != RTPROT_UNSPEC) + return (origin); + nh = nhg->nhops[0]; + } +#endif + uint8_t origin = nhop_get_origin(nh); + if (origin != RTPROT_UNSPEC) + return (origin); + /* TODO: remove guesswork once all kernel users fill in origin */ + int rt_flags = nhop_get_rtflags(nh); + if (rt_flags & RTF_PROTO1) + return (RTPROT_ZEBRA); + if (rt_flags & RTF_STATIC) + return (RTPROT_STATIC); + return (RTPROT_KERNEL); +} + +static int +get_rtmsg_type_from_rtsock(int cmd) +{ + switch (cmd) { + case RTM_ADD: + case RTM_CHANGE: + case RTM_GET: + return NL_RTM_NEWROUTE; + case RTM_DELETE: + return NL_RTM_DELROUTE; + } + + return (0); +} + +/* + * fibnum heuristics + * + * if (dump && rtm_table == 0 && !rta_table) RT_ALL_FIBS + * msg rtm_table RTA_TABLE result + * RTM_GETROUTE/dump 0 - RT_ALL_FIBS + * 
RTM_GETROUTE/dump 1 - 1 + * RTM_GETROUTE/get 0 - 0 + * + */ + +static struct nhop_object * +rc_get_nhop(const struct rib_cmd_info *rc) +{ + return ((rc->rc_cmd == RTM_DELETE) ? rc->rc_nh_old : rc->rc_nh_new); +} + +static void +dump_rc_nhop_gw(struct nl_writer *nw, const struct nhop_object *nh) +{ +#ifdef INET6 + int upper_family; +#endif + + switch (nhop_get_neigh_family(nh)) { + case AF_LINK: + /* onlink prefix, skip */ + break; + case AF_INET: + nlattr_add(nw, NL_RTA_GATEWAY, 4, &nh->gw4_sa.sin_addr); + break; +#ifdef INET6 + case AF_INET6: + upper_family = nhop_get_upper_family(nh); + if (upper_family == AF_INET6) { + struct in6_addr gw6 = nh->gw6_sa.sin6_addr; + in6_clearscope(&gw6); + + nlattr_add(nw, NL_RTA_GATEWAY, 16, &gw6); + } else if (upper_family == AF_INET) { + /* IPv4 over IPv6 */ + struct in6_addr gw6 = nh->gw6_sa.sin6_addr; + in6_clearscope(&gw6); + + char buf[20]; + struct rtvia *via = (struct rtvia *)&buf[0]; + via->rtvia_family = AF_INET6; + memcpy(via->rtvia_addr, &gw6, 16); + nlattr_add(nw, NL_RTA_VIA, 17, via); + } + break; +#endif + } +} + +static void +dump_rc_nhop_mtu(struct nl_writer *nw, const struct nhop_object *nh) +{ + int nla_len = sizeof(struct nlattr) * 2 + sizeof(uint32_t); + struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); + + if (nla == NULL) + return; + nla->nla_type = NL_RTA_METRICS; + nla->nla_len = nla_len; + nla++; + nla->nla_type = NL_RTAX_MTU; + nla->nla_len = sizeof(struct nlattr) + sizeof(uint32_t); + *((uint32_t *)(nla + 1)) = nh->nh_mtu; +} + +#ifdef ROUTE_MPATH +static void +dump_rc_nhg(struct nl_writer *nw, const struct nhgrp_object *nhg, struct rtmsg *rtm) +{ + uint32_t uidx = nhgrp_get_uidx(nhg); + uint32_t num_nhops; + const struct weightened_nhop *wn = nhgrp_get_nhops(nhg, &num_nhops); + uint32_t base_rtflags = nhop_get_rtflags(wn[0].nh); + + if (uidx != 0) + nlattr_add_u32(nw, NL_RTA_NH_ID, uidx); + nlattr_add_u32(nw, NL_RTA_KNH_ID, nhgrp_get_idx(nhg)); + + nlattr_add_u32(nw, NL_RTA_RTFLAGS, 
base_rtflags); + int off = nlattr_add_nested(nw, NL_RTA_MULTIPATH); + if (off == 0) + return; + + for (int i = 0; i < num_nhops; i++) { + int nh_off = nlattr_save_offset(nw); + struct rtnexthop *rtnh = nlmsg_reserve_object(nw, struct rtnexthop); + if (rtnh == NULL) + return; + rtnh->rtnh_flags = 0; + rtnh->rtnh_ifindex = if_getindex(wn[i].nh->nh_ifp); + rtnh->rtnh_hops = wn[i].weight; + dump_rc_nhop_gw(nw, wn[i].nh); + uint32_t rtflags = nhop_get_rtflags(wn[i].nh); + if (rtflags != base_rtflags) + nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags); + if (rtflags & RTF_FIXEDMTU) + dump_rc_nhop_mtu(nw, wn[i].nh); + rtnh = nlattr_restore_offset(nw, nh_off, struct rtnexthop); + /* + * nlattr_add() allocates 4-byte aligned storage, no need to align + * length here + */ + rtnh->rtnh_len = nlattr_save_offset(nw) - nh_off; + } + nlattr_set_len(nw, off); +} +#endif + +static void +dump_rc_nhop(struct nl_writer *nw, const struct route_nhop_data *rnd, struct rtmsg *rtm) +{ +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(rnd->rnd_nhop)) { + dump_rc_nhg(nw, rnd->rnd_nhgrp, rtm); + return; + } +#endif + const struct nhop_object *nh = rnd->rnd_nhop; + uint32_t rtflags = nhop_get_rtflags(nh); + + /* + * IPv4 over IPv6 + * ('RTA_VIA', {'family': 10, 'addr': 'fe80::20c:29ff:fe67:2dd'}), ('RTA_OIF', 2), + * IPv4 w/ gw + * ('RTA_GATEWAY', '172.16.107.131'), ('RTA_OIF', 2)], + * Direct route: + * ('RTA_OIF', 2) + */ + if (nh->nh_flags & NHF_GATEWAY) + dump_rc_nhop_gw(nw, nh); + + uint32_t uidx = nhop_get_uidx(nh); + if (uidx != 0) + nlattr_add_u32(nw, NL_RTA_NH_ID, uidx); + nlattr_add_u32(nw, NL_RTA_KNH_ID, nhop_get_idx(nh)); + nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags); + + if (rtflags & RTF_FIXEDMTU) + dump_rc_nhop_mtu(nw, nh); + uint32_t nh_expire = nhop_get_expire(nh); + if (nh_expire > 0) + nlattr_add_u32(nw, NL_RTA_EXPIRES, nh_expire - time_uptime); + + /* In any case, fill outgoing interface */ + nlattr_add_u32(nw, NL_RTA_OIF, if_getindex(nh->nh_ifp)); + + if (rnd->rnd_weight != 
RT_DEFAULT_WEIGHT) + nlattr_add_u32(nw, NL_RTA_WEIGHT, rnd->rnd_weight); +} + +/* + * Dumps output from a rib command into an rtmsg + */ + +static int +dump_px(uint32_t fibnum, const struct nlmsghdr *hdr, + const struct rtentry *rt, struct route_nhop_data *rnd, + struct nl_writer *nw) +{ + struct rtmsg *rtm; + int error = 0; + + NET_EPOCH_ASSERT(); + + if (!nlmsg_reply(nw, hdr, sizeof(struct rtmsg))) + goto enomem; + + int family = rt_get_family(rt); + int rtm_off = nlattr_save_offset(nw); + rtm = nlmsg_reserve_object(nw, struct rtmsg); + rtm->rtm_family = family; + rtm->rtm_dst_len = 0; + rtm->rtm_src_len = 0; + rtm->rtm_tos = 0; + if (fibnum < 255) + rtm->rtm_table = (unsigned char)fibnum; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_protocol = nl_get_rtm_protocol(rnd->rnd_nhop); + rtm->rtm_type = get_rtm_type(rnd->rnd_nhop); + + nlattr_add_u32(nw, NL_RTA_TABLE, fibnum); + + int plen = 0; +#if defined(INET) || defined(INET6) + uint32_t scopeid; +#endif + switch (family) { +#ifdef INET + case AF_INET: + { + struct in_addr addr; + rt_get_inet_prefix_plen(rt, &addr, &plen, &scopeid); + nlattr_add(nw, NL_RTA_DST, 4, &addr); + break; + } +#endif +#ifdef INET6 + case AF_INET6: + { + struct in6_addr addr; + rt_get_inet6_prefix_plen(rt, &addr, &plen, &scopeid); + nlattr_add(nw, NL_RTA_DST, 16, &addr); + break; + } +#endif + default: + FIB_LOG(LOG_NOTICE, fibnum, family, "unsupported rt family: %d", family); + error = EAFNOSUPPORT; + goto flush; + } + + rtm = nlattr_restore_offset(nw, rtm_off, struct rtmsg); + if (plen > 0) + rtm->rtm_dst_len = plen; + dump_rc_nhop(nw, rnd, rtm); + + if (nlmsg_end(nw)) + return (0); +enomem: + error = ENOMEM; +flush: + nlmsg_abort(nw); + return (error); +} + +static int +family_to_group(int family) +{ + switch (family) { + case AF_INET: + return (RTNLGRP_IPV4_ROUTE); + case AF_INET6: + return (RTNLGRP_IPV6_ROUTE); + } + return (0); +} + +static void +report_operation(uint32_t fibnum, struct rib_cmd_info *rc, + struct nlpcb *nlp, 
struct nlmsghdr *hdr) +{ + struct nl_writer nw; + uint32_t group_id = family_to_group(rt_get_family(rc->rc_rt)); + + if (nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id, 0, + false)) { + struct route_nhop_data rnd = { + .rnd_nhop = rc_get_nhop(rc), + .rnd_weight = rc->rc_nh_weight, + }; + hdr->nlmsg_flags &= ~(NLM_F_REPLACE | NLM_F_CREATE); + hdr->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_APPEND); + switch (rc->rc_cmd) { + case RTM_ADD: + hdr->nlmsg_type = NL_RTM_NEWROUTE; + hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL; + break; + case RTM_CHANGE: + hdr->nlmsg_type = NL_RTM_NEWROUTE; + hdr->nlmsg_flags |= NLM_F_REPLACE; + break; + case RTM_DELETE: + hdr->nlmsg_type = NL_RTM_DELROUTE; + break; + } + dump_px(fibnum, hdr, rc->rc_rt, &rnd, &nw); + nlmsg_flush(&nw); + } + + rtsock_callback_p->route_f(fibnum, rc); +} + +static void +set_scope6(struct sockaddr *sa, struct ifnet *ifp) +{ +#ifdef INET6 + if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) { + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; + + if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) + in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp)); + } +#endif +} + +struct rta_mpath_nh { + struct sockaddr *gw; + struct ifnet *ifp; + uint8_t rtnh_flags; + uint8_t rtnh_weight; +}; + +#define _IN(_field) offsetof(struct rtnexthop, _field) +#define _OUT(_field) offsetof(struct rta_mpath_nh, _field) +const static struct nlattr_parser nla_p_rtnh[] = { + { .type = NL_RTA_GATEWAY, .off = _OUT(gw), .cb = nlattr_get_ip }, + { .type = NL_RTA_VIA, .off = _OUT(gw), .cb = nlattr_get_ipvia }, +}; +const static struct nlfield_parser nlf_p_rtnh[] = { + { .off_in = _IN(rtnh_flags), .off_out = _OUT(rtnh_flags), .cb = nlf_get_u8 }, + { .off_in = _IN(rtnh_hops), .off_out = _OUT(rtnh_weight), .cb = nlf_get_u8 }, + { .off_in = _IN(rtnh_ifindex), .off_out = _OUT(ifp), .cb = nlf_get_ifpz }, +}; +#undef _IN +#undef _OUT + +static bool +post_p_rtnh(void *_attrs, struct nl_pstate *npt __unused) +{ + struct 
rta_mpath_nh *attrs = (struct rta_mpath_nh *)_attrs; + + set_scope6(attrs->gw, attrs->ifp); + return (true); +} +NL_DECLARE_PARSER_EXT(mpath_parser, struct rtnexthop, NULL, nlf_p_rtnh, nla_p_rtnh, post_p_rtnh); + +struct rta_mpath { + u_int num_nhops; + struct rta_mpath_nh nhops[0]; +}; + +static int +nlattr_get_multipath(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target) +{ + struct rta_mpath *mp; + struct rtnexthop *rtnh; + uint16_t data_len, len; + u_int max_nhops; + int error; + + data_len = nla->nla_len - sizeof(struct nlattr); + max_nhops = data_len / sizeof(struct rtnexthop); + + mp = npt_alloc(npt, (max_nhops + 2) * sizeof(struct rta_mpath_nh)); + mp->num_nhops = 0; + + for (rtnh = (struct rtnexthop *)(nla + 1); data_len > 0; ) { + struct rta_mpath_nh *mpnh; + + if (__predict_false(rtnh->rtnh_len <= sizeof(*rtnh) || + rtnh->rtnh_len > data_len)) { + NLMSG_REPORT_ERR_MSG(npt, "%s: bad length %u", + __func__, rtnh->rtnh_len); + return (EINVAL); + } + mpnh = &mp->nhops[mp->num_nhops++]; + error = nl_parse_header(rtnh, rtnh->rtnh_len, &mpath_parser, + npt, mpnh); + if (error != 0) { + NLMSG_REPORT_ERR_MSG(npt, + "RTA_MULTIPATH: nexthop %u: parse failed", + mp->num_nhops - 1); + return (error); + } + len = NL_ITEM_ALIGN(rtnh->rtnh_len); + data_len -= len; + rtnh = (struct rtnexthop *)((char *)rtnh + len); + } + if (data_len != 0 || mp->num_nhops == 0) { + NLMSG_REPORT_ERR_MSG(npt, "invalid RTA_MULTIPATH attr"); + return (EINVAL); + } + + *((struct rta_mpath **)target) = mp; + return (0); +} + + +struct nl_parsed_route { + struct sockaddr *rta_dst; + struct sockaddr *rta_gw; + struct ifnet *rta_oif; + struct rta_mpath *rta_multipath; + uint32_t rta_table; + uint32_t rta_rtflags; + uint32_t rta_nh_id; + uint32_t rta_weight; + uint32_t rtax_mtu; + uint8_t rtm_table; + uint8_t rtm_family; + uint8_t rtm_dst_len; + uint8_t rtm_protocol; + uint8_t rtm_type; + uint32_t rtm_flags; +}; + +#define _IN(_field) offsetof(struct rtmsg, _field) +#define 
_OUT(_field) offsetof(struct nl_parsed_route, _field) +static struct nlattr_parser nla_p_rtmetrics[] = { + { .type = NL_RTAX_MTU, .off = _OUT(rtax_mtu), .cb = nlattr_get_uint32 }, +}; +NL_DECLARE_ATTR_PARSER(metrics_parser, nla_p_rtmetrics); + +static const struct nlattr_parser nla_p_rtmsg[] = { + { .type = NL_RTA_DST, .off = _OUT(rta_dst), .cb = nlattr_get_ip }, + { .type = NL_RTA_OIF, .off = _OUT(rta_oif), .cb = nlattr_get_ifp }, + { .type = NL_RTA_GATEWAY, .off = _OUT(rta_gw), .cb = nlattr_get_ip }, + { .type = NL_RTA_METRICS, .arg = &metrics_parser, .cb = nlattr_get_nested }, + { .type = NL_RTA_MULTIPATH, .off = _OUT(rta_multipath), .cb = nlattr_get_multipath }, + { .type = NL_RTA_WEIGHT, .off = _OUT(rta_weight), .cb = nlattr_get_uint32 }, + { .type = NL_RTA_RTFLAGS, .off = _OUT(rta_rtflags), .cb = nlattr_get_uint32 }, + { .type = NL_RTA_TABLE, .off = _OUT(rta_table), .cb = nlattr_get_uint32 }, + { .type = NL_RTA_VIA, .off = _OUT(rta_gw), .cb = nlattr_get_ipvia }, + { .type = NL_RTA_NH_ID, .off = _OUT(rta_nh_id), .cb = nlattr_get_uint32 }, +}; + +static const struct nlfield_parser nlf_p_rtmsg[] = { + { .off_in = _IN(rtm_family), .off_out = _OUT(rtm_family), .cb = nlf_get_u8 }, + { .off_in = _IN(rtm_dst_len), .off_out = _OUT(rtm_dst_len), .cb = nlf_get_u8 }, + { .off_in = _IN(rtm_protocol), .off_out = _OUT(rtm_protocol), .cb = nlf_get_u8 }, + { .off_in = _IN(rtm_type), .off_out = _OUT(rtm_type), .cb = nlf_get_u8 }, + { .off_in = _IN(rtm_table), .off_out = _OUT(rtm_table), .cb = nlf_get_u8 }, + { .off_in = _IN(rtm_flags), .off_out = _OUT(rtm_flags), .cb = nlf_get_u32 }, +}; +#undef _IN +#undef _OUT + +static bool +post_p_rtmsg(void *_attrs, struct nl_pstate *npt __unused) +{ + struct nl_parsed_route *attrs = (struct nl_parsed_route *)_attrs; + + set_scope6(attrs->rta_dst, attrs->rta_oif); + set_scope6(attrs->rta_gw, attrs->rta_oif); + return (true); +} +NL_DECLARE_PARSER_EXT(rtm_parser, struct rtmsg, NULL, nlf_p_rtmsg, nla_p_rtmsg, post_p_rtmsg); + +struct 
netlink_walkargs { + struct nl_writer *nw; + struct route_nhop_data rnd; + struct nlmsghdr hdr; + struct nlpcb *nlp; + uint32_t fibnum; + int family; + int error; + int count; + int dumped; + int dumped_tables; +}; + +static int +dump_rtentry(struct rtentry *rt, void *_arg) +{ + struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg; + int error; + + wa->count++; + if (wa->error != 0) + return (0); + if (!rt_is_exportable(rt, nlp_get_cred(wa->nlp))) + return (0); + wa->dumped++; + + rt_get_rnd(rt, &wa->rnd); + + error = dump_px(wa->fibnum, &wa->hdr, rt, &wa->rnd, wa->nw); + + IF_DEBUG_LEVEL(LOG_DEBUG3) { + char rtbuf[INET6_ADDRSTRLEN + 5]; + FIB_LOG(LOG_DEBUG3, wa->fibnum, wa->family, + "Dump %s, error %d", + rt_print_buf(rt, rtbuf, sizeof(rtbuf)), error); + } + wa->error = error; + + return (0); +} + +static void +dump_rtable_one(struct netlink_walkargs *wa, uint32_t fibnum, int family) +{ + FIB_LOG(LOG_DEBUG2, fibnum, family, "Start dump"); + wa->count = 0; + wa->dumped = 0; + + rib_walk(fibnum, family, false, dump_rtentry, wa); + + wa->dumped_tables++; + + FIB_LOG(LOG_DEBUG2, fibnum, family, "End dump, iterated %d dumped %d", + wa->count, wa->dumped); +} + +static int +dump_rtable_fib(struct netlink_walkargs *wa, uint32_t fibnum, int family) +{ + wa->fibnum = fibnum; + + if (family == AF_UNSPEC) { + for (int i = 0; i < AF_MAX; i++) { + if (rt_tables_get_rnh(fibnum, i) != 0) { + wa->family = i; + dump_rtable_one(wa, fibnum, i); + if (wa->error != 0) + break; + } + } + } else { + if (rt_tables_get_rnh(fibnum, family) != 0) { + wa->family = family; + dump_rtable_one(wa, fibnum, family); + } + } + + return (wa->error); +} + +static int +handle_rtm_getroute(struct nlpcb *nlp, struct nl_parsed_route *attrs, + struct nlmsghdr *hdr, struct nl_pstate *npt) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rnh; + const struct rtentry *rt; + struct route_nhop_data rnd; + uint32_t fibnum = attrs->rta_table; + sa_family_t family = attrs->rtm_family; + + if (attrs->rta_dst == 
NULL) { + NLMSG_REPORT_ERR_MSG(npt, "No RTA_DST supplied"); + return (EINVAL); + } + + rnh = rt_tables_get_rnh(fibnum, family); + if (rnh == NULL) + return (EAFNOSUPPORT); + + RIB_RLOCK(rnh); + + struct sockaddr *dst = attrs->rta_dst; + + if (attrs->rtm_flags & RTM_F_PREFIX) + rt = rib_lookup_prefix_plen(rnh, dst, attrs->rtm_dst_len, &rnd); + else + rt = (const struct rtentry *)rnh->rnh_matchaddr(dst, &rnh->head); + if (rt == NULL) { + RIB_RUNLOCK(rnh); + return (ESRCH); + } + + rt_get_rnd(rt, &rnd); + rnd.rnd_nhop = nhop_select_func(rnd.rnd_nhop, 0); + + RIB_RUNLOCK(rnh); + + if (!rt_is_exportable(rt, nlp_get_cred(nlp))) + return (ESRCH); + + IF_DEBUG_LEVEL(LOG_DEBUG2) { + char rtbuf[NHOP_PRINT_BUFSIZE] __unused, nhbuf[NHOP_PRINT_BUFSIZE] __unused; + FIB_LOG(LOG_DEBUG2, fibnum, family, "getroute completed: got %s for %s", + nhop_print_buf_any(rnd.rnd_nhop, nhbuf, sizeof(nhbuf)), + rt_print_buf(rt, rtbuf, sizeof(rtbuf))); + } + + hdr->nlmsg_type = NL_RTM_NEWROUTE; + dump_px(fibnum, hdr, rt, &rnd, npt->nw); + + return (0); +} + +static int +handle_rtm_dump(struct nlpcb *nlp, uint32_t fibnum, int family, + struct nlmsghdr *hdr, struct nl_writer *nw) +{ + struct netlink_walkargs wa = { + .nlp = nlp, + .nw = nw, + .hdr.nlmsg_pid = hdr->nlmsg_pid, + .hdr.nlmsg_seq = hdr->nlmsg_seq, + .hdr.nlmsg_type = NL_RTM_NEWROUTE, + .hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI, + }; + + if (fibnum == RT_TABLE_UNSPEC) { + for (int i = 0; i < V_rt_numfibs; i++) { + dump_rtable_fib(&wa, fibnum, family); + if (wa.error != 0) + break; + } + } else + dump_rtable_fib(&wa, fibnum, family); + + if (wa.error == 0 && wa.dumped_tables == 0) { + FIB_LOG(LOG_DEBUG, fibnum, family, "incorrect fibnum/family"); + wa.error = ESRCH; + // How do we propagate it? 
+ } + + if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) { + NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); + return (ENOMEM); + } + + return (wa.error); +} + +static struct nhop_object * +finalize_nhop(struct nhop_object *nh, const struct sockaddr *dst, int *perror) +{ + /* + * The following MUST be filled: + * nh_ifp, nh_ifa, nh_gw + */ + if (nh->gw_sa.sa_family == 0) { + /* + * Empty gateway. Can be direct route with RTA_OIF set. + */ + if (nh->nh_ifp != NULL) + nhop_set_direct_gw(nh, nh->nh_ifp); + else { + NL_LOG(LOG_DEBUG, "empty gateway and interface, skipping"); + *perror = EINVAL; + return (NULL); + } + /* Both nh_ifp and gateway are set */ + } else { + /* Gateway is set up, we can derive ifp if not set */ + if (nh->nh_ifp == NULL) { + uint32_t fibnum = nhop_get_fibnum(nh); + uint32_t flags = 0; + + if (nh->nh_flags & NHF_GATEWAY) + flags = RTF_GATEWAY; + else if (nh->nh_flags & NHF_HOST) + flags = RTF_HOST; + + struct ifaddr *ifa = ifa_ifwithroute(flags, dst, &nh->gw_sa, fibnum); + if (ifa == NULL) { + NL_LOG(LOG_DEBUG, "Unable to determine ifp, skipping"); + *perror = EINVAL; + return (NULL); + } + nhop_set_transmit_ifp(nh, ifa->ifa_ifp); + } + } + /* Both nh_ifp and gateway are set */ + if (nh->nh_ifa == NULL) { + const struct sockaddr *gw_sa = &nh->gw_sa; + + if (gw_sa->sa_family != dst->sa_family) { + /* + * Use dst as the target for determining the default + * preferred ifa IF + * 1) the gateway is link-level (e.g. direct route) + * 2) the gateway family is different (e.g. IPv4 over IPv6). + */ + gw_sa = dst; + } + + struct ifaddr *ifa = ifaof_ifpforaddr(gw_sa, nh->nh_ifp); + if (ifa == NULL) { + /* Try link-level ifa. 
*/ + gw_sa = &nh->gw_sa; + ifa = ifaof_ifpforaddr(gw_sa, nh->nh_ifp); + if (ifa == NULL) { + NL_LOG(LOG_DEBUG, "Unable to determine ifa, skipping"); + *perror = EINVAL; + return (NULL); + } + } + nhop_set_src(nh, ifa); + } + + return (nhop_get_nhop(nh, perror)); +} + +static int +get_pxflag(const struct nl_parsed_route *attrs) +{ + int pxflag = 0; + switch (attrs->rtm_family) { + case AF_INET: + if (attrs->rtm_dst_len == 32) + pxflag = NHF_HOST; + else if (attrs->rtm_dst_len == 0) + pxflag = NHF_DEFAULT; + break; + case AF_INET6: + if (attrs->rtm_dst_len == 128) + pxflag = NHF_HOST; + else if (attrs->rtm_dst_len == 0) + pxflag = NHF_DEFAULT; + break; + } + + return (pxflag); +} + +static int +get_op_flags(int nlm_flags) +{ + int op_flags = 0; + + op_flags |= (nlm_flags & NLM_F_REPLACE) ? RTM_F_REPLACE : 0; + op_flags |= (nlm_flags & NLM_F_EXCL) ? RTM_F_EXCL : 0; + op_flags |= (nlm_flags & NLM_F_CREATE) ? RTM_F_CREATE : 0; + op_flags |= (nlm_flags & NLM_F_APPEND) ? RTM_F_APPEND : 0; + + return (op_flags); +} + +#ifdef ROUTE_MPATH +static int +create_nexthop_one(struct nl_parsed_route *attrs, struct rta_mpath_nh *mpnh, + struct nl_pstate *npt, struct nhop_object **pnh) +{ + int error; + + if (mpnh->gw == NULL) + return (EINVAL); + + struct nhop_object *nh = nhop_alloc(attrs->rta_table, attrs->rtm_family); + if (nh == NULL) + return (ENOMEM); + + error = nl_set_nexthop_gw(nh, mpnh->gw, mpnh->ifp, npt); + if (error != 0) { + nhop_free(nh); + return (error); + } + if (mpnh->ifp != NULL) + nhop_set_transmit_ifp(nh, mpnh->ifp); + nhop_set_pxtype_flag(nh, get_pxflag(attrs)); + nhop_set_rtflags(nh, attrs->rta_rtflags); + if (attrs->rtm_protocol > RTPROT_STATIC) + nhop_set_origin(nh, attrs->rtm_protocol); + + *pnh = finalize_nhop(nh, attrs->rta_dst, &error); + + return (error); +} +#endif + +static struct nhop_object * +create_nexthop_from_attrs(struct nl_parsed_route *attrs, + struct nl_pstate *npt, int *perror) +{ + struct nhop_object *nh = NULL; + int error = 0; + + if 
(attrs->rta_multipath != NULL) { +#ifdef ROUTE_MPATH + /* Multipath w/o explicit nexthops */ + int num_nhops = attrs->rta_multipath->num_nhops; + struct weightened_nhop *wn = npt_alloc(npt, sizeof(*wn) * num_nhops); + + for (int i = 0; i < num_nhops; i++) { + struct rta_mpath_nh *mpnh = &attrs->rta_multipath->nhops[i]; + + error = create_nexthop_one(attrs, mpnh, npt, &wn[i].nh); + if (error != 0) { + for (int j = 0; j < i; j++) + nhop_free(wn[j].nh); + break; + } + wn[i].weight = mpnh->rtnh_weight > 0 ? mpnh->rtnh_weight : 1; + } + if (error == 0) { + struct rib_head *rh = nhop_get_rh(wn[0].nh); + struct nhgrp_object *nhg; + + nhg = nhgrp_alloc(rh->rib_fibnum, rh->rib_family, + wn, num_nhops, perror); + if (nhg != NULL) { + if (attrs->rtm_protocol > RTPROT_STATIC) + nhgrp_set_origin(nhg, attrs->rtm_protocol); + nhg = nhgrp_get_nhgrp(nhg, perror); + } + for (int i = 0; i < num_nhops; i++) + nhop_free(wn[i].nh); + if (nhg != NULL) + return ((struct nhop_object *)nhg); + error = *perror; + } +#else + error = ENOTSUP; +#endif + *perror = error; + } else { + nh = nhop_alloc(attrs->rta_table, attrs->rtm_family); + if (nh == NULL) { + *perror = ENOMEM; + return (NULL); + } + if (attrs->rta_gw != NULL) { + *perror = nl_set_nexthop_gw(nh, attrs->rta_gw, attrs->rta_oif, npt); + if (*perror != 0) { + nhop_free(nh); + return (NULL); + } + } + if (attrs->rta_oif != NULL) + nhop_set_transmit_ifp(nh, attrs->rta_oif); + if (attrs->rtax_mtu != 0) + nhop_set_mtu(nh, attrs->rtax_mtu, true); + if (attrs->rta_rtflags & RTF_BROADCAST) + nhop_set_broadcast(nh, true); + if (attrs->rtm_protocol > RTPROT_STATIC) + nhop_set_origin(nh, attrs->rtm_protocol); + nhop_set_pxtype_flag(nh, get_pxflag(attrs)); + nhop_set_rtflags(nh, attrs->rta_rtflags); + + switch (attrs->rtm_type) { + case RTN_UNICAST: + break; + case RTN_BLACKHOLE: + nhop_set_blackhole(nh, RTF_BLACKHOLE); + break; + case RTN_PROHIBIT: + case RTN_UNREACHABLE: + nhop_set_blackhole(nh, RTF_REJECT); + break; + /* TODO: return ENOTSUP 
for other types if strict option is set */ + } + + nh = finalize_nhop(nh, attrs->rta_dst, perror); + } + + return (nh); +} + +static int +rtnl_handle_newroute(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct nl_pstate *npt) +{ + struct rib_cmd_info rc = {}; + struct nhop_object *nh = NULL; + int error; + + struct nl_parsed_route attrs = {}; + error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); + if (error != 0) + return (error); + + /* Check if we have enough data */ + if (attrs.rta_dst == NULL) { + NL_LOG(LOG_DEBUG, "missing RTA_DST"); + return (EINVAL); + } + + /* pre-2.6.19 Linux API compatibility */ + if (attrs.rtm_table > 0 && attrs.rta_table == 0) + attrs.rta_table = attrs.rtm_table; + if (attrs.rta_table >= V_rt_numfibs || attrs.rtm_family > AF_MAX) { + NLMSG_REPORT_ERR_MSG(npt, "invalid fib"); + return (EINVAL); + } + + if (attrs.rta_nh_id != 0) { + /* Referenced uindex */ + int pxflag = get_pxflag(&attrs); + nh = nl_find_nhop(attrs.rta_table, attrs.rtm_family, attrs.rta_nh_id, + pxflag, &error); + if (error != 0) + return (error); + } else { + nh = create_nexthop_from_attrs(&attrs, npt, &error); + if (error != 0) { + NL_LOG(LOG_DEBUG, "Error creating nexthop"); + return (error); + } + } + + if (!NH_IS_NHGRP(nh) && attrs.rta_weight == 0) + attrs.rta_weight = RT_DEFAULT_WEIGHT; + struct route_nhop_data rnd = { .rnd_nhop = nh, .rnd_weight = attrs.rta_weight }; + int op_flags = get_op_flags(hdr->nlmsg_flags); + + error = rib_add_route_px(attrs.rta_table, attrs.rta_dst, attrs.rtm_dst_len, + &rnd, op_flags, &rc); + if (error == 0) + report_operation(attrs.rta_table, &rc, nlp, hdr); + return (error); +} + +static int +path_match_func(const struct rtentry *rt, const struct nhop_object *nh, void *_data) +{ + struct nl_parsed_route *attrs = (struct nl_parsed_route *)_data; + + if ((attrs->rta_gw != NULL) && !rib_match_gw(rt, nh, attrs->rta_gw)) + return (0); + + if ((attrs->rta_oif != NULL) && (attrs->rta_oif != nh->nh_ifp)) + return (0); + + return (1); +} + 
+static int +rtnl_handle_delroute(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct nl_pstate *npt) +{ + struct rib_cmd_info rc; + int error; + + struct nl_parsed_route attrs = {}; + error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); + if (error != 0) + return (error); + + if (attrs.rta_dst == NULL) { + NLMSG_REPORT_ERR_MSG(npt, "RTA_DST is not set"); + return (ESRCH); + } + + if (attrs.rta_table >= V_rt_numfibs || attrs.rtm_family > AF_MAX) { + NLMSG_REPORT_ERR_MSG(npt, "invalid fib"); + return (EINVAL); + } + + error = rib_del_route_px(attrs.rta_table, attrs.rta_dst, + attrs.rtm_dst_len, path_match_func, &attrs, + (attrs.rta_rtflags & RTF_PINNED) ? RTM_F_FORCE : 0, &rc); + if (error == 0) + report_operation(attrs.rta_table, &rc, nlp, hdr); + return (error); +} + +static int +rtnl_handle_getroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) +{ + int error; + + struct nl_parsed_route attrs = {}; + error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); + if (error != 0) + return (error); + + if (attrs.rta_table >= V_rt_numfibs || attrs.rtm_family > AF_MAX) { + NLMSG_REPORT_ERR_MSG(npt, "invalid fib"); + return (EINVAL); + } + + if (hdr->nlmsg_flags & NLM_F_DUMP) + error = handle_rtm_dump(nlp, attrs.rta_table, attrs.rtm_family, hdr, npt->nw); + else + error = handle_rtm_getroute(nlp, &attrs, hdr, npt); + + return (error); +} + +void +rtnl_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc) +{ + struct nl_writer nw; + int family, nlm_flags = 0; + + family = rt_get_family(rc->rc_rt); + + /* XXX: check if there are active listeners first */ + + /* TODO: consider passing PID/type/seq */ + switch (rc->rc_cmd) { + case RTM_ADD: + nlm_flags = NLM_F_EXCL | NLM_F_CREATE; + break; + case RTM_CHANGE: + nlm_flags = NLM_F_REPLACE; + break; + case RTM_DELETE: + nlm_flags = 0; + break; + } + IF_DEBUG_LEVEL(LOG_DEBUG2) { + char rtbuf[NHOP_PRINT_BUFSIZE] __unused; + FIB_LOG(LOG_DEBUG2, fibnum, family, + "received event %s for %s / nlm_flags=%X", 
+ rib_print_cmd(rc->rc_cmd), + rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf)), + nlm_flags); + } + + struct nlmsghdr hdr = { + .nlmsg_flags = nlm_flags, + .nlmsg_type = get_rtmsg_type_from_rtsock(rc->rc_cmd), + }; + + struct route_nhop_data rnd = { + .rnd_nhop = rc_get_nhop(rc), + .rnd_weight = rc->rc_nh_weight, + }; + + uint32_t group_id = family_to_group(family); + if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id, 0, + false)) { + NL_LOG(LOG_DEBUG, "error allocating event buffer"); + return; + } + + dump_px(fibnum, &hdr, rc->rc_rt, &rnd, &nw); + nlmsg_flush(&nw); +} + +static const struct rtnl_cmd_handler cmd_handlers[] = { + { + .cmd = NL_RTM_GETROUTE, + .name = "RTM_GETROUTE", + .cb = &rtnl_handle_getroute, + .flags = RTNL_F_ALLOW_NONVNET_JAIL, + }, + { + .cmd = NL_RTM_DELROUTE, + .name = "RTM_DELROUTE", + .cb = &rtnl_handle_delroute, + .priv = PRIV_NET_ROUTE, + .flags = RTNL_F_ALLOW_NONVNET_JAIL, + }, + { + .cmd = NL_RTM_NEWROUTE, + .name = "RTM_NEWROUTE", + .cb = &rtnl_handle_newroute, + .priv = PRIV_NET_ROUTE, + .flags = RTNL_F_ALLOW_NONVNET_JAIL, + } +}; + +static const struct nlhdr_parser *all_parsers[] = {&mpath_parser, &metrics_parser, &rtm_parser}; + +void +rtnl_routes_init(void) +{ + NL_VERIFY_PARSERS(all_parsers); + rtnl_register_messages(cmd_handlers, nitems(cmd_handlers)); +} |