aboutsummaryrefslogtreecommitdiff
path: root/sys/netlink/netlink_io.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/netlink/netlink_io.c')
-rw-r--r--sys/netlink/netlink_io.c369
1 files changed, 369 insertions, 0 deletions
diff --git a/sys/netlink/netlink_io.c b/sys/netlink/netlink_io.c
new file mode 100644
index 000000000000..e7908d6f3a44
--- /dev/null
+++ b/sys/netlink/netlink_io.c
@@ -0,0 +1,369 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/ck.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/syslog.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_linux.h>
+#include <netlink/netlink_var.h>
+
+#define DEBUG_MOD_NAME nl_io
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_INFO);
+
+/*
+ * The logic below provide a p2p interface for receiving and
+ * sending netlink data between the kernel and userland.
+ */
+
+static bool nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp);
+
+struct nl_buf *
+nl_buf_alloc(size_t len, int mflag)
+{
+ struct nl_buf *nb;
+
+ KASSERT(len > 0 && len <= UINT_MAX, ("%s: invalid length %zu",
+ __func__, len));
+
+ nb = malloc(sizeof(struct nl_buf) + len, M_NETLINK, mflag);
+ if (__predict_true(nb != NULL)) {
+ nb->buflen = len;
+ nb->datalen = nb->offset = 0;
+ }
+
+ return (nb);
+}
+
+void
+nl_buf_free(struct nl_buf *nb)
+{
+
+ free(nb, M_NETLINK);
+}
+
+void
+nl_schedule_taskqueue(struct nlpcb *nlp)
+{
+ if (!nlp->nl_task_pending) {
+ nlp->nl_task_pending = true;
+ taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
+ NL_LOG(LOG_DEBUG3, "taskqueue scheduled");
+ } else {
+ NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped");
+ }
+}
+
+static bool
+nl_process_received_one(struct nlpcb *nlp)
+{
+ struct socket *so = nlp->nl_socket;
+ struct sockbuf *sb;
+ struct nl_buf *nb;
+ bool reschedule = false;
+
+ NLP_LOCK(nlp);
+ nlp->nl_task_pending = false;
+ NLP_UNLOCK(nlp);
+
+ /*
+ * Do not process queued up requests if there is no space to queue
+ * replies.
+ */
+ sb = &so->so_rcv;
+ SOCK_RECVBUF_LOCK(so);
+ if (sb->sb_hiwat <= sb->sb_ccc) {
+ SOCK_RECVBUF_UNLOCK(so);
+ NL_LOG(LOG_DEBUG3, "socket %p stuck", so);
+ return (false);
+ }
+ SOCK_RECVBUF_UNLOCK(so);
+
+ sb = &so->so_snd;
+ SOCK_SENDBUF_LOCK(so);
+ while ((nb = TAILQ_FIRST(&sb->nl_queue)) != NULL) {
+ TAILQ_REMOVE(&sb->nl_queue, nb, tailq);
+ SOCK_SENDBUF_UNLOCK(so);
+ reschedule = nl_process_nbuf(nb, nlp);
+ SOCK_SENDBUF_LOCK(so);
+ if (reschedule) {
+ sb->sb_acc -= nb->datalen;
+ sb->sb_ccc -= nb->datalen;
+ /* XXXGL: potentially can reduce lock&unlock count. */
+ sowwakeup_locked(so);
+ nl_buf_free(nb);
+ SOCK_SENDBUF_LOCK(so);
+ } else {
+ TAILQ_INSERT_HEAD(&sb->nl_queue, nb, tailq);
+ break;
+ }
+ }
+ SOCK_SENDBUF_UNLOCK(so);
+
+ return (reschedule);
+}
+
+static void
+nl_process_received(struct nlpcb *nlp)
+{
+ NL_LOG(LOG_DEBUG3, "taskqueue called");
+
+ if (__predict_false(nlp->nl_need_thread_setup)) {
+ nl_set_thread_nlp(curthread, nlp);
+ NLP_LOCK(nlp);
+ nlp->nl_need_thread_setup = false;
+ NLP_UNLOCK(nlp);
+ }
+
+ while (nl_process_received_one(nlp))
+ ;
+}
+
+/*
+ * Called after some data have been read from the socket.
+ */
+void
+nl_on_transmit(struct nlpcb *nlp)
+{
+ NLP_LOCK(nlp);
+
+ struct socket *so = nlp->nl_socket;
+ if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) {
+ unsigned long dropped_bytes = nlp->nl_dropped_bytes;
+ unsigned long dropped_messages = nlp->nl_dropped_messages;
+ nlp->nl_dropped_bytes = 0;
+ nlp->nl_dropped_messages = 0;
+
+ struct sockbuf *sb = &so->so_rcv;
+ NLP_LOG(LOG_DEBUG, nlp,
+ "socket RX overflowed, %lu messages (%lu bytes) dropped. "
+ "bytes: [%u/%u]", dropped_messages, dropped_bytes,
+ sb->sb_ccc, sb->sb_hiwat);
+ /* TODO: send netlink message */
+ }
+
+ nl_schedule_taskqueue(nlp);
+ NLP_UNLOCK(nlp);
+}
+
+void
+nl_taskqueue_handler(void *_arg, int pending)
+{
+ struct nlpcb *nlp = (struct nlpcb *)_arg;
+
+ CURVNET_SET(nlp->nl_socket->so_vnet);
+ nl_process_received(nlp);
+ CURVNET_RESTORE();
+}
+
+/*
+ * Tries to send current data buffer from writer.
+ *
+ * Returns true on success.
+ * If no queue overrunes happened, wakes up socket owner.
+ */
+bool
+nl_send(struct nl_writer *nw, struct nlpcb *nlp)
+{
+ struct socket *so = nlp->nl_socket;
+ struct sockbuf *sb = &so->so_rcv;
+ struct nl_buf *nb;
+
+ MPASS(nw->hdr == NULL);
+ MPASS(nw->buf != NULL);
+ MPASS(nw->buf->datalen > 0);
+
+ IF_DEBUG_LEVEL(LOG_DEBUG2) {
+ struct nlmsghdr *hdr = (struct nlmsghdr *)nw->buf->data;
+ NLP_LOG(LOG_DEBUG2, nlp,
+ "TX len %u msgs %u msg type %d first hdrlen %u",
+ nw->buf->datalen, nw->num_messages, hdr->nlmsg_type,
+ hdr->nlmsg_len);
+ }
+
+ if (nlp->nl_linux && linux_netlink_p != NULL &&
+ __predict_false(!linux_netlink_p->msgs_to_linux(nw, nlp))) {
+ nl_buf_free(nw->buf);
+ nw->buf = NULL;
+ return (false);
+ }
+
+ nb = nw->buf;
+ nw->buf = NULL;
+
+ SOCK_RECVBUF_LOCK(so);
+ if (!nw->ignore_limit && __predict_false(sb->sb_hiwat <= sb->sb_ccc)) {
+ SOCK_RECVBUF_UNLOCK(so);
+ NLP_LOCK(nlp);
+ nlp->nl_dropped_bytes += nb->datalen;
+ nlp->nl_dropped_messages += nw->num_messages;
+ NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
+ (unsigned long)nlp->nl_dropped_messages, nw->num_messages,
+ (unsigned long)nlp->nl_dropped_bytes, nb->datalen);
+ NLP_UNLOCK(nlp);
+ nl_buf_free(nb);
+ return (false);
+ } else {
+ bool full;
+
+ TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq);
+ sb->sb_acc += nb->datalen;
+ sb->sb_ccc += nb->datalen;
+ full = sb->sb_hiwat <= sb->sb_ccc;
+ sorwakeup_locked(so);
+ if (full) {
+ NLP_LOCK(nlp);
+ nlp->nl_tx_blocked = true;
+ NLP_UNLOCK(nlp);
+ }
+ return (true);
+ }
+}
+
+static int
+nl_receive_message(struct nlmsghdr *hdr, int remaining_length,
+ struct nlpcb *nlp, struct nl_pstate *npt)
+{
+ nl_handler_f handler = nl_handlers[nlp->nl_proto].cb;
+ int error = 0;
+
+ NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
+ hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq,
+ hdr->nlmsg_pid);
+
+ if (__predict_false(hdr->nlmsg_len > remaining_length)) {
+ NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d",
+ hdr->nlmsg_len, remaining_length);
+ return (EINVAL);
+ } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) {
+ NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len);
+ return (EINVAL);
+ }
+ /* Stamp each message with sender pid */
+ hdr->nlmsg_pid = nlp->nl_port;
+
+ npt->hdr = hdr;
+
+ if (hdr->nlmsg_flags & NLM_F_REQUEST &&
+ hdr->nlmsg_type >= NLMSG_MIN_TYPE) {
+ NL_LOG(LOG_DEBUG2, "handling message with msg type: %d",
+ hdr->nlmsg_type);
+ if (nlp->nl_linux) {
+ MPASS(linux_netlink_p != NULL);
+ error = linux_netlink_p->msg_from_linux(nlp->nl_proto,
+ &hdr, npt);
+ if (error)
+ goto ack;
+ }
+ error = handler(hdr, npt);
+ NL_LOG(LOG_DEBUG2, "retcode: %d", error);
+ }
+ack:
+ if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) {
+ if (!npt->nw->suppress_ack) {
+ NL_LOG(LOG_DEBUG3, "ack");
+ nlmsg_ack(nlp, error, hdr, npt);
+ }
+ }
+
+ return (0);
+}
+
+static void
+npt_clear(struct nl_pstate *npt)
+{
+ lb_clear(&npt->lb);
+ npt->cookie = NULL;
+ npt->error = 0;
+ npt->err_msg = NULL;
+ npt->err_off = 0;
+ npt->hdr = NULL;
+ npt->nw->suppress_ack = false;
+}
+
+/*
+ * Processes an incoming packet, which can contain multiple netlink messages
+ */
+static bool
+nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp)
+{
+ struct nl_writer nw;
+ struct nlmsghdr *hdr;
+ int error;
+
+ NL_LOG(LOG_DEBUG3, "RX netlink buf %p on %p", nb, nlp->nl_socket);
+
+ if (!nl_writer_unicast(&nw, NLMSG_SMALL, nlp, false)) {
+ NL_LOG(LOG_DEBUG, "error allocating socket writer");
+ return (true);
+ }
+
+ nlmsg_ignore_limit(&nw);
+
+ struct nl_pstate npt = {
+ .nlp = nlp,
+ .lb.base = &nb->data[roundup2(nb->datalen, 8)],
+ .lb.size = nb->buflen - roundup2(nb->datalen, 8),
+ .nw = &nw,
+ .strict = nlp->nl_flags & NLF_STRICT,
+ };
+
+ for (; nb->offset + sizeof(struct nlmsghdr) <= nb->datalen;) {
+ hdr = (struct nlmsghdr *)&nb->data[nb->offset];
+ /* Save length prior to calling handler */
+ int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
+ NL_LOG(LOG_DEBUG3, "parsing offset %d/%d",
+ nb->offset, nb->datalen);
+ npt_clear(&npt);
+ error = nl_receive_message(hdr, nb->datalen - nb->offset, nlp,
+ &npt);
+ nb->offset += msglen;
+ if (__predict_false(error != 0 || nlp->nl_tx_blocked))
+ break;
+ }
+ NL_LOG(LOG_DEBUG3, "packet parsing done");
+ nlmsg_flush(&nw);
+
+ if (nlp->nl_tx_blocked) {
+ NLP_LOCK(nlp);
+ nlp->nl_tx_blocked = false;
+ NLP_UNLOCK(nlp);
+ return (false);
+ } else
+ return (true);
+}