author    John Baldwin <jhb@FreeBSD.org>    2024-05-02 23:34:45 +0000
committer John Baldwin <jhb@FreeBSD.org>    2024-05-02 23:38:30 +0000
commit    a15f7c96a27644de3ed3bfcf5feee285ebc1cc91 (patch)
tree      1c103f5e1fe1adef4191e3451049fc42ad1606fb
parent    51346bd594585614bc99f29a124a2473df3155bd (diff)
nvmft: The in-kernel NVMe over Fabrics controller
This is the server (target in SCSI terms) for NVMe over Fabrics.
Userland is responsible for accepting a new queue pair and receiving
the initial Connect command before handing the queue pair off via an
ioctl to this CTL frontend.

This frontend exposes CTL LUNs as NVMe namespaces to remote hosts.
Users can add LUNs to CTL that can be shared via either iSCSI or
NVMeoF.

Reviewed by:	imp
Sponsored by:	Chelsio Communications
Differential Revision:	https://reviews.freebsd.org/D44726
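As a sketch of the handoff flow described above: after nvmfd(8) accepts a
connection and reads the CONNECT command, it passes the queue pair to the
kernel roughly as below. This is a minimal editorial sketch, assuming the
struct ctl_nvmf ioctl interface added in this patch; the header name and
error handling are assumptions, and the transport-specific queue pair
parameters (for example, the connected TCP socket) are elided.

    /* Editorial sketch: hand a connected queue pair off to nvmft. */
    #include <sys/ioctl.h>
    #include <err.h>
    #include <errno.h>
    #include <stdbool.h>
    #include <string.h>

    #include <cam/ctl/ctl_ioctl.h>	/* assumed home of struct ctl_nvmf */

    static int
    handoff_queue_pair(int ctl_fd, struct nvmf_fabric_connect_cmd *cmd,
        struct nvmf_fabric_connect_data *data, bool admin)
    {
    	struct ctl_nvmf cn;

    	memset(&cn, 0, sizeof(cn));
    	cn.type = CTL_NVMF_HANDOFF;
    	cn.data.handoff.cmd = cmd;	/* CONNECT SQE read in userland */
    	cn.data.handoff.data = data;	/* CONNECT data (hostnqn, subnqn) */
    	cn.data.handoff.params.admin = admin;
    	/* trtype and transport-specific queue state omitted here. */

    	if (ioctl(ctl_fd, CTL_NVMF, &cn) == -1)
    		return (errno);
    	if (cn.status != CTL_NVMF_OK) {
    		warnx("handoff failed: %s", cn.error_str);
    		return (EIO);
    	}
    	return (0);
    }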
-rw-r--r--  share/man/man4/Makefile                     |    1
-rw-r--r--  share/man/man4/nvmft.4                      |   85
-rw-r--r--  sys/conf/NOTES                              |    2
-rw-r--r--  sys/conf/files                              |    6
-rw-r--r--  sys/dev/nvmf/controller/ctl_frontend_nvmf.c | 1123
-rw-r--r--  sys/dev/nvmf/controller/nvmft_controller.c  | 1130
-rw-r--r--  sys/dev/nvmf/controller/nvmft_qpair.c       |  361
-rw-r--r--  sys/dev/nvmf/controller/nvmft_var.h         |  174
-rw-r--r--  sys/modules/nvmf/Makefile                   |    3
-rw-r--r--  sys/modules/nvmf/nvmft/Makefile             |   10
10 files changed, 2893 insertions, 2 deletions
diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
index 7b6f8849be59..32ea3a1b6991 100644
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -410,6 +410,7 @@ MAN= aac.4 \
nvme.4 \
nvmf.4 \
nvmf_tcp.4 \
+ nvmft.4 \
${_nvram.4} \
oce.4 \
ocs_fc.4\
diff --git a/share/man/man4/nvmft.4 b/share/man/man4/nvmft.4
new file mode 100644
index 000000000000..d121fb97b514
--- /dev/null
+++ b/share/man/man4/nvmft.4
@@ -0,0 +1,85 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2024 Chelsio Communications, Inc.
+.\"
+.Dd May 2, 2024
+.Dt NVMFT 4
+.Os
+.Sh NAME
+.Nm nvmft
+.Nd "NVM Express over Fabrics CAM Target Layer frontend"
+.Sh SYNOPSIS
+To compile the subsystem into the kernel,
+place the following lines in the
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device nvmft"
+.Cd "device ctl"
+.Ed
+.Pp
+Alternatively, to load the subsystem as a
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+nvmft_load="YES"
+.Ed
+.Sh DESCRIPTION
+The
+.Nm
+driver provides the kernel component of an NVM Express over Fabrics
+controller.
+The NVMeoF controller is the server exporting namespaces backed by
+local files and volumes to remote hosts.
+.Nm
+follows the dynamic controller model and creates a new dynamic controller
+for each association.
+.Pp
+.Nm
+is implemented as a
+.Xr ctl 4
+frontend and exports CAM Target Layer LUNs as namespaces to remote hosts.
+LUNs can be configured via
+.Xr ctladm 8 .
+.Pp
+Associations between the local controller and remote hosts are managed
+using both the
+.Xr nvmfd 8
+daemon and the
+.Xr ctladm 8
+utility.
+The
+.Xr nvmfd 8
+daemon listens for new associations and handles transport-specific
+negotiation before handing off connected queue pairs to
+.Nm ,
+which associates queue pairs with a suitable controller instance.
+The
+.Cm nvlist
+.Xr ctladm 8
+command lists active controllers.
+The
+.Cm nvterminate
+command terminates one or more associations between a local controller
+and a remote host.
+.Pp
+Associations require a supported transport such as
+.Xr nvmf_tcp 4
+for associations using TCP/IP.
+.Sh SEE ALSO
+.Xr ctl 4 ,
+.Xr nvmf 4 ,
+.Xr nvmf_tcp 4 ,
+.Xr ctladm 8 ,
+.Xr nvmfd 8
+.Sh HISTORY
+The
+.Nm
+module first appeared in
+.Fx 15.0 .
+.Sh AUTHORS
+The
+.Nm
+subsystem was developed by
+.An John Baldwin Aq Mt jhb@FreeBSD.org
+under sponsorship from Chelsio Communications, Inc.
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index ffb4b43f4efc..5819eeb57b2d 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -1677,6 +1677,7 @@ device mrsas # LSI/Avago MegaRAID SAS/SATA, 6Gb/s and 12Gb/s
#
# nvme: PCI-express NVM Express host controllers
# nvmf: NVM Express over Fabrics host
+# nvmft: NVM Express over Fabrics CAM Target Layer frontend
# nvmf_tcp: TCP transport for NVM Express over Fabrics
# nda: CAM NVMe disk driver
# nvd: non-CAM NVMe disk driver
@@ -1684,6 +1685,7 @@ device mrsas # LSI/Avago MegaRAID SAS/SATA, 6Gb/s and 12Gb/s
device nvme # PCI-express NVMe host driver
options NVME_USE_NVD=1 # Use nvd(4) instead of the CAM nda(4) driver
device nvmf # NVMeoF host driver
+device nvmft # NVMeoF ctl(4) frontend
device nvmf_tcp # NVMeoF TCP transport
device nda # NVMe direct access devices (aka disks)
device nvd # expose NVMe namespaces as disks, depends on nvme
diff --git a/sys/conf/files b/sys/conf/files
index b23ec357a302..f68567aa9023 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2535,6 +2535,10 @@ dev/nvme/nvme_test.c optional nvme
dev/nvme/nvme_util.c optional nvme
dev/nvmem/nvmem.c optional nvmem fdt
dev/nvmem/nvmem_if.m optional nvmem
+dev/nvmf/controller/ctl_frontend_nvmf.c optional nvmft
+dev/nvmf/controller/nvmft_controller.c optional nvmft
+dev/nvmf/controller/nvmft_subr.c optional nvmft
+dev/nvmf/controller/nvmft_qpair.c optional nvmft
dev/nvmf/host/nvmf.c optional nvmf
dev/nvmf/host/nvmf_aer.c optional nvmf
dev/nvmf/host/nvmf_cmd.c optional nvmf
@@ -2543,7 +2547,7 @@ dev/nvmf/host/nvmf_ns.c optional nvmf
dev/nvmf/host/nvmf_qpair.c optional nvmf
dev/nvmf/host/nvmf_sim.c optional nvmf
dev/nvmf/nvmf_tcp.c optional nvmf_tcp
-dev/nvmf/nvmf_transport.c optional nvmf
+dev/nvmf/nvmf_transport.c optional nvmf | nvmft
dev/oce/oce_hw.c optional oce pci
dev/oce/oce_if.c optional oce pci
dev/oce/oce_mbox.c optional oce pci
diff --git a/sys/dev/nvmf/controller/ctl_frontend_nvmf.c b/sys/dev/nvmf/controller/ctl_frontend_nvmf.c
new file mode 100644
index 000000000000..a203bb1c90a6
--- /dev/null
+++ b/sys/dev/nvmf/controller/ctl_frontend_nvmf.c
@@ -0,0 +1,1123 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/dnv.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/memdesc.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/refcount.h>
+#include <sys/sbuf.h>
+#include <sys/sx.h>
+
+#include <machine/bus.h>
+#include <machine/bus_dma.h>
+
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/controller/nvmft_subr.h>
+#include <dev/nvmf/controller/nvmft_var.h>
+
+#include <cam/ctl/ctl.h>
+#include <cam/ctl/ctl_error.h>
+#include <cam/ctl/ctl_io.h>
+#include <cam/ctl/ctl_frontend.h>
+
+/*
+ * Store pointers to the capsule and qpair in the two pointer members
+ * of CTL_PRIV_FRONTEND.
+ */
+#define NVMFT_NC(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptrs[0])
+#define NVMFT_QP(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptrs[1])
+
+static void nvmft_done(union ctl_io *io);
+static int nvmft_init(void);
+static int nvmft_ioctl(struct cdev *cdev, u_long cmd, caddr_t data,
+ int flag, struct thread *td);
+static int nvmft_shutdown(void);
+
+static TAILQ_HEAD(, nvmft_port) nvmft_ports;
+static struct sx nvmft_ports_lock;
+
+MALLOC_DEFINE(M_NVMFT, "nvmft", "NVMe over Fabrics controller");
+
+static struct ctl_frontend nvmft_frontend = {
+ .name = "nvmf",
+ .init = nvmft_init,
+ .ioctl = nvmft_ioctl,
+ .fe_dump = NULL,
+ .shutdown = nvmft_shutdown,
+};
+
+static void
+nvmft_online(void *arg)
+{
+ struct nvmft_port *np = arg;
+
+ sx_xlock(&np->lock);
+ np->online = true;
+ sx_xunlock(&np->lock);
+}
+
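+/*
+ * CTL port offline callback: request an error shutdown of each active
+ * controller and wait for all associations to terminate.
+ */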
+static void
+nvmft_offline(void *arg)
+{
+ struct nvmft_port *np = arg;
+ struct nvmft_controller *ctrlr;
+
+ sx_xlock(&np->lock);
+ np->online = false;
+
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ nvmft_printf(ctrlr,
+ "shutting down due to port going offline\n");
+ nvmft_controller_error(ctrlr, NULL, ENODEV);
+ }
+
+ while (!TAILQ_EMPTY(&np->controllers))
+ sx_sleep(np, &np->lock, 0, "nvmfoff", 0);
+ sx_xunlock(&np->lock);
+}
+
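+/*
+ * CTL LUN enable callback: insert the matching namespace ID into the
+ * sorted active_ns array and notify each controller of the namespace
+ * change.
+ */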
+static int
+nvmft_lun_enable(void *arg, int lun_id)
+{
+ struct nvmft_port *np = arg;
+ struct nvmft_controller *ctrlr;
+ uint32_t *old_ns, *new_ns;
+ uint32_t nsid;
+ u_int i;
+
+ if (lun_id >= le32toh(np->cdata.nn)) {
+ printf("NVMFT: %s lun %d larger than maximum nsid %u\n",
+ np->cdata.subnqn, lun_id, le32toh(np->cdata.nn));
+ return (EOPNOTSUPP);
+ }
+ nsid = lun_id + 1;
+
+ sx_xlock(&np->lock);
+ new_ns = mallocarray(np->num_ns + 1, sizeof(*new_ns), M_NVMFT,
+ M_WAITOK);
+ for (i = 0; i < np->num_ns; i++) {
+ if (np->active_ns[i] < nsid)
+ continue;
+ if (np->active_ns[i] == nsid) {
+ sx_xunlock(&np->lock);
+ free(new_ns, M_NVMFT);
+ printf("NVMFT: %s duplicate lun %d\n",
+ np->cdata.subnqn, lun_id);
+ return (EINVAL);
+ }
+ break;
+ }
+
+ /* Copy over IDs smaller than nsid. */
+ memcpy(new_ns, np->active_ns, i * sizeof(*np->active_ns));
+
+ /* Insert nsid. */
+ new_ns[i] = nsid;
+
+ /* Copy over IDs greater than nsid. */
+ memcpy(new_ns + i + 1, np->active_ns + i, (np->num_ns - i) *
+ sizeof(*np->active_ns));
+
+ np->num_ns++;
+ old_ns = np->active_ns;
+ np->active_ns = new_ns;
+
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ nvmft_controller_lun_changed(ctrlr, lun_id);
+ }
+
+ sx_xunlock(&np->lock);
+ free(old_ns, M_NVMFT);
+
+ return (0);
+}
+
+static int
+nvmft_lun_disable(void *arg, int lun_id)
+{
+ struct nvmft_port *np = arg;
+ struct nvmft_controller *ctrlr;
+ uint32_t nsid;
+ u_int i;
+
+ if (lun_id >= le32toh(np->cdata.nn))
+ return (0);
+ nsid = lun_id + 1;
+
+ sx_xlock(&np->lock);
+ for (i = 0; i < np->num_ns; i++) {
+ if (np->active_ns[i] == nsid)
+ goto found;
+ }
+ sx_xunlock(&np->lock);
+ printf("NVMFT: %s request to disable nonexistent lun %d\n",
+ np->cdata.subnqn, lun_id);
+ return (EINVAL);
+
+found:
+ /* Move down IDs greater than nsid. */
+ memmove(np->active_ns + i, np->active_ns + i + 1,
+ (np->num_ns - (i + 1)) * sizeof(*np->active_ns));
+ np->num_ns--;
+
+ /* NB: Don't bother freeing the old active_ns array. */
+
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ nvmft_controller_lun_changed(ctrlr, lun_id);
+ }
+
+ sx_xunlock(&np->lock);
+
+ return (0);
+}
+
+void
+nvmft_populate_active_nslist(struct nvmft_port *np, uint32_t nsid,
+ struct nvme_ns_list *nslist)
+{
+ u_int i, count;
+
+ sx_slock(&np->lock);
+ count = 0;
+ for (i = 0; i < np->num_ns; i++) {
+ if (np->active_ns[i] <= nsid)
+ continue;
+ nslist->ns[count] = htole32(np->active_ns[i]);
+ count++;
+ if (count == nitems(nslist->ns))
+ break;
+ }
+ sx_sunlock(&np->lock);
+}
+
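+/*
+ * Translate a command capsule into a CTL I/O request and dispatch it
+ * to the backend. NSID N maps to CTL LUN N - 1; an NSID of zero is
+ * rejected.
+ */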
+void
+nvmft_dispatch_command(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
+ bool admin)
+{
+ struct nvmft_controller *ctrlr = nvmft_qpair_ctrlr(qp);
+ const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
+ struct nvmft_port *np = ctrlr->np;
+ union ctl_io *io;
+ int error;
+
+ if (cmd->nsid == htole32(0)) {
+ nvmft_send_generic_error(qp, nc,
+ NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ mtx_lock(&ctrlr->lock);
+ if (ctrlr->pending_commands == 0)
+ ctrlr->start_busy = sbinuptime();
+ ctrlr->pending_commands++;
+ mtx_unlock(&ctrlr->lock);
+ io = ctl_alloc_io(np->port.ctl_pool_ref);
+ ctl_zero_io(io);
+ NVMFT_NC(io) = nc;
+ NVMFT_QP(io) = qp;
+ io->io_hdr.io_type = admin ? CTL_IO_NVME_ADMIN : CTL_IO_NVME;
+ io->io_hdr.nexus.initid = ctrlr->cntlid;
+ io->io_hdr.nexus.targ_port = np->port.targ_port;
+ io->io_hdr.nexus.targ_lun = le32toh(cmd->nsid) - 1;
+ io->nvmeio.cmd = *cmd;
+ error = ctl_run(io);
+ if (error != 0) {
+ nvmft_printf(ctrlr, "ctl_run failed for command on %s: %d\n",
+ nvmft_qpair_name(qp), error);
+ ctl_nvme_set_generic_error(&io->nvmeio,
+ NVME_SC_INTERNAL_DEVICE_ERROR);
+ nvmft_done(io);
+
+ nvmft_controller_error(ctrlr, qp, ENXIO);
+ }
+}
+
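+/*
+ * Abort any outstanding CTL commands for this controller by queueing
+ * an I_T nexus reset task.
+ */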
+void
+nvmft_terminate_commands(struct nvmft_controller *ctrlr)
+{
+ struct nvmft_port *np = ctrlr->np;
+ union ctl_io *io;
+ int error;
+
+ mtx_lock(&ctrlr->lock);
+ if (ctrlr->pending_commands == 0)
+ ctrlr->start_busy = sbinuptime();
+ ctrlr->pending_commands++;
+ mtx_unlock(&ctrlr->lock);
+ io = ctl_alloc_io(np->port.ctl_pool_ref);
+ ctl_zero_io(io);
+ NVMFT_QP(io) = ctrlr->admin;
+ io->io_hdr.io_type = CTL_IO_TASK;
+ io->io_hdr.nexus.initid = ctrlr->cntlid;
+ io->io_hdr.nexus.targ_port = np->port.targ_port;
+ io->io_hdr.nexus.targ_lun = 0;
+ io->taskio.tag_type = CTL_TAG_SIMPLE; /* XXX: unused? */
+ io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET;
+ error = ctl_run(io);
+ if (error != CTL_RETVAL_COMPLETE) {
+ nvmft_printf(ctrlr, "failed to terminate tasks: %d\n", error);
+#ifdef INVARIANTS
+ io->io_hdr.status = CTL_SUCCESS;
+#endif
+ nvmft_done(io);
+ }
+}
+
+static void
+nvmft_datamove_out_cb(void *arg, size_t xfered, int error)
+{
+ struct ctl_nvmeio *ctnio = arg;
+
+ if (error != 0) {
+ ctl_nvme_set_data_transfer_error(ctnio);
+ } else {
+ MPASS(xfered == ctnio->kern_data_len);
+ ctnio->kern_data_resid -= xfered;
+ }
+
+ if (ctnio->kern_sg_entries) {
+ free(ctnio->ext_data_ptr, M_NVMFT);
+ ctnio->ext_data_ptr = NULL;
+ } else
+ MPASS(ctnio->ext_data_ptr == NULL);
+ ctl_datamove_done((union ctl_io *)ctnio, false);
+}
+
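+/*
+ * Datamove for a host-to-controller (write) transfer: request the
+ * capsule data from the transport into the CTL buffers.
+ */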
+static void
+nvmft_datamove_out(struct ctl_nvmeio *ctnio, struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc)
+{
+ struct memdesc mem;
+ int error;
+
+ MPASS(ctnio->ext_data_ptr == NULL);
+ if (ctnio->kern_sg_entries > 0) {
+ struct ctl_sg_entry *sgl;
+ struct bus_dma_segment *vlist;
+
+ vlist = mallocarray(ctnio->kern_sg_entries, sizeof(*vlist),
+ M_NVMFT, M_WAITOK);
+ ctnio->ext_data_ptr = (void *)vlist;
+ sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr;
+ for (u_int i = 0; i < ctnio->kern_sg_entries; i++) {
+ vlist[i].ds_addr = (uintptr_t)sgl[i].addr;
+ vlist[i].ds_len = sgl[i].len;
+ }
+ mem = memdesc_vlist(vlist, ctnio->kern_sg_entries);
+ } else
+ mem = memdesc_vaddr(ctnio->kern_data_ptr, ctnio->kern_data_len);
+
+ error = nvmf_receive_controller_data(nc, ctnio->kern_rel_offset, &mem,
+ ctnio->kern_data_len, nvmft_datamove_out_cb, ctnio);
+ if (error == 0)
+ return;
+
+ nvmft_printf(nvmft_qpair_ctrlr(qp),
+ "Failed to request capsule data: %d\n", error);
+ ctl_nvme_set_data_transfer_error(ctnio);
+
+ if (ctnio->kern_sg_entries) {
+ free(ctnio->ext_data_ptr, M_NVMFT);
+ ctnio->ext_data_ptr = NULL;
+ } else
+ MPASS(ctnio->ext_data_ptr == NULL);
+ ctl_datamove_done((union ctl_io *)ctnio, true);
+}
+
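+/*
+ * Copy the CTL data buffers (a single virtual address or a
+ * scatter/gather list) into a newly allocated mbuf chain.
+ */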
+static struct mbuf *
+nvmft_copy_data(struct ctl_nvmeio *ctnio)
+{
+ struct ctl_sg_entry *sgl;
+ struct mbuf *m0, *m;
+ uint32_t resid, off, todo;
+ int mlen;
+
+ MPASS(ctnio->kern_data_len != 0);
+
+ m0 = m_getm2(NULL, ctnio->kern_data_len, M_WAITOK, MT_DATA, 0);
+
+ if (ctnio->kern_sg_entries == 0) {
+ m_copyback(m0, 0, ctnio->kern_data_len, ctnio->kern_data_ptr);
+ return (m0);
+ }
+
+ resid = ctnio->kern_data_len;
+ sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr;
+ off = 0;
+ m = m0;
+ mlen = M_TRAILINGSPACE(m);
+ for (;;) {
+ todo = MIN(mlen, sgl->len - off);
+ memcpy(mtod(m, char *) + m->m_len, (char *)sgl->addr + off,
+ todo);
+ m->m_len += todo;
+ resid -= todo;
+ if (resid == 0) {
+ MPASS(m->m_next == NULL);
+ break;
+ }
+
+ off += todo;
+ if (off == sgl->len) {
+ sgl++;
+ off = 0;
+ }
+ mlen -= todo;
+ if (mlen == 0) {
+ m = m->m_next;
+ mlen = M_TRAILINGSPACE(m);
+ }
+ }
+
+ return (m0);
+}
+
+static void
+m_free_ref_data(struct mbuf *m)
+{
+ ctl_ref kern_data_ref = m->m_ext.ext_arg1;
+
+ kern_data_ref(m->m_ext.ext_arg2, -1);
+}
+
+static struct mbuf *
+m_get_ref_data(struct ctl_nvmeio *ctnio, void *buf, u_int size)
+{
+ struct mbuf *m;
+
+ m = m_get(M_WAITOK, MT_DATA);
+ m_extadd(m, buf, size, m_free_ref_data, ctnio->kern_data_ref,
+ ctnio->kern_data_arg, M_RDONLY, EXT_CTL);
+ m->m_len = size;
+ ctnio->kern_data_ref(ctnio->kern_data_arg, 1);
+ return (m);
+}
+
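+/*
+ * Build an mbuf chain whose external buffers reference the CTL data
+ * buffers directly, taking a kern_data_ref reference instead of
+ * copying.
+ */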
+static struct mbuf *
+nvmft_ref_data(struct ctl_nvmeio *ctnio)
+{
+ struct ctl_sg_entry *sgl;
+ struct mbuf *m0, *m;
+
+ MPASS(ctnio->kern_data_len != 0);
+
+ if (ctnio->kern_sg_entries == 0)
+ return (m_get_ref_data(ctnio, ctnio->kern_data_ptr,
+ ctnio->kern_data_len));
+
+ sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr;
+ m0 = m_get_ref_data(ctnio, sgl[0].addr, sgl[0].len);
+ m = m0;
+ for (u_int i = 1; i < ctnio->kern_sg_entries; i++) {
+ m->m_next = m_get_ref_data(ctnio, sgl[i].addr, sgl[i].len);
+ m = m->m_next;
+ }
+ return (m0);
+}
+
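+/*
+ * Datamove for a controller-to-host (read) transfer: send the CTL
+ * data buffers to the host as capsule data.
+ */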
+static void
+nvmft_datamove_in(struct ctl_nvmeio *ctnio, struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc)
+{
+ struct mbuf *m;
+ u_int status;
+
+ if (ctnio->kern_data_ref != NULL)
+ m = nvmft_ref_data(ctnio);
+ else
+ m = nvmft_copy_data(ctnio);
+ status = nvmf_send_controller_data(nc, ctnio->kern_rel_offset, m,
+ ctnio->kern_data_len);
+ switch (status) {
+ case NVMF_SUCCESS_SENT:
+ ctnio->success_sent = true;
+ nvmft_command_completed(qp, nc);
+ /* FALLTHROUGH */
+ case NVMF_MORE:
+ case NVME_SC_SUCCESS:
+ break;
+ default:
+ ctl_nvme_set_generic_error(ctnio, status);
+ break;
+ }
+ ctl_datamove_done((union ctl_io *)ctnio, true);
+}
+
+static void
+nvmft_datamove(union ctl_io *io)
+{
+ struct nvmf_capsule *nc;
+ struct nvmft_qpair *qp;
+
+ /* Some CTL commands preemptively set a success status. */
+ MPASS(io->io_hdr.status == CTL_STATUS_NONE ||
+ io->io_hdr.status == CTL_SUCCESS);
+ MPASS(!io->nvmeio.success_sent);
+
+ nc = NVMFT_NC(io);
+ qp = NVMFT_QP(io);
+
+ if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN)
+ nvmft_datamove_in(&io->nvmeio, qp, nc);
+ else
+ nvmft_datamove_out(&io->nvmeio, qp, nc);
+}
+
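+/*
+ * Add to a 128-bit little-endian counter (such as the Data Units Read
+ * and Data Units Written fields of the Health Information log page),
+ * carrying into the high 64-bit word on overflow of the low word.
+ */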
+static void
+hip_add(uint64_t pair[2], uint64_t addend)
+{
+ uint64_t old, new;
+
+ old = le64toh(pair[0]);
+ new = old + addend;
+ pair[0] = htole64(new);
+ if (new < old)
+ pair[1] = htole64(le64toh(pair[1]) + 1);
+}
+
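+/*
+ * CTL completion callback: update Health Information counters, send
+ * the NVMe completion unless a success status was already sent with
+ * the data, and release the I/O and capsule.
+ */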
+static void
+nvmft_done(union ctl_io *io)
+{
+ struct nvmft_controller *ctrlr;
+ const struct nvme_command *cmd;
+ struct nvmft_qpair *qp;
+ struct nvmf_capsule *nc;
+ size_t len;
+
+ KASSERT(io->io_hdr.status == CTL_SUCCESS ||
+ io->io_hdr.status == CTL_NVME_ERROR,
+ ("%s: bad status %u", __func__, io->io_hdr.status));
+
+ nc = NVMFT_NC(io);
+ qp = NVMFT_QP(io);
+ ctrlr = nvmft_qpair_ctrlr(qp);
+
+ if (nc == NULL) {
+ /* Completion of nvmft_terminate_commands. */
+ goto end;
+ }
+
+ cmd = nvmf_capsule_sqe(nc);
+
+ if (io->io_hdr.status == CTL_SUCCESS)
+ len = nvmf_capsule_data_len(nc) / 512;
+ else
+ len = 0;
+ switch (cmd->opc) {
+ case NVME_OPC_WRITE:
+ mtx_lock(&ctrlr->lock);
+ hip_add(ctrlr->hip.host_write_commands, 1);
+ len += ctrlr->partial_duw;
+ if (len > 1000)
+ hip_add(ctrlr->hip.data_units_written, len / 1000);
+ ctrlr->partial_duw = len % 1000;
+ mtx_unlock(&ctrlr->lock);
+ break;
+ case NVME_OPC_READ:
+ case NVME_OPC_COMPARE:
+ case NVME_OPC_VERIFY:
+ mtx_lock(&ctrlr->lock);
+ if (cmd->opc != NVME_OPC_VERIFY)
+ hip_add(ctrlr->hip.host_read_commands, 1);
+ len += ctrlr->partial_dur;
+ if (len > 1000)
+ hip_add(ctrlr->hip.data_units_read, len / 1000);
+ ctrlr->partial_dur = len % 1000;
+ mtx_unlock(&ctrlr->lock);
+ break;
+ }
+
+ if (io->nvmeio.success_sent) {
+ MPASS(io->io_hdr.status == CTL_SUCCESS);
+ } else {
+ io->nvmeio.cpl.cid = cmd->cid;
+ nvmft_send_response(qp, &io->nvmeio.cpl);
+ }
+ nvmf_free_capsule(nc);
+end:
+ ctl_free_io(io);
+ mtx_lock(&ctrlr->lock);
+ ctrlr->pending_commands--;
+ if (ctrlr->pending_commands == 0)
+ ctrlr->busy_total += sbinuptime() - ctrlr->start_busy;
+ mtx_unlock(&ctrlr->lock);
+}
+
+static int
+nvmft_init(void)
+{
+ TAILQ_INIT(&nvmft_ports);
+ sx_init(&nvmft_ports_lock, "nvmft ports");
+ return (0);
+}
+
+void
+nvmft_port_free(struct nvmft_port *np)
+{
+ KASSERT(TAILQ_EMPTY(&np->controllers),
+ ("%s(%p): active controllers", __func__, np));
+
+ if (np->port.targ_port != -1) {
+ if (ctl_port_deregister(&np->port) != 0)
+ printf("%s: ctl_port_deregister() failed\n", __func__);
+ }
+
+ free(np->active_ns, M_NVMFT);
+ clean_unrhdr(np->ids);
+ delete_unrhdr(np->ids);
+ sx_destroy(&np->lock);
+ free(np, M_NVMFT);
+}
+
+static struct nvmft_port *
+nvmft_port_find(const char *subnqn)
+{
+ struct nvmft_port *np;
+
+ KASSERT(nvmf_nqn_valid(subnqn), ("%s: invalid nqn", __func__));
+
+ sx_assert(&nvmft_ports_lock, SA_LOCKED);
+ TAILQ_FOREACH(np, &nvmft_ports, link) {
+ if (strcmp(np->cdata.subnqn, subnqn) == 0)
+ break;
+ }
+ return (np);
+}
+
+static struct nvmft_port *
+nvmft_port_find_by_id(int port_id)
+{
+ struct nvmft_port *np;
+
+ sx_assert(&nvmft_ports_lock, SA_LOCKED);
+ TAILQ_FOREACH(np, &nvmft_ports, link) {
+ if (np->port.targ_port == port_id)
+ break;
+ }
+ return (np);
+}
+
+/*
+ * Helper function to fetch a number stored as a string in an nv_list.
+ * Returns false if the string was not a valid number.
+ */
+static bool
+dnvlist_get_strnum(nvlist_t *nvl, const char *name, u_long default_value,
+ u_long *value)
+{
+ const char *str;
+ char *cp;
+
+ str = dnvlist_get_string(nvl, name, NULL);
+ if (str == NULL) {
+ *value = default_value;
+ return (true);
+ }
+ if (*str == '\0')
+ return (false);
+ *value = strtoul(str, &cp, 0);
+ if (*cp != '\0')
+ return (false);
+ return (true);
+}
+
+/*
+ * NVMeoF ports support the following parameters:
+ *
+ * Mandatory:
+ *
+ * subnqn: subsystem NVMe Qualified Name
+ * portid: integer port ID from Discovery Log Page entry
+ *
+ * Optional:
+ * serial: Serial Number string
+ * max_io_qsize: Maximum number of I/O queue entries
+ * enable_timeout: Timeout for controller enable in milliseconds
+ * ioccsz: Maximum command capsule size
+ * iorcsz: Maximum response capsule size
+ * nn: Number of namespaces
+ */
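+/*
+ * For example (an editorial sketch, not part of this commit), a port
+ * might be created from userland with something like:
+ *
+ *	ctladm port -c -d "nvmf" -O subnqn=nqn.2014-08.org.freebsd:example \
+ *	    -O portid=1
+ *
+ * where the -O options populate the request nvlist parsed below.
+ */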
+static void
+nvmft_port_create(struct ctl_req *req)
+{
+ struct nvmft_port *np;
+ struct ctl_port *port;
+ const char *serial, *subnqn;
+ char serial_buf[NVME_SERIAL_NUMBER_LENGTH];
+ u_long enable_timeout, hostid, ioccsz, iorcsz, max_io_qsize, nn, portid;
+ int error;
+
+ /* Required parameters. */
+ subnqn = dnvlist_get_string(req->args_nvl, "subnqn", NULL);
+ if (subnqn == NULL || !nvlist_exists_string(req->args_nvl, "portid")) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Missing required argument");
+ return;
+ }
+ if (!nvmf_nqn_valid(subnqn)) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid SubNQN");
+ return;
+ }
+ if (!dnvlist_get_strnum(req->args_nvl, "portid", UINT16_MAX, &portid) ||
+ portid > UINT16_MAX) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid port ID");
+ return;
+ }
+
+ /* Optional parameters. */
+ if (!dnvlist_get_strnum(req->args_nvl, "max_io_qsize",
+ NVMF_MAX_IO_ENTRIES, &max_io_qsize) ||
+ max_io_qsize < NVME_MIN_IO_ENTRIES ||
+ max_io_qsize > NVME_MAX_IO_ENTRIES) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid maximum I/O queue size");
+ return;
+ }
+
+ if (!dnvlist_get_strnum(req->args_nvl, "enable_timeout",
+ NVMF_CC_EN_TIMEOUT * 500, &enable_timeout) ||
+ (enable_timeout % 500) != 0 || (enable_timeout / 500) > 255) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid enable timeout");
+ return;
+ }
+
+ if (!dnvlist_get_strnum(req->args_nvl, "ioccsz", NVMF_IOCCSZ,
+ &ioccsz) || ioccsz < sizeof(struct nvme_command) ||
+ (ioccsz % 16) != 0) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid Command Capsule size");
+ return;
+ }
+
+ if (!dnvlist_get_strnum(req->args_nvl, "iorcsz", NVMF_IORCSZ,
+ &iorcsz) || iorcsz < sizeof(struct nvme_completion) ||
+ (iorcsz % 16) != 0) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid Response Capsule size");
+ return;
+ }
+
+ if (!dnvlist_get_strnum(req->args_nvl, "nn", NVMF_NN, &nn) ||
+ nn < 1 || nn > UINT32_MAX) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid number of namespaces");
+ return;
+ }
+
+ serial = dnvlist_get_string(req->args_nvl, "serial", NULL);
+ if (serial == NULL) {
+ getcredhostid(curthread->td_ucred, &hostid);
+ nvmf_controller_serial(serial_buf, sizeof(serial_buf), hostid);
+ serial = serial_buf;
+ }
+
+ sx_xlock(&nvmft_ports_lock);
+
+ np = nvmft_port_find(subnqn);
+ if (np != NULL) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "SubNQN \"%s\" already exists", subnqn);
+ sx_xunlock(&nvmft_ports_lock);
+ return;
+ }
+
+ np = malloc(sizeof(*np), M_NVMFT, M_WAITOK | M_ZERO);
+ refcount_init(&np->refs, 1);
+ np->max_io_qsize = max_io_qsize;
+ np->cap = _nvmf_controller_cap(max_io_qsize, enable_timeout / 500);
+ sx_init(&np->lock, "nvmft port");
+ np->ids = new_unrhdr(0, MIN(CTL_MAX_INIT_PER_PORT - 1,
+ NVMF_CNTLID_STATIC_MAX), UNR_NO_MTX);
+ TAILQ_INIT(&np->controllers);
+
+ /* The controller ID is set later for individual controllers. */
+ _nvmf_init_io_controller_data(0, max_io_qsize, serial, ostype,
+ osrelease, subnqn, nn, ioccsz, iorcsz, &np->cdata);
+ np->cdata.aerl = NVMFT_NUM_AER - 1;
+ np->cdata.oaes = htole32(NVME_ASYNC_EVENT_NS_ATTRIBUTE);
+ np->cdata.oncs = htole16(NVMEF(NVME_CTRLR_DATA_ONCS_VERIFY, 1) |
+ NVMEF(NVME_CTRLR_DATA_ONCS_WRZERO, 1) |
+ NVMEF(NVME_CTRLR_DATA_ONCS_DSM, 1) |
+ NVMEF(NVME_CTRLR_DATA_ONCS_COMPARE, 1));
+ np->cdata.fuses = NVMEF(NVME_CTRLR_DATA_FUSES_CNW, 1);
+
+ np->fp.afi = NVMEF(NVME_FIRMWARE_PAGE_AFI_SLOT, 1);
+ memcpy(np->fp.revision[0], np->cdata.fr, sizeof(np->cdata.fr));
+
+ port = &np->port;
+
+ port->frontend = &nvmft_frontend;
+ port->port_type = CTL_PORT_NVMF;
+ port->num_requested_ctl_io = max_io_qsize;
+ port->port_name = "nvmf";
+ port->physical_port = portid;
+ port->virtual_port = 0;
+ port->port_online = nvmft_online;
+ port->port_offline = nvmft_offline;
+ port->onoff_arg = np;
+ port->lun_enable = nvmft_lun_enable;
+ port->lun_disable = nvmft_lun_disable;
+ port->targ_lun_arg = np;
+ port->fe_datamove = nvmft_datamove;
+ port->fe_done = nvmft_done;
+ port->targ_port = -1;
+ port->options = nvlist_clone(req->args_nvl);
+
+ error = ctl_port_register(port);
+ if (error != 0) {
+ sx_xunlock(&nvmft_ports_lock);
+ nvlist_destroy(port->options);
+ nvmft_port_rele(np);
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Failed to register CTL port with error %d", error);
+ return;
+ }
+
+ TAILQ_INSERT_TAIL(&nvmft_ports, np, link);
+ sx_xunlock(&nvmft_ports_lock);
+
+ req->status = CTL_LUN_OK;
+ req->result_nvl = nvlist_create(0);
+ nvlist_add_number(req->result_nvl, "port_id", port->targ_port);
+}
+
+static void
+nvmft_port_remove(struct ctl_req *req)
+{
+ struct nvmft_port *np;
+ const char *subnqn;
+ u_long port_id;
+
+ /*
+ * ctladm port -r just provides the port_id, so permit looking
+ * up a port either by "subnqn" or "port_id".
+ */
+ port_id = ULONG_MAX;
+ subnqn = dnvlist_get_string(req->args_nvl, "subnqn", NULL);
+ if (subnqn == NULL) {
+ if (!nvlist_exists_string(req->args_nvl, "port_id")) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Missing required argument");
+ return;
+ }
+ if (!dnvlist_get_strnum(req->args_nvl, "port_id", ULONG_MAX,
+ &port_id)) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid CTL port ID");
+ return;
+ }
+ } else {
+ if (nvlist_exists_string(req->args_nvl, "port_id")) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Ambiguous port removal request");
+ return;
+ }
+ }
+
+ sx_xlock(&nvmft_ports_lock);
+
+ if (subnqn != NULL) {
+ np = nvmft_port_find(subnqn);
+ if (np == NULL) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "SubNQN \"%s\" does not exist", subnqn);
+ sx_xunlock(&nvmft_ports_lock);
+ return;
+ }
+ } else {
+ np = nvmft_port_find_by_id(port_id);
+ if (np == NULL) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "CTL port %lu is not a NVMF port", port_id);
+ sx_xunlock(&nvmft_ports_lock);
+ return;
+ }
+ }
+
+ TAILQ_REMOVE(&nvmft_ports, np, link);
+ sx_xunlock(&nvmft_ports_lock);
+
+ ctl_port_offline(&np->port);
+ nvmft_port_rele(np);
+ req->status = CTL_LUN_OK;
+}
+
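+/*
+ * Handle a CTL_NVMF_HANDOFF request: copy in the CONNECT command and
+ * data from userland, look up the port by SubNQN, and hand the queue
+ * pair to a new controller (admin queues) or an existing one (I/O
+ * queues).
+ */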
+static void
+nvmft_handoff(struct ctl_nvmf *cn)
+{
+ struct nvmf_fabric_connect_cmd cmd;
+ struct nvmf_handoff_controller_qpair *handoff;
+ struct nvmf_fabric_connect_data *data;
+ struct nvmft_port *np;
+ int error;
+
+ np = NULL;
+ data = NULL;
+ handoff = &cn->data.handoff;
+ error = copyin(handoff->cmd, &cmd, sizeof(cmd));
+ if (error != 0) {
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Failed to copyin CONNECT SQE");
+ return;
+ }
+
+ data = malloc(sizeof(*data), M_NVMFT, M_WAITOK);
+ error = copyin(handoff->data, data, sizeof(*data));
+ if (error != 0) {
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Failed to copyin CONNECT data");
+ goto out;
+ }
+
+ if (!nvmf_nqn_valid(data->subnqn)) {
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Invalid SubNQN");
+ goto out;
+ }
+
+ sx_slock(&nvmft_ports_lock);
+ np = nvmft_port_find(data->subnqn);
+ if (np == NULL) {
+ sx_sunlock(&nvmft_ports_lock);
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Unknown SubNQN");
+ goto out;
+ }
+ if (!np->online) {
+ sx_sunlock(&nvmft_ports_lock);
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "CTL port offline");
+ np = NULL;
+ goto out;
+ }
+ nvmft_port_ref(np);
+ sx_sunlock(&nvmft_ports_lock);
+
+ if (handoff->params.admin) {
+ error = nvmft_handoff_admin_queue(np, handoff, &cmd, data);
+ if (error != 0) {
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Failed to handoff admin queue: %d", error);
+ goto out;
+ }
+ } else {
+ error = nvmft_handoff_io_queue(np, handoff, &cmd, data);
+ if (error != 0) {
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Failed to handoff admin queue: %d", error);
+ goto out;
+ }
+ }
+
+ cn->status = CTL_NVMF_OK;
+out:
+ if (np != NULL)
+ nvmft_port_rele(np);
+ free(data, M_NVMFT);
+}
+
+static void
+nvmft_list(struct ctl_nvmf *cn)
+{
+ struct ctl_nvmf_list_params *lp;
+ struct nvmft_controller *ctrlr;
+ struct nvmft_port *np;
+ struct sbuf *sb;
+ int error;
+
+ lp = &cn->data.list;
+
+ sb = sbuf_new(NULL, NULL, lp->alloc_len, SBUF_FIXEDLEN |
+ SBUF_INCLUDENUL);
+ if (sb == NULL) {
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Failed to allocate NVMeoF session list");
+ return;
+ }
+
+ sbuf_printf(sb, "<ctlnvmflist>\n");
+ sx_slock(&nvmft_ports_lock);
+ TAILQ_FOREACH(np, &nvmft_ports, link) {
+ sx_slock(&np->lock);
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ sbuf_printf(sb, "<connection id=\"%d\">"
+ "<hostnqn>%s</hostnqn>"
+ "<subnqn>%s</subnqn>"
+ "<trtype>%u</trtype>"
+ "</connection>\n",
+ ctrlr->cntlid,
+ ctrlr->hostnqn,
+ np->cdata.subnqn,
+ ctrlr->trtype);
+ }
+ sx_sunlock(&np->lock);
+ }
+ sx_sunlock(&nvmft_ports_lock);
+ sbuf_printf(sb, "</ctlnvmflist>\n");
+ if (sbuf_finish(sb) != 0) {
+ sbuf_delete(sb);
+ cn->status = CTL_NVMF_LIST_NEED_MORE_SPACE;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Out of space, %d bytes is too small", lp->alloc_len);
+ return;
+ }
+
+ error = copyout(sbuf_data(sb), lp->conn_xml, sbuf_len(sb));
+ if (error != 0) {
+ sbuf_delete(sb);
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Failed to copyout session list: %d", error);
+ return;
+ }
+ lp->fill_len = sbuf_len(sb);
+ cn->status = CTL_NVMF_OK;
+ sbuf_delete(sb);
+}
+
+static void
+nvmft_terminate(struct ctl_nvmf *cn)
+{
+ struct ctl_nvmf_terminate_params *tp;
+ struct nvmft_controller *ctrlr;
+ struct nvmft_port *np;
+ bool found, match;
+
+ tp = &cn->data.terminate;
+
+ found = false;
+ sx_slock(&nvmft_ports_lock);
+ TAILQ_FOREACH(np, &nvmft_ports, link) {
+ sx_slock(&np->lock);
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ if (tp->all != 0)
+ match = true;
+ else if (tp->cntlid != -1)
+ match = tp->cntlid == ctrlr->cntlid;
+ else if (tp->hostnqn[0] != '\0')
+ match = strncmp(tp->hostnqn, ctrlr->hostnqn,
+ sizeof(tp->hostnqn)) == 0;
+ else
+ match = false;
+ if (!match)
+ continue;
+ nvmft_printf(ctrlr,
+ "disconnecting due to administrative request\n");
+ nvmft_controller_error(ctrlr, NULL, ECONNABORTED);
+ found = true;
+ }
+ sx_sunlock(&np->lock);
+ }
+ sx_sunlock(&nvmft_ports_lock);
+
+ if (!found) {
+ cn->status = CTL_NVMF_ASSOCIATION_NOT_FOUND;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "No matching associations found");
+ return;
+ }
+ cn->status = CTL_NVMF_OK;
+}
+
+static int
+nvmft_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int flag,
+ struct thread *td)
+{
+ struct ctl_nvmf *cn;
+ struct ctl_req *req;
+
+ switch (cmd) {
+ case CTL_PORT_REQ:
+ req = (struct ctl_req *)data;
+ switch (req->reqtype) {
+ case CTL_REQ_CREATE:
+ nvmft_port_create(req);
+ break;
+ case CTL_REQ_REMOVE:
+ nvmft_port_remove(req);
+ break;
+ default:
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Unsupported request type %d", req->reqtype);
+ break;
+ }
+ return (0);
+ case CTL_NVMF:
+ cn = (struct ctl_nvmf *)data;
+ switch (cn->type) {
+ case CTL_NVMF_HANDOFF:
+ nvmft_handoff(cn);
+ break;
+ case CTL_NVMF_LIST:
+ nvmft_list(cn);
+ break;
+ case CTL_NVMF_TERMINATE:
+ nvmft_terminate(cn);
+ break;
+ default:
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Invalid NVMeoF request type %d", cn->type);
+ break;
+ }
+ return (0);
+ default:
+ return (ENOTTY);
+ }
+}
+
+static int
+nvmft_shutdown(void)
+{
+ /* TODO: Need to check for active controllers. */
+ if (!TAILQ_EMPTY(&nvmft_ports))
+ return (EBUSY);
+
+ sx_destroy(&nvmft_ports_lock);
+ return (0);
+}
+
+CTL_FRONTEND_DECLARE(nvmft, nvmft_frontend);
+MODULE_DEPEND(nvmft, nvmf_transport, 1, 1, 1);
diff --git a/sys/dev/nvmf/controller/nvmft_controller.c b/sys/dev/nvmf/controller/nvmft_controller.c
new file mode 100644
index 000000000000..f3783eac1275
--- /dev/null
+++ b/sys/dev/nvmf/controller/nvmft_controller.c
@@ -0,0 +1,1130 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/memdesc.h>
+#include <sys/mutex.h>
+#include <sys/sbuf.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/controller/nvmft_subr.h>
+#include <dev/nvmf/controller/nvmft_var.h>
+
+static void nvmft_controller_shutdown(void *arg, int pending);
+static void nvmft_controller_terminate(void *arg, int pending);
+
+int
+nvmft_printf(struct nvmft_controller *ctrlr, const char *fmt, ...)
+{
+ char buf[128];
+ struct sbuf sb;
+ va_list ap;
+ size_t retval;
+
+ sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
+ sbuf_set_drain(&sb, sbuf_printf_drain, &retval);
+
+ sbuf_printf(&sb, "nvmft%u: ", ctrlr->cntlid);
+
+ va_start(ap, fmt);
+ sbuf_vprintf(&sb, fmt, ap);
+ va_end(ap);
+
+ sbuf_finish(&sb);
+ sbuf_delete(&sb);
+
+ return (retval);
+}
+
+static struct nvmft_controller *
+nvmft_controller_alloc(struct nvmft_port *np, uint16_t cntlid,
+ const struct nvmf_fabric_connect_data *data)
+{
+ struct nvmft_controller *ctrlr;
+
+ ctrlr = malloc(sizeof(*ctrlr), M_NVMFT, M_WAITOK | M_ZERO);
+ ctrlr->cntlid = cntlid;
+ nvmft_port_ref(np);
+ TAILQ_INSERT_TAIL(&np->controllers, ctrlr, link);
+ ctrlr->np = np;
+ mtx_init(&ctrlr->lock, "nvmft controller", NULL, MTX_DEF);
+ callout_init(&ctrlr->ka_timer, 1);
+ TASK_INIT(&ctrlr->shutdown_task, 0, nvmft_controller_shutdown, ctrlr);
+ TIMEOUT_TASK_INIT(taskqueue_thread, &ctrlr->terminate_task, 0,
+ nvmft_controller_terminate, ctrlr);
+
+ ctrlr->cdata = np->cdata;
+ ctrlr->cdata.ctrlr_id = htole16(cntlid);
+ memcpy(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid));
+ memcpy(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn));
+ ctrlr->hip.power_cycles[0] = 1;
+ ctrlr->create_time = sbinuptime();
+
+ ctrlr->changed_ns = malloc(sizeof(*ctrlr->changed_ns), M_NVMFT,
+ M_WAITOK | M_ZERO);
+
+ return (ctrlr);
+}
+
+static void
+nvmft_controller_free(struct nvmft_controller *ctrlr)
+{
+ mtx_destroy(&ctrlr->lock);
+ MPASS(ctrlr->io_qpairs == NULL);
+ free(ctrlr->changed_ns, M_NVMFT);
+ free(ctrlr, M_NVMFT);
+}
+
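+/*
+ * KeepAlive timer callback: if no command traffic was observed since
+ * the previous expiration, disconnect the association; otherwise
+ * re-arm the timer.
+ */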
+static void
+nvmft_keep_alive_timer(void *arg)
+{
+ struct nvmft_controller *ctrlr = arg;
+ int traffic;
+
+ if (ctrlr->shutdown)
+ return;
+
+ traffic = atomic_readandclear_int(&ctrlr->ka_active_traffic);
+ if (traffic == 0) {
+ nvmft_printf(ctrlr,
+ "disconnecting due to KeepAlive timeout\n");
+ nvmft_controller_error(ctrlr, NULL, ETIMEDOUT);
+ return;
+ }
+
+ callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0, C_HARDCLOCK);
+}
+
+int
+nvmft_handoff_admin_queue(struct nvmft_port *np,
+ const struct nvmf_handoff_controller_qpair *handoff,
+ const struct nvmf_fabric_connect_cmd *cmd,
+ const struct nvmf_fabric_connect_data *data)
+{
+ struct nvmft_controller *ctrlr;
+ struct nvmft_qpair *qp;
+ uint32_t kato;
+ int cntlid;
+
+ if (cmd->qid != htole16(0))
+ return (EINVAL);
+
+ qp = nvmft_qpair_init(handoff->trtype, &handoff->params, 0,
+ "admin queue");
+
+ sx_xlock(&np->lock);
+ cntlid = alloc_unr(np->ids);
+ if (cntlid == -1) {
+ sx_xunlock(&np->lock);
+ printf("NVMFT: Unable to allocate controller for %.*s\n",
+ (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_error(qp, cmd, NVME_SCT_COMMAND_SPECIFIC,
+ NVMF_FABRIC_SC_INVALID_HOST);
+ nvmft_qpair_destroy(qp);
+ return (ENOMEM);
+ }
+
+#ifdef INVARIANTS
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ KASSERT(ctrlr->cntlid != cntlid,
+ ("%s: duplicate controllers with id %d", __func__, cntlid));
+ }
+#endif
+
+ ctrlr = nvmft_controller_alloc(np, cntlid, data);
+ nvmft_printf(ctrlr, "associated with %.*s\n",
+ (int)sizeof(data->hostnqn), data->hostnqn);
+ ctrlr->admin = qp;
+ ctrlr->trtype = handoff->trtype;
+
+ /*
+ * The spec requires a non-zero KeepAlive timer, but allow a
+ * zero KATO value to match Linux.
+ */
+ kato = le32toh(cmd->kato);
+ if (kato != 0) {
+ /*
+ * Round up to 1 second matching granularity
+ * advertised in cdata.
+ */
+ ctrlr->ka_sbt = mstosbt(roundup(kato, 1000));
+ callout_reset_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0,
+ nvmft_keep_alive_timer, ctrlr, C_HARDCLOCK);
+ }
+
+ nvmft_finish_accept(qp, cmd, ctrlr);
+ sx_xunlock(&np->lock);
+
+ return (0);
+}
+
+int
+nvmft_handoff_io_queue(struct nvmft_port *np,
+ const struct nvmf_handoff_controller_qpair *handoff,
+ const struct nvmf_fabric_connect_cmd *cmd,
+ const struct nvmf_fabric_connect_data *data)
+{
+ struct nvmft_controller *ctrlr;
+ struct nvmft_qpair *qp;
+ char name[16];
+ uint16_t cntlid, qid;
+
+ qid = le16toh(cmd->qid);
+ if (qid == 0)
+ return (EINVAL);
+ cntlid = le16toh(data->cntlid);
+
+ snprintf(name, sizeof(name), "I/O queue %u", qid);
+ qp = nvmft_qpair_init(handoff->trtype, &handoff->params, qid, name);
+
+ sx_slock(&np->lock);
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ if (ctrlr->cntlid == cntlid)
+ break;
+ }
+ if (ctrlr == NULL) {
+ sx_sunlock(&np->lock);
+ printf("NVMFT: Nonexistent controller %u for I/O queue %u from %.*s\n",
+ cntlid, qid, (int)sizeof(data->hostnqn),
+ data->hostnqn);
+ nvmft_connect_invalid_parameters(qp, cmd, true,
+ offsetof(struct nvmf_fabric_connect_data, cntlid));
+ nvmft_qpair_destroy(qp);
+ return (ENOENT);
+ }
+
+ if (memcmp(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid)) != 0) {
+ sx_sunlock(&np->lock);
+ nvmft_printf(ctrlr,
+ "hostid mismatch for I/O queue %u from %.*s\n", qid,
+ (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_invalid_parameters(qp, cmd, true,
+ offsetof(struct nvmf_fabric_connect_data, hostid));
+ nvmft_qpair_destroy(qp);
+ return (EINVAL);
+ }
+ if (memcmp(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn)) != 0) {
+ sx_sunlock(&np->lock);
+ nvmft_printf(ctrlr,
+ "hostnqn mismatch for I/O queue %u from %.*s\n", qid,
+ (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_invalid_parameters(qp, cmd, true,
+ offsetof(struct nvmf_fabric_connect_data, hostnqn));
+ nvmft_qpair_destroy(qp);
+ return (EINVAL);
+ }
+
+ /* XXX: Require handoff->trtype == ctrlr->trtype? */
+
+ mtx_lock(&ctrlr->lock);
+ if (ctrlr->shutdown) {
+ mtx_unlock(&ctrlr->lock);
+ sx_sunlock(&np->lock);
+ nvmft_printf(ctrlr,
+ "attempt to create I/O queue %u on disabled controller from %.*s\n",
+ qid, (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_invalid_parameters(qp, cmd, true,
+ offsetof(struct nvmf_fabric_connect_data, cntlid));
+ nvmft_qpair_destroy(qp);
+ return (EINVAL);
+ }
+ if (ctrlr->num_io_queues == 0) {
+ mtx_unlock(&ctrlr->lock);
+ sx_sunlock(&np->lock);
+ nvmft_printf(ctrlr,
+ "attempt to create I/O queue %u without enabled queues from %.*s\n",
+ qid, (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC,
+ NVME_SC_COMMAND_SEQUENCE_ERROR);
+ nvmft_qpair_destroy(qp);
+ return (EINVAL);
+ }
+ if (cmd->qid > ctrlr->num_io_queues) {
+ mtx_unlock(&ctrlr->lock);
+ sx_sunlock(&np->lock);
+ nvmft_printf(ctrlr,
+ "attempt to create invalid I/O queue %u from %.*s\n", qid,
+ (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_invalid_parameters(qp, cmd, false,
+ offsetof(struct nvmf_fabric_connect_cmd, qid));
+ nvmft_qpair_destroy(qp);
+ return (EINVAL);
+ }
+ if (ctrlr->io_qpairs[qid - 1].qp != NULL) {
+ mtx_unlock(&ctrlr->lock);
+ sx_sunlock(&np->lock);
+ nvmft_printf(ctrlr,
+ "attempt to re-create I/O queue %u from %.*s\n", qid,
+ (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC,
+ NVME_SC_COMMAND_SEQUENCE_ERROR);
+ nvmft_qpair_destroy(qp);
+ return (EINVAL);
+ }
+
+ ctrlr->io_qpairs[qid - 1].qp = qp;
+ mtx_unlock(&ctrlr->lock);
+ nvmft_finish_accept(qp, cmd, ctrlr);
+ sx_sunlock(&np->lock);
+
+ return (0);
+}
+
+static void
+nvmft_controller_shutdown(void *arg, int pending)
+{
+ struct nvmft_controller *ctrlr = arg;
+
+ MPASS(pending == 1);
+
+ /*
+ * Shutdown all I/O queues to terminate pending datamoves and
+ * stop receiving new commands.
+ */
+ mtx_lock(&ctrlr->lock);
+ for (u_int i = 0; i < ctrlr->num_io_queues; i++) {
+ if (ctrlr->io_qpairs[i].qp != NULL) {
+ ctrlr->io_qpairs[i].shutdown = true;
+ mtx_unlock(&ctrlr->lock);
+ nvmft_qpair_shutdown(ctrlr->io_qpairs[i].qp);
+ mtx_lock(&ctrlr->lock);
+ }
+ }
+ mtx_unlock(&ctrlr->lock);
+
+ /* Terminate active CTL commands. */
+ nvmft_terminate_commands(ctrlr);
+
+ /* Wait for all pending CTL commands to complete. */
+ mtx_lock(&ctrlr->lock);
+ while (ctrlr->pending_commands != 0)
+ mtx_sleep(&ctrlr->pending_commands, &ctrlr->lock, 0, "nvmftsh",
+ hz / 100);
+ mtx_unlock(&ctrlr->lock);
+
+ /* Delete all of the I/O queues. */
+ for (u_int i = 0; i < ctrlr->num_io_queues; i++) {
+ if (ctrlr->io_qpairs[i].qp != NULL)
+ nvmft_qpair_destroy(ctrlr->io_qpairs[i].qp);
+ }
+ free(ctrlr->io_qpairs, M_NVMFT);
+ ctrlr->io_qpairs = NULL;
+
+ mtx_lock(&ctrlr->lock);
+ ctrlr->num_io_queues = 0;
+
+ /* Mark shutdown complete. */
+ if (NVMEV(NVME_CSTS_REG_SHST, ctrlr->csts) == NVME_SHST_OCCURRING) {
+ ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST);
+ ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_COMPLETE);
+ }
+
+ if (NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) == 0) {
+ ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_RDY);
+ ctrlr->shutdown = false;
+ }
+ mtx_unlock(&ctrlr->lock);
+
+ /*
+ * If the admin queue was closed while shutting down or a
+ * fatal controller error has occurred, terminate the
+ * association immediately, otherwise wait up to 2 minutes
+ * (NVMe-over-Fabrics 1.1 4.6).
+ */
+ if (ctrlr->admin_closed || NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) != 0)
+ nvmft_controller_terminate(ctrlr, 0);
+ else
+ taskqueue_enqueue_timeout(taskqueue_thread,
+ &ctrlr->terminate_task, hz * 60 * 2);
+}
+
+static void
+nvmft_controller_terminate(void *arg, int pending)
+{
+ struct nvmft_controller *ctrlr = arg;
+ struct nvmft_port *np;
+ bool wakeup_np;
+
+ /* If the controller has been re-enabled, nothing to do. */
+ mtx_lock(&ctrlr->lock);
+ if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) != 0) {
+ mtx_unlock(&ctrlr->lock);
+
+ if (ctrlr->ka_sbt != 0)
+ callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0,
+ C_HARDCLOCK);
+ return;
+ }
+
+ /* Disable updates to CC while destroying admin qpair. */
+ ctrlr->shutdown = true;
+ mtx_unlock(&ctrlr->lock);
+
+ nvmft_qpair_destroy(ctrlr->admin);
+
+ /* Remove association (CNTLID). */
+ np = ctrlr->np;
+ sx_xlock(&np->lock);
+ TAILQ_REMOVE(&np->controllers, ctrlr, link);
+ free_unr(np->ids, ctrlr->cntlid);
+ wakeup_np = (!np->online && TAILQ_EMPTY(&np->controllers));
+ sx_xunlock(&np->lock);
+ if (wakeup_np)
+ wakeup(np);
+
+ callout_drain(&ctrlr->ka_timer);
+
+ nvmft_printf(ctrlr, "association terminated\n");
+ nvmft_controller_free(ctrlr);
+ nvmft_port_rele(np);
+}
+
+void
+nvmft_controller_error(struct nvmft_controller *ctrlr, struct nvmft_qpair *qp,
+ int error)
+{
+ /*
+ * If a queue pair is closed, that isn't an error per se.
+ * That just means additional commands cannot be received on
+ * that queue pair.
+ *
+ * If the admin queue pair is closed while idle or while
+ * shutting down, terminate the association immediately.
+ *
+ * If an I/O queue pair is closed, just ignore it.
+ */
+ if (error == 0) {
+ if (qp != ctrlr->admin)
+ return;
+
+ mtx_lock(&ctrlr->lock);
+ if (ctrlr->shutdown) {
+ ctrlr->admin_closed = true;
+ mtx_unlock(&ctrlr->lock);
+ return;
+ }
+
+ if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0) {
+ MPASS(ctrlr->num_io_queues == 0);
+ mtx_unlock(&ctrlr->lock);
+
+ /*
+ * Ok to drop lock here since ctrlr->cc can't
+ * change if the admin queue pair has closed.
+ * This also means no new queues can be handed
+ * off, etc. Note that since there are no I/O
+ * queues, only the admin queue needs to be
+ * destroyed, so it is safe to skip
+ * nvmft_controller_shutdown and just schedule
+ * nvmft_controller_terminate. Note that we
+ * cannot call nvmft_controller_terminate from
+ * here directly as this is called from the
+ * transport layer and freeing the admin qpair
+ * might deadlock waiting for the current
+ * thread to exit.
+ */
+ if (taskqueue_cancel_timeout(taskqueue_thread,
+ &ctrlr->terminate_task, NULL) == 0)
+ taskqueue_enqueue_timeout(taskqueue_thread,
+ &ctrlr->terminate_task, 0);
+ return;
+ }
+
+ /*
+ * Treat closing of the admin queue pair while enabled
+ * as a transport error. Note that the admin queue
+ * pair has been closed.
+ */
+ ctrlr->admin_closed = true;
+ } else
+ mtx_lock(&ctrlr->lock);
+
+ /* Ignore transport errors if we are already shutting down. */
+ if (ctrlr->shutdown) {
+ mtx_unlock(&ctrlr->lock);
+ return;
+ }
+
+ ctrlr->csts |= NVMEF(NVME_CSTS_REG_CFS, 1);
+ ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN);
+ ctrlr->shutdown = true;
+ mtx_unlock(&ctrlr->lock);
+
+ callout_stop(&ctrlr->ka_timer);
+ taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task);
+}
+
+/* Wrapper around m_getm2 that also sets m_len in the mbufs in the chain. */
+static struct mbuf *
+m_getml(size_t len, int how)
+{
+ struct mbuf *m, *n;
+
+ m = m_getm2(NULL, len, how, MT_DATA, 0);
+ if (m == NULL)
+ return (NULL);
+ for (n = m; len > 0; n = n->m_next) {
+ n->m_len = M_SIZE(n);
+ if (n->m_len >= len) {
+ n->m_len = len;
+ MPASS(n->m_next == NULL);
+ }
+ len -= n->m_len;
+ }
+ return (m);
+}
+
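+/* Zero len bytes of an mbuf chain starting at byte offset. */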
+static void
+m_zero(struct mbuf *m, u_int offset, u_int len)
+{
+ u_int todo;
+
+ if (len == 0)
+ return;
+
+ while (m->m_len <= offset) {
+ offset -= m->m_len;
+ m = m->m_next;
+ }
+
+ todo = m->m_len - offset;
+ if (todo > len)
+ todo = len;
+ memset(mtodo(m, offset), 0, todo);
+ m = m->m_next;
+ len -= todo;
+
+ while (len > 0) {
+ todo = m->m_len;
+ if (todo > len)
+ todo = len;
+ memset(mtod(m, void *), 0, todo);
+ m = m->m_next;
+ len -= todo;
+ }
+}
+
+static void
+handle_get_log_page(struct nvmft_controller *ctrlr,
+ struct nvmf_capsule *nc, const struct nvme_command *cmd)
+{
+ struct mbuf *m;
+ uint64_t offset;
+ uint32_t numd;
+ size_t len, todo;
+ u_int status;
+ uint8_t lid;
+ bool rae;
+
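+ /*
+ * Decode CDW10-13: the log page ID and Retain Asynchronous Event
+ * flag from CDW10, the 0's based dword count split across
+ * CDW10/CDW11, and the byte offset from CDW12/CDW13.
+ */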
+ lid = le32toh(cmd->cdw10) & 0xff;
+ rae = (le32toh(cmd->cdw10) & (1U << 15)) != 0;
+ numd = le32toh(cmd->cdw10) >> 16 | le32toh(cmd->cdw11) << 16;
+ offset = le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32;
+
+ if (offset % 3 != 0) {
+ status = NVME_SC_INVALID_FIELD;
+ goto done;
+ }
+
+ len = (numd + 1) * 4;
+
+ switch (lid) {
+ case NVME_LOG_ERROR:
+ todo = 0;
+
+ m = m_getml(len, M_WAITOK);
+ if (todo != len)
+ m_zero(m, todo, len - todo);
+ status = nvmf_send_controller_data(nc, 0, m, len);
+ MPASS(status != NVMF_MORE);
+ break;
+ case NVME_LOG_HEALTH_INFORMATION:
+ {
+ struct nvme_health_information_page hip;
+
+ if (offset >= sizeof(hip)) {
+ status = NVME_SC_INVALID_FIELD;
+ goto done;
+ }
+ todo = sizeof(hip) - offset;
+ if (todo > len)
+ todo = len;
+
+ mtx_lock(&ctrlr->lock);
+ hip = ctrlr->hip;
+ hip.controller_busy_time[0] =
+ sbintime_getsec(ctrlr->busy_total) / 60;
+ hip.power_on_hours[0] =
+ sbintime_getsec(sbinuptime() - ctrlr->create_time) / 3600;
+ mtx_unlock(&ctrlr->lock);
+
+ m = m_getml(len, M_WAITOK);
+ m_copyback(m, 0, todo, (char *)&hip + offset);
+ if (todo != len)
+ m_zero(m, todo, len - todo);
+ status = nvmf_send_controller_data(nc, 0, m, len);
+ MPASS(status != NVMF_MORE);
+ break;
+ }
+ case NVME_LOG_FIRMWARE_SLOT:
+ if (offset >= sizeof(ctrlr->np->fp)) {
+ status = NVME_SC_INVALID_FIELD;
+ goto done;
+ }
+ todo = sizeof(ctrlr->np->fp) - offset;
+ if (todo > len)
+ todo = len;
+
+ m = m_getml(len, M_WAITOK);
+ m_copyback(m, 0, todo, (char *)&ctrlr->np->fp + offset);
+ if (todo != len)
+ m_zero(m, todo, len - todo);
+ status = nvmf_send_controller_data(nc, 0, m, len);
+ MPASS(status != NVMF_MORE);
+ break;
+ case NVME_LOG_CHANGED_NAMESPACE:
+ if (offset >= sizeof(*ctrlr->changed_ns)) {
+ status = NVME_SC_INVALID_FIELD;
+ goto done;
+ }
+ todo = sizeof(*ctrlr->changed_ns) - offset;
+ if (todo > len)
+ todo = len;
+
+ m = m_getml(len, M_WAITOK);
+ mtx_lock(&ctrlr->lock);
+ m_copyback(m, 0, todo, (char *)ctrlr->changed_ns + offset);
+ if (offset == 0 && len == sizeof(*ctrlr->changed_ns))
+ memset(ctrlr->changed_ns, 0,
+ sizeof(*ctrlr->changed_ns));
+ if (!rae)
+ ctrlr->changed_ns_reported = false;
+ mtx_unlock(&ctrlr->lock);
+ if (todo != len)
+ m_zero(m, todo, len - todo);
+ status = nvmf_send_controller_data(nc, 0, m, len);
+ MPASS(status != NVMF_MORE);
+ break;
+ default:
+ nvmft_printf(ctrlr, "Unsupported page %#x for GET_LOG_PAGE\n",
+ lid);
+ status = NVME_SC_INVALID_FIELD;
+ break;
+ }
+
+done:
+ if (status == NVMF_SUCCESS_SENT)
+ nvmft_command_completed(ctrlr->admin, nc);
+ else
+ nvmft_send_generic_error(ctrlr->admin, nc, status);
+ nvmf_free_capsule(nc);
+}
+
+static void
+m_free_nslist(struct mbuf *m)
+{
+ free(m->m_ext.ext_arg1, M_NVMFT);
+}
+
+static void
+handle_identify_command(struct nvmft_controller *ctrlr,
+ struct nvmf_capsule *nc, const struct nvme_command *cmd)
+{
+ struct mbuf *m;
+ size_t data_len;
+ u_int status;
+ uint8_t cns;
+
+ cns = le32toh(cmd->cdw10) & 0xFF;
+ data_len = nvmf_capsule_data_len(nc);
+ if (data_len != sizeof(ctrlr->cdata)) {
+ nvmft_printf(ctrlr,
+ "Invalid length %zu for IDENTIFY with CNS %#x\n", data_len,
+ cns);
+ nvmft_send_generic_error(ctrlr->admin, nc,
+ NVME_SC_INVALID_OPCODE);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ switch (cns) {
+ case 0: /* Namespace data. */
+ case 3: /* Namespace Identification Descriptor list. */
+ nvmft_dispatch_command(ctrlr->admin, nc, true);
+ return;
+ case 1:
+ /* Controller data. */
+ m = m_getml(sizeof(ctrlr->cdata), M_WAITOK);
+ m_copyback(m, 0, sizeof(ctrlr->cdata), (void *)&ctrlr->cdata);
+ status = nvmf_send_controller_data(nc, 0, m,
+ sizeof(ctrlr->cdata));
+ MPASS(status != NVMF_MORE);
+ break;
+ case 2:
+ {
+ /* Active namespace list. */
+ struct nvme_ns_list *nslist;
+ uint32_t nsid;
+
+ nsid = le32toh(cmd->nsid);
+ if (nsid >= 0xfffffffe) {
+ status = NVME_SC_INVALID_FIELD;
+ break;
+ }
+
+ nslist = malloc(sizeof(*nslist), M_NVMFT, M_WAITOK | M_ZERO);
+ nvmft_populate_active_nslist(ctrlr->np, nsid, nslist);
+ m = m_get(M_WAITOK, MT_DATA);
+ m_extadd(m, (void *)nslist, sizeof(*nslist), m_free_nslist,
+ nslist, NULL, 0, EXT_CTL);
+ m->m_len = sizeof(*nslist);
+ status = nvmf_send_controller_data(nc, 0, m, m->m_len);
+ MPASS(status != NVMF_MORE);
+ break;
+ }
+ default:
+ nvmft_printf(ctrlr, "Unsupported CNS %#x for IDENTIFY\n", cns);
+ status = NVME_SC_INVALID_FIELD;
+ break;
+ }
+
+ if (status == NVMF_SUCCESS_SENT)
+ nvmft_command_completed(ctrlr->admin, nc);
+ else
+ nvmft_send_generic_error(ctrlr->admin, nc, status);
+ nvmf_free_capsule(nc);
+}
+
+static void
+handle_set_features(struct nvmft_controller *ctrlr,
+ struct nvmf_capsule *nc, const struct nvme_command *cmd)
+{
+ struct nvme_completion cqe;
+ uint8_t fid;
+
+ fid = NVMEV(NVME_FEAT_SET_FID, le32toh(cmd->cdw10));
+ switch (fid) {
+ case NVME_FEAT_NUMBER_OF_QUEUES:
+ {
+ uint32_t num_queues;
+ struct nvmft_io_qpair *io_qpairs;
+
+ num_queues = le32toh(cmd->cdw11) & 0xffff;
+
+ /* 5.12.1.7: 65535 is invalid. */
+ if (num_queues == 65535)
+ goto error;
+
+ /* Fabrics requires the same number of SQs and CQs. */
+ if (le32toh(cmd->cdw11) >> 16 != num_queues)
+ goto error;
+
+ /* Convert to 1's based */
+ num_queues++;
+
+ io_qpairs = mallocarray(num_queues, sizeof(*io_qpairs),
+ M_NVMFT, M_WAITOK | M_ZERO);
+
+ mtx_lock(&ctrlr->lock);
+ if (ctrlr->num_io_queues != 0) {
+ mtx_unlock(&ctrlr->lock);
+ free(io_qpairs, M_NVMFT);
+ nvmft_send_generic_error(ctrlr->admin, nc,
+ NVME_SC_COMMAND_SEQUENCE_ERROR);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ ctrlr->num_io_queues = num_queues;
+ ctrlr->io_qpairs = io_qpairs;
+ mtx_unlock(&ctrlr->lock);
+
+ nvmft_init_cqe(&cqe, nc, 0);
+ cqe.cdw0 = cmd->cdw11;
+ nvmft_send_response(ctrlr->admin, &cqe);
+ nvmf_free_capsule(nc);
+ return;
+ }
+ case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
+ {
+ uint32_t aer_mask;
+
+ aer_mask = le32toh(cmd->cdw11);
+
+ /* Check for any reserved or unimplemented feature bits. */
+ if ((aer_mask & 0xffffc000) != 0)
+ goto error;
+
+ mtx_lock(&ctrlr->lock);
+ ctrlr->aer_mask = aer_mask;
+ mtx_unlock(&ctrlr->lock);
+ nvmft_send_success(ctrlr->admin, nc);
+ return;
+ }
+ default:
+ nvmft_printf(ctrlr,
+ "Unsupported feature ID %u for SET_FEATURES\n", fid);
+ goto error;
+ }
+
+error:
+ nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
+ nvmf_free_capsule(nc);
+}
+
+static bool
+update_cc(struct nvmft_controller *ctrlr, uint32_t new_cc, bool *need_shutdown)
+{
+ struct nvmft_port *np = ctrlr->np;
+ uint32_t changes;
+
+ *need_shutdown = false;
+
+ mtx_lock(&ctrlr->lock);
+
+ /* Don't allow any changes while shutting down. */
+ if (ctrlr->shutdown) {
+ mtx_unlock(&ctrlr->lock);
+ return (false);
+ }
+
+ if (!_nvmf_validate_cc(np->max_io_qsize, np->cap, ctrlr->cc, new_cc)) {
+ mtx_unlock(&ctrlr->lock);
+ return (false);
+ }
+
+ changes = ctrlr->cc ^ new_cc;
+ ctrlr->cc = new_cc;
+
+ /* Handle shutdown requests. */
+ if (NVMEV(NVME_CC_REG_SHN, changes) != 0 &&
+ NVMEV(NVME_CC_REG_SHN, new_cc) != 0) {
+ ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST);
+ ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_OCCURRING);
+ ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN);
+ ctrlr->shutdown = true;
+ *need_shutdown = true;
+ nvmft_printf(ctrlr, "shutdown requested\n");
+ }
+
+ if (NVMEV(NVME_CC_REG_EN, changes) != 0) {
+ if (NVMEV(NVME_CC_REG_EN, new_cc) == 0) {
+ /* Controller reset. */
+ nvmft_printf(ctrlr, "reset requested\n");
+ ctrlr->shutdown = true;
+ *need_shutdown = true;
+ } else
+ ctrlr->csts |= NVMEF(NVME_CSTS_REG_RDY, 1);
+ }
+ mtx_unlock(&ctrlr->lock);
+
+ return (true);
+}
+
+static void
+handle_property_get(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc,
+ const struct nvmf_fabric_prop_get_cmd *pget)
+{
+ struct nvmf_fabric_prop_get_rsp rsp;
+
+ nvmft_init_cqe(&rsp, nc, 0);
+
+ switch (le32toh(pget->ofst)) {
+ case NVMF_PROP_CAP:
+ if (pget->attrib.size != NVMF_PROP_SIZE_8)
+ goto error;
+ rsp.value.u64 = htole64(ctrlr->np->cap);
+ break;
+ case NVMF_PROP_VS:
+ if (pget->attrib.size != NVMF_PROP_SIZE_4)
+ goto error;
+ rsp.value.u32.low = ctrlr->cdata.ver;
+ break;
+ case NVMF_PROP_CC:
+ if (pget->attrib.size != NVMF_PROP_SIZE_4)
+ goto error;
+ rsp.value.u32.low = htole32(ctrlr->cc);
+ break;
+ case NVMF_PROP_CSTS:
+ if (pget->attrib.size != NVMF_PROP_SIZE_4)
+ goto error;
+ rsp.value.u32.low = htole32(ctrlr->csts);
+ break;
+ default:
+ goto error;
+ }
+
+ nvmft_send_response(ctrlr->admin, &rsp);
+ return;
+error:
+ nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
+}
+
+static void
+handle_property_set(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc,
+ const struct nvmf_fabric_prop_set_cmd *pset)
+{
+ bool need_shutdown;
+
+ need_shutdown = false;
+ switch (le32toh(pset->ofst)) {
+ case NVMF_PROP_CC:
+ if (pset->attrib.size != NVMF_PROP_SIZE_4)
+ goto error;
+ if (!update_cc(ctrlr, le32toh(pset->value.u32.low),
+ &need_shutdown))
+ goto error;
+ break;
+ default:
+ goto error;
+ }
+
+ nvmft_send_success(ctrlr->admin, nc);
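+	/*
+	 * Tear down only after the response has been sent so that the
+	 * host sees the Property Set complete.
+	 */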
+ if (need_shutdown) {
+ callout_stop(&ctrlr->ka_timer);
+ taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task);
+ }
+ return;
+error:
+ nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
+}
+
+static void
+handle_admin_fabrics_command(struct nvmft_controller *ctrlr,
+ struct nvmf_capsule *nc, const struct nvmf_fabric_cmd *fc)
+{
+ switch (fc->fctype) {
+ case NVMF_FABRIC_COMMAND_PROPERTY_GET:
+ handle_property_get(ctrlr, nc,
+ (const struct nvmf_fabric_prop_get_cmd *)fc);
+ break;
+ case NVMF_FABRIC_COMMAND_PROPERTY_SET:
+ handle_property_set(ctrlr, nc,
+ (const struct nvmf_fabric_prop_set_cmd *)fc);
+ break;
+ case NVMF_FABRIC_COMMAND_CONNECT:
+ nvmft_printf(ctrlr,
+ "CONNECT command on connected admin queue\n");
+ nvmft_send_generic_error(ctrlr->admin, nc,
+ NVME_SC_COMMAND_SEQUENCE_ERROR);
+ break;
+ case NVMF_FABRIC_COMMAND_DISCONNECT:
+ nvmft_printf(ctrlr, "DISCONNECT command on admin queue\n");
+ nvmft_send_error(ctrlr->admin, nc, NVME_SCT_COMMAND_SPECIFIC,
+ NVMF_FABRIC_SC_INVALID_QUEUE_TYPE);
+ break;
+ default:
+ nvmft_printf(ctrlr, "Unsupported fabrics command %#x\n",
+ fc->fctype);
+ nvmft_send_generic_error(ctrlr->admin, nc,
+ NVME_SC_INVALID_OPCODE);
+ break;
+ }
+ nvmf_free_capsule(nc);
+}
+
+void
+nvmft_handle_admin_command(struct nvmft_controller *ctrlr,
+ struct nvmf_capsule *nc)
+{
+ const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
+
+	/* Only permit Fabrics commands while the controller is disabled. */
+ if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0 &&
+ cmd->opc != NVME_OPC_FABRICS_COMMANDS) {
+ nvmft_printf(ctrlr,
+		    "Unsupported admin opcode %#x while disabled\n", cmd->opc);
+ nvmft_send_generic_error(ctrlr->admin, nc,
+ NVME_SC_COMMAND_SEQUENCE_ERROR);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ atomic_store_int(&ctrlr->ka_active_traffic, 1);
+
+ switch (cmd->opc) {
+ case NVME_OPC_GET_LOG_PAGE:
+ handle_get_log_page(ctrlr, nc, cmd);
+ break;
+ case NVME_OPC_IDENTIFY:
+ handle_identify_command(ctrlr, nc, cmd);
+ break;
+ case NVME_OPC_SET_FEATURES:
+ handle_set_features(ctrlr, nc, cmd);
+ break;
+ case NVME_OPC_ASYNC_EVENT_REQUEST:
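+		/*
+		 * Pending AER commands are kept in a small ring of
+		 * CIDs: aer_pidx is the producer index and aer_cidx
+		 * the consumer index used by nvmft_report_aer().
+		 */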
+ mtx_lock(&ctrlr->lock);
+ if (ctrlr->aer_pending == NVMFT_NUM_AER) {
+ mtx_unlock(&ctrlr->lock);
+ nvmft_send_error(ctrlr->admin, nc,
+ NVME_SCT_COMMAND_SPECIFIC,
+ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
+ } else {
+ /* NB: Store the CID without byte-swapping. */
+ ctrlr->aer_cids[ctrlr->aer_pidx] = cmd->cid;
+ ctrlr->aer_pending++;
+ ctrlr->aer_pidx = (ctrlr->aer_pidx + 1) % NVMFT_NUM_AER;
+ mtx_unlock(&ctrlr->lock);
+ }
+ nvmf_free_capsule(nc);
+ break;
+ case NVME_OPC_KEEP_ALIVE:
+ nvmft_send_success(ctrlr->admin, nc);
+ nvmf_free_capsule(nc);
+ break;
+ case NVME_OPC_FABRICS_COMMANDS:
+ handle_admin_fabrics_command(ctrlr, nc,
+ (const struct nvmf_fabric_cmd *)cmd);
+ break;
+ default:
+ nvmft_printf(ctrlr, "Unsupported admin opcode %#x\n", cmd->opc);
+ nvmft_send_generic_error(ctrlr->admin, nc,
+ NVME_SC_INVALID_OPCODE);
+ nvmf_free_capsule(nc);
+ break;
+ }
+}
+
+void
+nvmft_handle_io_command(struct nvmft_qpair *qp, uint16_t qid,
+ struct nvmf_capsule *nc)
+{
+ struct nvmft_controller *ctrlr = nvmft_qpair_ctrlr(qp);
+ const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
+
+ atomic_store_int(&ctrlr->ka_active_traffic, 1);
+
+ switch (cmd->opc) {
+ case NVME_OPC_FLUSH:
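+		/* A broadcast NSID (0xffffffff) is not supported for Flush. */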
+ if (cmd->nsid == htole32(0xffffffff)) {
+ nvmft_send_generic_error(qp, nc,
+ NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
+ nvmf_free_capsule(nc);
+ break;
+ }
+ /* FALLTHROUGH */
+ case NVME_OPC_WRITE:
+ case NVME_OPC_READ:
+ case NVME_OPC_WRITE_UNCORRECTABLE:
+ case NVME_OPC_COMPARE:
+ case NVME_OPC_WRITE_ZEROES:
+ case NVME_OPC_DATASET_MANAGEMENT:
+ case NVME_OPC_VERIFY:
+ nvmft_dispatch_command(qp, nc, false);
+ break;
+ default:
+ nvmft_printf(ctrlr, "Unsupported I/O opcode %#x\n", cmd->opc);
+ nvmft_send_generic_error(qp, nc,
+ NVME_SC_INVALID_OPCODE);
+ nvmf_free_capsule(nc);
+ break;
+ }
+}
+
+static void
+nvmft_report_aer(struct nvmft_controller *ctrlr, uint32_t aer_mask,
+ u_int type, uint8_t info, uint8_t log_page_id)
+{
+ struct nvme_completion cpl;
+
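+	/* The asynchronous event type field is only 3 bits wide. */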
+ MPASS(type <= 7);
+
+ /* Drop events that are not enabled. */
+ mtx_lock(&ctrlr->lock);
+ if ((ctrlr->aer_mask & aer_mask) == 0) {
+ mtx_unlock(&ctrlr->lock);
+ return;
+ }
+
+	/*
+	 * If there is no pending AER command, drop the event.
+	 * XXX: Should we queue these?
+	 */
+ if (ctrlr->aer_pending == 0) {
+ mtx_unlock(&ctrlr->lock);
+ nvmft_printf(ctrlr,
+ "dropping AER type %u, info %#x, page %#x\n",
+ type, info, log_page_id);
+ return;
+ }
+
+ memset(&cpl, 0, sizeof(cpl));
+ cpl.cid = ctrlr->aer_cids[ctrlr->aer_cidx];
+ ctrlr->aer_pending--;
+ ctrlr->aer_cidx = (ctrlr->aer_cidx + 1) % NVMFT_NUM_AER;
+ mtx_unlock(&ctrlr->lock);
+
+ cpl.cdw0 = htole32(NVMEF(NVME_ASYNC_EVENT_TYPE, type) |
+ NVMEF(NVME_ASYNC_EVENT_INFO, info) |
+ NVMEF(NVME_ASYNC_EVENT_LOG_PAGE_ID, log_page_id));
+
+ nvmft_send_response(ctrlr->admin, &cpl);
+}
+
+void
+nvmft_controller_lun_changed(struct nvmft_controller *ctrlr, int lun_id)
+{
+ struct nvme_ns_list *nslist;
+ uint32_t new_nsid, nsid;
+ u_int i;
+
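+	/* CTL LUN IDs are 0's based; NVMe namespace IDs start at 1. */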
+ new_nsid = lun_id + 1;
+
+ mtx_lock(&ctrlr->lock);
+ nslist = ctrlr->changed_ns;
+
+ /* If the first entry is 0xffffffff, the list is already full. */
+ if (nslist->ns[0] != 0xffffffff) {
+ /* Find the insertion point for this namespace ID. */
+ for (i = 0; i < nitems(nslist->ns); i++) {
+ nsid = le32toh(nslist->ns[i]);
+ if (nsid == new_nsid) {
+ /* Already reported, nothing to do. */
+ mtx_unlock(&ctrlr->lock);
+ return;
+ }
+
+ if (nsid == 0 || nsid > new_nsid)
+ break;
+ }
+
+ if (nslist->ns[nitems(nslist->ns) - 1] != htole32(0)) {
+ /* List is full. */
+ memset(ctrlr->changed_ns, 0,
+ sizeof(*ctrlr->changed_ns));
+ ctrlr->changed_ns->ns[0] = 0xffffffff;
+ } else if (nslist->ns[i] == htole32(0)) {
+			/*
+			 * Optimize the common case where the new ID
+			 * is appended at the end of the list.
+			 */
+ nslist->ns[i] = htole32(new_nsid);
+ } else {
+ memmove(&nslist->ns[i + 1], &nslist->ns[i],
+ (nitems(nslist->ns) - i - 1) *
+ sizeof(nslist->ns[0]));
+ nslist->ns[i] = htole32(new_nsid);
+ }
+ }
+
+ if (ctrlr->changed_ns_reported) {
+ mtx_unlock(&ctrlr->lock);
+ return;
+ }
+ ctrlr->changed_ns_reported = true;
+ mtx_unlock(&ctrlr->lock);
+
+ nvmft_report_aer(ctrlr, NVME_ASYNC_EVENT_NS_ATTRIBUTE, 0x2, 0x0,
+ NVME_LOG_CHANGED_NAMESPACE);
+}
diff --git a/sys/dev/nvmf/controller/nvmft_qpair.c b/sys/dev/nvmf/controller/nvmft_qpair.c
new file mode 100644
index 000000000000..6cb3ebd76884
--- /dev/null
+++ b/sys/dev/nvmf/controller/nvmft_qpair.c
@@ -0,0 +1,361 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/types.h>
+#include <sys/_bitset.h>
+#include <sys/bitset.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/controller/nvmft_var.h>
+
+/*
+ * A bitset with one bit per command ID value, used to detect a
+ * command that reuses a CID while an earlier command with the same
+ * CID is still outstanding.  With 64K possible CIDs this costs 8KB
+ * per queue pair.
+ */
+#define NUM_CIDS (UINT16_MAX + 1)
+BITSET_DEFINE(cidset, NUM_CIDS);
+
+struct nvmft_qpair {
+ struct nvmft_controller *ctrlr;
+ struct nvmf_qpair *qp;
+ struct cidset *cids;
+
+ bool admin;
+ bool sq_flow_control;
+ uint16_t qid;
+ u_int qsize;
+ uint16_t sqhd;
+ uint16_t sqtail;
+ volatile u_int qp_refs; /* Internal references on 'qp'. */
+
+ struct mtx lock;
+
+ char name[16];
+};
+
+static int _nvmft_send_generic_error(struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc, uint8_t sc_status);
+
+static void
+nvmft_qpair_error(void *arg, int error)
+{
+ struct nvmft_qpair *qp = arg;
+ struct nvmft_controller *ctrlr = qp->ctrlr;
+
+ /*
+ * XXX: The Linux TCP initiator sends a RST immediately after
+ * the FIN, so treat ECONNRESET as plain EOF to avoid spurious
+ * errors on shutdown.
+ */
+ if (error == ECONNRESET)
+ error = 0;
+
+ if (error != 0)
+ nvmft_printf(ctrlr, "error %d on %s\n", error, qp->name);
+ nvmft_controller_error(ctrlr, qp, error);
+}
+
+static void
+nvmft_receive_capsule(void *arg, struct nvmf_capsule *nc)
+{
+ struct nvmft_qpair *qp = arg;
+ struct nvmft_controller *ctrlr = qp->ctrlr;
+ const struct nvme_command *cmd;
+ uint8_t sc_status;
+
+ cmd = nvmf_capsule_sqe(nc);
+ if (ctrlr == NULL) {
+ printf("NVMFT: %s received CID %u opcode %u on newborn queue\n",
+ qp->name, le16toh(cmd->cid), cmd->opc);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ sc_status = nvmf_validate_command_capsule(nc);
+ if (sc_status != NVME_SC_SUCCESS) {
+ _nvmft_send_generic_error(qp, nc, sc_status);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ /* Don't bother byte-swapping CID. */
+ if (BIT_TEST_SET_ATOMIC(NUM_CIDS, cmd->cid, qp->cids)) {
+ _nvmft_send_generic_error(qp, nc, NVME_SC_COMMAND_ID_CONFLICT);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ if (qp->admin)
+ nvmft_handle_admin_command(ctrlr, nc);
+ else
+ nvmft_handle_io_command(qp, qp->qid, nc);
+}
+
+struct nvmft_qpair *
+nvmft_qpair_init(enum nvmf_trtype trtype,
+ const struct nvmf_handoff_qpair_params *handoff, uint16_t qid,
+ const char *name)
+{
+ struct nvmft_qpair *qp;
+
+ qp = malloc(sizeof(*qp), M_NVMFT, M_WAITOK | M_ZERO);
+ qp->admin = handoff->admin;
+ qp->sq_flow_control = handoff->sq_flow_control;
+ qp->qsize = handoff->qsize;
+ qp->qid = qid;
+ qp->sqhd = handoff->sqhd;
+ qp->sqtail = handoff->sqtail;
+ strlcpy(qp->name, name, sizeof(qp->name));
+ mtx_init(&qp->lock, "nvmft qp", NULL, MTX_DEF);
+ qp->cids = BITSET_ALLOC(NUM_CIDS, M_NVMFT, M_WAITOK | M_ZERO);
+
+ qp->qp = nvmf_allocate_qpair(trtype, true, handoff, nvmft_qpair_error,
+ qp, nvmft_receive_capsule, qp);
+ if (qp->qp == NULL) {
+ mtx_destroy(&qp->lock);
+ free(qp->cids, M_NVMFT);
+ free(qp, M_NVMFT);
+ return (NULL);
+ }
+
+ refcount_init(&qp->qp_refs, 1);
+ return (qp);
+}
+
+void
+nvmft_qpair_shutdown(struct nvmft_qpair *qp)
+{
+ struct nvmf_qpair *nq;
+
+ mtx_lock(&qp->lock);
+ nq = qp->qp;
+ qp->qp = NULL;
+ mtx_unlock(&qp->lock);
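+	/* Drop the initial reference; the last sender frees the qpair. */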
+ if (nq != NULL && refcount_release(&qp->qp_refs))
+ nvmf_free_qpair(nq);
+}
+
+void
+nvmft_qpair_destroy(struct nvmft_qpair *qp)
+{
+ nvmft_qpair_shutdown(qp);
+ mtx_destroy(&qp->lock);
+ free(qp->cids, M_NVMFT);
+ free(qp, M_NVMFT);
+}
+
+struct nvmft_controller *
+nvmft_qpair_ctrlr(struct nvmft_qpair *qp)
+{
+ return (qp->ctrlr);
+}
+
+uint16_t
+nvmft_qpair_id(struct nvmft_qpair *qp)
+{
+ return (qp->qid);
+}
+
+const char *
+nvmft_qpair_name(struct nvmft_qpair *qp)
+{
+ return (qp->name);
+}
+
+static int
+_nvmft_send_response(struct nvmft_qpair *qp, const void *cqe)
+{
+ struct nvme_completion cpl;
+ struct nvmf_qpair *nq;
+ struct nvmf_capsule *rc;
+ int error;
+
+ memcpy(&cpl, cqe, sizeof(cpl));
+ mtx_lock(&qp->lock);
+ nq = qp->qp;
+ if (nq == NULL) {
+ mtx_unlock(&qp->lock);
+ return (ENOTCONN);
+ }
+ refcount_acquire(&qp->qp_refs);
+
+	/*
+	 * Set SQHD: with SQ flow control, each completion advances
+	 * the head, freeing an SQ slot for the host.
+	 */
+ if (qp->sq_flow_control) {
+ qp->sqhd = (qp->sqhd + 1) % qp->qsize;
+ cpl.sqhd = htole16(qp->sqhd);
+ } else
+ cpl.sqhd = 0;
+ mtx_unlock(&qp->lock);
+
+ rc = nvmf_allocate_response(nq, &cpl, M_WAITOK);
+ error = nvmf_transmit_capsule(rc);
+ nvmf_free_capsule(rc);
+
+ if (refcount_release(&qp->qp_refs))
+ nvmf_free_qpair(nq);
+ return (error);
+}
+
+void
+nvmft_command_completed(struct nvmft_qpair *qp, struct nvmf_capsule *nc)
+{
+ const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
+
+ /* Don't bother byte-swapping CID. */
+ KASSERT(BIT_ISSET(NUM_CIDS, cmd->cid, qp->cids),
+ ("%s: CID %u not busy", __func__, cmd->cid));
+
+ BIT_CLR_ATOMIC(NUM_CIDS, cmd->cid, qp->cids);
+}
+
+int
+nvmft_send_response(struct nvmft_qpair *qp, const void *cqe)
+{
+ const struct nvme_completion *cpl = cqe;
+
+ /* Don't bother byte-swapping CID. */
+ KASSERT(BIT_ISSET(NUM_CIDS, cpl->cid, qp->cids),
+ ("%s: CID %u not busy", __func__, cpl->cid));
+
+ BIT_CLR_ATOMIC(NUM_CIDS, cpl->cid, qp->cids);
+ return (_nvmft_send_response(qp, cqe));
+}
+
+void
+nvmft_init_cqe(void *cqe, struct nvmf_capsule *nc, uint16_t status)
+{
+ struct nvme_completion *cpl = cqe;
+ const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
+
+ memset(cpl, 0, sizeof(*cpl));
+ cpl->cid = cmd->cid;
+ cpl->status = htole16(status);
+}
+
+int
+nvmft_send_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
+ uint8_t sc_type, uint8_t sc_status)
+{
+ struct nvme_completion cpl;
+ uint16_t status;
+
+ status = NVMEF(NVME_STATUS_SCT, sc_type) |
+ NVMEF(NVME_STATUS_SC, sc_status);
+ nvmft_init_cqe(&cpl, nc, status);
+ return (nvmft_send_response(qp, &cpl));
+}
+
+int
+nvmft_send_generic_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
+ uint8_t sc_status)
+{
+ return (nvmft_send_error(qp, nc, NVME_SCT_GENERIC, sc_status));
+}
+
+/*
+ * This version doesn't clear the CID in qp->cids and is used for
+ * errors reported before the CID has been validated.
+ */
+static int
+_nvmft_send_generic_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
+ uint8_t sc_status)
+{
+ struct nvme_completion cpl;
+ uint16_t status;
+
+ status = NVMEF(NVME_STATUS_SCT, NVME_SCT_GENERIC) |
+ NVMEF(NVME_STATUS_SC, sc_status);
+ nvmft_init_cqe(&cpl, nc, status);
+ return (_nvmft_send_response(qp, &cpl));
+}
+
+int
+nvmft_send_success(struct nvmft_qpair *qp, struct nvmf_capsule *nc)
+{
+ return (nvmft_send_generic_error(qp, nc, NVME_SC_SUCCESS));
+}
+
+static void
+nvmft_init_connect_rsp(struct nvmf_fabric_connect_rsp *rsp,
+ const struct nvmf_fabric_connect_cmd *cmd, uint16_t status)
+{
+ memset(rsp, 0, sizeof(*rsp));
+ rsp->cid = cmd->cid;
+ rsp->status = htole16(status);
+}
+
+static int
+nvmft_send_connect_response(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_rsp *rsp)
+{
+ struct nvmf_capsule *rc;
+ struct nvmf_qpair *nq;
+ int error;
+
+ mtx_lock(&qp->lock);
+ nq = qp->qp;
+ if (nq == NULL) {
+ mtx_unlock(&qp->lock);
+ return (ENOTCONN);
+ }
+ refcount_acquire(&qp->qp_refs);
+ mtx_unlock(&qp->lock);
+
+	rc = nvmf_allocate_response(nq, rsp, M_WAITOK);
+ error = nvmf_transmit_capsule(rc);
+ nvmf_free_capsule(rc);
+
+ if (refcount_release(&qp->qp_refs))
+ nvmf_free_qpair(nq);
+ return (error);
+}
+
+void
+nvmft_connect_error(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_cmd *cmd, uint8_t sc_type,
+ uint8_t sc_status)
+{
+ struct nvmf_fabric_connect_rsp rsp;
+ uint16_t status;
+
+ status = NVMEF(NVME_STATUS_SCT, sc_type) |
+ NVMEF(NVME_STATUS_SC, sc_status);
+ nvmft_init_connect_rsp(&rsp, cmd, status);
+ nvmft_send_connect_response(qp, &rsp);
+}
+
+void
+nvmft_connect_invalid_parameters(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_cmd *cmd, bool data, uint16_t offset)
+{
+ struct nvmf_fabric_connect_rsp rsp;
+
+ nvmft_init_connect_rsp(&rsp, cmd,
+ NVMEF(NVME_STATUS_SCT, NVME_SCT_COMMAND_SPECIFIC) |
+ NVMEF(NVME_STATUS_SC, NVMF_FABRIC_SC_INVALID_PARAM));
+ rsp.status_code_specific.invalid.ipo = htole16(offset);
+ rsp.status_code_specific.invalid.iattr = data ? 1 : 0;
+ nvmft_send_connect_response(qp, &rsp);
+}
+
+int
+nvmft_finish_accept(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_cmd *cmd, struct nvmft_controller *ctrlr)
+{
+ struct nvmf_fabric_connect_rsp rsp;
+
+ qp->ctrlr = ctrlr;
+ nvmft_init_connect_rsp(&rsp, cmd, 0);
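+	/* An SQHD of 0xffff indicates SQ flow control is disabled. */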
+ if (qp->sq_flow_control)
+ rsp.sqhd = htole16(qp->sqhd);
+ else
+ rsp.sqhd = htole16(0xffff);
+ rsp.status_code_specific.success.cntlid = htole16(ctrlr->cntlid);
+ return (nvmft_send_connect_response(qp, &rsp));
+}
diff --git a/sys/dev/nvmf/controller/nvmft_var.h b/sys/dev/nvmf/controller/nvmft_var.h
new file mode 100644
index 000000000000..fc1f86754382
--- /dev/null
+++ b/sys/dev/nvmf/controller/nvmft_var.h
@@ -0,0 +1,174 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#ifndef __NVMFT_VAR_H__
+#define __NVMFT_VAR_H__
+
+#include <sys/_callout.h>
+#include <sys/refcount.h>
+#include <sys/taskqueue.h>
+
+#include <dev/nvmf/nvmf_proto.h>
+
+#include <cam/ctl/ctl.h>
+#include <cam/ctl/ctl_io.h>
+#include <cam/ctl/ctl_frontend.h>
+
+struct nvmf_capsule;
+struct nvmft_controller;
+struct nvmft_qpair;
+
+#define NVMFT_NUM_AER 16
+
+struct nvmft_port {
+ TAILQ_ENTRY(nvmft_port) link;
+ u_int refs;
+ struct ctl_port port;
+ struct nvme_controller_data cdata;
+ struct nvme_firmware_page fp;
+ uint64_t cap;
+ uint32_t max_io_qsize;
+ bool online;
+
+ struct sx lock;
+
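+	/* Allocator for controller IDs (cntlid). */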
+ struct unrhdr *ids;
+ TAILQ_HEAD(, nvmft_controller) controllers;
+
+ uint32_t *active_ns;
+ u_int num_ns;
+};
+
+struct nvmft_io_qpair {
+ struct nvmft_qpair *qp;
+
+ bool shutdown;
+};
+
+struct nvmft_controller {
+ struct nvmft_qpair *admin;
+ struct nvmft_io_qpair *io_qpairs;
+ u_int num_io_queues;
+ bool shutdown;
+ bool admin_closed;
+ uint16_t cntlid;
+ uint32_t cc;
+ uint32_t csts;
+
+ struct nvmft_port *np;
+ struct mtx lock;
+
+ struct nvme_controller_data cdata;
+ struct nvme_health_information_page hip;
+ sbintime_t create_time;
+ sbintime_t start_busy;
+ sbintime_t busy_total;
+ uint16_t partial_dur;
+ uint16_t partial_duw;
+
+ uint8_t hostid[16];
+ uint8_t hostnqn[NVME_NQN_FIELD_SIZE];
+ u_int trtype;
+
+ TAILQ_ENTRY(nvmft_controller) link;
+
+ /*
+ * Each queue can have at most UINT16_MAX commands, so the total
+ * across all queues will fit in a uint32_t.
+ */
+ uint32_t pending_commands;
+
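+	/* Nonzero if any command arrived since the last keep-alive check. */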
+ volatile int ka_active_traffic;
+ struct callout ka_timer;
+ sbintime_t ka_sbt;
+
+ /* AER fields. */
+ uint32_t aer_mask;
+ uint16_t aer_cids[NVMFT_NUM_AER];
+ uint8_t aer_pending;
+ uint8_t aer_cidx;
+ uint8_t aer_pidx;
+
+ /* Changed namespace IDs. */
+ struct nvme_ns_list *changed_ns;
+ bool changed_ns_reported;
+
+ struct task shutdown_task;
+ struct timeout_task terminate_task;
+};
+
+MALLOC_DECLARE(M_NVMFT);
+
+/* ctl_frontend_nvmf.c */
+void nvmft_port_free(struct nvmft_port *np);
+void nvmft_populate_active_nslist(struct nvmft_port *np, uint32_t nsid,
+ struct nvme_ns_list *nslist);
+void nvmft_dispatch_command(struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc, bool admin);
+void nvmft_terminate_commands(struct nvmft_controller *ctrlr);
+
+/* nvmft_controller.c */
+void nvmft_controller_error(struct nvmft_controller *ctrlr,
+ struct nvmft_qpair *qp, int error);
+void nvmft_controller_lun_changed(struct nvmft_controller *ctrlr,
+ int lun_id);
+void nvmft_handle_admin_command(struct nvmft_controller *ctrlr,
+ struct nvmf_capsule *nc);
+void nvmft_handle_io_command(struct nvmft_qpair *qp, uint16_t qid,
+ struct nvmf_capsule *nc);
+int nvmft_handoff_admin_queue(struct nvmft_port *np,
+ const struct nvmf_handoff_controller_qpair *handoff,
+ const struct nvmf_fabric_connect_cmd *cmd,
+ const struct nvmf_fabric_connect_data *data);
+int nvmft_handoff_io_queue(struct nvmft_port *np,
+ const struct nvmf_handoff_controller_qpair *handoff,
+ const struct nvmf_fabric_connect_cmd *cmd,
+ const struct nvmf_fabric_connect_data *data);
+int nvmft_printf(struct nvmft_controller *ctrlr, const char *fmt, ...)
+ __printflike(2, 3);
+
+/* nvmft_qpair.c */
+struct nvmft_qpair *nvmft_qpair_init(enum nvmf_trtype trtype,
+ const struct nvmf_handoff_qpair_params *handoff, uint16_t qid,
+ const char *name);
+void nvmft_qpair_shutdown(struct nvmft_qpair *qp);
+void nvmft_qpair_destroy(struct nvmft_qpair *qp);
+struct nvmft_controller *nvmft_qpair_ctrlr(struct nvmft_qpair *qp);
+uint16_t nvmft_qpair_id(struct nvmft_qpair *qp);
+const char *nvmft_qpair_name(struct nvmft_qpair *qp);
+void nvmft_command_completed(struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc);
+int nvmft_send_response(struct nvmft_qpair *qp, const void *cqe);
+void nvmft_init_cqe(void *cqe, struct nvmf_capsule *nc, uint16_t status);
+int nvmft_send_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
+ uint8_t sc_type, uint8_t sc_status);
+int nvmft_send_generic_error(struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc, uint8_t sc_status);
+int nvmft_send_success(struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc);
+void nvmft_connect_error(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_cmd *cmd, uint8_t sc_type,
+ uint8_t sc_status);
+void nvmft_connect_invalid_parameters(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_cmd *cmd, bool data, uint16_t offset);
+int nvmft_finish_accept(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_cmd *cmd, struct nvmft_controller *ctrlr);
+
+static __inline void
+nvmft_port_ref(struct nvmft_port *np)
+{
+ refcount_acquire(&np->refs);
+}
+
+static __inline void
+nvmft_port_rele(struct nvmft_port *np)
+{
+ if (refcount_release(&np->refs))
+ nvmft_port_free(np);
+}
+
+#endif /* !__NVMFT_VAR_H__ */
diff --git a/sys/modules/nvmf/Makefile b/sys/modules/nvmf/Makefile
index a380c8c63651..88ea40be1257 100644
--- a/sys/modules/nvmf/Makefile
+++ b/sys/modules/nvmf/Makefile
@@ -1,5 +1,6 @@
SUBDIR= nvmf \
nvmf_tcp \
- nvmf_transport
+ nvmf_transport \
+ nvmft
.include <bsd.subdir.mk>
diff --git a/sys/modules/nvmf/nvmft/Makefile b/sys/modules/nvmf/nvmft/Makefile
new file mode 100644
index 000000000000..b47ffa53c310
--- /dev/null
+++ b/sys/modules/nvmf/nvmft/Makefile
@@ -0,0 +1,10 @@
+.PATH: ${SRCTOP}/sys/dev/nvmf/controller
+
+KMOD= nvmft
+
+SRCS= ctl_frontend_nvmf.c \
+ nvmft_controller.c \
+ nvmft_subr.c \
+ nvmft_qpair.c
+
+.include <bsd.kmod.mk>