author | John Baldwin <jhb@FreeBSD.org> | 2024-05-02 23:35:40 +0000 |
---|---|---|
committer | John Baldwin <jhb@FreeBSD.org> | 2024-05-02 23:38:39 +0000 |
commit | a8089ea5aee578e08acab2438e82fc9a9ae50ed8 (patch) | |
tree | 16313ac62d995d187f5d46daaf3907b98cddb224 | |
parent | 09a931554adf9b726d77e42eedcb416cd0aa7798 (diff) | |
nvmfd: A simple userspace daemon for the NVMe over Fabrics controller
This daemon can operate as a purely userspace controller exporting one
or more simulated RAM disks or local block devices as NVMe namespaces
to a remote host. In this case the daemon provides a discovery
controller with a single entry for an I/O controller.
nvmfd can also offload I/O controller queue pairs to the nvmft.ko
in-kernel Fabrics controller when -K is passed. In this mode, nvmfd
still accepts connections and performs initial transport-specific
negotiation in userland. The daemon still provides a userspace-only
discovery controller with a single entry for an I/O controller.
However, queue pairs for the I/O controller are handed off to the CTL
NVMF frontend.
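For example, two sample invocations pieced together from the nvmfd.8 page
added below (the -d debug flag, the RAM disk size, and the /dev/da0 device
argument are illustrative choices, not defaults):
    nvmfd -d ramdisk:4G /dev/da0    # userspace controller exporting two namespaces
    nvmfd -d -K                     # hand I/O queue pairs off to nvmft.ko / CTL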
Eventually ctld(8) should be refactored to provide an abstraction
for the frontend protocol, and the discovery and kernel modes of
this daemon should be merged into ctld(8). At that point this daemon
can be moved to tools/tools/nvmf as a debugging tool (mostly as sample
code for a userspace controller using libnvmf).
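As a rough sketch of that libnvmf accept path, the fragment below is condensed
from discovery.c and io.c in this commit; the function names setup_association()
and accept_qpair() are invented for illustration, the socket is assumed to be
already accepted, only a subset of the association parameters is shown, and the
error paths and command-processing loop are omitted.

#include <err.h>
#include <libnvmf.h>
#include <string.h>

static struct nvmf_association *na;

/* Create a TCP controller-side association (condensed from init_discovery()/init_io()). */
static void
setup_association(void)
{
	struct nvmf_association_params aparams;

	memset(&aparams, 0, sizeof(aparams));
	aparams.sq_flow_control = false;
	aparams.dynamic_controller_model = true;
	aparams.max_admin_qsize = NVME_MAX_ADMIN_ENTRIES;
	aparams.tcp.maxr2t = 1;
	aparams.tcp.maxh2cdata = 256 * 1024;
	na = nvmf_allocate_association(NVMF_TRTYPE_TCP, true, &aparams);
	if (na == NULL)
		err(1, "Failed to create association");
}

/* Accept one queue pair on an already-connected TCP socket. */
static void
accept_qpair(int s)
{
	struct nvmf_fabric_connect_data data;
	struct nvmf_qpair_params qparams;
	struct nvmf_capsule *nc;
	struct nvmf_qpair *qp;

	memset(&qparams, 0, sizeof(qparams));
	qparams.tcp.fd = s;

	/* Run the transport handshake and receive the CONNECT capsule. */
	qp = nvmf_accept(na, &qparams, &nc, &data);
	if (qp == NULL)
		errx(1, "nvmf_accept: %s", nvmf_association_error(na));

	/* Complete CONNECT with a controller ID, then service capsules. */
	if (nvmf_finish_accept(nc, 1) != 0)
		errx(1, "Failed to send CONNECT response");
	nvmf_free_capsule(nc);

	/* controller_handle_admin_commands() would loop here in nvmfd. */
	nvmf_free_qpair(qp);
}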
Reviewed by: imp
Sponsored by: Chelsio Communications
Differential Revision: https://reviews.freebsd.org/D44731
-rw-r--r-- | usr.sbin/Makefile | 1
-rw-r--r-- | usr.sbin/nvmfd/Makefile | 14
-rw-r--r-- | usr.sbin/nvmfd/controller.c | 244
-rw-r--r-- | usr.sbin/nvmfd/ctl.c | 139
-rw-r--r-- | usr.sbin/nvmfd/devices.c | 386
-rw-r--r-- | usr.sbin/nvmfd/discovery.c | 343
-rw-r--r-- | usr.sbin/nvmfd/internal.h | 65
-rw-r--r-- | usr.sbin/nvmfd/io.c | 677
-rw-r--r-- | usr.sbin/nvmfd/nvmfd.8 | 126
-rw-r--r-- | usr.sbin/nvmfd/nvmfd.c | 260
10 files changed, 2255 insertions, 0 deletions
diff --git a/usr.sbin/Makefile b/usr.sbin/Makefile index c3a4cc42f721..0aac7062146d 100644 --- a/usr.sbin/Makefile +++ b/usr.sbin/Makefile @@ -56,6 +56,7 @@ SUBDIR= adduser \ nfsuserd \ nmtree \ nologin \ + nvmfd \ pciconf \ periodic \ pnfsdscopymr \ diff --git a/usr.sbin/nvmfd/Makefile b/usr.sbin/nvmfd/Makefile new file mode 100644 index 000000000000..dc3dcc5e3a5c --- /dev/null +++ b/usr.sbin/nvmfd/Makefile @@ -0,0 +1,14 @@ +.include <src.opts.mk> +.PATH: ${SRCTOP}/sys/libkern + +PACKAGE=nvme-tools +PROG= nvmfd +SRCS= nvmfd.c controller.c ctl.c devices.c discovery.c gsb_crc32.c io.c +CFLAGS+= -I${SRCTOP}/lib/libnvmf +MAN= nvmfd.8 +LIBADD+= nvmf pthread util nv + +.include <bsd.prog.mk> + +CFLAGS.ctl.c= -I${SRCTOP}/sys +CWARNFLAGS.gsb_crc32.c= -Wno-cast-align diff --git a/usr.sbin/nvmfd/controller.c b/usr.sbin/nvmfd/controller.c new file mode 100644 index 000000000000..09baaea74ab4 --- /dev/null +++ b/usr.sbin/nvmfd/controller.c @@ -0,0 +1,244 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin <jhb@FreeBSD.org> + */ + +#include <err.h> +#include <errno.h> +#include <libnvmf.h> +#include <stdlib.h> + +#include "internal.h" + +struct controller { + struct nvmf_qpair *qp; + + uint64_t cap; + uint32_t vs; + uint32_t cc; + uint32_t csts; + + bool shutdown; + + struct nvme_controller_data cdata; +}; + +static bool +update_cc(struct controller *c, uint32_t new_cc) +{ + uint32_t changes; + + if (c->shutdown) + return (false); + if (!nvmf_validate_cc(c->qp, c->cap, c->cc, new_cc)) + return (false); + + changes = c->cc ^ new_cc; + c->cc = new_cc; + + /* Handle shutdown requests. */ + if (NVMEV(NVME_CC_REG_SHN, changes) != 0 && + NVMEV(NVME_CC_REG_SHN, new_cc) != 0) { + c->csts &= ~NVMEM(NVME_CSTS_REG_SHST); + c->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_COMPLETE); + c->shutdown = true; + } + + if (NVMEV(NVME_CC_REG_EN, changes) != 0) { + if (NVMEV(NVME_CC_REG_EN, new_cc) == 0) { + /* Controller reset. 
*/ + c->csts = 0; + c->shutdown = true; + } else + c->csts |= NVMEF(NVME_CSTS_REG_RDY, 1); + } + return (true); +} + +static void +handle_property_get(const struct controller *c, const struct nvmf_capsule *nc, + const struct nvmf_fabric_prop_get_cmd *pget) +{ + struct nvmf_fabric_prop_get_rsp rsp; + + nvmf_init_cqe(&rsp, nc, 0); + + switch (le32toh(pget->ofst)) { + case NVMF_PROP_CAP: + if (pget->attrib.size != NVMF_PROP_SIZE_8) + goto error; + rsp.value.u64 = htole64(c->cap); + break; + case NVMF_PROP_VS: + if (pget->attrib.size != NVMF_PROP_SIZE_4) + goto error; + rsp.value.u32.low = htole32(c->vs); + break; + case NVMF_PROP_CC: + if (pget->attrib.size != NVMF_PROP_SIZE_4) + goto error; + rsp.value.u32.low = htole32(c->cc); + break; + case NVMF_PROP_CSTS: + if (pget->attrib.size != NVMF_PROP_SIZE_4) + goto error; + rsp.value.u32.low = htole32(c->csts); + break; + default: + goto error; + } + + nvmf_send_response(nc, &rsp); + return; +error: + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); +} + +static void +handle_property_set(struct controller *c, const struct nvmf_capsule *nc, + const struct nvmf_fabric_prop_set_cmd *pset) +{ + switch (le32toh(pset->ofst)) { + case NVMF_PROP_CC: + if (pset->attrib.size != NVMF_PROP_SIZE_4) + goto error; + if (!update_cc(c, le32toh(pset->value.u32.low))) + goto error; + break; + default: + goto error; + } + + nvmf_send_success(nc); + return; +error: + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); +} + +static void +handle_fabrics_command(struct controller *c, + const struct nvmf_capsule *nc, const struct nvmf_fabric_cmd *fc) +{ + switch (fc->fctype) { + case NVMF_FABRIC_COMMAND_PROPERTY_GET: + handle_property_get(c, nc, + (const struct nvmf_fabric_prop_get_cmd *)fc); + break; + case NVMF_FABRIC_COMMAND_PROPERTY_SET: + handle_property_set(c, nc, + (const struct nvmf_fabric_prop_set_cmd *)fc); + break; + case NVMF_FABRIC_COMMAND_CONNECT: + warnx("CONNECT command on connected queue"); + nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR); + break; + case NVMF_FABRIC_COMMAND_DISCONNECT: + warnx("DISCONNECT command on admin queue"); + nvmf_send_error(nc, NVME_SCT_COMMAND_SPECIFIC, + NVMF_FABRIC_SC_INVALID_QUEUE_TYPE); + break; + default: + warnx("Unsupported fabrics command %#x", fc->fctype); + nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE); + break; + } +} + +static void +handle_identify_command(const struct controller *c, + const struct nvmf_capsule *nc, const struct nvme_command *cmd) +{ + uint8_t cns; + + cns = le32toh(cmd->cdw10) & 0xFF; + switch (cns) { + case 1: + break; + default: + warnx("Unsupported CNS %#x for IDENTIFY", cns); + goto error; + } + + nvmf_send_controller_data(nc, &c->cdata, sizeof(c->cdata)); + return; +error: + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); +} + +void +controller_handle_admin_commands(struct controller *c, handle_command *cb, + void *cb_arg) +{ + struct nvmf_qpair *qp = c->qp; + const struct nvme_command *cmd; + struct nvmf_capsule *nc; + int error; + + for (;;) { + error = nvmf_controller_receive_capsule(qp, &nc); + if (error != 0) { + if (error != ECONNRESET) + warnc(error, "Failed to read command capsule"); + break; + } + + cmd = nvmf_capsule_sqe(nc); + + /* + * Only permit Fabrics commands while a controller is + * disabled. 
+ */ + if (NVMEV(NVME_CC_REG_EN, c->cc) == 0 && + cmd->opc != NVME_OPC_FABRICS_COMMANDS) { + warnx("Unsupported admin opcode %#x whiled disabled\n", + cmd->opc); + nvmf_send_generic_error(nc, + NVME_SC_COMMAND_SEQUENCE_ERROR); + nvmf_free_capsule(nc); + continue; + } + + if (cb(nc, cmd, cb_arg)) { + nvmf_free_capsule(nc); + continue; + } + + switch (cmd->opc) { + case NVME_OPC_FABRICS_COMMANDS: + handle_fabrics_command(c, nc, + (const struct nvmf_fabric_cmd *)cmd); + break; + case NVME_OPC_IDENTIFY: + handle_identify_command(c, nc, cmd); + break; + default: + warnx("Unsupported admin opcode %#x", cmd->opc); + nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE); + break; + } + nvmf_free_capsule(nc); + } +} + +struct controller * +init_controller(struct nvmf_qpair *qp, + const struct nvme_controller_data *cdata) +{ + struct controller *c; + + c = calloc(1, sizeof(*c)); + c->qp = qp; + c->cap = nvmf_controller_cap(c->qp); + c->vs = cdata->ver; + c->cdata = *cdata; + + return (c); +} + +void +free_controller(struct controller *c) +{ + free(c); +} diff --git a/usr.sbin/nvmfd/ctl.c b/usr.sbin/nvmfd/ctl.c new file mode 100644 index 000000000000..5f01ec8e5bc8 --- /dev/null +++ b/usr.sbin/nvmfd/ctl.c @@ -0,0 +1,139 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Chelsio Communications, Inc. + * Written by: John Baldwin <jhb@FreeBSD.org> + */ + +#include <sys/param.h> +#include <sys/linker.h> +#include <sys/nv.h> +#include <sys/time.h> +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <libnvmf.h> +#include <string.h> + +#include <cam/ctl/ctl.h> +#include <cam/ctl/ctl_io.h> +#include <cam/ctl/ctl_ioctl.h> + +#include "internal.h" + +static int ctl_fd = -1; +static int ctl_port; + +static void +open_ctl(void) +{ + if (ctl_fd > 0) + return; + + ctl_fd = open(CTL_DEFAULT_DEV, O_RDWR); + if (ctl_fd == -1 && errno == ENOENT) { + if (kldload("ctl") == -1) + err(1, "Failed to load ctl.ko"); + ctl_fd = open(CTL_DEFAULT_DEV, O_RDWR); + } + if (ctl_fd == -1) + err(1, "Failed to open %s", CTL_DEFAULT_DEV); +} + +void +init_ctl_port(const char *subnqn, const struct nvmf_association_params *params) +{ + char result_buf[256]; + struct ctl_port_entry entry; + struct ctl_req req; + nvlist_t *nvl; + + open_ctl(); + + nvl = nvlist_create(0); + + nvlist_add_string(nvl, "subnqn", subnqn); + + /* XXX: Hardcoded in discovery.c */ + nvlist_add_stringf(nvl, "portid", "%u", 1); + + nvlist_add_stringf(nvl, "max_io_qsize", "%u", params->max_io_qsize); + + memset(&req, 0, sizeof(req)); + strlcpy(req.driver, "nvmf", sizeof(req.driver)); + req.reqtype = CTL_REQ_CREATE; + req.args = nvlist_pack(nvl, &req.args_len); + if (req.args == NULL) + errx(1, "Failed to pack nvlist for CTL_PORT/CTL_REQ_CREATE"); + req.result = result_buf; + req.result_len = sizeof(result_buf); + if (ioctl(ctl_fd, CTL_PORT_REQ, &req) != 0) + err(1, "ioctl(CTL_PORT/CTL_REQ_CREATE)"); + if (req.status == CTL_LUN_ERROR) + errx(1, "Failed to create CTL port: %s", req.error_str); + if (req.status != CTL_LUN_OK) + errx(1, "Failed to create CTL port: %d", req.status); + + nvlist_destroy(nvl); + nvl = nvlist_unpack(result_buf, req.result_len, 0); + if (nvl == NULL) + errx(1, "Failed to unpack nvlist from CTL_PORT/CTL_REQ_CREATE"); + + ctl_port = nvlist_get_number(nvl, "port_id"); + nvlist_destroy(nvl); + + memset(&entry, 0, sizeof(entry)); + entry.targ_port = ctl_port; + if (ioctl(ctl_fd, CTL_ENABLE_PORT, &entry) != 0) + errx(1, "ioctl(CTL_ENABLE_PORT)"); +} + +void +shutdown_ctl_port(const char *subnqn) +{ + struct 
ctl_req req; + nvlist_t *nvl; + + open_ctl(); + + nvl = nvlist_create(0); + + nvlist_add_string(nvl, "subnqn", subnqn); + + memset(&req, 0, sizeof(req)); + strlcpy(req.driver, "nvmf", sizeof(req.driver)); + req.reqtype = CTL_REQ_REMOVE; + req.args = nvlist_pack(nvl, &req.args_len); + if (req.args == NULL) + errx(1, "Failed to pack nvlist for CTL_PORT/CTL_REQ_REMOVE"); + if (ioctl(ctl_fd, CTL_PORT_REQ, &req) != 0) + err(1, "ioctl(CTL_PORT/CTL_REQ_REMOVE)"); + if (req.status == CTL_LUN_ERROR) + errx(1, "Failed to remove CTL port: %s", req.error_str); + if (req.status != CTL_LUN_OK) + errx(1, "Failed to remove CTL port: %d", req.status); + + nvlist_destroy(nvl); +} + +void +ctl_handoff_qpair(struct nvmf_qpair *qp, + const struct nvmf_fabric_connect_cmd *cmd, + const struct nvmf_fabric_connect_data *data) +{ + struct ctl_nvmf req; + int error; + + memset(&req, 0, sizeof(req)); + req.type = CTL_NVMF_HANDOFF; + error = nvmf_handoff_controller_qpair(qp, &req.data.handoff); + if (error != 0) { + warnc(error, "Failed to prepare qpair for handoff"); + return; + } + + req.data.handoff.cmd = cmd; + req.data.handoff.data = data; + if (ioctl(ctl_fd, CTL_NVMF, &req) != 0) + warn("ioctl(CTL_NVMF/CTL_NVMF_HANDOFF)"); +} diff --git a/usr.sbin/nvmfd/devices.c b/usr.sbin/nvmfd/devices.c new file mode 100644 index 000000000000..fafc1077f207 --- /dev/null +++ b/usr.sbin/nvmfd/devices.c @@ -0,0 +1,386 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin <jhb@FreeBSD.org> + */ + +#include <sys/disk.h> +#include <sys/gsb_crc32.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <net/ieee_oui.h> +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <libnvmf.h> +#include <libutil.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "internal.h" + +#define RAMDISK_PREFIX "ramdisk:" + +struct backing_device { + enum { RAMDISK, FILE, CDEV } type; + union { + int fd; /* FILE, CDEV */ + void *mem; /* RAMDISK */ + }; + u_int sector_size; + uint64_t nlbas; + uint64_t eui64; +}; + +static struct backing_device *devices; +static u_int ndevices; + +static uint64_t +generate_eui64(uint32_t low) +{ + return (OUI_FREEBSD_NVME_LOW << 16 | low); +} + +static uint32_t +crc32(const void *buf, size_t len) +{ + return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff); +} + +static void +init_ramdisk(const char *config, struct backing_device *dev) +{ + static uint32_t ramdisk_idx = 1; + uint64_t num; + + dev->type = RAMDISK; + dev->sector_size = 512; + if (expand_number(config, &num)) + errx(1, "Invalid ramdisk specification: %s", config); + if ((num % dev->sector_size) != 0) + errx(1, "Invalid ramdisk size %ju", (uintmax_t)num); + dev->mem = calloc(num, 1); + dev->nlbas = num / dev->sector_size; + dev->eui64 = generate_eui64('M' << 24 | ramdisk_idx++); +} + +static void +init_filedevice(const char *config, int fd, struct stat *sb, + struct backing_device *dev) +{ + dev->type = FILE; + dev->fd = fd; + dev->sector_size = 512; + if ((sb->st_size % dev->sector_size) != 0) + errx(1, "File size is not a multiple of 512: %s", config); + dev->nlbas = sb->st_size / dev->sector_size; + dev->eui64 = generate_eui64('F' << 24 | + (crc32(config, strlen(config)) & 0xffffff)); +} + +static void +init_chardevice(const char *config, int fd, struct backing_device *dev) +{ + off_t len; + + dev->type = CDEV; + dev->fd = fd; + if (ioctl(fd, DIOCGSECTORSIZE, &dev->sector_size) != 0) + err(1, "Failed to fetch sector size 
for %s", config); + if (ioctl(fd, DIOCGMEDIASIZE, &len) != 0) + err(1, "Failed to fetch sector size for %s", config); + dev->nlbas = len / dev->sector_size; + dev->eui64 = generate_eui64('C' << 24 | + (crc32(config, strlen(config)) & 0xffffff)); +} + +static void +init_device(const char *config, struct backing_device *dev) +{ + struct stat sb; + int fd; + + /* Check for a RAM disk. */ + if (strncmp(RAMDISK_PREFIX, config, strlen(RAMDISK_PREFIX)) == 0) { + init_ramdisk(config + strlen(RAMDISK_PREFIX), dev); + return; + } + + fd = open(config, O_RDWR); + if (fd == -1) + err(1, "Failed to open %s", config); + if (fstat(fd, &sb) == -1) + err(1, "fstat"); + switch (sb.st_mode & S_IFMT) { + case S_IFCHR: + init_chardevice(config, fd, dev); + break; + case S_IFREG: + init_filedevice(config, fd, &sb, dev); + break; + default: + errx(1, "Invalid file type for %s", config); + } +} + +void +register_devices(int ac, char **av) +{ + ndevices = ac; + devices = calloc(ndevices, sizeof(*devices)); + + for (int i = 0; i < ac; i++) + init_device(av[i], &devices[i]); +} + +u_int +device_count(void) +{ + return (ndevices); +} + +static struct backing_device * +lookup_device(uint32_t nsid) +{ + if (nsid == 0 || nsid > ndevices) + return (NULL); + return (&devices[nsid - 1]); +} + +void +device_active_nslist(uint32_t nsid, struct nvme_ns_list *nslist) +{ + u_int count; + + memset(nslist, 0, sizeof(*nslist)); + count = 0; + nsid++; + while (nsid <= ndevices) { + nslist->ns[count] = htole32(nsid); + count++; + if (count == nitems(nslist->ns)) + break; + nsid++; + } +} + +bool +device_identification_descriptor(uint32_t nsid, void *buf) +{ + struct backing_device *dev; + char *p; + + dev = lookup_device(nsid); + if (dev == NULL) + return (false); + + memset(buf, 0, 4096); + + p = buf; + + /* EUI64 */ + *p++ = 1; + *p++ = 8; + p += 2; + be64enc(p, dev->eui64); + return (true); +} + +bool +device_namespace_data(uint32_t nsid, struct nvme_namespace_data *nsdata) +{ + struct backing_device *dev; + + dev = lookup_device(nsid); + if (dev == NULL) + return (false); + + memset(nsdata, 0, sizeof(*nsdata)); + nsdata->nsze = htole64(dev->nlbas); + nsdata->ncap = nsdata->nsze; + nsdata->nuse = nsdata->ncap; + nsdata->nlbaf = 1 - 1; + nsdata->flbas = NVMEF(NVME_NS_DATA_FLBAS_FORMAT, 0); + nsdata->lbaf[0] = NVMEF(NVME_NS_DATA_LBAF_LBADS, + ffs(dev->sector_size) - 1); + + be64enc(nsdata->eui64, dev->eui64); + return (true); +} + +static bool +read_buffer(int fd, void *buf, size_t len, off_t offset) +{ + ssize_t nread; + char *dst; + + dst = buf; + while (len > 0) { + nread = pread(fd, dst, len, offset); + if (nread == -1 && errno == EINTR) + continue; + if (nread <= 0) + return (false); + dst += nread; + len -= nread; + offset += nread; + } + return (true); +} + +void +device_read(uint32_t nsid, uint64_t lba, u_int nlb, + const struct nvmf_capsule *nc) +{ + struct backing_device *dev; + char *p, *src; + off_t off; + size_t len; + + dev = lookup_device(nsid); + if (dev == NULL) { + nvmf_send_generic_error(nc, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + return; + } + + if (lba + nlb < lba || lba + nlb > dev->nlbas) { + nvmf_send_generic_error(nc, NVME_SC_LBA_OUT_OF_RANGE); + return; + } + + off = lba * dev->sector_size; + len = nlb * dev->sector_size; + if (nvmf_capsule_data_len(nc) != len) { + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); + return; + } + + if (dev->type == RAMDISK) { + p = NULL; + src = (char *)dev->mem + off; + } else { + p = malloc(len); + if (!read_buffer(dev->fd, p, len, off)) { + free(p); + 
nvmf_send_generic_error(nc, + NVME_SC_INTERNAL_DEVICE_ERROR); + return; + } + src = p; + } + + nvmf_send_controller_data(nc, src, len); + free(p); +} + +static bool +write_buffer(int fd, const void *buf, size_t len, off_t offset) +{ + ssize_t nwritten; + const char *src; + + src = buf; + while (len > 0) { + nwritten = pwrite(fd, src, len, offset); + if (nwritten == -1 && errno == EINTR) + continue; + if (nwritten <= 0) + return (false); + src += nwritten; + len -= nwritten; + offset += nwritten; + } + return (true); +} + +void +device_write(uint32_t nsid, uint64_t lba, u_int nlb, + const struct nvmf_capsule *nc) +{ + struct backing_device *dev; + char *p, *dst; + off_t off; + size_t len; + int error; + + dev = lookup_device(nsid); + if (dev == NULL) { + nvmf_send_generic_error(nc, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + return; + } + + if (lba + nlb < lba || lba + nlb > dev->nlbas) { + nvmf_send_generic_error(nc, NVME_SC_LBA_OUT_OF_RANGE); + return; + } + + off = lba * dev->sector_size; + len = nlb * dev->sector_size; + if (nvmf_capsule_data_len(nc) != len) { + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); + return; + } + + if (dev->type == RAMDISK) { + p = NULL; + dst = (char *)dev->mem + off; + } else { + p = malloc(len); + dst = p; + } + + error = nvmf_receive_controller_data(nc, 0, dst, len); + if (error != 0) { + nvmf_send_generic_error(nc, NVME_SC_TRANSIENT_TRANSPORT_ERROR); + free(p); + return; + } + + if (dev->type != RAMDISK) { + if (!write_buffer(dev->fd, p, len, off)) { + free(p); + nvmf_send_generic_error(nc, + NVME_SC_INTERNAL_DEVICE_ERROR); + return; + } + } + free(p); + nvmf_send_success(nc); +} + +void +device_flush(uint32_t nsid, const struct nvmf_capsule *nc) +{ + struct backing_device *dev; + + dev = lookup_device(nsid); + if (dev == NULL) { + nvmf_send_generic_error(nc, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + return; + } + + switch (dev->type) { + case RAMDISK: + break; + case FILE: + if (fdatasync(dev->fd) == -1) { + nvmf_send_error(nc, NVME_SCT_MEDIA_ERROR, + NVME_SC_WRITE_FAULTS); + return; + } + break; + case CDEV: + if (ioctl(dev->fd, DIOCGFLUSH) == -1) { + nvmf_send_error(nc, NVME_SCT_MEDIA_ERROR, + NVME_SC_WRITE_FAULTS); + return; + } + } + + nvmf_send_success(nc); +} diff --git a/usr.sbin/nvmfd/discovery.c b/usr.sbin/nvmfd/discovery.c new file mode 100644 index 000000000000..985c77620a62 --- /dev/null +++ b/usr.sbin/nvmfd/discovery.c @@ -0,0 +1,343 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. 
+ * Written by: John Baldwin <jhb@FreeBSD.org> + */ + +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <assert.h> +#include <err.h> +#include <libnvmf.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "internal.h" + +struct io_controller_data { + struct nvme_discovery_log_entry entry; + bool wildcard; +}; + +struct discovery_controller { + struct nvme_discovery_log *discovery_log; + size_t discovery_log_len; + int s; +}; + +struct discovery_thread_arg { + struct controller *c; + struct nvmf_qpair *qp; + int s; +}; + +static struct io_controller_data *io_controllers; +static struct nvmf_association *discovery_na; +static u_int num_io_controllers; + +static bool +init_discovery_log_entry(struct nvme_discovery_log_entry *entry, int s, + const char *subnqn) +{ + struct sockaddr_storage ss; + socklen_t len; + bool wildcard; + + len = sizeof(ss); + if (getsockname(s, (struct sockaddr *)&ss, &len) == -1) + err(1, "getsockname"); + + memset(entry, 0, sizeof(*entry)); + entry->trtype = NVMF_TRTYPE_TCP; + switch (ss.ss_family) { + case AF_INET: + { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&ss; + entry->adrfam = NVMF_ADRFAM_IPV4; + snprintf(entry->trsvcid, sizeof(entry->trsvcid), "%u", + htons(sin->sin_port)); + if (inet_ntop(AF_INET, &sin->sin_addr, entry->traddr, + sizeof(entry->traddr)) == NULL) + err(1, "inet_ntop"); + wildcard = (sin->sin_addr.s_addr == htonl(INADDR_ANY)); + break; + } + case AF_INET6: + { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&ss; + entry->adrfam = NVMF_ADRFAM_IPV6; + snprintf(entry->trsvcid, sizeof(entry->trsvcid), "%u", + htons(sin6->sin6_port)); + if (inet_ntop(AF_INET6, &sin6->sin6_addr, entry->traddr, + sizeof(entry->traddr)) == NULL) + err(1, "inet_ntop"); + wildcard = (memcmp(&sin6->sin6_addr, &in6addr_any, + sizeof(in6addr_any)) == 0); + break; + } + default: + errx(1, "Unsupported address family %u", ss.ss_family); + } + entry->subtype = NVMF_SUBTYPE_NVME; + if (flow_control_disable) + entry->treq |= (1 << 2); + entry->portid = htole16(1); + entry->cntlid = htole16(NVMF_CNTLID_DYNAMIC); + entry->aqsz = NVME_MAX_ADMIN_ENTRIES; + strlcpy(entry->subnqn, subnqn, sizeof(entry->subnqn)); + return (wildcard); +} + +void +init_discovery(void) +{ + struct nvmf_association_params aparams; + + memset(&aparams, 0, sizeof(aparams)); + aparams.sq_flow_control = false; + aparams.dynamic_controller_model = true; + aparams.max_admin_qsize = NVME_MAX_ADMIN_ENTRIES; + aparams.tcp.pda = 0; + aparams.tcp.header_digests = header_digests; + aparams.tcp.data_digests = data_digests; + aparams.tcp.maxr2t = 1; + aparams.tcp.maxh2cdata = 256 * 1024; + discovery_na = nvmf_allocate_association(NVMF_TRTYPE_TCP, true, + &aparams); + if (discovery_na == NULL) + err(1, "Failed to create discovery association"); +} + +void +discovery_add_io_controller(int s, const char *subnqn) +{ + struct io_controller_data *icd; + + io_controllers = reallocf(io_controllers, (num_io_controllers + 1) * + sizeof(*io_controllers)); + + icd = &io_controllers[num_io_controllers]; + num_io_controllers++; + + icd->wildcard = init_discovery_log_entry(&icd->entry, s, subnqn); +} + +static void +build_discovery_log_page(struct discovery_controller *dc) +{ + struct sockaddr_storage ss; + socklen_t len; + char traddr[256]; + u_int i, nentries; + uint8_t adrfam; + + if (dc->discovery_log != NULL) + return; + + len = sizeof(ss); + if (getsockname(dc->s, (struct sockaddr *)&ss, &len) == 
-1) { + warn("build_discovery_log_page: getsockname"); + return; + } + + memset(traddr, 0, sizeof(traddr)); + switch (ss.ss_family) { + case AF_INET: + { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&ss; + adrfam = NVMF_ADRFAM_IPV4; + if (inet_ntop(AF_INET, &sin->sin_addr, traddr, + sizeof(traddr)) == NULL) { + warn("build_discovery_log_page: inet_ntop"); + return; + } + break; + } + case AF_INET6: + { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&ss; + adrfam = NVMF_ADRFAM_IPV6; + if (inet_ntop(AF_INET6, &sin6->sin6_addr, traddr, + sizeof(traddr)) == NULL) { + warn("build_discovery_log_page: inet_ntop"); + return; + } + break; + } + default: + assert(false); + } + + nentries = 0; + for (i = 0; i < num_io_controllers; i++) { + if (io_controllers[i].wildcard && + io_controllers[i].entry.adrfam != adrfam) + continue; + nentries++; + } + + dc->discovery_log_len = sizeof(*dc->discovery_log) + + nentries * sizeof(struct nvme_discovery_log_entry); + dc->discovery_log = calloc(dc->discovery_log_len, 1); + dc->discovery_log->numrec = nentries; + dc->discovery_log->recfmt = 0; + nentries = 0; + for (i = 0; i < num_io_controllers; i++) { + if (io_controllers[i].wildcard && + io_controllers[i].entry.adrfam != adrfam) + continue; + + dc->discovery_log->entries[nentries] = io_controllers[i].entry; + if (io_controllers[i].wildcard) + memcpy(dc->discovery_log->entries[nentries].traddr, + traddr, sizeof(traddr)); + } +} + +static void +handle_get_log_page_command(const struct nvmf_capsule *nc, + const struct nvme_command *cmd, struct discovery_controller *dc) +{ + uint64_t offset; + uint32_t length; + + switch (nvmf_get_log_page_id(cmd)) { + case NVME_LOG_DISCOVERY: + break; + default: + warnx("Unsupported log page %u for discovery controller", + nvmf_get_log_page_id(cmd)); + goto error; + } + + build_discovery_log_page(dc); + + offset = nvmf_get_log_page_offset(cmd); + if (offset >= dc->discovery_log_len) + goto error; + + length = nvmf_get_log_page_length(cmd); + if (length > dc->discovery_log_len - offset) + length = dc->discovery_log_len - offset; + + nvmf_send_controller_data(nc, (char *)dc->discovery_log + offset, + length); + return; +error: + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); +} + +static bool +discovery_command(const struct nvmf_capsule *nc, const struct nvme_command *cmd, + void *arg) +{ + struct discovery_controller *dc = arg; + + switch (cmd->opc) { + case NVME_OPC_GET_LOG_PAGE: + handle_get_log_page_command(nc, cmd, dc); + return (true); + default: + return (false); + } +} + +static void * +discovery_thread(void *arg) +{ + struct discovery_thread_arg *dta = arg; + struct discovery_controller dc; + + pthread_detach(pthread_self()); + + memset(&dc, 0, sizeof(dc)); + dc.s = dta->s; + + controller_handle_admin_commands(dta->c, discovery_command, &dc); + + free(dc.discovery_log); + free_controller(dta->c); + + nvmf_free_qpair(dta->qp); + + close(dta->s); + free(dta); + return (NULL); +} + +void +handle_discovery_socket(int s) +{ + struct nvmf_fabric_connect_data data; + struct nvme_controller_data cdata; + struct nvmf_qpair_params qparams; + struct discovery_thread_arg *dta; + struct nvmf_capsule *nc; + struct nvmf_qpair *qp; + pthread_t thr; + int error; + + memset(&qparams, 0, sizeof(qparams)); + qparams.tcp.fd = s; + + nc = NULL; + qp = nvmf_accept(discovery_na, &qparams, &nc, &data); + if (qp == NULL) { + warnx("Failed to create discovery qpair: %s", + nvmf_association_error(discovery_na)); + goto error; + } + + if (strcmp(data.subnqn, 
NVMF_DISCOVERY_NQN) != 0) { + warn("Discovery qpair with invalid SubNQN: %.*s", + (int)sizeof(data.subnqn), data.subnqn); + nvmf_connect_invalid_parameters(nc, true, + offsetof(struct nvmf_fabric_connect_data, subnqn)); + goto error; + } + + /* Just use a controller ID of 1 for all discovery controllers. */ + error = nvmf_finish_accept(nc, 1); + if (error != 0) { + warnc(error, "Failed to send CONNECT reponse"); + goto error; + } + + nvmf_init_discovery_controller_data(qp, &cdata); + + dta = malloc(sizeof(*dta)); + dta->qp = qp; + dta->s = s; + dta->c = init_controller(qp, &cdata); + + error = pthread_create(&thr, NULL, discovery_thread, dta); + if (error != 0) { + warnc(error, "Failed to create discovery thread"); + free_controller(dta->c); + free(dta); + goto error; + } + + nvmf_free_capsule(nc); + return; + +error: + if (nc != NULL) + nvmf_free_capsule(nc); + if (qp != NULL) + nvmf_free_qpair(qp); + close(s); +} diff --git a/usr.sbin/nvmfd/internal.h b/usr.sbin/nvmfd/internal.h new file mode 100644 index 000000000000..5ddbc1cf89f0 --- /dev/null +++ b/usr.sbin/nvmfd/internal.h @@ -0,0 +1,65 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin <jhb@FreeBSD.org> + */ + +#ifndef __INTERNAL_H__ +#define __INTERNAL_H__ + +#include <stdbool.h> + +struct controller; +struct nvme_command; +struct nvme_controller_data; +struct nvme_ns_list; +struct nvmf_capsule; +struct nvmf_qpair; + +typedef bool handle_command(const struct nvmf_capsule *, + const struct nvme_command *, void *); + +extern bool data_digests; +extern bool header_digests; +extern bool flow_control_disable; +extern bool kernel_io; + +/* controller.c */ +void controller_handle_admin_commands(struct controller *c, + handle_command *cb, void *cb_arg); +struct controller *init_controller(struct nvmf_qpair *qp, + const struct nvme_controller_data *cdata); +void free_controller(struct controller *c); + +/* discovery.c */ +void init_discovery(void); +void handle_discovery_socket(int s); +void discovery_add_io_controller(int s, const char *subnqn); + +/* io.c */ +void init_io(const char *subnqn); +void handle_io_socket(int s); +void shutdown_io(void); + +/* devices.c */ +void register_devices(int ac, char **av); +u_int device_count(void); +void device_active_nslist(uint32_t nsid, struct nvme_ns_list *nslist); +bool device_identification_descriptor(uint32_t nsid, void *buf); +bool device_namespace_data(uint32_t nsid, struct nvme_namespace_data *nsdata); +void device_read(uint32_t nsid, uint64_t lba, u_int nlb, + const struct nvmf_capsule *nc); +void device_write(uint32_t nsid, uint64_t lba, u_int nlb, + const struct nvmf_capsule *nc); +void device_flush(uint32_t nsid, const struct nvmf_capsule *nc); + +/* ctl.c */ +void init_ctl_port(const char *subnqn, + const struct nvmf_association_params *params); +void ctl_handoff_qpair(struct nvmf_qpair *qp, + const struct nvmf_fabric_connect_cmd *cmd, + const struct nvmf_fabric_connect_data *data); +void shutdown_ctl_port(const char *subnqn); + +#endif /* !__INTERNAL_H__ */ diff --git a/usr.sbin/nvmfd/io.c b/usr.sbin/nvmfd/io.c new file mode 100644 index 000000000000..be845a8ed784 --- /dev/null +++ b/usr.sbin/nvmfd/io.c @@ -0,0 +1,677 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. 
+ * Written by: John Baldwin <jhb@FreeBSD.org> + */ + +#include <sys/sysctl.h> +#include <err.h> +#include <errno.h> +#include <libnvmf.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "internal.h" + +struct io_controller { + struct controller *c; + + u_int num_io_queues; + u_int active_io_queues; + struct nvmf_qpair **io_qpairs; + int *io_sockets; + + struct nvme_firmware_page fp; + struct nvme_health_information_page hip; + uint16_t partial_dur; + uint16_t partial_duw; + + uint16_t cntlid; + char hostid[16]; + char hostnqn[NVME_NQN_FIELD_SIZE]; +}; + +static struct nvmf_association *io_na; +static pthread_cond_t io_cond; +static pthread_mutex_t io_na_mutex; +static struct io_controller *io_controller; +static const char *nqn; +static char serial[NVME_SERIAL_NUMBER_LENGTH]; + +void +init_io(const char *subnqn) +{ + struct nvmf_association_params aparams; + u_long hostid; + size_t len; + + memset(&aparams, 0, sizeof(aparams)); + aparams.sq_flow_control = !flow_control_disable; + aparams.dynamic_controller_model = true; + aparams.max_admin_qsize = NVME_MAX_ADMIN_ENTRIES; + aparams.max_io_qsize = NVMF_MAX_IO_ENTRIES; + aparams.tcp.pda = 0; + aparams.tcp.header_digests = header_digests; + aparams.tcp.data_digests = data_digests; + aparams.tcp.maxr2t = 1; + aparams.tcp.maxh2cdata = 256 * 1024; + io_na = nvmf_allocate_association(NVMF_TRTYPE_TCP, true, + &aparams); + if (io_na == NULL) + err(1, "Failed to create I/O controller association"); + + nqn = subnqn; + + /* Generate a serial number from the kern.hostid node. */ + len = sizeof(hostid); + if (sysctlbyname("kern.hostid", &hostid, &len, NULL, 0) == -1) + err(1, "sysctl: kern.hostid"); + + nvmf_controller_serial(serial, sizeof(serial), hostid); + + pthread_cond_init(&io_cond, NULL); + pthread_mutex_init(&io_na_mutex, NULL); + + if (kernel_io) + init_ctl_port(subnqn, &aparams); +} + +void +shutdown_io(void) +{ + if (kernel_io) + shutdown_ctl_port(nqn); +} + +static void +handle_get_log_page(struct io_controller *ioc, const struct nvmf_capsule *nc, + const struct nvme_command *cmd) +{ + uint64_t offset; + uint32_t numd; + size_t len; + uint8_t lid; + + lid = le32toh(cmd->cdw10) & 0xff; + numd = le32toh(cmd->cdw10) >> 16 | le32toh(cmd->cdw11) << 16; + offset = le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32; + + if (offset % 3 != 0) + goto error; + + len = (numd + 1) * 4; + + switch (lid) { + case NVME_LOG_ERROR: + { + void *buf; + + if (len % sizeof(struct nvme_error_information_entry) != 0) + goto error; + + buf = calloc(1, len); + nvmf_send_controller_data(nc, buf, len); + free(buf); + return; + } + case NVME_LOG_HEALTH_INFORMATION: + if (len != sizeof(ioc->hip)) + goto error; + + nvmf_send_controller_data(nc, &ioc->hip, sizeof(ioc->hip)); + return; + case NVME_LOG_FIRMWARE_SLOT: + if (len != sizeof(ioc->fp)) + goto error; + + nvmf_send_controller_data(nc, &ioc->fp, sizeof(ioc->fp)); + return; + default: + warnx("Unsupported page %#x for GET_LOG_PAGE\n", lid); + goto error; + } + +error: + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); +} + +static bool +handle_io_identify_command(const struct nvmf_capsule *nc, + const struct nvme_command *cmd) +{ + struct nvme_namespace_data nsdata; + struct nvme_ns_list nslist; + uint32_t nsid; + uint8_t cns; + + cns = le32toh(cmd->cdw10) & 0xFF; + switch (cns) { + case 0: /* Namespace data. 
*/ + if (!device_namespace_data(le32toh(cmd->nsid), &nsdata)) { + nvmf_send_generic_error(nc, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + return (true); + } + + nvmf_send_controller_data(nc, &nsdata, sizeof(nsdata)); + return (true); + case 2: /* Active namespace list. */ + nsid = le32toh(cmd->nsid); + if (nsid >= 0xfffffffe) { + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); + return (true); + } + + device_active_nslist(nsid, &nslist); + nvmf_send_controller_data(nc, &nslist, sizeof(nslist)); + return (true); + case 3: /* Namespace Identification Descriptor list. */ + if (!device_identification_descriptor(le32toh(cmd->nsid), + &nsdata)) { + nvmf_send_generic_error(nc, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + return (true); + } + + nvmf_send_controller_data(nc, &nsdata, sizeof(nsdata)); + return (true); + default: + return (false); + } +} + +static void +handle_set_features(struct io_controller *ioc, const struct nvmf_capsule *nc, + const struct nvme_command *cmd) +{ + struct nvme_completion cqe; + uint8_t fid; + + fid = NVMEV(NVME_FEAT_SET_FID, le32toh(cmd->cdw10)); + switch (fid) { + case NVME_FEAT_NUMBER_OF_QUEUES: + { + uint32_t num_queues; + + if (ioc->num_io_queues != 0) { + nvmf_send_generic_error(nc, + NVME_SC_COMMAND_SEQUENCE_ERROR); + return; + } + + num_queues = le32toh(cmd->cdw11) & 0xffff; + + /* 5.12.1.7: 65535 is invalid. */ + if (num_queues == 65535) + goto error; + + /* Fabrics requires the same number of SQs and CQs. */ + if (le32toh(cmd->cdw11) >> 16 != num_queues) + goto error; + + /* Convert to 1's based */ + num_queues++; + + /* Lock to synchronize with handle_io_qpair. */ + pthread_mutex_lock(&io_na_mutex); + ioc->num_io_queues = num_queues; + ioc->io_qpairs = calloc(num_queues, sizeof(*ioc->io_qpairs)); + ioc->io_sockets = calloc(num_queues, sizeof(*ioc->io_sockets)); + pthread_mutex_unlock(&io_na_mutex); + + nvmf_init_cqe(&cqe, nc, 0); + cqe.cdw0 = cmd->cdw11; + nvmf_send_response(nc, &cqe); + return; + } + case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + { + uint32_t aer_mask; + + aer_mask = le32toh(cmd->cdw11); + + /* Check for any reserved or unimplemented feature bits. */ + if ((aer_mask & 0xffffc000) != 0) + goto error; + + /* No AERs are generated by this daemon. */ + nvmf_send_success(nc); + return; + } + default: + warnx("Unsupported feature ID %u for SET_FEATURES", fid); + goto error; + } + +error: + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); +} + +static bool +admin_command(const struct nvmf_capsule *nc, const struct nvme_command *cmd, + void *arg) +{ + struct io_controller *ioc = arg; + + switch (cmd->opc) { + case NVME_OPC_GET_LOG_PAGE: + handle_get_log_page(ioc, nc, cmd); + return (true); + case NVME_OPC_IDENTIFY: + return (handle_io_identify_command(nc, cmd)); + case NVME_OPC_SET_FEATURES: + handle_set_features(ioc, nc, cmd); + return (true); + case NVME_OPC_ASYNC_EVENT_REQUEST: + /* Ignore and never complete. */ + return (true); + case NVME_OPC_KEEP_ALIVE: + nvmf_send_success(nc); + return (true); + default: + return (false); + } +} + +static void +handle_admin_qpair(struct io_controller *ioc) +{ + pthread_setname_np(pthread_self(), "admin queue"); + + controller_handle_admin_commands(ioc->c, admin_command, ioc); + + pthread_mutex_lock(&io_na_mutex); + for (u_int i = 0; i < ioc->num_io_queues; i++) { + if (ioc->io_qpairs[i] == NULL || ioc->io_sockets[i] == -1) + continue; + close(ioc->io_sockets[i]); + ioc->io_sockets[i] = -1; + } + + /* Wait for I/O threads to notice. 
*/ + while (ioc->active_io_queues > 0) + pthread_cond_wait(&io_cond, &io_na_mutex); + + io_controller = NULL; + pthread_mutex_unlock(&io_na_mutex); + + free_controller(ioc->c); + + free(ioc); +} + +static bool +handle_io_fabrics_command(const struct nvmf_capsule *nc, + const struct nvmf_fabric_cmd *fc) +{ + switch (fc->fctype) { + case NVMF_FABRIC_COMMAND_CONNECT: + warnx("CONNECT command on connected queue"); + nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR); + break; + case NVMF_FABRIC_COMMAND_DISCONNECT: + { + const struct nvmf_fabric_disconnect_cmd *dis = + (const struct nvmf_fabric_disconnect_cmd *)fc; + if (dis->recfmt != htole16(0)) { + nvmf_send_error(nc, NVME_SCT_COMMAND_SPECIFIC, + NVMF_FABRIC_SC_INCOMPATIBLE_FORMAT); + break; + } + nvmf_send_success(nc); + return (true); + } + default: + warnx("Unsupported fabrics command %#x", fc->fctype); + nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE); + break; + } + + return (false); +} + +static void +hip_add(uint64_t pair[2], uint64_t addend) +{ + uint64_t old, new; + + old = le64toh(pair[0]); + new = old + addend; + pair[0] = htole64(new); + if (new < old) + pair[1] += htole64(1); +} + +static uint64_t +cmd_lba(const struct nvme_command *cmd) +{ + return ((uint64_t)le32toh(cmd->cdw11) << 32 | le32toh(cmd->cdw10)); +} + +static u_int +cmd_nlb(const struct nvme_command *cmd) +{ + return ((le32toh(cmd->cdw12) & 0xffff) + 1); +} + +static void +handle_read(struct io_controller *ioc, const struct nvmf_capsule *nc, + const struct nvme_command *cmd) +{ + size_t len; + + len = nvmf_capsule_data_len(nc); + device_read(le32toh(cmd->nsid), cmd_lba(cmd), cmd_nlb(cmd), nc); + hip_add(ioc->hip.host_read_commands, 1); + + len /= 512; + len += ioc->partial_dur; + if (len > 1000) + hip_add(ioc->hip.data_units_read, len / 1000); + ioc->partial_dur = len % 1000; +} + +static void +handle_write(struct io_controller *ioc, const struct nvmf_capsule *nc, + const struct nvme_command *cmd) +{ + size_t len; + + len = nvmf_capsule_data_len(nc); + device_write(le32toh(cmd->nsid), cmd_lba(cmd), cmd_nlb(cmd), nc); + hip_add(ioc->hip.host_write_commands, 1); + + len /= 512; + len += ioc->partial_duw; + if (len > 1000) + hip_add(ioc->hip.data_units_written, len / 1000); + ioc->partial_duw = len % 1000; +} + +static void +handle_flush(const struct nvmf_capsule *nc, const struct nvme_command *cmd) +{ + device_flush(le32toh(cmd->nsid), nc); +} + +static bool +handle_io_commands(struct io_controller *ioc, struct nvmf_qpair *qp) +{ + const struct nvme_command *cmd; + struct nvmf_capsule *nc; + int error; + bool disconnect; + + disconnect = false; + + while (!disconnect) { + error = nvmf_controller_receive_capsule(qp, &nc); + if (error != 0) { + if (error != ECONNRESET) + warnc(error, "Failed to read command capsule"); + break; + } + + cmd = nvmf_capsule_sqe(nc); + + switch (cmd->opc) { + case NVME_OPC_FLUSH: + if (cmd->nsid == htole32(0xffffffff)) { + nvmf_send_generic_error(nc, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + break; + } + handle_flush(nc, cmd); + break; + case NVME_OPC_WRITE: + handle_write(ioc, nc, cmd); + break; + case NVME_OPC_READ: + handle_read(ioc, nc, cmd); + break; + case NVME_OPC_FABRICS_COMMANDS: + disconnect = handle_io_fabrics_command(nc, + (const struct nvmf_fabric_cmd *)cmd); + break; + default: + warnx("Unsupported NVM opcode %#x", cmd->opc); + nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE); + break; + } + nvmf_free_capsule(nc); + } + + return (disconnect); +} + +static void +handle_io_qpair(struct io_controller *ioc, struct 
nvmf_qpair *qp, int qid) +{ + char name[64]; + bool disconnect; + + snprintf(name, sizeof(name), "I/O queue %d", qid); + pthread_setname_np(pthread_self(), name); + + disconnect = handle_io_commands(ioc, qp); + + pthread_mutex_lock(&io_na_mutex); + if (disconnect) + ioc->io_qpairs[qid - 1] = NULL; + if (ioc->io_sockets[qid - 1] != -1) { + close(ioc->io_sockets[qid - 1]); + ioc->io_sockets[qid - 1] = -1; + } + ioc->active_io_queues--; + if (ioc->active_io_queues == 0) + pthread_cond_broadcast(&io_cond); + pthread_mutex_unlock(&io_na_mutex); +} + +static void +connect_admin_qpair(int s, struct nvmf_qpair *qp, struct nvmf_capsule *nc, + const struct nvmf_fabric_connect_data *data) +{ + struct nvme_controller_data cdata; + struct io_controller *ioc; + int error; + + /* Can only have one active I/O controller at a time. */ + pthread_mutex_lock(&io_na_mutex); + if (io_controller != NULL) { + pthread_mutex_unlock(&io_na_mutex); + nvmf_send_error(nc, NVME_SCT_COMMAND_SPECIFIC, + NVMF_FABRIC_SC_CONTROLLER_BUSY); + goto error; + } + + error = nvmf_finish_accept(nc, 2); + if (error != 0) { + pthread_mutex_unlock(&io_na_mutex); + warnc(error, "Failed to send CONNECT response"); + goto error; + } + + ioc = calloc(1, sizeof(*ioc)); + ioc->cntlid = 2; + memcpy(ioc->hostid, data->hostid, sizeof(ioc->hostid)); + memcpy(ioc->hostnqn, data->hostnqn, sizeof(ioc->hostnqn)); + + nvmf_init_io_controller_data(qp, serial, nqn, device_count(), + NVMF_IOCCSZ, &cdata); + + ioc->fp.afi = NVMEF(NVME_FIRMWARE_PAGE_AFI_SLOT, 1); + memcpy(ioc->fp.revision[0], cdata.fr, sizeof(cdata.fr)); + + ioc->hip.power_cycles[0] = 1; + + ioc->c = init_controller(qp, &cdata); + + io_controller = ioc; + pthread_mutex_unlock(&io_na_mutex); + + nvmf_free_capsule(nc); + + handle_admin_qpair(ioc); + close(s); + return; + +error: + nvmf_free_capsule(nc); + close(s); +} + +static void +connect_io_qpair(int s, struct nvmf_qpair *qp, struct nvmf_capsule *nc, + const struct nvmf_fabric_connect_data *data, uint16_t qid) +{ + struct io_controller *ioc; + int error; + + pthread_mutex_lock(&io_na_mutex); + if (io_controller == NULL) { + pthread_mutex_unlock(&io_na_mutex); + warnx("Attempt to create I/O qpair without admin qpair"); + nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR); + goto error; + } + + if (memcmp(io_controller->hostid, data->hostid, + sizeof(data->hostid)) != 0) { + pthread_mutex_unlock(&io_na_mutex); + warnx("hostid mismatch for I/O qpair CONNECT"); + nvmf_connect_invalid_parameters(nc, true, + offsetof(struct nvmf_fabric_connect_data, hostid)); + goto error; + } + if (le16toh(data->cntlid) != io_controller->cntlid) { + pthread_mutex_unlock(&io_na_mutex); + warnx("cntlid mismatch for I/O qpair CONNECT"); + nvmf_connect_invalid_parameters(nc, true, + offsetof(struct nvmf_fabric_connect_data, cntlid)); + goto error; + } + if (memcmp(io_controller->hostnqn, data->hostnqn, + sizeof(data->hostid)) != 0) { + pthread_mutex_unlock(&io_na_mutex); + warnx("host NQN mismatch for I/O qpair CONNECT"); + nvmf_connect_invalid_parameters(nc, true, + offsetof(struct nvmf_fabric_connect_data, hostnqn)); + goto error; + } + + if (io_controller->num_io_queues == 0) { + pthread_mutex_unlock(&io_na_mutex); + warnx("Attempt to create I/O qpair without enabled queues"); + nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR); + goto error; + } + if (qid > io_controller->num_io_queues) { + pthread_mutex_unlock(&io_na_mutex); + warnx("Attempt to create invalid I/O qpair %u", qid); + nvmf_connect_invalid_parameters(nc, false, + 
offsetof(struct nvmf_fabric_connect_cmd, qid)); + goto error; + } + if (io_controller->io_qpairs[qid - 1] != NULL) { + pthread_mutex_unlock(&io_na_mutex); + warnx("Attempt to re-create I/O qpair %u", qid); + nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR); + goto error; + } + + error = nvmf_finish_accept(nc, io_controller->cntlid); + if (error != 0) { + pthread_mutex_unlock(&io_na_mutex); + warnc(error, "Failed to send CONNECT response"); + goto error; + } + + ioc = io_controller; + ioc->active_io_queues++; + ioc->io_qpairs[qid - 1] = qp; + ioc->io_sockets[qid - 1] = s; + pthread_mutex_unlock(&io_na_mutex); + + nvmf_free_capsule(nc); + + handle_io_qpair(ioc, qp, qid); + return; + +error: + nvmf_free_capsule(nc); + close(s); +} + +static void * +io_socket_thread(void *arg) +{ + struct nvmf_fabric_connect_data data; + struct nvmf_qpair_params qparams; + const struct nvmf_fabric_connect_cmd *cmd; + struct nvmf_capsule *nc; + struct nvmf_qpair *qp; + int s; + + pthread_detach(pthread_self()); + + s = (intptr_t)arg; + memset(&qparams, 0, sizeof(qparams)); + qparams.tcp.fd = s; + + nc = NULL; + qp = nvmf_accept(io_na, &qparams, &nc, &data); + if (qp == NULL) { + warnx("Failed to create I/O qpair: %s", + nvmf_association_error(io_na)); + goto error; + } + + if (kernel_io) { + ctl_handoff_qpair(qp, nvmf_capsule_sqe(nc), &data); + goto error; + } + + if (strcmp(data.subnqn, nqn) != 0) { + warn("I/O qpair with invalid SubNQN: %.*s", + (int)sizeof(data.subnqn), data.subnqn); + nvmf_connect_invalid_parameters(nc, true, + offsetof(struct nvmf_fabric_connect_data, subnqn)); + goto error; + } + + /* Is this an admin or I/O queue pair? */ + cmd = nvmf_capsule_sqe(nc); + if (cmd->qid == 0) + connect_admin_qpair(s, qp, nc, &data); + else + connect_io_qpair(s, qp, nc, &data, le16toh(cmd->qid)); + nvmf_free_qpair(qp); + return (NULL); + +error: + if (nc != NULL) + nvmf_free_capsule(nc); + if (qp != NULL) + nvmf_free_qpair(qp); + close(s); + return (NULL); +} + +void +handle_io_socket(int s) +{ + pthread_t thr; + int error; + + error = pthread_create(&thr, NULL, io_socket_thread, + (void *)(uintptr_t)s); + if (error != 0) { + warnc(error, "Failed to create I/O qpair thread"); + close(s); + } +} diff --git a/usr.sbin/nvmfd/nvmfd.8 b/usr.sbin/nvmfd/nvmfd.8 new file mode 100644 index 000000000000..689ac6d4dda1 --- /dev/null +++ b/usr.sbin/nvmfd/nvmfd.8 @@ -0,0 +1,126 @@ +.\" +.\" SPDX-License-Identifier: BSD-2-Clause +.\" +.\" Copyright (c) 2024 Chelsio Communications, Inc. +.\" +.Dd May 2, 2024 +.Dt NVMFD 8 +.Os +.Sh NAME +.Nm nvmfd +.Nd "NVMeoF controller daemon" +.Sh SYNOPSIS +.Nm +.Fl K +.Op Fl dFGg +.Op Fl P Ar port +.Op Fl p Ar port +.Op Fl t Ar transport +.Op Fl n Ar subnqn +.Nm +.Op Fl dFGg +.Op Fl P Ar port +.Op Fl p Ar port +.Op Fl t Ar transport +.Op Fl n Ar subnqn +.Ar device +.Op Ar device ... +.Sh DESCRIPTION +.Nm +accepts incoming NVMeoF connections for both I/O and discovery controllers. +.Nm +can either implement a single dynamic I/O controller in user mode or hand +off incoming I/O controller connections to +.Xr nvmft 4 . +A dynamic discovery controller service is always provided in user mode. +.Pp +The following options are available: +.Bl -tag -width "-t transport" +.It Fl F +Permit remote hosts to disable SQ flow control. +.It Fl G +Permit remote hosts to enable PDU data digests for the TCP transport. +.It Fl g +Permit remote hosts to enable PDU header digests for the TCP transport. +.It Fl K +Enable kernel mode which hands off incoming I/O controller connections to +.Xr nvmft 4 . 
+.It Fl P Ar port +Use +.Ar port +as the listen TCP port for the discovery controller service. +The default value is 8009. +.It Fl d +Enable debug mode. +The daemon sends any errors to standard output and does not place +itself in the background. +.It Fl p Ar port +Use +.Ar port +as the listen TCP port for the I/O controller service. +By default an unused ephemeral port will be chosen. +.It Fl n Ar subnqn +The Subsystem NVMe Qualified Name for the I/O controller. +If an explicit NQN is not given, a default value is generated from the +current host's UUID obtained from the +.Vt kern.hostuuid +sysctl. +.It Fl t Ar transport +The transport type to use. +The default transport is +.Dq tcp . +.It Ar device +When implementing a user mode I/O controller, +one or more +.Ar device +arguments must be specified. +Each +.Ar device +describes the backing store for a namespace exported to remote hosts. +Devices can be specified using one of the following syntaxes: +.Bl -tag -width "ramdisk:size" +.It Pa pathname +File or disk device +.It ramdisk : Ns Ar size +Allocate a memory disk with the given +.Ar size . +.Ar size +may use any of the suffixes supported by +.Xr expand_number 3 . +.El +.El +.Sh FILES +.Bl -tag -width "/var/run/nvmfd.pid" -compact +.It Pa /var/run/nvmfd.pid +The default location of the +.Nm +PID file. +.El +.Sh EXIT STATUS +.Ex -std +.Sh SEE ALSO +.Xr ctl 4 , +.Xr nvmft 4 , +.Xr ctladm 8 , +.Xr ctld 8 +.Sh HISTORY +The +.Nm +module first appeared in +.Fx 15.0 . +.Sh AUTHORS +The +.Nm +subsystem was developed by +.An John Baldwin Aq Mt jhb@FreeBSD.org +under sponsorship from Chelsio Communications, Inc. +.Sh BUGS +The discovery controller and kernel mode functionality of +.Nm +should be merged into +.Xr ctld 8 . +.Pp +Additional paramters such as +.Va MAXR2T , +.Va MAXH2CDATA , +and queue sizes should be configurable. diff --git a/usr.sbin/nvmfd/nvmfd.c b/usr.sbin/nvmfd/nvmfd.c new file mode 100644 index 000000000000..6fce21b07b74 --- /dev/null +++ b/usr.sbin/nvmfd/nvmfd.c @@ -0,0 +1,260 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. 
+ * Written by: John Baldwin <jhb@FreeBSD.org> + */ + +#include <sys/param.h> +#include <sys/event.h> +#include <sys/linker.h> +#include <sys/module.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <assert.h> +#include <err.h> +#include <errno.h> +#include <libnvmf.h> +#include <libutil.h> +#include <netdb.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "internal.h" + +bool data_digests = false; +bool header_digests = false; +bool flow_control_disable = false; +bool kernel_io = false; + +static const char *subnqn; +static volatile bool quit = false; + +static void +usage(void) +{ + fprintf(stderr, "nvmfd -K [-FGg] [-P port] [-p port] [-t transport] [-n subnqn]\n" + "nvmfd [-dDFH] [-P port] [-p port] [-t transport] [-n subnqn]\n" + "\tdevice [device [...]]\n" + "\n" + "Devices use one of the following syntaxes:\n" + "\tpathame - file or disk device\n" + "\tramdisk:size - memory disk of given size\n"); + exit(1); +} + +static void +handle_sig(int sig __unused) +{ + quit = true; +} + +static void +register_listen_socket(int kqfd, int s, void *udata) +{ + struct kevent kev; + + if (listen(s, -1) != 0) + err(1, "listen"); + + EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, udata); + if (kevent(kqfd, &kev, 1, NULL, 0, NULL) == -1) + err(1, "kevent: failed to add listen socket"); +} + +static void +create_passive_sockets(int kqfd, const char *port, bool discovery) +{ + struct addrinfo hints, *ai, *list; + bool created; + int error, s; + + memset(&hints, 0, sizeof(hints)); + hints.ai_flags = AI_PASSIVE; + hints.ai_family = AF_UNSPEC; + hints.ai_protocol = IPPROTO_TCP; + error = getaddrinfo(NULL, port, &hints, &list); + if (error != 0) + errx(1, "%s", gai_strerror(error)); + created = false; + + for (ai = list; ai != NULL; ai = ai->ai_next) { + s = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); + if (s == -1) + continue; + + if (bind(s, ai->ai_addr, ai->ai_addrlen) != 0) { + close(s); + continue; + } + + if (discovery) { + register_listen_socket(kqfd, s, (void *)1); + } else { + register_listen_socket(kqfd, s, (void *)2); + discovery_add_io_controller(s, subnqn); + } + created = true; + } + + freeaddrinfo(list); + if (!created) + err(1, "Failed to create any listen sockets"); +} + +static void +handle_connections(int kqfd) +{ + struct kevent ev; + int s; + + signal(SIGHUP, handle_sig); + signal(SIGINT, handle_sig); + signal(SIGQUIT, handle_sig); + signal(SIGTERM, handle_sig); + + while (!quit) { + if (kevent(kqfd, NULL, 0, &ev, 1, NULL) == -1) { + if (errno == EINTR) + continue; + err(1, "kevent"); + } + + assert(ev.filter == EVFILT_READ); + + s = accept(ev.ident, NULL, NULL); + if (s == -1) { + warn("accept"); + continue; + } + + switch ((uintptr_t)ev.udata) { + case 1: + handle_discovery_socket(s); + break; + case 2: + handle_io_socket(s); + break; + default: + __builtin_unreachable(); + } + } +} + +int +main(int ac, char **av) +{ + struct pidfh *pfh; + const char *dport, *ioport, *transport; + pid_t pid; + int ch, error, kqfd; + bool daemonize; + static char nqn[NVMF_NQN_MAX_LEN]; + + /* 7.4.9.3 Default port for discovery */ + dport = "8009"; + + pfh = NULL; + daemonize = true; + ioport = "0"; + subnqn = NULL; + transport = "tcp"; + while ((ch = getopt(ac, av, "dFgGKn:P:p:t:")) != -1) { + switch (ch) { + case 'd': + daemonize = false; + break; + case 'F': + flow_control_disable = true; + break; + case 'G': + data_digests = true; + break; + case 'g': + header_digests = true; + break; + case 'K': + kernel_io 
= true; + break; + case 'n': + subnqn = optarg; + break; + case 'P': + dport = optarg; + break; + case 'p': + ioport = optarg; + break; + case 't': + transport = optarg; + break; + default: + usage(); + } + } + + av += optind; + ac -= optind; + + if (kernel_io) { + if (ac > 0) + usage(); + if (modfind("nvmft") == -1 && kldload("nvmft") == -1) + warn("couldn't load nvmft"); + } else { + if (ac < 1) + usage(); + } + + if (strcasecmp(transport, "tcp") == 0) { + } else + errx(1, "Invalid transport %s", transport); + + if (subnqn == NULL) { + error = nvmf_nqn_from_hostuuid(nqn); + if (error != 0) + errc(1, error, "Failed to generate NQN"); + subnqn = nqn; + } + + if (!kernel_io) + register_devices(ac, av); + + init_discovery(); + init_io(subnqn); + + if (daemonize) { + pfh = pidfile_open(NULL, 0600, &pid); + if (pfh == NULL) { + if (errno == EEXIST) + errx(1, "Daemon already running, pid: %jd", + (intmax_t)pid); + warn("Cannot open or create pidfile"); + } + + if (daemon(0, 0) != 0) { + pidfile_remove(pfh); + err(1, "Failed to fork into the background"); + } + + pidfile_write(pfh); + } + + kqfd = kqueue(); + if (kqfd == -1) { + pidfile_remove(pfh); + err(1, "kqueue"); + } + + create_passive_sockets(kqfd, dport, true); + create_passive_sockets(kqfd, ioport, false); + + handle_connections(kqfd); + shutdown_io(); + if (pfh != NULL) + pidfile_remove(pfh); + return (0); +} |