Diffstat (limited to 'sys/dev/cxgbe/nvmf/nvmf_che.c')
| -rw-r--r-- | sys/dev/cxgbe/nvmf/nvmf_che.c | 3331 |
1 file changed, 3331 insertions, 0 deletions
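Aside (not part of the commit): the new file splits receive-side buffers across two paths keyed off a 16-bit tag. Tags with the most-significant bit set are "unallocated" freelist tags that bypass DDP, while the remaining values pack an STAG index together with a 4-bit color. The short standalone program below copies the tag macros verbatim from the diff and round-trips both encodings; the sample index, color, and tag values are arbitrary.

/*
 * Standalone sketch, not part of the commit: round-trips the transfer-tag
 * encoding defined in nvmf_che.c.  Keeping bit 15 clear for DDP tags leaves
 * an 11-bit STAG index above the 4-bit color.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Macros copied from the diff. */
#define CHE_FL_TAG_MASK		0x8000
#define CHE_MAX_FL_TAG		0x7fff
#define CHE_NUM_FL_TAGS		(CHE_MAX_FL_TAG + 1)

#define CHE_TAG_IS_FL(ttag)	(((ttag) & CHE_FL_TAG_MASK) == CHE_FL_TAG_MASK)
#define CHE_RAW_FL_TAG(ttag)	((ttag) & ~CHE_FL_TAG_MASK)
#define CHE_DDP_TAG(stag_idx, color) ((stag_idx) << 4 | (color))
#define CHE_STAG_COLOR(stag)	((stag) & 0xf)
#define CHE_STAG_IDX(stag)	((stag) >> 4)
#define CHE_DDP_MAX_COLOR	0xf

int
main(void)
{
	uint16_t fl = 0x0123 | CHE_FL_TAG_MASK;	/* arbitrary freelist tag */
	uint16_t ddp = CHE_DDP_TAG(0x07, 0x3);	/* arbitrary STAG index 7, color 3 */

	/* Freelist tags have the MSB set; the raw tag is the low 15 bits. */
	assert(CHE_TAG_IS_FL(fl) && CHE_RAW_FL_TAG(fl) == 0x0123);

	/* DDP tags decode back into the index and color they were built from. */
	assert(!CHE_TAG_IS_FL(ddp));
	assert(CHE_STAG_IDX(ddp) == 0x07 && CHE_STAG_COLOR(ddp) == 0x3);

	printf("fl tag 0x%04x -> raw 0x%04x\n", fl, CHE_RAW_FL_TAG(fl));
	printf("ddp tag 0x%04x -> idx 0x%x color 0x%x\n", ddp,
	    CHE_STAG_IDX(ddp), CHE_STAG_COLOR(ddp));
	return (0);
}

In the driver itself the DDP index range is further capped by the ddp_tags_per_qp sysctl described in the diff, and CHE_DDP_NO_TAG (0xffff) is the sentinel returned when no DDP tag can be allocated.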
diff --git a/sys/dev/cxgbe/nvmf/nvmf_che.c b/sys/dev/cxgbe/nvmf/nvmf_che.c new file mode 100644 index 000000000000..5c2174b8a40b --- /dev/null +++ b/sys/dev/cxgbe/nvmf/nvmf_che.c @@ -0,0 +1,3331 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Chelsio Communications, Inc. + * Written by: John Baldwin <jhb@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "opt_inet.h" + +#include <sys/param.h> +#include <sys/libkern.h> +#include <sys/kernel.h> +#include <sys/module.h> + +#ifdef TCP_OFFLOAD +#include <sys/bitset.h> +#include <sys/capsicum.h> +#include <sys/file.h> +#include <sys/kthread.h> +#include <sys/ktr.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/nv.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/tcp_var.h> +#include <netinet/toecore.h> + +#include <dev/nvmf/nvmf.h> +#include <dev/nvmf/nvmf_proto.h> +#include <dev/nvmf/nvmf_tcp.h> +#include <dev/nvmf/nvmf_transport.h> +#include <dev/nvmf/nvmf_transport_internal.h> + +#include <vm/pmap.h> +#include <vm/vm_page.h> + +#include "common/common.h" +#include "common/t4_regs.h" +#include "common/t4_tcb.h" +#include "tom/t4_tom.h" + +/* Status code values in CPL_NVMT_CMP. */ +#define CMP_STATUS_ERROR_MASK 0x7f +#define CMP_STATUS_NO_ERROR 0 +#define CMP_STATUS_HEADER_DIGEST 1 +#define CMP_STATUS_DIRECTION_MISMATCH 2 +#define CMP_STATUS_DIGEST_FLAG_MISMATCH 3 +#define CMP_STATUS_SUCCESS_NOT_LAST 4 +#define CMP_STATUS_BAD_DATA_LENGTH 5 +#define CMP_STATUS_USER_MODE_UNALLOCATED 6 +#define CMP_STATUS_RQT_LIMIT 7 +#define CMP_STATUS_RQT_WRAP 8 +#define CMP_STATUS_RQT_BOUND 9 +#define CMP_STATUS_TPT_LIMIT 16 +#define CMP_STATUS_TPT_INVALID 17 +#define CMP_STATUS_TPT_COLOUR_MISMATCH 18 +#define CMP_STATUS_TPT_MISC 19 +#define CMP_STATUS_TPT_WRAP 20 +#define CMP_STATUS_TPT_BOUND 21 +#define CMP_STATUS_TPT_LAST_PDU_UNALIGNED 22 +#define CMP_STATUS_PBL_LIMIT 24 +#define CMP_STATUS_DATA_DIGEST 25 +#define CMP_STATUS_DDP 0x80 + +/* + * Transfer tags and CIDs with the MSB set are "unallocated" tags that + * pass data through to the freelist without using DDP. 
+ */ +#define CHE_FL_TAG_MASK 0x8000 +#define CHE_MAX_FL_TAG 0x7fff +#define CHE_NUM_FL_TAGS (CHE_MAX_FL_TAG + 1) + +#define CHE_TAG_IS_FL(ttag) (((ttag) & CHE_FL_TAG_MASK) == CHE_FL_TAG_MASK) +#define CHE_RAW_FL_TAG(ttag) ((ttag) & ~CHE_FL_TAG_MASK) +#define CHE_DDP_TAG(stag_idx, color) ((stag_idx) << 4 | (color)) +#define CHE_STAG_COLOR(stag) ((stag) & 0xf) +#define CHE_STAG_IDX(stag) ((stag) >> 4) +#define CHE_DDP_MAX_COLOR 0xf + +#define CHE_DDP_NO_TAG 0xffff + +/* + * A bitmap of non-DDP CIDs in use on the host. Since there is no + * _BIT_FFC (find first clear), the bitset is inverted so that a clear + * bit indicates an in-use CID. + */ +BITSET_DEFINE(fl_cid_set, CHE_NUM_FL_TAGS); +#define FL_CID_INIT(p) __BIT_FILL(CHE_NUM_FL_TAGS, p) +#define FL_CID_BUSY(n, p) __BIT_CLR(CHE_NUM_FL_TAGS, n, p) +#define FL_CID_ISACTIVE(n, p) !__BIT_ISSET(CHE_NUM_FL_TAGS, n, p) +#define FL_CID_FREE(n, p) __BIT_SET(CHE_NUM_FL_TAGS, n, p) +#define FL_CID_FINDFREE_AT(p, start) __BIT_FFS_AT(CHE_NUM_FL_TAGS, p, start) + +/* + * The TCP sequence number of both CPL_NVMT_DATA and CPL_NVMT_CMP + * mbufs are saved here while the mbuf is in qp->rx_data and qp->rx_pdus. + */ +#define nvmf_tcp_seq PH_loc.thirtytwo[0] + +/* + * The CPL status of CPL_NVMT_CMP mbufs are saved here while the mbuf + * is in qp->rx_pdus. + */ +#define nvmf_cpl_status PH_loc.eight[4] + +struct nvmf_che_capsule; +struct nvmf_che_qpair; + +struct nvmf_che_adapter { + struct adapter *sc; + + u_int ddp_threshold; + u_int max_transmit_pdu; + u_int max_receive_pdu; + bool nvmt_data_iqe; + + struct sysctl_ctx_list ctx; /* from uld_activate to deactivate */ +}; + +struct nvmf_che_command_buffer { + struct nvmf_che_qpair *qp; + + struct nvmf_io_request io; + size_t data_len; + size_t data_xfered; + uint32_t data_offset; + + u_int refs; + int error; + + bool ddp_ok; + uint16_t cid; + uint16_t ttag; + uint16_t original_cid; /* Host only */ + + TAILQ_ENTRY(nvmf_che_command_buffer) link; + + /* Fields used for DDP. */ + struct fw_ri_tpte tpte; + uint64_t *pbl; + uint32_t pbl_addr; + uint32_t pbl_len; + + /* Controller only */ + struct nvmf_che_capsule *cc; +}; + +struct nvmf_che_command_buffer_list { + TAILQ_HEAD(, nvmf_che_command_buffer) head; + struct mtx lock; +}; + +struct nvmf_che_qpair { + struct nvmf_qpair qp; + + struct socket *so; + struct toepcb *toep; + struct nvmf_che_adapter *nca; + + volatile u_int refs; /* Every allocated capsule holds a reference */ + uint8_t txpda; + uint8_t rxpda; + bool header_digests; + bool data_digests; + uint32_t maxr2t; + uint32_t maxh2cdata; /* Controller only */ + uint32_t max_rx_data; + uint32_t max_tx_data; + uint32_t max_icd; /* Host only */ + uint32_t max_ioccsz; /* Controller only */ + union { + uint16_t next_fl_ttag; /* Controller only */ + uint16_t next_cid; /* Host only */ + }; + uint16_t next_ddp_tag; + u_int num_fl_ttags; /* Controller only */ + u_int active_fl_ttags; /* Controller only */ + u_int num_ddp_tags; + u_int active_ddp_tags; + bool send_success; /* Controller only */ + uint8_t ddp_color; + uint32_t tpt_offset; + + /* Receive state. */ + struct thread *rx_thread; + struct cv rx_cv; + bool rx_shutdown; + int rx_error; + struct mbufq rx_data; /* Data received via CPL_NVMT_DATA. */ + struct mbufq rx_pdus; /* PDU headers received via CPL_NVMT_CMP. */ + + /* Transmit state. 
*/ + struct thread *tx_thread; + struct cv tx_cv; + bool tx_shutdown; + STAILQ_HEAD(, nvmf_che_capsule) tx_capsules; + + struct nvmf_che_command_buffer_list tx_buffers; + struct nvmf_che_command_buffer_list rx_buffers; + + /* + * For the controller, an RX command buffer can be in one of + * three locations, all protected by the rx_buffers.lock. If + * a receive request is waiting for either an R2T slot for its + * command (due to exceeding MAXR2T), or a transfer tag it is + * placed on the rx_buffers list. When a request is allocated + * an active transfer tag, it moves to either the + * open_ddp_tags[] or open_fl_ttags[] array (indexed by the + * tag) until it completes. + * + * For the host, an RX command buffer using DDP is in + * open_ddp_tags[], otherwise it is in rx_buffers. + */ + struct nvmf_che_command_buffer **open_ddp_tags; + struct nvmf_che_command_buffer **open_fl_ttags; /* Controller only */ + + /* + * For the host, CIDs submitted by nvmf(4) must be rewritten + * to either use DDP or not use DDP. The CID in response + * capsules must be restored to their original value. For + * DDP, the original CID is stored in the command buffer. + * These variables manage non-DDP CIDs. + */ + uint16_t *fl_cids; /* Host only */ + struct fl_cid_set *fl_cid_set; /* Host only */ + struct mtx fl_cid_lock; /* Host only */ +}; + +struct nvmf_che_rxpdu { + struct mbuf *m; + const struct nvme_tcp_common_pdu_hdr *hdr; + uint32_t data_len; + bool data_digest_mismatch; + bool ddp; +}; + +struct nvmf_che_capsule { + struct nvmf_capsule nc; + + volatile u_int refs; + + struct nvmf_che_rxpdu rx_pdu; + + uint32_t active_r2ts; /* Controller only */ +#ifdef INVARIANTS + uint32_t tx_data_offset; /* Controller only */ + u_int pending_r2ts; /* Controller only */ +#endif + + STAILQ_ENTRY(nvmf_che_capsule) link; +}; + +#define CCAP(nc) ((struct nvmf_che_capsule *)(nc)) +#define CQP(qp) ((struct nvmf_che_qpair *)(qp)) + +static void che_release_capsule(struct nvmf_che_capsule *cc); +static void che_free_qpair(struct nvmf_qpair *nq); + +SYSCTL_NODE(_kern_nvmf, OID_AUTO, che, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + "Chelsio TCP offload transport"); + +static u_int che_max_transmit_pdu = 32 * 1024; +SYSCTL_UINT(_kern_nvmf_che, OID_AUTO, max_transmit_pdu, CTLFLAG_RWTUN, + &che_max_transmit_pdu, 0, + "Maximum size of a transmitted PDU"); + +static u_int che_max_receive_pdu = 32 * 1024; +SYSCTL_UINT(_kern_nvmf_che, OID_AUTO, max_receive_pdu, CTLFLAG_RWTUN, + &che_max_receive_pdu, 0, + "Maximum size of a received PDU"); + +static int use_dsgl = 1; +SYSCTL_INT(_kern_nvmf_che, OID_AUTO, use_dsgl, CTLFLAG_RWTUN, &use_dsgl, 0, + "Use DSGL for PBL/FastReg (default=1)"); + +static int inline_threshold = 256; +SYSCTL_INT(_kern_nvmf_che, OID_AUTO, inline_threshold, CTLFLAG_RWTUN, + &inline_threshold, 0, + "inline vs dsgl threshold (default=256)"); + +static int ddp_tags_per_qp = 128; +SYSCTL_INT(_kern_nvmf_che, OID_AUTO, ddp_tags_per_qp, CTLFLAG_RWTUN, + &ddp_tags_per_qp, 0, + "Number of DDP tags to reserve for each queue pair"); + +static MALLOC_DEFINE(M_NVMF_CHE, "nvmf_che", "Chelsio NVMe-TCP offload"); + +/* + * PBL regions consist of N full-sized pages. TPT entries support an + * initial offset into the first page (FBO) and can handle a partial + * length on the last page. 
+ */ +static bool +che_ddp_io_check(struct nvmf_che_qpair *qp, const struct nvmf_io_request *io) +{ + const struct memdesc *mem = &io->io_mem; + struct bus_dma_segment *ds; + int i; + + if (io->io_len < qp->nca->ddp_threshold) { + return (false); + } + + switch (mem->md_type) { + case MEMDESC_VADDR: + case MEMDESC_PADDR: + case MEMDESC_VMPAGES: + return (true); + case MEMDESC_VLIST: + case MEMDESC_PLIST: + /* + * Require all but the first segment to start on a + * page boundary. Require all but the last segment to + * end on a page boundary. + */ + ds = mem->u.md_list; + for (i = 0; i < mem->md_nseg; i++, ds++) { + if (i != 0 && ds->ds_addr % PAGE_SIZE != 0) + return (false); + if (i != mem->md_nseg - 1 && + (ds->ds_addr + ds->ds_len) % PAGE_SIZE != 0) + return (false); + } + return (true); + default: + /* + * Other types could be validated with more work, but + * they aren't used currently by nvmf(4) or nvmft(4). + */ + return (false); + } +} + +static u_int +che_fbo(struct nvmf_che_command_buffer *cb) +{ + struct memdesc *mem = &cb->io.io_mem; + + switch (mem->md_type) { + case MEMDESC_VADDR: + return ((uintptr_t)mem->u.md_vaddr & PAGE_MASK); + case MEMDESC_PADDR: + return (mem->u.md_paddr & PAGE_MASK); + case MEMDESC_VMPAGES: + return (mem->md_offset); + case MEMDESC_VLIST: + case MEMDESC_PLIST: + return (mem->u.md_list[0].ds_addr & PAGE_MASK); + default: + __assert_unreachable(); + } +} + +static u_int +che_npages(struct nvmf_che_command_buffer *cb) +{ + return (howmany(che_fbo(cb) + cb->io.io_len, PAGE_SIZE)); +} + +static struct nvmf_che_command_buffer * +che_alloc_command_buffer(struct nvmf_che_qpair *qp, + const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len, + uint16_t cid) +{ + struct nvmf_che_command_buffer *cb; + + cb = malloc(sizeof(*cb), M_NVMF_CHE, M_WAITOK); + cb->qp = qp; + cb->io = *io; + cb->data_offset = data_offset; + cb->data_len = data_len; + cb->data_xfered = 0; + refcount_init(&cb->refs, 1); + cb->error = 0; + cb->ddp_ok = che_ddp_io_check(qp, io); + cb->cid = cid; + cb->ttag = 0; + cb->original_cid = 0; + cb->cc = NULL; + cb->pbl = NULL; + + return (cb); +} + +static void +che_hold_command_buffer(struct nvmf_che_command_buffer *cb) +{ + refcount_acquire(&cb->refs); +} + +static void +che_free_command_buffer(struct nvmf_che_command_buffer *cb) +{ + nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error); + if (cb->cc != NULL) + che_release_capsule(cb->cc); + MPASS(cb->pbl == NULL); + free(cb, M_NVMF_CHE); +} + +static void +che_release_command_buffer(struct nvmf_che_command_buffer *cb) +{ + if (refcount_release(&cb->refs)) + che_free_command_buffer(cb); +} + +static void +che_add_command_buffer(struct nvmf_che_command_buffer_list *list, + struct nvmf_che_command_buffer *cb) +{ + mtx_assert(&list->lock, MA_OWNED); + TAILQ_INSERT_HEAD(&list->head, cb, link); +} + +static struct nvmf_che_command_buffer * +che_find_command_buffer(struct nvmf_che_command_buffer_list *list, + uint16_t cid) +{ + struct nvmf_che_command_buffer *cb; + + mtx_assert(&list->lock, MA_OWNED); + TAILQ_FOREACH(cb, &list->head, link) { + if (cb->cid == cid) + return (cb); + } + return (NULL); +} + +static void +che_remove_command_buffer(struct nvmf_che_command_buffer_list *list, + struct nvmf_che_command_buffer *cb) +{ + mtx_assert(&list->lock, MA_OWNED); + TAILQ_REMOVE(&list->head, cb, link); +} + +static void +che_purge_command_buffer(struct nvmf_che_command_buffer_list *list, + uint16_t cid) +{ + struct nvmf_che_command_buffer *cb; + + mtx_lock(&list->lock); + cb = 
che_find_command_buffer(list, cid); + if (cb != NULL) { + che_remove_command_buffer(list, cb); + mtx_unlock(&list->lock); + che_release_command_buffer(cb); + } else + mtx_unlock(&list->lock); +} + +static int +che_write_mem_inline(struct adapter *sc, struct toepcb *toep, uint32_t addr, + uint32_t len, void *data, struct mbufq *wrq) +{ + struct mbuf *m; + char *cp; + int copy_len, i, num_wqe, wr_len; + +#ifdef VERBOSE_TRACES + CTR(KTR_CXGBE, "%s: addr 0x%x len %u", __func__, addr << 5, len); +#endif + num_wqe = DIV_ROUND_UP(len, T4_MAX_INLINE_SIZE); + cp = data; + for (i = 0; i < num_wqe; i++) { + copy_len = min(len, T4_MAX_INLINE_SIZE); + wr_len = T4_WRITE_MEM_INLINE_LEN(copy_len); + + m = alloc_raw_wr_mbuf(wr_len); + if (m == NULL) + return (ENOMEM); + t4_write_mem_inline_wr(sc, mtod(m, void *), wr_len, toep->tid, + addr, copy_len, cp, 0); + if (cp != NULL) + cp += T4_MAX_INLINE_SIZE; + addr += T4_MAX_INLINE_SIZE >> 5; + len -= T4_MAX_INLINE_SIZE; + + mbufq_enqueue(wrq, m); + } + return (0); +} + +static int +che_write_mem_dma_aligned(struct adapter *sc, struct toepcb *toep, + uint32_t addr, uint32_t len, void *data, struct mbufq *wrq) +{ + struct mbuf *m; + vm_offset_t va; + u_int todo; + int wr_len; + + /* First page. */ + va = (vm_offset_t)data; + todo = min(PAGE_SIZE - (va % PAGE_SIZE), len); + wr_len = T4_WRITE_MEM_DMA_LEN; + m = alloc_raw_wr_mbuf(wr_len); + if (m == NULL) + return (ENOMEM); + t4_write_mem_dma_wr(sc, mtod(m, void *), wr_len, toep->tid, addr, + todo, pmap_kextract(va), 0); + mbufq_enqueue(wrq, m); + len -= todo; + addr += todo >> 5; + va += todo; + + while (len > 0) { + MPASS(va == trunc_page(va)); + todo = min(PAGE_SIZE, len); + m = alloc_raw_wr_mbuf(wr_len); + if (m == NULL) + return (ENOMEM); + t4_write_mem_dma_wr(sc, mtod(m, void *), wr_len, toep->tid, + addr, todo, pmap_kextract(va), 0); + mbufq_enqueue(wrq, m); + len -= todo; + addr += todo >> 5; + va += todo; + } + return (0); +} + +static int +che_write_adapter_mem(struct nvmf_che_qpair *qp, uint32_t addr, uint32_t len, + void *data) +{ + struct adapter *sc = qp->nca->sc; + struct toepcb *toep = qp->toep; + struct socket *so = qp->so; + struct inpcb *inp = sotoinpcb(so); + struct mbufq mq; + int error; + + mbufq_init(&mq, INT_MAX); + if (!use_dsgl || len < inline_threshold || data == NULL) + error = che_write_mem_inline(sc, toep, addr, len, data, &mq); + else + error = che_write_mem_dma_aligned(sc, toep, addr, len, data, + &mq); + if (__predict_false(error != 0)) + goto error; + + INP_WLOCK(inp); + if ((inp->inp_flags & INP_DROPPED) != 0) { + INP_WUNLOCK(inp); + error = ECONNRESET; + goto error; + } + mbufq_concat(&toep->ulp_pduq, &mq); + INP_WUNLOCK(inp); + return (0); + +error: + mbufq_drain(&mq); + return (error); +} + +static bool +che_alloc_pbl(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb) +{ + struct adapter *sc = qp->nca->sc; + struct memdesc *mem = &cb->io.io_mem; + uint64_t *pbl; + uint32_t addr, len; + u_int i, npages; + int error; + + MPASS(cb->pbl == NULL); + MPASS(cb->ddp_ok); + + /* Hardware limit? iWARP only enforces this for T5. 
*/ + if (cb->io.io_len >= (8 * 1024 * 1024 * 1024ULL)) + return (false); + + npages = che_npages(cb); + len = roundup2(npages, 4) * sizeof(*cb->pbl); + addr = t4_pblpool_alloc(sc, len); + if (addr == 0) + return (false); + + pbl = malloc(len, M_NVMF_CHE, M_NOWAIT | M_ZERO); + if (pbl == NULL) { + t4_pblpool_free(sc, addr, len); + return (false); + } + + switch (mem->md_type) { + case MEMDESC_VADDR: + { + vm_offset_t va; + + va = trunc_page((uintptr_t)mem->u.md_vaddr); + for (i = 0; i < npages; i++) + pbl[i] = htobe64(pmap_kextract(va + i * PAGE_SIZE)); + break; + } + case MEMDESC_PADDR: + { + vm_paddr_t pa; + + pa = trunc_page(mem->u.md_paddr); + for (i = 0; i < npages; i++) + pbl[i] = htobe64(pa + i * PAGE_SIZE); + break; + } + case MEMDESC_VMPAGES: + for (i = 0; i < npages; i++) + pbl[i] = htobe64(VM_PAGE_TO_PHYS(mem->u.md_ma[i])); + break; + case MEMDESC_VLIST: + { + struct bus_dma_segment *ds; + vm_offset_t va; + vm_size_t len; + u_int j, k; + + i = 0; + ds = mem->u.md_list; + for (j = 0; j < mem->md_nseg; j++, ds++) { + va = trunc_page((uintptr_t)ds->ds_addr); + len = ds->ds_len; + if (ds->ds_addr % PAGE_SIZE != 0) + len += ds->ds_addr % PAGE_SIZE; + for (k = 0; k < howmany(len, PAGE_SIZE); k++) { + pbl[i] = htobe64(pmap_kextract(va + + k * PAGE_SIZE)); + i++; + } + } + MPASS(i == npages); + break; + } + case MEMDESC_PLIST: + { + struct bus_dma_segment *ds; + vm_paddr_t pa; + vm_size_t len; + u_int j, k; + + i = 0; + ds = mem->u.md_list; + for (j = 0; j < mem->md_nseg; j++, ds++) { + pa = trunc_page((vm_paddr_t)ds->ds_addr); + len = ds->ds_len; + if (ds->ds_addr % PAGE_SIZE != 0) + len += ds->ds_addr % PAGE_SIZE; + for (k = 0; k < howmany(len, PAGE_SIZE); k++) { + pbl[i] = htobe64(pa + k * PAGE_SIZE); + i++; + } + } + MPASS(i == npages); + break; + } + default: + __assert_unreachable(); + } + + error = che_write_adapter_mem(qp, addr >> 5, len, pbl); + if (error != 0) { + t4_pblpool_free(sc, addr, len); + free(pbl, M_NVMF_CHE); + return (false); + } + + cb->pbl = pbl; + cb->pbl_addr = addr; + cb->pbl_len = len; + + return (true); +} + +static void +che_free_pbl(struct nvmf_che_command_buffer *cb) +{ + free(cb->pbl, M_NVMF_CHE); + t4_pblpool_free(cb->qp->nca->sc, cb->pbl_addr, cb->pbl_len); + cb->pbl = NULL; + cb->pbl_addr = 0; + cb->pbl_len = 0; +} + +static bool +che_write_tpt_entry(struct nvmf_che_qpair *qp, + struct nvmf_che_command_buffer *cb, uint16_t stag) +{ + uint32_t tpt_addr; + int error; + + cb->tpte.valid_to_pdid = htobe32(F_FW_RI_TPTE_VALID | + V_FW_RI_TPTE_STAGKEY(CHE_STAG_COLOR(stag)) | + F_FW_RI_TPTE_STAGSTATE | + V_FW_RI_TPTE_STAGTYPE(FW_RI_STAG_NSMR) | + V_FW_RI_TPTE_PDID(0)); + cb->tpte.locread_to_qpid = htobe32( + V_FW_RI_TPTE_PERM(FW_RI_MEM_ACCESS_REM_WRITE) | + V_FW_RI_TPTE_ADDRTYPE(FW_RI_ZERO_BASED_TO) | + V_FW_RI_TPTE_PS(PAGE_SIZE) | + V_FW_RI_TPTE_QPID(qp->toep->tid)); +#define PBL_OFF(qp, a) ((a) - (qp)->nca->sc->vres.pbl.start) + cb->tpte.nosnoop_pbladdr = + htobe32(V_FW_RI_TPTE_PBLADDR(PBL_OFF(qp, cb->pbl_addr) >> 3)); + cb->tpte.len_lo = htobe32(cb->data_len); + cb->tpte.va_hi = 0; + cb->tpte.va_lo_fbo = htobe32(che_fbo(cb)); + cb->tpte.dca_mwbcnt_pstag = 0; + cb->tpte.len_hi = htobe32(cb->data_offset); + + tpt_addr = qp->tpt_offset + CHE_STAG_IDX(stag) + + (qp->nca->sc->vres.stag.start >> 5); + + error = che_write_adapter_mem(qp, tpt_addr, sizeof(cb->tpte), + &cb->tpte); + return (error == 0); +} + +static void +che_clear_tpt_entry(struct nvmf_che_qpair *qp, uint16_t stag) +{ + uint32_t tpt_addr; + + tpt_addr = qp->tpt_offset + CHE_STAG_IDX(stag) + + 
(qp->nca->sc->vres.stag.start >> 5); + + (void)che_write_adapter_mem(qp, tpt_addr, sizeof(struct fw_ri_tpte), + NULL); +} + +static uint16_t +che_alloc_ddp_stag(struct nvmf_che_qpair *qp, + struct nvmf_che_command_buffer *cb) +{ + uint16_t stag_idx; + + mtx_assert(&qp->rx_buffers.lock, MA_OWNED); + MPASS(cb->ddp_ok); + + if (qp->active_ddp_tags == qp->num_ddp_tags) + return (CHE_DDP_NO_TAG); + + MPASS(qp->num_ddp_tags != 0); + + stag_idx = qp->next_ddp_tag; + for (;;) { + if (qp->open_ddp_tags[stag_idx] == NULL) + break; + if (stag_idx == qp->num_ddp_tags - 1) { + stag_idx = 0; + if (qp->ddp_color == CHE_DDP_MAX_COLOR) + qp->ddp_color = 0; + else + qp->ddp_color++; + } else + stag_idx++; + MPASS(stag_idx != qp->next_ddp_tag); + } + if (stag_idx == qp->num_ddp_tags - 1) + qp->next_ddp_tag = 0; + else + qp->next_ddp_tag = stag_idx + 1; + + qp->active_ddp_tags++; + qp->open_ddp_tags[stag_idx] = cb; + + return (CHE_DDP_TAG(stag_idx, qp->ddp_color)); +} + +static void +che_free_ddp_stag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb, + uint16_t stag) +{ + MPASS(!CHE_TAG_IS_FL(stag)); + + mtx_assert(&qp->rx_buffers.lock, MA_OWNED); + + MPASS(qp->open_ddp_tags[CHE_STAG_IDX(stag)] == cb); + + qp->open_ddp_tags[CHE_STAG_IDX(stag)] = NULL; + qp->active_ddp_tags--; +} + +static uint16_t +che_alloc_ddp_tag(struct nvmf_che_qpair *qp, + struct nvmf_che_command_buffer *cb) +{ + uint16_t stag; + + mtx_assert(&qp->rx_buffers.lock, MA_OWNED); + + if (!cb->ddp_ok) + return (CHE_DDP_NO_TAG); + + stag = che_alloc_ddp_stag(qp, cb); + if (stag == CHE_DDP_NO_TAG) { + counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_no_stag, + 1); + return (CHE_DDP_NO_TAG); + } + + if (!che_alloc_pbl(qp, cb)) { + che_free_ddp_stag(qp, cb, stag); + counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_error, 1); + return (CHE_DDP_NO_TAG); + } + + if (!che_write_tpt_entry(qp, cb, stag)) { + che_free_pbl(cb); + che_free_ddp_stag(qp, cb, stag); + counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_error, 1); + return (CHE_DDP_NO_TAG); + } + + counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_ok, 1); + return (stag); +} + +static void +che_free_ddp_tag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb, + uint16_t stag) +{ + MPASS(!CHE_TAG_IS_FL(stag)); + + mtx_assert(&qp->rx_buffers.lock, MA_OWNED); + + MPASS(qp->open_ddp_tags[CHE_STAG_IDX(stag)] == cb); + + che_clear_tpt_entry(qp, stag); + che_free_pbl(cb); + che_free_ddp_stag(qp, cb, stag); +} + +static void +nvmf_che_write_pdu(struct nvmf_che_qpair *qp, struct mbuf *m) +{ + struct epoch_tracker et; + struct socket *so = qp->so; + struct inpcb *inp = sotoinpcb(so); + struct toepcb *toep = qp->toep; + + CURVNET_SET(so->so_vnet); + NET_EPOCH_ENTER(et); + INP_WLOCK(inp); + if (__predict_false(inp->inp_flags & INP_DROPPED) || + __predict_false((toep->flags & TPF_ATTACHED) == 0)) { + m_freem(m); + } else { + mbufq_enqueue(&toep->ulp_pduq, m); + t4_push_pdus(toep->vi->adapter, toep, 0); + } + INP_WUNLOCK(inp); + NET_EPOCH_EXIT(et); + CURVNET_RESTORE(); +} + +static void +nvmf_che_report_error(struct nvmf_che_qpair *qp, uint16_t fes, uint32_t fei, + struct mbuf *rx_pdu, u_int hlen) +{ + struct nvme_tcp_term_req_hdr *hdr; + struct mbuf *m; + + if (hlen != 0) { + hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE); + hlen = min(hlen, m_length(rx_pdu, NULL)); + } + + m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, M_PKTHDR); + m->m_len = sizeof(*hdr) + hlen; + m->m_pkthdr.len = m->m_len; + hdr = mtod(m, void *); + memset(hdr, 0, sizeof(*hdr)); + 
hdr->common.pdu_type = qp->qp.nq_controller ? + NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ; + hdr->common.hlen = sizeof(*hdr); + hdr->common.plen = sizeof(*hdr) + hlen; + hdr->fes = htole16(fes); + le32enc(hdr->fei, fei); + if (hlen != 0) + m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1)); + + nvmf_che_write_pdu(qp, m); +} + +static int +nvmf_che_validate_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) +{ + const struct nvme_tcp_common_pdu_hdr *ch; + struct mbuf *m = pdu->m; + uint32_t data_len, fei, plen, rx_digest; + u_int hlen, cpl_error; + int error; + uint16_t fes; + + /* Determine how large of a PDU header to return for errors. */ + ch = pdu->hdr; + hlen = ch->hlen; + plen = le32toh(ch->plen); + if (hlen < sizeof(*ch) || hlen > plen) + hlen = sizeof(*ch); + + cpl_error = m->m_pkthdr.nvmf_cpl_status & CMP_STATUS_ERROR_MASK; + switch (cpl_error) { + case CMP_STATUS_NO_ERROR: + break; + case CMP_STATUS_HEADER_DIGEST: + counter_u64_add( + qp->toep->ofld_rxq->rx_nvme_header_digest_errors, 1); + printf("NVMe/TCP: Header digest mismatch\n"); + rx_digest = le32dec(mtodo(m, ch->hlen)); + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, m, + hlen); + return (EBADMSG); + case CMP_STATUS_DIRECTION_MISMATCH: + counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1); + printf("NVMe/TCP: Invalid PDU type %u\n", ch->pdu_type); + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_common_pdu_hdr, pdu_type), m, + hlen); + return (EBADMSG); + case CMP_STATUS_SUCCESS_NOT_LAST: + case CMP_STATUS_DIGEST_FLAG_MISMATCH: + counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1); + printf("NVMe/TCP: Invalid PDU header flags %#x\n", ch->flags); + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_common_pdu_hdr, flags), m, hlen); + return (EBADMSG); + case CMP_STATUS_BAD_DATA_LENGTH: + counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1); + printf("NVMe/TCP: Invalid PDU length %u\n", plen); + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_common_pdu_hdr, plen), m, hlen); + return (EBADMSG); + case CMP_STATUS_USER_MODE_UNALLOCATED: + case CMP_STATUS_RQT_LIMIT: + case CMP_STATUS_RQT_WRAP: + case CMP_STATUS_RQT_BOUND: + device_printf(qp->nca->sc->dev, + "received invalid NVMET error %u\n", + cpl_error); + return (ECONNRESET); + case CMP_STATUS_TPT_LIMIT: + case CMP_STATUS_TPT_INVALID: + case CMP_STATUS_TPT_COLOUR_MISMATCH: + case CMP_STATUS_TPT_MISC: + case CMP_STATUS_TPT_WRAP: + case CMP_STATUS_TPT_BOUND: + counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1); + switch (ch->pdu_type) { + case NVME_TCP_PDU_TYPE_H2C_DATA: + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_h2c_data_hdr, ttag), + pdu->m, pdu->hdr->hlen); + return (EBADMSG); + case NVME_TCP_PDU_TYPE_C2H_DATA: + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_c2h_data_hdr, cccid), m, + hlen); + return (EBADMSG); + default: + device_printf(qp->nca->sc->dev, + "received DDP NVMET error %u for PDU %u\n", + cpl_error, ch->pdu_type); + return (ECONNRESET); + } + case CMP_STATUS_TPT_LAST_PDU_UNALIGNED: + counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1); + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, m, hlen); + return (EBADMSG); + case CMP_STATUS_PBL_LIMIT: + 
counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1); + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, m, + hlen); + return (EBADMSG); + case CMP_STATUS_DATA_DIGEST: + /* Handled below. */ + break; + default: + device_printf(qp->nca->sc->dev, + "received unknown NVMET error %u\n", + cpl_error); + return (ECONNRESET); + } + + error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_controller, + qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes, + &fei); + if (error != 0) { + if (error != ECONNRESET) + nvmf_che_report_error(qp, fes, fei, m, hlen); + return (error); + } + + /* Check data digest if present. */ + pdu->data_digest_mismatch = false; + if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) { + if (cpl_error == CMP_STATUS_DATA_DIGEST) { + printf("NVMe/TCP: Data digest mismatch\n"); + pdu->data_digest_mismatch = true; + counter_u64_add( + qp->toep->ofld_rxq->rx_nvme_data_digest_errors, 1); + } + } + + pdu->data_len = data_len; + + return (0); +} + +static void +nvmf_che_free_pdu(struct nvmf_che_rxpdu *pdu) +{ + m_freem(pdu->m); + pdu->m = NULL; + pdu->hdr = NULL; +} + +static int +nvmf_che_handle_term_req(struct nvmf_che_rxpdu *pdu) +{ + const struct nvme_tcp_term_req_hdr *hdr; + + hdr = (const void *)pdu->hdr; + + printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n", + le16toh(hdr->fes), le32dec(hdr->fei)); + nvmf_che_free_pdu(pdu); + return (ECONNRESET); +} + +static int +nvmf_che_save_command_capsule(struct nvmf_che_qpair *qp, + struct nvmf_che_rxpdu *pdu) +{ + const struct nvme_tcp_cmd *cmd; + struct nvmf_capsule *nc; + struct nvmf_che_capsule *cc; + + cmd = (const void *)pdu->hdr; + + nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe, M_WAITOK); + + cc = CCAP(nc); + cc->rx_pdu = *pdu; + + nvmf_capsule_received(&qp->qp, nc); + return (0); +} + +static int +nvmf_che_save_response_capsule(struct nvmf_che_qpair *qp, + struct nvmf_che_rxpdu *pdu) +{ + const struct nvme_tcp_rsp *rsp; + struct nvme_completion cpl; + struct nvmf_capsule *nc; + struct nvmf_che_capsule *cc; + uint16_t cid; + + rsp = (const void *)pdu->hdr; + + /* + * Restore the original CID and ensure any command buffers + * associated with this CID have been released. Once the CQE + * has been received, no further transfers to the command + * buffer for the associated CID can occur. + */ + cpl = rsp->rccqe; + cid = le16toh(cpl.cid); + if (CHE_TAG_IS_FL(cid)) { + cid = CHE_RAW_FL_TAG(cid); + mtx_lock(&qp->fl_cid_lock); + MPASS(FL_CID_ISACTIVE(cid, qp->fl_cid_set)); + cpl.cid = qp->fl_cids[cid]; + FL_CID_FREE(cid, qp->fl_cid_set); + mtx_unlock(&qp->fl_cid_lock); + + che_purge_command_buffer(&qp->rx_buffers, rsp->rccqe.cid); + che_purge_command_buffer(&qp->tx_buffers, rsp->rccqe.cid); + } else { + struct nvmf_che_command_buffer *cb; + + mtx_lock(&qp->rx_buffers.lock); + cb = qp->open_ddp_tags[CHE_STAG_IDX(cid)]; + MPASS(cb != NULL); + MPASS(cb->cid == rsp->rccqe.cid); + cpl.cid = cb->original_cid; + che_free_ddp_tag(qp, cb, cid); + mtx_unlock(&qp->rx_buffers.lock); + che_release_command_buffer(cb); + } +#ifdef VERBOSE_TRACES + CTR(KTR_CXGBE, "%s: tid %u freed cid 0x%04x for 0x%04x", __func__, + qp->toep->tid, le16toh(rsp->rccqe.cid), cpl.cid); +#endif + + nc = nvmf_allocate_response(&qp->qp, &cpl, M_WAITOK); + + nc->nc_sqhd_valid = true; + cc = CCAP(nc); + cc->rx_pdu = *pdu; + + nvmf_capsule_received(&qp->qp, nc); + return (0); +} + +/* + * Construct a PDU that contains an optional data payload. 
This + * includes dealing with the length fields in the common header. The + * adapter inserts digests and padding when the PDU is transmitted. + */ +static struct mbuf * +nvmf_che_construct_pdu(struct nvmf_che_qpair *qp, void *hdr, size_t hlen, + struct mbuf *data, uint32_t data_len) +{ + struct nvme_tcp_common_pdu_hdr *ch; + struct mbuf *top; + uint32_t pdo, plen; + uint8_t ulp_submode; + + plen = hlen; + if (qp->header_digests) + plen += sizeof(uint32_t); + if (data_len != 0) { + KASSERT(m_length(data, NULL) == data_len, ("length mismatch")); + pdo = roundup(plen, qp->txpda); + plen = pdo + data_len; + if (qp->data_digests) + plen += sizeof(uint32_t); + } else { + KASSERT(data == NULL, ("payload mbuf with zero length")); + pdo = 0; + } + + top = m_get2(hlen, M_WAITOK, MT_DATA, M_PKTHDR); + top->m_len = hlen; + top->m_pkthdr.len = hlen; + ch = mtod(top, void *); + memcpy(ch, hdr, hlen); + ch->hlen = hlen; + ulp_submode = 0; + if (qp->header_digests) { + ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF; + ulp_submode |= ULP_CRC_HEADER; + } + if (qp->data_digests && data_len != 0) { + ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF; + ulp_submode |= ULP_CRC_DATA; + } + ch->pdo = pdo; + ch->plen = htole32(plen); + set_mbuf_ulp_submode(top, ulp_submode); + + if (data_len != 0) { + top->m_pkthdr.len += data_len; + top->m_next = data; + } + + return (top); +} + +/* Allocate the next free freelist transfer tag. */ +static bool +nvmf_che_allocate_fl_ttag(struct nvmf_che_qpair *qp, + struct nvmf_che_command_buffer *cb) +{ + uint16_t ttag; + + mtx_assert(&qp->rx_buffers.lock, MA_OWNED); + + if (qp->active_fl_ttags == qp->num_fl_ttags) + return (false); + + ttag = qp->next_fl_ttag; + for (;;) { + if (qp->open_fl_ttags[ttag] == NULL) + break; + if (ttag == qp->num_fl_ttags - 1) + ttag = 0; + else + ttag++; + MPASS(ttag != qp->next_fl_ttag); + } + if (ttag == qp->num_fl_ttags - 1) + qp->next_fl_ttag = 0; + else + qp->next_fl_ttag = ttag + 1; + + qp->active_fl_ttags++; + qp->open_fl_ttags[ttag] = cb; + + cb->ttag = ttag | CHE_FL_TAG_MASK; + return (true); +} + +/* Attempt to allocate a free transfer tag and assign it to cb. */ +static bool +nvmf_che_allocate_ttag(struct nvmf_che_qpair *qp, + struct nvmf_che_command_buffer *cb) +{ + uint16_t stag; + + mtx_assert(&qp->rx_buffers.lock, MA_OWNED); + + stag = che_alloc_ddp_tag(qp, cb); + if (stag == CHE_DDP_NO_TAG) { + if (!nvmf_che_allocate_fl_ttag(qp, cb)) + return (false); + } else { + cb->ttag = stag; + } +#ifdef VERBOSE_TRACES + CTR(KTR_CXGBE, "%s: tid %u allocated ttag 0x%04x", __func__, + qp->toep->tid, cb->ttag); +#endif + cb->cc->active_r2ts++; + return (true); +} + +/* Find the next command buffer eligible to schedule for R2T. */ +static struct nvmf_che_command_buffer * +nvmf_che_next_r2t(struct nvmf_che_qpair *qp) +{ + struct nvmf_che_command_buffer *cb; + + mtx_assert(&qp->rx_buffers.lock, MA_OWNED); + + TAILQ_FOREACH(cb, &qp->rx_buffers.head, link) { + /* NB: maxr2t is 0's based. */ + if (cb->cc->active_r2ts > qp->maxr2t) + continue; + + if (!nvmf_che_allocate_ttag(qp, cb)) + return (NULL); +#ifdef INVARIANTS + cb->cc->pending_r2ts--; +#endif + TAILQ_REMOVE(&qp->rx_buffers.head, cb, link); + return (cb); + } + return (NULL); +} + +/* NB: cid and is little-endian already. 
*/ +static void +che_send_r2t(struct nvmf_che_qpair *qp, uint16_t cid, uint16_t ttag, + uint32_t data_offset, uint32_t data_len) +{ + struct nvme_tcp_r2t_hdr r2t; + struct mbuf *m; + + memset(&r2t, 0, sizeof(r2t)); + r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T; + r2t.cccid = cid; + r2t.ttag = htole16(ttag); + r2t.r2to = htole32(data_offset); + r2t.r2tl = htole32(data_len); + + m = nvmf_che_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0); + nvmf_che_write_pdu(qp, m); +} + +/* + * Release a transfer tag and schedule another R2T. + * + * NB: This drops the rx_buffers.lock mutex. + */ +static void +nvmf_che_send_next_r2t(struct nvmf_che_qpair *qp, + struct nvmf_che_command_buffer *cb) +{ + struct nvmf_che_command_buffer *ncb; + + mtx_assert(&qp->rx_buffers.lock, MA_OWNED); + +#ifdef VERBOSE_TRACES + CTR(KTR_CXGBE, "%s: tid %u freed ttag 0x%04x", __func__, qp->toep->tid, + cb->ttag); +#endif + if (CHE_TAG_IS_FL(cb->ttag)) { + uint16_t ttag; + + ttag = CHE_RAW_FL_TAG(cb->ttag); + MPASS(qp->open_fl_ttags[ttag] == cb); + + /* Release this transfer tag. */ + qp->open_fl_ttags[ttag] = NULL; + qp->active_fl_ttags--; + } else + che_free_ddp_tag(qp, cb, cb->ttag); + + cb->cc->active_r2ts--; + + /* Schedule another R2T. */ + ncb = nvmf_che_next_r2t(qp); + mtx_unlock(&qp->rx_buffers.lock); + if (ncb != NULL) + che_send_r2t(qp, ncb->cid, ncb->ttag, ncb->data_offset, + ncb->data_len); +} + +/* + * Copy len bytes starting at offset skip from an mbuf chain into an + * I/O buffer at destination offset io_offset. + */ +static void +mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len, + struct nvmf_io_request *io, u_int io_offset) +{ + u_int todo; + + while (m->m_len <= skip) { + skip -= m->m_len; + m = m->m_next; + } + while (len != 0) { + MPASS((m->m_flags & M_EXTPG) == 0); + + todo = min(m->m_len - skip, len); + memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip)); + skip = 0; + io_offset += todo; + len -= todo; + m = m->m_next; + } +} + +static int +nvmf_che_handle_h2c_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) +{ + const struct nvme_tcp_h2c_data_hdr *h2c; + struct nvmf_che_command_buffer *cb; + uint32_t data_len, data_offset; + uint16_t ttag, fl_ttag; + + h2c = (const void *)pdu->hdr; + if (le32toh(h2c->datal) > qp->maxh2cdata) { + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0, + pdu->m, pdu->hdr->hlen); + nvmf_che_free_pdu(pdu); + return (EBADMSG); + } + + ttag = le16toh(h2c->ttag); + if (CHE_TAG_IS_FL(ttag)) { + fl_ttag = CHE_RAW_FL_TAG(ttag); + if (fl_ttag >= qp->num_fl_ttags) { + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_h2c_data_hdr, ttag), + pdu->m, pdu->hdr->hlen); + nvmf_che_free_pdu(pdu); + return (EBADMSG); + } + + mtx_lock(&qp->rx_buffers.lock); + cb = qp->open_fl_ttags[fl_ttag]; + } else { + if (CHE_STAG_IDX(ttag) >= qp->num_ddp_tags) { + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_h2c_data_hdr, ttag), + pdu->m, pdu->hdr->hlen); + nvmf_che_free_pdu(pdu); + return (EBADMSG); + } + + mtx_lock(&qp->rx_buffers.lock); + cb = qp->open_ddp_tags[CHE_STAG_IDX(ttag)]; + } + + if (cb == NULL) { + mtx_unlock(&qp->rx_buffers.lock); + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m, + pdu->hdr->hlen); + nvmf_che_free_pdu(pdu); + return (EBADMSG); + } + MPASS(cb->ttag == ttag); + + /* For a data digest mismatch, fail the I/O request. 
*/ + if (pdu->data_digest_mismatch) { + nvmf_che_send_next_r2t(qp, cb); + cb->error = EINTEGRITY; + che_release_command_buffer(cb); + nvmf_che_free_pdu(pdu); + return (0); + } + + data_len = le32toh(h2c->datal); + if (data_len != pdu->data_len) { + mtx_unlock(&qp->rx_buffers.lock); + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->m, + pdu->hdr->hlen); + nvmf_che_free_pdu(pdu); + return (EBADMSG); + } + + data_offset = le32toh(h2c->datao); + if (data_offset < cb->data_offset || + data_offset + data_len > cb->data_offset + cb->data_len) { + mtx_unlock(&qp->rx_buffers.lock); + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m, + pdu->hdr->hlen); + nvmf_che_free_pdu(pdu); + return (EBADMSG); + } + + if (data_offset != cb->data_offset + cb->data_xfered) { + if (CHE_TAG_IS_FL(ttag)) { + mtx_unlock(&qp->rx_buffers.lock); + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, + pdu->hdr->hlen); + nvmf_che_free_pdu(pdu); + return (EBADMSG); + } else { + uint32_t ddp_bytes; + + /* Account for PDUs silently received via DDP. */ + ddp_bytes = data_offset - + (cb->data_offset + cb->data_xfered); + cb->data_xfered += ddp_bytes; +#ifdef VERBOSE_TRACES + CTR(KTR_CXGBE, "%s: tid %u previous ddp_bytes %u", + __func__, qp->toep->tid, ddp_bytes); +#endif + counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets, + ddp_bytes); + } + } + + if ((cb->data_xfered + data_len == cb->data_len) != + ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) { + mtx_unlock(&qp->rx_buffers.lock); + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, + pdu->hdr->hlen); + nvmf_che_free_pdu(pdu); + return (EBADMSG); + } + + cb->data_xfered += data_len; + data_offset -= cb->data_offset; + if (cb->data_xfered == cb->data_len) { + nvmf_che_send_next_r2t(qp, cb); + } else { + che_hold_command_buffer(cb); + mtx_unlock(&qp->rx_buffers.lock); + } + + if (CHE_TAG_IS_FL(ttag)) + mbuf_copyto_io(pdu->m->m_next, 0, data_len, &cb->io, + data_offset); + + che_release_command_buffer(cb); + nvmf_che_free_pdu(pdu); + return (0); +} + +static int +nvmf_che_handle_c2h_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) +{ + const struct nvme_tcp_c2h_data_hdr *c2h; + struct nvmf_che_command_buffer *cb; + uint32_t data_len, data_offset; + uint16_t cid, original_cid; + + /* + * Unlike freelist command buffers, DDP command buffers are + * not released until the response capsule is received to keep + * the STAG allocated until the command has completed. + */ + c2h = (const void *)pdu->hdr; + + cid = le16toh(c2h->cccid); + if (CHE_TAG_IS_FL(cid)) { + mtx_lock(&qp->rx_buffers.lock); + cb = che_find_command_buffer(&qp->rx_buffers, c2h->cccid); + } else { + if (CHE_STAG_IDX(cid) >= qp->num_ddp_tags) { + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_c2h_data_hdr, cccid), + pdu->m, pdu->hdr->hlen); + nvmf_che_free_pdu(pdu); + return (EBADMSG); + } + + mtx_lock(&qp->rx_buffers.lock); + cb = qp->open_ddp_tags[CHE_STAG_IDX(cid)]; + } + + if (cb == NULL) { + mtx_unlock(&qp->rx_buffers.lock); + /* + * XXX: Could be PDU sequence error if cccid is for a + * command that doesn't use a command buffer. 
+ */ + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m, + pdu->hdr->hlen); + nvmf_che_free_pdu(pdu); + return (EBADMSG); + } + + /* For a data digest mismatch, fail the I/O request. */ + if (pdu->data_digest_mismatch) { + cb->error = EINTEGRITY; + if (CHE_TAG_IS_FL(cid)) { + che_remove_command_buffer(&qp->rx_buffers, cb); + mtx_unlock(&qp->rx_buffers.lock); + che_release_command_buffer(cb); + } else + mtx_unlock(&qp->rx_buffers.lock); + nvmf_che_free_pdu(pdu); + return (0); + } + + data_len = le32toh(c2h->datal); + if (data_len != pdu->data_len) { + mtx_unlock(&qp->rx_buffers.lock); + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->m, + pdu->hdr->hlen); + nvmf_che_free_pdu(pdu); + return (EBADMSG); + } + + data_offset = le32toh(c2h->datao); + if (data_offset < cb->data_offset || + data_offset + data_len > cb->data_offset + cb->data_len) { + mtx_unlock(&qp->rx_buffers.lock); + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, + pdu->m, pdu->hdr->hlen); + nvmf_che_free_pdu(pdu); + return (EBADMSG); + } + + if (data_offset != cb->data_offset + cb->data_xfered) { + if (CHE_TAG_IS_FL(cid)) { + mtx_unlock(&qp->rx_buffers.lock); + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, + pdu->hdr->hlen); + nvmf_che_free_pdu(pdu); + return (EBADMSG); + } else { + uint32_t ddp_bytes; + + /* Account for PDUs silently received via DDP. */ + ddp_bytes = data_offset - + (cb->data_offset + cb->data_xfered); + cb->data_xfered += ddp_bytes; +#ifdef VERBOSE_TRACES + CTR(KTR_CXGBE, "%s: tid %u previous ddp_bytes %u", + __func__, qp->toep->tid, ddp_bytes); +#endif + counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets, + ddp_bytes); + } + } + + if ((cb->data_xfered + data_len == cb->data_len) != + ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) { + mtx_unlock(&qp->rx_buffers.lock); + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, + pdu->hdr->hlen); + nvmf_che_free_pdu(pdu); + return (EBADMSG); + } + + cb->data_xfered += data_len; + original_cid = cb->original_cid; + + if (CHE_TAG_IS_FL(cid)) { + data_offset -= cb->data_offset; + if (cb->data_xfered == cb->data_len) + che_remove_command_buffer(&qp->rx_buffers, cb); + else + che_hold_command_buffer(cb); + mtx_unlock(&qp->rx_buffers.lock); + + if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) { + /* + * Free the CID as the command has now been + * completed. + */ + cid = CHE_RAW_FL_TAG(cid); + mtx_lock(&qp->fl_cid_lock); + MPASS(FL_CID_ISACTIVE(cid, qp->fl_cid_set)); + MPASS(original_cid == qp->fl_cids[cid]); + FL_CID_FREE(cid, qp->fl_cid_set); + mtx_unlock(&qp->fl_cid_lock); + } + + mbuf_copyto_io(pdu->m->m_next, 0, data_len, &cb->io, + data_offset); + + che_release_command_buffer(cb); + } else { + if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) { + /* + * Free the command buffer and STAG as the + * command has now been completed. 
+ */ + che_free_ddp_tag(qp, cb, cid); + mtx_unlock(&qp->rx_buffers.lock); + che_release_command_buffer(cb); + } else + mtx_unlock(&qp->rx_buffers.lock); + } + + if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) { + struct nvme_completion cqe; + struct nvmf_capsule *nc; + + memset(&cqe, 0, sizeof(cqe)); + cqe.cid = original_cid; + + nc = nvmf_allocate_response(&qp->qp, &cqe, M_WAITOK); + nc->nc_sqhd_valid = false; + + nvmf_capsule_received(&qp->qp, nc); + } + + nvmf_che_free_pdu(pdu); + return (0); +} + +/* Called when m_free drops refcount to 0. */ +static void +nvmf_che_mbuf_done(struct mbuf *m) +{ + struct nvmf_che_command_buffer *cb = m->m_ext.ext_arg1; + + che_free_command_buffer(cb); +} + +static struct mbuf * +nvmf_che_mbuf(void *arg, int how, void *data, size_t len) +{ + struct nvmf_che_command_buffer *cb = arg; + struct mbuf *m; + + m = m_get(how, MT_DATA); + m->m_flags |= M_RDONLY; + m_extaddref(m, data, len, &cb->refs, nvmf_che_mbuf_done, cb, NULL); + m->m_len = len; + return (m); +} + +static void +nvmf_che_free_mext_pg(struct mbuf *m) +{ + struct nvmf_che_command_buffer *cb = m->m_ext.ext_arg1; + + M_ASSERTEXTPG(m); + che_release_command_buffer(cb); +} + +static struct mbuf * +nvmf_che_mext_pg(void *arg, int how) +{ + struct nvmf_che_command_buffer *cb = arg; + struct mbuf *m; + + m = mb_alloc_ext_pgs(how, nvmf_che_free_mext_pg, M_RDONLY); + m->m_ext.ext_arg1 = cb; + che_hold_command_buffer(cb); + return (m); +} + +/* + * Return an mbuf chain for a range of data belonging to a command + * buffer. + * + * The mbuf chain uses M_EXT mbufs which hold references on the + * command buffer so that it remains "alive" until the data has been + * fully transmitted. If truncate_ok is true, then the mbuf chain + * might return a short chain to avoid gratuitously splitting up a + * page. + */ +static struct mbuf * +nvmf_che_command_buffer_mbuf(struct nvmf_che_command_buffer *cb, + uint32_t data_offset, uint32_t data_len, uint32_t *actual_len, + bool can_truncate) +{ + struct mbuf *m; + size_t len; + + m = memdesc_alloc_ext_mbufs(&cb->io.io_mem, nvmf_che_mbuf, + nvmf_che_mext_pg, cb, M_WAITOK, data_offset, data_len, &len, + can_truncate); + if (actual_len != NULL) + *actual_len = len; + return (m); +} + +/* NB: cid and ttag and little-endian already. 
*/ +static void +che_send_h2c_pdu(struct nvmf_che_qpair *qp, uint16_t cid, uint16_t ttag, + uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu) +{ + struct nvme_tcp_h2c_data_hdr h2c; + struct mbuf *top; + + memset(&h2c, 0, sizeof(h2c)); + h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA; + if (last_pdu) + h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU; + h2c.cccid = cid; + h2c.ttag = ttag; + h2c.datao = htole32(data_offset); + h2c.datal = htole32(len); + + top = nvmf_che_construct_pdu(qp, &h2c, sizeof(h2c), m, len); + nvmf_che_write_pdu(qp, top); +} + +static int +nvmf_che_handle_r2t(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) +{ + const struct nvme_tcp_r2t_hdr *r2t; + struct nvmf_che_command_buffer *cb; + uint32_t data_len, data_offset; + + r2t = (const void *)pdu->hdr; + + mtx_lock(&qp->tx_buffers.lock); + cb = che_find_command_buffer(&qp->tx_buffers, r2t->cccid); + if (cb == NULL) { + mtx_unlock(&qp->tx_buffers.lock); + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->m, + pdu->hdr->hlen); + nvmf_che_free_pdu(pdu); + return (EBADMSG); + } + + data_offset = le32toh(r2t->r2to); + if (data_offset != cb->data_xfered) { + mtx_unlock(&qp->tx_buffers.lock); + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, + pdu->hdr->hlen); + nvmf_che_free_pdu(pdu); + return (EBADMSG); + } + + /* + * XXX: The spec does not specify how to handle R2T tranfers + * out of range of the original command. + */ + data_len = le32toh(r2t->r2tl); + if (data_offset + data_len > cb->data_len) { + mtx_unlock(&qp->tx_buffers.lock); + nvmf_che_report_error(qp, + NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, + pdu->m, pdu->hdr->hlen); + nvmf_che_free_pdu(pdu); + return (EBADMSG); + } + + cb->data_xfered += data_len; + if (cb->data_xfered == cb->data_len) + che_remove_command_buffer(&qp->tx_buffers, cb); + else + che_hold_command_buffer(cb); + mtx_unlock(&qp->tx_buffers.lock); + + /* + * Queue one or more H2C_DATA PDUs containing the requested + * data. + */ + while (data_len > 0) { + struct mbuf *m; + uint32_t sent, todo; + + todo = min(data_len, qp->max_tx_data); + m = nvmf_che_command_buffer_mbuf(cb, data_offset, todo, &sent, + todo < data_len); + che_send_h2c_pdu(qp, r2t->cccid, r2t->ttag, data_offset, m, + sent, sent == data_len); + + data_offset += sent; + data_len -= sent; + } + + che_release_command_buffer(cb); + nvmf_che_free_pdu(pdu); + return (0); +} + +static int +nvmf_che_dispatch_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) +{ + /* + * The PDU header should always be contiguous in the mbuf from + * CPL_NVMT_CMP. + */ + pdu->hdr = mtod(pdu->m, void *); + KASSERT(pdu->m->m_len == pdu->hdr->hlen + + ((pdu->hdr->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0 ? 
+ sizeof(uint32_t) : 0), + ("%s: mismatched PDU header mbuf length", __func__)); + + switch (pdu->hdr->pdu_type) { + default: + __assert_unreachable(); + break; + case NVME_TCP_PDU_TYPE_H2C_TERM_REQ: + case NVME_TCP_PDU_TYPE_C2H_TERM_REQ: + return (nvmf_che_handle_term_req(pdu)); + case NVME_TCP_PDU_TYPE_CAPSULE_CMD: + return (nvmf_che_save_command_capsule(qp, pdu)); + case NVME_TCP_PDU_TYPE_CAPSULE_RESP: + return (nvmf_che_save_response_capsule(qp, pdu)); + case NVME_TCP_PDU_TYPE_H2C_DATA: + return (nvmf_che_handle_h2c_data(qp, pdu)); + case NVME_TCP_PDU_TYPE_C2H_DATA: + return (nvmf_che_handle_c2h_data(qp, pdu)); + case NVME_TCP_PDU_TYPE_R2T: + return (nvmf_che_handle_r2t(qp, pdu)); + } +} + +static int +nvmf_che_attach_pdu_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) +{ + struct socket *so = qp->so; + struct mbuf *m, *n; + uint32_t tcp_seq; + size_t len; + int error; + + /* Check for DDP data. */ + if (pdu->ddp) { + counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_pdus, 1); + counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets, + pdu->data_len); + return (0); + } + + error = 0; + len = pdu->data_len; + tcp_seq = pdu->m->m_pkthdr.nvmf_tcp_seq; + m = pdu->m; + SOCKBUF_LOCK(&so->so_rcv); + while (len > 0) { + n = mbufq_dequeue(&qp->rx_data); + KASSERT(n != NULL, ("%s: missing %zu data", __func__, len)); + if (n == NULL) { + error = ENOBUFS; + break; + } + + KASSERT(n->m_pkthdr.nvmf_tcp_seq == tcp_seq, + ("%s: TCP seq mismatch", __func__)); + KASSERT(n->m_pkthdr.len <= len, + ("%s: too much data", __func__)); + if (n->m_pkthdr.nvmf_tcp_seq != tcp_seq || + n->m_pkthdr.len > len) { + m_freem(n); + error = ENOBUFS; + break; + } + +#ifdef VERBOSE_TRACES + CTR(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, + qp->toep->tid, n->m_pkthdr.len, n->m_pkthdr.nvmf_tcp_seq); +#endif + pdu->m->m_pkthdr.len += n->m_pkthdr.len; + len -= n->m_pkthdr.len; + tcp_seq += n->m_pkthdr.len; + m_demote_pkthdr(n); + m->m_next = n; + m = m_last(n); + } + SOCKBUF_UNLOCK(&so->so_rcv); + + if (error == 0) { + counter_u64_add(qp->toep->ofld_rxq->rx_nvme_fl_pdus, 1); + counter_u64_add(qp->toep->ofld_rxq->rx_nvme_fl_octets, + pdu->data_len); + } + return (error); +} + +static void +nvmf_che_receive(void *arg) +{ + struct nvmf_che_qpair *qp = arg; + struct socket *so = qp->so; + struct nvmf_che_rxpdu pdu; + struct mbuf *m; + int error, terror; + + SOCKBUF_LOCK(&so->so_rcv); + while (!qp->rx_shutdown) { + /* Wait for a PDU. */ + if (so->so_error != 0 || so->so_rerror != 0) { + if (so->so_error != 0) + error = so->so_error; + else + error = so->so_rerror; + SOCKBUF_UNLOCK(&so->so_rcv); + error: + nvmf_qpair_error(&qp->qp, error); + SOCKBUF_LOCK(&so->so_rcv); + while (!qp->rx_shutdown) + cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv)); + break; + } + + m = mbufq_dequeue(&qp->rx_pdus); + if (m == NULL) { + if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0) { + error = 0; + SOCKBUF_UNLOCK(&so->so_rcv); + goto error; + } + cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv)); + continue; + } + SOCKBUF_UNLOCK(&so->so_rcv); + + pdu.m = m; + pdu.hdr = mtod(m, const void *); + pdu.ddp = (m->m_pkthdr.nvmf_cpl_status & CMP_STATUS_DDP) != 0; + + error = nvmf_che_validate_pdu(qp, &pdu); + if (error == 0 && pdu.data_len != 0) + error = nvmf_che_attach_pdu_data(qp, &pdu); + if (error != 0) + nvmf_che_free_pdu(&pdu); + else + error = nvmf_che_dispatch_pdu(qp, &pdu); + if (error != 0) { + /* + * If we received a termination request, close + * the connection immediately. 
+ */ + if (error == ECONNRESET) + goto error; + + /* + * Wait for up to 30 seconds for the socket to + * be closed by the other end. + */ + SOCKBUF_LOCK(&so->so_rcv); + if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { + terror = cv_timedwait(&qp->rx_cv, + SOCKBUF_MTX(&so->so_rcv), 30 * hz); + if (terror == ETIMEDOUT) + printf("NVMe/TCP: Timed out after sending terminate request\n"); + } + SOCKBUF_UNLOCK(&so->so_rcv); + goto error; + } + + SOCKBUF_LOCK(&so->so_rcv); + } + SOCKBUF_UNLOCK(&so->so_rcv); + kthread_exit(); +} + +static int +nvmf_che_soupcall_receive(struct socket *so, void *arg, int waitflag) +{ + struct nvmf_che_qpair *qp = arg; + + cv_signal(&qp->rx_cv); + return (SU_OK); +} + +static int +do_nvmt_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + struct nvmf_che_adapter *nca = sc->nvme_ulp_softc; + const struct cpl_nvmt_data *cpl; + u_int tid; + struct toepcb *toep; + struct nvmf_che_qpair *qp; + struct socket *so; + struct inpcb *inp; + struct tcpcb *tp; + int len __diagused; + + if (nca->nvmt_data_iqe) { + cpl = (const void *)(rss + 1); + } else { + cpl = mtod(m, const void *); + + /* strip off CPL header */ + m_adj(m, sizeof(*cpl)); + } + tid = GET_TID(cpl); + toep = lookup_tid(sc, tid); + + KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); + + len = m->m_pkthdr.len; + + KASSERT(len == be16toh(cpl->length), + ("%s: payload length mismatch", __func__)); + + inp = toep->inp; + INP_WLOCK(inp); + if (inp->inp_flags & INP_DROPPED) { + CTR(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", + __func__, tid, len, inp->inp_flags); + INP_WUNLOCK(inp); + m_freem(m); + return (0); + } + + /* Save TCP sequence number. */ + m->m_pkthdr.nvmf_tcp_seq = be32toh(cpl->seq); + + qp = toep->ulpcb; + so = qp->so; + SOCKBUF_LOCK(&so->so_rcv); + mbufq_enqueue(&qp->rx_data, m); + SOCKBUF_UNLOCK(&so->so_rcv); + + tp = intotcpcb(inp); + tp->t_rcvtime = ticks; + +#ifdef VERBOSE_TRACES + CTR(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, tid, len, + be32toh(cpl->seq)); +#endif + + INP_WUNLOCK(inp); + return (0); +} + +static int +do_nvmt_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_nvmt_cmp *cpl = mtod(m, const void *); + u_int tid = GET_TID(cpl); + struct toepcb *toep = lookup_tid(sc, tid); + struct nvmf_che_qpair *qp = toep->ulpcb; + struct socket *so = qp->so; + struct inpcb *inp = toep->inp; + u_int hlen __diagused; + bool empty; + + KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); + KASSERT(!(toep->flags & TPF_SYNQE), + ("%s: toep %p claims to be a synq entry", __func__, toep)); + + /* strip off CPL header */ + m_adj(m, sizeof(*cpl)); + hlen = m->m_pkthdr.len; + + KASSERT(hlen == be16toh(cpl->length), + ("%s: payload length mismatch", __func__)); + + INP_WLOCK(inp); + if (inp->inp_flags & INP_DROPPED) { + CTR(KTR_CXGBE, "%s: tid %u, rx (hlen %u), inp_flags 0x%x", + __func__, tid, hlen, inp->inp_flags); + INP_WUNLOCK(inp); + m_freem(m); + return (0); + } + +#ifdef VERBOSE_TRACES + CTR(KTR_CXGBE, "%s: tid %u hlen %u seq %u status %u", __func__, tid, + hlen, be32toh(cpl->seq), cpl->status); +#endif + + /* Save TCP sequence number and CPL status. 
*/ + m->m_pkthdr.nvmf_tcp_seq = be32toh(cpl->seq); + m->m_pkthdr.nvmf_cpl_status = cpl->status; + + SOCKBUF_LOCK(&so->so_rcv); + empty = mbufq_len(&qp->rx_pdus) == 0; + mbufq_enqueue(&qp->rx_pdus, m); + SOCKBUF_UNLOCK(&so->so_rcv); + INP_WUNLOCK(inp); + if (empty) + cv_signal(&qp->rx_cv); + return (0); +} + +static uint16_t +che_alloc_fl_cid(struct nvmf_che_qpair *qp, uint16_t original_cid) +{ + uint16_t new_cid; + + mtx_lock(&qp->fl_cid_lock); + new_cid = FL_CID_FINDFREE_AT(qp->fl_cid_set, qp->next_cid); + if (new_cid == 0) { + new_cid = FL_CID_FINDFREE_AT(qp->fl_cid_set, 0); + MPASS(new_cid != 0); + } + new_cid--; + FL_CID_BUSY(new_cid, qp->fl_cid_set); + if (new_cid == CHE_MAX_FL_TAG) + qp->next_cid = 0; + else + qp->next_cid = new_cid + 1; + qp->fl_cids[new_cid] = original_cid; + mtx_unlock(&qp->fl_cid_lock); + + return (new_cid | CHE_FL_TAG_MASK); +} + +static uint16_t +che_alloc_ddp_cid(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb) +{ + mtx_assert(&qp->rx_buffers.lock, MA_OWNED); + + return (che_alloc_ddp_tag(qp, cb)); +} + +static struct mbuf * +che_command_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc) +{ + struct nvmf_capsule *nc = &cc->nc; + struct nvmf_che_command_buffer *cb; + struct nvme_sgl_descriptor *sgl; + struct nvme_tcp_cmd cmd; + struct mbuf *top, *m; + uint16_t cid; + bool use_icd; + + use_icd = false; + cb = NULL; + m = NULL; + + if (nc->nc_data.io_len != 0) { + cb = che_alloc_command_buffer(qp, &nc->nc_data, 0, + nc->nc_data.io_len, nc->nc_sqe.cid); + cb->original_cid = nc->nc_sqe.cid; + + if (nc->nc_send_data && nc->nc_data.io_len <= qp->max_icd) { + cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid); + use_icd = true; + m = nvmf_che_command_buffer_mbuf(cb, 0, + nc->nc_data.io_len, NULL, false); + cb->data_xfered = nc->nc_data.io_len; + che_release_command_buffer(cb); + } else if (nc->nc_send_data) { + cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid); + cb->cid = htole16(cid); + mtx_lock(&qp->tx_buffers.lock); + che_add_command_buffer(&qp->tx_buffers, cb); + mtx_unlock(&qp->tx_buffers.lock); + } else { + mtx_lock(&qp->rx_buffers.lock); + cid = che_alloc_ddp_cid(qp, cb); + if (cid == CHE_DDP_NO_TAG) { + cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid); + che_add_command_buffer(&qp->rx_buffers, cb); + } + cb->cid = htole16(cid); + mtx_unlock(&qp->rx_buffers.lock); + } + } else + cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid); + +#ifdef VERBOSE_TRACES + CTR(KTR_CXGBE, "%s: tid %u allocated cid 0x%04x for 0x%04x", __func__, + qp->toep->tid, cid, nc->nc_sqe.cid); +#endif + memset(&cmd, 0, sizeof(cmd)); + cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD; + cmd.ccsqe = nc->nc_sqe; + cmd.ccsqe.cid = htole16(cid); + + /* Populate SGL in SQE. */ + sgl = &cmd.ccsqe.sgl; + memset(sgl, 0, sizeof(*sgl)); + sgl->address = 0; + sgl->length = htole32(nc->nc_data.io_len); + if (use_icd) { + /* Use in-capsule data. */ + sgl->type = NVME_SGL_TYPE_ICD; + } else { + /* Use a command buffer. */ + sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER; + } + + top = nvmf_che_construct_pdu(qp, &cmd, sizeof(cmd), m, m != NULL ? 
+ nc->nc_data.io_len : 0); + return (top); +} + +static struct mbuf * +che_response_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc) +{ + struct nvmf_capsule *nc = &cc->nc; + struct nvme_tcp_rsp rsp; + + memset(&rsp, 0, sizeof(rsp)); + rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP; + rsp.rccqe = nc->nc_cqe; + + return (nvmf_che_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0)); +} + +static struct mbuf * +capsule_to_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc) +{ + if (cc->nc.nc_qe_len == sizeof(struct nvme_command)) + return (che_command_pdu(qp, cc)); + else + return (che_response_pdu(qp, cc)); +} + +static void +nvmf_che_send(void *arg) +{ + struct nvmf_che_qpair *qp = arg; + struct nvmf_che_capsule *cc; + struct socket *so = qp->so; + struct mbuf *m; + int error; + + m = NULL; + SOCKBUF_LOCK(&so->so_snd); + while (!qp->tx_shutdown) { + if (so->so_error != 0) { + error = so->so_error; + SOCKBUF_UNLOCK(&so->so_snd); + m_freem(m); + nvmf_qpair_error(&qp->qp, error); + SOCKBUF_LOCK(&so->so_snd); + while (!qp->tx_shutdown) + cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd)); + break; + } + + if (STAILQ_EMPTY(&qp->tx_capsules)) { + cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd)); + continue; + } + + /* Convert a capsule into a PDU. */ + cc = STAILQ_FIRST(&qp->tx_capsules); + STAILQ_REMOVE_HEAD(&qp->tx_capsules, link); + SOCKBUF_UNLOCK(&so->so_snd); + + m = capsule_to_pdu(qp, cc); + che_release_capsule(cc); + + nvmf_che_write_pdu(qp, m); + + SOCKBUF_LOCK(&so->so_snd); + } + SOCKBUF_UNLOCK(&so->so_snd); + kthread_exit(); +} + +static int +nvmf_che_setsockopt(struct socket *so, u_int sspace, u_int rspace) +{ + struct sockopt opt; + int error, one = 1; + + /* Don't lower the buffer sizes, just enforce a minimum. */ + SOCKBUF_LOCK(&so->so_snd); + if (sspace < so->so_snd.sb_hiwat) + sspace = so->so_snd.sb_hiwat; + SOCKBUF_UNLOCK(&so->so_snd); + SOCKBUF_LOCK(&so->so_rcv); + if (rspace < so->so_rcv.sb_hiwat) + rspace = so->so_rcv.sb_hiwat; + SOCKBUF_UNLOCK(&so->so_rcv); + + error = soreserve(so, sspace, rspace); + if (error != 0) + return (error); + SOCKBUF_LOCK(&so->so_snd); + so->so_snd.sb_flags |= SB_AUTOSIZE; + SOCKBUF_UNLOCK(&so->so_snd); + SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_flags |= SB_AUTOSIZE; + SOCKBUF_UNLOCK(&so->so_rcv); + + /* + * Disable Nagle. + */ + bzero(&opt, sizeof(opt)); + opt.sopt_dir = SOPT_SET; + opt.sopt_level = IPPROTO_TCP; + opt.sopt_name = TCP_NODELAY; + opt.sopt_val = &one; + opt.sopt_valsize = sizeof(one); + error = sosetopt(so, &opt); + if (error != 0) + return (error); + + return (0); +} + +static void +t4_nvme_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, + uint64_t val) +{ + struct adapter *sc = td_adapter(toep->td); + + t4_set_tcb_field(sc, &toep->ofld_txq->wrq, toep, word, mask, val, 0, 0); +} + +static void +set_ulp_mode_nvme(struct toepcb *toep, u_int ulp_submode, uint8_t rxpda) +{ + uint64_t val; + + CTR(KTR_CXGBE, "%s: tid %u, ULP_MODE_NVMET, submode=%#x, rxpda=%u", + __func__, toep->tid, ulp_submode, rxpda); + + val = V_TCB_ULP_TYPE(ULP_MODE_NVMET) | V_TCB_ULP_RAW(ulp_submode); + t4_nvme_set_tcb_field(toep, W_TCB_ULP_TYPE, + V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW), val); + + val = V_TF_RX_FLOW_CONTROL_DISABLE(1ULL); + t4_nvme_set_tcb_field(toep, W_TCB_T_FLAGS, val, val); + + val = V_TCB_RSVD((rxpda / 4) - 1); + t4_nvme_set_tcb_field(toep, W_TCB_RSVD, V_TCB_RSVD(M_TCB_RSVD), val); + + /* 0 disables CPL_NVMT_CMP_IMM which is not useful in this driver. 
*/ + val = 0; + t4_nvme_set_tcb_field(toep, W_TCB_CMP_IMM_SZ, + V_TCB_CMP_IMM_SZ(M_TCB_CMP_IMM_SZ), val); +} + +static u_int +pdu_max_data_len(const nvlist_t *nvl, u_int max_pdu_len, u_int hlen, + uint8_t pda) +{ + u_int max_data_len; + + if (nvlist_get_bool(nvl, "header_digests")) + hlen += sizeof(uint32_t); + hlen = roundup(hlen, pda); + max_data_len = max_pdu_len - hlen; + if (nvlist_get_bool(nvl, "data_digests")) + max_data_len -= sizeof(uint32_t); + return (max_data_len); +} + +static struct nvmf_qpair * +che_allocate_qpair(bool controller, const nvlist_t *nvl) +{ + struct nvmf_che_adapter *nca; + struct nvmf_che_qpair *qp; + struct adapter *sc; + struct file *fp; + struct socket *so; + struct inpcb *inp; + struct tcpcb *tp; + struct toepcb *toep; + cap_rights_t rights; + u_int max_tx_pdu_len, num_ddp_tags; + int error, ulp_submode; + + if (!nvlist_exists_number(nvl, "fd") || + !nvlist_exists_number(nvl, "rxpda") || + !nvlist_exists_number(nvl, "txpda") || + !nvlist_exists_bool(nvl, "header_digests") || + !nvlist_exists_bool(nvl, "data_digests") || + !nvlist_exists_number(nvl, "maxr2t") || + !nvlist_exists_number(nvl, "maxh2cdata") || + !nvlist_exists_number(nvl, "max_icd")) + return (NULL); + + error = fget(curthread, nvlist_get_number(nvl, "fd"), + cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp); + if (error != 0) + return (NULL); + if (fp->f_type != DTYPE_SOCKET) { + fdrop(fp, curthread); + return (NULL); + } + so = fp->f_data; + if (so->so_type != SOCK_STREAM || + so->so_proto->pr_protocol != IPPROTO_TCP) { + fdrop(fp, curthread); + return (NULL); + } + + sc = find_offload_adapter(so); + if (sc == NULL) { + fdrop(fp, curthread); + return (NULL); + } + nca = sc->nvme_ulp_softc; + + /* + * Controller: Require advertised MAXH2CDATA to be small + * enough. + */ + if (controller) { + u_int max_rx_data; + + max_rx_data = pdu_max_data_len(nvl, nca->max_receive_pdu, + sizeof(struct nvme_tcp_h2c_data_hdr), + nvlist_get_number(nvl, "rxpda")); + if (nvlist_get_number(nvl, "maxh2cdata") > max_rx_data) { + fdrop(fp, curthread); + return (NULL); + } + } + + /* + * Host: Require the queue size to be small enough that all of + * the command ids allocated by nvmf(4) will fit in the + * unallocated range. + * + * XXX: Alternatively this driver could just queue commands + * when an unallocated ID isn't available. + */ + if (!controller) { + u_int num_commands; + + num_commands = nvlist_get_number(nvl, "qsize") - 1; + if (nvlist_get_bool(nvl, "admin")) + num_commands += 8; /* Max AER */ + if (num_commands > CHE_NUM_FL_TAGS) { + fdrop(fp, curthread); + return (NULL); + } + } + + qp = malloc(sizeof(*qp), M_NVMF_CHE, M_WAITOK | M_ZERO); + qp->txpda = nvlist_get_number(nvl, "txpda"); + qp->rxpda = nvlist_get_number(nvl, "rxpda"); + qp->header_digests = nvlist_get_bool(nvl, "header_digests"); + qp->data_digests = nvlist_get_bool(nvl, "data_digests"); + qp->maxr2t = nvlist_get_number(nvl, "maxr2t"); + if (controller) + qp->maxh2cdata = nvlist_get_number(nvl, "maxh2cdata"); + + if (controller) { + /* NB: maxr2t is 0's based. 
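Each command may have up to maxr2t + 1 R2Ts outstanding, which bounds the number of freelist transfer tags this queue pair can require.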
*/ + qp->num_fl_ttags = MIN(CHE_NUM_FL_TAGS, + nvlist_get_number(nvl, "qsize") * + ((uint64_t)qp->maxr2t + 1)); + qp->open_fl_ttags = mallocarray(qp->num_fl_ttags, + sizeof(*qp->open_fl_ttags), M_NVMF_CHE, M_WAITOK | M_ZERO); + } else { + qp->fl_cids = mallocarray(CHE_NUM_FL_TAGS, + sizeof(*qp->fl_cids), M_NVMF_CHE, M_WAITOK | M_ZERO); + qp->fl_cid_set = malloc(sizeof(*qp->fl_cid_set), M_NVMF_CHE, + M_WAITOK); + FL_CID_INIT(qp->fl_cid_set); + mtx_init(&qp->fl_cid_lock, "nvmf/che fl cids", NULL, MTX_DEF); + } + + inp = sotoinpcb(so); + INP_WLOCK(inp); + tp = intotcpcb(inp); + if (inp->inp_flags & INP_DROPPED) { + INP_WUNLOCK(inp); + free(qp->fl_cid_set, M_NVMF_CHE); + free(qp->fl_cids, M_NVMF_CHE); + free(qp->open_fl_ttags, M_NVMF_CHE); + free(qp, M_NVMF_CHE); + fdrop(fp, curthread); + return (NULL); + } + + MPASS(tp->t_flags & TF_TOE); + MPASS(tp->tod != NULL); + MPASS(tp->t_toe != NULL); + toep = tp->t_toe; + MPASS(toep->vi->adapter == sc); + + if (ulp_mode(toep) != ULP_MODE_NONE) { + INP_WUNLOCK(inp); + free(qp->fl_cid_set, M_NVMF_CHE); + free(qp->fl_cids, M_NVMF_CHE); + free(qp->open_fl_ttags, M_NVMF_CHE); + free(qp, M_NVMF_CHE); + fdrop(fp, curthread); + return (NULL); + } + + /* Claim socket from file descriptor. */ + fp->f_ops = &badfileops; + fp->f_data = NULL; + + qp->so = so; + qp->toep = toep; + qp->nca = nca; + refcount_init(&qp->refs, 1); + + /* NB: C2H and H2C headers are the same size. */ + qp->max_rx_data = pdu_max_data_len(nvl, nca->max_receive_pdu, + sizeof(struct nvme_tcp_c2h_data_hdr), qp->rxpda); + qp->max_tx_data = pdu_max_data_len(nvl, nca->max_transmit_pdu, + sizeof(struct nvme_tcp_c2h_data_hdr), qp->txpda); + if (!controller) { + qp->max_tx_data = min(qp->max_tx_data, + nvlist_get_number(nvl, "maxh2cdata")); + qp->max_icd = min(nvlist_get_number(nvl, "max_icd"), + pdu_max_data_len(nvl, nca->max_transmit_pdu, + sizeof(struct nvme_tcp_cmd), qp->txpda)); + } else { + /* + * IOCCSZ represents the size of a logical command + * capsule including the 64 byte SQE and the + * in-capsule data. Use pdu_max_data_len to compute + * the maximum supported ICD length. + */ + qp->max_ioccsz = rounddown(pdu_max_data_len(nvl, + nca->max_receive_pdu, sizeof(struct nvme_tcp_cmd), + qp->rxpda), 16) + sizeof(struct nvme_command); + } + + ulp_submode = 0; + if (qp->header_digests) + ulp_submode |= FW_NVMET_ULPSUBMODE_HCRC; + if (qp->data_digests) + ulp_submode |= FW_NVMET_ULPSUBMODE_DCRC; + if (!controller) + ulp_submode |= FW_NVMET_ULPSUBMODE_ING_DIR; + + max_tx_pdu_len = sizeof(struct nvme_tcp_h2c_data_hdr); + if (qp->header_digests) + max_tx_pdu_len += sizeof(uint32_t); + max_tx_pdu_len = roundup(max_tx_pdu_len, qp->txpda); + max_tx_pdu_len += qp->max_tx_data; + if (qp->data_digests) + max_tx_pdu_len += sizeof(uint32_t); + + /* TODO: ISO limits */ + + if (controller) { + /* Use the SUCCESS flag if SQ flow control is disabled. 
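The flag is set on the final C2H_DATA PDU of a transfer in che_send_controller_data(), allowing the separate response capsule to be elided.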
*/ + qp->send_success = !nvlist_get_bool(nvl, "sq_flow_control"); + } + + toep->params.ulp_mode = ULP_MODE_NVMET; + toep->ulpcb = qp; + + send_txdataplen_max_flowc_wr(sc, toep, + roundup(/* max_iso_pdus * */ max_tx_pdu_len, tp->t_maxseg)); + set_ulp_mode_nvme(toep, ulp_submode, qp->rxpda); + INP_WUNLOCK(inp); + + fdrop(fp, curthread); + + error = nvmf_che_setsockopt(so, max_tx_pdu_len, nca->max_receive_pdu); + if (error != 0) { + free(qp->fl_cid_set, M_NVMF_CHE); + free(qp->fl_cids, M_NVMF_CHE); + free(qp->open_fl_ttags, M_NVMF_CHE); + free(qp, M_NVMF_CHE); + return (NULL); + } + + num_ddp_tags = ddp_tags_per_qp; + if (num_ddp_tags > 0) { + qp->tpt_offset = t4_stag_alloc(sc, num_ddp_tags); + if (qp->tpt_offset != T4_STAG_UNSET) { +#ifdef VERBOSE_TRACES + CTR(KTR_CXGBE, + "%s: tid %u using %u tags at offset 0x%x", + __func__, toep->tid, num_ddp_tags, qp->tpt_offset); +#endif + qp->num_ddp_tags = num_ddp_tags; + qp->open_ddp_tags = mallocarray(qp->num_ddp_tags, + sizeof(*qp->open_ddp_tags), M_NVMF_CHE, M_WAITOK | + M_ZERO); + + t4_nvme_set_tcb_field(toep, W_TCB_TPT_OFFSET, + M_TCB_TPT_OFFSET, V_TCB_TPT_OFFSET(qp->tpt_offset)); + } + } + + TAILQ_INIT(&qp->rx_buffers.head); + TAILQ_INIT(&qp->tx_buffers.head); + mtx_init(&qp->rx_buffers.lock, "nvmf/che rx buffers", NULL, MTX_DEF); + mtx_init(&qp->tx_buffers.lock, "nvmf/che tx buffers", NULL, MTX_DEF); + + cv_init(&qp->rx_cv, "-"); + cv_init(&qp->tx_cv, "-"); + mbufq_init(&qp->rx_data, 0); + mbufq_init(&qp->rx_pdus, 0); + STAILQ_INIT(&qp->tx_capsules); + + /* Register socket upcall for receive to handle remote FIN. */ + SOCKBUF_LOCK(&so->so_rcv); + soupcall_set(so, SO_RCV, nvmf_che_soupcall_receive, qp); + SOCKBUF_UNLOCK(&so->so_rcv); + + /* Spin up kthreads. */ + error = kthread_add(nvmf_che_receive, qp, NULL, &qp->rx_thread, 0, 0, + "nvmef che rx"); + if (error != 0) { + che_free_qpair(&qp->qp); + return (NULL); + } + error = kthread_add(nvmf_che_send, qp, NULL, &qp->tx_thread, 0, 0, + "nvmef che tx"); + if (error != 0) { + che_free_qpair(&qp->qp); + return (NULL); + } + + return (&qp->qp); +} + +static void +che_release_qpair(struct nvmf_che_qpair *qp) +{ + if (refcount_release(&qp->refs)) + free(qp, M_NVMF_CHE); +} + +static void +che_free_qpair(struct nvmf_qpair *nq) +{ + struct nvmf_che_qpair *qp = CQP(nq); + struct nvmf_che_command_buffer *ncb, *cb; + struct nvmf_che_capsule *ncc, *cc; + struct socket *so = qp->so; + struct toepcb *toep = qp->toep; + struct inpcb *inp = sotoinpcb(so); + + /* Shut down kthreads. 
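Each thread notices its shutdown flag under the socket buffer lock and calls kthread_exit(), whose wakeup on the thread pointer terminates the mtx_sleep() below.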
*/ + SOCKBUF_LOCK(&so->so_snd); + qp->tx_shutdown = true; + if (qp->tx_thread != NULL) { + cv_signal(&qp->tx_cv); + mtx_sleep(qp->tx_thread, SOCKBUF_MTX(&so->so_snd), 0, + "nvchetx", 0); + } + SOCKBUF_UNLOCK(&so->so_snd); + + SOCKBUF_LOCK(&so->so_rcv); + qp->rx_shutdown = true; + if (qp->rx_thread != NULL) { + cv_signal(&qp->rx_cv); + mtx_sleep(qp->rx_thread, SOCKBUF_MTX(&so->so_rcv), 0, + "nvcherx", 0); + } + soupcall_clear(so, SO_RCV); + SOCKBUF_UNLOCK(&so->so_rcv); + mbufq_drain(&qp->rx_data); + mbufq_drain(&qp->rx_pdus); + + STAILQ_FOREACH_SAFE(cc, &qp->tx_capsules, link, ncc) { + nvmf_abort_capsule_data(&cc->nc, ECONNABORTED); + che_release_capsule(cc); + } + + cv_destroy(&qp->tx_cv); + cv_destroy(&qp->rx_cv); + + if (qp->open_fl_ttags != NULL) { + for (u_int i = 0; i < qp->num_fl_ttags; i++) { + cb = qp->open_fl_ttags[i]; + if (cb != NULL) { + cb->cc->active_r2ts--; + cb->error = ECONNABORTED; + che_release_command_buffer(cb); + } + } + free(qp->open_fl_ttags, M_NVMF_CHE); + } + if (qp->num_ddp_tags != 0) { + for (u_int i = 0; i < qp->num_ddp_tags; i++) { + cb = qp->open_ddp_tags[i]; + if (cb != NULL) { + if (cb->cc != NULL) + cb->cc->active_r2ts--; + cb->error = ECONNABORTED; + mtx_lock(&qp->rx_buffers.lock); + che_free_ddp_tag(qp, cb, cb->ttag); + mtx_unlock(&qp->rx_buffers.lock); + che_release_command_buffer(cb); + } + } + free(qp->open_ddp_tags, M_NVMF_CHE); + } + + mtx_lock(&qp->rx_buffers.lock); + TAILQ_FOREACH_SAFE(cb, &qp->rx_buffers.head, link, ncb) { + che_remove_command_buffer(&qp->rx_buffers, cb); + mtx_unlock(&qp->rx_buffers.lock); +#ifdef INVARIANTS + if (cb->cc != NULL) + cb->cc->pending_r2ts--; +#endif + cb->error = ECONNABORTED; + che_release_command_buffer(cb); + mtx_lock(&qp->rx_buffers.lock); + } + mtx_destroy(&qp->rx_buffers.lock); + + mtx_lock(&qp->tx_buffers.lock); + TAILQ_FOREACH_SAFE(cb, &qp->tx_buffers.head, link, ncb) { + che_remove_command_buffer(&qp->tx_buffers, cb); + mtx_unlock(&qp->tx_buffers.lock); + cb->error = ECONNABORTED; + che_release_command_buffer(cb); + mtx_lock(&qp->tx_buffers.lock); + } + mtx_destroy(&qp->tx_buffers.lock); + + if (qp->num_ddp_tags != 0) + t4_stag_free(qp->nca->sc, qp->tpt_offset, qp->num_ddp_tags); + + if (!qp->qp.nq_controller) { + free(qp->fl_cids, M_NVMF_CHE); + free(qp->fl_cid_set, M_NVMF_CHE); + mtx_destroy(&qp->fl_cid_lock); + } + + INP_WLOCK(inp); + toep->ulpcb = NULL; + mbufq_drain(&toep->ulp_pduq); + + /* + * Grab a reference to use when waiting for the final CPL to + * be received. If toep->inp is NULL, then + * final_cpl_received() has already been called (e.g. due to + * the peer sending a RST). + */ + if (toep->inp != NULL) { + toep = hold_toepcb(toep); + toep->flags |= TPF_WAITING_FOR_FINAL; + } else + toep = NULL; + INP_WUNLOCK(inp); + + soclose(so); + + /* + * Wait for the socket to fully close. This ensures any + * pending received data has been received (and in particular, + * any data that would be received by DDP has been handled). + */ + if (toep != NULL) { + struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep); + + mtx_lock(lock); + while ((toep->flags & TPF_WAITING_FOR_FINAL) != 0) + mtx_sleep(toep, lock, PSOCK, "conclo2", 0); + mtx_unlock(lock); + free_toepcb(toep); + } + + che_release_qpair(qp); +} + +static uint32_t +che_max_ioccsz(struct nvmf_qpair *nq) +{ + struct nvmf_che_qpair *qp = CQP(nq); + + /* + * Limit the command capsule size so that with maximum ICD it + * fits within the limit of the largest PDU the adapter can + * receive. 
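+ * max_ioccsz was derived from max_receive_pdu when the queue pair was created.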
+ */ + return (qp->max_ioccsz); +} + +static uint64_t +che_max_xfer_size(struct nvmf_qpair *nq) +{ + struct nvmf_che_qpair *qp = CQP(nq); + + /* + * Limit host transfers to the size of the data payload in the + * largest PDU the adapter can receive. + */ + return (qp->max_rx_data); +} + +static struct nvmf_capsule * +che_allocate_capsule(struct nvmf_qpair *nq, int how) +{ + struct nvmf_che_qpair *qp = CQP(nq); + struct nvmf_che_capsule *cc; + + cc = malloc(sizeof(*cc), M_NVMF_CHE, how | M_ZERO); + if (cc == NULL) + return (NULL); + refcount_init(&cc->refs, 1); + refcount_acquire(&qp->refs); + return (&cc->nc); +} + +static void +che_release_capsule(struct nvmf_che_capsule *cc) +{ + struct nvmf_che_qpair *qp = CQP(cc->nc.nc_qpair); + + if (!refcount_release(&cc->refs)) + return; + + MPASS(cc->active_r2ts == 0); + MPASS(cc->pending_r2ts == 0); + + nvmf_che_free_pdu(&cc->rx_pdu); + free(cc, M_NVMF_CHE); + che_release_qpair(qp); +} + +static void +che_free_capsule(struct nvmf_capsule *nc) +{ + che_release_capsule(CCAP(nc)); +} + +static int +che_transmit_capsule(struct nvmf_capsule *nc) +{ + struct nvmf_che_qpair *qp = CQP(nc->nc_qpair); + struct nvmf_che_capsule *cc = CCAP(nc); + struct socket *so = qp->so; + + refcount_acquire(&cc->refs); + SOCKBUF_LOCK(&so->so_snd); + STAILQ_INSERT_TAIL(&qp->tx_capsules, cc, link); + cv_signal(&qp->tx_cv); + SOCKBUF_UNLOCK(&so->so_snd); + return (0); +} + +static uint8_t +che_validate_command_capsule(struct nvmf_capsule *nc) +{ + struct nvmf_che_capsule *cc = CCAP(nc); + struct nvme_sgl_descriptor *sgl; + + KASSERT(cc->rx_pdu.hdr != NULL, ("capsule wasn't received")); + + sgl = &nc->nc_sqe.sgl; + switch (sgl->type) { + case NVME_SGL_TYPE_ICD: + if (cc->rx_pdu.data_len != le32toh(sgl->length)) { + printf("NVMe/TCP: Command Capsule with mismatched ICD length\n"); + return (NVME_SC_DATA_SGL_LENGTH_INVALID); + } + break; + case NVME_SGL_TYPE_COMMAND_BUFFER: + if (cc->rx_pdu.data_len != 0) { + printf("NVMe/TCP: Command Buffer SGL with ICD\n"); + return (NVME_SC_INVALID_FIELD); + } + break; + default: + printf("NVMe/TCP: Invalid SGL type in Command Capsule\n"); + return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID); + } + + if (sgl->address != 0) { + printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n"); + return (NVME_SC_SGL_OFFSET_INVALID); + } + + return (NVME_SC_SUCCESS); +} + +static size_t +che_capsule_data_len(const struct nvmf_capsule *nc) +{ + MPASS(nc->nc_qe_len == sizeof(struct nvme_command)); + return (le32toh(nc->nc_sqe.sgl.length)); +} + +static void +che_receive_r2t_data(struct nvmf_capsule *nc, uint32_t data_offset, + struct nvmf_io_request *io) +{ + struct nvmf_che_qpair *qp = CQP(nc->nc_qpair); + struct nvmf_che_capsule *cc = CCAP(nc); + struct nvmf_che_command_buffer *cb; + + cb = che_alloc_command_buffer(qp, io, data_offset, io->io_len, + nc->nc_sqe.cid); + + cb->cc = cc; + refcount_acquire(&cc->refs); + + /* + * If this command has too many active R2Ts or there are no + * available transfer tags, queue the request for later. + * + * NB: maxr2t is 0's based. 
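+ * More than maxr2t active R2Ts means the per-command limit of maxr2t + 1 is already in use.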
+ */ + mtx_lock(&qp->rx_buffers.lock); + if (cc->active_r2ts > qp->maxr2t || + !nvmf_che_allocate_ttag(qp, cb)) { +#ifdef INVARIANTS + cc->pending_r2ts++; +#endif + TAILQ_INSERT_TAIL(&qp->rx_buffers.head, cb, link); + mtx_unlock(&qp->rx_buffers.lock); + return; + } + mtx_unlock(&qp->rx_buffers.lock); + + che_send_r2t(qp, nc->nc_sqe.cid, cb->ttag, data_offset, io->io_len); +} + +static void +che_receive_icd_data(struct nvmf_capsule *nc, uint32_t data_offset, + struct nvmf_io_request *io) +{ + struct nvmf_che_capsule *cc = CCAP(nc); + + /* + * The header is in rx_pdu.m, the padding is discarded, and + * the data starts at rx_pdu.m->m_next. + */ + mbuf_copyto_io(cc->rx_pdu.m->m_next, data_offset, io->io_len, io, 0); + nvmf_complete_io_request(io, io->io_len, 0); +} + +static int +che_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, + struct nvmf_io_request *io) +{ + struct nvme_sgl_descriptor *sgl; + size_t data_len; + + if (nc->nc_qe_len != sizeof(struct nvme_command) || + !nc->nc_qpair->nq_controller) + return (EINVAL); + + sgl = &nc->nc_sqe.sgl; + data_len = le32toh(sgl->length); + if (data_offset + io->io_len > data_len) + return (EFBIG); + + if (sgl->type == NVME_SGL_TYPE_ICD) + che_receive_icd_data(nc, data_offset, io); + else + che_receive_r2t_data(nc, data_offset, io); + return (0); +} + +/* NB: cid is little-endian already. */ +static void +che_send_c2h_pdu(struct nvmf_che_qpair *qp, uint16_t cid, uint32_t data_offset, + struct mbuf *m, size_t len, bool last_pdu, bool success) +{ + struct nvme_tcp_c2h_data_hdr c2h; + struct mbuf *top; + + memset(&c2h, 0, sizeof(c2h)); + c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA; + if (last_pdu) + c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU; + if (success) + c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS; + c2h.cccid = cid; + c2h.datao = htole32(data_offset); + c2h.datal = htole32(len); + + top = nvmf_che_construct_pdu(qp, &c2h, sizeof(c2h), m, len); + nvmf_che_write_pdu(qp, top); +} + +static u_int +che_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, + struct mbuf *m, size_t len) +{ + struct nvmf_che_qpair *qp = CQP(nc->nc_qpair); + struct nvme_sgl_descriptor *sgl; + uint32_t data_len; + bool last_pdu, last_xfer; + + if (nc->nc_qe_len != sizeof(struct nvme_command) || + !qp->qp.nq_controller) { + m_freem(m); + return (NVME_SC_INVALID_FIELD); + } + + sgl = &nc->nc_sqe.sgl; + data_len = le32toh(sgl->length); + if (data_offset + len > data_len) { + m_freem(m); + return (NVME_SC_INVALID_FIELD); + } + last_xfer = (data_offset + len == data_len); + + if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) { + m_freem(m); + return (NVME_SC_INVALID_FIELD); + } + + KASSERT(data_offset == CCAP(nc)->tx_data_offset, + ("%s: starting data_offset %u doesn't match end of previous xfer %u", + __func__, data_offset, CCAP(nc)->tx_data_offset)); + + /* Queue one or more C2H_DATA PDUs containing the data from 'm'. 
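Each PDU carries at most max_tx_data bytes, produced either by m_split() of an oversized mbuf or by detaching the longest chain prefix that fits; only the final PDU of the last transfer sets LAST_PDU (and, if enabled, SUCCESS).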
*/ + while (m != NULL) { + struct mbuf *n; + uint32_t todo; + + if (m->m_len > qp->max_tx_data) { + n = m_split(m, qp->max_tx_data, M_WAITOK); + todo = m->m_len; + } else { + struct mbuf *p; + + todo = m->m_len; + p = m; + n = p->m_next; + while (n != NULL) { + if (todo + n->m_len > qp->max_tx_data) { + p->m_next = NULL; + break; + } + todo += n->m_len; + p = n; + n = p->m_next; + } + MPASS(m_length(m, NULL) == todo); + } + + last_pdu = (n == NULL && last_xfer); + che_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, m, todo, + last_pdu, last_pdu && qp->send_success); + + data_offset += todo; + data_len -= todo; + m = n; + } + MPASS(data_len == 0); + +#ifdef INVARIANTS + CCAP(nc)->tx_data_offset = data_offset; +#endif + if (!last_xfer) + return (NVMF_MORE); + else if (qp->send_success) + return (NVMF_SUCCESS_SENT); + else + return (NVME_SC_SUCCESS); +} + +struct nvmf_transport_ops che_ops = { + .allocate_qpair = che_allocate_qpair, + .free_qpair = che_free_qpair, + .max_ioccsz = che_max_ioccsz, + .max_xfer_size = che_max_xfer_size, + .allocate_capsule = che_allocate_capsule, + .free_capsule = che_free_capsule, + .transmit_capsule = che_transmit_capsule, + .validate_command_capsule = che_validate_command_capsule, + .capsule_data_len = che_capsule_data_len, + .receive_controller_data = che_receive_controller_data, + .send_controller_data = che_send_controller_data, + .trtype = NVMF_TRTYPE_TCP, + .priority = 10, +}; + +NVMF_TRANSPORT(che, che_ops); + +static void +read_pdu_limits(struct adapter *sc, u_int *max_tx_pdu_len, + uint32_t *max_rx_pdu_len) +{ + uint32_t tx_len, rx_len, r, v; + + /* Copied from cxgbei, but not sure if this is correct. */ + rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE); + tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE); + + r = t4_read_reg(sc, A_TP_PARA_REG2); + rx_len = min(rx_len, G_MAXRXDATA(r)); + tx_len = min(tx_len, G_MAXRXDATA(r)); + + r = t4_read_reg(sc, A_TP_PARA_REG7); + v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r)); + rx_len = min(rx_len, v); + tx_len = min(tx_len, v); + + /* Cannot be larger than 32KB - 256. 
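(32768 - 256 = 32512 bytes.)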
*/ + rx_len = min(rx_len, 32512); + tx_len = min(tx_len, 32512); + + *max_tx_pdu_len = tx_len; + *max_rx_pdu_len = rx_len; +} + +static int +nvmf_che_init(struct adapter *sc, struct nvmf_che_adapter *nca) +{ + struct sysctl_oid *oid; + struct sysctl_oid_list *children; + uint32_t val; + + read_pdu_limits(sc, &nca->max_transmit_pdu, &nca->max_receive_pdu); + if (nca->max_transmit_pdu > che_max_transmit_pdu) + nca->max_transmit_pdu = che_max_transmit_pdu; + if (nca->max_receive_pdu > che_max_receive_pdu) + nca->max_receive_pdu = che_max_receive_pdu; + val = t4_read_reg(sc, A_SGE_CONTROL2); + nca->nvmt_data_iqe = (val & F_RXCPLMODE_NVMT) != 0; + + sysctl_ctx_init(&nca->ctx); + oid = device_get_sysctl_tree(sc->dev); /* dev.che.X */ + children = SYSCTL_CHILDREN(oid); + + oid = SYSCTL_ADD_NODE(&nca->ctx, children, OID_AUTO, "nvme", + CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "NVMe ULP settings"); + children = SYSCTL_CHILDREN(oid); + + nca->ddp_threshold = 8192; + SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "ddp_threshold", + CTLFLAG_RW, &nca->ddp_threshold, 0, "Rx zero copy threshold"); + + SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "max_transmit_pdu", + CTLFLAG_RW, &nca->max_transmit_pdu, 0, + "Maximum size of a transmitted PDU"); + + SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "max_receive_pdu", + CTLFLAG_RW, &nca->max_receive_pdu, 0, + "Maximum size of a received PDU"); + + return (0); +} + +static void +nvmf_che_destroy(struct nvmf_che_adapter *nca) +{ + sysctl_ctx_free(&nca->ctx); + free(nca, M_CXGBE); +} + +static int +nvmf_che_activate(struct adapter *sc) +{ + struct nvmf_che_adapter *nca; + int rc; + + ASSERT_SYNCHRONIZED_OP(sc); + + if (uld_active(sc, ULD_NVME)) { + KASSERT(0, ("%s: NVMe offload already enabled on adapter %p", + __func__, sc)); + return (0); + } + + if ((sc->nvmecaps & FW_CAPS_CONFIG_NVME_TCP) == 0) { + device_printf(sc->dev, + "not NVMe offload capable, or capability disabled\n"); + return (ENOSYS); + } + + /* per-adapter softc for NVMe */ + nca = malloc(sizeof(*nca), M_CXGBE, M_ZERO | M_WAITOK); + nca->sc = sc; + + rc = nvmf_che_init(sc, nca); + if (rc != 0) { + free(nca, M_CXGBE); + return (rc); + } + + sc->nvme_ulp_softc = nca; + + return (0); +} + +static int +nvmf_che_deactivate(struct adapter *sc) +{ + struct nvmf_che_adapter *nca = sc->nvme_ulp_softc; + + ASSERT_SYNCHRONIZED_OP(sc); + + if (nca != NULL) { + nvmf_che_destroy(nca); + sc->nvme_ulp_softc = NULL; + } + + return (0); +} + +static void +nvmf_che_activate_all(struct adapter *sc, void *arg __unused) +{ + if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t7nvact") != 0) + return; + + /* Activate NVMe if any port on this adapter has IFCAP_TOE enabled. 
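A nonzero offload_map means at least one port currently has TOE enabled.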
*/ + if (sc->offload_map && !uld_active(sc, ULD_NVME)) + (void) t4_activate_uld(sc, ULD_NVME); + + end_synchronized_op(sc, 0); +} + +static void +nvmf_che_deactivate_all(struct adapter *sc, void *arg __unused) +{ + if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t7nvdea") != 0) + return; + + if (uld_active(sc, ULD_NVME)) + (void) t4_deactivate_uld(sc, ULD_NVME); + + end_synchronized_op(sc, 0); +} + +static struct uld_info nvmf_che_uld_info = { + .uld_activate = nvmf_che_activate, + .uld_deactivate = nvmf_che_deactivate, +}; + +static int +nvmf_che_mod_load(void) +{ + int rc; + + t4_register_cpl_handler(CPL_NVMT_CMP, do_nvmt_cmp); + t4_register_cpl_handler(CPL_NVMT_DATA, do_nvmt_data); + + rc = t4_register_uld(&nvmf_che_uld_info, ULD_NVME); + if (rc != 0) + return (rc); + + t4_iterate(nvmf_che_activate_all, NULL); + + return (rc); +} + +static int +nvmf_che_mod_unload(void) +{ + t4_iterate(nvmf_che_deactivate_all, NULL); + + if (t4_unregister_uld(&nvmf_che_uld_info, ULD_NVME) == EBUSY) + return (EBUSY); + + t4_register_cpl_handler(CPL_NVMT_CMP, NULL); + t4_register_cpl_handler(CPL_NVMT_DATA, NULL); + + return (0); +} +#endif + +static int +nvmf_che_modevent(module_t mod, int cmd, void *arg) +{ + int rc; + +#ifdef TCP_OFFLOAD + switch (cmd) { + case MOD_LOAD: + rc = nvmf_che_mod_load(); + break; + case MOD_UNLOAD: + rc = nvmf_che_mod_unload(); + break; + default: + rc = EOPNOTSUPP; + break; + } +#else + printf("nvmf_che: compiled without TCP_OFFLOAD support.\n"); + rc = EOPNOTSUPP; +#endif + + return (rc); +} + +static moduledata_t nvmf_che_mod = { + "nvmf_che", + nvmf_che_modevent, + NULL, +}; + +MODULE_VERSION(nvmf_che, 1); +DECLARE_MODULE(nvmf_che, nvmf_che_mod, SI_SUB_EXEC, SI_ORDER_ANY); +MODULE_DEPEND(nvmf_che, t4_tom, 1, 1, 1); +MODULE_DEPEND(nvmf_che, cxgbe, 1, 1, 1); |
