author	Navdeep Parhar <np@FreeBSD.org>	2015-12-26 06:05:21 +0000
committer	Navdeep Parhar <np@FreeBSD.org>	2015-12-26 06:05:21 +0000
commit	e3148e46b2746df4963c224c30e5c01d8948f502 (patch)
tree	3bd83734ffa9333162f6f5895531c4c1195f7de0 /sys/dev
parent	66e979f15cf27715691a5e318181853e68e54d83 (diff)
cxgbei: Hardware accelerated iSCSI target and initiator for TOE capable
cards supported by cxgbe(4).

On the host side this driver interfaces with the storage stack via the ICL
(iSCSI Common Layer) in the kernel.  On the wire the traffic is standard
iSCSI (SCSI over TCP as per RFC 3720/7143 etc.) that interoperates with all
other standards compliant implementations.

The driver is layered on top of the TOE driver (t4_tom) and promotes
connections being handled by t4_tom to iSCSI ULP (Upper Layer Protocol)
mode.  Hardware assistance in this mode includes:
- Full TCP processing.
- iSCSI PDU identification and recovery within the TCP stream.
- Header and/or data digest insertion (tx) and verification (rx).
- Zero copy (both tx and rx).

Man page will follow in a separate commit in a couple of weeks.

Relnotes:	Yes
Sponsored by:	Chelsio Communications
Notes:
	svn path=/head/; revision=292740
Diffstat (limited to 'sys/dev')
-rw-r--r--	sys/dev/cxgbe/cxgbei/cxgbei.c	1131
-rw-r--r--	sys/dev/cxgbe/cxgbei/cxgbei.h	167
-rw-r--r--	sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.c	417
-rw-r--r--	sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.h	217
-rw-r--r--	sys/dev/cxgbe/cxgbei/icl_cxgbei.c	896
5 files changed, 2828 insertions, 0 deletions
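
The header and data digests mentioned above are the standard iSCSI CRC32C digests (RFC 3720/7143); in ULP mode the hardware computes them on transmit and verifies them on receive. For reference, a minimal bitwise CRC32C using the reflected Castagnoli polynomial 0x82F63B78 is sketched below; this is an illustrative user-space snippet, not part of the commit.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Bitwise CRC32C (Castagnoli), reflected polynomial 0x82F63B78, the
 * algorithm used for iSCSI header/data digests.  Slow but easy to audit.
 */
static uint32_t
crc32c(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t crc = 0xffffffff;
	int i;

	while (len-- > 0) {
		crc ^= *p++;
		for (i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82f63b78 & -(crc & 1));
	}
	return (~crc);
}

int
main(void)
{
	/* The well-known CRC32C check value for "123456789" is 0xe3069283. */
	printf("0x%08x\n", crc32c("123456789", 9));
	return (0);
}
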
diff --git a/sys/dev/cxgbe/cxgbei/cxgbei.c b/sys/dev/cxgbe/cxgbei/cxgbei.c
new file mode 100644
index 000000000000..66ceb895f29d
--- /dev/null
+++ b/sys/dev/cxgbe/cxgbei/cxgbei.c
@@ -0,0 +1,1131 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ *
+ * Chelsio T5xx iSCSI driver
+ *
+ * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/systm.h>
+
+#ifdef TCP_OFFLOAD
+#include <sys/errno.h>
+#include <sys/kthread.h>
+#include <sys/smp.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/mbuf.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/condvar.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/toecore.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_fsm.h>
+
+#include <cam/scsi/scsi_all.h>
+#include <cam/scsi/scsi_da.h>
+#include <cam/ctl/ctl_io.h>
+#include <cam/ctl/ctl.h>
+#include <cam/ctl/ctl_backend.h>
+#include <cam/ctl/ctl_error.h>
+#include <cam/ctl/ctl_frontend.h>
+#include <cam/ctl/ctl_debug.h>
+#include <cam/ctl/ctl_ha.h>
+#include <cam/ctl/ctl_ioctl.h>
+
+#include <dev/iscsi/icl.h>
+#include <dev/iscsi/iscsi_proto.h>
+#include <dev/iscsi/iscsi_ioctl.h>
+#include <dev/iscsi/iscsi.h>
+#include <cam/ctl/ctl_frontend_iscsi.h>
+
+#include <cam/cam.h>
+#include <cam/cam_ccb.h>
+#include <cam/cam_xpt.h>
+#include <cam/cam_debug.h>
+#include <cam/cam_sim.h>
+#include <cam/cam_xpt_sim.h>
+#include <cam/cam_xpt_periph.h>
+#include <cam/cam_periph.h>
+#include <cam/cam_compat.h>
+#include <cam/scsi/scsi_message.h>
+
+#include "common/common.h"
+#include "common/t4_msg.h"
+#include "common/t4_regs.h" /* for PCIE_MEM_ACCESS */
+#include "tom/t4_tom.h"
+#include "cxgbei.h"
+#include "cxgbei_ulp2_ddp.h"
+
+static int worker_thread_count;
+static struct cxgbei_worker_thread_softc *cwt_softc;
+static struct proc *cxgbei_proc;
+
+/* XXXNP some header instead. */
+struct icl_pdu *icl_cxgbei_new_pdu(int);
+void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *);
+void icl_cxgbei_conn_pdu_free(struct icl_conn *, struct icl_pdu *);
+
+/*
+ * Direct Data Placement -
+ * Directly place the iSCSI Data-In or Data-Out PDU's payload into pre-posted
+ * final destination host-memory buffers based on the Initiator Task Tag (ITT)
+ * in Data-In or Target Task Tag (TTT) in Data-Out PDUs.
+ * The host memory address is programmed into h/w in the format of pagepod
+ * entries.
+ * The location of the pagepod entry is encoded into ddp tag which is used as
+ * the base for ITT/TTT.
+ */
+
+/*
+ * functions to program the pagepod in h/w
+ */
+static inline void
+ppod_set(struct pagepod *ppod,
+ struct cxgbei_ulp2_pagepod_hdr *hdr,
+ struct cxgbei_ulp2_gather_list *gl,
+ unsigned int pidx)
+{
+ int i;
+
+ memcpy(ppod, hdr, sizeof(*hdr));
+
+ for (i = 0; i < (PPOD_PAGES + 1); i++, pidx++) {
+ ppod->addr[i] = pidx < gl->nelem ?
+ cpu_to_be64(gl->dma_sg[pidx].phys_addr) : 0ULL;
+ }
+}
+
+static inline void
+ppod_clear(struct pagepod *ppod)
+{
+ memset(ppod, 0, sizeof(*ppod));
+}
+
+static inline void
+ulp_mem_io_set_hdr(struct adapter *sc, int tid, struct ulp_mem_io *req,
+ unsigned int wr_len, unsigned int dlen,
+ unsigned int pm_addr)
+{
+ struct ulptx_idata *idata = (struct ulptx_idata *)(req + 1);
+
+ INIT_ULPTX_WR(req, wr_len, 0, 0);
+ req->cmd = cpu_to_be32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) |
+ V_ULP_MEMIO_ORDER(is_t4(sc)) |
+ V_T5_ULP_MEMIO_IMM(is_t5(sc)));
+ req->dlen = htonl(V_ULP_MEMIO_DATA_LEN(dlen >> 5));
+ req->len16 = htonl(DIV_ROUND_UP(wr_len - sizeof(req->wr), 16)
+ | V_FW_WR_FLOWID(tid));
+ req->lock_addr = htonl(V_ULP_MEMIO_ADDR(pm_addr >> 5));
+
+ idata->cmd_more = htonl(V_ULPTX_CMD(ULP_TX_SC_IMM));
+ idata->len = htonl(dlen);
+}
+
+#define PPOD_SIZE sizeof(struct pagepod)
+#define ULPMEM_IDATA_MAX_NPPODS 1 /* 256/PPOD_SIZE */
+#define PCIE_MEMWIN_MAX_NPPODS 16 /* 1024/PPOD_SIZE */
+
+static int
+ppod_write_idata(struct cxgbei_data *ci,
+ struct cxgbei_ulp2_pagepod_hdr *hdr,
+ unsigned int idx, unsigned int npods,
+ struct cxgbei_ulp2_gather_list *gl,
+ unsigned int gl_pidx, struct toepcb *toep)
+{
+ u_int dlen = PPOD_SIZE * npods;
+ u_int pm_addr = idx * PPOD_SIZE + ci->llimit;
+ u_int wr_len = roundup(sizeof(struct ulp_mem_io) +
+ sizeof(struct ulptx_idata) + dlen, 16);
+ struct ulp_mem_io *req;
+ struct ulptx_idata *idata;
+ struct pagepod *ppod;
+ u_int i;
+ struct wrqe *wr;
+ struct adapter *sc = toep->vi->pi->adapter;
+
+ wr = alloc_wrqe(wr_len, toep->ctrlq);
+ if (wr == NULL) {
+ CXGBE_UNIMPLEMENTED("ppod_write_idata: alloc_wrqe failure");
+ return (ENOMEM);
+ }
+
+ req = wrtod(wr);
+ memset(req, 0, wr_len);
+ ulp_mem_io_set_hdr(sc, toep->tid, req, wr_len, dlen, pm_addr);
+ idata = (struct ulptx_idata *)(req + 1);
+
+ ppod = (struct pagepod *)(idata + 1);
+ for (i = 0; i < npods; i++, ppod++, gl_pidx += PPOD_PAGES) {
+ if (!hdr) /* clear the pagepod */
+ ppod_clear(ppod);
+ else /* set the pagepod */
+ ppod_set(ppod, hdr, gl, gl_pidx);
+ }
+
+ t4_wrq_tx(sc, wr);
+ return 0;
+}
+
+int
+t4_ddp_set_map(struct cxgbei_data *ci, void *iccp,
+ struct cxgbei_ulp2_pagepod_hdr *hdr, u_int idx, u_int npods,
+ struct cxgbei_ulp2_gather_list *gl, int reply)
+{
+ struct icl_cxgbei_conn *icc = (struct icl_cxgbei_conn *)iccp;
+ struct toepcb *toep = icc->toep;
+ int err;
+ unsigned int pidx = 0, w_npods = 0, cnt;
+
+ /*
+	 * On T4, if we use a mix of IMMD and DSGL with ULP_MEM_WRITE,
+	 * the order would not be guaranteed, so we will stick with IMMD.
+ */
+ gl->tid = toep->tid;
+ gl->port_id = toep->vi->pi->port_id;
+ gl->egress_dev = (void *)toep->vi->ifp;
+
+ /* send via immediate data */
+ for (; w_npods < npods; idx += cnt, w_npods += cnt,
+ pidx += PPOD_PAGES) {
+ cnt = npods - w_npods;
+ if (cnt > ULPMEM_IDATA_MAX_NPPODS)
+ cnt = ULPMEM_IDATA_MAX_NPPODS;
+ err = ppod_write_idata(ci, hdr, idx, cnt, gl, pidx, toep);
+ if (err) {
+ printf("%s: ppod_write_idata failed\n", __func__);
+ break;
+ }
+ }
+ return err;
+}
+
+void
+t4_ddp_clear_map(struct cxgbei_data *ci, struct cxgbei_ulp2_gather_list *gl,
+ u_int tag, u_int idx, u_int npods, struct icl_cxgbei_conn *icc)
+{
+ struct toepcb *toep = icc->toep;
+ int err = -1;
+ u_int pidx = 0;
+ u_int w_npods = 0;
+ u_int cnt;
+
+ for (; w_npods < npods; idx += cnt, w_npods += cnt,
+ pidx += PPOD_PAGES) {
+ cnt = npods - w_npods;
+ if (cnt > ULPMEM_IDATA_MAX_NPPODS)
+ cnt = ULPMEM_IDATA_MAX_NPPODS;
+ err = ppod_write_idata(ci, NULL, idx, cnt, gl, 0, toep);
+ if (err)
+ break;
+ }
+}
+
+static int
+cxgbei_map_sg(struct cxgbei_sgl *sgl, struct ccb_scsiio *csio)
+{
+ unsigned int data_len = csio->dxfer_len;
+ unsigned int sgoffset = (uint64_t)csio->data_ptr & PAGE_MASK;
+ unsigned int nsge;
+ unsigned char *sgaddr = csio->data_ptr;
+ unsigned int len = 0;
+
+ nsge = (csio->dxfer_len + sgoffset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ sgl->sg_addr = sgaddr;
+ sgl->sg_offset = sgoffset;
+ if (data_len < (PAGE_SIZE - sgoffset))
+ len = data_len;
+ else
+ len = PAGE_SIZE - sgoffset;
+
+ sgl->sg_length = len;
+
+ data_len -= len;
+ sgaddr += len;
+ sgl = sgl+1;
+
+ while (data_len > 0) {
+ sgl->sg_addr = sgaddr;
+ len = (data_len < PAGE_SIZE)? data_len: PAGE_SIZE;
+ sgl->sg_length = len;
+ sgaddr += len;
+ data_len -= len;
+ sgl = sgl + 1;
+ }
+
+ return nsge;
+}
+
+static int
+cxgbei_map_sg_tgt(struct cxgbei_sgl *sgl, union ctl_io *io)
+{
+ unsigned int data_len, sgoffset, nsge;
+ unsigned char *sgaddr;
+ unsigned int len = 0, index = 0, ctl_sg_count, i;
+ struct ctl_sg_entry ctl_sg_entry, *ctl_sglist;
+
+ if (io->scsiio.kern_sg_entries > 0) {
+ ctl_sglist = (struct ctl_sg_entry *)io->scsiio.kern_data_ptr;
+ ctl_sg_count = io->scsiio.kern_sg_entries;
+ } else {
+ ctl_sglist = &ctl_sg_entry;
+ ctl_sglist->addr = io->scsiio.kern_data_ptr;
+ ctl_sglist->len = io->scsiio.kern_data_len;
+ ctl_sg_count = 1;
+ }
+
+ sgaddr = sgl->sg_addr = ctl_sglist[index].addr;
+ sgoffset = sgl->sg_offset = (uint64_t)sgl->sg_addr & PAGE_MASK;
+ data_len = ctl_sglist[index].len;
+
+ if (data_len < (PAGE_SIZE - sgoffset))
+ len = data_len;
+ else
+ len = PAGE_SIZE - sgoffset;
+
+ sgl->sg_length = len;
+
+ data_len -= len;
+ sgaddr += len;
+ sgl = sgl+1;
+
+ len = 0;
+ for (i = 0; i< ctl_sg_count; i++)
+ len += ctl_sglist[i].len;
+ nsge = (len + sgoffset + PAGE_SIZE -1) >> PAGE_SHIFT;
+ while (data_len > 0) {
+ sgl->sg_addr = sgaddr;
+ len = (data_len < PAGE_SIZE)? data_len: PAGE_SIZE;
+ sgl->sg_length = len;
+ sgaddr += len;
+ data_len -= len;
+ sgl = sgl + 1;
+ if (data_len == 0) {
+ if (index == ctl_sg_count - 1)
+ break;
+ index++;
+ sgaddr = ctl_sglist[index].addr;
+ data_len = ctl_sglist[index].len;
+ }
+ }
+
+ return nsge;
+}
+
+static int
+t4_sk_ddp_tag_reserve(struct cxgbei_data *ci, struct icl_cxgbei_conn *icc,
+ u_int xferlen, struct cxgbei_sgl *sgl, u_int sgcnt, u_int *ddp_tag)
+{
+ struct cxgbei_ulp2_gather_list *gl;
+ int err = -EINVAL;
+ struct toepcb *toep = icc->toep;
+
+ gl = cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec(xferlen, sgl, sgcnt, ci, 0);
+ if (gl) {
+ err = cxgbei_ulp2_ddp_tag_reserve(ci, icc, toep->tid,
+ &ci->tag_format, ddp_tag, gl, 0, 0);
+ if (err) {
+ cxgbei_ulp2_ddp_release_gl(ci, gl);
+ }
+ }
+
+ return err;
+}
+
+static unsigned int
+cxgbei_task_reserve_itt(struct icl_conn *ic, void **prv,
+ struct ccb_scsiio *scmd, unsigned int *itt)
+{
+ struct icl_cxgbei_conn *icc = ic_to_icc(ic);
+ int xferlen = scmd->dxfer_len;
+ struct cxgbei_task_data *tdata = NULL;
+ struct cxgbei_sgl *sge = NULL;
+ struct toepcb *toep = icc->toep;
+ struct adapter *sc = td_adapter(toep->td);
+ struct cxgbei_data *ci = sc->iscsi_ulp_softc;
+ int err = -1;
+
+ MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
+
+ tdata = (struct cxgbei_task_data *)*prv;
+ if (xferlen == 0 || tdata == NULL)
+ goto out;
+ if (xferlen < DDP_THRESHOLD)
+ goto out;
+
+ if ((scmd->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
+ tdata->nsge = cxgbei_map_sg(tdata->sgl, scmd);
+ if (tdata->nsge == 0) {
+ CTR1(KTR_CXGBE, "%s: map_sg failed", __func__);
+ return 0;
+ }
+ sge = tdata->sgl;
+
+ tdata->sc_ddp_tag = *itt;
+
+ CTR3(KTR_CXGBE, "%s: *itt:0x%x sc_ddp_tag:0x%x",
+ __func__, *itt, tdata->sc_ddp_tag);
+ if (cxgbei_ulp2_sw_tag_usable(&ci->tag_format,
+ tdata->sc_ddp_tag)) {
+ err = t4_sk_ddp_tag_reserve(ci, icc, scmd->dxfer_len,
+ sge, tdata->nsge, &tdata->sc_ddp_tag);
+ } else {
+ CTR3(KTR_CXGBE,
+ "%s: itt:0x%x sc_ddp_tag:0x%x not usable",
+ __func__, *itt, tdata->sc_ddp_tag);
+ }
+ }
+out:
+ if (err < 0)
+ tdata->sc_ddp_tag =
+ cxgbei_ulp2_set_non_ddp_tag(&ci->tag_format, *itt);
+
+ return tdata->sc_ddp_tag;
+}
+
+static unsigned int
+cxgbei_task_reserve_ttt(struct icl_conn *ic, void **prv, union ctl_io *io,
+ unsigned int *ttt)
+{
+ struct icl_cxgbei_conn *icc = ic_to_icc(ic);
+ struct toepcb *toep = icc->toep;
+ struct adapter *sc = td_adapter(toep->td);
+ struct cxgbei_data *ci = sc->iscsi_ulp_softc;
+ struct cxgbei_task_data *tdata = NULL;
+ int xferlen, err = -1;
+ struct cxgbei_sgl *sge = NULL;
+
+ MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
+
+ xferlen = (io->scsiio.kern_data_len - io->scsiio.ext_data_filled);
+ tdata = (struct cxgbei_task_data *)*prv;
+ if ((xferlen == 0) || (tdata == NULL))
+ goto out;
+ if (xferlen < DDP_THRESHOLD)
+ goto out;
+ tdata->nsge = cxgbei_map_sg_tgt(tdata->sgl, io);
+ if (tdata->nsge == 0) {
+ CTR1(KTR_CXGBE, "%s: map_sg failed", __func__);
+ return 0;
+ }
+ sge = tdata->sgl;
+
+ tdata->sc_ddp_tag = *ttt;
+ if (cxgbei_ulp2_sw_tag_usable(&ci->tag_format, tdata->sc_ddp_tag)) {
+ err = t4_sk_ddp_tag_reserve(ci, icc, xferlen, sge,
+ tdata->nsge, &tdata->sc_ddp_tag);
+ } else {
+ CTR2(KTR_CXGBE, "%s: sc_ddp_tag:0x%x not usable",
+ __func__, tdata->sc_ddp_tag);
+ }
+out:
+ if (err < 0)
+ tdata->sc_ddp_tag =
+ cxgbei_ulp2_set_non_ddp_tag(&ci->tag_format, *ttt);
+ return tdata->sc_ddp_tag;
+}
+
+static int
+t4_sk_ddp_tag_release(struct icl_cxgbei_conn *icc, unsigned int ddp_tag)
+{
+ struct toepcb *toep = icc->toep;
+ struct adapter *sc = td_adapter(toep->td);
+ struct cxgbei_data *ci = sc->iscsi_ulp_softc;
+
+ cxgbei_ulp2_ddp_tag_release(ci, ddp_tag, icc);
+
+ return (0);
+}
+
+static int
+cxgbei_ddp_init(struct adapter *sc, struct cxgbei_data *ci)
+{
+ int nppods, bits, max_sz, rc;
+ static const u_int pgsz_order[] = {0, 1, 2, 3};
+
+ MPASS(sc->vres.iscsi.size > 0);
+
+ ci->llimit = sc->vres.iscsi.start;
+ ci->ulimit = sc->vres.iscsi.start + sc->vres.iscsi.size - 1;
+ max_sz = G_MAXRXDATA(t4_read_reg(sc, A_TP_PARA_REG2));
+
+ nppods = sc->vres.iscsi.size >> IPPOD_SIZE_SHIFT;
+ if (nppods <= 1024)
+ return (ENXIO);
+
+ bits = fls(nppods);
+ if (bits > IPPOD_IDX_MAX_SIZE)
+ bits = IPPOD_IDX_MAX_SIZE;
+ nppods = (1 << (bits - 1)) - 1;
+
+ rc = bus_dma_tag_create(NULL, 1, 0, BUS_SPACE_MAXADDR,
+ BUS_SPACE_MAXADDR, NULL, NULL, UINT32_MAX , 8, BUS_SPACE_MAXSIZE,
+ BUS_DMA_ALLOCNOW, NULL, NULL, &ci->ulp_ddp_tag);
+ if (rc != 0) {
+ device_printf(sc->dev, "%s: failed to create DMA tag: %u.\n",
+ __func__, rc);
+ return (rc);
+ }
+
+ ci->colors = malloc(nppods * sizeof(char), M_CXGBE, M_NOWAIT | M_ZERO);
+ ci->gl_map = malloc(nppods * sizeof(struct cxgbei_ulp2_gather_list *),
+ M_CXGBE, M_NOWAIT | M_ZERO);
+ if (ci->colors == NULL || ci->gl_map == NULL) {
+ bus_dma_tag_destroy(ci->ulp_ddp_tag);
+ free(ci->colors, M_CXGBE);
+ free(ci->gl_map, M_CXGBE);
+ return (ENOMEM);
+ }
+
+ mtx_init(&ci->map_lock, "ddp lock", NULL, MTX_DEF | MTX_DUPOK);
+ ci->max_txsz = ci->max_rxsz = min(max_sz, ULP2_MAX_PKT_SIZE);
+ ci->nppods = nppods;
+ ci->idx_last = nppods;
+ ci->idx_bits = bits;
+ ci->idx_mask = (1 << bits) - 1;
+ ci->rsvd_tag_mask = (1 << (bits + IPPOD_IDX_SHIFT)) - 1;
+
+ ci->tag_format.sw_bits = bits;
+ ci->tag_format.rsvd_bits = bits;
+ ci->tag_format.rsvd_shift = IPPOD_IDX_SHIFT;
+ ci->tag_format.rsvd_mask = ci->idx_mask;
+
+ t4_iscsi_init(sc, ci->idx_mask << IPPOD_IDX_SHIFT, pgsz_order);
+
+ return (rc);
+}
+
+static int
+do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
+{
+ struct adapter *sc = iq->adapter;
+ struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
+ u_int tid = GET_TID(cpl);
+ struct toepcb *toep = lookup_tid(sc, tid);
+ struct icl_pdu *ip;
+ struct icl_cxgbei_pdu *icp;
+
+ M_ASSERTPKTHDR(m);
+
+ ip = icl_cxgbei_new_pdu(M_NOWAIT);
+ if (ip == NULL)
+ CXGBE_UNIMPLEMENTED("PDU allocation failure");
+ icp = ip_to_icp(ip);
+ bcopy(mtod(m, caddr_t) + sizeof(*cpl), icp->ip.ip_bhs, sizeof(struct
+ iscsi_bhs));
+ icp->pdu_seq = ntohl(cpl->seq);
+ icp->pdu_flags = SBUF_ULP_FLAG_HDR_RCVD;
+
+ /* This is the start of a new PDU. There should be no old state. */
+ MPASS(toep->ulpcb2 == NULL);
+ toep->ulpcb2 = icp;
+
+#if 0
+ CTR4(KTR_CXGBE, "%s: tid %u, cpl->len hlen %u, m->m_len hlen %u",
+ __func__, tid, ntohs(cpl->len), m->m_len);
+#endif
+
+ m_freem(m);
+ return (0);
+}
+
+static int
+do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
+{
+ struct adapter *sc = iq->adapter;
+ struct cpl_iscsi_data *cpl = mtod(m, struct cpl_iscsi_data *);
+ u_int tid = GET_TID(cpl);
+ struct toepcb *toep = lookup_tid(sc, tid);
+ struct icl_cxgbei_pdu *icp = toep->ulpcb2;
+
+ M_ASSERTPKTHDR(m);
+
+ /* Must already have received the header (but not the data). */
+ MPASS(icp != NULL);
+ MPASS(icp->pdu_flags == SBUF_ULP_FLAG_HDR_RCVD);
+ MPASS(icp->ip.ip_data_mbuf == NULL);
+ MPASS(icp->ip.ip_data_len == 0);
+
+ m_adj(m, sizeof(*cpl));
+
+ icp->pdu_flags |= SBUF_ULP_FLAG_DATA_RCVD;
+ icp->ip.ip_data_mbuf = m;
+ icp->ip.ip_data_len = m->m_pkthdr.len;
+
+#if 0
+ CTR4(KTR_CXGBE, "%s: tid %u, cpl->len dlen %u, m->m_len dlen %u",
+ __func__, tid, ntohs(cpl->len), m->m_len);
+#endif
+
+ return (0);
+}
+
+static int
+do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
+{
+ struct adapter *sc = iq->adapter;
+ const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
+ u_int tid = GET_TID(cpl);
+ struct toepcb *toep = lookup_tid(sc, tid);
+ struct inpcb *inp = toep->inp;
+ struct socket *so;
+ struct sockbuf *sb;
+ struct tcpcb *tp;
+ struct icl_cxgbei_conn *icc;
+ struct icl_conn *ic;
+ struct icl_cxgbei_pdu *icp = toep->ulpcb2;
+ struct icl_pdu *ip;
+ u_int pdu_len, val;
+
+ MPASS(m == NULL);
+
+ /* Must already be assembling a PDU. */
+ MPASS(icp != NULL);
+ MPASS(icp->pdu_flags & SBUF_ULP_FLAG_HDR_RCVD); /* Data is optional. */
+ ip = &icp->ip;
+ icp->pdu_flags |= SBUF_ULP_FLAG_STATUS_RCVD;
+ val = ntohl(cpl->ddpvld);
+ if (val & F_DDP_PADDING_ERR)
+ icp->pdu_flags |= SBUF_ULP_FLAG_PAD_ERROR;
+ if (val & F_DDP_HDRCRC_ERR)
+ icp->pdu_flags |= SBUF_ULP_FLAG_HCRC_ERROR;
+ if (val & F_DDP_DATACRC_ERR)
+ icp->pdu_flags |= SBUF_ULP_FLAG_DCRC_ERROR;
+ if (ip->ip_data_mbuf == NULL) {
+ /* XXXNP: what should ip->ip_data_len be, and why? */
+ icp->pdu_flags |= SBUF_ULP_FLAG_DATA_DDPED;
+ }
+ pdu_len = ntohs(cpl->len); /* includes everything. */
+
+ INP_WLOCK(inp);
+ if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
+ CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
+ __func__, tid, pdu_len, inp->inp_flags);
+ INP_WUNLOCK(inp);
+ icl_cxgbei_conn_pdu_free(NULL, ip);
+#ifdef INVARIANTS
+ toep->ulpcb2 = NULL;
+#endif
+ return (0);
+ }
+
+ tp = intotcpcb(inp);
+ MPASS(icp->pdu_seq == tp->rcv_nxt);
+ MPASS(tp->rcv_wnd >= pdu_len);
+ tp->rcv_nxt += pdu_len;
+ tp->rcv_wnd -= pdu_len;
+ tp->t_rcvtime = ticks;
+
+ /* update rx credits */
+ toep->rx_credits += pdu_len;
+ t4_rcvd(&toep->td->tod, tp); /* XXX: sc->tom_softc.tod */
+
+ so = inp->inp_socket;
+ sb = &so->so_rcv;
+ SOCKBUF_LOCK(sb);
+
+ icc = toep->ulpcb;
+ if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
+ CTR5(KTR_CXGBE,
+ "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
+ __func__, tid, pdu_len, icc, sb->sb_state);
+ SOCKBUF_UNLOCK(sb);
+ INP_WUNLOCK(inp);
+
+ INP_INFO_RLOCK(&V_tcbinfo);
+ INP_WLOCK(inp);
+ tp = tcp_drop(tp, ECONNRESET);
+ if (tp)
+ INP_WUNLOCK(inp);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+
+ icl_cxgbei_conn_pdu_free(NULL, ip);
+#ifdef INVARIANTS
+ toep->ulpcb2 = NULL;
+#endif
+ return (0);
+ }
+ MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
+ ic = &icc->ic;
+ icl_cxgbei_new_pdu_set_conn(ip, ic);
+
+ MPASS(m == NULL); /* was unused, we'll use it now. */
+ m = sbcut_locked(sb, sbused(sb)); /* XXXNP: toep->sb_cc accounting? */
+ if (__predict_false(m != NULL)) {
+ int len = m_length(m, NULL);
+
+ /*
+ * PDUs were received before the tid transitioned to ULP mode.
+ * Convert them to icl_cxgbei_pdus and send them to ICL before
+ * the PDU in icp/ip.
+ */
+ CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid,
+ len);
+
+ /* XXXNP: needs to be rewritten. */
+ if (len == sizeof(struct iscsi_bhs) || len == 4 + sizeof(struct
+ iscsi_bhs)) {
+ struct icl_cxgbei_pdu *icp0;
+ struct icl_pdu *ip0;
+
+			ip0 = icl_cxgbei_new_pdu(M_NOWAIT);
+			if (ip0 == NULL)
+				CXGBE_UNIMPLEMENTED("PDU allocation failure");
+			icl_cxgbei_new_pdu_set_conn(ip0, ic);
+ icp0 = ip_to_icp(ip0);
+ icp0->pdu_seq = 0; /* XXX */
+ icp0->pdu_flags = SBUF_ULP_FLAG_HDR_RCVD |
+ SBUF_ULP_FLAG_STATUS_RCVD;
+ m_copydata(m, 0, sizeof(struct iscsi_bhs), (void *)ip0->ip_bhs);
+ STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next);
+ }
+ m_freem(m);
+ }
+
+#if 0
+ CTR4(KTR_CXGBE, "%s: tid %u, pdu_len %u, pdu_flags 0x%x",
+ __func__, tid, pdu_len, icp->pdu_flags);
+#endif
+
+ STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
+ if ((icc->rx_flags & RXF_ACTIVE) == 0) {
+ struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];
+
+ mtx_lock(&cwt->cwt_lock);
+ icc->rx_flags |= RXF_ACTIVE;
+ TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
+ if (cwt->cwt_state == CWT_SLEEPING) {
+ cwt->cwt_state = CWT_RUNNING;
+ cv_signal(&cwt->cwt_cv);
+ }
+ mtx_unlock(&cwt->cwt_lock);
+ }
+ SOCKBUF_UNLOCK(sb);
+ INP_WUNLOCK(inp);
+
+#ifdef INVARIANTS
+ toep->ulpcb2 = NULL;
+#endif
+
+ return (0);
+}
+
+static void
+t4_register_cpl_handler_with_tom(struct adapter *sc)
+{
+
+ t4_register_cpl_handler(sc, CPL_ISCSI_HDR, do_rx_iscsi_hdr);
+ t4_register_cpl_handler(sc, CPL_ISCSI_DATA, do_rx_iscsi_data);
+ t4_register_cpl_handler(sc, CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
+}
+
+static void
+t4_unregister_cpl_handler_with_tom(struct adapter *sc)
+{
+
+ t4_register_cpl_handler(sc, CPL_ISCSI_HDR, NULL);
+ t4_register_cpl_handler(sc, CPL_ISCSI_DATA, NULL);
+ t4_register_cpl_handler(sc, CPL_RX_ISCSI_DDP, NULL);
+}
+
+/* initiator */
+void
+cxgbei_conn_task_reserve_itt(void *conn, void **prv,
+ void *scmd, unsigned int *itt)
+{
+ unsigned int tag;
+ tag = cxgbei_task_reserve_itt(conn, prv, scmd, itt);
+ if (tag)
+ *itt = htonl(tag);
+ return;
+}
+
+/* target */
+void
+cxgbei_conn_transfer_reserve_ttt(void *conn, void **prv,
+ void *scmd, unsigned int *ttt)
+{
+ unsigned int tag;
+ tag = cxgbei_task_reserve_ttt(conn, prv, scmd, ttt);
+ if (tag)
+ *ttt = htonl(tag);
+ return;
+}
+
+void
+cxgbei_cleanup_task(void *conn, void *ofld_priv)
+{
+ struct icl_conn *ic = (struct icl_conn *)conn;
+ struct icl_cxgbei_conn *icc = ic_to_icc(ic);
+ struct cxgbei_task_data *tdata = ofld_priv;
+ struct adapter *sc = icc->sc;
+ struct cxgbei_data *ci = sc->iscsi_ulp_softc;
+
+ MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
+ MPASS(tdata != NULL);
+
+ if (cxgbei_ulp2_is_ddp_tag(&ci->tag_format, tdata->sc_ddp_tag))
+ t4_sk_ddp_tag_release(icc, tdata->sc_ddp_tag);
+ memset(tdata, 0, sizeof(*tdata));
+}
+
+static int
+cxgbei_activate(struct adapter *sc)
+{
+ struct cxgbei_data *ci;
+ int rc;
+
+ ASSERT_SYNCHRONIZED_OP(sc);
+
+ if (uld_active(sc, ULD_ISCSI)) {
+ KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
+ __func__, sc));
+ return (0);
+ }
+
+ if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
+ device_printf(sc->dev,
+ "not iSCSI offload capable, or capability disabled.\n");
+ return (ENOSYS);
+ }
+
+ /* per-adapter softc for iSCSI */
+ ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_NOWAIT);
+ if (ci == NULL)
+ return (ENOMEM);
+
+ rc = cxgbei_ddp_init(sc, ci);
+ if (rc != 0) {
+ free(ci, M_CXGBE);
+ return (rc);
+ }
+
+ t4_register_cpl_handler_with_tom(sc);
+ sc->iscsi_ulp_softc = ci;
+
+ return (0);
+}
+
+static int
+cxgbei_deactivate(struct adapter *sc)
+{
+
+ ASSERT_SYNCHRONIZED_OP(sc);
+
+ if (sc->iscsi_ulp_softc != NULL) {
+ cxgbei_ddp_cleanup(sc->iscsi_ulp_softc);
+ t4_unregister_cpl_handler_with_tom(sc);
+ free(sc->iscsi_ulp_softc, M_CXGBE);
+ sc->iscsi_ulp_softc = NULL;
+ }
+
+ return (0);
+}
+
+static void
+cxgbei_activate_all(struct adapter *sc, void *arg __unused)
+{
+
+ if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
+ return;
+
+ /* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
+ if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
+ (void) t4_activate_uld(sc, ULD_ISCSI);
+
+ end_synchronized_op(sc, 0);
+}
+
+static void
+cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
+{
+
+ if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
+ return;
+
+ if (uld_active(sc, ULD_ISCSI))
+ (void) t4_deactivate_uld(sc, ULD_ISCSI);
+
+ end_synchronized_op(sc, 0);
+}
+
+static struct uld_info cxgbei_uld_info = {
+ .uld_id = ULD_ISCSI,
+ .activate = cxgbei_activate,
+ .deactivate = cxgbei_deactivate,
+};
+
+static void
+cwt_main(void *arg)
+{
+ struct cxgbei_worker_thread_softc *cwt = arg;
+ struct icl_cxgbei_conn *icc = NULL;
+ struct icl_conn *ic;
+ struct icl_pdu *ip;
+ struct sockbuf *sb;
+ STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);
+
+ MPASS(cwt != NULL);
+
+ mtx_lock(&cwt->cwt_lock);
+ MPASS(cwt->cwt_state == 0);
+ cwt->cwt_state = CWT_RUNNING;
+ cv_signal(&cwt->cwt_cv);
+
+ while (__predict_true(cwt->cwt_state != CWT_STOP)) {
+ cwt->cwt_state = CWT_RUNNING;
+ while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) {
+ TAILQ_REMOVE(&cwt->rx_head, icc, rx_link);
+ mtx_unlock(&cwt->cwt_lock);
+
+ ic = &icc->ic;
+ sb = &ic->ic_socket->so_rcv;
+
+ SOCKBUF_LOCK(sb);
+ MPASS(icc->rx_flags & RXF_ACTIVE);
+ if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) {
+ MPASS(STAILQ_EMPTY(&rx_pdus));
+ STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
+ SOCKBUF_UNLOCK(sb);
+
+ /* Hand over PDUs to ICL. */
+ while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
+ STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
+ ic->ic_receive(ip);
+ }
+
+ SOCKBUF_LOCK(sb);
+ MPASS(STAILQ_EMPTY(&rx_pdus));
+ }
+ MPASS(icc->rx_flags & RXF_ACTIVE);
+ if (STAILQ_EMPTY(&icc->rcvd_pdus) ||
+ __predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
+ icc->rx_flags &= ~RXF_ACTIVE;
+ } else {
+ /*
+ * More PDUs were received while we were busy
+ * handing over the previous batch to ICL.
+ * Re-add this connection to the end of the
+ * queue.
+ */
+ mtx_lock(&cwt->cwt_lock);
+ TAILQ_INSERT_TAIL(&cwt->rx_head, icc,
+ rx_link);
+ mtx_unlock(&cwt->cwt_lock);
+ }
+ SOCKBUF_UNLOCK(sb);
+
+ mtx_lock(&cwt->cwt_lock);
+ }
+
+ /* Inner loop doesn't check for CWT_STOP, do that first. */
+ if (__predict_false(cwt->cwt_state == CWT_STOP))
+ break;
+ cwt->cwt_state = CWT_SLEEPING;
+ cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
+ }
+
+ MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL);
+ mtx_assert(&cwt->cwt_lock, MA_OWNED);
+ cwt->cwt_state = CWT_STOPPED;
+ cv_signal(&cwt->cwt_cv);
+ mtx_unlock(&cwt->cwt_lock);
+ kthread_exit();
+}
+
+static int
+start_worker_threads(void)
+{
+ int i, rc;
+ struct cxgbei_worker_thread_softc *cwt;
+
+ worker_thread_count = min(mp_ncpus, 32);
+ cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE,
+ M_WAITOK | M_ZERO);
+
+ MPASS(cxgbei_proc == NULL);
+ for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
+ mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF);
+ cv_init(&cwt->cwt_cv, "cwt cv");
+ TAILQ_INIT(&cwt->rx_head);
+ rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0,
+ "cxgbei", "%d", i);
+ if (rc != 0) {
+ printf("cxgbei: failed to start thread #%d/%d (%d)\n",
+ i + 1, worker_thread_count, rc);
+ mtx_destroy(&cwt->cwt_lock);
+ cv_destroy(&cwt->cwt_cv);
+			bzero(cwt, sizeof(*cwt));
+ if (i == 0) {
+ free(cwt_softc, M_CXGBE);
+ worker_thread_count = 0;
+
+ return (rc);
+ }
+
+ /* Not fatal, carry on with fewer threads. */
+ worker_thread_count = i;
+ rc = 0;
+ break;
+ }
+
+ /* Wait for thread to start before moving on to the next one. */
+ mtx_lock(&cwt->cwt_lock);
+ while (cwt->cwt_state == 0)
+ cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
+ mtx_unlock(&cwt->cwt_lock);
+ }
+
+ MPASS(cwt_softc != NULL);
+ MPASS(worker_thread_count > 0);
+ return (0);
+}
+
+static void
+stop_worker_threads(void)
+{
+ int i;
+ struct cxgbei_worker_thread_softc *cwt = &cwt_softc[0];
+
+ MPASS(worker_thread_count >= 0);
+
+ for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
+ mtx_lock(&cwt->cwt_lock);
+ MPASS(cwt->cwt_state == CWT_RUNNING ||
+ cwt->cwt_state == CWT_SLEEPING);
+ cwt->cwt_state = CWT_STOP;
+ cv_signal(&cwt->cwt_cv);
+ do {
+ cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
+ } while (cwt->cwt_state != CWT_STOPPED);
+ mtx_unlock(&cwt->cwt_lock);
+ }
+ free(cwt_softc, M_CXGBE);
+}
+
+/* Select a worker thread for a connection. */
+u_int
+cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc)
+{
+ struct adapter *sc = icc->sc;
+ struct toepcb *toep = icc->toep;
+ u_int i, n;
+
+ n = worker_thread_count / sc->sge.nofldrxq;
+ if (n > 0)
+ i = toep->vi->pi->port_id * n + arc4random() % n;
+ else
+ i = arc4random() % worker_thread_count;
+
+ CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i);
+
+ return (i);
+}
+
+static int
+cxgbei_mod_load(void)
+{
+ int rc;
+
+ rc = start_worker_threads();
+ if (rc != 0)
+ return (rc);
+
+ rc = t4_register_uld(&cxgbei_uld_info);
+ if (rc != 0) {
+ stop_worker_threads();
+ return (rc);
+ }
+
+ t4_iterate(cxgbei_activate_all, NULL);
+
+ return (rc);
+}
+
+static int
+cxgbei_mod_unload(void)
+{
+
+ t4_iterate(cxgbei_deactivate_all, NULL);
+
+ if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
+ return (EBUSY);
+
+ stop_worker_threads();
+
+ return (0);
+}
+#endif
+
+static int
+cxgbei_modevent(module_t mod, int cmd, void *arg)
+{
+ int rc = 0;
+
+#ifdef TCP_OFFLOAD
+ switch (cmd) {
+ case MOD_LOAD:
+ rc = cxgbei_mod_load();
+ break;
+
+ case MOD_UNLOAD:
+ rc = cxgbei_mod_unload();
+ break;
+
+ default:
+ rc = EINVAL;
+ }
+#else
+ printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
+ rc = EOPNOTSUPP;
+#endif
+
+ return (rc);
+}
+
+static moduledata_t cxgbei_mod = {
+ "cxgbei",
+ cxgbei_modevent,
+ NULL,
+};
+
+MODULE_VERSION(cxgbei, 1);
+DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
+MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
+MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
+MODULE_DEPEND(cxgbei, icl, 1, 1, 1);
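
The receive path in cxgbei.c above is a producer/consumer hand-off: do_rx_iscsi_ddp() appends the completed PDU to icc->rcvd_pdus under the socket buffer lock, queues the connection on a worker's rx_head if it is not already there, and signals the worker's condition variable; cwt_main() then swaps the per-connection PDU list into a local list while holding the lock and calls ic_receive() with the lock dropped, re-queueing the connection if more PDUs arrived in the meantime. The stand-alone sketch below shows the same lock/condvar pattern in user space (pthreads and a plain linked list stand in for mtx(9)/cv(9) and the PDU queues); it is illustrative only and not derived from the driver.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { int v; struct node *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static struct node *head;	/* producer -> consumer queue */
static int stop;

static void *
worker(void *arg)
{
	struct node *batch, *n;

	(void)arg;
	pthread_mutex_lock(&lock);
	for (;;) {
		/* Sleep until there is work or we are asked to stop. */
		while (head == NULL && !stop)
			pthread_cond_wait(&cv, &lock);
		if (head == NULL && stop)
			break;

		/* Take the whole queue, then process it unlocked. */
		batch = head;
		head = NULL;
		pthread_mutex_unlock(&lock);
		while ((n = batch) != NULL) {
			batch = n->next;
			printf("handled %d\n", n->v);
			free(n);
		}
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t t;
	struct node *n;
	int i;

	pthread_create(&t, NULL, worker, NULL);
	for (i = 0; i < 5; i++) {
		n = malloc(sizeof(*n));
		n->v = i;
		pthread_mutex_lock(&lock);
		n->next = head;		/* order does not matter here */
		head = n;
		pthread_cond_signal(&cv);
		pthread_mutex_unlock(&lock);
	}
	pthread_mutex_lock(&lock);
	stop = 1;
	pthread_cond_signal(&cv);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return (0);
}
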
diff --git a/sys/dev/cxgbe/cxgbei/cxgbei.h b/sys/dev/cxgbe/cxgbei/cxgbei.h
new file mode 100644
index 000000000000..10e12964d8fd
--- /dev/null
+++ b/sys/dev/cxgbe/cxgbei/cxgbei.h
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef __CXGBEI_OFLD_H__
+#define __CXGBEI_OFLD_H__
+
+#include <dev/iscsi/icl.h>
+
+enum {
+ CWT_SLEEPING = 1,
+ CWT_RUNNING = 2,
+ CWT_STOP = 3,
+ CWT_STOPPED = 4,
+};
+
+struct cxgbei_worker_thread_softc {
+ struct mtx cwt_lock;
+ struct cv cwt_cv;
+ volatile int cwt_state;
+
+ TAILQ_HEAD(, icl_cxgbei_conn) rx_head;
+} __aligned(CACHE_LINE_SIZE);
+
+#define CXGBEI_CONN_SIGNATURE 0x56788765
+
+enum {
+ RXF_ACTIVE = 1 << 0, /* In the worker thread's queue */
+};
+
+struct icl_cxgbei_conn {
+ struct icl_conn ic;
+
+ /* cxgbei specific stuff goes here. */
+ uint32_t icc_signature;
+ int ulp_submode;
+ struct adapter *sc;
+ struct toepcb *toep;
+
+ /* Receive related. */
+ u_int rx_flags; /* protected by so_rcv lock */
+ u_int cwt;
+ STAILQ_HEAD(, icl_pdu) rcvd_pdus; /* protected by so_rcv lock */
+ TAILQ_ENTRY(icl_cxgbei_conn) rx_link; /* protected by cwt lock */
+};
+
+static inline struct icl_cxgbei_conn *
+ic_to_icc(struct icl_conn *ic)
+{
+
+ return (__containerof(ic, struct icl_cxgbei_conn, ic));
+}
+
+#define CXGBEI_PDU_SIGNATURE 0x12344321
+
+struct icl_cxgbei_pdu {
+ struct icl_pdu ip;
+
+ /* cxgbei specific stuff goes here. */
+ uint32_t icp_signature;
+ uint32_t pdu_seq; /* For debug only */
+ u_int pdu_flags;
+};
+
+static inline struct icl_cxgbei_pdu *
+ip_to_icp(struct icl_pdu *ip)
+{
+
+ return (__containerof(ip, struct icl_cxgbei_pdu, ip));
+}
+
+struct cxgbei_sgl {
+ int sg_flag;
+ void *sg_addr;
+ void *sg_dma_addr;
+ size_t sg_offset;
+ size_t sg_length;
+};
+
+#define cxgbei_scsi_for_each_sg(_sgl, _sgel, _n, _i) \
+	for (_i = 0, _sgel = (struct cxgbei_sgl *)(_sgl); _i < _n; _i++, \
+ _sgel++)
+#define sg_dma_addr(_sgel) _sgel->sg_dma_addr
+#define sg_virt(_sgel) _sgel->sg_addr
+#define sg_len(_sgel) _sgel->sg_length
+#define sg_off(_sgel) _sgel->sg_offset
+#define sg_next(_sgel) _sgel + 1
+
+#define SBUF_ULP_FLAG_HDR_RCVD 0x1
+#define SBUF_ULP_FLAG_DATA_RCVD 0x2
+#define SBUF_ULP_FLAG_STATUS_RCVD 0x4
+#define SBUF_ULP_FLAG_HCRC_ERROR 0x10
+#define SBUF_ULP_FLAG_DCRC_ERROR 0x20
+#define SBUF_ULP_FLAG_PAD_ERROR 0x40
+#define SBUF_ULP_FLAG_DATA_DDPED 0x80
+
+/* private data for each scsi task */
+struct cxgbei_task_data {
+ struct cxgbei_sgl sgl[256];
+ u_int nsge;
+ u_int sc_ddp_tag;
+};
+
+struct cxgbei_ulp2_tag_format {
+ u_char sw_bits;
+ u_char rsvd_bits;
+ u_char rsvd_shift;
+ u_char filler[1];
+ uint32_t rsvd_mask;
+};
+
+struct cxgbei_data {
+ u_int max_txsz;
+ u_int max_rxsz;
+ u_int llimit;
+ u_int ulimit;
+ u_int nppods;
+ u_int idx_last;
+ u_char idx_bits;
+ uint32_t idx_mask;
+ uint32_t rsvd_tag_mask;
+
+ struct mtx map_lock;
+ bus_dma_tag_t ulp_ddp_tag;
+ unsigned char *colors;
+ struct cxgbei_ulp2_gather_list **gl_map;
+
+ struct cxgbei_ulp2_tag_format tag_format;
+};
+
+void cxgbei_conn_task_reserve_itt(void *, void **, void *, unsigned int *);
+void cxgbei_conn_transfer_reserve_ttt(void *, void **, void *, unsigned int *);
+void cxgbei_cleanup_task(void *, void *);
+u_int cxgbei_select_worker_thread(struct icl_cxgbei_conn *);
+
+struct cxgbei_ulp2_pagepod_hdr;
+int t4_ddp_set_map(struct cxgbei_data *, void *,
+ struct cxgbei_ulp2_pagepod_hdr *, u_int, u_int,
+ struct cxgbei_ulp2_gather_list *, int);
+void t4_ddp_clear_map(struct cxgbei_data *, struct cxgbei_ulp2_gather_list *,
+ u_int, u_int, u_int, struct icl_cxgbei_conn *);
+#endif
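
The SBUF_ULP_FLAG_* bits in cxgbei.h describe what the hardware reported for a single PDU: which CPLs have arrived (header, data, status), whether a digest or padding error was detected, and whether the payload was placed directly into the pre-posted buffers (DDP) rather than delivered in an mbuf. The stand-alone sketch below shows how a hypothetical consumer of icl_cxgbei_pdu.pdu_flags might interpret them; the flag values are copied from this header, while classify_pdu() and its messages are invented for illustration.

#include <stdio.h>

/* Flag values copied from cxgbei.h. */
#define SBUF_ULP_FLAG_HDR_RCVD		0x1
#define SBUF_ULP_FLAG_DATA_RCVD		0x2
#define SBUF_ULP_FLAG_STATUS_RCVD	0x4
#define SBUF_ULP_FLAG_HCRC_ERROR	0x10
#define SBUF_ULP_FLAG_DCRC_ERROR	0x20
#define SBUF_ULP_FLAG_PAD_ERROR		0x40
#define SBUF_ULP_FLAG_DATA_DDPED	0x80

/* Hypothetical helper: summarize how a completed PDU arrived. */
static void
classify_pdu(unsigned int flags)
{
	if ((flags & SBUF_ULP_FLAG_STATUS_RCVD) == 0) {
		printf("PDU still being assembled\n");
		return;
	}
	if (flags & (SBUF_ULP_FLAG_HCRC_ERROR | SBUF_ULP_FLAG_DCRC_ERROR |
	    SBUF_ULP_FLAG_PAD_ERROR)) {
		printf("digest/padding error reported by hardware\n");
		return;
	}
	if (flags & SBUF_ULP_FLAG_DATA_DDPED)
		printf("payload already placed in the task's buffers\n");
	else if (flags & SBUF_ULP_FLAG_DATA_RCVD)
		printf("payload delivered in the PDU's mbuf chain\n");
	else
		printf("PDU had no data segment\n");
}

int
main(void)
{
	classify_pdu(SBUF_ULP_FLAG_HDR_RCVD | SBUF_ULP_FLAG_STATUS_RCVD |
	    SBUF_ULP_FLAG_DATA_DDPED);
	return (0);
}
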
diff --git a/sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.c b/sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.c
new file mode 100644
index 000000000000..fd7cd4a74e11
--- /dev/null
+++ b/sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.c
@@ -0,0 +1,417 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ *
+ * Chelsio T5xx iSCSI driver
+ * cxgbei_ulp2_ddp.c: Chelsio iSCSI DDP Manager.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#ifdef TCP_OFFLOAD
+#include <sys/types.h>
+#include <sys/module.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/mbuf.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/condvar.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/toecore.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_fsm.h>
+
+#include <dev/iscsi/icl.h>
+#include <dev/iscsi/iscsi_proto.h>
+
+#include "common/common.h"
+#include "common/t4_msg.h"
+#include "common/t4_regs.h" /* for PCIE_MEM_ACCESS */
+#include "tom/t4_tom.h"
+#include "cxgbei.h"
+#include "cxgbei_ulp2_ddp.h"
+
+/*
+ * Map a single buffer address.
+ */
+static void
+ulp2_dma_map_addr(void *arg, bus_dma_segment_t *segs, int nseg, int error)
+{
+ bus_addr_t *ba = arg;
+ if (error)
+ return;
+
+ KASSERT(nseg == 1, ("%s: %d segments returned!", __func__, nseg));
+
+ *ba = segs->ds_addr;
+}
+
+/*
+ * iSCSI Direct Data Placement
+ *
+ * T4/5 ulp2 h/w can directly place the iSCSI Data-In or Data-Out PDU's
+ * payload into pre-posted final destination host-memory buffers based on the
+ * Initiator Task Tag (ITT) in Data-In or Target Task Tag (TTT) in Data-Out
+ * PDUs.
+ *
+ * The host memory address is programmed into h/w in the format of pagepod
+ * entries.
+ * The location of the pagepod entry is encoded into the ddp tag, which is
+ * used as (or as the base for) the ITT/TTT.
+ */
+
+
+static inline int
+ddp_find_unused_entries(struct cxgbei_data *ci, u_int start, u_int max,
+ u_int count, u_int *idx, struct cxgbei_ulp2_gather_list *gl)
+{
+ unsigned int i, j, k;
+
+ /* not enough entries */
+ if (max - start < count)
+ return (EBUSY);
+
+ max -= count;
+ mtx_lock(&ci->map_lock);
+ for (i = start; i < max;) {
+ for (j = 0, k = i; j < count; j++, k++) {
+ if (ci->gl_map[k])
+ break;
+ }
+ if (j == count) {
+ for (j = 0, k = i; j < count; j++, k++)
+ ci->gl_map[k] = gl;
+ mtx_unlock(&ci->map_lock);
+ *idx = i;
+ return (0);
+ }
+ i += j + 1;
+ }
+ mtx_unlock(&ci->map_lock);
+ return (EBUSY);
+}
+
+static inline void
+ddp_unmark_entries(struct cxgbei_data *ci, u_int start, u_int count)
+{
+
+ mtx_lock(&ci->map_lock);
+ memset(&ci->gl_map[start], 0,
+ count * sizeof(struct cxgbei_ulp2_gather_list *));
+ mtx_unlock(&ci->map_lock);
+}
+
+static inline void
+ddp_gl_unmap(struct cxgbei_data *ci, struct cxgbei_ulp2_gather_list *gl)
+{
+ int i;
+
+ if (!gl->pages[0])
+ return;
+
+ for (i = 0; i < gl->nelem; i++) {
+ bus_dmamap_unload(ci->ulp_ddp_tag, gl->dma_sg[i].bus_map);
+ bus_dmamap_destroy(ci->ulp_ddp_tag, gl->dma_sg[i].bus_map);
+ }
+}
+
+static inline int
+ddp_gl_map(struct cxgbei_data *ci, struct cxgbei_ulp2_gather_list *gl)
+{
+ int i, rc;
+ bus_addr_t pa;
+
+ MPASS(ci != NULL);
+
+ mtx_lock(&ci->map_lock);
+ for (i = 0; i < gl->nelem; i++) {
+ rc = bus_dmamap_create(ci->ulp_ddp_tag, 0,
+ &gl->dma_sg[i].bus_map);
+ if (rc != 0)
+ goto unmap;
+ rc = bus_dmamap_load(ci->ulp_ddp_tag, gl->dma_sg[i].bus_map,
+ gl->pages[i], PAGE_SIZE, ulp2_dma_map_addr,
+ &pa, BUS_DMA_NOWAIT);
+ if (rc != 0)
+ goto unmap;
+ gl->dma_sg[i].phys_addr = pa;
+ }
+ mtx_unlock(&ci->map_lock);
+
+ return (0);
+
+unmap:
+ if (i) {
+ u_int nelem = gl->nelem;
+
+ gl->nelem = i;
+ ddp_gl_unmap(ci, gl);
+ gl->nelem = nelem;
+ }
+ return (ENOMEM);
+}
+
+/**
+ * cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec - build ddp page buffer list
+ * @xferlen: total buffer length
+ * @sgl: page buffer scatter-gather list (struct cxgbei_sgl)
+ * @sgcnt: # of page buffers
+ * @gfp: allocation mode
+ *
+ * construct a ddp page buffer list from the scsi scatter-gather list.
+ * coalesce buffers as much as possible, and obtain dma addresses for
+ * each page.
+ *
+ * Return the cxgbei_ulp2_gather_list constructed from the page buffers if the
+ * memory can be used for ddp. Return NULL otherwise.
+ */
+struct cxgbei_ulp2_gather_list *
+cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec(u_int xferlen, struct cxgbei_sgl *sgl,
+ u_int sgcnt, struct cxgbei_data *ci, int gfp)
+{
+ struct cxgbei_ulp2_gather_list *gl;
+ struct cxgbei_sgl *sg = sgl;
+ void *sgpage = (void *)((u64)sg->sg_addr & (~PAGE_MASK));
+ unsigned int sglen = sg->sg_length;
+ unsigned int sgoffset = (u64)sg->sg_addr & PAGE_MASK;
+ unsigned int npages = (xferlen + sgoffset + PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
+ int i = 1, j = 0;
+
+ if (xferlen <= DDP_THRESHOLD) {
+ CTR2(KTR_CXGBE, "xfer %u < threshold %u, no ddp.",
+ xferlen, DDP_THRESHOLD);
+ return NULL;
+ }
+
+ gl = malloc(sizeof(struct cxgbei_ulp2_gather_list) +
+ npages * (sizeof(struct dma_segments) + sizeof(void *)),
+ M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (gl == NULL)
+ return (NULL);
+
+ gl->pages = (void **)&gl->dma_sg[npages];
+ gl->length = xferlen;
+ gl->offset = sgoffset;
+ gl->pages[0] = sgpage;
+ CTR6(KTR_CXGBE,
+ "%s: xferlen:0x%x len:0x%x off:0x%x sg_addr:%p npages:%d",
+ __func__, xferlen, gl->length, gl->offset, sg->sg_addr, npages);
+
+ for (i = 1, sg = sg_next(sg); i < sgcnt; i++, sg = sg_next(sg)) {
+ void *page = sg->sg_addr;
+
+ if (sgpage == page && sg->sg_offset == sgoffset + sglen)
+ sglen += sg->sg_length;
+ else {
+ /* make sure the sgl is fit for ddp:
+ * each has the same page size, and
+ * all of the middle pages are used completely
+ */
+ if ((j && sgoffset) ||
+ ((i != sgcnt - 1) &&
+ ((sglen + sgoffset) & ~CXGBEI_PAGE_MASK))){
+ goto error_out;
+ }
+
+ j++;
+ if (j == gl->nelem || sg->sg_offset) {
+ goto error_out;
+ }
+ gl->pages[j] = page;
+ sglen = sg->sg_length;
+ sgoffset = sg->sg_offset;
+ sgpage = page;
+ }
+ }
+ gl->nelem = ++j;
+
+ if (ddp_gl_map(ci, gl) < 0)
+ goto error_out;
+
+ return gl;
+
+error_out:
+ free(gl, M_DEVBUF);
+ return NULL;
+}
+
+/**
+ * cxgbei_ulp2_ddp_release_gl - release a page buffer list
+ * @ci: adapter's ddp info
+ * @gl: a ddp page buffer list
+ *
+ * free a ddp page buffer list built by
+ * cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec().
+ */
+void
+cxgbei_ulp2_ddp_release_gl(struct cxgbei_data *ci,
+ struct cxgbei_ulp2_gather_list *gl)
+{
+
+ ddp_gl_unmap(ci, gl);
+ free(gl, M_DEVBUF);
+}
+
+/**
+ * cxgbei_ulp2_ddp_tag_reserve - set up ddp for a data transfer
+ * @ci: adapter's ddp info
+ * @tid: connection id
+ * @tformat: tag format
+ * @tagp: contains s/w tag initially, will be updated with ddp/hw tag
+ * @gl: the page memory list
+ * @gfp: allocation mode
+ *
+ * ddp setup for a given page buffer list and construct the ddp tag.
+ * return 0 on success, non-zero otherwise.
+ */
+int
+cxgbei_ulp2_ddp_tag_reserve(struct cxgbei_data *ci, void *icc, u_int tid,
+ struct cxgbei_ulp2_tag_format *tformat, u32 *tagp,
+ struct cxgbei_ulp2_gather_list *gl, int gfp, int reply)
+{
+ struct cxgbei_ulp2_pagepod_hdr hdr;
+ u_int npods, idx;
+ int rc;
+ u32 sw_tag = *tagp;
+ u32 tag;
+
+ MPASS(ci != NULL);
+
+ if (!gl || !gl->nelem || gl->length < DDP_THRESHOLD)
+ return (EINVAL);
+
+ npods = (gl->nelem + IPPOD_PAGES_MAX - 1) >> IPPOD_PAGES_SHIFT;
+
+ if (ci->idx_last == ci->nppods)
+ rc = ddp_find_unused_entries(ci, 0, ci->nppods, npods, &idx,
+ gl);
+ else {
+ rc = ddp_find_unused_entries(ci, ci->idx_last + 1,
+ ci->nppods, npods, &idx, gl);
+ if (rc && ci->idx_last >= npods) {
+ rc = ddp_find_unused_entries(ci, 0,
+ min(ci->idx_last + npods, ci->nppods),
+ npods, &idx, gl);
+ }
+ }
+ if (rc) {
+ CTR3(KTR_CXGBE, "xferlen %u, gl %u, npods %u NO DDP.",
+ gl->length, gl->nelem, npods);
+ return (rc);
+ }
+
+ tag = cxgbei_ulp2_ddp_tag_base(idx, ci->colors, tformat, sw_tag);
+ CTR4(KTR_CXGBE, "%s: sw_tag:0x%x idx:0x%x tag:0x%x",
+ __func__, sw_tag, idx, tag);
+
+ hdr.rsvd = 0;
+ hdr.vld_tid = htonl(F_IPPOD_VALID | V_IPPOD_TID(tid));
+ hdr.pgsz_tag_clr = htonl(tag & ci->rsvd_tag_mask);
+ hdr.maxoffset = htonl(gl->length);
+ hdr.pgoffset = htonl(gl->offset);
+
+ rc = t4_ddp_set_map(ci, icc, &hdr, idx, npods, gl, reply);
+ if (rc < 0)
+ goto unmark_entries;
+
+ ci->idx_last = idx;
+ *tagp = tag;
+ return (0);
+
+unmark_entries:
+ ddp_unmark_entries(ci, idx, npods);
+ return (rc);
+}
+
+/**
+ * cxgbei_ulp2_ddp_tag_release - release a ddp tag
+ * @ci: adapter's ddp info
+ * @tag: ddp tag
+ * ddp cleanup for a given ddp tag and release all the resources held
+ */
+void
+cxgbei_ulp2_ddp_tag_release(struct cxgbei_data *ci, uint32_t tag,
+ struct icl_cxgbei_conn *icc)
+{
+ uint32_t idx;
+
+ MPASS(ci != NULL);
+ MPASS(icc != NULL);
+
+ idx = (tag >> IPPOD_IDX_SHIFT) & ci->idx_mask;
+ CTR3(KTR_CXGBE, "tag:0x%x idx:0x%x nppods:0x%x",
+ tag, idx, ci->nppods);
+ if (idx < ci->nppods) {
+ struct cxgbei_ulp2_gather_list *gl = ci->gl_map[idx];
+ unsigned int npods;
+
+ if (!gl || !gl->nelem) {
+ CTR4(KTR_CXGBE,
+ "release 0x%x, idx 0x%x, gl 0x%p, %u.",
+ tag, idx, gl, gl ? gl->nelem : 0);
+ return;
+ }
+ npods = (gl->nelem + IPPOD_PAGES_MAX - 1) >> IPPOD_PAGES_SHIFT;
+ CTR3(KTR_CXGBE, "ddp tag 0x%x, release idx 0x%x, npods %u.",
+ tag, idx, npods);
+ t4_ddp_clear_map(ci, gl, tag, idx, npods, icc);
+ ddp_unmark_entries(ci, idx, npods);
+ cxgbei_ulp2_ddp_release_gl(ci, gl);
+ } else
+ CTR3(KTR_CXGBE, "ddp tag 0x%x, idx 0x%x > max 0x%x.",
+ tag, idx, ci->nppods);
+}
+
+/**
+ * cxgbei_ddp_cleanup - release the adapter's ddp resources
+ */
+void
+cxgbei_ddp_cleanup(struct cxgbei_data *ci)
+{
+ int i = 0;
+
+ while (i < ci->nppods) {
+ struct cxgbei_ulp2_gather_list *gl = ci->gl_map[i];
+ if (gl) {
+ int npods = (gl->nelem + IPPOD_PAGES_MAX - 1)
+ >> IPPOD_PAGES_SHIFT;
+ free(gl, M_DEVBUF);
+ i += npods;
+ } else
+ i++;
+ }
+ free(ci->colors, M_CXGBE);
+ free(ci->gl_map, M_CXGBE);
+}
+#endif
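
The pagepod arithmetic used by cxgbei_ulp2_ddp_tag_reserve() follows from the constants in cxgbei_ulp2_ddp.h: a pagepod is 64 bytes (IPPOD_SIZE_SHIFT is 6) and maps IPPOD_PAGES_MAX (4) pages, with a fifth address slot so that consecutive pods overlap by one page (ppod_set() fills PPOD_PAGES + 1 addresses), so an nelem-page gather list needs (nelem + 3) / 4 pods. A small stand-alone sketch of that arithmetic, assuming a 4 KB page size:

#include <stdio.h>

/* Constants copied from cxgbei_ulp2_ddp.h. */
#define IPPOD_SIZE		64	/* 1 << IPPOD_SIZE_SHIFT */
#define IPPOD_PAGES_MAX		4
#define IPPOD_PAGES_SHIFT	2	/* 4 pages per pod */

#define EXAMPLE_PAGE_SIZE	4096	/* assumption for this sketch */

int
main(void)
{
	unsigned int xferlen = 1024 * 1024;	/* 1 MB transfer */
	unsigned int npages, npods;

	/* Same arithmetic as cxgbei_ulp2_ddp_tag_reserve(). */
	npages = (xferlen + EXAMPLE_PAGE_SIZE - 1) / EXAMPLE_PAGE_SIZE;
	npods = (npages + IPPOD_PAGES_MAX - 1) >> IPPOD_PAGES_SHIFT;

	printf("%u bytes -> %u pages -> %u pagepods (%u bytes of pod memory)\n",
	    xferlen, npages, npods, npods * IPPOD_SIZE);
	return (0);
}
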
diff --git a/sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.h b/sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.h
new file mode 100644
index 000000000000..f069f09aa47f
--- /dev/null
+++ b/sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.h
@@ -0,0 +1,217 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ *
+ * Chelsio T5xx iSCSI driver
+ * cxgbei_ulp2_ddp.c: Chelsio iSCSI DDP Manager.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef __CXGBEI_ULP2_DDP_H__
+#define __CXGBEI_ULP2_DDP_H__
+
+#define CXGBEI_PAGE_MASK (~(PAGE_SIZE-1))
+#define DDP_THRESHOLD 2048
+
+/*
+ * A cxgbei ddp tag is 32 bits; it consists of reserved bits used by h/w and
+ * non-reserved bits that can be used by the iscsi s/w.
+ * The reserved bits are identified by the rsvd_bits and rsvd_shift fields
+ * in struct cxgbei_ulp2_tag_format.
+ *
+ * The uppermost reserved bit can be used to check whether a tag is a ddp tag:
+ * if the bit is 0, the tag is a valid ddp tag.
+ */
+
+/*
+ * cxgbei_ulp2_is_ddp_tag - check if a given tag is a hw/ddp tag
+ * @tformat: tag format information
+ * @tag: tag to be checked
+ *
+ * return true if the tag is a ddp tag, false otherwise.
+ */
+static inline int
+cxgbei_ulp2_is_ddp_tag(struct cxgbei_ulp2_tag_format *tformat, uint32_t tag)
+{
+
+ return (!(tag & (1 << (tformat->rsvd_bits + tformat->rsvd_shift - 1))));
+}
+
+/*
+ * cxgbei_ulp2_sw_tag_usable - check if s/w tag has enough bits left for hw bits
+ * @tformat: tag format information
+ * @sw_tag: s/w tag to be checked
+ *
+ * return true if the tag can be used for hw ddp tag, false otherwise.
+ */
+static inline int
+cxgbei_ulp2_sw_tag_usable(struct cxgbei_ulp2_tag_format *tformat,
+ uint32_t sw_tag)
+{
+
+ return (1); /* XXXNP: huh? */
+
+ sw_tag >>= (32 - tformat->rsvd_bits + tformat->rsvd_shift);
+ return !sw_tag;
+}
+
+/*
+ * cxgbei_ulp2_set_non_ddp_tag - mark a given s/w tag as an invalid ddp tag
+ * @tformat: tag format information
+ * @sw_tag: s/w tag to be checked
+ *
+ * insert 1 at the upper most reserved bit to mark it as an invalid ddp tag.
+ */
+static inline uint32_t
+cxgbei_ulp2_set_non_ddp_tag(struct cxgbei_ulp2_tag_format *tformat,
+ uint32_t sw_tag)
+{
+ uint32_t rsvd_bits = tformat->rsvd_bits + tformat->rsvd_shift;
+ if (sw_tag) {
+ u32 v1 = sw_tag & ((1 << (rsvd_bits - 1)) - 1);
+ u32 v2 = (sw_tag >> (rsvd_bits - 1)) << rsvd_bits;
+ return v2 | (1 << (rsvd_bits - 1)) | v1;
+ }
+
+ return sw_tag | (1 << (rsvd_bits - 1)) ;
+}
+
+struct dma_segments {
+ bus_dmamap_t bus_map;
+ bus_addr_t phys_addr;
+};
+/*
+ * struct cxgbei_ulp2_gather_list - cxgbei direct data placement memory
+ *
+ * @tag: ddp tag
+ * @length: total data buffer length
+ * @offset: initial offset to the 1st page
+ * @nelem: # of pages
+ * @pages: page pointers
+ * @phys_addr: physical address
+ */
+struct cxgbei_ulp2_gather_list {
+ uint32_t tag;
+ uint32_t tid;
+ uint32_t port_id;
+ void *egress_dev;
+ unsigned int length;
+ unsigned int offset;
+ unsigned int nelem;
+ bus_size_t mapsize;
+ bus_dmamap_t bus_map;
+ bus_dma_segment_t *segments;
+ void **pages;
+ struct dma_segments dma_sg[0];
+};
+
+#define IPPOD_SIZE sizeof(struct cxgbei_ulp2_pagepod) /* 64 */
+#define IPPOD_SIZE_SHIFT 6
+
+#define IPPOD_COLOR_SHIFT 0
+#define IPPOD_COLOR_SIZE 6
+#define IPPOD_COLOR_MASK ((1 << IPPOD_COLOR_SIZE) - 1)
+
+#define IPPOD_IDX_SHIFT IPPOD_COLOR_SIZE
+#define IPPOD_IDX_MAX_SIZE 24
+
+#define S_IPPOD_TID 0
+#define M_IPPOD_TID 0xFFFFFF
+#define V_IPPOD_TID(x) ((x) << S_IPPOD_TID)
+
+#define S_IPPOD_VALID 24
+#define V_IPPOD_VALID(x) ((x) << S_IPPOD_VALID)
+#define F_IPPOD_VALID V_IPPOD_VALID(1U)
+
+#define S_IPPOD_COLOR 0
+#define M_IPPOD_COLOR 0x3F
+#define V_IPPOD_COLOR(x) ((x) << S_IPPOD_COLOR)
+
+#define S_IPPOD_TAG 6
+#define M_IPPOD_TAG 0xFFFFFF
+#define V_IPPOD_TAG(x) ((x) << S_IPPOD_TAG)
+
+#define S_IPPOD_PGSZ 30
+#define M_IPPOD_PGSZ 0x3
+#define V_IPPOD_PGSZ(x) ((x) << S_IPPOD_PGSZ)
+
+static inline uint32_t
+cxgbei_ulp2_ddp_tag_base(u_int idx, u_char *colors,
+ struct cxgbei_ulp2_tag_format *tformat, uint32_t sw_tag)
+{
+ if (__predict_false(++colors[idx] == 1 << IPPOD_IDX_SHIFT))
+ colors[idx] = 0;
+
+ sw_tag <<= tformat->rsvd_bits + tformat->rsvd_shift;
+
+ return (sw_tag | idx << IPPOD_IDX_SHIFT | colors[idx]);
+}
+
+#define ISCSI_PDU_NONPAYLOAD_LEN 312 /* bhs(48) + ahs(256) + digest(8) */
+
+/*
+ * align pdu size to multiple of 512 for better performance
+ */
+#define cxgbei_align_pdu_size(n) do { n = (n) & (~511); } while (0)
+
+#define ULP2_MAX_PKT_SIZE 16224
+#define ULP2_MAX_PDU_PAYLOAD (ULP2_MAX_PKT_SIZE - ISCSI_PDU_NONPAYLOAD_LEN)
+#define IPPOD_PAGES_MAX 4
+#define IPPOD_PAGES_SHIFT 2 /* 4 pages per pod */
+
+/*
+ * struct pagepod_hdr, pagepod - pagepod format
+ */
+struct cxgbei_ulp2_pagepod_hdr {
+ uint32_t vld_tid;
+ uint32_t pgsz_tag_clr;
+ uint32_t maxoffset;
+ uint32_t pgoffset;
+ uint64_t rsvd;
+};
+
+struct cxgbei_ulp2_pagepod {
+ struct cxgbei_ulp2_pagepod_hdr hdr;
+ uint64_t addr[IPPOD_PAGES_MAX + 1];
+};
+
+int cxgbei_ulp2_ddp_tag_reserve(struct cxgbei_data *, void *, unsigned int,
+ struct cxgbei_ulp2_tag_format *, uint32_t *,
+ struct cxgbei_ulp2_gather_list *, int , int );
+void cxgbei_ulp2_ddp_tag_release(struct cxgbei_data *, uint32_t,
+ struct icl_cxgbei_conn *);
+
+struct cxgbei_ulp2_gather_list *cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec(u_int,
+ struct cxgbei_sgl *, u_int, struct cxgbei_data *, int);
+void cxgbei_ulp2_ddp_release_gl(struct cxgbei_data *,
+ struct cxgbei_ulp2_gather_list *);
+
+int cxgbei_ulp2_ddp_find_page_index(u_long);
+int cxgbei_ulp2_adapter_ddp_info(struct cxgbei_data *,
+ struct cxgbei_ulp2_tag_format *);
+
+void cxgbei_ddp_cleanup(struct cxgbei_data *);
+#endif
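
The tag helpers in cxgbei_ulp2_ddp.h split a 32-bit tag into three fields: the low IPPOD_COLOR_SIZE (6) bits are the per-pod color, the next idx_bits identify the pagepod index (cxgbei_ddp_init() sets rsvd_shift to IPPOD_IDX_SHIFT and rsvd_bits equal to idx_bits), and whatever remains on top carries the software tag. cxgbei_ulp2_set_non_ddp_tag() forces the uppermost reserved bit to 1, which is exactly the bit cxgbei_ulp2_is_ddp_tag() tests. The stand-alone sketch below packs and checks a tag using the same shifts, with an assumed idx_bits of 10 (the real value depends on the adapter's pagepod region size):

#include <stdio.h>
#include <stdint.h>

#define IPPOD_COLOR_SIZE	6
#define IPPOD_IDX_SHIFT		IPPOD_COLOR_SIZE

/* Assumed geometry for this sketch only. */
#define EXAMPLE_IDX_BITS	10

int
main(void)
{
	uint32_t sw_tag = 0x1234, idx = 0x25, color = 0x3;
	uint32_t tag, top_bit;

	/* Same packing as cxgbei_ulp2_ddp_tag_base(). */
	tag = (sw_tag << (EXAMPLE_IDX_BITS + IPPOD_IDX_SHIFT)) |
	    (idx << IPPOD_IDX_SHIFT) | color;

	/* Same test as cxgbei_ulp2_is_ddp_tag(): top reserved bit clear. */
	top_bit = 1u << (EXAMPLE_IDX_BITS + IPPOD_IDX_SHIFT - 1);
	printf("tag 0x%08x is %sa ddp tag\n", tag,
	    (tag & top_bit) ? "not " : "");

	/* Recover the fields the way cxgbei_ulp2_ddp_tag_release() does. */
	printf("idx 0x%x color 0x%x\n",
	    (tag >> IPPOD_IDX_SHIFT) & ((1u << EXAMPLE_IDX_BITS) - 1),
	    tag & ((1u << IPPOD_COLOR_SIZE) - 1));
	return (0);
}
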
diff --git a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
new file mode 100644
index 000000000000..6f202813bc4a
--- /dev/null
+++ b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
@@ -0,0 +1,896 @@
+/*-
+ * Copyright (c) 2012 The FreeBSD Foundation
+ * Copyright (c) 2015 Chelsio Communications, Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Edward Tomasz Napierala under sponsorship
+ * from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+/*
+ * cxgbei implementation of iSCSI Common Layer kobj(9) interface.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#ifdef TCP_OFFLOAD
+#include <sys/param.h>
+#include <sys/capsicum.h>
+#include <sys/condvar.h>
+#include <sys/conf.h>
+#include <sys/file.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/module.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/sx.h>
+#include <sys/uio.h>
+#include <machine/bus.h>
+#include <vm/uma.h>
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#include <netinet/toecore.h>
+
+#include <dev/iscsi/icl.h>
+#include <dev/iscsi/iscsi_proto.h>
+#include <icl_conn_if.h>
+
+#include "common/common.h"
+#include "tom/t4_tom.h"
+#include "cxgbei.h"
+
+SYSCTL_NODE(_kern_icl, OID_AUTO, cxgbei, CTLFLAG_RD, 0, "Chelsio iSCSI offload");
+static int coalesce = 1;
+SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, coalesce, CTLFLAG_RWTUN,
+ &coalesce, 0, "Try to coalesce PDUs before sending");
+static int partial_receive_len = 128 * 1024;
+SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN,
+ &partial_receive_len, 0, "Minimum read size for partially received "
+ "data segment");
+static int sendspace = 1048576;
+SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, sendspace, CTLFLAG_RWTUN,
+ &sendspace, 0, "Default send socket buffer size");
+static int recvspace = 1048576;
+SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, recvspace, CTLFLAG_RWTUN,
+ &recvspace, 0, "Default receive socket buffer size");
+
+static uma_zone_t icl_transfer_zone;
+
+static volatile u_int icl_cxgbei_ncons;
+
+#define ICL_CONN_LOCK(X) mtx_lock((X)->ic_lock)
+#define ICL_CONN_UNLOCK(X) mtx_unlock((X)->ic_lock)
+#define ICL_CONN_LOCK_ASSERT(X) mtx_assert((X)->ic_lock, MA_OWNED)
+#define ICL_CONN_LOCK_ASSERT_NOT(X) mtx_assert((X)->ic_lock, MA_NOTOWNED)
+
+struct icl_pdu *icl_cxgbei_new_pdu(int);
+void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *);
+
+static icl_conn_new_pdu_t icl_cxgbei_conn_new_pdu;
+icl_conn_pdu_free_t icl_cxgbei_conn_pdu_free;
+static icl_conn_pdu_data_segment_length_t
+ icl_cxgbei_conn_pdu_data_segment_length;
+static icl_conn_pdu_append_data_t icl_cxgbei_conn_pdu_append_data;
+static icl_conn_pdu_get_data_t icl_cxgbei_conn_pdu_get_data;
+static icl_conn_pdu_queue_t icl_cxgbei_conn_pdu_queue;
+static icl_conn_handoff_t icl_cxgbei_conn_handoff;
+static icl_conn_free_t icl_cxgbei_conn_free;
+static icl_conn_close_t icl_cxgbei_conn_close;
+static icl_conn_task_setup_t icl_cxgbei_conn_task_setup;
+static icl_conn_task_done_t icl_cxgbei_conn_task_done;
+static icl_conn_transfer_setup_t icl_cxgbei_conn_transfer_setup;
+static icl_conn_transfer_done_t icl_cxgbei_conn_transfer_done;
+
+static kobj_method_t icl_cxgbei_methods[] = {
+ KOBJMETHOD(icl_conn_new_pdu, icl_cxgbei_conn_new_pdu),
+ KOBJMETHOD(icl_conn_pdu_free, icl_cxgbei_conn_pdu_free),
+ KOBJMETHOD(icl_conn_pdu_data_segment_length,
+ icl_cxgbei_conn_pdu_data_segment_length),
+ KOBJMETHOD(icl_conn_pdu_append_data, icl_cxgbei_conn_pdu_append_data),
+ KOBJMETHOD(icl_conn_pdu_get_data, icl_cxgbei_conn_pdu_get_data),
+ KOBJMETHOD(icl_conn_pdu_queue, icl_cxgbei_conn_pdu_queue),
+ KOBJMETHOD(icl_conn_handoff, icl_cxgbei_conn_handoff),
+ KOBJMETHOD(icl_conn_free, icl_cxgbei_conn_free),
+ KOBJMETHOD(icl_conn_close, icl_cxgbei_conn_close),
+ KOBJMETHOD(icl_conn_task_setup, icl_cxgbei_conn_task_setup),
+ KOBJMETHOD(icl_conn_task_done, icl_cxgbei_conn_task_done),
+ KOBJMETHOD(icl_conn_transfer_setup, icl_cxgbei_conn_transfer_setup),
+ KOBJMETHOD(icl_conn_transfer_done, icl_cxgbei_conn_transfer_done),
+ { 0, 0 }
+};
+
+DEFINE_CLASS(icl_cxgbei, icl_cxgbei_methods, sizeof(struct icl_cxgbei_conn));
+
+#if 0
+/*
+ * Subtract another 256 for AHS from MAX_DSL if AHS could be used.
+ */
+#define CXGBEI_MAX_PDU 16224
+#define CXGBEI_MAX_DSL (CXGBEI_MAX_PDU - sizeof(struct iscsi_bhs) - 8)
+#endif
+#define CXGBEI_MAX_DSL 8192
+#define CXGBEI_MAX_PDU (CXGBEI_MAX_DSL + sizeof(struct iscsi_bhs) + 8)
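+/* 8192 + 48 (BHS) + 8 (header and data digests) = 8248 byte maximum PDU. */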
+
+void
+icl_cxgbei_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
+{
+#ifdef INVARIANTS
+ struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
+#endif
+
+ MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
+ MPASS(ic == ip->ip_conn);
+ MPASS(ip->ip_bhs_mbuf != NULL);
+
+ m_freem(ip->ip_ahs_mbuf);
+ m_freem(ip->ip_data_mbuf);
+ m_freem(ip->ip_bhs_mbuf); /* storage for icl_cxgbei_pdu itself */
+
+#ifdef DIAGNOSTIC
+ if (__predict_true(ic != NULL))
+ refcount_release(&ic->ic_outstanding_pdus);
+#endif
+}
+
+struct icl_pdu *
+icl_cxgbei_new_pdu(int flags)
+{
+ struct icl_cxgbei_pdu *icp;
+ struct icl_pdu *ip;
+ struct mbuf *m;
+ uintptr_t a;
+
+ m = m_gethdr(flags, MT_DATA);
+ if (__predict_false(m == NULL))
+ return (NULL);
+
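+ /*
+ * Carve the PDU bookkeeping structure, and below it the BHS itself, out
+ * of the mbuf's own data area, each suitably aligned.  The same mbuf
+ * serves as the PDU's BHS mbuf.
+ */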
+ a = roundup2(mtod(m, uintptr_t), _Alignof(struct icl_cxgbei_pdu));
+ icp = (struct icl_cxgbei_pdu *)a;
+ bzero(icp, sizeof(*icp));
+
+ icp->icp_signature = CXGBEI_PDU_SIGNATURE;
+ ip = &icp->ip;
+ ip->ip_bhs_mbuf = m;
+
+ a = roundup2((uintptr_t)(icp + 1), _Alignof(struct iscsi_bhs *));
+ ip->ip_bhs = (struct iscsi_bhs *)a;
+#ifdef INVARIANTS
+ /* Everything must fit entirely in the mbuf. */
+ a = (uintptr_t)(ip->ip_bhs + 1);
+ MPASS(a <= (uintptr_t)m + MSIZE);
+#endif
+ bzero(ip->ip_bhs, sizeof(*ip->ip_bhs));
+
+ m->m_data = (void *)ip->ip_bhs;
+ m->m_len = sizeof(struct iscsi_bhs);
+ m->m_pkthdr.len = m->m_len;
+
+ return (ip);
+}
+
+void
+icl_cxgbei_new_pdu_set_conn(struct icl_pdu *ip, struct icl_conn *ic)
+{
+
+ ip->ip_conn = ic;
+#ifdef DIAGNOSTIC
+ refcount_acquire(&ic->ic_outstanding_pdus);
+#endif
+}
+
+/*
+ * Allocate an icl_pdu with an empty BHS to be filled in by the caller.
+ */
+static struct icl_pdu *
+icl_cxgbei_conn_new_pdu(struct icl_conn *ic, int flags)
+{
+ struct icl_pdu *ip;
+
+ ip = icl_cxgbei_new_pdu(flags);
+ if (__predict_false(ip == NULL))
+ return (NULL);
+ icl_cxgbei_new_pdu_set_conn(ip, ic);
+
+ return (ip);
+}
+
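+/*
+ * The DataSegmentLength field of the BHS is a 24-bit quantity in network
+ * (big-endian) byte order; assemble it one byte at a time.
+ */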
+static size_t
+icl_pdu_data_segment_length(const struct icl_pdu *request)
+{
+ uint32_t len = 0;
+
+ len += request->ip_bhs->bhs_data_segment_len[0];
+ len <<= 8;
+ len += request->ip_bhs->bhs_data_segment_len[1];
+ len <<= 8;
+ len += request->ip_bhs->bhs_data_segment_len[2];
+
+ return (len);
+}
+
+size_t
+icl_cxgbei_conn_pdu_data_segment_length(struct icl_conn *ic,
+ const struct icl_pdu *request)
+{
+
+ return (icl_pdu_data_segment_length(request));
+}
+
+static uint32_t
+icl_conn_build_tasktag(struct icl_conn *ic, uint32_t tag)
+{
+
+ return (tag);
+}
+
+static struct mbuf *
+finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp)
+{
+ struct icl_pdu *ip = &icp->ip;
+ uint8_t ulp_submode, padding;
+ struct mbuf *m, *last;
+ struct iscsi_bhs *bhs;
+
+ /*
+ * Fix up the data segment mbuf first.
+ */
+ m = ip->ip_data_mbuf;
+ ulp_submode = icc->ulp_submode;
+ if (m) {
+ last = m_last(m);
+
+ /*
+ * Round up the data segment to a 4B boundary. Pad with 0 if
+ * necessary. There will definitely be room in the mbuf.
+ */
+ padding = roundup2(ip->ip_data_len, 4) - ip->ip_data_len;
+ if (padding) {
+ bzero(mtod(last, uint8_t *) + last->m_len, padding);
+ last->m_len += padding;
+ }
+ } else {
+ MPASS(ip->ip_data_len == 0);
+ ulp_submode &= ~ULP_CRC_DATA;
+ padding = 0;
+ }
+
+ /*
+ * Now the header mbuf that has the BHS.
+ */
+ m = ip->ip_bhs_mbuf;
+ MPASS(m->m_pkthdr.len == sizeof(struct iscsi_bhs));
+ MPASS(m->m_len == sizeof(struct iscsi_bhs));
+
+ bhs = ip->ip_bhs;
+ bhs->bhs_data_segment_len[2] = ip->ip_data_len;
+ bhs->bhs_data_segment_len[1] = ip->ip_data_len >> 8;
+ bhs->bhs_data_segment_len[0] = ip->ip_data_len >> 16;
+
+ /* "Convert" PDU to mbuf chain. Do not use icp/ip after this. */
+ m->m_pkthdr.len = sizeof(struct iscsi_bhs) + ip->ip_data_len + padding;
+ m->m_next = ip->ip_data_mbuf;
+ set_mbuf_ulp_submode(m, ulp_submode);
+#ifdef INVARIANTS
+ bzero(icp, sizeof(*icp));
+#endif
+#ifdef DIAGNOSTIC
+ refcount_release(&icc->ic.ic_outstanding_pdus);
+#endif
+
+ return (m);
+}
+
+int
+icl_cxgbei_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *ip,
+ const void *addr, size_t len, int flags)
+{
+ struct mbuf *m;
+#ifdef INVARIANTS
+ struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
+#endif
+
+ MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
+ MPASS(ic == ip->ip_conn);
+ KASSERT(len > 0, ("%s: len is %jd", __func__, (intmax_t)len));
+
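+ /*
+ * The data segment for the PDU is accumulated in a single 16KB cluster
+ * (allocated below on the first append).  With CXGBEI_MAX_DSL at 8K,
+ * m_append() should always find room in it without growing the chain.
+ */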
+ m = ip->ip_data_mbuf;
+ if (m == NULL) {
+ m = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUM16BYTES);
+ if (__predict_false(m == NULL))
+ return (ENOMEM);
+
+ ip->ip_data_mbuf = m;
+ }
+
+ if (__predict_true(m_append(m, len, addr) != 0)) {
+ ip->ip_data_len += len;
+ MPASS(ip->ip_data_len <= CXGBEI_MAX_DSL);
+ return (0);
+ } else {
+ if (flags & M_WAITOK) {
+ CXGBE_UNIMPLEMENTED("fail safe append");
+ }
+ ip->ip_data_len = m_length(m, NULL);
+ return (1);
+ }
+}
+
+void
+icl_cxgbei_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
+ size_t off, void *addr, size_t len)
+{
+ struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
+
+ if (icp->pdu_flags & SBUF_ULP_FLAG_DATA_DDPED)
+ return; /* data is DDP'ed, no need to copy */
+ m_copydata(ip->ip_data_mbuf, off, len, addr);
+}
+
+void
+icl_cxgbei_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
+{
+ struct icl_cxgbei_conn *icc = ic_to_icc(ic);
+ struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
+ struct socket *so = ic->ic_socket;
+ struct toepcb *toep = icc->toep;
+ struct inpcb *inp;
+ struct mbuf *m;
+
+ MPASS(ic == ip->ip_conn);
+ MPASS(ip->ip_bhs_mbuf != NULL);
+ /* The kernel doesn't generate PDUs with AHS. */
+ MPASS(ip->ip_ahs_mbuf == NULL && ip->ip_ahs_len == 0);
+
+ ICL_CONN_LOCK_ASSERT(ic);
+ /* NOTE: sowriteable without so_snd lock is a mostly harmless race. */
+ if (ic->ic_disconnecting || so == NULL || !sowriteable(so)) {
+ icl_cxgbei_conn_pdu_free(ic, ip);
+ return;
+ }
+
+ m = finalize_pdu(icc, icp);
+ M_ASSERTPKTHDR(m);
+ MPASS((m->m_pkthdr.len & 3) == 0);
+ MPASS(m->m_pkthdr.len + 8 <= CXGBEI_MAX_PDU);
+
+ /*
+ * Do not get inp from toep->inp as the toepcb might have detached
+ * already.
+ */
+ inp = sotoinpcb(so);
+ INP_WLOCK(inp);
+ if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) ||
+ __predict_false((toep->flags & TPF_ATTACHED) == 0))
+ m_freem(m);
+ else {
+ mbufq_enqueue(&toep->ulp_pduq, m);
+ t4_push_pdus(icc->sc, toep, 0);
+ }
+ INP_WUNLOCK(inp);
+}
+
+static struct icl_conn *
+icl_cxgbei_new_conn(const char *name, struct mtx *lock)
+{
+ struct icl_cxgbei_conn *icc;
+ struct icl_conn *ic;
+
+ refcount_acquire(&icl_cxgbei_ncons);
+
+ icc = (struct icl_cxgbei_conn *)kobj_create(&icl_cxgbei_class, M_CXGBE,
+ M_WAITOK | M_ZERO);
+ icc->icc_signature = CXGBEI_CONN_SIGNATURE;
+ STAILQ_INIT(&icc->rcvd_pdus);
+
+ ic = &icc->ic;
+ ic->ic_lock = lock;
+
+ /* XXXNP: review. Most of these icl_conn fields aren't really used */
+ STAILQ_INIT(&ic->ic_to_send);
+ cv_init(&ic->ic_send_cv, "icl_cxgbei_tx");
+ cv_init(&ic->ic_receive_cv, "icl_cxgbei_rx");
+#ifdef DIAGNOSTIC
+ refcount_init(&ic->ic_outstanding_pdus, 0);
+#endif
+ ic->ic_max_data_segment_length = CXGBEI_MAX_DSL;
+ ic->ic_name = name;
+ ic->ic_offload = "cxgbei";
+
+ CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);
+
+ return (ic);
+}
+
+void
+icl_cxgbei_conn_free(struct icl_conn *ic)
+{
+ struct icl_cxgbei_conn *icc = ic_to_icc(ic);
+
+ MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
+
+ CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);
+
+ cv_destroy(&ic->ic_send_cv);
+ cv_destroy(&ic->ic_receive_cv);
+
+ kobj_delete((struct kobj *)icc, M_CXGBE);
+ refcount_release(&icl_cxgbei_ncons);
+}
+
+static int
+icl_cxgbei_setsockopt(struct icl_conn *ic, struct socket *so)
+{
+ size_t minspace;
+ struct sockopt opt;
+ int error, one = 1;
+
+ /*
+ * For sendspace, this is required because the current code cannot
+ * send a PDU in pieces; thus, the minimum buffer size is equal
+ * to the maximum PDU size. "+4" is to account for possible padding.
+ *
+ * What we should actually do here is to use autoscaling, but set
+ * some minimal buffer size to "minspace". I don't know a way to do
+ * that, though.
+ */
+ minspace = sizeof(struct iscsi_bhs) + ic->ic_max_data_segment_length +
+ ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE + 4;
+ if (sendspace < minspace)
+ sendspace = minspace;
+ if (recvspace < minspace)
+ recvspace = minspace;
+
+ error = soreserve(so, sendspace, recvspace);
+ if (error != 0) {
+ icl_cxgbei_conn_close(ic);
+ return (error);
+ }
+ SOCKBUF_LOCK(&so->so_snd);
+ so->so_snd.sb_flags |= SB_AUTOSIZE;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_rcv.sb_flags |= SB_AUTOSIZE;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+
+ /*
+ * Disable Nagle.
+ */
+ bzero(&opt, sizeof(opt));
+ opt.sopt_dir = SOPT_SET;
+ opt.sopt_level = IPPROTO_TCP;
+ opt.sopt_name = TCP_NODELAY;
+ opt.sopt_val = &one;
+ opt.sopt_valsize = sizeof(one);
+ error = sosetopt(so, &opt);
+ if (error != 0) {
+ icl_cxgbei_conn_close(ic);
+ return (error);
+ }
+
+ return (0);
+}
+
+/*
+ * Request/response structure used to find out the adapter offloading a socket.
+ */
+struct find_ofld_adapter_rr {
+ struct socket *so;
+ struct adapter *sc; /* result */
+};
+
+static void
+find_offload_adapter(struct adapter *sc, void *arg)
+{
+ struct find_ofld_adapter_rr *fa = arg;
+ struct socket *so = fa->so;
+ struct tom_data *td = sc->tom_softc;
+ struct tcpcb *tp;
+ struct inpcb *inp;
+
+ /* Non-TCP sockets were filtered out earlier. */
+ MPASS(so->so_proto->pr_protocol == IPPROTO_TCP);
+
+ if (fa->sc != NULL)
+ return; /* Found already. */
+
+ if (td == NULL)
+ return; /* TOE not enabled on this adapter. */
+
+ inp = sotoinpcb(so);
+ INP_WLOCK(inp);
+ if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
+ tp = intotcpcb(inp);
+ if (tp->t_flags & TF_TOE && tp->tod == &td->tod)
+ fa->sc = sc; /* Found. */
+ }
+ INP_WUNLOCK(inp);
+}
+
+/* XXXNP: move this to t4_tom. */
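+/*
+ * Send a FLOWC work request with a single parameter, TXDATAPLEN_MAX, set
+ * to the largest PDU that will be handed to the hardware on this
+ * connection (the caller passes CXGBEI_MAX_PDU).
+ */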
+static void
+send_iscsi_flowc_wr(struct adapter *sc, struct toepcb *toep, int maxlen)
+{
+ struct wrqe *wr;
+ struct fw_flowc_wr *flowc;
+ const u_int nparams = 1;
+ u_int flowclen;
+ struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
+
+ flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
+
+ wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
+ if (wr == NULL) {
+ /* XXX */
+ panic("%s: allocation failure.", __func__);
+ }
+ flowc = wrtod(wr);
+ memset(flowc, 0, wr->wr_len);
+
+ flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
+ V_FW_FLOWC_WR_NPARAMS(nparams));
+ flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
+ V_FW_WR_FLOWID(toep->tid));
+
+ flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_TXDATAPLEN_MAX;
+ flowc->mnemval[0].val = htobe32(maxlen);
+
+ txsd->tx_credits = howmany(flowclen, 16);
+ txsd->plen = 0;
+ KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
+ ("%s: not enough credits (%d)", __func__, toep->tx_credits));
+ toep->tx_credits -= txsd->tx_credits;
+ if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
+ toep->txsd_pidx = 0;
+ toep->txsd_avail--;
+
+ t4_wrq_tx(sc, wr);
+}
+
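+/*
+ * Switch the connection's TCB to iSCSI ULP mode.  The ULP mode occupies
+ * the low 4 bits of the field written here and the submode (header/data
+ * digest enables) the bits directly above it.
+ */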
+static void
+set_ulp_mode_iscsi(struct adapter *sc, struct toepcb *toep, int hcrc, int dcrc)
+{
+ uint64_t val = 0;
+
+ if (hcrc)
+ val |= ULP_CRC_HEADER;
+ if (dcrc)
+ val |= ULP_CRC_DATA;
+ val <<= 4;
+ val |= ULP_MODE_ISCSI;
+
+ CTR4(KTR_CXGBE, "%s: tid %u, ULP_MODE_ISCSI, CRC hdr=%d data=%d",
+ __func__, toep->tid, hcrc, dcrc);
+
+ t4_set_tcb_field(sc, toep, 1, 0, 0xfff, val);
+}
+
+/*
+ * XXXNP: Who is responsible for cleaning up the socket if this returns with an
+ * error? Review all error paths.
+ *
+ * XXXNP: What happens to the socket's fd reference if the operation is
+ * successful, and how does that affect the socket's life cycle?
+ */
+int
+icl_cxgbei_conn_handoff(struct icl_conn *ic, int fd)
+{
+ struct icl_cxgbei_conn *icc = ic_to_icc(ic);
+ struct find_ofld_adapter_rr fa;
+ struct file *fp;
+ struct socket *so;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct toepcb *toep;
+ cap_rights_t rights;
+ int error;
+
+ MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
+ ICL_CONN_LOCK_ASSERT_NOT(ic);
+
+ /*
+ * Steal the socket from userland.
+ */
+ error = fget(curthread, fd,
+ cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
+ if (error != 0)
+ return (error);
+ if (fp->f_type != DTYPE_SOCKET) {
+ fdrop(fp, curthread);
+ return (EINVAL);
+ }
+ so = fp->f_data;
+ if (so->so_type != SOCK_STREAM ||
+ so->so_proto->pr_protocol != IPPROTO_TCP) {
+ fdrop(fp, curthread);
+ return (EINVAL);
+ }
+
+ ICL_CONN_LOCK(ic);
+ if (ic->ic_socket != NULL) {
+ ICL_CONN_UNLOCK(ic);
+ fdrop(fp, curthread);
+ return (EBUSY);
+ }
+ ic->ic_disconnecting = false;
+ ic->ic_socket = so;
+ fp->f_ops = &badfileops;
+ fp->f_data = NULL;
+ fdrop(fp, curthread);
+ ICL_CONN_UNLOCK(ic);
+
+ /* Find the adapter offloading this socket. */
+ fa.sc = NULL;
+ fa.so = so;
+ t4_iterate(find_offload_adapter, &fa);
+ if (fa.sc == NULL)
+ return (EINVAL);
+ icc->sc = fa.sc;
+
+ error = icl_cxgbei_setsockopt(ic, so);
+ if (error)
+ return (error);
+
+ inp = sotoinpcb(so);
+ INP_WLOCK(inp);
+ tp = intotcpcb(inp);
+ if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))
+ error = EBUSY;
+ else {
+ /*
+ * The socket cannot have been "unoffloaded" if we got this far.
+ */
+ MPASS(tp->t_flags & TF_TOE);
+ MPASS(tp->tod != NULL);
+ MPASS(tp->t_toe != NULL);
+ toep = tp->t_toe;
+ MPASS(toep->vi->pi->adapter == icc->sc);
+ icc->toep = toep;
+ icc->cwt = cxgbei_select_worker_thread(icc);
+ icc->ulp_submode = 0;
+ if (ic->ic_header_crc32c)
+ icc->ulp_submode |= ULP_CRC_HEADER;
+ if (ic->ic_data_crc32c)
+ icc->ulp_submode |= ULP_CRC_DATA;
+ so->so_options |= SO_NO_DDP;
+ toep->ulp_mode = ULP_MODE_ISCSI;
+ toep->ulpcb = icc;
+
+ send_iscsi_flowc_wr(icc->sc, toep, CXGBEI_MAX_PDU);
+ set_ulp_mode_iscsi(icc->sc, toep, ic->ic_header_crc32c,
+ ic->ic_data_crc32c);
+ error = 0;
+ }
+ INP_WUNLOCK(inp);
+
+ return (error);
+}
+
+void
+icl_cxgbei_conn_close(struct icl_conn *ic)
+{
+ struct icl_cxgbei_conn *icc = ic_to_icc(ic);
+ struct icl_pdu *ip;
+ struct socket *so;
+ struct sockbuf *sb;
+ struct inpcb *inp;
+ struct toepcb *toep = icc->toep;
+
+ MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
+ ICL_CONN_LOCK_ASSERT_NOT(ic);
+
+ ICL_CONN_LOCK(ic);
+ so = ic->ic_socket;
+ if (ic->ic_disconnecting || so == NULL) {
+ CTR4(KTR_CXGBE, "%s: icc %p (disconnecting = %d), so %p",
+ __func__, icc, ic->ic_disconnecting, so);
+ ICL_CONN_UNLOCK(ic);
+ return;
+ }
+ ic->ic_disconnecting = true;
+
+ /* These are unused in this driver right now. */
+ MPASS(STAILQ_EMPTY(&ic->ic_to_send));
+ MPASS(ic->ic_receive_pdu == NULL);
+
+#ifdef DIAGNOSTIC
+ KASSERT(ic->ic_outstanding_pdus == 0,
+ ("destroying session with %d outstanding PDUs",
+ ic->ic_outstanding_pdus));
+#endif
+ ICL_CONN_UNLOCK(ic);
+
+ CTR3(KTR_CXGBE, "%s: tid %d, icc %p", __func__, toep ? toep->tid : -1,
+ icc);
+ inp = sotoinpcb(so);
+ sb = &so->so_rcv;
+ INP_WLOCK(inp);
+ if (toep != NULL) { /* NULL if connection was never offloaded. */
+ toep->ulpcb = NULL;
+ mbufq_drain(&toep->ulp_pduq);
+ SOCKBUF_LOCK(sb);
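+ /*
+ * Wait for any in-progress receive processing on this connection
+ * (RXF_ACTIVE) to finish before freeing the PDUs still queued below.
+ */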
+ if (icc->rx_flags & RXF_ACTIVE) {
+ volatile u_int *p = &icc->rx_flags;
+
+ SOCKBUF_UNLOCK(sb);
+ INP_WUNLOCK(inp);
+
+ while (*p & RXF_ACTIVE)
+ pause("conclo", 1);
+
+ INP_WLOCK(inp);
+ SOCKBUF_LOCK(sb);
+ }
+
+ while (!STAILQ_EMPTY(&icc->rcvd_pdus)) {
+ ip = STAILQ_FIRST(&icc->rcvd_pdus);
+ STAILQ_REMOVE_HEAD(&icc->rcvd_pdus, ip_next);
+ icl_cxgbei_conn_pdu_free(ic, ip);
+ }
+ SOCKBUF_UNLOCK(sb);
+ }
+ INP_WUNLOCK(inp);
+
+ ICL_CONN_LOCK(ic);
+ ic->ic_socket = NULL;
+ ICL_CONN_UNLOCK(ic);
+
+ /*
+ * XXXNP: we should send RST instead of FIN when PDUs held in various
+ * queues were purged instead of delivered reliably, but soabort isn't
+ * really general purpose and wouldn't do the right thing here.
+ */
+ soclose(so);
+}
+
+int
+icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct ccb_scsiio *csio,
+ uint32_t *task_tagp, void **prvp)
+{
+ void *prv;
+
+ *task_tagp = icl_conn_build_tasktag(ic, *task_tagp);
+
+ prv = uma_zalloc(icl_transfer_zone, M_NOWAIT | M_ZERO);
+ if (prv == NULL)
+ return (ENOMEM);
+
+ *prvp = prv;
+
+ cxgbei_conn_task_reserve_itt(ic, prvp, csio, task_tagp);
+
+ return (0);
+}
+
+void
+icl_cxgbei_conn_task_done(struct icl_conn *ic, void *prv)
+{
+
+ cxgbei_cleanup_task(ic, prv);
+ uma_zfree(icl_transfer_zone, prv);
+}
+
+int
+icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io,
+ uint32_t *transfer_tag, void **prvp)
+{
+ void *prv;
+
+ *transfer_tag = icl_conn_build_tasktag(ic, *transfer_tag);
+
+ prv = uma_zalloc(icl_transfer_zone, M_NOWAIT | M_ZERO);
+ if (prv == NULL)
+ return (ENOMEM);
+
+ *prvp = prv;
+
+ cxgbei_conn_transfer_reserve_ttt(ic, prvp, io, transfer_tag);
+
+ return (0);
+}
+
+void
+icl_cxgbei_conn_transfer_done(struct icl_conn *ic, void *prv)
+{
+ cxgbei_cleanup_task(ic, prv);
+ uma_zfree(icl_transfer_zone, prv);
+}
+
+static int
+icl_cxgbei_limits(size_t *limitp)
+{
+
+ *limitp = CXGBEI_MAX_DSL;
+
+ return (0);
+}
+
+static int
+icl_cxgbei_load(void)
+{
+ int error;
+
+ icl_transfer_zone = uma_zcreate("icl_transfer",
+ 16 * 1024, NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+
+ refcount_init(&icl_cxgbei_ncons, 0);
+
+ error = icl_register("cxgbei", 100, icl_cxgbei_limits,
+ icl_cxgbei_new_conn);
+ KASSERT(error == 0, ("failed to register"));
+
+ return (error);
+}
+
+static int
+icl_cxgbei_unload(void)
+{
+
+ if (icl_cxgbei_ncons != 0)
+ return (EBUSY);
+
+ icl_unregister("cxgbei");
+
+ uma_zdestroy(icl_transfer_zone);
+
+ return (0);
+}
+
+static int
+icl_cxgbei_modevent(module_t mod, int what, void *arg)
+{
+
+ switch (what) {
+ case MOD_LOAD:
+ return (icl_cxgbei_load());
+ case MOD_UNLOAD:
+ return (icl_cxgbei_unload());
+ default:
+ return (EINVAL);
+ }
+}
+
+moduledata_t icl_cxgbei_data = {
+ "icl_cxgbei",
+ icl_cxgbei_modevent,
+ 0
+};
+
+DECLARE_MODULE(icl_cxgbei, icl_cxgbei_data, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
+MODULE_DEPEND(icl_cxgbei, icl, 1, 1, 1);
+MODULE_VERSION(icl_cxgbei, 1);
+#endif