Diffstat (limited to 'sys/dev/cxgbe/tom')
-rw-r--r--   sys/dev/cxgbe/tom/t4_connect.c   |  43
-rw-r--r--   sys/dev/cxgbe/tom/t4_cpl_io.c    | 498
-rw-r--r--   sys/dev/cxgbe/tom/t4_ddp.c       |  22
-rw-r--r--   sys/dev/cxgbe/tom/t4_listen.c    |   7
-rw-r--r--   sys/dev/cxgbe/tom/t4_tls.c       | 357
-rw-r--r--   sys/dev/cxgbe/tom/t4_tls.h       |   1
-rw-r--r--   sys/dev/cxgbe/tom/t4_tom.c       | 149
-rw-r--r--   sys/dev/cxgbe/tom/t4_tom.h       |  22
-rw-r--r--   sys/dev/cxgbe/tom/t4_tom_l2t.c   |   2
9 files changed, 934 insertions(+), 167 deletions(-)
diff --git a/sys/dev/cxgbe/tom/t4_connect.c b/sys/dev/cxgbe/tom/t4_connect.c
index 99e4c222996d..c236ee060bc2 100644
--- a/sys/dev/cxgbe/tom/t4_connect.c
+++ b/sys/dev/cxgbe/tom/t4_connect.c
@@ -89,6 +89,12 @@ do_act_establish(struct sge_iq *iq, const struct rss_header *rss,
 	INP_WLOCK(inp);
 	toep->tid = tid;
 	insert_tid(sc, tid, toep, inp->inp_vflag & INP_IPV6 ? 2 : 1);
+	if (sc->params.tid_qid_sel_mask != 0) {
+		update_tid_qid_sel(toep->vi, &toep->params, tid);
+		toep->ofld_txq = &sc->sge.ofld_txq[toep->params.txq_idx];
+		toep->ctrlq = &sc->sge.ctrlq[toep->params.ctrlq_idx];
+	}
+
 	if (inp->inp_flags & INP_DROPPED) {
 		/* socket closed by the kernel before hw told us it connected */
@@ -205,7 +211,7 @@ static inline int
 act_open_cpl_size(struct adapter *sc, int isipv6)
 {
 	int idx;
-	static const int sz_table[3][2] = {
+	static const int sz_table[4][2] = {
 		{
 			sizeof (struct cpl_act_open_req),
 			sizeof (struct cpl_act_open_req6)
@@ -218,10 +224,14 @@ act_open_cpl_size(struct adapter *sc, int isipv6)
 			sizeof (struct cpl_t6_act_open_req),
 			sizeof (struct cpl_t6_act_open_req6)
 		},
+		{
+			sizeof (struct cpl_t7_act_open_req),
+			sizeof (struct cpl_t7_act_open_req6)
+		},
 	};
 
 	MPASS(chip_id(sc) >= CHELSIO_T4);
-	idx = min(chip_id(sc) - CHELSIO_T4, 2);
+	idx = min(chip_id(sc) - CHELSIO_T4, 3);
 	return (sz_table[idx][!!isipv6]);
 }
@@ -255,6 +265,7 @@ t4_connect(struct toedev *tod, struct socket *so, struct nhop_object *nh,
 	struct offload_settings settings;
 	struct epoch_tracker et;
 	uint16_t vid = 0xfff, pcp = 0;
+	uint64_t ntuple;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6,
@@ -308,10 +319,12 @@ t4_connect(struct toedev *tod, struct socket *so, struct nhop_object *nh,
 	qid_atid = V_TID_QID(toep->ofld_rxq->iq.abs_id) |
 	    V_TID_TID(toep->tid) | V_TID_COOKIE(CPL_COOKIE_TOM);
+	ntuple = select_ntuple(vi, toep->l2te);
 	if (isipv6) {
 		struct cpl_act_open_req6 *cpl = wrtod(wr);
 		struct cpl_t5_act_open_req6 *cpl5 = (void *)cpl;
 		struct cpl_t6_act_open_req6 *cpl6 = (void *)cpl;
+		struct cpl_t7_act_open_req6 *cpl7 = (void *)cpl;
 
 		if ((inp->inp_vflag & INP_IPV6) == 0)
 			DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP);
@@ -323,18 +336,23 @@ t4_connect(struct toedev *tod, struct socket *so, struct nhop_object *nh,
 		switch (chip_id(sc)) {
 		case CHELSIO_T4:
 			INIT_TP_WR(cpl, 0);
-			cpl->params = select_ntuple(vi, toep->l2te);
+			cpl->params = htobe32((uint32_t)ntuple);
 			break;
 		case CHELSIO_T5:
 			INIT_TP_WR(cpl5, 0);
 			cpl5->iss = htobe32(tp->iss);
-			cpl5->params = select_ntuple(vi, toep->l2te);
+			cpl5->params = htobe64(V_FILTER_TUPLE(ntuple));
 			break;
 		case CHELSIO_T6:
-		default:
 			INIT_TP_WR(cpl6, 0);
 			cpl6->iss = htobe32(tp->iss);
-			cpl6->params = select_ntuple(vi, toep->l2te);
+			cpl6->params = htobe64(V_FILTER_TUPLE(ntuple));
+			break;
+		case CHELSIO_T7:
+		default:
+			INIT_TP_WR(cpl7, 0);
+			cpl7->iss = htobe32(tp->iss);
+			cpl7->params = htobe64(V_T7_FILTER_TUPLE(ntuple));
 			break;
 		}
 		OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6,
 		    qid_atid));
@@ -356,23 +374,28 @@ t4_connect(struct toedev *tod, struct socket *so, struct nhop_object *nh,
 		struct cpl_act_open_req *cpl = wrtod(wr);
 		struct cpl_t5_act_open_req *cpl5 = (void *)cpl;
 		struct cpl_t6_act_open_req *cpl6 = (void *)cpl;
+		struct cpl_t7_act_open_req *cpl7 = (void *)cpl;
 
 		switch (chip_id(sc)) {
 		case CHELSIO_T4:
 			INIT_TP_WR(cpl, 0);
-			cpl->params = select_ntuple(vi, toep->l2te);
+			cpl->params = htobe32((uint32_t)ntuple);
 			break;
 		case CHELSIO_T5:
 			INIT_TP_WR(cpl5, 0);
 			cpl5->iss = htobe32(tp->iss);
-			cpl5->params = select_ntuple(vi, toep->l2te);
+			cpl5->params = htobe64(V_FILTER_TUPLE(ntuple));
 			break;
 		case CHELSIO_T6:
-		default:
 			INIT_TP_WR(cpl6, 0);
 			cpl6->iss = htobe32(tp->iss);
-			cpl6->params = select_ntuple(vi, toep->l2te);
+			cpl6->params = htobe64(V_FILTER_TUPLE(ntuple));
 			break;
+		case CHELSIO_T7:
+		default:
+			INIT_TP_WR(cpl7, 0);
+			cpl7->iss = htobe32(tp->iss);
+			cpl7->params = htobe64(V_T7_FILTER_TUPLE(ntuple));
 		}
 		OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ,
 		    qid_atid));
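[Editor's sketch] The sz_table change above grows the per-chip dispatch table by one row and clamps the index so chips newer than T7 reuse the newest request layout. A minimal standalone illustration of that clamp-and-index pattern; the sizes here are made up and are not the real CPL sizes:

```c
#include <stdio.h>

enum chip { CHELSIO_T4, CHELSIO_T5, CHELSIO_T6, CHELSIO_T7 };

/* Hypothetical sizes standing in for sizeof(struct cpl_*_act_open_req[6]). */
static const int sz_table[4][2] = {
	{ 32, 44 },	/* T4 */
	{ 40, 52 },	/* T5 */
	{ 40, 52 },	/* T6 */
	{ 48, 60 },	/* T7 */
};

static int
act_open_size(enum chip id, int isipv6)
{
	int idx = id - CHELSIO_T4;

	if (idx > 3)
		idx = 3;	/* future chips fall back to the T7 layout */
	return (sz_table[idx][!!isipv6]);
}

int
main(void)
{
	printf("T7 IPv6 request: %d bytes\n", act_open_size(CHELSIO_T7, 1));
	return (0);
}
```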
diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c
index 7a6b1cbdd736..5c39ae5fa8f3 100644
--- a/sys/dev/cxgbe/tom/t4_cpl_io.c
+++ b/sys/dev/cxgbe/tom/t4_cpl_io.c
@@ -66,6 +66,7 @@
 #include <vm/vm_page.h>
 
 #include <dev/iscsi/iscsi_proto.h>
+#include <dev/nvmf/nvmf_proto.h>
 
 #include "common/common.h"
 #include "common/t4_msg.h"
@@ -127,8 +128,9 @@ send_flowc_wr(struct toepcb *toep, struct tcpcb *tp)
 
 	paramidx = 0;
 	FLOWC_PARAM(PFNVFN, pfvf);
-	FLOWC_PARAM(CH, pi->tx_chan);
-	FLOWC_PARAM(PORT, pi->tx_chan);
+	/* Firmware expects hw port and will translate to channel itself. */
+	FLOWC_PARAM(CH, pi->hw_port);
+	FLOWC_PARAM(PORT, pi->hw_port);
 	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
 	FLOWC_PARAM(SNDBUF, toep->params.sndbuf);
 	if (tp) {
@@ -148,6 +150,8 @@ send_flowc_wr(struct toepcb *toep, struct tcpcb *tp)
 
 	KASSERT(paramidx == nparams, ("nparams mismatch"));
 
+	KASSERT(howmany(flowclen, 16) <= MAX_OFLD_TX_SDESC_CREDITS,
+	    ("%s: tx_credits %u too large", __func__, howmany(flowclen, 16)));
 	txsd->tx_credits = howmany(flowclen, 16);
 	txsd->plen = 0;
 	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
@@ -215,6 +219,8 @@ update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
 	else
 		flowc->mnemval[0].val = htobe32(tc_idx);
 
+	KASSERT(flowclen16 <= MAX_OFLD_TX_SDESC_CREDITS,
+	    ("%s: tx_credits %u too large", __func__, flowclen16));
 	txsd->tx_credits = flowclen16;
 	txsd->plen = 0;
 	toep->tx_credits -= txsd->tx_credits;
@@ -490,6 +496,12 @@ t4_close_conn(struct adapter *sc, struct toepcb *toep)
 #define MIN_ISO_TX_CREDITS  (howmany(sizeof(struct cpl_tx_data_iso), 16))
 #define MIN_TX_CREDITS(iso)						\
 	(MIN_OFLD_TX_CREDITS + ((iso) ? MIN_ISO_TX_CREDITS : 0))
+#define MIN_OFLD_TX_V2_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_v2_wr) + 1, 16))
+#define MIN_TX_V2_CREDITS(iso)						\
+	(MIN_OFLD_TX_V2_CREDITS + ((iso) ? MIN_ISO_TX_CREDITS : 0))
+
+_Static_assert(MAX_OFLD_TX_CREDITS <= MAX_OFLD_TX_SDESC_CREDITS,
+    "MAX_OFLD_TX_SDESC_CREDITS too small");
 
 /* Maximum amount of immediate data we could stuff in a WR */
 static inline int
@@ -534,6 +546,46 @@ max_dsgl_nsegs(int tx_credits, int iso)
 	return (nseg);
 }
 
+/* Maximum amount of immediate data we could stuff in a WR */
+static inline int
+max_imm_payload_v2(int tx_credits, int iso)
+{
+	const int iso_cpl_size = iso ? sizeof(struct cpl_tx_data_iso) : 0;
+
+	KASSERT(tx_credits >= 0 &&
+	    tx_credits <= MAX_OFLD_TX_CREDITS,
+	    ("%s: %d credits", __func__, tx_credits));
+
+	if (tx_credits < MIN_TX_V2_CREDITS(iso))
+		return (0);
+
+	return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_v2_wr) -
+	    iso_cpl_size);
+}
+
+/* Maximum number of SGL entries we could stuff in a WR */
+static inline int
+max_dsgl_nsegs_v2(int tx_credits, int iso, int imm_payload)
+{
+	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
+	int sge_pair_credits = tx_credits - MIN_TX_V2_CREDITS(iso);
+
+	KASSERT(tx_credits >= 0 &&
+	    tx_credits <= MAX_OFLD_TX_CREDITS,
+	    ("%s: %d credits", __func__, tx_credits));
+
+	if (tx_credits < MIN_TX_V2_CREDITS(iso) ||
+	    sge_pair_credits <= howmany(imm_payload, 16))
+		return (0);
+	sge_pair_credits -= howmany(imm_payload, 16);
+
+	nseg += 2 * (sge_pair_credits * 16 / 24);
+	if ((sge_pair_credits * 16) % 24 == 16)
+		nseg++;
+
+	return (nseg);
+}
+
 static inline void
 write_tx_wr(void *dst, struct toepcb *toep, int fw_wr_opcode,
     unsigned int immdlen, unsigned int plen, uint8_t credits, int shove,
@@ -561,6 +613,35 @@ write_tx_wr(void *dst, struct toepcb *toep, int fw_wr_opcode,
 	}
 }
 
+static inline void
+write_tx_v2_wr(void *dst, struct toepcb *toep, int fw_wr_opcode,
+    unsigned int immdlen, unsigned int plen, uint8_t credits, int shove,
+    int ulp_submode)
+{
+	struct fw_ofld_tx_data_v2_wr *txwr = dst;
+	uint32_t flags;
+
+	memset(txwr, 0, sizeof(*txwr));
+	txwr->op_to_immdlen = htobe32(V_WR_OP(fw_wr_opcode) |
+	    V_FW_WR_IMMDLEN(immdlen));
+	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
+	    V_FW_WR_LEN16(credits));
+	txwr->plen = htobe32(plen);
+	flags = V_TX_ULP_MODE(ULP_MODE_NVMET) | V_TX_ULP_SUBMODE(ulp_submode) |
+	    V_TX_URG(0) | V_TX_SHOVE(shove);
+
+	if (toep->params.tx_align > 0) {
+		if (plen < 2 * toep->params.emss)
+			flags |= F_FW_OFLD_TX_DATA_WR_LSODISABLE;
+		else
+			flags |= F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
+			    (toep->params.nagle == 0 ? 0 :
+			    F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE);
+	}
+
+	txwr->lsodisable_to_flags = htobe32(flags);
+}
+
 /*
  * Generate a DSGL from a starting mbuf.  The total number of segments and the
  * maximum segments in any one mbuf are provided.
@@ -612,6 +693,48 @@ write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
 	    __func__, nsegs, start, stop));
 }
 
+bool
+t4_push_raw_wr(struct adapter *sc, struct toepcb *toep, struct mbuf *m)
+{
+#ifdef INVARIANTS
+	struct inpcb *inp = toep->inp;
+#endif
+	struct wrqe *wr;
+	struct ofld_tx_sdesc *txsd;
+	u_int credits, plen;
+
+	INP_WLOCK_ASSERT(inp);
+	MPASS(mbuf_raw_wr(m));
+	plen = m->m_pkthdr.len;
+	credits = howmany(plen, 16);
+	if (credits > toep->tx_credits)
+		return (false);
+
+	wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq);
+	if (wr == NULL)
+		return (false);
+
+	m_copydata(m, 0, plen, wrtod(wr));
+	m_freem(m);
+
+	toep->tx_credits -= credits;
+	if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
+		toep->flags |= TPF_TX_SUSPENDED;
+
+	KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
+	KASSERT(credits <= MAX_OFLD_TX_SDESC_CREDITS,
+	    ("%s: tx_credits %u too large", __func__, credits));
+	txsd = &toep->txsd[toep->txsd_pidx];
+	txsd->plen = 0;
+	txsd->tx_credits = credits;
+	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
+		toep->txsd_pidx = 0;
+	toep->txsd_avail--;
+
+	t4_wrq_tx(sc, wr);
+	return (true);
+}
+
 /*
  * Max number of SGL entries an offload tx work request can have.  This is 41
  * (1 + 40) for a full 512B work request.
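[Editor's sketch] The v2 helpers above split a work request's 16-byte credits between the fw_ofld_tx_data_v2_wr header, the immediate PDU header, and SGL space, where each 24-byte ulp_tx_sge_pair describes two segments and 16 leftover bytes describe one. A runnable sketch of the same arithmetic, assuming a 32-byte v2 WR header (the real size comes from the firmware header files):

```c
#include <stdio.h>

/*
 * Sketch of the v2 SGL capacity math.  Assumptions: 16 B per credit,
 * a 32 B fw_ofld_tx_data_v2_wr, and a 16 B ulptx_sgl leader that
 * already describes one segment.
 */
static int
dsgl_capacity(int tx_credits, int imm_hdr_len)
{
	const int min_v2_credits = (32 + 1 + 15) / 16;	/* howmany(wr+1, 16) */
	int pair_credits = tx_credits - min_v2_credits;
	int nseg = 1;

	pair_credits -= (imm_hdr_len + 15) / 16;	/* immediate header */
	if (pair_credits <= 0)
		return (0);
	nseg += 2 * (pair_credits * 16 / 24);		/* full sge pairs */
	if ((pair_credits * 16) % 24 == 16)
		nseg++;					/* half a pair left */
	return (nseg);
}

int
main(void)
{
	/* 32 credits (a 512 B WR) and a 24 B PDU header -> 37 segments. */
	printf("%d segments\n", dsgl_capacity(32, 24));
	return (0);
}
```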
@@ -644,6 +767,7 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
 	struct tcpcb *tp = intotcpcb(inp);
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_snd;
+	struct mbufq *pduq = &toep->ulp_pduq;
 	int tx_credits, shove, compl, sowwakeup;
 	struct ofld_tx_sdesc *txsd;
 	bool nomap_mbuf_seen;
@@ -688,6 +812,19 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
 		max_imm = max_imm_payload(tx_credits, 0);
 		max_nsegs = max_dsgl_nsegs(tx_credits, 0);
 
+		if (__predict_false((sndptr = mbufq_first(pduq)) != NULL)) {
+			if (!t4_push_raw_wr(sc, toep, sndptr)) {
+				toep->flags |= TPF_TX_SUSPENDED;
+				return;
+			}
+
+			m = mbufq_dequeue(pduq);
+			MPASS(m == sndptr);
+
+			txsd = &toep->txsd[toep->txsd_pidx];
+			continue;
+		}
+
 		SOCKBUF_LOCK(sb);
 		sowwakeup = drop;
 		if (drop) {
@@ -705,6 +842,8 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
 			if ((m->m_flags & M_NOTREADY) != 0)
 				break;
+			if (plen + m->m_len > MAX_OFLD_TX_SDESC_PLEN)
+				break;
 			if (m->m_flags & M_EXTPG) {
 #ifdef KERN_TLS
 				if (m->m_epg_tls != NULL) {
@@ -870,6 +1009,8 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
+		KASSERT(plen <= MAX_OFLD_TX_SDESC_PLEN,
+		    ("%s: plen %u too large", __func__, plen));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd++;
@@ -914,8 +1055,8 @@ rqdrop_locked(struct mbufq *q, int plen)
 #define	ULP_ISO	G_TX_ULP_SUBMODE(F_FW_ISCSI_TX_DATA_WR_ULPSUBMODE_ISO)
 
 static void
-write_tx_data_iso(void *dst, u_int ulp_submode, uint8_t flags, uint16_t mss,
-    int len, int npdu)
+write_iscsi_tx_data_iso(void *dst, u_int ulp_submode, uint8_t flags,
+    uint16_t mss, int len, int npdu)
 {
 	struct cpl_tx_data_iso *cpl;
 	unsigned int burst_size;
@@ -1079,7 +1220,7 @@ write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr)
 		    adjusted_plen, credits, shove, ulp_submode | ULP_ISO);
 		cpl_iso = (struct cpl_tx_data_iso *)(txwr + 1);
 		MPASS(plen == sndptr->m_pkthdr.len);
-		write_tx_data_iso(cpl_iso, ulp_submode,
+		write_iscsi_tx_data_iso(cpl_iso, ulp_submode,
 		    mbuf_iscsi_iso_flags(sndptr), iso_mss, plen, npdu);
 		p = cpl_iso + 1;
 	} else {
@@ -1115,21 +1256,269 @@ write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr)
 	return (wr);
 }
 
+static void
+write_nvme_tx_data_iso(void *dst, u_int ulp_submode, u_int iso_type,
+    uint16_t mss, int len, int npdu, int pdo)
+{
+	struct cpl_t7_tx_data_iso *cpl;
+	unsigned int burst_size;
+
+	/*
+	 * TODO: Need to figure out how the LAST_PDU and SUCCESS flags
+	 * are handled.
+	 *
+	 * - Does len need padding bytes?  (If so, does padding need
+	 *   to be in DSGL input?)
+	 *
+	 * - burst always 0?
+	 */
+	burst_size = 0;
+
+	cpl = (struct cpl_t7_tx_data_iso *)dst;
+	cpl->op_to_scsi = htonl(V_CPL_T7_TX_DATA_ISO_OPCODE(CPL_TX_DATA_ISO) |
+	    V_CPL_T7_TX_DATA_ISO_FIRST(1) |
+	    V_CPL_T7_TX_DATA_ISO_LAST(1) |
+	    V_CPL_T7_TX_DATA_ISO_CPLHDRLEN(0) |
+	    V_CPL_T7_TX_DATA_ISO_HDRCRC(!!(ulp_submode & ULP_CRC_HEADER)) |
+	    V_CPL_T7_TX_DATA_ISO_PLDCRC(!!(ulp_submode & ULP_CRC_DATA)) |
+	    V_CPL_T7_TX_DATA_ISO_IMMEDIATE(0) |
+	    V_CPL_T7_TX_DATA_ISO_SCSI(iso_type));
+
+	cpl->nvme_tcp_pkd = F_CPL_T7_TX_DATA_ISO_NVME_TCP;
+	cpl->ahs = 0;
+	cpl->mpdu = htons(DIV_ROUND_UP(mss, 4));
+	cpl->burst = htonl(DIV_ROUND_UP(burst_size, 4));
+	cpl->size = htonl(len);
+	cpl->num_pi_bytes_seglen_offset = htonl(0);
+	cpl->datasn_offset = htonl(0);
+	cpl->buffer_offset = htonl(0);
+	cpl->pdo_pkd = pdo;
+}
+
+static struct wrqe *
+write_nvme_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr)
+{
+	struct mbuf *m;
+	const struct nvme_tcp_common_pdu_hdr *hdr;
+	struct fw_v2_nvmet_tx_data_wr *txwr;
+	struct cpl_tx_data_iso *cpl_iso;
+	void *p;
+	struct wrqe *wr;
+	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
+	u_int adjusted_plen, imm_data, ulp_submode;
+	struct inpcb *inp = toep->inp;
+	struct tcpcb *tp = intotcpcb(inp);
+	int tx_credits, shove, npdu, wr_len;
+	uint16_t iso_mss;
+	bool iso, nomap_mbuf_seen;
+
+	M_ASSERTPKTHDR(sndptr);
+
+	tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
+	if (mbuf_raw_wr(sndptr)) {
+		plen = sndptr->m_pkthdr.len;
+		KASSERT(plen <= SGE_MAX_WR_LEN,
+		    ("raw WR len %u is greater than max WR len", plen));
+		if (plen > tx_credits * 16)
+			return (NULL);
+
+		wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq);
+		if (__predict_false(wr == NULL))
+			return (NULL);
+
+		m_copydata(sndptr, 0, plen, wrtod(wr));
+		return (wr);
+	}
+
+	/*
+	 * The first mbuf is the PDU header that is always sent as
+	 * immediate data.
+	 */
+	imm_data = sndptr->m_len;
+
+	iso = mbuf_iscsi_iso(sndptr);
+	max_imm = max_imm_payload_v2(tx_credits, iso);
+
+	/*
+	 * Not enough credits for the PDU header.
+	 */
+	if (imm_data > max_imm)
+		return (NULL);
+
+	max_nsegs = max_dsgl_nsegs_v2(tx_credits, iso, imm_data);
+	iso_mss = mbuf_iscsi_iso_mss(sndptr);
+
+	plen = imm_data;
+	nsegs = 0;
+	max_nsegs_1mbuf = 0;	/* max # of SGL segments in any one mbuf */
+	nomap_mbuf_seen = false;
+	for (m = sndptr->m_next; m != NULL; m = m->m_next) {
+		int n;
+
+		if (m->m_flags & M_EXTPG)
+			n = sglist_count_mbuf_epg(m, mtod(m, vm_offset_t),
+			    m->m_len);
+		else
+			n = sglist_count(mtod(m, void *), m->m_len);
+
+		nsegs += n;
+		plen += m->m_len;
+
+		/*
+		 * This mbuf would send us _over_ the nsegs limit.
+		 * Suspend tx because the PDU can't be sent out.
+		 */
+		if ((nomap_mbuf_seen || plen > max_imm) && nsegs > max_nsegs)
+			return (NULL);
+
+		if (m->m_flags & M_EXTPG)
+			nomap_mbuf_seen = true;
+		if (max_nsegs_1mbuf < n)
+			max_nsegs_1mbuf = n;
+	}
+
+	if (__predict_false(toep->flags & TPF_FIN_SENT))
+		panic("%s: excess tx.", __func__);
+
+	/*
+	 * We have a PDU to send.  All of it goes out in one WR so 'm'
+	 * is NULL.  A PDU's length is always a multiple of 4.
+	 */
+	MPASS(m == NULL);
+	MPASS((plen & 3) == 0);
+	MPASS(sndptr->m_pkthdr.len == plen);
+
+	shove = !(tp->t_flags & TF_MORETOCOME);
+
+	/*
+	 * plen doesn't include header digests, padding, and data
+	 * digests which are generated and inserted in the right
+	 * places by the TOE, but they do occupy TCP sequence space
+	 * and need to be accounted for.
+	 *
+	 * To determine the overhead, check the PDU header in sndptr.
+	 * Note that only certain PDU types can use digests and
+	 * padding, and PDO accounts for all but the data digests for
+	 * those PDUs.
+	 */
+	MPASS((sndptr->m_flags & M_EXTPG) == 0);
+	ulp_submode = mbuf_ulp_submode(sndptr);
+	hdr = mtod(sndptr, const void *);
+	switch (hdr->pdu_type) {
+	case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
+	case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
+		MPASS(ulp_submode == 0);
+		MPASS(!iso);
+		break;
+	case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
+	case NVME_TCP_PDU_TYPE_R2T:
+		MPASS((ulp_submode & ULP_CRC_DATA) == 0);
+		/* FALLTHROUGH */
+	case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
+		MPASS(!iso);
+		break;
+	case NVME_TCP_PDU_TYPE_H2C_DATA:
+	case NVME_TCP_PDU_TYPE_C2H_DATA:
+		if (le32toh(hdr->plen) + ((ulp_submode & ULP_CRC_DATA) != 0 ?
+		    sizeof(uint32_t) : 0) == plen)
+			MPASS(!iso);
+		break;
+	default:
+		__assert_unreachable();
+	}
+
+	if (iso) {
+		npdu = howmany(plen - hdr->hlen, iso_mss);
+		adjusted_plen = hdr->pdo * npdu + (plen - hdr->hlen);
+		if ((ulp_submode & ULP_CRC_DATA) != 0)
+			adjusted_plen += npdu * sizeof(uint32_t);
+	} else {
+		npdu = 1;
+		adjusted_plen = le32toh(hdr->plen);
+	}
+	wr_len = sizeof(*txwr);
+	if (iso)
+		wr_len += sizeof(struct cpl_tx_data_iso);
+	if (plen <= max_imm && !nomap_mbuf_seen) {
+		/* Immediate data tx for full PDU */
+		imm_data = plen;
+		wr_len += plen;
+		nsegs = 0;
+	} else {
+		/* DSGL tx for PDU data */
+		wr_len += roundup2(imm_data, 16);
+		wr_len += sizeof(struct ulptx_sgl) +
+		    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
+	}
+
+	wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq);
+	if (wr == NULL) {
+		/* XXX: how will we recover from this? */
+		return (NULL);
+	}
+	txwr = wrtod(wr);
+	credits = howmany(wr->wr_len, 16);
+
+	if (iso) {
+		write_tx_v2_wr(txwr, toep, FW_V2_NVMET_TX_DATA_WR,
+		    imm_data + sizeof(struct cpl_tx_data_iso),
+		    adjusted_plen, credits, shove, ulp_submode | ULP_ISO);
+		cpl_iso = (struct cpl_tx_data_iso *)(txwr + 1);
+		MPASS(plen == sndptr->m_pkthdr.len);
+		write_nvme_tx_data_iso(cpl_iso, ulp_submode,
+		    (hdr->pdu_type & 0x1) == 0 ? 1 : 2, iso_mss, plen, npdu,
+		    hdr->pdo);
+		p = cpl_iso + 1;
+	} else {
+		write_tx_v2_wr(txwr, toep, FW_V2_NVMET_TX_DATA_WR, imm_data,
+		    adjusted_plen, credits, shove, ulp_submode);
+		p = txwr + 1;
+	}
+
+	/* PDU header (and immediate data payload). */
+	m_copydata(sndptr, 0, imm_data, p);
+	if (nsegs != 0) {
+		p = roundup2((char *)p + imm_data, 16);
+		write_tx_sgl(p, sndptr->m_next, NULL, nsegs, max_nsegs_1mbuf);
+		if (wr_len & 0xf) {
+			uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len);
+			*pad = 0;
+		}
+	}
+
+	KASSERT(toep->tx_credits >= credits,
+	    ("%s: not enough credits: credits %u "
+	    "toep->tx_credits %u tx_credits %u nsegs %u "
+	    "max_nsegs %u iso %d", __func__, credits,
+	    toep->tx_credits, tx_credits, nsegs, max_nsegs, iso));
+
+	tp->snd_nxt += adjusted_plen;
+	tp->snd_max += adjusted_plen;
+
+	counter_u64_add(toep->ofld_txq->tx_nvme_pdus, npdu);
+	counter_u64_add(toep->ofld_txq->tx_nvme_octets, plen);
+	if (iso)
+		counter_u64_add(toep->ofld_txq->tx_nvme_iso_wrs, 1);
+
+	return (wr);
+}
+
 void
 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct mbuf *sndptr, *m;
 	struct fw_wr_hdr *wrhdr;
 	struct wrqe *wr;
-	u_int plen, credits;
+	u_int plen, credits, mode;
 	struct inpcb *inp = toep->inp;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 	struct mbufq *pduq = &toep->ulp_pduq;
 
 	INP_WLOCK_ASSERT(inp);
+	mode = ulp_mode(toep);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
-	KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI,
+	KASSERT(mode == ULP_MODE_ISCSI || mode == ULP_MODE_NVMET,
 	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));
 
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
@@ -1162,7 +1551,7 @@ t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
 			if (sbu > 0) {
 				/*
 				 * The data transmitted before the
-				 * tid's ULP mode changed to ISCSI is
+				 * tid's ULP mode changed to ISCSI/NVMET is
 				 * still in so_snd.  Incoming credits
 				 * should account for so_snd first.
 				 */
@@ -1175,7 +1564,10 @@ t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
 	}
 
 	while ((sndptr = mbufq_first(pduq)) != NULL) {
-		wr = write_iscsi_mbuf_wr(toep, sndptr);
+		if (mode == ULP_MODE_ISCSI)
+			wr = write_iscsi_mbuf_wr(toep, sndptr);
+		else
+			wr = write_nvme_mbuf_wr(toep, sndptr);
 		if (wr == NULL) {
 			toep->flags |= TPF_TX_SUSPENDED;
 			return;
@@ -1211,6 +1603,8 @@ t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
+		KASSERT(plen <= MAX_OFLD_TX_SDESC_PLEN,
+		    ("%s: plen %u too large", __func__, plen));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd++;
@@ -1232,7 +1626,8 @@ static inline void
 t4_push_data(struct adapter *sc, struct toepcb *toep, int drop)
 {
 
-	if (ulp_mode(toep) == ULP_MODE_ISCSI)
+	if (ulp_mode(toep) == ULP_MODE_ISCSI ||
+	    ulp_mode(toep) == ULP_MODE_NVMET)
 		t4_push_pdus(sc, toep, drop);
 	else if (toep->flags & TPF_KTLS)
 		t4_push_ktls(sc, toep, drop);
@@ -1240,6 +1635,35 @@ t4_push_data(struct adapter *sc, struct toepcb *toep, int drop)
 		t4_push_frames(sc, toep, drop);
 }
 
+void
+t4_raw_wr_tx(struct adapter *sc, struct toepcb *toep, struct mbuf *m)
+{
+#ifdef INVARIANTS
+	struct inpcb *inp = toep->inp;
+#endif
+
+	INP_WLOCK_ASSERT(inp);
+
+	/*
+	 * If there are other raw WRs enqueued, enqueue to preserve
+	 * FIFO ordering.
+	 */
+	if (!mbufq_empty(&toep->ulp_pduq)) {
+		mbufq_enqueue(&toep->ulp_pduq, m);
+		return;
+	}
+
+	/*
+	 * Cannot call t4_push_data here as that will lock so_snd and
+	 * some callers of this run in rx handlers with so_rcv locked.
+	 * Instead, just try to transmit this WR.
+	 */
+	if (!t4_push_raw_wr(sc, toep, m)) {
+		mbufq_enqueue(&toep->ulp_pduq, m);
+		toep->flags |= TPF_TX_SUSPENDED;
+	}
+}
+
 int
 t4_tod_output(struct toedev *tod, struct tcpcb *tp)
 {
@@ -1363,7 +1787,8 @@ do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 	socantrcvmore(so);
 	if (ulp_mode(toep) == ULP_MODE_RDMA ||
-	    (ulp_mode(toep) == ULP_MODE_ISCSI && chip_id(sc) >= CHELSIO_T6)) {
+	    (ulp_mode(toep) == ULP_MODE_ISCSI && chip_id(sc) >= CHELSIO_T6) ||
+	    ulp_mode(toep) == ULP_MODE_NVMET) {
 		/*
 		 * There might be data received via DDP before the FIN
 		 * not reported to the driver.  Just assume the
@@ -1909,7 +2334,8 @@ do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 		SOCKBUF_LOCK(sb);
 		sbu = sbused(sb);
-		if (ulp_mode(toep) == ULP_MODE_ISCSI) {
+		if (ulp_mode(toep) == ULP_MODE_ISCSI ||
+		    ulp_mode(toep) == ULP_MODE_NVMET) {
 			if (__predict_false(sbu > 0)) {
 				/*
 				 * The data transmitted before the
@@ -1941,35 +2367,55 @@
 }
 
 void
-t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
+write_set_tcb_field(struct adapter *sc, void *dst, struct toepcb *toep,
     uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
 {
-	struct wrqe *wr;
-	struct cpl_set_tcb_field *req;
-	struct ofld_tx_sdesc *txsd;
+	struct cpl_set_tcb_field *req = dst;
 
 	MPASS((cookie & ~M_COOKIE) == 0);
 	if (reply) {
 		MPASS(cookie != CPL_COOKIE_RESERVED);
 	}
 
-	wr = alloc_wrqe(sizeof(*req), wrq);
+	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
+	if (reply == 0) {
+		req->reply_ctrl = htobe16(F_NO_REPLY);
+	} else {
+		const int qid = toep->ofld_rxq->iq.abs_id;
+
+		if (chip_id(sc) >= CHELSIO_T7) {
+			req->reply_ctrl = htobe16(V_T7_QUEUENO(qid) |
+			    V_T7_REPLY_CHAN(0) | V_NO_REPLY(0));
+		} else {
+			req->reply_ctrl = htobe16(V_QUEUENO(qid) |
+			    V_REPLY_CHAN(0) | V_NO_REPLY(0));
+		}
+	}
+	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
+	req->mask = htobe64(mask);
+	req->val = htobe64(val);
+}
+
+void
+t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
+    uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
+{
+	struct wrqe *wr;
+	struct ofld_tx_sdesc *txsd;
+	const u_int len = sizeof(struct cpl_set_tcb_field);
+
+	wr = alloc_wrqe(len, wrq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
-	req = wrtod(wr);
+	write_set_tcb_field(sc, wrtod(wr), toep, word, mask, val, reply,
+	    cookie);
 
-	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
-	req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
-	if (reply == 0)
-		req->reply_ctrl |= htobe16(F_NO_REPLY);
-	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
-	req->mask = htobe64(mask);
-	req->val = htobe64(val);
 	if (wrq->eq.type == EQ_OFLD) {
 		txsd = &toep->txsd[toep->txsd_pidx];
-		txsd->tx_credits = howmany(sizeof(*req), 16);
+		_Static_assert(howmany(len, 16) <= MAX_OFLD_TX_SDESC_CREDITS,
+		    "MAX_OFLD_TX_SDESC_CREDITS too small");
+		txsd->tx_credits = howmany(len, 16);
 		txsd->plen = 0;
 		KASSERT(toep->tx_credits >= txsd->tx_credits &&
 		    toep->txsd_avail > 0,
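[Editor's sketch] write_set_tcb_field() above separates building a CPL_SET_TCB_FIELD from queuing it, so callers can embed the CPL in a raw-WR mbuf and hand it to t4_raw_wr_tx(), which preserves FIFO order with any PDUs already queued. A sketch of that calling pattern, mirroring t4_set_tls_tcb_field() in the t4_tls.c hunks later in this diff (this assumes the driver's headers and elides real error handling):

```c
/* Quiesce RX for a tid via the raw-WR path; illustrative only. */
static void
quiesce_rx_example(struct adapter *sc, struct toepcb *toep)
{
	struct mbuf *m;

	m = alloc_raw_wr_mbuf(sizeof(struct cpl_set_tcb_field));
	if (m == NULL)
		return;		/* real code must handle allocation failure */
	write_set_tcb_field(sc, mtod(m, void *), toep, W_TCB_T_FLAGS,
	    V_TF_RX_QUIESCE(1), V_TF_RX_QUIESCE(1), 1, CPL_COOKIE_TOM);
	t4_raw_wr_tx(sc, toep, m);	/* keeps FIFO order with queued PDUs */
}
```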
diff --git a/sys/dev/cxgbe/tom/t4_ddp.c b/sys/dev/cxgbe/tom/t4_ddp.c
index 2fee8fa91dac..35fb1061d867 100644
--- a/sys/dev/cxgbe/tom/t4_ddp.c
+++ b/sys/dev/cxgbe/tom/t4_ddp.c
@@ -1655,7 +1655,10 @@ t4_write_page_pods_for_ps(struct adapter *sc, struct sge_wrq *wrq, int tid,
 
 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
 		ulpmc->cmd = cmd;
-		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
+		if (chip_id(sc) >= CHELSIO_T7)
+			ulpmc->dlen = htobe32(V_T7_ULP_MEMIO_DATA_LEN(chunk >> 5));
+		else
+			ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk >> 5));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
 
@@ -1785,7 +1788,7 @@ t4_write_page_pods_for_rcvbuf(struct adapter *sc, struct sge_wrq *wrq, int tid,
 	return (0);
 }
 
-static struct mbuf *
+struct mbuf *
 alloc_raw_wr_mbuf(int len)
 {
 	struct mbuf *m;
@@ -1842,7 +1845,10 @@ t4_write_page_pods_for_bio(struct adapter *sc, struct toepcb *toep,
 		ulpmc = mtod(m, struct ulp_mem_io *);
 		INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
 		ulpmc->cmd = cmd;
-		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
+		if (chip_id(sc) >= CHELSIO_T7)
+			ulpmc->dlen = htobe32(V_T7_ULP_MEMIO_DATA_LEN(chunk >> 5));
+		else
+			ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk >> 5));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
 
@@ -1922,7 +1928,10 @@ t4_write_page_pods_for_buf(struct adapter *sc, struct toepcb *toep,
 
 		INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
 		ulpmc->cmd = cmd;
-		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
+		if (chip_id(sc) >= CHELSIO_T7)
+			ulpmc->dlen = htobe32(V_T7_ULP_MEMIO_DATA_LEN(chunk >> 5));
+		else
+			ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk >> 5));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
 
@@ -2013,7 +2022,10 @@ t4_write_page_pods_for_sgl(struct adapter *sc, struct toepcb *toep,
 
 		INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
 		ulpmc->cmd = cmd;
-		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
+		if (chip_id(sc) >= CHELSIO_T7)
+			ulpmc->dlen = htobe32(V_T7_ULP_MEMIO_DATA_LEN(chunk >> 5));
+		else
+			ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk >> 5));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
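[Editor's sketch] All four page-pod writers in t4_ddp.c repeat the same T7 conditional for the ulp_mem_io dlen field. A hypothetical helper showing the shape of that per-chip encoding; this function is not part of the driver and only compiles against the driver's headers:

```c
/* Encode a ULP_MEMIO data length (chunk is bytes; hw wants 32 B units). */
static inline uint32_t
ulp_memio_dlen(struct adapter *sc, u_int chunk)
{
	return (chip_id(sc) >= CHELSIO_T7 ?
	    htobe32(V_T7_ULP_MEMIO_DATA_LEN(chunk >> 5)) :
	    htobe32(V_ULP_MEMIO_DATA_LEN(chunk >> 5)));
}
```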
diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c
index 06c495dcafc3..b879f6883f25 100644
--- a/sys/dev/cxgbe/tom/t4_listen.c
+++ b/sys/dev/cxgbe/tom/t4_listen.c
@@ -508,10 +508,11 @@ send_flowc_wr_synqe(struct adapter *sc, struct synq_entry *synqe)
 	    V_FW_WR_FLOWID(synqe->tid));
 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
 	flowc->mnemval[0].val = htobe32(pfvf);
+	/* Firmware expects hw port and will translate to channel itself. */
 	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
-	flowc->mnemval[1].val = htobe32(pi->tx_chan);
+	flowc->mnemval[1].val = htobe32(pi->hw_port);
 	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
-	flowc->mnemval[2].val = htobe32(pi->tx_chan);
+	flowc->mnemval[2].val = htobe32(pi->hw_port);
 	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
 	flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
 	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
@@ -1507,6 +1508,8 @@ found:
 
 	init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx,
 	    &synqe->params);
+	if (sc->params.tid_qid_sel_mask != 0)
+		update_tid_qid_sel(vi, &synqe->params, tid);
 
 	/*
 	 * If all goes well t4_syncache_respond will get called during
diff --git a/sys/dev/cxgbe/tom/t4_tls.c b/sys/dev/cxgbe/tom/t4_tls.c
index 27c16b9988ae..bbcc1c88c3db 100644
--- a/sys/dev/cxgbe/tom/t4_tls.c
+++ b/sys/dev/cxgbe/tom/t4_tls.c
@@ -61,11 +61,21 @@
 static void
 t4_set_tls_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask,
-    uint64_t val)
+    uint64_t val, int reply, int cookie)
 {
 	struct adapter *sc = td_adapter(toep->td);
+	struct mbuf *m;
+
+	m = alloc_raw_wr_mbuf(sizeof(struct cpl_set_tcb_field));
+	if (m == NULL) {
+		/* XXX */
+		panic("%s: out of memory", __func__);
+	}
 
-	t4_set_tcb_field(sc, &toep->ofld_txq->wrq, toep, word, mask, val, 0, 0);
+	write_set_tcb_field(sc, mtod(m, void *), toep, word, mask, val, reply,
+	    cookie);
+
+	t4_raw_wr_tx(sc, toep, m);
 }
 
 /* TLS and DTLS common routines */
@@ -88,10 +98,9 @@ tls_tx_key(struct toepcb *toep)
 static void
 t4_set_rx_quiesce(struct toepcb *toep)
 {
-	struct adapter *sc = td_adapter(toep->td);
 
-	t4_set_tcb_field(sc, &toep->ofld_txq->wrq, toep, W_TCB_T_FLAGS,
-	    V_TF_RX_QUIESCE(1), V_TF_RX_QUIESCE(1), 1, CPL_COOKIE_TOM);
+	t4_set_tls_tcb_field(toep, W_TCB_T_FLAGS, V_TF_RX_QUIESCE(1),
+	    V_TF_RX_QUIESCE(1), 1, CPL_COOKIE_TOM);
 }
 
 /* Clear TF_RX_QUIESCE to re-enable receive. */
@@ -99,7 +108,7 @@ static void
 t4_clear_rx_quiesce(struct toepcb *toep)
 {
 
-	t4_set_tls_tcb_field(toep, W_TCB_T_FLAGS, V_TF_RX_QUIESCE(1), 0);
+	t4_set_tls_tcb_field(toep, W_TCB_T_FLAGS, V_TF_RX_QUIESCE(1), 0, 0, 0);
 }
 
 /* TLS/DTLS content type for CPL SFO */
@@ -145,16 +154,15 @@ get_tp_plen_max(struct ktls_session *tls)
 	return (tls->params.max_frame_len <= 8192 ? plen : FC_TP_PLEN_MAX);
 }
 
-/* Send request to get the key-id */
+/* Send request to save the key in on-card memory. */
 static int
 tls_program_key_id(struct toepcb *toep, struct ktls_session *tls,
     int direction)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct adapter *sc = td_adapter(toep->td);
-	struct ofld_tx_sdesc *txsd;
 	int keyid;
-	struct wrqe *wr;
+	struct mbuf *m;
 	struct tls_key_req *kwr;
 	struct tls_keyctx *kctx;
@@ -173,12 +181,12 @@ tls_program_key_id(struct toepcb *toep, struct ktls_session *tls,
 		return (ENOSPC);
 	}
 
-	wr = alloc_wrqe(TLS_KEY_WR_SZ, &toep->ofld_txq->wrq);
-	if (wr == NULL) {
+	m = alloc_raw_wr_mbuf(TLS_KEY_WR_SZ);
+	if (m == NULL) {
 		t4_free_tls_keyid(sc, keyid);
 		return (ENOMEM);
 	}
-	kwr = wrtod(wr);
+	kwr = mtod(m, struct tls_key_req *);
 	memset(kwr, 0, TLS_KEY_WR_SZ);
 
 	t4_write_tlskey_wr(tls, direction, toep->tid, F_FW_WR_COMPL, keyid,
@@ -190,15 +198,7 @@ tls_program_key_id(struct toepcb *toep, struct ktls_session *tls,
 		tls_ofld->rx_key_addr = keyid;
 	t4_tls_key_ctx(tls, direction, kctx);
 
-	txsd = &toep->txsd[toep->txsd_pidx];
-	txsd->tx_credits = DIV_ROUND_UP(TLS_KEY_WR_SZ, 16);
-	txsd->plen = 0;
-	toep->tx_credits -= txsd->tx_credits;
-	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
-		toep->txsd_pidx = 0;
-	toep->txsd_avail--;
-
-	t4_wrq_tx(sc, wr);
+	t4_raw_wr_tx(sc, toep, m);
 
 	return (0);
 }
@@ -207,7 +207,7 @@ int
 tls_alloc_ktls(struct toepcb *toep, struct ktls_session *tls, int direction)
 {
 	struct adapter *sc = td_adapter(toep->td);
-	int error, explicit_iv_size, mac_first;
+	int error, iv_size, mac_first;
 
 	if (!can_tls_offload(sc))
 		return (EINVAL);
@@ -228,6 +228,21 @@ tls_alloc_ktls(struct toepcb *toep, struct ktls_session *tls, int direction)
 		}
 	}
 
+	/* TLS 1.1 through TLS 1.3 are currently supported. */
+	if (tls->params.tls_vmajor != TLS_MAJOR_VER_ONE ||
+	    tls->params.tls_vminor < TLS_MINOR_VER_ONE ||
+	    tls->params.tls_vminor > TLS_MINOR_VER_THREE) {
+		return (EPROTONOSUPPORT);
+	}
+
+	/* TLS 1.3 is only supported on T7+. */
+	if (tls->params.tls_vminor == TLS_MINOR_VER_THREE) {
+		if (is_t6(sc)) {
+			return (EPROTONOSUPPORT);
+		}
+	}
+
+	/* Sanity check values in *tls. */
 	switch (tls->params.cipher_algorithm) {
 	case CRYPTO_AES_CBC:
 		/* XXX: Explicitly ignore any provided IV. */
@@ -247,13 +262,10 @@ tls_alloc_ktls(struct toepcb *toep, struct ktls_session *tls, int direction)
 		default:
 			return (EPROTONOSUPPORT);
 		}
-		explicit_iv_size = AES_BLOCK_LEN;
+		iv_size = AES_BLOCK_LEN;
 		mac_first = 1;
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
-		if (tls->params.iv_len != SALT_SIZE) {
-			return (EINVAL);
-		}
 		switch (tls->params.cipher_key_len) {
 		case 128 / 8:
 		case 192 / 8:
@@ -262,20 +274,19 @@ tls_alloc_ktls(struct toepcb *toep, struct ktls_session *tls, int direction)
 		default:
 			return (EINVAL);
 		}
-		explicit_iv_size = 8;
+
+		/*
+		 * The IV size for TLS 1.2 is the explicit IV in the
+		 * record header.  For TLS 1.3 it is the size of the
+		 * sequence number.
+		 */
+		iv_size = 8;
 		mac_first = 0;
 		break;
 	default:
 		return (EPROTONOSUPPORT);
 	}
 
-	/* Only TLS 1.1 and TLS 1.2 are currently supported. */
-	if (tls->params.tls_vmajor != TLS_MAJOR_VER_ONE ||
-	    tls->params.tls_vminor < TLS_MINOR_VER_ONE ||
-	    tls->params.tls_vminor > TLS_MINOR_VER_TWO) {
-		return (EPROTONOSUPPORT);
-	}
-
 	/* Bail if we already have a key. */
 	if (direction == KTLS_TX) {
 		if (toep->tls.tx_key_addr != -1)
 			return (EBUSY);
@@ -289,6 +300,7 @@ tls_alloc_ktls(struct toepcb *toep, struct ktls_session *tls, int direction)
 	if (error)
 		return (error);
 
+	toep->tls.tls13 = tls->params.tls_vminor == TLS_MINOR_VER_THREE;
 	if (direction == KTLS_TX) {
 		toep->tls.scmd0.seqno_numivs =
 		    (V_SCMD_SEQ_NO_CTRL(3) |
@@ -298,14 +310,14 @@ tls_alloc_ktls(struct toepcb *toep, struct ktls_session *tls, int direction)
 		     V_SCMD_CIPH_MODE(t4_tls_cipher_mode(tls)) |
 		     V_SCMD_AUTH_MODE(t4_tls_auth_mode(tls)) |
 		     V_SCMD_HMAC_CTRL(t4_tls_hmac_ctrl(tls)) |
-		     V_SCMD_IV_SIZE(explicit_iv_size / 2));
+		     V_SCMD_IV_SIZE(iv_size / 2));
 
 		toep->tls.scmd0.ivgen_hdrlen =
 		    (V_SCMD_IV_GEN_CTRL(1) |
 		     V_SCMD_KEY_CTX_INLINE(0) |
 		     V_SCMD_TLS_FRAG_ENABLE(1));
 
-		toep->tls.iv_len = explicit_iv_size;
+		toep->tls.iv_len = iv_size;
 		toep->tls.frag_size = tls->params.max_frame_len;
 		toep->tls.fcplenmax = get_tp_plen_max(tls);
 		toep->tls.expn_per_ulp = tls->params.tls_hlen +
@@ -352,7 +364,8 @@ tls_uninit_toep(struct toepcb *toep)
 
 static void
 write_tlstx_wr(struct fw_tlstx_data_wr *txwr, struct toepcb *toep,
-    unsigned int plen, unsigned int expn, uint8_t credits, int shove)
+    unsigned int plen, unsigned int expn, uint8_t credits, int shove,
+    int num_ivs)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	unsigned int len = plen + expn;
@@ -365,7 +378,7 @@ write_tlstx_wr(struct fw_tlstx_data_wr *txwr, struct toepcb *toep,
 	txwr->plen = htobe32(len);
 	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ULP_MODE_TLS) |
 	    V_TX_URG(0) | /* F_T6_TX_FORCE | */ V_TX_SHOVE(shove));
-	txwr->ctxloc_to_exp = htobe32(V_FW_TLSTX_DATA_WR_NUMIVS(1) |
+	txwr->ctxloc_to_exp = htobe32(V_FW_TLSTX_DATA_WR_NUMIVS(num_ivs) |
 	    V_FW_TLSTX_DATA_WR_EXP(expn) |
 	    V_FW_TLSTX_DATA_WR_CTXLOC(TLS_SFO_WR_CONTEXTLOC_DDR) |
 	    V_FW_TLSTX_DATA_WR_IVDSGL(0) |
@@ -381,20 +394,20 @@
 static void
 write_tlstx_cpl(struct cpl_tx_tls_sfo *cpl, struct toepcb *toep,
-    struct tls_hdr *tls_hdr, unsigned int plen, uint64_t seqno)
+    struct tls_hdr *tls_hdr, unsigned int plen, uint8_t rec_type,
+    uint64_t seqno)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	int data_type, seglen;
 
 	seglen = plen;
-	data_type = tls_content_type(tls_hdr->type);
+	data_type = tls_content_type(rec_type);
 	cpl->op_to_seg_len = htobe32(V_CPL_TX_TLS_SFO_OPCODE(CPL_TX_TLS_SFO) |
 	    V_CPL_TX_TLS_SFO_DATA_TYPE(data_type) |
 	    V_CPL_TX_TLS_SFO_CPL_LEN(2) | V_CPL_TX_TLS_SFO_SEG_LEN(seglen));
 	cpl->pld_len = htobe32(plen);
 	if (data_type == CPL_TX_TLS_SFO_TYPE_CUSTOM)
-		cpl->type_protover = htobe32(
-		    V_CPL_TX_TLS_SFO_TYPE(tls_hdr->type));
+		cpl->type_protover = htobe32(V_CPL_TX_TLS_SFO_TYPE(rec_type));
 	cpl->seqno_numivs = htobe32(tls_ofld->scmd0.seqno_numivs |
 	    V_SCMD_NUM_IVS(1));
 	cpl->ivgen_hdrlen = htobe32(tls_ofld->scmd0.ivgen_hdrlen);
@@ -494,9 +507,11 @@ t4_push_ktls(struct adapter *sc, struct toepcb *toep, int drop)
 	struct tcpcb *tp = intotcpcb(inp);
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_snd;
+	struct mbufq *pduq = &toep->ulp_pduq;
 	int tls_size, tx_credits, shove, sowwakeup;
 	struct ofld_tx_sdesc *txsd;
 	char *buf;
+	bool tls13;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
@@ -532,10 +547,23 @@ t4_push_ktls(struct adapter *sc, struct toepcb *toep, int drop)
 		return;
 	}
 
+	tls13 = toep->tls.tls13;
 	txsd = &toep->txsd[toep->txsd_pidx];
 	for (;;) {
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 
+		if (__predict_false((m = mbufq_first(pduq)) != NULL)) {
+			if (!t4_push_raw_wr(sc, toep, m)) {
+				toep->flags |= TPF_TX_SUSPENDED;
+				return;
+			}
+
+			(void)mbufq_dequeue(pduq);
+
+			txsd = &toep->txsd[toep->txsd_pidx];
+			continue;
+		}
+
 		SOCKBUF_LOCK(sb);
 		sowwakeup = drop;
 		if (drop) {
@@ -586,9 +614,11 @@ t4_push_ktls(struct adapter *sc, struct toepcb *toep, int drop)
 		    sizeof(struct cpl_tx_tls_sfo) +
 		    sizeof(struct ulptx_idata) + sizeof(struct ulptx_sc_memrd);
 
-		/* Explicit IVs for AES-CBC and AES-GCM are <= 16. */
-		MPASS(toep->tls.iv_len <= AES_BLOCK_LEN);
-		wr_len += AES_BLOCK_LEN;
+		if (!tls13) {
+			/* Explicit IVs for AES-CBC and AES-GCM are <= 16. */
+			MPASS(toep->tls.iv_len <= AES_BLOCK_LEN);
+			wr_len += AES_BLOCK_LEN;
+		}
 
 		/* Account for SGL in work request length. */
 		nsegs = count_ext_pgs_segs(m);
@@ -658,8 +688,10 @@ t4_push_ktls(struct adapter *sc, struct toepcb *toep, int drop)
 		expn_size = m->m_epg_hdrlen + m->m_epg_trllen;
 		tls_size = m->m_len - expn_size;
 
-		write_tlstx_wr(txwr, toep, tls_size, expn_size, credits, shove);
-		write_tlstx_cpl(cpl, toep, thdr, tls_size, m->m_epg_seqno);
+		write_tlstx_wr(txwr, toep, tls_size, expn_size, credits, shove,
+		    tls13 ? 0 : 1);
+		write_tlstx_cpl(cpl, toep, thdr, tls_size,
+		    tls13 ? m->m_epg_record_type : thdr->type, m->m_epg_seqno);
 
 		idata = (struct ulptx_idata *)(cpl + 1);
 		idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
@@ -670,10 +702,12 @@ t4_push_ktls(struct adapter *sc, struct toepcb *toep, int drop)
 		    V_ULPTX_LEN16(toep->tls.tx_key_info_size >> 4));
 		memrd->addr = htobe32(toep->tls.tx_key_addr >> 5);
 
-		/* Copy IV. */
 		buf = (char *)(memrd + 1);
-		memcpy(buf, thdr + 1, toep->tls.iv_len);
-		buf += AES_BLOCK_LEN;
+		if (!tls13) {
+			/* Copy IV. */
+			memcpy(buf, thdr + 1, toep->tls.iv_len);
+			buf += AES_BLOCK_LEN;
+		}
 
 		write_ktlstx_sgl(buf, m, nsegs);
 
@@ -694,6 +728,8 @@ t4_push_ktls(struct adapter *sc, struct toepcb *toep, int drop)
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
+		KASSERT(m->m_len <= MAX_OFLD_TX_SDESC_PLEN,
+		    ("%s: plen %u too large", __func__, m->m_len));
 		txsd->plen = m->m_len;
 		txsd->tx_credits = credits;
 		txsd++;
@@ -793,8 +829,8 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 	struct sockbuf *sb;
 	struct mbuf *tls_data;
 	struct tls_get_record *tgr;
-	struct mbuf *control;
-	int pdu_length, trailer_len;
+	struct mbuf *control, *n;
+	int pdu_length, resid, trailer_len;
 #if defined(KTR) || defined(INVARIANTS)
 	int len;
 #endif
@@ -842,7 +878,9 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 
 	/*
 	 * The payload of this CPL is the TLS header followed by
-	 * additional fields.
+	 * additional fields.  For TLS 1.3 the type field holds the
+	 * inner record type and the length field has been updated to
+	 * strip the inner record type, padding, and MAC.
 	 */
 	KASSERT(m->m_len >= sizeof(*tls_hdr_pkt),
 	    ("%s: payload too small", __func__));
@@ -854,7 +892,14 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 		    ("%s: sequence mismatch", __func__));
 	}
 
-	/* Report decryption errors as EBADMSG. */
+	/*
+	 * Report decryption errors as EBADMSG.
+	 *
+	 * XXX: To support rekeying for TLS 1.3 this will eventually
+	 * have to be updated to recrypt the data with the old key and
+	 * then decrypt with the new key.  Punt for now as KTLS
+	 * doesn't yet support rekeying.
+	 */
 	if ((tls_hdr_pkt->res_to_mac_error & M_TLSRX_HDR_PKT_ERROR) != 0) {
 		CTR4(KTR_CXGBE, "%s: tid %u TLS error %#x ddp_vld %#x",
 		    __func__, toep->tid, tls_hdr_pkt->res_to_mac_error,
@@ -872,6 +917,33 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 		return (0);
 	}
 
+	/* For TLS 1.3 trim the header and trailer. */
+	if (toep->tls.tls13) {
+		KASSERT(tls_data != NULL, ("%s: TLS 1.3 record without data",
+		    __func__));
+		MPASS(tls_data->m_pkthdr.len == pdu_length);
+		m_adj(tls_data, sizeof(struct tls_record_layer));
+		if (tls_data->m_pkthdr.len > be16toh(tls_hdr_pkt->length))
+			tls_data->m_pkthdr.len = be16toh(tls_hdr_pkt->length);
+		resid = tls_data->m_pkthdr.len;
+		if (resid == 0) {
+			m_freem(tls_data);
+			tls_data = NULL;
+		} else {
+			for (n = tls_data;; n = n->m_next) {
+				if (n->m_len < resid) {
+					resid -= n->m_len;
+					continue;
+				}
+
+				n->m_len = resid;
+				m_freem(n->m_next);
+				n->m_next = NULL;
+				break;
+			}
+		}
+	}
+
 	/* Handle data received after the socket is closed. */
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
@@ -1076,33 +1148,60 @@ out:
 }
 
 /*
- * Send a work request setting multiple TCB fields to enable
- * ULP_MODE_TLS.
+ * Send a work request setting one or more TCB fields to partially or
+ * fully enable ULP_MODE_TLS.
+ *
+ * - If resid == 0, the socket buffer ends at a record boundary
+ *   (either empty or contains one or more complete records).  Switch
+ *   to ULP_MODE_TLS (if not already) and enable TLS decryption.
+ *
+ * - If resid != 0, the socket buffer contains a partial record.  In
+ *   this case, switch to ULP_MODE_TLS partially and configure the TCB
+ *   to pass along the remaining resid bytes undecrypted.  Once they
+ *   arrive, this is called again with resid == 0 and enables TLS
+ *   decryption.
 */
 static void
-tls_update_tcb(struct adapter *sc, struct toepcb *toep, uint64_t seqno)
+tls_update_tcb(struct adapter *sc, struct toepcb *toep, uint64_t seqno,
+    size_t resid)
 {
-	struct wrqe *wr;
+	struct mbuf *m;
 	struct work_request_hdr *wrh;
 	struct ulp_txpkt *ulpmc;
 	int fields, key_offset, len;
 
-	KASSERT(ulp_mode(toep) == ULP_MODE_NONE,
-	    ("%s: tid %d already ULP_MODE_TLS", __func__, toep->tid));
+	/*
+	 * If we are already in ULP_MODE_TLS, then we should now be at
+	 * a record boundary and ready to finish enabling TLS RX.
+	 */
+	KASSERT(resid == 0 || ulp_mode(toep) == ULP_MODE_NONE,
+	    ("%s: tid %d needs %zu more data but already ULP_MODE_TLS",
+	    __func__, toep->tid, resid));
 
 	fields = 0;
+	if (ulp_mode(toep) == ULP_MODE_NONE) {
+		/* 2 writes for the overlay region */
+		fields += 2;
+	}
 
-	/* 2 writes for the overlay region */
-	fields += 2;
+	if (resid == 0) {
+		/* W_TCB_TLS_SEQ */
+		fields++;
 
-	/* W_TCB_TLS_SEQ */
-	fields++;
+		/* W_TCB_ULP_RAW */
+		fields++;
+	} else {
+		/* W_TCB_PDU_LEN */
+		fields++;
 
-	/* W_TCB_ULP_RAW */
-	fields++;
+		/* W_TCB_ULP_RAW */
+		fields++;
+	}
 
-	/* W_TCB_ULP_TYPE */
-	fields ++;
+	if (ulp_mode(toep) == ULP_MODE_NONE) {
+		/* W_TCB_ULP_TYPE */
+		fields ++;
+	}
 
 	/* W_TCB_T_FLAGS */
 	fields++;
@@ -1111,59 +1210,94 @@ tls_update_tcb(struct adapter *sc, struct toepcb *toep, uint64_t seqno,
 	KASSERT(len <= SGE_MAX_WR_LEN,
 	    ("%s: WR with %d TCB field updates too large", __func__, fields));
 
-	wr = alloc_wrqe(len, toep->ctrlq);
-	if (wr == NULL) {
+	m = alloc_raw_wr_mbuf(len);
+	if (m == NULL) {
 		/* XXX */
 		panic("%s: out of memory", __func__);
 	}
 
-	wrh = wrtod(wr);
-	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
+	wrh = mtod(m, struct work_request_hdr *);
+	INIT_ULPTX_WRH(wrh, len, 1, toep->tid);	/* atomic */
 	ulpmc = (struct ulp_txpkt *)(wrh + 1);
 
-	/*
-	 * Clear the TLS overlay region: 1023:832.
-	 *
-	 * Words 26/27 are always set to zero.  Words 28/29
-	 * contain seqno and are set when enabling TLS
-	 * decryption.  Word 30 is zero and Word 31 contains
-	 * the keyid.
-	 */
-	ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, 26,
-	    0xffffffffffffffff, 0);
+	if (ulp_mode(toep) == ULP_MODE_NONE) {
+		/*
+		 * Clear the TLS overlay region: 1023:832.
+		 *
+		 * Words 26/27 are always set to zero.  Words 28/29
+		 * contain seqno and are set when enabling TLS
+		 * decryption.  Word 30 is zero and Word 31 contains
+		 * the keyid.
+		 */
+		ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, 26,
+		    0xffffffffffffffff, 0);
 
-	/*
-	 * RX key tags are an index into the key portion of MA
-	 * memory stored as an offset from the base address in
-	 * units of 64 bytes.
-	 */
-	key_offset = toep->tls.rx_key_addr - sc->vres.key.start;
-	ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, 30,
-	    0xffffffffffffffff,
-	    (uint64_t)V_TCB_RX_TLS_KEY_TAG(key_offset / 64) << 32);
-
-	CTR3(KTR_CXGBE, "%s: tid %d enable TLS seqno %lu", __func__,
-	    toep->tid, seqno);
-	ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_TLS_SEQ,
-	    V_TCB_TLS_SEQ(M_TCB_TLS_SEQ), V_TCB_TLS_SEQ(seqno));
-	ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_ULP_RAW,
-	    V_TCB_ULP_RAW(M_TCB_ULP_RAW),
-	    V_TCB_ULP_RAW((V_TF_TLS_KEY_SIZE(3) | V_TF_TLS_CONTROL(1) |
-	    V_TF_TLS_ACTIVE(1) | V_TF_TLS_ENABLE(1))));
-
-	toep->flags &= ~TPF_TLS_STARTING;
-	toep->flags |= TPF_TLS_RECEIVE;
-
-	/* Set the ULP mode to ULP_MODE_TLS. */
-	toep->params.ulp_mode = ULP_MODE_TLS;
-	ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_ULP_TYPE,
-	    V_TCB_ULP_TYPE(M_TCB_ULP_TYPE), V_TCB_ULP_TYPE(ULP_MODE_TLS));
+		/*
+		 * RX key tags are an index into the key portion of MA
+		 * memory stored as an offset from the base address in
+		 * units of 64 bytes.
+		 */
+		key_offset = toep->tls.rx_key_addr - sc->vres.key.start;
+		ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, 30,
+		    0xffffffffffffffff,
+		    (uint64_t)V_TCB_RX_TLS_KEY_TAG(key_offset / 64) << 32);
+	}
+
+	if (resid == 0) {
+		/*
+		 * The socket buffer is empty or only contains
+		 * complete TLS records: Set the sequence number and
+		 * enable TLS decryption.
+		 */
+		CTR3(KTR_CXGBE, "%s: tid %d enable TLS seqno %lu", __func__,
+		    toep->tid, seqno);
+		ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid,
+		    W_TCB_RX_TLS_SEQ, V_TCB_RX_TLS_SEQ(M_TCB_RX_TLS_SEQ),
+		    V_TCB_RX_TLS_SEQ(seqno));
+		ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid,
+		    W_TCB_ULP_RAW, V_TCB_ULP_RAW(M_TCB_ULP_RAW),
+		    V_TCB_ULP_RAW((V_TF_TLS_KEY_SIZE(3) | V_TF_TLS_CONTROL(1) |
+		    V_TF_TLS_ACTIVE(1) | V_TF_TLS_ENABLE(1))));
+
+		toep->flags &= ~TPF_TLS_STARTING;
+		toep->flags |= TPF_TLS_RECEIVE;
+	} else {
+		/*
+		 * The socket buffer ends with a partial record with a
+		 * full header and needs at least 6 bytes.
+		 *
+		 * Set PDU length.  This is treating the 'resid' bytes
+		 * as a TLS PDU, so the first 5 bytes are a fake
+		 * header and the rest are the PDU length.
+		 */
+		ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid,
+		    W_TCB_PDU_LEN, V_TCB_PDU_LEN(M_TCB_PDU_LEN),
+		    V_TCB_PDU_LEN(resid - sizeof(struct tls_hdr)));
+		CTR3(KTR_CXGBE, "%s: tid %d setting PDU_LEN to %zu",
+		    __func__, toep->tid, resid - sizeof(struct tls_hdr));
+
+		/* Clear all bits in ULP_RAW except for ENABLE. */
+		ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid,
+		    W_TCB_ULP_RAW, V_TCB_ULP_RAW(M_TCB_ULP_RAW),
+		    V_TCB_ULP_RAW(V_TF_TLS_ENABLE(1)));
+
+		/* Wait for 'resid' bytes to be delivered as CPL_RX_DATA. */
+		toep->tls.rx_resid = resid;
+	}
+
+	if (ulp_mode(toep) == ULP_MODE_NONE) {
+		/* Set the ULP mode to ULP_MODE_TLS. */
+		toep->params.ulp_mode = ULP_MODE_TLS;
+		ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid,
+		    W_TCB_ULP_TYPE, V_TCB_ULP_TYPE(M_TCB_ULP_TYPE),
+		    V_TCB_ULP_TYPE(ULP_MODE_TLS));
+	}
 
 	/* Clear TF_RX_QUIESCE. */
 	ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_T_FLAGS,
 	    V_TF_RX_QUIESCE(1), 0);
 
-	t4_wrq_tx(sc, wr);
+	t4_raw_wr_tx(sc, toep, m);
 }
 
 /*
@@ -1190,7 +1324,8 @@ tls_check_rx_sockbuf(struct adapter *sc, struct toepcb *toep,
 	 * size of a TLS record, re-enable receive and pause again once
 	 * we get more data to try again.
 	 */
-	if (!have_header || resid != 0) {
+	if (!have_header || (resid != 0 && (resid < sizeof(struct tls_hdr) ||
+	    is_t6(sc)))) {
 		CTR(KTR_CXGBE, "%s: tid %d waiting for more data", __func__,
 		    toep->tid);
 		toep->flags &= ~TPF_TLS_RX_QUIESCED;
@@ -1198,7 +1333,7 @@ tls_check_rx_sockbuf(struct adapter *sc, struct toepcb *toep,
 		return;
 	}
 
-	tls_update_tcb(sc, toep, seqno);
+	tls_update_tcb(sc, toep, seqno, resid);
 }
 
 void
diff --git a/sys/dev/cxgbe/tom/t4_tls.h b/sys/dev/cxgbe/tom/t4_tls.h
index 753a30890fdc..6faf946e9e3c 100644
--- a/sys/dev/cxgbe/tom/t4_tls.h
+++ b/sys/dev/cxgbe/tom/t4_tls.h
@@ -74,6 +74,7 @@ struct tls_ofld_info {
 	unsigned short adjusted_plen;
 	unsigned short expn_per_ulp;
 	unsigned short pdus_per_ulp;
+	bool tls13;
 	struct tls_scmd scmd0;
 	u_int iv_len;
 	unsigned int tx_key_info_size;
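[Editor's sketch] The TLS 1.3 receive path in the t4_tls.c hunks above strips the outer record header with m_adj() and then walks the mbuf chain to cut off the inner record type, padding, and MAC past the plaintext length. The same walk as a standalone sketch; this assumes the kernel mbuf API and is for illustration only:

```c
/* Truncate an mbuf chain to 'len' bytes, as do_rx_tls_cmp() does. */
static void
mchain_trim_tail(struct mbuf *top, int len)
{
	struct mbuf *n;

	top->m_pkthdr.len = len;
	for (n = top;; n = n->m_next) {
		if (n->m_len < len) {
			len -= n->m_len;
			continue;
		}

		n->m_len = len;		/* last mbuf keeps the remainder */
		m_freem(n->m_next);	/* drop trailer-only mbufs */
		n->m_next = NULL;
		break;
	}
}
```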
diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c
index 9b09facd05a7..8dfffd465345 100644
--- a/sys/dev/cxgbe/tom/t4_tom.c
+++ b/sys/dev/cxgbe/tom/t4_tom.c
@@ -182,7 +182,7 @@ init_toepcb(struct vi_info *vi, struct toepcb *toep)
 	}
 	toep->ofld_txq = &sc->sge.ofld_txq[cp->txq_idx];
 	toep->ofld_rxq = &sc->sge.ofld_rxq[cp->rxq_idx];
-	toep->ctrlq = &sc->sge.ctrlq[pi->port_id];
+	toep->ctrlq = &sc->sge.ctrlq[cp->ctrlq_idx];
 
 	tls_init_toep(toep);
 	MPASS(ulp_mode(toep) != ULP_MODE_TCPDDP);
@@ -494,8 +494,15 @@ send_get_tcb(struct adapter *sc, u_int tid)
 	bzero(cpl, sizeof(*cpl));
 	INIT_TP_WR(cpl, tid);
 	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_GET_TCB, tid));
-	cpl->reply_ctrl = htobe16(V_REPLY_CHAN(0) |
-	    V_QUEUENO(sc->sge.ofld_rxq[0].iq.cntxt_id));
+	if (chip_id(sc) >= CHELSIO_T7) {
+		cpl->reply_ctrl =
+		    htobe16(V_T7_QUEUENO(sc->sge.ofld_rxq[0].iq.cntxt_id) |
+		    V_T7_REPLY_CHAN(0) | V_NO_REPLY(0));
+	} else {
+		cpl->reply_ctrl =
+		    htobe16(V_QUEUENO(sc->sge.ofld_rxq[0].iq.cntxt_id) |
+		    V_REPLY_CHAN(0) | V_NO_REPLY(0));
+	}
 	cpl->cookie = 0xff;
 	commit_wrq_wr(&sc->sge.ctrlq[0], cpl, &cookie);
 
@@ -882,6 +889,8 @@ send_mss_flowc_wr(struct adapter *sc, struct toepcb *toep)
 	flowc->mnemval[0].val = htobe32(toep->params.emss);
 
 	txsd = &toep->txsd[toep->txsd_pidx];
+	_Static_assert(flowclen16 <= MAX_OFLD_TX_SDESC_CREDITS,
+	    "MAX_OFLD_TX_SDESC_CREDITS too small");
 	txsd->tx_credits = flowclen16;
 	txsd->plen = 0;
 	toep->tx_credits -= txsd->tx_credits;
@@ -1219,7 +1228,7 @@ select_ntuple(struct vi_info *vi, struct l2t_entry *e)
 		ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) <<
 		    tp->vlan_shift;
 
 	if (tp->port_shift >= 0)
-		ntuple |= (uint64_t)e->lport << tp->port_shift;
+		ntuple |= (uint64_t)e->hw_port << tp->port_shift;
 
 	if (tp->protocol_shift >= 0)
 		ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift;
@@ -1230,10 +1239,7 @@ select_ntuple(struct vi_info *vi, struct l2t_entry *e)
 		    tp->vnic_shift;
 	}
 
-	if (is_t4(sc))
-		return (htobe32((uint32_t)ntuple));
-	else
-		return (htobe64(V_FILTER_TUPLE(ntuple)));
+	return (ntuple);
 }
 
 /*
@@ -1324,6 +1330,9 @@ init_conn_params(struct vi_info *vi , struct offload_settings *s,
 	 */
 	cp->mtu_idx = find_best_mtu_idx(sc, inc, s);
 
+	/* Control queue. */
+	cp->ctrlq_idx = vi->pi->port_id;
+
 	/* Tx queue for this connection. */
 	if (s->txq == QUEUE_RANDOM)
 		q_idx = arc4random();
@@ -1436,6 +1445,32 @@ init_conn_params(struct vi_info *vi , struct offload_settings *s,
 	cp->emss = 0;
 }
 
+void
+update_tid_qid_sel(struct vi_info *vi, struct conn_params *cp, int tid)
+{
+	struct adapter *sc = vi->adapter;
+	const int mask = sc->params.tid_qid_sel_mask;
+	struct sge_ofld_txq *ofld_txq = &sc->sge.ofld_txq[cp->txq_idx];
+	uint32_t ngroup;
+	int g, nqpg;
+
+	cp->ctrlq_idx = ofld_txq_group(tid, mask);
+	CTR(KTR_CXGBE, "tid %u is on core %u", tid, cp->ctrlq_idx);
+	if ((ofld_txq->wrq.eq.cntxt_id & mask) == (tid & mask))
+		return;
+
+	ngroup = 1 << bitcount32(mask);
+	MPASS(vi->nofldtxq % ngroup == 0);
+	g = ofld_txq_group(tid, mask);
+	nqpg = vi->nofldtxq / ngroup;
+	cp->txq_idx = vi->first_ofld_txq + g * nqpg + arc4random() % nqpg;
+#ifdef INVARIANTS
+	MPASS(cp->txq_idx < vi->first_ofld_txq + vi->nofldtxq);
+	ofld_txq = &sc->sge.ofld_txq[cp->txq_idx];
+	MPASS((ofld_txq->wrq.eq.cntxt_id & mask) == (tid & mask));
+#endif
+}
+
 int
 negative_advice(int status)
 {
@@ -1955,8 +1990,10 @@ t4_tom_deactivate(struct adapter *sc)
 	if (td == NULL)
 		return (0);	/* XXX. KASSERT? */
 
-	if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI))
-		return (EBUSY);	/* both iWARP and iSCSI rely on the TOE. */
+	/* These ULDs rely on the TOE. */
+	if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI) ||
+	    uld_active(sc, ULD_NVME))
+		return (EBUSY);
 
 	if (sc->offload_map != 0) {
 		for_each_port(sc, i) {
@@ -2231,6 +2268,98 @@ t4_aio_queue_tom(struct socket *so, struct kaiocb *job)
 	return (0);
 }
 
+/*
+ * Request/response structure used to find out the adapter offloading
+ * a socket.
+ */
+struct find_offload_adapter_data {
+	struct socket *so;
+	struct adapter *sc;	/* result */
+};
+
+static void
+find_offload_adapter_cb(struct adapter *sc, void *arg)
+{
+	struct find_offload_adapter_data *fa = arg;
+	struct socket *so = fa->so;
+	struct tom_data *td = sc->tom_softc;
+	struct tcpcb *tp;
+	struct inpcb *inp;
+
+	/* Non-TCP were filtered out earlier. */
+	MPASS(so->so_proto->pr_protocol == IPPROTO_TCP);
+
+	if (fa->sc != NULL)
+		return;	/* Found already. */
+
+	if (td == NULL)
+		return;	/* TOE not enabled on this adapter. */
+
+	inp = sotoinpcb(so);
+	INP_WLOCK(inp);
+	if ((inp->inp_flags & INP_DROPPED) == 0) {
+		tp = intotcpcb(inp);
+		if (tp->t_flags & TF_TOE && tp->tod == &td->tod)
+			fa->sc = sc;	/* Found. */
+	}
+	INP_WUNLOCK(inp);
+}
+
+struct adapter *
+find_offload_adapter(struct socket *so)
+{
+	struct find_offload_adapter_data fa;
+
+	fa.sc = NULL;
+	fa.so = so;
+	t4_iterate(find_offload_adapter_cb, &fa);
+	return (fa.sc);
+}
+
+void
+send_txdataplen_max_flowc_wr(struct adapter *sc, struct toepcb *toep,
+    int maxlen)
+{
+	struct wrqe *wr;
+	struct fw_flowc_wr *flowc;
+	const u_int nparams = 1;
+	u_int flowclen;
+	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
+
+	CTR(KTR_CXGBE, "%s: tid %u maxlen=%d", __func__, toep->tid, maxlen);
+
+	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
+
+	wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq);
+	if (wr == NULL) {
+		/* XXX */
+		panic("%s: allocation failure.", __func__);
+	}
+	flowc = wrtod(wr);
+	memset(flowc, 0, wr->wr_len);
+
+	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
+	    V_FW_FLOWC_WR_NPARAMS(nparams));
+	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
+	    V_FW_WR_FLOWID(toep->tid));
+
+	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_TXDATAPLEN_MAX;
+	flowc->mnemval[0].val = htobe32(maxlen);
+
+	KASSERT(howmany(flowclen, 16) <= MAX_OFLD_TX_SDESC_CREDITS,
+	    ("%s: tx_credits %u too large", __func__, howmany(flowclen, 16)));
+	txsd->tx_credits = howmany(flowclen, 16);
+	txsd->plen = 0;
+	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
+	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
+	toep->tx_credits -= txsd->tx_credits;
+	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
+		toep->txsd_pidx = 0;
+	toep->txsd_avail--;
+
+	t4_wrq_tx(sc, wr);
+}
+
 static int
 t4_tom_mod_load(void)
 {
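[Editor's sketch] update_tid_qid_sel() above keeps a connection on an offload tx queue whose context ID agrees with the tid in the low bits selected by tid_qid_sel_mask. A runnable example of the group math, assuming ofld_txq_group() reduces to (tid & mask), a mask of 0x3, and 8 offload tx queues per vi:

```c
#include <stdio.h>

int
main(void)
{
	const unsigned mask = 0x3;
	const unsigned nofldtxq = 8;
	const unsigned ngroup = 1u << __builtin_popcount(mask);	/* 4 */
	const unsigned nqpg = nofldtxq / ngroup;		/* 2 per group */
	unsigned tid = 0x1236;
	unsigned g = tid & mask;				/* group 2 */

	/* Any queue in [g*nqpg, g*nqpg+nqpg) keeps (cntxt_id & mask) == g. */
	printf("tid %#x -> group %u, queues %u..%u\n",
	    tid, g, g * nqpg, g * nqpg + nqpg - 1);
	return (0);
}
```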
diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h
index 6295a3484b9f..c8c2d432b8f1 100644
--- a/sys/dev/cxgbe/tom/t4_tom.h
+++ b/sys/dev/cxgbe/tom/t4_tom.h
@@ -113,6 +113,7 @@ struct conn_params {
 	int8_t mtu_idx;
 	int8_t ulp_mode;
 	int8_t tx_align;
+	int8_t ctrlq_idx;	/* ctrlq = &sc->sge.ctrlq[ctrlq_idx] */
 	int16_t txq_idx;	/* ofld_txq = &sc->sge.ofld_txq[txq_idx] */
 	int16_t rxq_idx;	/* ofld_rxq = &sc->sge.ofld_rxq[rxq_idx] */
 	int16_t l2t_idx;
@@ -122,10 +123,13 @@ struct conn_params {
 };
 
 struct ofld_tx_sdesc {
-	uint32_t plen;		/* payload length */
-	uint8_t tx_credits;	/* firmware tx credits (unit is 16B) */
+	uint32_t plen : 26;		/* payload length */
+	uint32_t tx_credits : 6;	/* firmware tx credits (unit is 16B) */
 };
 
+#define	MAX_OFLD_TX_SDESC_PLEN		((1u << 26) - 1)
+#define	MAX_OFLD_TX_SDESC_CREDITS	((1u << 6) - 1)
+
 struct ppod_region {
 	u_int pr_start;
 	u_int pr_len;
@@ -474,11 +478,14 @@ int select_rcv_wscale(void);
 void init_conn_params(struct vi_info *, struct offload_settings *,
     struct in_conninfo *, struct socket *, const struct tcp_options *,
     int16_t, struct conn_params *cp);
+void update_tid_qid_sel(struct vi_info *, struct conn_params *, int);
 __be64 calc_options0(struct vi_info *, struct conn_params *);
 __be32 calc_options2(struct vi_info *, struct conn_params *);
 uint64_t select_ntuple(struct vi_info *, struct l2t_entry *);
 int negative_advice(int);
 int add_tid_to_history(struct adapter *, u_int);
+struct adapter *find_offload_adapter(struct socket *);
+void send_txdataplen_max_flowc_wr(struct adapter *, struct toepcb *, int);
 void t4_pcb_detach(struct toedev *, struct tcpcb *);
 
 /* t4_connect.c */
@@ -526,6 +533,10 @@ int t4_send_rst(struct toedev *, struct tcpcb *);
 void t4_set_tcb_field(struct adapter *, struct sge_wrq *, struct toepcb *,
     uint16_t, uint64_t, uint64_t, int, int);
 void t4_push_pdus(struct adapter *, struct toepcb *, int);
+bool t4_push_raw_wr(struct adapter *, struct toepcb *, struct mbuf *);
+void t4_raw_wr_tx(struct adapter *, struct toepcb *, struct mbuf *);
+void write_set_tcb_field(struct adapter *, void *, struct toepcb *, uint16_t,
+    uint64_t, uint64_t, int, int);
 
 /* t4_ddp.c */
 int t4_init_ppod_region(struct ppod_region *, struct t4_range *, u_int,
@@ -551,6 +562,7 @@ int t4_aio_queue_ddp(struct socket *, struct kaiocb *);
 int t4_enable_ddp_rcv(struct socket *, struct toepcb *);
 void t4_ddp_mod_load(void);
 void t4_ddp_mod_unload(void);
+struct mbuf *alloc_raw_wr_mbuf(int);
 void ddp_assert_empty(struct toepcb *);
 void ddp_uninit_toep(struct toepcb *);
 void ddp_queue_toep(struct toepcb *);
@@ -574,4 +586,10 @@ int tls_tx_key(struct toepcb *);
 void tls_uninit_toep(struct toepcb *);
 int tls_alloc_ktls(struct toepcb *, struct ktls_session *, int);
 
+/* t4_tpt.c */
+uint32_t t4_pblpool_alloc(struct adapter *, int);
+void t4_pblpool_free(struct adapter *, uint32_t, int);
+int t4_pblpool_create(struct adapter *);
+void t4_pblpool_destroy(struct adapter *);
+
 #endif
diff --git a/sys/dev/cxgbe/tom/t4_tom_l2t.c b/sys/dev/cxgbe/tom/t4_tom_l2t.c
index 3fd0d5ca41d4..e245c2b6fd5b 100644
--- a/sys/dev/cxgbe/tom/t4_tom_l2t.c
+++ b/sys/dev/cxgbe/tom/t4_tom_l2t.c
@@ -403,7 +403,7 @@ t4_l2t_get(struct port_info *pi, if_t ifp, struct sockaddr *sa)
 	l2_store(sa, e);
 	e->ifp = ifp;
 	e->hash = hash;
-	e->lport = pi->lport;
+	e->hw_port = pi->hw_port;
 	e->wrq = &sc->sge.ctrlq[pi->port_id];
 	e->iqid = sc->sge.ofld_rxq[pi->vi[0].first_ofld_rxq].iq.abs_id;
 	atomic_store_rel_int(&e->refcnt, 1);
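[Editor's sketch] The ofld_tx_sdesc change in t4_tom.h packs plen and tx_credits into a single 32-bit word, which is what the new MAX_OFLD_TX_SDESC_* bounds and the KASSERTs added throughout this diff protect. A runnable illustration of the packing and its limits:

```c
#include <assert.h>
#include <stdint.h>

struct ofld_tx_sdesc {
	uint32_t plen : 26;		/* payload length, <= 64 MB - 1 */
	uint32_t tx_credits : 6;	/* 16 B units, <= 63 */
};

#define	MAX_OFLD_TX_SDESC_PLEN		((1u << 26) - 1)
#define	MAX_OFLD_TX_SDESC_CREDITS	((1u << 6) - 1)

int
main(void)
{
	struct ofld_tx_sdesc txsd = {
		.plen = MAX_OFLD_TX_SDESC_PLEN,
		.tx_credits = MAX_OFLD_TX_SDESC_CREDITS,
	};

	assert(sizeof(txsd) == 4);	/* was 8 with the separate fields */
	assert(txsd.plen == (1u << 26) - 1);
	assert(txsd.tx_credits == 63);
	return (0);
}
```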
