aboutsummaryrefslogtreecommitdiff
path: root/sys/cam/ctl
diff options
context:
space:
mode:
author: Alexander Motin <mav@FreeBSD.org> 2020-06-08 20:53:57 +0000
committer: Alexander Motin <mav@FreeBSD.org> 2020-06-08 20:53:57 +0000
commit: 9a4510ac3220b97781768bf250897c152d94c436 (patch)
tree: 745260c79d32f36611ac94bef82297b5cd51cf9d /sys/cam/ctl
parent: c78cd98b8ac557177b565c8f4354ddd8862167a7 (diff)
downloadsrc-9a4510ac3220b97781768bf250897c152d94c436.tar.gz
src-9a4510ac3220b97781768bf250897c152d94c436.zip
Implement zero-copy iSCSI target transmission/read.
Add the ICL_NOCOPY flag to icl_pdu_append_data(), specifying that the method can just reference the data buffer instead of immediately copying it. Extend the offload KPI with an optional PDU queue method that allows the caller to specify a completion callback, called when all the data referenced by the above has been transferred and won't be accessed any more (the buffers can be freed). Implement the above functionality in the software iSCSI driver using mbufs with external storage and a reference counter. Note that some NICs (ixl(4)) may keep the mbuf in the TX queue for a long time, so CTL has to be ready. Add an optional method to struct ctl_scsiio for buffer reference counting. Implement it for the CTL block backend, allowing the free of struct ctl_be_block_io and the memory it references to be delayed as needed. In the first incarnation of the patch I tried to delay the whole I/O as it is done for FibreChannel, which was cleaner, but due to the above callback delays I had to rewrite it this way to not leave the LUN referenced potentially for hours or more. All together, on sequential read from ZFS ARC this saves about 30% of CPU time and memory bandwidth by avoiding one of 3 memory copies (the other two are from ZFS ARC to DMU cache and then from DMU cache to CTL buffers). On tests with 2x Xeon Silver 4114 this allows reaching the full line rate of a 100GigE NIC. Tests with Gold CPUs and two 100GigE NICs are still TBD, but expectations to saturate them are pretty high. ;) Discussed with: Chelsio Sponsored by: iXsystems, Inc.
Notes
Notes: svn path=/head/; revision=361939
Diffstat (limited to 'sys/cam/ctl')
-rw-r--r-- sys/cam/ctl/ctl_backend_block.c 22
-rw-r--r-- sys/cam/ctl/ctl_frontend_iscsi.c 43
-rw-r--r-- sys/cam/ctl/ctl_io.h 4
3 files changed, 65 insertions, 4 deletions
diff --git a/sys/cam/ctl/ctl_backend_block.c b/sys/cam/ctl/ctl_backend_block.c
index bb06ead005a7..2953a8fb6e32 100644
--- a/sys/cam/ctl/ctl_backend_block.c
+++ b/sys/cam/ctl/ctl_backend_block.c
@@ -201,6 +201,7 @@ struct ctl_be_block_io {
union ctl_io *io;
struct ctl_sg_entry sg_segs[CTLBLK_MAX_SEGS];
struct iovec xiovecs[CTLBLK_MAX_SEGS];
+ int refcnt;
int bio_cmd;
int two_sglists;
int num_segs;
@@ -305,11 +306,12 @@ ctl_alloc_beio(struct ctl_be_block_softc *softc)
beio = uma_zalloc(softc->beio_zone, M_WAITOK | M_ZERO);
beio->softc = softc;
+ beio->refcnt = 1;
return (beio);
}
static void
-ctl_free_beio(struct ctl_be_block_io *beio)
+ctl_real_free_beio(struct ctl_be_block_io *beio)
{
struct ctl_be_block_softc *softc = beio->softc;
int i;
@@ -328,6 +330,22 @@ ctl_free_beio(struct ctl_be_block_io *beio)
}
static void
+ctl_refcnt_beio(void *arg, int diff)
+{
+ struct ctl_be_block_io *beio = arg;
+
+ if (atomic_fetchadd_int(&beio->refcnt, diff) + diff == 0)
+ ctl_real_free_beio(beio);
+}
+
+static void
+ctl_free_beio(struct ctl_be_block_io *beio)
+{
+
+ ctl_refcnt_beio(beio, -1);
+}
+
+static void
ctl_complete_beio(struct ctl_be_block_io *beio)
{
union ctl_io *io = beio->io;
@@ -1613,6 +1631,8 @@ ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
io->scsiio.kern_data_len = beio->io_len;
io->scsiio.kern_sg_entries = beio->num_segs;
+ io->scsiio.kern_data_ref = ctl_refcnt_beio;
+ io->scsiio.kern_data_arg = beio;
io->io_hdr.flags |= CTL_FLAG_ALLOCATED;
/*
diff --git a/sys/cam/ctl/ctl_frontend_iscsi.c b/sys/cam/ctl/ctl_frontend_iscsi.c
index a9e4ad2c2137..82bb4e162a94 100644
--- a/sys/cam/ctl/ctl_frontend_iscsi.c
+++ b/sys/cam/ctl/ctl_frontend_iscsi.c
@@ -424,6 +424,17 @@ cfiscsi_pdu_queue(struct icl_pdu *response)
CFISCSI_SESSION_UNLOCK(cs);
}
+ static void
+cfiscsi_pdu_queue_cb(struct icl_pdu *response, icl_pdu_cb cb)
+{
+ struct cfiscsi_session *cs = PDU_SESSION(response);
+
+ CFISCSI_SESSION_LOCK(cs);
+ cfiscsi_pdu_prepare(response);
+ icl_pdu_queue_cb(response, cb);
+ CFISCSI_SESSION_UNLOCK(cs);
+}
+
static void
cfiscsi_pdu_handle_nop_out(struct icl_pdu *request)
{
@@ -2417,6 +2428,15 @@ cfiscsi_target_find_or_create(struct cfiscsi_softc *softc, const char *name,
}
static void
+cfiscsi_pdu_done(struct icl_pdu *ip, int error)
+{
+
+ if (error != 0)
+ ; // XXX: Do something on error?
+ ((ctl_ref)ip->ip_prv0)(ip->ip_prv1, -1);
+}
+
+static void
cfiscsi_datamove_in(union ctl_io *io)
{
struct cfiscsi_session *cs;
@@ -2426,6 +2446,7 @@ cfiscsi_datamove_in(union ctl_io *io)
struct ctl_sg_entry ctl_sg_entry, *ctl_sglist;
size_t len, expected_len, sg_len, buffer_offset;
const char *sg_addr;
+ icl_pdu_cb cb;
int ctl_sg_count, error, i;
request = PRIV_REQUEST(io);
@@ -2471,6 +2492,11 @@ cfiscsi_datamove_in(union ctl_io *io)
return;
}
+ if (io->scsiio.kern_data_ref != NULL)
+ cb = cfiscsi_pdu_done;
+ else
+ cb = NULL;
+
i = 0;
sg_addr = NULL;
sg_len = 0;
@@ -2534,7 +2560,8 @@ cfiscsi_datamove_in(union ctl_io *io)
len, sg_len));
}
- error = icl_pdu_append_data(response, sg_addr, len, M_NOWAIT);
+ error = icl_pdu_append_data(response, sg_addr, len,
+ M_NOWAIT | (cb ? ICL_NOCOPY : 0));
if (error != 0) {
CFISCSI_SESSION_WARN(cs, "failed to "
"allocate memory; dropping connection");
@@ -2587,7 +2614,12 @@ cfiscsi_datamove_in(union ctl_io *io)
buffer_offset -= response->ip_data_len;
break;
}
- cfiscsi_pdu_queue(response);
+ if (cb != NULL) {
+ response->ip_prv0 = io->scsiio.kern_data_ref;
+ response->ip_prv1 = io->scsiio.kern_data_arg;
+ io->scsiio.kern_data_ref(io->scsiio.kern_data_arg, 1);
+ }
+ cfiscsi_pdu_queue_cb(response, cb);
response = NULL;
bhsdi = NULL;
}
@@ -2617,7 +2649,12 @@ cfiscsi_datamove_in(union ctl_io *io)
}
}
KASSERT(response->ip_data_len > 0, ("sending empty Data-In"));
- cfiscsi_pdu_queue(response);
+ if (cb != NULL) {
+ response->ip_prv0 = io->scsiio.kern_data_ref;
+ response->ip_prv1 = io->scsiio.kern_data_arg;
+ io->scsiio.kern_data_ref(io->scsiio.kern_data_arg, 1);
+ }
+ cfiscsi_pdu_queue_cb(response, cb);
}
io->scsiio.be_move_done(io);
diff --git a/sys/cam/ctl/ctl_io.h b/sys/cam/ctl/ctl_io.h
index e892662d79f2..4925c25109bf 100644
--- a/sys/cam/ctl/ctl_io.h
+++ b/sys/cam/ctl/ctl_io.h
@@ -257,6 +257,8 @@ typedef enum {
union ctl_io;
+typedef void (*ctl_ref)(void *arg, int diff);
+
/*
* SCSI passthrough I/O structure for the CAM Target Layer. Note
* that some of these fields are here for completeness, but they aren't
@@ -329,6 +331,8 @@ struct ctl_scsiio {
uint8_t cdb[CTL_MAX_CDBLEN]; /* CDB */
int (*be_move_done)(union ctl_io *io); /* called by fe */
int (*io_cont)(union ctl_io *io); /* to continue processing */
+ ctl_ref kern_data_ref; /* Method to reference/release data */
+ void *kern_data_arg; /* Opaque argument for kern_data_ref() */
};
typedef enum {