aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWarner Losh <imp@FreeBSD.org>2021-09-23 22:31:32 +0000
committerWarner Losh <imp@FreeBSD.org>2021-09-23 22:42:08 +0000
commit502dc84a8b6703e7c0626739179a3cdffdd22d81 (patch)
treec2f654c80bdf3732b0c9bbda69a92a4529f372cd
parentcb13059663e455b3fc69c293dadec53c164490dc (diff)
downloadsrc-502dc84a8b6703e7c0626739179a3cdffdd22d81.tar.gz
src-502dc84a8b6703e7c0626739179a3cdffdd22d81.zip
nvme: Use shared timeout rather than timeout per transaction
Keep track of the approximate time commands are 'due' and the next deadline for a command. Twice a second, wake up to see if any commands have entered timeout. If so, quiesce and then enter a recovery mode half the timeout further in the future to allow the ISR to complete. Once we exit recovery mode, we go back to operations as normal. Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D28583
-rw-r--r--sys/dev/nvme/nvme_ctrlr.c8
-rw-r--r--sys/dev/nvme/nvme_private.h16
-rw-r--r--sys/dev/nvme/nvme_qpair.c173
3 files changed, 131 insertions, 66 deletions
diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index 833bf328584a..39cde3f065dd 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -232,7 +232,8 @@ nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr,
mtx_lock(&ctrlr->lock);
STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq);
mtx_unlock(&ctrlr->lock);
- taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
+ if (!ctrlr->is_dying)
+ taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
}
static void
@@ -435,7 +436,8 @@ nvme_ctrlr_reset(struct nvme_controller *ctrlr)
*/
return;
- taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
+ if (!ctrlr->is_dying)
+ taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
}
static int
@@ -1481,6 +1483,8 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
{
int gone, i;
+ ctrlr->is_dying = true;
+
if (ctrlr->resource == NULL)
goto nores;
if (!mtx_initialized(&ctrlr->adminq.lock))
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
index fba1b406e9ce..02cecaf03a97 100644
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -151,7 +151,7 @@ struct nvme_tracker {
TAILQ_ENTRY(nvme_tracker) tailq;
struct nvme_request *req;
struct nvme_qpair *qpair;
- struct callout timer;
+ sbintime_t deadline;
bus_dmamap_t payload_dma_map;
uint16_t cid;
@@ -159,6 +159,12 @@ struct nvme_tracker {
bus_addr_t prp_bus_addr;
};
+enum nvme_recovery {
+ RECOVERY_NONE = 0, /* Normal operations */
+ RECOVERY_START, /* Deadline has passed, start recovering */
+ RECOVERY_RESET, /* This pass, initiate reset of controller */
+ RECOVERY_WAITING, /* waiting for the reset to complete */
+};
struct nvme_qpair {
struct nvme_controller *ctrlr;
uint32_t id;
@@ -170,6 +176,11 @@ struct nvme_qpair {
struct resource *res;
void *tag;
+ struct callout timer;
+ sbintime_t deadline;
+ bool timer_armed;
+ enum nvme_recovery recovery_state;
+
uint32_t num_entries;
uint32_t num_trackers;
uint32_t sq_tdbl_off;
@@ -201,8 +212,6 @@ struct nvme_qpair {
struct nvme_tracker **act_tr;
- bool is_enabled;
-
struct mtx lock __aligned(CACHE_LINE_SIZE);
} __aligned(CACHE_LINE_SIZE);
@@ -305,6 +314,7 @@ struct nvme_controller {
uint32_t notification_sent;
bool is_failed;
+ bool is_dying;
STAILQ_HEAD(, nvme_request) fail_req;
/* Host Memory Buffer */
diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
index 71718acff66f..7a17a057f319 100644
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@@ -452,7 +452,6 @@ nvme_qpair_complete_tracker(struct nvme_tracker *tr,
}
mtx_lock(&qpair->lock);
- callout_stop(&tr->timer);
if (retry) {
req->retries++;
@@ -544,7 +543,7 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair)
* progress. Ignore the interrupt - any I/O that was associated with
* this interrupt will get retried when the reset is complete.
*/
- if (!qpair->is_enabled)
+ if (qpair->recovery_state != RECOVERY_NONE)
return (false);
bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
@@ -739,6 +738,10 @@ nvme_qpair_construct(struct nvme_qpair *qpair,
qpair->cpl_bus_addr = queuemem_phys + cmdsz;
prpmem_phys = queuemem_phys + cmdsz + cplsz;
+ callout_init(&qpair->timer, 1);
+ qpair->timer_armed = false;
+ qpair->recovery_state = RECOVERY_NONE;
+
/*
* Calcuate the stride of the doorbell register. Many emulators set this
* value to correspond to a cache line. However, some hardware has set
@@ -776,7 +779,6 @@ nvme_qpair_construct(struct nvme_qpair *qpair,
DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK);
bus_dmamap_create(qpair->dma_tag_payload, 0,
&tr->payload_dma_map);
- callout_init(&tr->timer, 1);
tr->cid = i;
tr->qpair = qpair;
tr->prp = (uint64_t *)prp_list;
@@ -835,6 +837,8 @@ nvme_qpair_destroy(struct nvme_qpair *qpair)
{
struct nvme_tracker *tr;
+ callout_drain(&qpair->timer);
+
if (qpair->tag) {
bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag);
qpair->tag = NULL;
@@ -914,65 +918,101 @@ nvme_io_qpair_destroy(struct nvme_qpair *qpair)
}
static void
-nvme_abort_complete(void *arg, const struct nvme_completion *status)
+nvme_qpair_timeout(void *arg)
{
- struct nvme_tracker *tr = arg;
+ struct nvme_qpair *qpair = arg;
+ struct nvme_controller *ctrlr = qpair->ctrlr;
+ struct nvme_tracker *tr;
+ struct nvme_tracker *tr_temp;
+ sbintime_t now;
+ bool idle;
+ uint32_t csts;
+ uint8_t cfs;
- /*
- * If cdw0 == 1, the controller was not able to abort the command
- * we requested. We still need to check the active tracker array,
- * to cover race where I/O timed out at same time controller was
- * completing the I/O.
- */
- if (status->cdw0 == 1 && tr->qpair->act_tr[tr->cid] != NULL) {
+ mtx_lock(&qpair->lock);
+ idle = TAILQ_EMPTY(&qpair->outstanding_tr);
+again:
+ switch (qpair->recovery_state) {
+ case RECOVERY_NONE:
+ if (idle)
+ break;
+ now = getsbinuptime();
+ TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
+ if (now > tr->deadline && tr->deadline != 0) {
+ /*
+ * We're now passed our earliest deadline. We
+ * need to do expensive things to cope, but next
+ * time. Flag that and close the door to any
+ * further processing.
+ */
+ qpair->recovery_state = RECOVERY_START;
+ nvme_printf(ctrlr, "RECOVERY_START %jd vs %jd\n",
+ (uintmax_t)now, (uintmax_t)tr->deadline);
+ break;
+ }
+ }
+ break;
+ case RECOVERY_START:
/*
- * An I/O has timed out, and the controller was unable to
- * abort it for some reason. Construct a fake completion
- * status, and then complete the I/O's tracker manually.
+ * Read csts to get value of cfs - controller fatal status.
+ * If no fatal status, try to call the completion routine, and
+ * if completes transactions, report a missed interrupt and
+ * return (this may need to be rate limited). Otherwise, if
+ * aborts are enabled and the controller is not reporting
+ * fatal status, abort the command. Otherwise, just reset the
+ * controller and hope for the best.
*/
- nvme_printf(tr->qpair->ctrlr,
- "abort command failed, aborting command manually\n");
- nvme_qpair_manual_complete_tracker(tr,
- NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL);
+ csts = nvme_mmio_read_4(ctrlr, csts);
+ cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK;
+ if (cfs) {
+ nvme_printf(ctrlr, "Controller in fatal status, resetting\n");
+ qpair->recovery_state = RECOVERY_RESET;
+ goto again;
+ }
+ mtx_unlock(&qpair->lock);
+ if (nvme_qpair_process_completions(qpair)) {
+ nvme_printf(ctrlr, "Completions present in output without an interrupt\n");
+ qpair->recovery_state = RECOVERY_NONE;
+ } else {
+ nvme_printf(ctrlr, "timeout with nothing complete, resetting\n");
+ qpair->recovery_state = RECOVERY_RESET;
+ mtx_lock(&qpair->lock);
+ goto again;
+ }
+ mtx_lock(&qpair->lock);
+ break;
+ case RECOVERY_RESET:
+ /*
+ * If we get here due to a possible surprise hot-unplug event,
+ * then we let nvme_ctrlr_reset confirm and fail the
+ * controller.
+ */
+ nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
+ cfs ? " and fatal error status" : "");
+ nvme_printf(ctrlr, "RECOVERY_WAITING\n");
+ qpair->recovery_state = RECOVERY_WAITING;
+ nvme_ctrlr_reset(ctrlr);
+ break;
+ case RECOVERY_WAITING:
+ nvme_printf(ctrlr, "waiting\n");
+ break;
}
-}
-
-static void
-nvme_timeout(void *arg)
-{
- struct nvme_tracker *tr = arg;
- struct nvme_qpair *qpair = tr->qpair;
- struct nvme_controller *ctrlr = qpair->ctrlr;
- uint32_t csts;
- uint8_t cfs;
/*
- * Read csts to get value of cfs - controller fatal status.
- * If no fatal status, try to call the completion routine, and
- * if completes transactions, report a missed interrupt and
- * return (this may need to be rate limited). Otherwise, if
- * aborts are enabled and the controller is not reporting
- * fatal status, abort the command. Otherwise, just reset the
- * controller and hope for the best.
+ * Rearm the timeout.
*/
- csts = nvme_mmio_read_4(ctrlr, csts);
- cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK;
- if (cfs == 0 && nvme_qpair_process_completions(qpair)) {
- nvme_printf(ctrlr, "Missing interrupt\n");
- return;
- }
- if (ctrlr->enable_aborts && cfs == 0) {
- nvme_printf(ctrlr, "Aborting command due to a timeout.\n");
- nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
- nvme_abort_complete, tr);
+ if (!idle) {
+ callout_schedule(&qpair->timer, hz / 2);
} else {
- nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
- (csts == NVME_GONE) ? " and possible hot unplug" :
- (cfs ? " and fatal error status" : ""));
- nvme_ctrlr_reset(ctrlr);
+ qpair->timer_armed = false;
}
+ mtx_unlock(&qpair->lock);
}
+/*
+ * Submit the tracker to the hardware. Must already be in the
+ * outstanding queue when called.
+ */
void
nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
{
@@ -989,12 +1029,17 @@ nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
if (req->timeout) {
if (req->cb_fn == nvme_completion_poll_cb)
- timeout = hz;
+ timeout = 1;
else
- timeout = ctrlr->timeout_period * hz;
- callout_reset_on(&tr->timer, timeout, nvme_timeout, tr,
- qpair->cpu);
- }
+ timeout = ctrlr->timeout_period;
+ tr->deadline = getsbinuptime() + timeout * SBT_1S;
+ if (!qpair->timer_armed) {
+ qpair->timer_armed = true;
+ callout_reset_on(&qpair->timer, hz / 2,
+ nvme_qpair_timeout, qpair, qpair->cpu);
+ }
+ } else
+ tr->deadline = SBT_MAX;
/* Copy the command from the tracker to the submission queue. */
memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd));
@@ -1069,7 +1114,7 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
tr = TAILQ_FIRST(&qpair->free_tr);
req->qpair = qpair;
- if (tr == NULL || !qpair->is_enabled) {
+ if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
/*
* No tracker is available, or the qpair is disabled due to
* an in-progress controller-level reset or controller
@@ -1096,6 +1141,8 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq);
+ if (!qpair->timer_armed)
+ tr->deadline = SBT_MAX;
tr->req = req;
switch (req->type) {
@@ -1164,8 +1211,9 @@ nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
static void
nvme_qpair_enable(struct nvme_qpair *qpair)
{
+ mtx_assert(&qpair->lock, MA_OWNED);
- qpair->is_enabled = true;
+ qpair->recovery_state = RECOVERY_NONE;
}
void
@@ -1208,7 +1256,9 @@ nvme_admin_qpair_enable(struct nvme_qpair *qpair)
NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
}
+ mtx_lock(&qpair->lock);
nvme_qpair_enable(qpair);
+ mtx_unlock(&qpair->lock);
}
void
@@ -1251,12 +1301,13 @@ nvme_io_qpair_enable(struct nvme_qpair *qpair)
static void
nvme_qpair_disable(struct nvme_qpair *qpair)
{
- struct nvme_tracker *tr;
+ struct nvme_tracker *tr, *tr_temp;
- qpair->is_enabled = false;
mtx_lock(&qpair->lock);
- TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq)
- callout_stop(&tr->timer);
+ qpair->recovery_state = RECOVERY_WAITING;
+ TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
+ tr->deadline = SBT_MAX;
+ }
mtx_unlock(&qpair->lock);
}