Diffstat (limited to 'sys/dev/nvme/nvme_ctrlr.c')
-rw-r--r-- | sys/dev/nvme/nvme_ctrlr.c | 412
1 file changed, 229 insertions, 183 deletions
diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index 73a7cee4aad0..3a1894bf754d 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -41,6 +41,9 @@
 #include <sys/endian.h>
 #include <sys/stdarg.h>
 #include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
 
 #include "nvme_private.h"
 #include "nvme_linux.h"
@@ -48,7 +51,7 @@
 #define B4_CHK_RDY_DELAY_MS	2300	/* work around controller bug */
 
 static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
-	struct nvme_async_event_request *aer);
+    struct nvme_async_event_request *aer);
 
 static void
 nvme_ctrlr_barrier(struct nvme_controller *ctrlr, int flags)
@@ -597,7 +600,6 @@ nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr)
 static bool
 is_log_page_id_valid(uint8_t page_id)
 {
-
 	switch (page_id) {
 	case NVME_LOG_ERROR:
 	case NVME_LOG_HEALTH_INFORMATION:
@@ -653,7 +655,6 @@ static void
 nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr,
     uint8_t state)
 {
-
 	if (state & NVME_CRIT_WARN_ST_AVAILABLE_SPARE)
 		nvme_printf(ctrlr, "SMART WARNING: available spare space below threshold\n");
 
@@ -680,96 +681,6 @@ nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr,
 }
 
 static void
-nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl)
-{
-	struct nvme_async_event_request *aer = arg;
-	struct nvme_health_information_page *health_info;
-	struct nvme_ns_list *nsl;
-	struct nvme_error_information_entry *err;
-	int i;
-
-	/*
-	 * If the log page fetch for some reason completed with an error,
-	 * don't pass log page data to the consumers. In practice, this case
-	 * should never happen.
-	 */
-	if (nvme_completion_is_error(cpl))
-		nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
-		    aer->log_page_id, NULL, 0);
-	else {
-		/* Convert data to host endian */
-		switch (aer->log_page_id) {
-		case NVME_LOG_ERROR:
-			err = (struct nvme_error_information_entry *)aer->log_page_buffer;
-			for (i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++)
-				nvme_error_information_entry_swapbytes(err++);
-			break;
-		case NVME_LOG_HEALTH_INFORMATION:
-			nvme_health_information_page_swapbytes(
-			    (struct nvme_health_information_page *)aer->log_page_buffer);
-			break;
-		case NVME_LOG_CHANGED_NAMESPACE:
-			nvme_ns_list_swapbytes(
-			    (struct nvme_ns_list *)aer->log_page_buffer);
-			break;
-		case NVME_LOG_COMMAND_EFFECT:
-			nvme_command_effects_page_swapbytes(
-			    (struct nvme_command_effects_page *)aer->log_page_buffer);
-			break;
-		case NVME_LOG_RES_NOTIFICATION:
-			nvme_res_notification_page_swapbytes(
-			    (struct nvme_res_notification_page *)aer->log_page_buffer);
-			break;
-		case NVME_LOG_SANITIZE_STATUS:
-			nvme_sanitize_status_page_swapbytes(
-			    (struct nvme_sanitize_status_page *)aer->log_page_buffer);
-			break;
-		default:
-			break;
-		}
-
-		if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
-			health_info = (struct nvme_health_information_page *)
-			    aer->log_page_buffer;
-			nvme_ctrlr_log_critical_warnings(aer->ctrlr,
-			    health_info->critical_warning);
-			/*
-			 * Critical warnings reported through the
-			 * SMART/health log page are persistent, so
-			 * clear the associated bits in the async event
-			 * config so that we do not receive repeated
-			 * notifications for the same event.
-			 */
-			aer->ctrlr->async_event_config &=
-			    ~health_info->critical_warning;
-			nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
-			    aer->ctrlr->async_event_config, NULL, NULL);
-		} else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE &&
-		    !nvme_use_nvd) {
-			nsl = (struct nvme_ns_list *)aer->log_page_buffer;
-			for (i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) {
-				if (nsl->ns[i] > NVME_MAX_NAMESPACES)
-					break;
-				nvme_notify_ns(aer->ctrlr, nsl->ns[i]);
-			}
-		}
-
-		/*
-		 * Pass the cpl data from the original async event completion,
-		 * not the log page fetch.
-		 */
-		nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
-		    aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
-	}
-
-	/*
-	 * Repost another asynchronous event request to replace the one
-	 * that just completed.
-	 */
-	nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
-}
-
-static void
 nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
 {
 	struct nvme_async_event_request *aer = arg;
@@ -784,33 +695,18 @@ nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
 		return;
 	}
 
-	/* Associated log page is in bits 23:16 of completion entry dw0. */
+	/*
+	 * Save the completion status; the associated log page id is in bits
+	 * 23:16 of completion entry dw0. Print a message and queue the event
+	 * for further processing.
+	 */
+	memcpy(&aer->cpl, cpl, sizeof(*cpl));
 	aer->log_page_id = NVMEV(NVME_ASYNC_EVENT_LOG_PAGE_ID, cpl->cdw0);
-
 	nvme_printf(aer->ctrlr, "async event occurred (type 0x%x, info 0x%02x,"
 	    " page 0x%02x)\n", NVMEV(NVME_ASYNC_EVENT_TYPE, cpl->cdw0),
 	    NVMEV(NVME_ASYNC_EVENT_INFO, cpl->cdw0),
 	    aer->log_page_id);
-
-	if (is_log_page_id_valid(aer->log_page_id)) {
-		aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr,
-		    aer->log_page_id);
-		memcpy(&aer->cpl, cpl, sizeof(*cpl));
-		nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
-		    NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer,
-		    aer->log_page_size, nvme_ctrlr_async_event_log_page_cb,
-		    aer);
-		/* Wait to notify consumers until after log page is fetched. */
-	} else {
-		nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id,
-		    NULL, 0);
-
-		/*
-		 * Repost another asynchronous event request to replace the one
-		 * that just completed.
-		 */
-		nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
-	}
+	taskqueue_enqueue(aer->ctrlr->taskqueue, &aer->task);
 }
 
 static void
@@ -819,15 +715,21 @@ nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
 {
 	struct nvme_request *req;
 
-	aer->ctrlr = ctrlr;
 	/*
-	 * XXX-MJ this should be M_WAITOK but we might be in a non-sleepable
-	 * callback context.  AER completions should be handled on a dedicated
-	 * thread.
+	 * We're racing the reset thread, so let that process submit this again.
+	 * XXX does this really solve that race? And is that race even possible,
+	 * since we only reset when we've not heard from the card in a long
+	 * time? Why would we get an AER in the middle of that, just before we
+	 * kick off the reset?
	 */
-	req = nvme_allocate_request_null(M_NOWAIT, nvme_ctrlr_async_event_cb,
+	if (ctrlr->is_resetting)
+		return;
+
+	aer->ctrlr = ctrlr;
+	req = nvme_allocate_request_null(M_WAITOK, nvme_ctrlr_async_event_cb,
 	    aer);
 	aer->req = req;
+	aer->log_page_id = 0;	/* Not a valid page */
 
 	/*
 	 * Disable timeout here, since asynchronous event requests should by
@@ -880,7 +782,6 @@ nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr)
 static void
 nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr)
 {
-
 	ctrlr->int_coal_time = 0;
 	TUNABLE_INT_FETCH("hw.nvme.int_coal_time",
 	    &ctrlr->int_coal_time);
@@ -1203,6 +1104,140 @@ nvme_ctrlr_reset_task(void *arg, int pending)
 	atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
 }
 
+static void
+nvme_ctrlr_aer_done(void *arg, const struct nvme_completion *cpl)
+{
+	struct nvme_async_event_request *aer = arg;
+
+	mtx_lock(&aer->mtx);
+	if (nvme_completion_is_error(cpl))
+		aer->log_page_size = (uint32_t)-1;
+	else
+		aer->log_page_size = nvme_ctrlr_get_log_page_size(
+		    aer->ctrlr, aer->log_page_id);
+	wakeup(aer);
+	mtx_unlock(&aer->mtx);
+}
+
+static void
+nvme_ctrlr_aer_task(void *arg, int pending)
+{
+	struct nvme_async_event_request *aer = arg;
+	struct nvme_controller *ctrlr = aer->ctrlr;
+	uint32_t len;
+
+	/*
+	 * We're resetting, so just punt.
+	 */
+	if (ctrlr->is_resetting)
+		return;
+
+	if (!is_log_page_id_valid(aer->log_page_id)) {
+		/*
+		 * No log page to fetch: notify consumers with no page data
+		 * and fall through to repost the request.
+		 */
+		nvme_notify_async_consumers(ctrlr, &aer->cpl, aer->log_page_id,
+		    NULL, 0);
+		goto out;
+	}
+
+	aer->log_page_size = 0;
+	len = nvme_ctrlr_get_log_page_size(aer->ctrlr, aer->log_page_id);
+	nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
+	    NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer, len,
+	    nvme_ctrlr_aer_done, aer);
+	mtx_lock(&aer->mtx);
+	while (aer->log_page_size == 0)
+		mtx_sleep(aer, &aer->mtx, PRIBIO, "nvme_pt", 0);
+	mtx_unlock(&aer->mtx);
+
+	if (aer->log_page_size == (uint32_t)-1) {
+		/*
+		 * If the log page fetch for some reason completed with an
+		 * error, don't pass log page data to the consumers. In
+		 * practice, this case should never happen.
+		 */
+		nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
+		    aer->log_page_id, NULL, 0);
+		goto out;
+	}
+
+	/* Convert data to host endian */
+	switch (aer->log_page_id) {
+	case NVME_LOG_ERROR: {
+		struct nvme_error_information_entry *err =
+		    (struct nvme_error_information_entry *)aer->log_page_buffer;
+		for (int i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++)
+			nvme_error_information_entry_swapbytes(err++);
+		break;
+	}
+	case NVME_LOG_HEALTH_INFORMATION:
+		nvme_health_information_page_swapbytes(
+		    (struct nvme_health_information_page *)aer->log_page_buffer);
+		break;
+	case NVME_LOG_CHANGED_NAMESPACE:
+		nvme_ns_list_swapbytes(
+		    (struct nvme_ns_list *)aer->log_page_buffer);
+		break;
+	case NVME_LOG_COMMAND_EFFECT:
+		nvme_command_effects_page_swapbytes(
+		    (struct nvme_command_effects_page *)aer->log_page_buffer);
+		break;
+	case NVME_LOG_RES_NOTIFICATION:
+		nvme_res_notification_page_swapbytes(
+		    (struct nvme_res_notification_page *)aer->log_page_buffer);
+		break;
+	case NVME_LOG_SANITIZE_STATUS:
+		nvme_sanitize_status_page_swapbytes(
+		    (struct nvme_sanitize_status_page *)aer->log_page_buffer);
+		break;
+	default:
+		break;
+	}
+
+	if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
+		struct nvme_health_information_page *health_info =
+		    (struct nvme_health_information_page *)aer->log_page_buffer;
+
+		/*
+		 * Critical warnings reported through the SMART/health log page
+		 * are persistent, so clear the associated bits in the async
+		 * event config so that we do not receive repeated
+		 * notifications for the same event.
+		 */
+		nvme_ctrlr_log_critical_warnings(aer->ctrlr,
+		    health_info->critical_warning);
+		aer->ctrlr->async_event_config &=
+		    ~health_info->critical_warning;
+		nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
+		    aer->ctrlr->async_event_config, NULL, NULL);
+	} else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE) {
+		struct nvme_ns_list *nsl =
+		    (struct nvme_ns_list *)aer->log_page_buffer;
+		for (int i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) {
+			if (nsl->ns[i] > NVME_MAX_NAMESPACES)
+				break;
+			nvme_notify_ns(aer->ctrlr, nsl->ns[i]);
+		}
+	}
+
+	/*
+	 * Pass the cpl data from the original async event completion, not the
+	 * log page fetch.
+	 */
+	nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
+	    aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
+
+	/*
+	 * Repost another asynchronous event request to replace the one
+	 * that just completed.
+	 */
+out:
+	nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
+}
+
 /*
  * Poll all the queues enabled on the device for completion.
  */
@@ -1233,6 +1268,34 @@ nvme_ctrlr_shared_handler(void *arg)
 	nvme_mmio_write_4(ctrlr, intmc, 1);
 }
 
+#define NVME_MAX_PAGES	(int)(1024 / sizeof(vm_page_t))
+
+static int
+nvme_user_ioctl_req(vm_offset_t addr, size_t len, bool is_read,
+    vm_page_t *upages, int max_pages, int *npagesp, struct nvme_request **req,
+    nvme_cb_fn_t cb_fn, void *cb_arg)
+{
+	vm_prot_t prot = VM_PROT_READ;
+	int err;
+
+	if (is_read)
+		prot |= VM_PROT_WRITE;	/* Device will write to host memory */
+	err = vm_fault_hold_pages(&curproc->p_vmspace->vm_map,
+	    addr, len, prot, upages, max_pages, npagesp);
+	if (err != 0)
+		return (err);
+	*req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg);
+	(*req)->payload = memdesc_vmpages(upages, len, addr & PAGE_MASK);
+	(*req)->payload_valid = true;
+	return (0);
+}
+
+static void
+nvme_user_ioctl_free(vm_page_t *pages, int npage)
+{
+	vm_page_unhold_pages(pages, npage);
+}
+
 static void
 nvme_pt_done(void *arg, const struct nvme_completion *cpl)
 {
@@ -1255,30 +1318,28 @@ nvme_pt_done(void *arg, const struct nvme_completion *cpl)
 
 int
 nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
-    struct nvme_pt_command *pt, uint32_t nsid, int is_user_buffer,
+    struct nvme_pt_command *pt, uint32_t nsid, int is_user,
     int is_admin_cmd)
 {
-	struct nvme_request	*req;
-	struct mtx		*mtx;
-	struct buf		*buf = NULL;
-	int			ret = 0;
+	struct nvme_request *req;
+	struct mtx *mtx;
+	int ret = 0;
+	int npages = 0;
+	vm_page_t upages[NVME_MAX_PAGES];
 
 	if (pt->len > 0) {
 		if (pt->len > ctrlr->max_xfer_size) {
-			nvme_printf(ctrlr, "pt->len (%d) "
-			    "exceeds max_xfer_size (%d)\n", pt->len,
-			    ctrlr->max_xfer_size);
-			return EIO;
+			nvme_printf(ctrlr,
+			    "len (%d) exceeds max_xfer_size (%d)\n",
+			    pt->len, ctrlr->max_xfer_size);
+			return (EIO);
 		}
-		if (is_user_buffer) {
-			buf = uma_zalloc(pbuf_zone, M_WAITOK);
-			buf->b_iocmd = pt->is_read ? BIO_READ : BIO_WRITE;
-			if (vmapbuf(buf, pt->buf, pt->len, 1) < 0) {
-				ret = EFAULT;
-				goto err;
-			}
-			req = nvme_allocate_request_vaddr(buf->b_data, pt->len,
-			    M_WAITOK, nvme_pt_done, pt);
+		if (is_user) {
+			ret = nvme_user_ioctl_req((vm_offset_t)pt->buf, pt->len,
+			    pt->is_read, upages, nitems(upages), &npages, &req,
+			    nvme_pt_done, pt);
+			if (ret != 0)
+				return (ret);
 		} else
 			req = nvme_allocate_request_vaddr(pt->buf, pt->len,
 			    M_WAITOK, nvme_pt_done, pt);
@@ -1312,11 +1373,8 @@ nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
 		mtx_sleep(pt, mtx, PRIBIO, "nvme_pt", 0);
 	mtx_unlock(mtx);
 
-	if (buf != NULL) {
-		vunmapbuf(buf);
-err:
-		uma_zfree(pbuf_zone, buf);
-	}
+	if (npages > 0)
+		nvme_user_ioctl_free(upages, npages);
 
 	return (ret);
 }
@@ -1342,8 +1400,9 @@ nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr,
 {
 	struct nvme_request *req;
 	struct mtx *mtx;
-	struct buf *buf = NULL;
 	int ret = 0;
+	int npages = 0;
+	vm_page_t upages[NVME_MAX_PAGES];
 
 	/*
 	 * We don't support metadata.
@@ -1354,28 +1413,16 @@ nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr,
 	if (npc->data_len > 0 && npc->addr != 0) {
 		if (npc->data_len > ctrlr->max_xfer_size) {
 			nvme_printf(ctrlr,
-			    "npc->data_len (%d) exceeds max_xfer_size (%d)\n",
+			    "data_len (%d) exceeds max_xfer_size (%d)\n",
 			    npc->data_len, ctrlr->max_xfer_size);
 			return (EIO);
 		}
-		/*
-		 * We only support data out or data in commands, but not both at
-		 * once. However, there's some comands with lower bit cleared
-		 * that are really read commands, so we should filter & 3 == 0,
-		 * but don't.
-		 */
-		if ((npc->opcode & 0x3) == 3)
-			return (EINVAL);
 		if (is_user) {
-			buf = uma_zalloc(pbuf_zone, M_WAITOK);
-			buf->b_iocmd = npc->opcode & 1 ? BIO_WRITE : BIO_READ;
-			if (vmapbuf(buf, (void *)(uintptr_t)npc->addr,
-			    npc->data_len, 1) < 0) {
-				ret = EFAULT;
-				goto err;
-			}
-			req = nvme_allocate_request_vaddr(buf->b_data,
-			    npc->data_len, M_WAITOK, nvme_npc_done, npc);
+			ret = nvme_user_ioctl_req(npc->addr, npc->data_len,
+			    npc->opcode & 0x1, upages, nitems(upages), &npages,
+			    &req, nvme_npc_done, npc);
+			if (ret != 0)
+				return (ret);
 		} else
 			req = nvme_allocate_request_vaddr(
 			    (void *)(uintptr_t)npc->addr, npc->data_len,
@@ -1385,8 +1432,8 @@ nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr,
 
 	req->cmd.opc = npc->opcode;
 	req->cmd.fuse = npc->flags;
-	req->cmd.rsvd2 = htole16(npc->cdw2);
-	req->cmd.rsvd3 = htole16(npc->cdw3);
+	req->cmd.rsvd2 = htole32(npc->cdw2);
+	req->cmd.rsvd3 = htole32(npc->cdw3);
 	req->cmd.cdw10 = htole32(npc->cdw10);
 	req->cmd.cdw11 = htole32(npc->cdw11);
 	req->cmd.cdw12 = htole32(npc->cdw12);
@@ -1410,11 +1457,8 @@ nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr,
 		mtx_sleep(npc, mtx, PRIBIO, "nvme_npc", 0);
 	mtx_unlock(mtx);
 
-	if (buf != NULL) {
-		vunmapbuf(buf);
-err:
-		uma_zfree(pbuf_zone, buf);
-	}
+	if (npages > 0)
+		nvme_user_ioctl_free(upages, npages);
 
 	return (ret);
 }
@@ -1574,13 +1618,8 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
 	/*
 	 * Create 2 threads for the taskqueue. The reset thread will block when
 	 * it detects that the controller has failed until all I/O has been
-	 * failed up the stack. The fail_req task needs to be able to run in
-	 * this case to finish the request failure for some cases.
-	 *
-	 * We could partially solve this race by draining the failed requeust
-	 * queue before proceding to free the sim, though nothing would stop
-	 * new I/O from coming in after we do that drain, but before we reach
-	 * cam_sim_free, so this big hammer is used instead.
+	 * failed up the stack. The second thread is used for AER events, which
+	 * can block, but only briefly for memory and log page fetching.
	 */
 	ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
 	    taskqueue_thread_enqueue, &ctrlr->taskqueue);
@@ -1590,7 +1629,12 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
 	ctrlr->is_initialized = false;
 	ctrlr->notification_sent = 0;
 	TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
-	STAILQ_INIT(&ctrlr->fail_req);
+	for (int i = 0; i < NVME_MAX_ASYNC_EVENTS; i++) {
+		struct nvme_async_event_request *aer = &ctrlr->aer[i];
+
+		TASK_INIT(&aer->task, 0, nvme_ctrlr_aer_task, aer);
+		mtx_init(&aer->mtx, "AER mutex", NULL, MTX_DEF);
+	}
 	ctrlr->is_failed = false;
 
 	make_dev_args_init(&md_args);
@@ -1678,8 +1722,14 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
 	}
 
 noadminq:
-	if (ctrlr->taskqueue)
+	if (ctrlr->taskqueue) {
 		taskqueue_free(ctrlr->taskqueue);
+		for (int i = 0; i < NVME_MAX_ASYNC_EVENTS; i++) {
+			struct nvme_async_event_request *aer = &ctrlr->aer[i];
+
+			mtx_destroy(&aer->mtx);
+		}
+	}
 
 	if (ctrlr->tag)
 		bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);
@@ -1735,7 +1785,6 @@ void
 nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr,
     struct nvme_request *req)
 {
-
 	nvme_qpair_submit_request(&ctrlr->adminq, req);
 }
 
@@ -1752,14 +1801,12 @@ nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
 device_t
 nvme_ctrlr_get_device(struct nvme_controller *ctrlr)
 {
-
 	return (ctrlr->dev);
 }
 
 const struct nvme_controller_data *
 nvme_ctrlr_get_data(struct nvme_controller *ctrlr)
 {
-
 	return (&ctrlr->cdata);
 }
 
@@ -1812,7 +1859,6 @@ nvme_ctrlr_suspend(struct nvme_controller *ctrlr)
 int
 nvme_ctrlr_resume(struct nvme_controller *ctrlr)
 {
-
 	/*
 	 * Can't touch failed controllers, so nothing to do to resume.
 	 */
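
Note on the handoff in this change: nvme_ctrlr_async_event_cb() runs in completion context, so it only saves the cpl and enqueues aer->task; all sleepable work (the M_WAITOK allocation, the log page fetch, the mtx_sleep() wait) moves to nvme_ctrlr_aer_task() on the taskqueue thread. The standalone program below is a sketch of that done/task handshake modeled with pthreads instead of the kernel's mtx_sleep()/wakeup(); every name in it is hypothetical and it illustrates only the shape of the protocol, not driver code.

/*
 * Userland analogue of the nvme_ctrlr_aer_done()/nvme_ctrlr_aer_task()
 * handshake: the "task" kicks off an operation, then sleeps until the
 * "completion" publishes a size (or (uint32_t)-1 on error) and wakes it.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct fake_aer {
	pthread_mutex_t	mtx;
	pthread_cond_t	cv;
	uint32_t	log_page_size;	/* 0 = pending, (uint32_t)-1 = error */
};

/* Stands in for nvme_ctrlr_aer_done(): runs in "completion" context. */
static void *
completion_thread(void *arg)
{
	struct fake_aer *aer = arg;

	pthread_mutex_lock(&aer->mtx);
	aer->log_page_size = 512;	/* pretend the fetch succeeded */
	pthread_cond_signal(&aer->cv);	/* wakeup(aer) analogue */
	pthread_mutex_unlock(&aer->mtx);
	return (NULL);
}

int
main(void)
{
	struct fake_aer aer = { .log_page_size = 0 };
	pthread_t tid;

	pthread_mutex_init(&aer.mtx, NULL);
	pthread_cond_init(&aer.cv, NULL);
	pthread_create(&tid, NULL, completion_thread, &aer);

	/* Stands in for the mtx_sleep() loop in nvme_ctrlr_aer_task(). */
	pthread_mutex_lock(&aer.mtx);
	while (aer.log_page_size == 0)
		pthread_cond_wait(&aer.cv, &aer.mtx);
	pthread_mutex_unlock(&aer.mtx);

	if (aer.log_page_size == (uint32_t)-1)
		printf("log page fetch failed\n");
	else
		printf("log page size: %u\n", (unsigned)aer.log_page_size);
	pthread_join(tid, NULL);
	return (0);
}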
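Note on nvme_user_ioctl_req(): it replaces the pbuf/vmapbuf path by holding the user pages directly (vm_fault_hold_pages) and describing them to busdma via memdesc_vmpages, with no kernel mapping. The prot logic reads backwards at first glance but is correct: a "read" from the device means DMA writes into host memory, so the pages need VM_PROT_WRITE. The upages[] array is capped at 1024 bytes of stack, i.e. 128 vm_page_t pointers on LP64, so a transfer may span at most 128 pages, and an unaligned buffer burns one extra page for the same length. A standalone program checking that arithmetic, with constants mirroring (not taken from) the driver:

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE	4096u
#define PAGE_MASK	(PAGE_SIZE - 1)
#define NVME_MAX_PAGES	(int)(1024 / sizeof(void *))	/* 128 on LP64 */

/* Pages spanned by a buffer starting at 'addr' of 'len' bytes. */
static int
pages_spanned(size_t addr, size_t len)
{
	return (int)(((addr & PAGE_MASK) + len + PAGE_MASK) / PAGE_SIZE);
}

int
main(void)
{
	printf("max pages: %d\n", NVME_MAX_PAGES);
	printf("aligned 512 KiB: %d pages\n",
	    pages_spanned(0x10000, 512 * 1024));	/* exactly 128 */
	printf("unaligned 512 KiB: %d pages\n",
	    pages_spanned(0x10200, 512 * 1024));	/* 129: one page over */
	return (0);
}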
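Note on the SMART warning masking in nvme_ctrlr_aer_task(): critical warnings in the health log page are persistent, so the driver clears the reported bits out of async_event_config to stop the controller from re-arming the same notification. A minimal sketch of that masking; the bit values follow the NVMe critical warning byte, while the variable names are hypothetical:

#include <stdint.h>
#include <stdio.h>

#define CRIT_WARN_AVAILABLE_SPARE	0x01
#define CRIT_WARN_TEMPERATURE		0x02
#define CRIT_WARN_DEGRADED		0x04
#define CRIT_WARN_READ_ONLY		0x08
#define CRIT_WARN_VOLATILE_BACKUP	0x10

int
main(void)
{
	uint32_t async_event_config = CRIT_WARN_AVAILABLE_SPARE |
	    CRIT_WARN_TEMPERATURE | CRIT_WARN_DEGRADED |
	    CRIT_WARN_READ_ONLY | CRIT_WARN_VOLATILE_BACKUP;
	uint8_t critical_warning = CRIT_WARN_TEMPERATURE; /* from SMART page */

	/* Same operation as the driver: stop listening for this warning. */
	async_event_config &= ~(uint32_t)critical_warning;

	printf("config after temperature warning: 0x%02x\n",
	    async_event_config);	/* 0x1d: temperature bit cleared */
	return (0);
}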