diff options
Diffstat (limited to 'module/zfs/vdev_queue.c')
-rw-r--r-- | module/zfs/vdev_queue.c | 444 |
1 files changed, 245 insertions, 199 deletions
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 0cad5839bb34..092b3f375be0 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -121,7 +121,7 @@ * The maximum number of i/os active to each device. Ideally, this will be >= * the sum of each queue's max_active. */ -uint32_t zfs_vdev_max_active = 1000; +uint_t zfs_vdev_max_active = 1000; /* * Per-queue limits on the number of i/os active to each device. If the @@ -141,24 +141,24 @@ uint32_t zfs_vdev_max_active = 1000; * more quickly, but reads and writes to have higher latency and lower * throughput. */ -static uint32_t zfs_vdev_sync_read_min_active = 10; -static uint32_t zfs_vdev_sync_read_max_active = 10; -static uint32_t zfs_vdev_sync_write_min_active = 10; -static uint32_t zfs_vdev_sync_write_max_active = 10; -static uint32_t zfs_vdev_async_read_min_active = 1; -/* */ uint32_t zfs_vdev_async_read_max_active = 3; -static uint32_t zfs_vdev_async_write_min_active = 2; -/* */ uint32_t zfs_vdev_async_write_max_active = 10; -static uint32_t zfs_vdev_scrub_min_active = 1; -static uint32_t zfs_vdev_scrub_max_active = 3; -static uint32_t zfs_vdev_removal_min_active = 1; -static uint32_t zfs_vdev_removal_max_active = 2; -static uint32_t zfs_vdev_initializing_min_active = 1; -static uint32_t zfs_vdev_initializing_max_active = 1; -static uint32_t zfs_vdev_trim_min_active = 1; -static uint32_t zfs_vdev_trim_max_active = 2; -static uint32_t zfs_vdev_rebuild_min_active = 1; -static uint32_t zfs_vdev_rebuild_max_active = 3; +static uint_t zfs_vdev_sync_read_min_active = 10; +static uint_t zfs_vdev_sync_read_max_active = 10; +static uint_t zfs_vdev_sync_write_min_active = 10; +static uint_t zfs_vdev_sync_write_max_active = 10; +static uint_t zfs_vdev_async_read_min_active = 1; +/* */ uint_t zfs_vdev_async_read_max_active = 3; +static uint_t zfs_vdev_async_write_min_active = 2; +/* */ uint_t zfs_vdev_async_write_max_active = 10; +static uint_t zfs_vdev_scrub_min_active = 1; +static uint_t zfs_vdev_scrub_max_active = 3; +static uint_t zfs_vdev_removal_min_active = 1; +static uint_t zfs_vdev_removal_max_active = 2; +static uint_t zfs_vdev_initializing_min_active = 1; +static uint_t zfs_vdev_initializing_max_active = 1; +static uint_t zfs_vdev_trim_min_active = 1; +static uint_t zfs_vdev_trim_max_active = 2; +static uint_t zfs_vdev_rebuild_min_active = 1; +static uint_t zfs_vdev_rebuild_max_active = 3; /* * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent @@ -167,8 +167,8 @@ static uint32_t zfs_vdev_rebuild_max_active = 3; * zfs_vdev_async_write_max_active. The value is linearly interpolated * between min and max. */ -int zfs_vdev_async_write_active_min_dirty_percent = 30; -int zfs_vdev_async_write_active_max_dirty_percent = 60; +uint_t zfs_vdev_async_write_active_min_dirty_percent = 30; +uint_t zfs_vdev_async_write_active_max_dirty_percent = 60; /* * For non-interactive I/O (scrub, resilver, removal, initialize and rebuild), @@ -198,10 +198,10 @@ static uint_t zfs_vdev_nia_credit = 5; * we include spans of optional I/Os to aid aggregation at the disk even when * they aren't able to help us aggregate at this level. */ -static int zfs_vdev_aggregation_limit = 1 << 20; -static int zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE; -static int zfs_vdev_read_gap_limit = 32 << 10; -static int zfs_vdev_write_gap_limit = 4 << 10; +static uint_t zfs_vdev_aggregation_limit = 1 << 20; +static uint_t zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE; +static uint_t zfs_vdev_read_gap_limit = 32 << 10; +static uint_t zfs_vdev_write_gap_limit = 4 << 10; /* * Define the queue depth percentage for each top-level. This percentage is @@ -214,9 +214,9 @@ static int zfs_vdev_write_gap_limit = 4 << 10; * to 30 allocations per device. */ #ifdef _KERNEL -int zfs_vdev_queue_depth_pct = 1000; +uint_t zfs_vdev_queue_depth_pct = 1000; #else -int zfs_vdev_queue_depth_pct = 300; +uint_t zfs_vdev_queue_depth_pct = 300; #endif /* @@ -226,14 +226,7 @@ int zfs_vdev_queue_depth_pct = 300; * we assume that the average allocation size is 4k, so we need the queue depth * to be 32 per allocator to get good aggregation of sequential writes. */ -int zfs_vdev_def_queue_depth = 32; - -/* - * Allow TRIM I/Os to be aggregated. This should normally not be needed since - * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted - * by the TRIM code in zfs_trim.c. - */ -static int zfs_vdev_aggregate_trim = 0; +uint_t zfs_vdev_def_queue_depth = 32; static int vdev_queue_offset_compare(const void *x1, const void *x2) @@ -249,39 +242,64 @@ vdev_queue_offset_compare(const void *x1, const void *x2) return (TREE_PCMP(z1, z2)); } -static inline avl_tree_t * -vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) -{ - return (&vq->vq_class[p].vqc_queued_tree); -} - -static inline avl_tree_t * -vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) -{ - ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM); - if (t == ZIO_TYPE_READ) - return (&vq->vq_read_offset_tree); - else if (t == ZIO_TYPE_WRITE) - return (&vq->vq_write_offset_tree); - else - return (&vq->vq_trim_offset_tree); -} +#define VDQ_T_SHIFT 29 static int -vdev_queue_timestamp_compare(const void *x1, const void *x2) +vdev_queue_to_compare(const void *x1, const void *x2) { const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; - int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp); + int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT, + z2->io_timestamp >> VDQ_T_SHIFT); + int ocmp = TREE_CMP(z1->io_offset, z2->io_offset); + int cmp = tcmp ? tcmp : ocmp; - if (likely(cmp)) + if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE))) return (cmp); return (TREE_PCMP(z1, z2)); } -static int +static inline boolean_t +vdev_queue_class_fifo(zio_priority_t p) +{ + return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE || + p == ZIO_PRIORITY_TRIM); +} + +static void +vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio) +{ + zio_priority_t p = zio->io_priority; + vq->vq_cqueued |= 1U << p; + if (vdev_queue_class_fifo(p)) { + list_insert_tail(&vq->vq_class[p].vqc_list, zio); + vq->vq_class[p].vqc_list_numnodes++; + } + else + avl_add(&vq->vq_class[p].vqc_tree, zio); +} + +static void +vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio) +{ + zio_priority_t p = zio->io_priority; + uint32_t empty; + if (vdev_queue_class_fifo(p)) { + list_t *list = &vq->vq_class[p].vqc_list; + list_remove(list, zio); + empty = list_is_empty(list); + vq->vq_class[p].vqc_list_numnodes--; + } else { + avl_tree_t *tree = &vq->vq_class[p].vqc_tree; + avl_remove(tree, zio); + empty = avl_is_empty(tree); + } + vq->vq_cqueued &= ~(empty << p); +} + +static uint_t vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { @@ -313,10 +331,10 @@ vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p) } } -static int +static uint_t vdev_queue_max_async_writes(spa_t *spa) { - int writes; + uint_t writes; uint64_t dirty = 0; dsl_pool_t *dp = spa_get_dsl(spa); uint64_t min_bytes = zfs_dirty_data_max * @@ -359,8 +377,8 @@ vdev_queue_max_async_writes(spa_t *spa) return (writes); } -static int -vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) +static uint_t +vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: @@ -370,7 +388,7 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_max_active); case ZIO_PRIORITY_ASYNC_WRITE: - return (vdev_queue_max_async_writes(spa)); + return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa)); case ZIO_PRIORITY_SCRUB: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, @@ -414,10 +432,10 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) static zio_priority_t vdev_queue_class_to_issue(vdev_queue_t *vq) { - spa_t *spa = vq->vq_vdev->vdev_spa; - zio_priority_t p, n; + uint32_t cq = vq->vq_cqueued; + zio_priority_t p, p1; - if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) + if (cq == 0 || vq->vq_active >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); /* @@ -425,14 +443,18 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) * Do round-robin to reduce starvation due to zfs_vdev_max_active * and vq_nia_credit limits. */ - for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) { - p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE; - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_min_active(vq, p)) { - vq->vq_last_prio = p; - return (p); - } + p1 = vq->vq_last_prio + 1; + if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE) + p1 = 0; + for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_min_active(vq, p)) + goto found; + } + for (p = 0; p < p1; p++) { + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_min_active(vq, p)) + goto found; } /* @@ -440,16 +462,14 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) * maximum # outstanding i/os. */ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_max_active(spa, vq, p)) { - vq->vq_last_prio = p; - return (p); - } + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_max_active(vq, p)) + break; } - /* No eligible queued i/os */ - return (ZIO_PRIORITY_NUM_QUEUEABLE); +found: + vq->vq_last_prio = p; + return (p); } void @@ -458,42 +478,30 @@ vdev_queue_init(vdev_t *vd) vdev_queue_t *vq = &vd->vdev_queue; zio_priority_t p; - mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); vq->vq_vdev = vd; - taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent); - - avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - int (*compfn) (const void *, const void *); - - /* - * The synchronous/trim i/o queues are dispatched in FIFO rather - * than LBA order. This provides more consistent latency for - * these i/os. - */ - if (p == ZIO_PRIORITY_SYNC_READ || - p == ZIO_PRIORITY_SYNC_WRITE || - p == ZIO_PRIORITY_TRIM) { - compfn = vdev_queue_timestamp_compare; + if (vdev_queue_class_fifo(p)) { + list_create(&vq->vq_class[p].vqc_list, + sizeof (zio_t), + offsetof(struct zio, io_queue_node.l)); } else { - compfn = vdev_queue_offset_compare; + avl_create(&vq->vq_class[p].vqc_tree, + vdev_queue_to_compare, sizeof (zio_t), + offsetof(struct zio, io_queue_node.a)); } - avl_create(vdev_queue_class_tree(vq, p), compfn, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); } + avl_create(&vq->vq_read_offset_tree, + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); + avl_create(&vq->vq_write_offset_tree, + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); vq->vq_last_offset = 0; + list_create(&vq->vq_active_list, sizeof (struct zio), + offsetof(struct zio, io_queue_node.l)); + mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); } void @@ -501,30 +509,39 @@ vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; - for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) - avl_destroy(vdev_queue_class_tree(vq, p)); - avl_destroy(&vq->vq_active_tree); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM)); + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if (vdev_queue_class_fifo(p)) + list_destroy(&vq->vq_class[p].vqc_list); + else + avl_destroy(&vq->vq_class[p].vqc_tree); + } + avl_destroy(&vq->vq_read_offset_tree); + avl_destroy(&vq->vq_write_offset_tree); + list_destroy(&vq->vq_active_list); mutex_destroy(&vq->vq_lock); } static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); + zio->io_queue_state = ZIO_QS_QUEUED; + vdev_queue_class_add(vq, zio); + if (zio->io_type == ZIO_TYPE_READ) + avl_add(&vq->vq_read_offset_tree, zio); + else if (zio->io_type == ZIO_TYPE_WRITE) + avl_add(&vq->vq_write_offset_tree, zio); } static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); - avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); + vdev_queue_class_remove(vq, zio); + if (zio->io_type == ZIO_TYPE_READ) + avl_remove(&vq->vq_read_offset_tree, zio); + else if (zio->io_type == ZIO_TYPE_WRITE) + avl_remove(&vq->vq_write_offset_tree, zio); + zio->io_queue_state = ZIO_QS_NONE; } static boolean_t @@ -546,14 +563,16 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active++; + vq->vq_cactive[zio->io_priority]++; + vq->vq_active++; if (vdev_queue_is_interactive(zio->io_priority)) { if (++vq->vq_ia_active == 1) vq->vq_nia_credit = 1; } else if (vq->vq_ia_active > 0) { vq->vq_nia_credit--; } - avl_add(&vq->vq_active_tree, zio); + zio->io_queue_state = ZIO_QS_ACTIVE; + list_insert_tail(&vq->vq_active_list, zio); } static void @@ -561,7 +580,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active--; + vq->vq_cactive[zio->io_priority]--; + vq->vq_active--; if (vdev_queue_is_interactive(zio->io_priority)) { if (--vq->vq_ia_active == 0) vq->vq_nia_credit = 0; @@ -569,7 +589,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) vq->vq_nia_credit = zfs_vdev_nia_credit; } else if (vq->vq_ia_active == 0) vq->vq_nia_credit++; - avl_remove(&vq->vq_active_tree, zio); + list_remove(&vq->vq_active_list, zio); + zio->io_queue_state = ZIO_QS_NONE; } static void @@ -602,29 +623,28 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) uint64_t maxgap = 0; uint64_t size; uint64_t limit; - int maxblocksize; boolean_t stretch = B_FALSE; - avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); - enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; uint64_t next_offset; abd_t *abd; + avl_tree_t *t; + + /* + * TRIM aggregation should not be needed since code in zfs_trim.c can + * submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M). + */ + if (zio->io_type == ZIO_TYPE_TRIM) + return (NULL); + + if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) + return (NULL); - maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa); if (vq->vq_vdev->vdev_nonrot) limit = zfs_vdev_aggregation_limit_non_rotating; else limit = zfs_vdev_aggregation_limit; - limit = MAX(MIN(limit, maxblocksize), 0); - - if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0) - return (NULL); - - /* - * While TRIM commands could be aggregated based on offset this - * behavior is disabled until it's determined to be beneficial. - */ - if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim) + if (limit == 0) return (NULL); + limit = MIN(limit, SPA_MAXBLOCKSIZE); /* * I/Os to distributed spares are directly dispatched to the dRAID @@ -635,8 +655,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) first = last = zio; - if (zio->io_type == ZIO_TYPE_READ) + if (zio->io_type == ZIO_TYPE_READ) { maxgap = zfs_vdev_read_gap_limit; + t = &vq->vq_read_offset_tree; + } else { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + t = &vq->vq_write_offset_tree; + } /* * We can aggregate I/Os that are sufficiently adjacent and of @@ -657,6 +682,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) * Walk backwards through sufficiently contiguous I/Os * recording the last non-optional I/O. */ + zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; while ((dio = AVL_PREV(t, first)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(dio, last) <= limit && @@ -686,7 +712,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && (IO_SPAN(first, dio) <= limit || (dio->io_flags & ZIO_FLAG_OPTIONAL)) && - IO_SPAN(first, dio) <= maxblocksize && + IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE && IO_GAP(last, dio) <= maxgap && dio->io_type == zio->io_type) { last = dio; @@ -725,6 +751,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) * after our span is mandatory. */ dio = AVL_NEXT(t, last); + ASSERT3P(dio, !=, NULL); dio->io_flags &= ~ZIO_FLAG_OPTIONAL; } else { /* do not include the optional i/o */ @@ -739,7 +766,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) return (NULL); size = IO_SPAN(first, last); - ASSERT3U(size, <=, maxblocksize); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); abd = abd_alloc_gang(); if (abd == NULL) @@ -747,8 +774,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, abd, size, first->io_type, zio->io_priority, - flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, - vdev_queue_agg_io_done, NULL); + flags | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; nio = first; @@ -756,6 +782,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) do { dio = nio; nio = AVL_NEXT(t, dio); + ASSERT3P(dio, !=, NULL); zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); @@ -823,19 +850,30 @@ again: return (NULL); } - /* - * For LBA-ordered queues (async / scrub / initializing), issue the - * i/o which follows the most recently issued i/o in LBA (offset) order. - * - * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp. - */ - tree = vdev_queue_class_tree(vq, p); - vq->vq_io_search.io_timestamp = 0; - vq->vq_io_search.io_offset = vq->vq_last_offset - 1; - VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL); - zio = avl_nearest(tree, idx, AVL_AFTER); - if (zio == NULL) - zio = avl_first(tree); + if (vdev_queue_class_fifo(p)) { + zio = list_head(&vq->vq_class[p].vqc_list); + } else { + /* + * For LBA-ordered queues (async / scrub / initializing), + * issue the I/O which follows the most recently issued I/O + * in LBA (offset) order, but to avoid starvation only within + * the same 0.5 second interval as the first I/O. + */ + tree = &vq->vq_class[p].vqc_tree; + zio = aio = avl_first(tree); + if (zio->io_offset < vq->vq_last_offset) { + vq->vq_io_search.io_timestamp = zio->io_timestamp; + vq->vq_io_search.io_offset = vq->vq_last_offset; + zio = avl_find(tree, &vq->vq_io_search, &idx); + if (zio == NULL) { + zio = avl_nearest(tree, idx, AVL_AFTER); + if (zio == NULL || + (zio->io_timestamp >> VDQ_T_SHIFT) != + (aio->io_timestamp >> VDQ_T_SHIFT)) + zio = aio; + } + } + } ASSERT3U(zio->io_priority, ==, p); aio = vdev_queue_aggregate(vq, zio); @@ -905,7 +943,7 @@ vdev_queue_io(zio_t *zio) ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM); } - zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; + zio->io_flags |= ZIO_FLAG_DONT_QUEUE; zio->io_timestamp = gethrtime(); mutex_enter(&vq->vq_lock); @@ -966,7 +1004,6 @@ void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; - avl_tree_t *tree; /* * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio @@ -1001,12 +1038,11 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) * Otherwise, the zio is currently active and we cannot change its * priority. */ - tree = vdev_queue_class_tree(vq, zio->io_priority); - if (avl_find(tree, zio, NULL) == zio) { - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); + if (zio->io_queue_state == ZIO_QS_QUEUED) { + vdev_queue_class_remove(vq, zio); zio->io_priority = priority; - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) { + vdev_queue_class_add(vq, zio); + } else if (zio->io_queue_state == ZIO_QS_NONE) { zio->io_priority = priority; } @@ -1019,10 +1055,10 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) * vq_lock mutex use here, instead we prefer to keep it lock free for * performance. */ -int +uint32_t vdev_queue_length(vdev_t *vd) { - return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); + return (vd->vdev_queue.vq_active); } uint64_t @@ -1031,89 +1067,99 @@ vdev_queue_last_offset(vdev_t *vd) return (vd->vdev_queue.vq_last_offset); } -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, INT, ZMOD_RW, +uint64_t +vdev_queue_class_length(vdev_t *vd, zio_priority_t p) +{ + vdev_queue_t *vq = &vd->vdev_queue; + if (vdev_queue_class_fifo(p)) + return (vq->vq_class[p].vqc_list_numnodes); + else + return (avl_numnodes(&vq->vq_class[p].vqc_tree)); +} + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW, "Max vdev I/O aggregation size"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, INT, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT, ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, INT, ZMOD_RW, - "Allow TRIM I/O to be aggregated"); - -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW, "Aggregate read I/O over gap"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, write_gap_limit, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, write_gap_limit, UINT, ZMOD_RW, "Aggregate write I/O over gap"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_active, UINT, ZMOD_RW, "Maximum number of active I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_max_dirty_percent, INT, - ZMOD_RW, "Async write concurrency max threshold"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_max_dirty_percent, + UINT, ZMOD_RW, "Async write concurrency max threshold"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_min_dirty_percent, INT, - ZMOD_RW, "Async write concurrency min threshold"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_min_dirty_percent, + UINT, ZMOD_RW, "Async write concurrency min threshold"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_max_active, UINT, ZMOD_RW, "Max active async read I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_min_active, UINT, ZMOD_RW, "Min active async read I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_max_active, UINT, ZMOD_RW, "Max active async write I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_min_active, UINT, ZMOD_RW, "Min active async write I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_max_active, UINT, ZMOD_RW, "Max active initializing I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_min_active, UINT, ZMOD_RW, "Min active initializing I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_max_active, UINT, ZMOD_RW, "Max active removal I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_min_active, UINT, ZMOD_RW, "Min active removal I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_max_active, UINT, ZMOD_RW, "Max active scrub I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_min_active, UINT, ZMOD_RW, "Min active scrub I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_max_active, UINT, ZMOD_RW, "Max active sync read I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_min_active, UINT, ZMOD_RW, "Min active sync read I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_max_active, UINT, ZMOD_RW, "Max active sync write I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_min_active, UINT, ZMOD_RW, "Min active sync write I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, UINT, ZMOD_RW, "Max active trim/discard I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, UINT, ZMOD_RW, "Min active trim/discard I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, UINT, ZMOD_RW, "Max active rebuild I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, UINT, ZMOD_RW, "Min active rebuild I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, UINT, ZMOD_RW, "Number of non-interactive I/Os to allow in sequence"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, UINT, ZMOD_RW, "Number of non-interactive I/Os before _max_active"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, UINT, ZMOD_RW, "Queue depth percentage for each top-level vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, def_queue_depth, UINT, ZMOD_RW, + "Default queue depth for each allocator"); |