Diffstat (limited to 'module/zfs/vdev_queue.c')
 -rw-r--r--  module/zfs/vdev_queue.c | 444
 1 file changed, 245 insertions(+), 199 deletions(-)
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 0cad5839bb34..092b3f375be0 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -121,7 +121,7 @@
* The maximum number of i/os active to each device. Ideally, this will be >=
* the sum of each queue's max_active.
*/
-uint32_t zfs_vdev_max_active = 1000;
+uint_t zfs_vdev_max_active = 1000;
/*
* Per-queue limits on the number of i/os active to each device. If the
@@ -141,24 +141,24 @@ uint32_t zfs_vdev_max_active = 1000;
* more quickly, but reads and writes to have higher latency and lower
* throughput.
*/
-static uint32_t zfs_vdev_sync_read_min_active = 10;
-static uint32_t zfs_vdev_sync_read_max_active = 10;
-static uint32_t zfs_vdev_sync_write_min_active = 10;
-static uint32_t zfs_vdev_sync_write_max_active = 10;
-static uint32_t zfs_vdev_async_read_min_active = 1;
-/* */ uint32_t zfs_vdev_async_read_max_active = 3;
-static uint32_t zfs_vdev_async_write_min_active = 2;
-/* */ uint32_t zfs_vdev_async_write_max_active = 10;
-static uint32_t zfs_vdev_scrub_min_active = 1;
-static uint32_t zfs_vdev_scrub_max_active = 3;
-static uint32_t zfs_vdev_removal_min_active = 1;
-static uint32_t zfs_vdev_removal_max_active = 2;
-static uint32_t zfs_vdev_initializing_min_active = 1;
-static uint32_t zfs_vdev_initializing_max_active = 1;
-static uint32_t zfs_vdev_trim_min_active = 1;
-static uint32_t zfs_vdev_trim_max_active = 2;
-static uint32_t zfs_vdev_rebuild_min_active = 1;
-static uint32_t zfs_vdev_rebuild_max_active = 3;
+static uint_t zfs_vdev_sync_read_min_active = 10;
+static uint_t zfs_vdev_sync_read_max_active = 10;
+static uint_t zfs_vdev_sync_write_min_active = 10;
+static uint_t zfs_vdev_sync_write_max_active = 10;
+static uint_t zfs_vdev_async_read_min_active = 1;
+/* */ uint_t zfs_vdev_async_read_max_active = 3;
+static uint_t zfs_vdev_async_write_min_active = 2;
+/* */ uint_t zfs_vdev_async_write_max_active = 10;
+static uint_t zfs_vdev_scrub_min_active = 1;
+static uint_t zfs_vdev_scrub_max_active = 3;
+static uint_t zfs_vdev_removal_min_active = 1;
+static uint_t zfs_vdev_removal_max_active = 2;
+static uint_t zfs_vdev_initializing_min_active = 1;
+static uint_t zfs_vdev_initializing_max_active = 1;
+static uint_t zfs_vdev_trim_min_active = 1;
+static uint_t zfs_vdev_trim_max_active = 2;
+static uint_t zfs_vdev_rebuild_min_active = 1;
+static uint_t zfs_vdev_rebuild_max_active = 3;
/*
* When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
@@ -167,8 +167,8 @@ static uint32_t zfs_vdev_rebuild_max_active = 3;
* zfs_vdev_async_write_max_active. The value is linearly interpolated
* between min and max.
*/
-int zfs_vdev_async_write_active_min_dirty_percent = 30;
-int zfs_vdev_async_write_active_max_dirty_percent = 60;
+uint_t zfs_vdev_async_write_active_min_dirty_percent = 30;
+uint_t zfs_vdev_async_write_active_max_dirty_percent = 60;
/*
* For non-interactive I/O (scrub, resilver, removal, initialize and rebuild),
@@ -198,10 +198,10 @@ static uint_t zfs_vdev_nia_credit = 5;
* we include spans of optional I/Os to aid aggregation at the disk even when
* they aren't able to help us aggregate at this level.
*/
-static int zfs_vdev_aggregation_limit = 1 << 20;
-static int zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE;
-static int zfs_vdev_read_gap_limit = 32 << 10;
-static int zfs_vdev_write_gap_limit = 4 << 10;
+static uint_t zfs_vdev_aggregation_limit = 1 << 20;
+static uint_t zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE;
+static uint_t zfs_vdev_read_gap_limit = 32 << 10;
+static uint_t zfs_vdev_write_gap_limit = 4 << 10;
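A rough standalone sketch of the adjacency test these gap limits feed into (illustrative names; the patch itself uses the IO_SPAN and IO_GAP macros plus extra rules for optional I/Os):

#include <stdbool.h>
#include <stdint.h>

/*
 * Simplified sketch: a neighbouring queued I/O may join an aggregate when
 * the combined span stays within the aggregation limit and the hole
 * between the previous I/O's end and the candidate stays within the
 * read/write gap limit.
 */
static bool
can_aggregate(uint64_t first_off, uint64_t prev_end, uint64_t next_off,
    uint64_t next_size, uint64_t limit, uint64_t maxgap)
{
	uint64_t span = next_off + next_size - first_off;
	uint64_t gap = (next_off > prev_end) ? next_off - prev_end : 0;

	return (span <= limit && gap <= maxgap);
}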
/*
* Define the queue depth percentage for each top-level. This percentage is
@@ -214,9 +214,9 @@ static int zfs_vdev_write_gap_limit = 4 << 10;
* to 30 allocations per device.
*/
#ifdef _KERNEL
-int zfs_vdev_queue_depth_pct = 1000;
+uint_t zfs_vdev_queue_depth_pct = 1000;
#else
-int zfs_vdev_queue_depth_pct = 300;
+uint_t zfs_vdev_queue_depth_pct = 300;
#endif
/*
@@ -226,14 +226,7 @@ int zfs_vdev_queue_depth_pct = 300;
* we assume that the average allocation size is 4k, so we need the queue depth
* to be 32 per allocator to get good aggregation of sequential writes.
*/
-int zfs_vdev_def_queue_depth = 32;
-
-/*
- * Allow TRIM I/Os to be aggregated. This should normally not be needed since
- * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted
- * by the TRIM code in zfs_trim.c.
- */
-static int zfs_vdev_aggregate_trim = 0;
+uint_t zfs_vdev_def_queue_depth = 32;
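As a rough cross-check of the comment above: a queue depth of 32 with the assumed 4 KiB average allocation covers 32 * 4 KiB = 128 KiB of queued writes per allocator, i.e. roughly one SPA_OLD_MAXBLOCKSIZE worth of sequentially aggregatable data.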
static int
vdev_queue_offset_compare(const void *x1, const void *x2)
@@ -249,39 +242,64 @@ vdev_queue_offset_compare(const void *x1, const void *x2)
return (TREE_PCMP(z1, z2));
}
-static inline avl_tree_t *
-vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
-{
- return (&vq->vq_class[p].vqc_queued_tree);
-}
-
-static inline avl_tree_t *
-vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
-{
- ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM);
- if (t == ZIO_TYPE_READ)
- return (&vq->vq_read_offset_tree);
- else if (t == ZIO_TYPE_WRITE)
- return (&vq->vq_write_offset_tree);
- else
- return (&vq->vq_trim_offset_tree);
-}
+#define VDQ_T_SHIFT 29
static int
-vdev_queue_timestamp_compare(const void *x1, const void *x2)
+vdev_queue_to_compare(const void *x1, const void *x2)
{
const zio_t *z1 = (const zio_t *)x1;
const zio_t *z2 = (const zio_t *)x2;
- int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp);
+ int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT,
+ z2->io_timestamp >> VDQ_T_SHIFT);
+ int ocmp = TREE_CMP(z1->io_offset, z2->io_offset);
+ int cmp = tcmp ? tcmp : ocmp;
- if (likely(cmp))
+ if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE)))
return (cmp);
return (TREE_PCMP(z1, z2));
}
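To make the shift concrete (illustrative, not part of the patch): 1 << VDQ_T_SHIFT nanoseconds is 1 << 29 ns, about 0.54 s, so I/Os queued within roughly half a second of each other land in the same bucket and sort by offset, while any strictly older bucket always sorts first and cannot be starved by newer arrivals.

#include <stdint.h>

/* Sketch: do two nanosecond timestamps share the same ~0.5 s bucket? */
static int
same_time_bucket(uint64_t t1_ns, uint64_t t2_ns)
{
	return ((t1_ns >> 29) == (t2_ns >> 29));
}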
-static int
+static inline boolean_t
+vdev_queue_class_fifo(zio_priority_t p)
+{
+ return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE ||
+ p == ZIO_PRIORITY_TRIM);
+}
+
+static void
+vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio)
+{
+ zio_priority_t p = zio->io_priority;
+ vq->vq_cqueued |= 1U << p;
+ if (vdev_queue_class_fifo(p)) {
+ list_insert_tail(&vq->vq_class[p].vqc_list, zio);
+ vq->vq_class[p].vqc_list_numnodes++;
+ }
+	else
+ avl_add(&vq->vq_class[p].vqc_tree, zio);
+}
+
+static void
+vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio)
+{
+ zio_priority_t p = zio->io_priority;
+ uint32_t empty;
+ if (vdev_queue_class_fifo(p)) {
+ list_t *list = &vq->vq_class[p].vqc_list;
+ list_remove(list, zio);
+ empty = list_is_empty(list);
+ vq->vq_class[p].vqc_list_numnodes--;
+ } else {
+ avl_tree_t *tree = &vq->vq_class[p].vqc_tree;
+ avl_remove(tree, zio);
+ empty = avl_is_empty(tree);
+ }
+ vq->vq_cqueued &= ~(empty << p);
+}
+
+static uint_t
vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
{
switch (p) {
@@ -313,10 +331,10 @@ vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
}
}
-static int
+static uint_t
vdev_queue_max_async_writes(spa_t *spa)
{
- int writes;
+ uint_t writes;
uint64_t dirty = 0;
dsl_pool_t *dp = spa_get_dsl(spa);
uint64_t min_bytes = zfs_dirty_data_max *
@@ -359,8 +377,8 @@ vdev_queue_max_async_writes(spa_t *spa)
return (writes);
}
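The hunk above only shows the tail of vdev_queue_max_async_writes(); a minimal sketch of the linear interpolation it is described as performing between the min/max dirty percentages (assumed shape, illustrative names):

#include <stdint.h>

/*
 * Sketch: scale the async write limit linearly from min_active to
 * max_active as the amount of dirty data moves between the min and max
 * dirty thresholds (in bytes).
 */
static unsigned int
interp_async_writes(uint64_t dirty, uint64_t min_bytes, uint64_t max_bytes,
    unsigned int min_active, unsigned int max_active)
{
	if (dirty <= min_bytes)
		return (min_active);
	if (dirty >= max_bytes)
		return (max_active);
	return (min_active + (unsigned int)((max_active - min_active) *
	    (dirty - min_bytes) / (max_bytes - min_bytes)));
}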
-static int
-vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
+static uint_t
+vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p)
{
switch (p) {
case ZIO_PRIORITY_SYNC_READ:
@@ -370,7 +388,7 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
case ZIO_PRIORITY_ASYNC_READ:
return (zfs_vdev_async_read_max_active);
case ZIO_PRIORITY_ASYNC_WRITE:
- return (vdev_queue_max_async_writes(spa));
+ return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa));
case ZIO_PRIORITY_SCRUB:
if (vq->vq_ia_active > 0) {
return (MIN(vq->vq_nia_credit,
@@ -414,10 +432,10 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
static zio_priority_t
vdev_queue_class_to_issue(vdev_queue_t *vq)
{
- spa_t *spa = vq->vq_vdev->vdev_spa;
- zio_priority_t p, n;
+ uint32_t cq = vq->vq_cqueued;
+ zio_priority_t p, p1;
- if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
+ if (cq == 0 || vq->vq_active >= zfs_vdev_max_active)
return (ZIO_PRIORITY_NUM_QUEUEABLE);
/*
@@ -425,14 +443,18 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
* Do round-robin to reduce starvation due to zfs_vdev_max_active
* and vq_nia_credit limits.
*/
- for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) {
- p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE;
- if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
- vq->vq_class[p].vqc_active <
- vdev_queue_class_min_active(vq, p)) {
- vq->vq_last_prio = p;
- return (p);
- }
+ p1 = vq->vq_last_prio + 1;
+ if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE)
+ p1 = 0;
+ for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
+ vdev_queue_class_min_active(vq, p))
+ goto found;
+ }
+ for (p = 0; p < p1; p++) {
+ if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
+ vdev_queue_class_min_active(vq, p))
+ goto found;
}
/*
@@ -440,16 +462,14 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
* maximum # outstanding i/os.
*/
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
- if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
- vq->vq_class[p].vqc_active <
- vdev_queue_class_max_active(spa, vq, p)) {
- vq->vq_last_prio = p;
- return (p);
- }
+ if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
+ vdev_queue_class_max_active(vq, p))
+ break;
}
- /* No eligible queued i/os */
- return (ZIO_PRIORITY_NUM_QUEUEABLE);
+found:
+ vq->vq_last_prio = p;
+ return (p);
}
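For illustration (simplified, illustrative names): vq_cqueued keeps one bit per priority class that has queued I/Os, so the two scan loops above can test "class non-empty" with a mask operation instead of walking per-class trees.

#include <stdint.h>

/* Sketch: bit p is set while priority class p has at least one queued I/O. */
static int
class_has_queued(uint32_t cqueued, unsigned int p)
{
	return ((cqueued & (1U << p)) != 0);
}

/*
 * Clear bit p only when the class just became empty, mirroring the
 * add/remove helpers earlier in this patch.
 */
static void
class_clear_if_empty(uint32_t *cqueued, unsigned int p, uint32_t empty)
{
	*cqueued &= ~(empty << p);
}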
void
@@ -458,42 +478,30 @@ vdev_queue_init(vdev_t *vd)
vdev_queue_t *vq = &vd->vdev_queue;
zio_priority_t p;
- mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
vq->vq_vdev = vd;
- taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent);
-
- avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
- sizeof (zio_t), offsetof(struct zio, io_queue_node));
- avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
- vdev_queue_offset_compare, sizeof (zio_t),
- offsetof(struct zio, io_offset_node));
- avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
- vdev_queue_offset_compare, sizeof (zio_t),
- offsetof(struct zio, io_offset_node));
- avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM),
- vdev_queue_offset_compare, sizeof (zio_t),
- offsetof(struct zio, io_offset_node));
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
- int (*compfn) (const void *, const void *);
-
- /*
- * The synchronous/trim i/o queues are dispatched in FIFO rather
- * than LBA order. This provides more consistent latency for
- * these i/os.
- */
- if (p == ZIO_PRIORITY_SYNC_READ ||
- p == ZIO_PRIORITY_SYNC_WRITE ||
- p == ZIO_PRIORITY_TRIM) {
- compfn = vdev_queue_timestamp_compare;
+ if (vdev_queue_class_fifo(p)) {
+ list_create(&vq->vq_class[p].vqc_list,
+ sizeof (zio_t),
+ offsetof(struct zio, io_queue_node.l));
} else {
- compfn = vdev_queue_offset_compare;
+ avl_create(&vq->vq_class[p].vqc_tree,
+ vdev_queue_to_compare, sizeof (zio_t),
+ offsetof(struct zio, io_queue_node.a));
}
- avl_create(vdev_queue_class_tree(vq, p), compfn,
- sizeof (zio_t), offsetof(struct zio, io_queue_node));
}
+ avl_create(&vq->vq_read_offset_tree,
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
+ avl_create(&vq->vq_write_offset_tree,
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
vq->vq_last_offset = 0;
+ list_create(&vq->vq_active_list, sizeof (struct zio),
+ offsetof(struct zio, io_queue_node.l));
+ mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
}
void
@@ -501,30 +509,39 @@ vdev_queue_fini(vdev_t *vd)
{
vdev_queue_t *vq = &vd->vdev_queue;
- for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
- avl_destroy(vdev_queue_class_tree(vq, p));
- avl_destroy(&vq->vq_active_tree);
- avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
- avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
- avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM));
+ for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ if (vdev_queue_class_fifo(p))
+ list_destroy(&vq->vq_class[p].vqc_list);
+ else
+ avl_destroy(&vq->vq_class[p].vqc_tree);
+ }
+ avl_destroy(&vq->vq_read_offset_tree);
+ avl_destroy(&vq->vq_write_offset_tree);
+ list_destroy(&vq->vq_active_list);
mutex_destroy(&vq->vq_lock);
}
static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
- ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
- avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
+ zio->io_queue_state = ZIO_QS_QUEUED;
+ vdev_queue_class_add(vq, zio);
+ if (zio->io_type == ZIO_TYPE_READ)
+ avl_add(&vq->vq_read_offset_tree, zio);
+ else if (zio->io_type == ZIO_TYPE_WRITE)
+ avl_add(&vq->vq_write_offset_tree, zio);
}
static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
- ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
- avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
+ vdev_queue_class_remove(vq, zio);
+ if (zio->io_type == ZIO_TYPE_READ)
+ avl_remove(&vq->vq_read_offset_tree, zio);
+ else if (zio->io_type == ZIO_TYPE_WRITE)
+ avl_remove(&vq->vq_write_offset_tree, zio);
+ zio->io_queue_state = ZIO_QS_NONE;
}
static boolean_t
@@ -546,14 +563,16 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
{
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- vq->vq_class[zio->io_priority].vqc_active++;
+ vq->vq_cactive[zio->io_priority]++;
+ vq->vq_active++;
if (vdev_queue_is_interactive(zio->io_priority)) {
if (++vq->vq_ia_active == 1)
vq->vq_nia_credit = 1;
} else if (vq->vq_ia_active > 0) {
vq->vq_nia_credit--;
}
- avl_add(&vq->vq_active_tree, zio);
+ zio->io_queue_state = ZIO_QS_ACTIVE;
+ list_insert_tail(&vq->vq_active_list, zio);
}
static void
@@ -561,7 +580,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
{
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- vq->vq_class[zio->io_priority].vqc_active--;
+ vq->vq_cactive[zio->io_priority]--;
+ vq->vq_active--;
if (vdev_queue_is_interactive(zio->io_priority)) {
if (--vq->vq_ia_active == 0)
vq->vq_nia_credit = 0;
@@ -569,7 +589,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
vq->vq_nia_credit = zfs_vdev_nia_credit;
} else if (vq->vq_ia_active == 0)
vq->vq_nia_credit++;
- avl_remove(&vq->vq_active_tree, zio);
+ list_remove(&vq->vq_active_list, zio);
+ zio->io_queue_state = ZIO_QS_NONE;
}
static void
@@ -602,29 +623,28 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
uint64_t maxgap = 0;
uint64_t size;
uint64_t limit;
- int maxblocksize;
boolean_t stretch = B_FALSE;
- avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
- enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
uint64_t next_offset;
abd_t *abd;
+ avl_tree_t *t;
+
+ /*
+ * TRIM aggregation should not be needed since code in zfs_trim.c can
+ * submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M).
+ */
+ if (zio->io_type == ZIO_TYPE_TRIM)
+ return (NULL);
+
+ if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
+ return (NULL);
- maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa);
if (vq->vq_vdev->vdev_nonrot)
limit = zfs_vdev_aggregation_limit_non_rotating;
else
limit = zfs_vdev_aggregation_limit;
- limit = MAX(MIN(limit, maxblocksize), 0);
-
- if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
- return (NULL);
-
- /*
- * While TRIM commands could be aggregated based on offset this
- * behavior is disabled until it's determined to be beneficial.
- */
- if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
+ if (limit == 0)
return (NULL);
+ limit = MIN(limit, SPA_MAXBLOCKSIZE);
/*
* I/Os to distributed spares are directly dispatched to the dRAID
@@ -635,8 +655,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
first = last = zio;
- if (zio->io_type == ZIO_TYPE_READ)
+ if (zio->io_type == ZIO_TYPE_READ) {
maxgap = zfs_vdev_read_gap_limit;
+ t = &vq->vq_read_offset_tree;
+ } else {
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ t = &vq->vq_write_offset_tree;
+ }
/*
* We can aggregate I/Os that are sufficiently adjacent and of
@@ -657,6 +682,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
* Walk backwards through sufficiently contiguous I/Os
* recording the last non-optional I/O.
*/
+ zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
while ((dio = AVL_PREV(t, first)) != NULL &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
IO_SPAN(dio, last) <= limit &&
@@ -686,7 +712,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
(IO_SPAN(first, dio) <= limit ||
(dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
- IO_SPAN(first, dio) <= maxblocksize &&
+ IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE &&
IO_GAP(last, dio) <= maxgap &&
dio->io_type == zio->io_type) {
last = dio;
@@ -725,6 +751,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
* after our span is mandatory.
*/
dio = AVL_NEXT(t, last);
+ ASSERT3P(dio, !=, NULL);
dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
} else {
/* do not include the optional i/o */
@@ -739,7 +766,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
return (NULL);
size = IO_SPAN(first, last);
- ASSERT3U(size, <=, maxblocksize);
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
abd = abd_alloc_gang();
if (abd == NULL)
@@ -747,8 +774,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
abd, size, first->io_type, zio->io_priority,
- flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
- vdev_queue_agg_io_done, NULL);
+ flags | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL);
aio->io_timestamp = first->io_timestamp;
nio = first;
@@ -756,6 +782,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
do {
dio = nio;
nio = AVL_NEXT(t, dio);
+ ASSERT3P(dio, !=, NULL);
zio_add_child(dio, aio);
vdev_queue_io_remove(vq, dio);
@@ -823,19 +850,30 @@ again:
return (NULL);
}
- /*
- * For LBA-ordered queues (async / scrub / initializing), issue the
- * i/o which follows the most recently issued i/o in LBA (offset) order.
- *
- * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp.
- */
- tree = vdev_queue_class_tree(vq, p);
- vq->vq_io_search.io_timestamp = 0;
- vq->vq_io_search.io_offset = vq->vq_last_offset - 1;
- VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL);
- zio = avl_nearest(tree, idx, AVL_AFTER);
- if (zio == NULL)
- zio = avl_first(tree);
+ if (vdev_queue_class_fifo(p)) {
+ zio = list_head(&vq->vq_class[p].vqc_list);
+ } else {
+ /*
+ * For LBA-ordered queues (async / scrub / initializing),
+ * issue the I/O which follows the most recently issued I/O
+ * in LBA (offset) order, but to avoid starvation only within
+ * the same 0.5 second interval as the first I/O.
+ */
+ tree = &vq->vq_class[p].vqc_tree;
+ zio = aio = avl_first(tree);
+ if (zio->io_offset < vq->vq_last_offset) {
+ vq->vq_io_search.io_timestamp = zio->io_timestamp;
+ vq->vq_io_search.io_offset = vq->vq_last_offset;
+ zio = avl_find(tree, &vq->vq_io_search, &idx);
+ if (zio == NULL) {
+ zio = avl_nearest(tree, idx, AVL_AFTER);
+ if (zio == NULL ||
+ (zio->io_timestamp >> VDQ_T_SHIFT) !=
+ (aio->io_timestamp >> VDQ_T_SHIFT))
+ zio = aio;
+ }
+ }
+ }
ASSERT3U(zio->io_priority, ==, p);
aio = vdev_queue_aggregate(vq, zio);
@@ -905,7 +943,7 @@ vdev_queue_io(zio_t *zio)
ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM);
}
- zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
+ zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
zio->io_timestamp = gethrtime();
mutex_enter(&vq->vq_lock);
@@ -966,7 +1004,6 @@ void
vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
{
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
- avl_tree_t *tree;
/*
* ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio
@@ -1001,12 +1038,11 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
* Otherwise, the zio is currently active and we cannot change its
* priority.
*/
- tree = vdev_queue_class_tree(vq, zio->io_priority);
- if (avl_find(tree, zio, NULL) == zio) {
- avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ if (zio->io_queue_state == ZIO_QS_QUEUED) {
+ vdev_queue_class_remove(vq, zio);
zio->io_priority = priority;
- avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
- } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
+ vdev_queue_class_add(vq, zio);
+ } else if (zio->io_queue_state == ZIO_QS_NONE) {
zio->io_priority = priority;
}
@@ -1019,10 +1055,10 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
* vq_lock mutex use here, instead we prefer to keep it lock free for
* performance.
*/
-int
+uint32_t
vdev_queue_length(vdev_t *vd)
{
- return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+ return (vd->vdev_queue.vq_active);
}
uint64_t
@@ -1031,89 +1067,99 @@ vdev_queue_last_offset(vdev_t *vd)
return (vd->vdev_queue.vq_last_offset);
}
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, INT, ZMOD_RW,
+uint64_t
+vdev_queue_class_length(vdev_t *vd, zio_priority_t p)
+{
+ vdev_queue_t *vq = &vd->vdev_queue;
+ if (vdev_queue_class_fifo(p))
+ return (vq->vq_class[p].vqc_list_numnodes);
+ else
+ return (avl_numnodes(&vq->vq_class[p].vqc_tree));
+}
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW,
"Max vdev I/O aggregation size");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, INT,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT,
ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, INT, ZMOD_RW,
- "Allow TRIM I/O to be aggregated");
-
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW,
"Aggregate read I/O over gap");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, write_gap_limit, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, write_gap_limit, UINT, ZMOD_RW,
"Aggregate write I/O over gap");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_active, UINT, ZMOD_RW,
"Maximum number of active I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_max_dirty_percent, INT,
- ZMOD_RW, "Async write concurrency max threshold");
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_max_dirty_percent,
+ UINT, ZMOD_RW, "Async write concurrency max threshold");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_min_dirty_percent, INT,
- ZMOD_RW, "Async write concurrency min threshold");
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_min_dirty_percent,
+ UINT, ZMOD_RW, "Async write concurrency min threshold");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_max_active, UINT, ZMOD_RW,
"Max active async read I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_min_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_min_active, UINT, ZMOD_RW,
"Min active async read I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_max_active, UINT, ZMOD_RW,
"Max active async write I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_min_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_min_active, UINT, ZMOD_RW,
"Min active async write I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_max_active, UINT, ZMOD_RW,
"Max active initializing I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_min_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_min_active, UINT, ZMOD_RW,
"Min active initializing I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_max_active, UINT, ZMOD_RW,
"Max active removal I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_min_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_min_active, UINT, ZMOD_RW,
"Min active removal I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_max_active, UINT, ZMOD_RW,
"Max active scrub I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_min_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_min_active, UINT, ZMOD_RW,
"Min active scrub I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_max_active, UINT, ZMOD_RW,
"Max active sync read I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_min_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_min_active, UINT, ZMOD_RW,
"Min active sync read I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_max_active, UINT, ZMOD_RW,
"Max active sync write I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_min_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_min_active, UINT, ZMOD_RW,
"Min active sync write I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, UINT, ZMOD_RW,
"Max active trim/discard I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, UINT, ZMOD_RW,
"Min active trim/discard I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, UINT, ZMOD_RW,
"Max active rebuild I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, UINT, ZMOD_RW,
"Min active rebuild I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, UINT, ZMOD_RW,
"Number of non-interactive I/Os to allow in sequence");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, UINT, ZMOD_RW,
"Number of non-interactive I/Os before _max_active");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, UINT, ZMOD_RW,
"Queue depth percentage for each top-level vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, def_queue_depth, UINT, ZMOD_RW,
+ "Default queue depth for each allocator");