Diffstat (limited to 'sys/contrib/openzfs/module')
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c |   2
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c |   7
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c   |   2
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_objset.c         |   8
-rw-r--r--  sys/contrib/openzfs/module/zfs/mmp.c                |   5
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa.c                | 191
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_misc.c           |  22
-rw-r--r--  sys/contrib/openzfs/module/zfs/txg.c                |   9
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev.c               |  22
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_label.c         |   4
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz.c         |   5
-rw-r--r--  sys/contrib/openzfs/module/zfs/zap.c                | 336
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_ioctl.c          |   9
-rw-r--r--  sys/contrib/openzfs/module/zfs/zio.c                |  17
-rw-r--r--  sys/contrib/openzfs/module/zfs/zio_inject.c         |   6
15 files changed, 550 insertions, 95 deletions
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
index 6a7c2d2811b1..712ff1b837d7 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
@@ -1259,7 +1259,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
/* Move to a new hashtable entry. */
- zv->zv_hash = zvol_name_hash(zv->zv_name);
+ zv->zv_hash = zvol_name_hash(newname);
hlist_del(&zv->zv_hlink);
hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
index 2cea61a6294c..463c5f705102 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
@@ -429,8 +429,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
/* Determine the logical block size */
int logical_block_size = bdev_logical_block_size(bdev);
- /* Clear the nowritecache bit, causes vdev_reopen() to try again. */
- v->vdev_nowritecache = B_FALSE;
+ /*
+ * If the device has a write cache, clear the nowritecache flag,
+ * so that we start issuing flush requests again.
+ */
+ v->vdev_nowritecache = !zfs_bdev_has_write_cache(bdev);
/* Set when device reports it supports TRIM. */
v->vdev_has_trim = bdev_discard_supported(bdev);
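The zfs_bdev_has_write_cache() helper called above is not part of this hunk; it is presumably provided by the Linux compatibility headers. As a rough, hypothetical sketch (not the actual OpenZFS implementation) of how such a probe could look on kernels that expose QUEUE_FLAG_WC:

    /* Hypothetical sketch only; the real compat helper may differ. */
    static boolean_t
    zfs_bdev_has_write_cache(struct block_device *bdev)
    {
    	/* QUEUE_FLAG_WC is set when the device reports a volatile write cache. */
    	return (test_bit(QUEUE_FLAG_WC, &bdev_get_queue(bdev)->queue_flags));
    }

If the device reports no write cache, vdev_nowritecache stays set and ZFS keeps skipping cache-flush requests to that vdev.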
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index 4b960daf89ee..2a036dc5136b 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -1571,7 +1571,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
/* move to new hashtable entry */
- zv->zv_hash = zvol_name_hash(zv->zv_name);
+ zv->zv_hash = zvol_name_hash(newname);
hlist_del(&zv->zv_hlink);
hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c
index 2ba26f68e398..f1818ae155bd 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_objset.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c
@@ -400,10 +400,10 @@ dnode_hash(const objset_t *os, uint64_t obj)
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
/*
- * The low 6 bits of the pointer don't have much entropy, because
- * the objset_t is larger than 2^6 bytes long.
+ * The lower 11 bits of the pointer don't have much entropy, because
+ * the objset_t is more than 1KB long and so likely aligned to 2KB.
*/
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 11)) & 0xFF];
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF];
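To see why the shift changed, following the comment above: objset_t is a bit over 1 KB, so the kmem cache will typically round allocations up to a 2 KB alignment, leaving the low 11 bits of the pointer essentially constant. A quick illustration of the table index mixed into the CRC:

    osv aligned to 2 KB  =>  low 11 bits are 0
    (osv >> 6)  & 0xFF = 0bxxx00000   only ~8 distinct values across objsets
    (osv >> 11) & 0xFF = 0bxxxxxxxx   a full byte of varying bits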
@@ -1664,12 +1664,14 @@ sync_dnodes_task(void *arg)
sync_objset_arg_t *soa = sda->sda_soa;
objset_t *os = soa->soa_os;
+ uint_t allocator = spa_acq_allocator(os->os_spa);
multilist_sublist_t *ms =
multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx);
dmu_objset_sync_dnodes(ms, soa->soa_tx);
multilist_sublist_unlock(ms);
+ spa_rel_allocator(os->os_spa, allocator);
kmem_free(sda, sizeof (*sda));
diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c
index 66bc0ae60b10..71122542758d 100644
--- a/sys/contrib/openzfs/module/zfs/mmp.c
+++ b/sys/contrib/openzfs/module/zfs/mmp.c
@@ -664,12 +664,13 @@ mmp_thread(void *arg)
(gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
"mmp_last_write %llu mmp_interval %llu "
- "mmp_fail_intervals %llu mmp_fail_ns %llu",
+ "mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu",
spa_name(spa), (u_longlong_t)gethrtime(),
(u_longlong_t)mmp->mmp_last_write,
(u_longlong_t)mmp_interval,
(u_longlong_t)mmp_fail_intervals,
- (u_longlong_t)mmp_fail_ns);
+ (u_longlong_t)mmp_fail_ns,
+ (u_longlong_t)spa->spa_uberblock.ub_txg);
cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
"succeeded in over %llu ms; suspending pool. "
"Hrtime %llu",
diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c
index 96daf51b696a..ec2b674fb7ee 100644
--- a/sys/contrib/openzfs/module/zfs/spa.c
+++ b/sys/contrib/openzfs/module/zfs/spa.c
@@ -208,7 +208,7 @@ static const uint_t zio_taskq_basedc = 80; /* base duty cycle */
static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
#endif
-static uint_t zio_taskq_wr_iss_ncpus = 0;
+static uint_t zio_taskq_write_tpq = 16;
/*
* Report any spa_load_verify errors found, but do not fail spa_load.
@@ -1067,17 +1067,16 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
case ZTI_MODE_SYNC:
/*
- * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus',
- * not to exceed the number of spa allocators.
+ * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs,
+ * not to exceed the number of spa allocators, and align to it.
*/
- if (zio_taskq_wr_iss_ncpus == 0) {
- count = MAX(boot_ncpus / spa->spa_alloc_count, 1);
- } else {
- count = MAX(1,
- boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus));
- }
+ cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
+ count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq));
count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
count = MIN(count, spa->spa_alloc_count);
+ while (spa->spa_alloc_count % count != 0 &&
+ spa->spa_alloc_count < count * 2)
+ count--;
/*
* zio_taskq_batch_pct is unbounded and may exceed 100%, but no
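As a worked illustration of the new sizing (using the default zio_taskq_batch_pct of 80 and the new zio_taskq_write_tpq default of 16), consider a 64-CPU system with 4 allocators:

    cpus  = MAX(1, 64 * 80 / 100)        = 51
    count = MAX(1, 51 / 16)              = 3
    count = MAX(3, (80 + 99) / 100)      = 3
    count = MIN(3, spa_alloc_count = 4)  = 3
    4 % 3 != 0 and 4 < 3 * 2, so count is reduced to 2

The final alignment loop drops count to 2 so that the 4 allocators divide evenly across the write issue taskqs (2 allocators per taskq).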
@@ -1495,15 +1494,11 @@ spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
ASSERT3P(tqs->stqs_taskq, !=, NULL);
ASSERT3U(tqs->stqs_count, !=, 0);
- if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
- (zio != NULL) && (zio->io_wr_iss_tq != NULL)) {
- /* dispatch to assigned write issue taskq */
- tq = zio->io_wr_iss_tq;
- return (tq);
- }
-
if (tqs->stqs_count == 1) {
tq = tqs->stqs_taskq[0];
+ } else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
+ (zio != NULL) && ZIO_HAS_ALLOCATOR(zio)) {
+ tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count];
} else {
tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
}
@@ -3594,11 +3589,16 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
}
/*
- * Perform the import activity check. If the user canceled the import or
- * we detected activity then fail.
+ * Remote host activity check.
+ *
+ * error results:
+ * 0 - no activity detected
+ * EREMOTEIO - remote activity detected
+ * EINTR - user canceled the operation
*/
static int
-spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
+spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
+ boolean_t importing)
{
uint64_t txg = ub->ub_txg;
uint64_t timestamp = ub->ub_timestamp;
@@ -3643,19 +3643,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
import_expire = gethrtime() + import_delay;
- spa_import_progress_set_notes(spa, "Checking MMP activity, waiting "
- "%llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
+ if (importing) {
+ spa_import_progress_set_notes(spa, "Checking MMP activity, "
+ "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
+ }
- int interations = 0;
+ int iterations = 0;
while ((now = gethrtime()) < import_expire) {
- if (interations++ % 30 == 0) {
+ if (importing && iterations++ % 30 == 0) {
spa_import_progress_set_notes(spa, "Checking MMP "
"activity, %llu ms remaining",
(u_longlong_t)NSEC2MSEC(import_expire - now));
}
- (void) spa_import_progress_set_mmp_check(spa_guid(spa),
- NSEC2SEC(import_expire - gethrtime()));
+ if (importing) {
+ (void) spa_import_progress_set_mmp_check(spa_guid(spa),
+ NSEC2SEC(import_expire - gethrtime()));
+ }
vdev_uberblock_load(rvd, ub, &mmp_label);
@@ -3737,6 +3741,61 @@ out:
return (error);
}
+/*
+ * Called from zfs_ioc_clear for a pool that was suspended
+ * after failing mmp write checks.
+ */
+boolean_t
+spa_mmp_remote_host_activity(spa_t *spa)
+{
+ ASSERT(spa_multihost(spa) && spa_suspended(spa));
+
+ nvlist_t *best_label;
+ uberblock_t best_ub;
+
+ /*
+ * Locate the best uberblock on disk
+ */
+ vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label);
+ if (best_label) {
+ /*
+ * confirm that the best hostid matches our hostid
+ */
+ if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) &&
+ spa_get_hostid(spa) !=
+ fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) {
+ nvlist_free(best_label);
+ return (B_TRUE);
+ }
+ nvlist_free(best_label);
+ } else {
+ return (B_TRUE);
+ }
+
+ if (!MMP_VALID(&best_ub) ||
+ !MMP_FAIL_INT_VALID(&best_ub) ||
+ MMP_FAIL_INT(&best_ub) == 0) {
+ return (B_TRUE);
+ }
+
+ if (best_ub.ub_txg != spa->spa_uberblock.ub_txg ||
+ best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) {
+ zfs_dbgmsg("txg mismatch detected during pool clear "
+ "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu",
+ (u_longlong_t)spa->spa_uberblock.ub_txg,
+ (u_longlong_t)best_ub.ub_txg,
+ (u_longlong_t)spa->spa_uberblock.ub_timestamp,
+ (u_longlong_t)best_ub.ub_timestamp);
+ return (B_TRUE);
+ }
+
+ /*
+ * Perform an activity check looking for any remote writer
+ */
+ return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config,
+ B_FALSE) != 0);
+}
+
static int
spa_verify_host(spa_t *spa, nvlist_t *mos_config)
{
@@ -4063,7 +4122,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
}
- int error = spa_activity_check(spa, ub, spa->spa_config);
+ int error =
+ spa_activity_check(spa, ub, spa->spa_config, B_TRUE);
if (error) {
nvlist_free(label);
return (error);
@@ -8771,15 +8831,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
}
static void
-spa_async_probe(spa_t *spa, vdev_t *vd)
+spa_async_fault_vdev(spa_t *spa, vdev_t *vd)
{
- if (vd->vdev_probe_wanted) {
- vd->vdev_probe_wanted = B_FALSE;
- vdev_reopen(vd); /* vdev_open() does the actual probe */
+ if (vd->vdev_fault_wanted) {
+ vd->vdev_fault_wanted = B_FALSE;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ VDEV_AUX_ERR_EXCEEDED);
}
for (int c = 0; c < vd->vdev_children; c++)
- spa_async_probe(spa, vd->vdev_child[c]);
+ spa_async_fault_vdev(spa, vd->vdev_child[c]);
}
static void
@@ -8867,11 +8928,11 @@ spa_async_thread(void *arg)
}
/*
- * See if any devices need to be probed.
+ * See if any devices need to be marked faulted.
*/
- if (tasks & SPA_ASYNC_PROBE) {
+ if (tasks & SPA_ASYNC_FAULT_VDEV) {
spa_vdev_state_enter(spa, SCL_NONE);
- spa_async_probe(spa, spa->spa_root_vdev);
+ spa_async_fault_vdev(spa, spa->spa_root_vdev);
(void) spa_vdev_state_exit(spa, NULL, 0);
}
@@ -10167,16 +10228,10 @@ spa_sync_tq_create(spa_t *spa, const char *name)
VERIFY(spa->spa_sync_tq != NULL);
VERIFY(kthreads != NULL);
- spa_taskqs_t *tqs =
- &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE];
-
spa_syncthread_info_t *ti = spa->spa_syncthreads;
- for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) {
+ for (int i = 0; i < nthreads; i++, ti++) {
ti->sti_thread = kthreads[i];
- if (w == tqs->stqs_count) {
- w = 0;
- }
- ti->sti_wr_iss_tq = tqs->stqs_taskq[w];
+ ti->sti_allocator = i;
}
kmem_free(kthreads, sizeof (*kthreads) * nthreads);
@@ -10195,6 +10250,42 @@ spa_sync_tq_destroy(spa_t *spa)
spa->spa_sync_tq = NULL;
}
+uint_t
+spa_acq_allocator(spa_t *spa)
+{
+ int i;
+
+ if (spa->spa_alloc_count == 1)
+ return (0);
+
+ mutex_enter(&spa->spa_allocs_use->sau_lock);
+ uint_t r = spa->spa_allocs_use->sau_rotor;
+ do {
+ if (++r == spa->spa_alloc_count)
+ r = 0;
+ } while (spa->spa_allocs_use->sau_inuse[r]);
+ spa->spa_allocs_use->sau_inuse[r] = B_TRUE;
+ spa->spa_allocs_use->sau_rotor = r;
+ mutex_exit(&spa->spa_allocs_use->sau_lock);
+
+ spa_syncthread_info_t *ti = spa->spa_syncthreads;
+ for (i = 0; i < spa->spa_alloc_count; i++, ti++) {
+ if (ti->sti_thread == curthread) {
+ ti->sti_allocator = r;
+ break;
+ }
+ }
+ ASSERT3S(i, <, spa->spa_alloc_count);
+ return (r);
+}
+
+void
+spa_rel_allocator(spa_t *spa, uint_t allocator)
+{
+ if (spa->spa_alloc_count > 1)
+ spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE;
+}
+
void
spa_select_allocator(zio_t *zio)
{
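The acquire/release pair above is meant to bracket a sync task so that each concurrently running task temporarily owns a distinct allocator; a minimal caller sketch, mirroring the sync_dnodes_task() change earlier in this diff (surrounding task body elided):

    uint_t allocator = spa_acq_allocator(os->os_spa);
    /* ... writes issued from this thread are bound to 'allocator' ... */
    spa_rel_allocator(os->os_spa, allocator);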
@@ -10222,8 +10313,7 @@ spa_select_allocator(zio_t *zio)
spa_syncthread_info_t *ti = spa->spa_syncthreads;
for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
if (ti->sti_thread == curthread) {
- zio->io_allocator = i;
- zio->io_wr_iss_tq = ti->sti_wr_iss_tq;
+ zio->io_allocator = ti->sti_allocator;
return;
}
}
@@ -10240,7 +10330,6 @@ spa_select_allocator(zio_t *zio)
bm->zb_blkid >> 20);
zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
- zio->io_wr_iss_tq = NULL;
}
/*
@@ -10811,10 +10900,10 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
"Print vdev tree to zfs_dbgmsg during pool import");
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW,
"Percentage of CPUs to run an IO worker thread");
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW,
"Number of threads per IO worker taskqueue");
/* BEGIN CSTYLED */
@@ -10845,13 +10934,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
#ifdef _KERNEL
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
- spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD,
+ spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW,
"Configure IO queues for read IO");
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
- spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD,
+ spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW,
"Configure IO queues for write IO");
#endif
/* END CSTYLED */
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW,
- "Number of CPUs to run write issue taskqs");
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW,
+ "Number of CPUs per write issue taskq");
diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c
index 5fb7847b5d8b..e6d4a9bdb29c 100644
--- a/sys/contrib/openzfs/module/zfs/spa_misc.c
+++ b/sys/contrib/openzfs/module/zfs/spa_misc.c
@@ -394,6 +394,7 @@ static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024;
* Number of allocators to use, per spa instance
*/
static int spa_num_allocators = 4;
+static int spa_cpus_per_allocator = 4;
/*
* Spa active allocator.
@@ -747,8 +748,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
if (altroot)
spa->spa_root = spa_strdup(altroot);
- /* Do not allow more allocators than CPUs. */
- spa->spa_alloc_count = MIN(MAX(spa_num_allocators, 1), boot_ncpus);
+ /* Do not allow more allocators than fraction of CPUs. */
+ spa->spa_alloc_count = MAX(MIN(spa_num_allocators,
+ boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1);
spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
sizeof (spa_alloc_t), KM_SLEEP);
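With the defaults of spa_num_allocators = 4 and the new spa_cpus_per_allocator = 4, the clamp works out, for example, to:

    boot_ncpus = 32:  MAX(MIN(4, 32 / 4), 1) = 4 allocators
    boot_ncpus =  8:  MAX(MIN(4,  8 / 4), 1) = 2 allocators
    boot_ncpus =  2:  MAX(MIN(4,  2 / 4), 1) = 1 allocator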
@@ -758,6 +760,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
}
+ if (spa->spa_alloc_count > 1) {
+ spa->spa_allocs_use = kmem_zalloc(offsetof(spa_allocs_use_t,
+ sau_inuse[spa->spa_alloc_count]), KM_SLEEP);
+ mutex_init(&spa->spa_allocs_use->sau_lock, NULL, MUTEX_DEFAULT,
+ NULL);
+ }
avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
@@ -853,6 +861,11 @@ spa_remove(spa_t *spa)
}
kmem_free(spa->spa_allocs, spa->spa_alloc_count *
sizeof (spa_alloc_t));
+ if (spa->spa_alloc_count > 1) {
+ mutex_destroy(&spa->spa_allocs_use->sau_lock);
+ kmem_free(spa->spa_allocs_use, offsetof(spa_allocs_use_t,
+ sau_inuse[spa->spa_alloc_count]));
+ }
avl_destroy(&spa->spa_metaslabs_by_flushed);
avl_destroy(&spa->spa_sm_logs_by_txg);
@@ -3097,4 +3110,7 @@ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift,
param_get_uint, ZMOD_RW, "Reserved free space in pool");
ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW,
- "Number of allocators per spa, capped by ncpus");
+ "Number of allocators per spa");
+
+ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW,
+ "Minimum number of CPUs per allocators");
diff --git a/sys/contrib/openzfs/module/zfs/txg.c b/sys/contrib/openzfs/module/zfs/txg.c
index a67c043446f5..5ce6be69be14 100644
--- a/sys/contrib/openzfs/module/zfs/txg.c
+++ b/sys/contrib/openzfs/module/zfs/txg.c
@@ -551,6 +551,15 @@ txg_sync_thread(void *arg)
}
/*
+ * When we're suspended, nothing should be changing, and for MMP
+ * we don't want to bump anything that would make it harder to
+ * detect whether another host has been modifying the pool when
+ * we resume after an MMP suspend.
+ */
+ if (spa_suspended(spa))
+ continue;
+
+ /*
* Wait until the quiesce thread hands off a txg to us,
* prompting it to do so if necessary.
*/
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
index d97d0a8100c2..c5551eb6cf6e 100644
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -1664,6 +1664,7 @@ vdev_metaslab_fini(vdev_t *vd)
typedef struct vdev_probe_stats {
boolean_t vps_readable;
boolean_t vps_writeable;
+ boolean_t vps_zio_done_probe;
int vps_flags;
} vdev_probe_stats_t;
@@ -1709,6 +1710,17 @@ vdev_probe_done(zio_t *zio)
(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
spa, vd, NULL, NULL, 0);
zio->io_error = SET_ERROR(ENXIO);
+
+ /*
+ * If this probe was initiated from the zio pipeline, then
+ * change the state via a spa_async_request. Probes that
+ * were initiated from a vdev_open can change the state
+ * as part of the open call.
+ */
+ if (vps->vps_zio_done_probe) {
+ vd->vdev_fault_wanted = B_TRUE;
+ spa_async_request(spa, SPA_ASYNC_FAULT_VDEV);
+ }
}
mutex_enter(&vd->vdev_probe_lock);
@@ -1759,6 +1771,7 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
+ vps->vps_zio_done_probe = (zio != NULL);
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
@@ -1785,15 +1798,6 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
vdev_probe_done, vps,
vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
-
- /*
- * We can't change the vdev state in this context, so we
- * kick off an async task to do it on our behalf.
- */
- if (zio != NULL) {
- vd->vdev_probe_wanted = B_TRUE;
- spa_async_request(spa, SPA_ASYNC_PROBE);
- }
}
if (zio != NULL)
diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c
index c31f48028bbc..ed592514fded 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_label.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_label.c
@@ -2027,6 +2027,7 @@ retry:
/*
* If this isn't a resync due to I/O errors,
* and nothing changed in this transaction group,
+ * and multihost protection isn't enabled,
* and the vdev configuration hasn't changed,
* then there's nothing to do.
*/
@@ -2034,7 +2035,8 @@ retry:
boolean_t changed = uberblock_update(ub, spa->spa_root_vdev,
txg, spa->spa_mmp.mmp_delay);
- if (!changed && list_is_empty(&spa->spa_config_dirty_list))
+ if (!changed && list_is_empty(&spa->spa_config_dirty_list) &&
+ !spa_multihost(spa))
return (0);
}
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
index b03331ec69c6..de7d0fa79478 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
@@ -1891,8 +1891,9 @@ vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
static void
vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
{
- int n, i, c, t, tt;
- int nmissing_rows;
+ int i, c, t, tt;
+ unsigned int n;
+ unsigned int nmissing_rows;
int missing_rows[VDEV_RAIDZ_MAXPARITY];
int parity_map[VDEV_RAIDZ_MAXPARITY];
uint8_t *p, *pp;
diff --git a/sys/contrib/openzfs/module/zfs/zap.c b/sys/contrib/openzfs/module/zfs/zap.c
index da86defb445c..1b6b16fc6662 100644
--- a/sys/contrib/openzfs/module/zfs/zap.c
+++ b/sys/contrib/openzfs/module/zfs/zap.c
@@ -22,6 +22,8 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2023 Alexander Stetsenko <alex.stetsenko@gmail.com>
+ * Copyright (c) 2023, Klara Inc.
*/
/*
@@ -41,6 +43,7 @@
#include <sys/spa.h>
#include <sys/dmu.h>
+#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#include <sys/fs/zfs.h>
@@ -78,9 +81,16 @@
*/
static int zap_iterate_prefetch = B_TRUE;
+/*
+ * Enable ZAP shrinking. When enabled, empty sibling leaf blocks will be
+ * collapsed into a single block.
+ */
+int zap_shrink_enabled = B_TRUE;
+
int fzap_default_block_shift = 14; /* 16k blocksize */
static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
+static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx);
void
fzap_byteswap(void *vbuf, size_t size)
@@ -587,6 +597,72 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
}
static int
+zap_set_idx_range_to_blk(zap_t *zap, uint64_t idx, uint64_t nptrs, uint64_t blk,
+ dmu_tx_t *tx)
+{
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ int epb = bs >> 3; /* entries per block */
+ int err = 0;
+
+ ASSERT(tx != NULL);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ /*
+ * Check for i/o errors
+ */
+ for (int i = 0; i < nptrs; i += epb) {
+ uint64_t blk;
+ err = zap_idx_to_blk(zap, idx + i, &blk);
+ if (err != 0) {
+ return (err);
+ }
+ }
+
+ for (int i = 0; i < nptrs; i++) {
+ err = zap_set_idx_to_blk(zap, idx + i, blk, tx);
+ ASSERT0(err); /* we checked for i/o errors above */
+ if (err != 0)
+ break;
+ }
+
+ return (err);
+}
+
+#define ZAP_PREFIX_HASH(pref, pref_len) ((pref) << (64 - (pref_len)))
+
+/*
+ * Each leaf has a single range of entries (block pointers) in the ZAP ptrtbl.
+ * If two leaves are siblings, their ranges are adjacent and contain the same
+ * number of entries. In order to find out if a leaf has a sibling, we need to
+ * check the range corresponding to the sibling leaf. There is no need to check
+ * all entries in the range; we only need to check the first and the last one.
+ */
+static uint64_t
+check_sibling_ptrtbl_range(zap_t *zap, uint64_t prefix, uint64_t prefix_len)
+{
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ uint64_t h = ZAP_PREFIX_HASH(prefix, prefix_len);
+ uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ uint64_t pref_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - prefix_len;
+ uint64_t nptrs = (1 << pref_diff);
+ uint64_t first;
+ uint64_t last;
+
+ ASSERT3U(idx+nptrs, <=, (1UL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
+
+ if (zap_idx_to_blk(zap, idx, &first) != 0)
+ return (0);
+
+ if (zap_idx_to_blk(zap, idx + nptrs - 1, &last) != 0)
+ return (0);
+
+ if (first != last)
+ return (0);
+ return (first);
+}
+
+static int
zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
{
uint64_t blk;
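For example (values chosen purely for illustration): with zt_shift = 6, an empty leaf with prefix 0b1011 and prefix_len = 4 has sibling prefix 0b1010, and check_sibling_ptrtbl_range() examines:

    idx   = ZAP_HASH_IDX(ZAP_PREFIX_HASH(0b1010, 4), 6) = 0b1010 << 2 = 40
    nptrs = 1 << (6 - 4) = 4        (ptrtbl entries 40..43)

Only if entries 40 and 43 reference the same blkid does a single sibling leaf cover the whole range, and that blkid is returned; otherwise 0 is returned and the shrink stops.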
@@ -958,6 +1034,10 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
if (err == 0) {
zap_entry_remove(&zeh);
zap_increment_num_entries(zn->zn_zap, -1, tx);
+
+ if (zap_leaf_phys(l)->l_hdr.lh_nentries == 0 &&
+ zap_shrink_enabled)
+ return (zap_shrink(zn, l, tx));
}
zap_put_leaf(l);
return (err);
@@ -1222,13 +1302,19 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
ZIO_PRIORITY_ASYNC_READ);
}
- if (zc->zc_leaf &&
- (ZAP_HASH_IDX(zc->zc_hash,
- zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
- zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
+ if (zc->zc_leaf) {
rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
- zap_put_leaf(zc->zc_leaf);
- zc->zc_leaf = NULL;
+
+ /*
+ * The leaf was either shrunk or split.
+ */
+ if ((zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_block_type == 0) ||
+ (ZAP_HASH_IDX(zc->zc_hash,
+ zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
+ zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ }
}
again:
@@ -1237,8 +1323,6 @@ again:
&zc->zc_leaf);
if (err != 0)
return (err);
- } else {
- rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
}
l = zc->zc_leaf;
@@ -1367,6 +1451,242 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
}
}
+/*
+ * Find last allocated block and update freeblk.
+ */
+static void
+zap_trunc(zap_t *zap)
+{
+ uint64_t nentries;
+ uint64_t lastblk;
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_blk > 0) {
+ /* External ptrtbl */
+ nentries = (1 << zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ lastblk = zap_f_phys(zap)->zap_ptrtbl.zt_blk +
+ zap_f_phys(zap)->zap_ptrtbl.zt_numblks - 1;
+ } else {
+ /* Embedded ptrtbl */
+ nentries = (1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+ lastblk = 0;
+ }
+
+ for (uint64_t idx = 0; idx < nentries; idx++) {
+ uint64_t blk;
+ if (zap_idx_to_blk(zap, idx, &blk) != 0)
+ return;
+ if (blk > lastblk)
+ lastblk = blk;
+ }
+
+ ASSERT3U(lastblk, <, zap_f_phys(zap)->zap_freeblk);
+
+ zap_f_phys(zap)->zap_freeblk = lastblk + 1;
+}
+
+/*
+ * ZAP shrinking algorithm.
+ *
+ * We shrink a ZAP recursively, removing empty leaves. We can remove an empty
+ * leaf only if it has a sibling. Sibling leaves have the same prefix length and
+ * their prefixes differ only by the least significant (sibling) bit. We require
+ * both siblings to be empty. This eliminates the need to rehash the remaining
+ * non-empty leaf. When we have removed one of two empty siblings, we set the
+ * ptrtbl entries of the removed leaf to point to the remaining leaf. The prefix
+ * length of the remaining leaf is decremented. As a result, it has a new prefix
+ * and it might have a new sibling, so we repeat the process.
+ *
+ * Steps:
+ * 1. Check if a sibling leaf (sl) exists and is empty.
+ * 2. Release the leaf (l) if it has the sibling bit (slbit) equal to 1.
+ * 3. Release the sibling (sl) to dereference it again with the WRITER lock.
+ * 4. Upgrade the zapdir lock to WRITER (once).
+ * 5. Dereference the released leaves again.
+ * 6. If needed, recheck whether both leaves are still siblings and empty.
+ * 7. Set ptrtbl pointers of the removed leaf (slbit 1) to point to the blkid of
+ * the remaining leaf (slbit 0).
+ * 8. Free disk block of the removed leaf (dmu_free_range).
+ * 9. Decrement prefix_len of the remaining leaf.
+ * 10. Repeat the steps.
+ */
+static int
+zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
+{
+ zap_t *zap = zn->zn_zap;
+ int64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+ uint64_t hash = zn->zn_hash;
+ uint64_t prefix = zap_leaf_phys(l)->l_hdr.lh_prefix;
+ uint64_t prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+ boolean_t trunc = B_FALSE;
+ int err = 0;
+
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0);
+ ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix);
+
+ boolean_t writer = B_FALSE;
+
+ /*
+ * To avoid deadlock always deref leaves in the same order -
+ * sibling 0 first, then sibling 1.
+ */
+ while (prefix_len) {
+ zap_leaf_t *sl;
+ int64_t prefix_diff = zt_shift - prefix_len;
+ uint64_t sl_prefix = prefix ^ 1;
+ uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len);
+ int slbit = prefix & 1;
+
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0);
+
+ /*
+ * Check if there is a sibling by reading ptrtbl ptrs.
+ */
+ if (check_sibling_ptrtbl_range(zap, sl_prefix, prefix_len) == 0)
+ break;
+
+ /*
+ * sibling 1, unlock it - we haven't yet dereferenced sibling 0.
+ */
+ if (slbit == 1) {
+ zap_put_leaf(l);
+ l = NULL;
+ }
+
+ /*
+ * Dereference sibling leaf and check if it is empty.
+ */
+ if ((err = zap_deref_leaf(zap, sl_hash, tx, RW_READER,
+ &sl)) != 0)
+ break;
+
+ ASSERT3U(ZAP_HASH_IDX(sl_hash, prefix_len), ==, sl_prefix);
+
+ /*
+ * Check if we have a sibling and it is empty.
+ */
+ if (zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len ||
+ zap_leaf_phys(sl)->l_hdr.lh_nentries != 0) {
+ zap_put_leaf(sl);
+ break;
+ }
+
+ zap_put_leaf(sl);
+
+ /*
+ * If there are two empty siblings, we have work to do, so
+ * we need to lock the ZAP ptrtbl as WRITER.
+ */
+ if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) {
+ /* We failed to upgrade */
+ if (l != NULL) {
+ zap_put_leaf(l);
+ l = NULL;
+ }
+
+ /*
+ * Usually, the right way to upgrade from a READER lock
+ * to a WRITER lock is to call zap_unlockdir() and
+ * zap_lockdir(), but we do not have a tag. Instead,
+ * we do it in a more sophisticated way.
+ */
+ rw_exit(&zap->zap_rwlock);
+ rw_enter(&zap->zap_rwlock, RW_WRITER);
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+
+ zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+ writer = B_TRUE;
+ }
+
+ /*
+ * Here we hold the WRITER lock for the ptrtbl.
+ * Now we need a WRITER lock on both sibling leaves.
+ * Also, we have to recheck if the leaves are still siblings
+ * and still empty.
+ */
+ if (l == NULL) {
+ /* sibling 0 */
+ if ((err = zap_deref_leaf(zap, (slbit ? sl_hash : hash),
+ tx, RW_WRITER, &l)) != 0)
+ break;
+
+ /*
+ * The leaf isn't empty anymore or
+ * it was shrunk/split while our locks were down.
+ */
+ if (zap_leaf_phys(l)->l_hdr.lh_nentries != 0 ||
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len != prefix_len)
+ break;
+ }
+
+ /* sibling 1 */
+ if ((err = zap_deref_leaf(zap, (slbit ? hash : sl_hash), tx,
+ RW_WRITER, &sl)) != 0)
+ break;
+
+ /*
+ * The leaf isn't empty anymore or
+ * it was shrunk/split while our locks were down.
+ */
+ if (zap_leaf_phys(sl)->l_hdr.lh_nentries != 0 ||
+ zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len) {
+ zap_put_leaf(sl);
+ break;
+ }
+
+ /* If we have gotten here, we have a leaf to collapse */
+ uint64_t idx = (slbit ? prefix : sl_prefix) << prefix_diff;
+ uint64_t nptrs = (1ULL << prefix_diff);
+ uint64_t sl_blkid = sl->l_blkid;
+
+ /*
+ * Set the ptrtbl entries to point to the sibling 0 blkid.
+ */
+ if ((err = zap_set_idx_range_to_blk(zap, idx, nptrs, l->l_blkid,
+ tx)) != 0) {
+ zap_put_leaf(sl);
+ break;
+ }
+
+ /*
+ * Free sibling 1 disk block.
+ */
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ if (sl_blkid == zap_f_phys(zap)->zap_freeblk - 1)
+ trunc = B_TRUE;
+
+ (void) dmu_free_range(zap->zap_objset, zap->zap_object,
+ sl_blkid << bs, 1 << bs, tx);
+ zap_put_leaf(sl);
+
+ zap_f_phys(zap)->zap_num_leafs--;
+
+ /*
+ * Update prefix and prefix_len.
+ */
+ zap_leaf_phys(l)->l_hdr.lh_prefix >>= 1;
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len--;
+
+ prefix = zap_leaf_phys(l)->l_hdr.lh_prefix;
+ prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+ }
+
+ if (trunc)
+ zap_trunc(zap);
+
+ if (l != NULL)
+ zap_put_leaf(l);
+
+ return (err);
+}
+
/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW,
"When iterating ZAP object, prefetch it");
+
+/* CSTYLED */
+ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW,
+ "Enable ZAP shrinking");
diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
index 2ac1e34dccec..908b9efc1813 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
@@ -5823,10 +5823,13 @@ zfs_ioc_clear(zfs_cmd_t *zc)
/*
* If multihost is enabled, resuming I/O is unsafe as another
- * host may have imported the pool.
+ * host may have imported the pool. Check for remote activity.
*/
- if (spa_multihost(spa) && spa_suspended(spa))
- return (SET_ERROR(EINVAL));
+ if (spa_multihost(spa) && spa_suspended(spa) &&
+ spa_mmp_remote_host_activity(spa)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EREMOTEIO));
+ }
spa_vdev_state_enter(spa, SCL_NONE);
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
index 1ba99f4d4624..870343bf4fa3 100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -803,9 +803,10 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
/*
* If we can tell the caller to execute this parent next, do
- * so. We only do this if the parent's zio type matches the
- * child's type. Otherwise dispatch the parent zio in its
- * own taskq.
+ * so. We do this if the parent's zio type matches the child's
+ * type, or if it's a zio_null() with no done callback, and so
+ * has no actual work to do. Otherwise dispatch the parent zio
+ * in its own taskq.
*
* Having the caller execute the parent when possible reduces
* locking on the zio taskq's, reduces context switch
@@ -825,7 +826,8 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
* of writes for spa_sync(), and the chain of ZIL blocks.
*/
if (next_to_executep != NULL && *next_to_executep == NULL &&
- pio->io_type == zio->io_type) {
+ (pio->io_type == zio->io_type ||
+ (pio->io_type == ZIO_TYPE_NULL && !pio->io_done))) {
*next_to_executep = pio;
} else {
zio_taskq_dispatch(pio, type, B_FALSE);
@@ -2532,8 +2534,10 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
"failure and the failure mode property for this pool "
"is set to panic.", spa_name(spa));
- cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
- "failure and has been suspended.\n", spa_name(spa));
+ if (reason != ZIO_SUSPEND_MMP) {
+ cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable "
+ "I/O failure and has been suspended.\n", spa_name(spa));
+ }
(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
NULL, NULL, 0);
@@ -2921,7 +2925,6 @@ static void
zio_gang_inherit_allocator(zio_t *pio, zio_t *cio)
{
cio->io_allocator = pio->io_allocator;
- cio->io_wr_iss_tq = pio->io_wr_iss_tq;
}
static void
diff --git a/sys/contrib/openzfs/module/zfs/zio_inject.c b/sys/contrib/openzfs/module/zfs/zio_inject.c
index 3773e400d799..012a0e3c6c17 100644
--- a/sys/contrib/openzfs/module/zfs/zio_inject.c
+++ b/sys/contrib/openzfs/module/zfs/zio_inject.c
@@ -607,9 +607,11 @@ zio_handle_io_delay(zio_t *zio)
if (vd->vdev_guid != handler->zi_record.zi_guid)
continue;
+ /* also match on I/O type (e.g., -T read) */
if (handler->zi_record.zi_iotype != ZIO_TYPES &&
- handler->zi_record.zi_iotype != zio->io_type)
- continue;
+ handler->zi_record.zi_iotype != zio->io_type) {
+ continue;
+ }
/*
* Defensive; should never happen as the array allocation