Diffstat (limited to 'sys/contrib/openzfs/module/zfs')
19 files changed, 630 insertions, 151 deletions
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index df41e3b49204..bd6dc8edd8ca 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -486,13 +486,13 @@ static taskq_t *arc_flush_taskq; static uint_t zfs_arc_evict_threads = 0; /* The 7 states: */ -arc_state_t ARC_anon; -arc_state_t ARC_mru; -arc_state_t ARC_mru_ghost; -arc_state_t ARC_mfu; -arc_state_t ARC_mfu_ghost; -arc_state_t ARC_l2c_only; -arc_state_t ARC_uncached; +static arc_state_t ARC_anon; +/* */ arc_state_t ARC_mru; +static arc_state_t ARC_mru_ghost; +/* */ arc_state_t ARC_mfu; +static arc_state_t ARC_mfu_ghost; +static arc_state_t ARC_l2c_only; +static arc_state_t ARC_uncached; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, @@ -832,15 +832,15 @@ typedef struct arc_async_flush { #define L2ARC_FEED_TYPES 4 /* L2ARC Performance Tunables */ -uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ -uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ -uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ -uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; -uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ -uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ -int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ -int l2arc_feed_again = B_TRUE; /* turbo warmup */ -int l2arc_norw = B_FALSE; /* no reads during writes */ +static uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ +static uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ +static uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ +static uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; +static uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +static uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ +static int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +static int l2arc_feed_again = B_TRUE; /* turbo warmup */ +static int l2arc_norw = B_FALSE; /* no reads during writes */ static uint_t l2arc_meta_percent = 33; /* limit on headers size */ /* diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index 7403f10d91b7..fccc4c5b5b94 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -2270,14 +2270,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (dn->dn_objset->os_dsl_dataset != NULL) rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); #endif - /* - * We make this assert for private objects as well, but after we - * check if we're already dirty. They are allowed to re-dirty - * in syncing context. - */ - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); mutex_enter(&db->db_mtx); /* @@ -2289,12 +2281,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_state == DB_CACHED || db->db_state == DB_FILL || db->db_state == DB_NOFILL); - mutex_enter(&dn->dn_mtx); - dnode_set_dirtyctx(dn, tx, db); - if (tx->tx_txg > dn->dn_dirty_txg) - dn->dn_dirty_txg = tx->tx_txg; - mutex_exit(&dn->dn_mtx); - if (db->db_blkid == DMU_SPILL_BLKID) dn->dn_have_spill = B_TRUE; @@ -2313,13 +2299,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (dr_next); } - /* - * Only valid if not already dirty. 
- */ - ASSERT(dn->dn_object == 0 || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); - ASSERT3U(dn->dn_nlevels, >, db->db_level); /* diff --git a/sys/contrib/openzfs/module/zfs/ddt.c b/sys/contrib/openzfs/module/zfs/ddt.c index d6658375f810..0dc9adc7fd4f 100644 --- a/sys/contrib/openzfs/module/zfs/ddt.c +++ b/sys/contrib/openzfs/module/zfs/ddt.c @@ -1701,9 +1701,11 @@ ddt_load(spa_t *spa) } } - error = ddt_log_load(ddt); - if (error != 0 && error != ENOENT) - return (error); + if (ddt->ddt_flags & DDT_FLAG_LOG) { + error = ddt_log_load(ddt); + if (error != 0 && error != ENOENT) + return (error); + } DDT_KSTAT_SET(ddt, dds_log_active_entries, avl_numnodes(&ddt->ddt_log_active->ddl_tree)); diff --git a/sys/contrib/openzfs/module/zfs/ddt_log.c b/sys/contrib/openzfs/module/zfs/ddt_log.c index 3d30e244c1f7..c7a2426f3a77 100644 --- a/sys/contrib/openzfs/module/zfs/ddt_log.c +++ b/sys/contrib/openzfs/module/zfs/ddt_log.c @@ -176,11 +176,13 @@ ddt_log_update_stats(ddt_t *ddt) * that's reasonable to expect anyway. */ dmu_object_info_t doi; - uint64_t nblocks; - dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi); - nblocks = doi.doi_physical_blocks_512; - dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi); - nblocks += doi.doi_physical_blocks_512; + uint64_t nblocks = 0; + if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, + &doi) == 0) + nblocks += doi.doi_physical_blocks_512; + if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, + &doi) == 0) + nblocks += doi.doi_physical_blocks_512; ddt_object_t *ddo = &ddt->ddt_log_stats; ddo->ddo_count = @@ -243,6 +245,13 @@ ddt_log_alloc_entry(ddt_t *ddt) } static void +ddt_log_free_entry(ddt_t *ddt, ddt_log_entry_t *ddle) +{ + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); +} + +static void ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) { /* Create the log tree entry from a live or stored entry */ @@ -347,8 +356,7 @@ ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); avl_remove(&ddl->ddl_tree, ddle); - kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? - ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + ddt_log_free_entry(ddt, ddle); return (B_TRUE); } @@ -365,8 +373,7 @@ ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk) ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); avl_remove(&ddl->ddl_tree, ddle); - kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? - ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + ddt_log_free_entry(ddt, ddle); return (B_TRUE); } @@ -527,8 +534,7 @@ ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl) IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree)); while ((ddle = avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) { - kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? 
- ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + ddt_log_free_entry(ddt, ddle); } ASSERT(avl_is_empty(&ddl->ddl_tree)); } @@ -727,7 +733,7 @@ ddt_log_load(ddt_t *ddt) ddle = fe; fe = AVL_NEXT(fl, fe); avl_remove(fl, ddle); - + ddt_log_free_entry(ddt, ddle); ddle = ae; ae = AVL_NEXT(al, ae); } diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index f7f808d5b8f7..a7a5c89bdafb 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -759,6 +759,8 @@ dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, */ uint8_t ibps = ibs - SPA_BLKPTRSHIFT; limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs; + if (limit == 0) + end2 = start2; do { level2++; start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps; @@ -1689,8 +1691,8 @@ dmu_object_cached_size(objset_t *os, uint64_t object, dmu_object_info_from_dnode(dn, &doi); - for (uint64_t off = 0; off < doi.doi_max_offset; - off += dmu_prefetch_max) { + for (uint64_t off = 0; off < doi.doi_max_offset && + dmu_prefetch_max > 0; off += dmu_prefetch_max) { /* dbuf_read doesn't prefetch L1 blocks. */ dmu_prefetch_by_dnode(dn, 1, off, dmu_prefetch_max, ZIO_PRIORITY_SYNC_READ); diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c index a77f338bdfd3..8e6b569c2100 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_objset.c +++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c @@ -2037,6 +2037,8 @@ userquota_updates_task(void *arg) dn->dn_id_flags |= DN_ID_CHKED_BONUS; } dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); + ASSERT3U(dn->dn_dirtycnt, >, 0); + dn->dn_dirtycnt--; mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); @@ -2070,6 +2072,10 @@ dnode_rele_task(void *arg) dnode_t *dn; while ((dn = multilist_sublist_head(list)) != NULL) { + mutex_enter(&dn->dn_mtx); + ASSERT3U(dn->dn_dirtycnt, >, 0); + dn->dn_dirtycnt--; + mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); dnode_rele(dn, &os->os_synced_dnodes); } diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c index 51165d0bf723..3d3a9c713568 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c +++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c @@ -57,19 +57,19 @@ static unsigned int zfetch_max_sec_reap = 2; /* min bytes to prefetch per stream (default 2MB) */ static unsigned int zfetch_min_distance = 2 * 1024 * 1024; /* max bytes to prefetch per stream (default 8MB) */ -unsigned int zfetch_max_distance = 8 * 1024 * 1024; +static unsigned int zfetch_max_distance = 8 * 1024 * 1024; #else /* min bytes to prefetch per stream (default 4MB) */ static unsigned int zfetch_min_distance = 4 * 1024 * 1024; /* max bytes to prefetch per stream (default 64MB) */ -unsigned int zfetch_max_distance = 64 * 1024 * 1024; +static unsigned int zfetch_max_distance = 64 * 1024 * 1024; #endif /* max bytes to prefetch indirects for per stream (default 128MB) */ -unsigned int zfetch_max_idistance = 128 * 1024 * 1024; +static unsigned int zfetch_max_idistance = 128 * 1024 * 1024; /* max request reorder distance within a stream (default 16MB) */ -unsigned int zfetch_max_reorder = 16 * 1024 * 1024; +static unsigned int zfetch_max_reorder = 16 * 1024 * 1024; /* Max log2 fraction of holes in a stream */ -unsigned int zfetch_hole_shift = 2; +static unsigned int zfetch_hole_shift = 2; typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c index 
963ff41232a3..6c150d31c669 100644 --- a/sys/contrib/openzfs/module/zfs/dnode.c +++ b/sys/contrib/openzfs/module/zfs/dnode.c @@ -173,9 +173,7 @@ dnode_cons(void *arg, void *unused, int kmflag) dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; - dn->dn_dirty_txg = 0; - dn->dn_dirtyctx = 0; - dn->dn_dirtyctx_firstset = NULL; + dn->dn_dirtycnt = 0; dn->dn_bonus = NULL; dn->dn_have_spill = B_FALSE; dn->dn_zio = NULL; @@ -229,9 +227,7 @@ dnode_dest(void *arg, void *unused) ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_free_txg); ASSERT0(dn->dn_assigned_txg); - ASSERT0(dn->dn_dirty_txg); - ASSERT0(dn->dn_dirtyctx); - ASSERT0P(dn->dn_dirtyctx_firstset); + ASSERT0(dn->dn_dirtycnt); ASSERT0P(dn->dn_bonus); ASSERT(!dn->dn_have_spill); ASSERT0P(dn->dn_zio); @@ -692,10 +688,8 @@ dnode_destroy(dnode_t *dn) dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; - dn->dn_dirty_txg = 0; + dn->dn_dirtycnt = 0; - dn->dn_dirtyctx = 0; - dn->dn_dirtyctx_firstset = NULL; if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); dbuf_destroy(dn->dn_bonus); @@ -800,11 +794,9 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dn->dn_bonuslen = bonuslen; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; - dn->dn_dirtyctx = 0; dn->dn_free_txg = 0; - dn->dn_dirtyctx_firstset = NULL; - dn->dn_dirty_txg = 0; + dn->dn_dirtycnt = 0; dn->dn_allocated_txg = tx->tx_txg; dn->dn_id_flags = 0; @@ -955,9 +947,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ndn->dn_allocated_txg = odn->dn_allocated_txg; ndn->dn_free_txg = odn->dn_free_txg; ndn->dn_assigned_txg = odn->dn_assigned_txg; - ndn->dn_dirty_txg = odn->dn_dirty_txg; - ndn->dn_dirtyctx = odn->dn_dirtyctx; - ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; + ndn->dn_dirtycnt = odn->dn_dirtycnt; ASSERT0(zfs_refcount_count(&odn->dn_tx_holds)); zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds); ASSERT(avl_is_empty(&ndn->dn_dbufs)); @@ -1020,9 +1010,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) odn->dn_allocated_txg = 0; odn->dn_free_txg = 0; odn->dn_assigned_txg = 0; - odn->dn_dirty_txg = 0; - odn->dn_dirtyctx = 0; - odn->dn_dirtyctx_firstset = NULL; + odn->dn_dirtycnt = 0; odn->dn_have_spill = B_FALSE; odn->dn_zio = NULL; odn->dn_oldused = 0; @@ -1273,8 +1261,8 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots) } else if (DN_SLOT_IS_PTR(dn)) { mutex_enter(&dn->dn_mtx); boolean_t can_free = (dn->dn_type == DMU_OT_NONE && - zfs_refcount_is_zero(&dn->dn_holds) && - !DNODE_IS_DIRTY(dn)); + dn->dn_dirtycnt == 0 && + zfs_refcount_is_zero(&dn->dn_holds)); mutex_exit(&dn->dn_mtx); if (!can_free) @@ -1757,17 +1745,23 @@ dnode_hold(objset_t *os, uint64_t object, const void *tag, dnode_t **dnp) * reference on the dnode. Returns FALSE if unable to add a * new reference. 
*/ +static boolean_t +dnode_add_ref_locked(dnode_t *dn, const void *tag) +{ + ASSERT(MUTEX_HELD(&dn->dn_mtx)); + if (zfs_refcount_is_zero(&dn->dn_holds)) + return (FALSE); + VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag)); + return (TRUE); +} + boolean_t dnode_add_ref(dnode_t *dn, const void *tag) { mutex_enter(&dn->dn_mtx); - if (zfs_refcount_is_zero(&dn->dn_holds)) { - mutex_exit(&dn->dn_mtx); - return (FALSE); - } - VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag)); + boolean_t r = dnode_add_ref_locked(dn, tag); mutex_exit(&dn->dn_mtx); - return (TRUE); + return (r); } void @@ -1830,31 +1824,20 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots) } /* - * Checks if the dnode itself is dirty, or is carrying any uncommitted records. - * It is important to check both conditions, as some operations (eg appending - * to a file) can dirty both as a single logical unit, but they are not synced - * out atomically, so checking one and not the other can result in an object - * appearing to be clean mid-way through a commit. + * Test if the dnode is dirty, or carrying uncommitted records. * - * Do not change this lightly! If you get it wrong, dmu_offset_next() can - * detect a hole where there is really data, leading to silent corruption. + * dn_dirtycnt is the number of txgs this dnode is dirty on. It's incremented + * in dnode_setdirty() the first time the dnode is dirtied on a txg, and + * decremented in either dnode_rele_task() or userquota_updates_task() when the + * txg is synced out. */ boolean_t dnode_is_dirty(dnode_t *dn) { mutex_enter(&dn->dn_mtx); - - for (int i = 0; i < TXG_SIZE; i++) { - if (multilist_link_active(&dn->dn_dirty_link[i]) || - !list_is_empty(&dn->dn_dirty_records[i])) { - mutex_exit(&dn->dn_mtx); - return (B_TRUE); - } - } - + boolean_t dirty = (dn->dn_dirtycnt != 0); mutex_exit(&dn->dn_mtx); - - return (B_FALSE); + return (dirty); } void @@ -1916,7 +1899,11 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) * dnode will hang around after we finish processing its * children. */ - VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); + mutex_enter(&dn->dn_mtx); + VERIFY(dnode_add_ref_locked(dn, (void *)(uintptr_t)tx->tx_txg)); + dn->dn_dirtycnt++; + ASSERT3U(dn->dn_dirtycnt, <=, 3); + mutex_exit(&dn->dn_mtx); (void) dbuf_dirty(dn->dn_dbuf, tx); @@ -2221,32 +2208,6 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, mutex_exit(&dn->dn_dbufs_mtx); } -void -dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, const void *tag) -{ - /* - * Don't set dirtyctx to SYNC if we're just modifying this as we - * initialize the objset. 
- */ - if (dn->dn_dirtyctx == DN_UNDIRTIED) { - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - - if (ds != NULL) { - rrw_enter(&ds->ds_bp_rwlock, RW_READER, tag); - } - if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { - if (dmu_tx_is_syncing(tx)) - dn->dn_dirtyctx = DN_DIRTY_SYNC; - else - dn->dn_dirtyctx = DN_DIRTY_OPEN; - dn->dn_dirtyctx_firstset = tag; - } - if (ds != NULL) { - rrw_exit(&ds->ds_bp_rwlock, tag); - } - } -} - static void dnode_partial_zero(dnode_t *dn, uint64_t off, uint64_t blkoff, uint64_t len, dmu_tx_t *tx) diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index dceafbc27556..6f7c060f97f8 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -251,11 +251,11 @@ spa_mode_t spa_mode_global = SPA_MODE_UNINIT; #ifdef ZFS_DEBUG /* - * Everything except dprintf, set_error, spa, and indirect_remap is on - * by default in debug builds. + * Everything except dprintf, set_error, indirect_remap, and raidz_reconstruct + * is on by default in debug builds. */ int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR | - ZFS_DEBUG_INDIRECT_REMAP); + ZFS_DEBUG_INDIRECT_REMAP | ZFS_DEBUG_RAIDZ_RECONSTRUCT); #else int zfs_flags = 0; #endif diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index 9cf35e379000..fc6d445f9785 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -29,7 +29,7 @@ * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. - * Copyright (c) 2021, Klara Inc. + * Copyright (c) 2021, 2025, Klara, Inc. * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. */ @@ -100,7 +100,7 @@ static uint_t zfs_vdev_default_ms_shift = 29; /* upper limit for metaslab size (16G) */ static uint_t zfs_vdev_max_ms_shift = 34; -int vdev_validate_skip = B_FALSE; +static int vdev_validate_skip = B_FALSE; /* * Since the DTL space map of a vdev is not expected to have a lot of @@ -1086,6 +1086,10 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } } + if (top_level && (ops == &vdev_raidz_ops || ops == &vdev_draid_ops)) + vd->vdev_autosit = + vdev_prop_default_numeric(VDEV_PROP_AUTOSIT); + /* * Add ourselves to the parent's list of children. */ @@ -1187,6 +1191,9 @@ vdev_free(vdev_t *vd) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); + if (vd->vdev_prev_histo) + kmem_free(vd->vdev_prev_histo, + sizeof (uint64_t) * VDEV_L_HISTO_BUCKETS); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); @@ -3857,6 +3864,26 @@ vdev_load(vdev_t *vd) } } + if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { + spa_t *spa = vd->vdev_spa; + uint64_t autosit; + + error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + vdev_prop_to_name(VDEV_PROP_AUTOSIT), sizeof (autosit), + 1, &autosit); + if (error == 0) { + vd->vdev_autosit = autosit == 1; + } else if (error == ENOENT) { + vd->vdev_autosit = vdev_prop_default_numeric( + VDEV_PROP_AUTOSIT); + } else { + vdev_dbgmsg(vd, + "vdev_load: zap_lookup(top_zap=%llu) " + "failed [error=%d]", + (u_longlong_t)vd->vdev_top_zap, error); + } + } + /* * Load any rebuild state from the top-level vdev zap. 
*/ @@ -4616,6 +4643,8 @@ vdev_clear(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_dio_verify_errors = 0; vd->vdev_stat.vs_slow_ios = 0; + atomic_store_64(&vd->vdev_outlier_count, 0); + vd->vdev_read_sit_out_expire = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); @@ -6107,6 +6136,56 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) } vd->vdev_failfast = intval & 1; break; + case VDEV_PROP_SIT_OUT: + /* Only expose this for a draid or raidz leaf */ + if (!vd->vdev_ops->vdev_op_leaf || + vd->vdev_top == NULL || + (vd->vdev_top->vdev_ops != &vdev_raidz_ops && + vd->vdev_top->vdev_ops != &vdev_draid_ops)) { + error = ENOTSUP; + break; + } + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + if (intval == 1) { + vdev_t *ancestor = vd; + while (ancestor->vdev_parent != vd->vdev_top) + ancestor = ancestor->vdev_parent; + vdev_t *pvd = vd->vdev_top; + uint_t sitouts = 0; + for (int i = 0; i < pvd->vdev_children; i++) { + if (pvd->vdev_child[i] == ancestor) + continue; + if (vdev_sit_out_reads( + pvd->vdev_child[i], 0)) { + sitouts++; + } + } + if (sitouts >= vdev_get_nparity(pvd)) { + error = ZFS_ERR_TOO_MANY_SITOUTS; + break; + } + if (error == 0) + vdev_raidz_sit_child(vd, + INT64_MAX - gethrestime_sec()); + } else { + vdev_raidz_unsit_child(vd); + } + break; + case VDEV_PROP_AUTOSIT: + if (vd->vdev_ops != &vdev_raidz_ops && + vd->vdev_ops != &vdev_draid_ops) { + error = ENOTSUP; + break; + } + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_autosit = intval == 1; + break; case VDEV_PROP_CHECKSUM_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; @@ -6456,6 +6535,19 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) ZPROP_SRC_NONE); } continue; + case VDEV_PROP_SIT_OUT: + /* Only expose this for a draid or raidz leaf */ + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_top != NULL && + (vd->vdev_top->vdev_ops == + &vdev_raidz_ops || + vd->vdev_top->vdev_ops == + &vdev_draid_ops)) { + vdev_prop_add_list(outnvl, propname, + NULL, vdev_sit_out_reads(vd, 0), + ZPROP_SRC_NONE); + } + continue; case VDEV_PROP_TRIM_SUPPORT: /* only valid for leaf vdevs */ if (vd->vdev_ops->vdev_op_leaf) { @@ -6506,6 +6598,29 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) vdev_prop_add_list(outnvl, propname, strval, intval, src); break; + case VDEV_PROP_AUTOSIT: + /* Only raidz and draid vdevs can have this property */ + if (vd->vdev_ops != &vdev_raidz_ops && + vd->vdev_ops != &vdev_draid_ops) { + src = ZPROP_SRC_NONE; + intval = ZPROP_BOOLEAN_NA; + } else { + err = vdev_prop_get_int(vd, prop, + &intval); + if (err && err != ENOENT) + break; + + if (intval == + vdev_prop_default_numeric(prop)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + } + + vdev_prop_add_list(outnvl, propname, NULL, + intval, src); + break; + case VDEV_PROP_CHECKSUM_N: case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c index a05289102af2..8588cfee3f7d 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_draid.c +++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2018 Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + * Copyright (c) 2025, Klara, Inc. 
*/ #include <sys/zfs_context.h> @@ -1996,6 +1997,33 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) rc->rc_allow_repair = 1; } } + + if (vdev_sit_out_reads(cvd, zio->io_flags)) { + rr->rr_outlier_cnt++; + ASSERT0(rc->rc_latency_outlier); + rc->rc_latency_outlier = 1; + } + } + + /* + * When the row contains a latency outlier and sufficient parity + * exists to reconstruct the column data, then skip reading the + * known slow child vdev as a performance optimization. + */ + if (rr->rr_outlier_cnt > 0 && + (rr->rr_firstdatacol - rr->rr_missingparity) >= + (rr->rr_missingdata + 1)) { + + for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error == 0 && rc->rc_latency_outlier) { + rr->rr_missingdata++; + rc->rc_error = SET_ERROR(EAGAIN); + rc->rc_skipped = 1; + break; + } + } } /* diff --git a/sys/contrib/openzfs/module/zfs/vdev_file.c b/sys/contrib/openzfs/module/zfs/vdev_file.c index f457669bc809..20b4db65ec06 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_file.c +++ b/sys/contrib/openzfs/module/zfs/vdev_file.c @@ -228,7 +228,8 @@ vdev_file_io_strategy(void *arg) abd_return_buf_copy(zio->io_abd, buf, size); } else { buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); - err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); + err = zfs_file_pwrite(vf->vf_file, buf, size, off, + vd->vdev_ashift, &resid); abd_return_buf(zio->io_abd, buf, size); } zio->io_error = err; diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c index c12713b107bf..e69e5598939e 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_queue.c +++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c @@ -122,7 +122,7 @@ * The maximum number of i/os active to each device. Ideally, this will be >= * the sum of each queue's max_active. */ -uint_t zfs_vdev_max_active = 1000; +static uint_t zfs_vdev_max_active = 1000; /* * Per-queue limits on the number of i/os active to each device. If the diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c index b597d6daefde..56b8e3b60b22 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c @@ -24,6 +24,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved. + * Copyright (c) 2025, Klara, Inc. */ #include <sys/zfs_context.h> @@ -356,6 +357,32 @@ unsigned long raidz_expand_max_reflow_bytes = 0; uint_t raidz_expand_pause_point = 0; /* + * This represents the duration for a slow drive read sit out. + */ +static unsigned long vdev_read_sit_out_secs = 600; + +/* + * How often each RAID-Z and dRAID vdev will check for slow disk outliers. + * Increasing this interval will reduce the sensitivity of detection (since all + * I/Os since the last check are included in the statistics), but will slow the + * response to a disk developing a problem. + * + * Defaults to once per second; setting extremely small values may cause + * negative performance effects. + */ +static hrtime_t vdev_raidz_outlier_check_interval_ms = 1000; + +/* + * When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is + * used to determine how far out an outlier must be before it counts as an event + * worth considering. 
+ * + * Smaller values will result in more aggressive sitting out of disks that may + * have problems, but may significantly increase the rate of spurious sit-outs. + */ +static uint32_t vdev_raidz_outlier_insensitivity = 50; + +/* * Maximum amount of copy io's outstanding at once. */ #ifdef _ILP32 @@ -2311,6 +2338,41 @@ vdev_raidz_min_asize(vdev_t *vd) vd->vdev_children); } +/* + * Return B_TRUE if a read should be skipped due to being too slow. + * + * vdev_child_slow_outlier() looks for outliers based on disk + * latency from the most recent child reads. Here we're checking if, + * over time, a disk has been an outlier too many times and is + * now in a sit out period. + */ +boolean_t +vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags) +{ + if (vdev_read_sit_out_secs == 0) + return (B_FALSE); + + /* Avoid skipping a data column read when scrubbing */ + if (io_flags & ZIO_FLAG_SCRUB) + return (B_FALSE); + + if (!vd->vdev_ops->vdev_op_leaf) { + boolean_t sitting = B_FALSE; + for (int c = 0; c < vd->vdev_children; c++) { + sitting |= vdev_sit_out_reads(vd->vdev_child[c], + io_flags); + } + return (sitting); + } + + if (vd->vdev_read_sit_out_expire >= gethrestime_sec()) + return (B_TRUE); + + vd->vdev_read_sit_out_expire = 0; + + return (B_FALSE); +} + void vdev_raidz_child_done(zio_t *zio) { @@ -2475,6 +2537,45 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) rc->rc_skipped = 1; continue; } + + if (vdev_sit_out_reads(cvd, zio->io_flags)) { + rr->rr_outlier_cnt++; + ASSERT0(rc->rc_latency_outlier); + rc->rc_latency_outlier = 1; + } + } + + /* + * When the row contains a latency outlier and sufficient parity + * exists to reconstruct the column data, then skip reading the + * known slow child vdev as a performance optimization. 
*/ + if (rr->rr_outlier_cnt > 0 && + (rr->rr_firstdatacol - rr->rr_missingparity) >= + (rr->rr_missingdata + 1)) { + + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error == 0 && rc->rc_latency_outlier) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(EAGAIN); + rc->rc_skipped = 1; + break; + } + } + } + + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error || rc->rc_size == 0) + continue; + if (forceparity || c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { @@ -2498,6 +2599,7 @@ vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) ASSERT3U(prc->rc_devidx, ==, i); vdev_t *cvd = vd->vdev_child[i]; + if (!vdev_readable(cvd)) { prc->rc_error = SET_ERROR(ENXIO); prc->rc_tried = 1; /* don't even try */ @@ -2774,6 +2876,239 @@ vdev_raidz_worst_error(raidz_row_t *rr) return (error); } +/* + * Find the median value from a set of n values + */ +static uint64_t +latency_median_value(const uint64_t *data, size_t n) +{ + uint64_t m; + + if (n % 2 == 0) + m = (data[(n >> 1) - 1] + data[n >> 1]) >> 1; + else + m = data[((n + 1) >> 1) - 1]; + + return (m); } + +/* + * Calculate the outlier fence from a set of n latency values + * + * fence = Q3 + vdev_raidz_outlier_insensitivity x (Q3 - Q1) + */ +static uint64_t +latency_quartiles_fence(const uint64_t *data, size_t n, uint64_t *iqr) +{ + uint64_t q1 = latency_median_value(&data[0], n >> 1); + uint64_t q3 = latency_median_value(&data[(n + 1) >> 1], n >> 1); + + /* + * To avoid detecting false positive outliers when N is small + * and the latency values are very close, make sure the IQR + * is at least 25% of Q1. + */ + *iqr = MAX(q3 - q1, q1 / 4); + + return (q3 + (*iqr * vdev_raidz_outlier_insensitivity)); } +#define LAT_CHILDREN_MIN 5 +#define LAT_OUTLIER_LIMIT 20 + +static int +latency_compare(const void *arg1, const void *arg2) +{ + const uint64_t *l1 = (uint64_t *)arg1; + const uint64_t *l2 = (uint64_t *)arg2; + + return (TREE_CMP(*l1, *l2)); +} + +void +vdev_raidz_sit_child(vdev_t *svd, uint64_t secs) +{ + for (int c = 0; c < svd->vdev_children; c++) + vdev_raidz_sit_child(svd->vdev_child[c], secs); + + if (!svd->vdev_ops->vdev_op_leaf) + return; + + /* Begin a sit out period for this slow drive */ + svd->vdev_read_sit_out_expire = gethrestime_sec() + + secs; + + /* Count each slow io period */ + mutex_enter(&svd->vdev_stat_lock); + svd->vdev_stat.vs_slow_ios++; + mutex_exit(&svd->vdev_stat_lock); +} + +void +vdev_raidz_unsit_child(vdev_t *vd) +{ + for (int c = 0; c < vd->vdev_children; c++) + vdev_raidz_unsit_child(vd->vdev_child[c]); + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + vd->vdev_read_sit_out_expire = 0; +} + +/* + * Check for any latency outlier from the latest set of child reads. + * + * Uses Tukey's fence, with K = 50, for detecting extreme outliers. This + * rule defines extreme outliers as data points outside the fence of the + * third quartile plus fifty times the Interquartile Range (IQR). This range + * is the distance between the first and third quartile. + * + * Fifty is an extremely large value for Tukey's fence, but the outliers we're + * attempting to detect here are orders of magnitude larger than the + * median. This large value should capture any truly faulty disk quickly, + * without causing spurious sit-outs. 
+ * + * To further avoid spurious sit-outs, vdevs must be detected multiple times + * as an outlier before they are sat out, and outlier counts will gradually decay. + * Every nchildren times we have detected an outlier, we subtract 2 from the + * outlier count of all children. If detected outliers are close to uniformly + * distributed, this will result in the outlier count remaining close to 0 + * (in expectation; over long enough time-scales, spurious sit-outs are still + * possible). + */ +static void +vdev_child_slow_outlier(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + if (!vd->vdev_autosit || vdev_read_sit_out_secs == 0 || + vd->vdev_children < LAT_CHILDREN_MIN) + return; + + hrtime_t now = getlrtime(); + uint64_t last = atomic_load_64(&vd->vdev_last_latency_check); + + if ((now - last) < MSEC2NSEC(vdev_raidz_outlier_check_interval_ms)) + return; + + /* Allow a single winner when there are racing callers. */ + if (atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last) + return; + + int children = vd->vdev_children; + uint64_t *lat_data = kmem_alloc(sizeof (uint64_t) * children, KM_SLEEP); + + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + if (cvd->vdev_prev_histo == NULL) { + mutex_enter(&cvd->vdev_stat_lock); + size_t size = + sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); + cvd->vdev_prev_histo = kmem_zalloc(size, KM_SLEEP); + memcpy(cvd->vdev_prev_histo, + cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ], + size); + mutex_exit(&cvd->vdev_stat_lock); + } + } + uint64_t max = 0; + vdev_t *svd = NULL; + uint_t sitouts = 0; + boolean_t skip = B_FALSE, svd_sitting = B_FALSE; + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + boolean_t sitting = vdev_sit_out_reads(cvd, 0) || + cvd->vdev_state != VDEV_STATE_HEALTHY; + + /* We can't sit out more disks than we have parity */ + if (sitting && ++sitouts >= vdev_get_nparity(vd)) + skip = B_TRUE; + + mutex_enter(&cvd->vdev_stat_lock); + + uint64_t *prev_histo = cvd->vdev_prev_histo; + uint64_t *histo = + cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ]; + if (skip) { + size_t size = + sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); + memcpy(prev_histo, histo, size); + mutex_exit(&cvd->vdev_stat_lock); + continue; + } + uint64_t count = 0; + lat_data[c] = 0; + for (int i = 0; i < VDEV_L_HISTO_BUCKETS; i++) { + uint64_t this_count = histo[i] - prev_histo[i]; + lat_data[c] += (1ULL << i) * this_count; + count += this_count; + } + size_t size = sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); + memcpy(prev_histo, histo, size); + mutex_exit(&cvd->vdev_stat_lock); + lat_data[c] /= MAX(1, count); + + /* Wait until all disks have been read from */ + if (lat_data[c] == 0 && !sitting) { + skip = B_TRUE; + continue; + } + + /* Keep track of the vdev with largest value */ + if (lat_data[c] > max) { + max = lat_data[c]; + svd = cvd; + svd_sitting = sitting; + } + } + + if (skip) { + kmem_free(lat_data, sizeof (uint64_t) * children); + return; + } + + qsort((void *)lat_data, children, sizeof (uint64_t), latency_compare); + + uint64_t iqr; + uint64_t fence = latency_quartiles_fence(lat_data, children, &iqr); + + ASSERT3U(lat_data[children - 1], ==, max); + if (max > fence && !svd_sitting) { + ASSERT3U(iqr, >, 0); + uint64_t incr = MAX(1, MIN((max - fence) / iqr, + LAT_OUTLIER_LIMIT / 4)); + vd->vdev_outlier_count += incr; + if (vd->vdev_outlier_count >= children) { + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + cvd->vdev_outlier_count -= 2; + cvd->vdev_outlier_count = MAX(0, + 
cvd->vdev_outlier_count); + } + vd->vdev_outlier_count = 0; + } + /* + * Keep track of how many times this child has had + * an outlier read. A disk that persistently has a + * higher outlier count than its peers will be considered + * a slow disk. + */ + svd->vdev_outlier_count += incr; + if (svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) { + ASSERT0(svd->vdev_read_sit_out_expire); + vdev_raidz_sit_child(svd, vdev_read_sit_out_secs); + (void) zfs_ereport_post(FM_EREPORT_ZFS_SITOUT, + zio->io_spa, svd, NULL, NULL, 0); + vdev_dbgmsg(svd, "begin read sit out for %d secs", + (int)vdev_read_sit_out_secs); + + for (int c = 0; c < vd->vdev_children; c++) + vd->vdev_child[c]->vdev_outlier_count = 0; + } + } + + kmem_free(lat_data, sizeof (uint64_t) * children); +} + static void vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) { @@ -3515,6 +3850,9 @@ vdev_raidz_io_done(zio_t *zio) raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_verified(zio, rr); } + /* Periodically check for a read outlier */ + if (zio->io_type == ZIO_TYPE_READ) + vdev_child_slow_outlier(zio); zio_checksum_verified(zio); } else { /* @@ -5155,3 +5493,10 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, "For expanded RAIDZ, automatically start a pool scrub when expansion " "completes"); +ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW, + "Raidz/draid slow disk sit out time period in seconds"); +ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_check_interval_ms, U64, + ZMOD_RW, "Interval to check for slow raidz/draid children"); +ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_insensitivity, UINT, + ZMOD_RW, "How insensitive the slow raidz/draid child check should be"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c index 2f7a739da241..2ce0121324ad 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_removal.c +++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c @@ -105,7 +105,7 @@ static const uint_t zfs_remove_max_copy_bytes = 64 * 1024 * 1024; * * See also the accessor function spa_remove_max_segment(). */ -uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE; +static uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE; /* * Ignore hard IO errors during device removal. When set if a device @@ -137,7 +137,7 @@ uint_t vdev_removal_max_span = 32 * 1024; * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a removal. */ -int zfs_removal_suspend_progress = 0; +static int zfs_removal_suspend_progress = 0; #define VDEV_REMOVAL_ZAP_OBJS "lzap" diff --git a/sys/contrib/openzfs/module/zfs/zfeature.c b/sys/contrib/openzfs/module/zfs/zfeature.c index 0816ea134bf3..4cf9e0dbb405 100644 --- a/sys/contrib/openzfs/module/zfs/zfeature.c +++ b/sys/contrib/openzfs/module/zfs/zfeature.c @@ -308,6 +308,7 @@ feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount, ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature)); uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? 
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + ASSERT(MUTEX_HELD(&spa->spa_feat_stats_lock)); VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid, sizeof (uint64_t), 1, &refcount, tx)); @@ -360,7 +361,9 @@ feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) feature->fi_guid, 1, strlen(feature->fi_desc) + 1, feature->fi_desc, tx)); + mutex_enter(&spa->spa_feat_stats_lock); feature_sync(spa, feature, initial_refcount, tx); + mutex_exit(&spa->spa_feat_stats_lock); if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) { uint64_t enabling_txg = dmu_tx_get_txg(tx); @@ -416,6 +419,7 @@ feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, ASSERT(dmu_tx_is_syncing(tx)); ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + mutex_enter(&spa->spa_feat_stats_lock); VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP); switch (action) { @@ -433,6 +437,7 @@ feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, } feature_sync(spa, feature, refcount, tx); + mutex_exit(&spa->spa_feat_stats_lock); } void diff --git a/sys/contrib/openzfs/module/zfs/zfs_crrd.c b/sys/contrib/openzfs/module/zfs/zfs_crrd.c index f9267ed41d71..30d4c7c36897 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_crrd.c +++ b/sys/contrib/openzfs/module/zfs/zfs_crrd.c @@ -162,9 +162,9 @@ dbrrd_add(dbrrd_t *db, hrtime_t time, uint64_t txg) daydiff = time - rrd_tail(&db->dbr_days); monthdiff = time - rrd_tail(&db->dbr_months); - if (monthdiff >= 0 && monthdiff >= SEC2NSEC(30 * 24 * 60 * 60)) + if (monthdiff >= 0 && monthdiff >= 30 * 24 * 60 * 60) rrd_add(&db->dbr_months, time, txg); - else if (daydiff >= 0 && daydiff >= SEC2NSEC(24 * 60 * 60)) + else if (daydiff >= 0 && daydiff >= 24 * 60 * 60) rrd_add(&db->dbr_days, time, txg); else if (minutedif >= 0) rrd_add(&db->dbr_minutes, time, txg); @@ -208,7 +208,8 @@ dbrrd_closest(hrtime_t tv, const rrd_data_t *r1, const rrd_data_t *r2) if (r2 == NULL) return (r1); - return (ABS(tv - r1->rrdd_time) < ABS(tv - r2->rrdd_time) ? r1 : r2); + return (ABS(tv - (hrtime_t)r1->rrdd_time) < + ABS(tv - (hrtime_t)r2->rrdd_time) ? 
r1 : r2); } uint64_t diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index 121b966b9864..5ca7c2320c4e 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -683,6 +683,7 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) dsl_dataset_t *ds; const char *cp; int error; + boolean_t rawok = (zc->zc_flags & 0x8); /* * Generate the current snapshot name from the given objsetid, then @@ -705,6 +706,10 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); + if (error != 0 && rawok == B_TRUE) { + error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, + ZFS_DELEG_PERM_SEND_RAW, cr); + } dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); @@ -714,9 +719,17 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + boolean_t rawok = nvlist_exists(innvl, "rawok"); + int error; + (void) innvl; - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_SEND, cr)); + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_SEND, cr); + if (error != 0 && rawok == B_TRUE) { + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_SEND_RAW, cr); + } + return (error); } static int @@ -4726,7 +4739,7 @@ zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) error = error ? error : resume_err; } zfs_vfs_rele(zfsvfs); - } else if ((zv = zvol_suspend(fsname)) != NULL) { + } else if (zvol_suspend(fsname, &zv) == 0) { error = dsl_dataset_rollback(fsname, target, zvol_tag(zv), outnvl); zvol_resume(zv); @@ -5448,7 +5461,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, const char *origin, } error = error ? error : end_err; zfs_vfs_rele(zfsvfs); - } else if ((zv = zvol_suspend(tofs)) != NULL) { + } else if (zvol_suspend(tofs, &zv) == 0) { error = dmu_recv_end(&drc, zvol_tag(zv)); zvol_resume(zv); } else { @@ -7619,7 +7632,7 @@ zfs_ioctl_init(void) zfs_ioctl_register("scrub", ZFS_IOC_POOL_SCRUB, zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, - POOL_CHECK_NONE, B_TRUE, B_TRUE, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub)); zfs_ioctl_register("get_props", ZFS_IOC_POOL_GET_PROPS, diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index 2fd3e1c37045..faced0db7e9e 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -1145,20 +1145,34 @@ zvol_tag(zvol_state_t *zv) /* * Suspend the zvol for recv and rollback. */ -zvol_state_t * -zvol_suspend(const char *name) +int +zvol_suspend(const char *name, zvol_state_t **zvp) { zvol_state_t *zv; zv = zvol_find_by_name(name, RW_WRITER); if (zv == NULL) - return (NULL); + return (SET_ERROR(ENOENT)); /* block all I/O, release in zvol_resume. */ ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); + /* + * If it's being removed, unlock and return error. It doesn't make any + * sense to try to suspend a zvol being removed, but being here also + * means that zvol_remove_minors_impl() is about to call zvol_remove() + * and then destroy the zvol_state_t, so returning a pointer to it for + * the caller to mess with would be a disaster anyway. 
+ */ + if (zv->zv_flags & ZVOL_REMOVING) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + /* NB: Returning EIO here to match zfsvfs_teardown() */ + return (SET_ERROR(EIO)); + } + atomic_inc(&zv->zv_suspend_ref); if (zv->zv_open_count > 0) @@ -1171,7 +1185,8 @@ zvol_suspend(const char *name) mutex_exit(&zv->zv_state_lock); /* zv_suspend_lock is released in zvol_resume() */ - return (zv); + *zvp = zv; + return (0); } int
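
The dnode.c and dmu_objset.c hunks above replace the old dn_dirtyctx/dn_dirty_txg tracking with a single dn_dirtycnt counter: dnode_setdirty() increments it the first time a dnode is dirtied in a txg, dnode_rele_task() and userquota_updates_task() decrement it when that txg syncs out, and dnode_is_dirty() reduces to a non-zero test. Below is a minimal user-space model of that invariant; it is not part of the commit, and the toy_* names are illustrative only.

/* Build: cc -o dirtycnt dirtycnt.c */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_dnode {
	unsigned dirtycnt;	/* models dn_dirtycnt */
};

/* First dirty in a txg: models the increment in dnode_setdirty(). */
static void
toy_setdirty(struct toy_dnode *dn)
{
	dn->dirtycnt++;
	assert(dn->dirtycnt <= 3);	/* mirrors ASSERT3U(dn_dirtycnt, <=, 3) */
}

/* Txg synced out: models dnode_rele_task()/userquota_updates_task(). */
static void
toy_synced(struct toy_dnode *dn)
{
	assert(dn->dirtycnt > 0);
	dn->dirtycnt--;
}

/* Models dnode_is_dirty(): dirty in any txg means dirty. */
static bool
toy_is_dirty(const struct toy_dnode *dn)
{
	return (dn->dirtycnt != 0);
}

int
main(void)
{
	struct toy_dnode dn = { 0 };

	toy_setdirty(&dn);	/* dirtied in txg N */
	toy_setdirty(&dn);	/* dirtied again in txg N+1 */
	toy_synced(&dn);	/* txg N syncs out */
	printf("dirty: %s\n", toy_is_dirty(&dn) ? "yes" : "no");	/* yes */
	toy_synced(&dn);	/* txg N+1 syncs out */
	printf("dirty: %s\n", toy_is_dirty(&dn) ? "yes" : "no");	/* no */
	return (0);
}

The counter stays at or below 3 because at most three txgs can be in flight at once (open, quiescing, syncing), which is what the ASSERT3U in dnode_setdirty() encodes.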
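In vdev_raidz.c, vdev_child_slow_outlier() derives each child's mean read latency from the delta between the current vsx_disk_histo[ZIO_TYPE_READ] histogram and the vdev_prev_histo snapshot taken at the previous check. A rough user-space sketch of that conversion follows; it assumes, as the power-of-two weighting in the loop implies, that bucket i counts reads completing in about 2^i ns, and NBUCKETS is only a stand-in for VDEV_L_HISTO_BUCKETS.

/* Build: cc -o histmean histmean.c */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define NBUCKETS 37	/* illustrative; stands in for VDEV_L_HISTO_BUCKETS */

static uint64_t
mean_latency_delta(uint64_t *prev, const uint64_t *cur)
{
	uint64_t sum = 0, count = 0;

	for (int i = 0; i < NBUCKETS; i++) {
		uint64_t d = cur[i] - prev[i];
		sum += (1ULL << i) * d;	/* weight bucket by ~2^i ns */
		count += d;
	}
	/* Roll the snapshot forward, as the kernel code does. */
	memcpy(prev, cur, sizeof (uint64_t) * NBUCKETS);
	return (count ? sum / count : 0);
}

int
main(void)
{
	uint64_t prev[NBUCKETS] = { 0 }, cur[NBUCKETS] = { 0 };

	cur[17] = 90;	/* 90 reads near 128 us (2^17 ns) */
	cur[20] = 10;	/* 10 reads near 1 ms (2^20 ns) */
	printf("mean ~%ju ns\n", (uintmax_t)mean_latency_delta(prev, cur));
	return (0);
}

Because only the delta since the last snapshot is weighted, each check measures just the reads issued during the last interval rather than the device's lifetime average.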
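The fence arithmetic in latency_median_value() and latency_quartiles_fence() is easy to sanity-check in isolation. The sketch below (not from the commit) mirrors the same index math and the Q1/4 floor on the IQR, using hypothetical per-child mean latencies for a 10-wide vdev where one child is dramatically slower.

/* Build: cc -o fence fence.c */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define INSENSITIVITY 50	/* mirrors vdev_raidz_outlier_insensitivity */

static int
cmp_u64(const void *a, const void *b)
{
	uint64_t l = *(const uint64_t *)a;
	uint64_t r = *(const uint64_t *)b;

	return ((l > r) - (l < r));	/* ascending */
}

/* Same even/odd median selection as latency_median_value(). */
static uint64_t
median(const uint64_t *data, size_t n)
{
	if (n % 2 == 0)
		return ((data[n / 2 - 1] + data[n / 2]) / 2);
	return (data[(n + 1) / 2 - 1]);
}

int
main(void)
{
	/* Hypothetical mean read latencies (us), one obvious outlier. */
	uint64_t lat[] = { 225, 240, 210, 215, 230, 60000, 205, 235, 220, 245 };
	size_t n = sizeof (lat) / sizeof (lat[0]);

	qsort(lat, n, sizeof (uint64_t), cmp_u64);

	uint64_t q1 = median(&lat[0], n / 2);
	uint64_t q3 = median(&lat[(n + 1) / 2], n / 2);
	uint64_t iqr = (q3 - q1 > q1 / 4) ? (q3 - q1) : (q1 / 4);
	uint64_t fence = q3 + iqr * INSENSITIVITY;

	printf("Q1=%ju Q3=%ju IQR=%ju fence=%ju max=%ju -> %s\n",
	    (uintmax_t)q1, (uintmax_t)q3, (uintmax_t)iqr,
	    (uintmax_t)fence, (uintmax_t)lat[n - 1],
	    lat[n - 1] > fence ? "outlier" : "ok");
	return (0);
}

With these numbers Q1=215, Q3=240, the IQR floor (53) kicks in because the healthy children are tightly clustered, and the fence lands at 2890: the 60000 us child is flagged, while ordinary jitter around 200-250 us never comes close.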
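The decay rule documented above vdev_child_slow_outlier() (every `children` detections, subtract 2 from each child's count) can be simulated to see why only a persistently slow child reaches LAT_OUTLIER_LIMIT. The model below is not from the commit and is deliberately simplified: it folds the parent tally and the decay into one step and pins the increment at 1, the minimum the kernel code would apply.

/* Build: cc -o decay decay.c */
#include <stdio.h>

#define CHILDREN 10
#define LIMIT 20	/* mirrors LAT_OUTLIER_LIMIT */

int
main(void)
{
	int count[CHILDREN] = { 0 };
	int parent = 0;
	int slow = 3;	/* child 3 is persistently slow */

	for (int round = 1; ; round++) {
		count[slow] += 1;	/* incr == 1: barely past the fence */
		parent++;
		if (parent >= CHILDREN) {
			/* Decay: every child loses 2, floored at 0. */
			for (int c = 0; c < CHILDREN; c++)
				count[c] = count[c] > 2 ? count[c] - 2 : 0;
			parent = 0;
		}
		if (count[slow] > LIMIT) {
			printf("child %d sat out after %d detections\n",
			    slow, round);
			break;
		}
	}
	return (0);
}

A child flagged only occasionally decays back toward zero between detections; one flagged every round gains roughly (children - 2)/children per check and crosses the limit after a few dozen detections, at which point the kernel code calls vdev_raidz_sit_child() and posts FM_EREPORT_ZFS_SITOUT.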
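Finally, the zfs_crrd.c hunk adds (hrtime_t) casts in dbrrd_closest() before subtracting rrdd_time from tv. The reason is the usual arithmetic-conversion trap, shown in the standalone sketch below (not from the commit): when one operand is uint64_t, the signed operand converts to unsigned, a negative difference wraps, and ABS() cannot repair it.

/* Build: cc -o abswrap abswrap.c */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define ABS(x) ((x) < 0 ? -(x) : (x))

int
main(void)
{
	int64_t tv = 100;	/* plays the signed hrtime_t query time */
	uint64_t rec = 200;	/* plays rrdd_time, stored unsigned */

	/* Unsigned arithmetic: 100 - 200 wraps to 2^64 - 100. */
	uint64_t wrong = tv - rec;

	/* Casting first keeps the arithmetic signed: -100 -> 100. */
	int64_t right = ABS(tv - (int64_t)rec);

	printf("wrong=%" PRIu64 " right=%" PRIi64 "\n", wrong, right);
	return (0);
}

Without the cast, a record timestamp later than tv would always look like the farthest candidate instead of the nearest, so dbrrd_closest() could return the wrong RRD sample.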