29 files changed, 759 insertions, 190 deletions
diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c
index d0fcca798fa9..ad707341eec7 100644
--- a/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c
+++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c
@@ -77,7 +77,8 @@ static const uint32_t SHA256_K[64] = {
 	h = g, g = f, f = e, e = d + T1; \
 	d = c, c = b, b = a, a = T1 + T2;
 
-static void sha256_generic(uint32_t state[8], const void *data, size_t num_blks)
+static void
+icp_sha256_generic(uint32_t state[8], const void *data, size_t num_blks)
 {
 	uint64_t blk;
 
@@ -173,7 +174,8 @@ static const uint64_t SHA512_K[80] = {
 	0x5fcb6fab3ad6faec, 0x6c44198c4a475817
 };
 
-static void sha512_generic(uint64_t state[8], const void *data, size_t num_blks)
+static void
+icp_sha512_generic(uint64_t state[8], const void *data, size_t num_blks)
 {
 	uint64_t blk;
 
@@ -226,7 +228,8 @@ static void sha512_generic(uint64_t state[8], const void *data, size_t num_blks)
 	}
 }
 
-static void sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len)
+static void
+icp_sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len)
 {
 	uint64_t pos = ctx->count[0];
 	uint64_t total = ctx->count[1];
@@ -258,7 +261,8 @@ static void sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len)
 	ctx->count[1] = total;
 }
 
-static void sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len)
+static void
+icp_sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len)
 {
 	uint64_t pos = ctx->count[0];
 	uint64_t total = ctx->count[1];
@@ -290,7 +294,8 @@ static void sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len)
 	ctx->count[1] = total;
 }
 
-static void sha256_final(sha256_ctx *ctx, uint8_t *result, int bits)
+static void
+icp_sha256_final(sha256_ctx *ctx, uint8_t *result, int bits)
 {
 	uint64_t mlen, pos = ctx->count[0];
 	uint8_t *m = ctx->wbuf;
@@ -334,7 +339,8 @@ static void sha256_final(sha256_ctx *ctx, uint8_t *result, int bits)
 	memset(ctx, 0, sizeof (*ctx));
 }
 
-static void sha512_final(sha512_ctx *ctx, uint8_t *result, int bits)
+static void
+icp_sha512_final(sha512_ctx *ctx, uint8_t *result, int bits)
 {
 	uint64_t mlen, pos = ctx->count[0];
 	uint8_t *m = ctx->wbuf, *r;
@@ -461,14 +467,14 @@ SHA2Update(SHA2_CTX *ctx, const void *data, size_t len)
 
 	switch (ctx->algotype) {
 		case SHA256:
-			sha256_update(&ctx->sha256, data, len);
+			icp_sha256_update(&ctx->sha256, data, len);
 			break;
 		case SHA512:
 		case SHA512_HMAC_MECH_INFO_TYPE:
-			sha512_update(&ctx->sha512, data, len);
+			icp_sha512_update(&ctx->sha512, data, len);
 			break;
 		case SHA512_256:
-			sha512_update(&ctx->sha512, data, len);
+			icp_sha512_update(&ctx->sha512, data, len);
 			break;
 	}
 }
@@ -479,32 +485,33 @@ SHA2Final(void *digest, SHA2_CTX *ctx)
 {
 	switch (ctx->algotype) {
 		case SHA256:
-			sha256_final(&ctx->sha256, digest, 256);
+			icp_sha256_final(&ctx->sha256, digest, 256);
 			break;
 		case SHA512:
 		case SHA512_HMAC_MECH_INFO_TYPE:
-			sha512_final(&ctx->sha512, digest, 512);
+			icp_sha512_final(&ctx->sha512, digest, 512);
 			break;
 		case SHA512_256:
-			sha512_final(&ctx->sha512, digest, 256);
+			icp_sha512_final(&ctx->sha512, digest, 256);
 			break;
 	}
 }
 
 /* the generic implementation is always okay */
-static boolean_t sha2_is_supported(void)
+static boolean_t
+icp_sha2_is_supported(void)
 {
 	return (B_TRUE);
 }
 
 const sha256_ops_t sha256_generic_impl = {
 	.name = "generic",
-	.transform = sha256_generic,
-	.is_supported = sha2_is_supported
+	.transform = icp_sha256_generic,
+	.is_supported = icp_sha2_is_supported
 };
 
 const sha512_ops_t sha512_generic_impl = {
 	.name = "generic",
-	.transform = sha512_generic,
-	.is_supported = sha2_is_supported
+	.transform = icp_sha512_generic,
+	.is_supported = icp_sha2_is_supported
 };
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
index 393bfaa65ff5..ebc2c0eeb6d2 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
@@ -163,6 +163,13 @@ param_set_arc_int(SYSCTL_HANDLER_ARGS)
 	return (0);
 }
 
+static void
+warn_deprecated_sysctl(const char *old, const char *new)
+{
+	printf("WARNING: sysctl vfs.zfs.%s is deprecated. "
+	    "Use vfs.zfs.%s instead.\n", old, new);
+}
+
 int
 param_set_arc_max(SYSCTL_HANDLER_ARGS)
 {
@@ -185,9 +192,17 @@ param_set_arc_max(SYSCTL_HANDLER_ARGS)
 	if (val != 0)
 		zfs_arc_max = arc_c_max;
 
+	if (arg2 != 0)
+		warn_deprecated_sysctl("arc_max", "arc.max");
+
 	return (0);
 }
 
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max,
+	CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
+	NULL, 1, param_set_arc_max, "LU",
+	"Maximum ARC size in bytes (LEGACY)");
+
 int
 param_set_arc_min(SYSCTL_HANDLER_ARGS)
 {
@@ -209,9 +224,17 @@ param_set_arc_min(SYSCTL_HANDLER_ARGS)
 	if (val != 0)
 		zfs_arc_min = arc_c_min;
 
+	if (arg2 != 0)
+		warn_deprecated_sysctl("arc_min", "arc.min");
+
 	return (0);
 }
 
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min,
+	CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
+	NULL, 1, param_set_arc_min, "LU",
+	"Minimum ARC size in bytes (LEGACY)");
+
 extern uint_t zfs_arc_free_target;
 
 int
@@ -232,9 +255,22 @@ param_set_arc_free_target(SYSCTL_HANDLER_ARGS)
 
 	zfs_arc_free_target = val;
 
+	if (arg2 != 0)
+		warn_deprecated_sysctl("arc_free_target", "arc.free_target");
+
 	return (0);
 }
 
+/*
+ * NOTE: This sysctl is CTLFLAG_RW not CTLFLAG_RWTUN due to its dependency on
+ * pagedaemon initialization.
+ */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
+	CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+	NULL, 1, param_set_arc_free_target, "IU",
+	"Desired number of free pages below which ARC triggers reclaim"
+	" (LEGACY)");
+
 int
 param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
 {
@@ -250,9 +286,193 @@ param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
 
 	arc_no_grow_shift = val;
 
+	if (arg2 != 0)
+		warn_deprecated_sysctl("arc_no_grow_shift", "arc.no_grow_shift");
+
 	return (0);
 }
 
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift,
+	CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
+	NULL, 1, param_set_arc_no_grow_shift, "I",
+	"log2(fraction of ARC which must be free to allow growing) (LEGACY)");
+
+extern uint64_t l2arc_write_max;
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max,
+	CTLFLAG_RWTUN, &l2arc_write_max, 0,
+	"Max write bytes per interval (LEGACY)");
+
+extern uint64_t l2arc_write_boost;
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost,
+	CTLFLAG_RWTUN, &l2arc_write_boost, 0,
+	"Extra write bytes during device warmup (LEGACY)");
+
+extern uint64_t l2arc_headroom;
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom,
+	CTLFLAG_RWTUN, &l2arc_headroom, 0,
+	"Number of max device writes to precache (LEGACY)");
+
+extern uint64_t l2arc_headroom_boost;
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom_boost,
+	CTLFLAG_RWTUN, &l2arc_headroom_boost, 0,
+	"Compressed l2arc_headroom multiplier (LEGACY)");
+
+extern uint64_t l2arc_feed_secs;
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs,
+	CTLFLAG_RWTUN, &l2arc_feed_secs, 0,
+	"Seconds between L2ARC writing (LEGACY)");
+
+extern uint64_t l2arc_feed_min_ms;
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms,
+	CTLFLAG_RWTUN, &l2arc_feed_min_ms, 0,
+	"Min feed interval in milliseconds (LEGACY)");
+
+extern int l2arc_noprefetch;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch,
+	CTLFLAG_RWTUN, &l2arc_noprefetch, 0,
+	"Skip caching prefetched buffers (LEGACY)");
+
+extern int l2arc_feed_again;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again,
+	CTLFLAG_RWTUN, &l2arc_feed_again, 0,
+	"Turbo L2ARC warmup (LEGACY)");
+
+extern int l2arc_norw;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw,
+	CTLFLAG_RWTUN, &l2arc_norw, 0,
+	"No reads during writes (LEGACY)");
+
+/* Sysctl handler: report data + metadata size of the ARC state in arg1. */
+static int
+param_get_arc_state_size(SYSCTL_HANDLER_ARGS)
+{
+	arc_state_t *as = arg1;
+	int64_t total = zfs_refcount_count(&as->arcs_size[ARC_BUFC_DATA]);
+
+	total += zfs_refcount_count(&as->arcs_size[ARC_BUFC_METADATA]);
+	return (sysctl_handle_64(oidp, &total, 0, req));
+}
+
+extern arc_state_t ARC_anon;
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, anon_size,
+	CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+	&ARC_anon, 0, param_get_arc_state_size, "Q",
+	"size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
+	&ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+	"size of evictable metadata in anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
+	&ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+	"size of evictable data in anonymous state");
+
+extern arc_state_t ARC_mru;
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_size,
+	CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+	&ARC_mru, 0, param_get_arc_state_size, "Q",
+	"size of mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
+	&ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+	"size of evictable metadata in mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
+	&ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+	"size of evictable data in mru state");
+
+extern arc_state_t ARC_mru_ghost;
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_ghost_size,
+	CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+	&ARC_mru_ghost, 0, param_get_arc_state_size, "Q",
+	"size of mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
+	&ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+	"size of evictable metadata in mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
+	&ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+	"size of evictable data in mru ghost state");
+
+extern arc_state_t ARC_mfu;
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_size,
+	CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+	&ARC_mfu, 0, param_get_arc_state_size, "Q",
+	"size of mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
+	&ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+	"size of evictable metadata in mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
+	&ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+	"size of evictable data in mfu state");
+
+extern arc_state_t ARC_mfu_ghost;
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_ghost_size,
+	CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+	&ARC_mfu_ghost, 0, param_get_arc_state_size, "Q",
+	"size of mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
+	&ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+	"size of evictable metadata in mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
+	&ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+	"size of evictable data in mfu ghost state");
+
+extern arc_state_t ARC_uncached;
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, uncached_size,
+	CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+	&ARC_uncached, 0, param_get_arc_state_size, "Q",
+	"size of uncached state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD,
+	&ARC_uncached.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+	"size of evictable metadata in uncached state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD,
+	&ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+	"size of evictable data in uncached state");
+
+extern arc_state_t ARC_l2c_only;
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, l2c_only_size,
+	CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+	&ARC_l2c_only, 0, param_get_arc_state_size, "Q",
+	"size of l2c_only state");
+
+/* dbuf.c */
+
+/* dmu.c */
+
+/* dmu_zfetch.c */
+
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)");
+
+extern uint32_t	zfetch_max_distance;
+
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance,
+	CTLFLAG_RWTUN, &zfetch_max_distance, 0,
+	"Max bytes to prefetch per stream (LEGACY)");
+
+extern uint32_t	zfetch_max_idistance;
+
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance,
+	CTLFLAG_RWTUN, &zfetch_max_idistance, 0,
+	"Max bytes to prefetch indirects for per stream (LEGACY)");
+
+/* dsl_pool.c */
+
+/* dnode.c */
+
+/* dsl_scan.c */
+
 /* metaslab.c */
 
 int
@@ -313,6 +533,19 @@ SYSCTL_UINT(_vfs_zfs, OID_AUTO, condense_pct,
 	"Condense on-disk spacemap when it is more than this many percents"
 	" of in-memory counterpart");
 
+extern uint_t zfs_remove_max_segment;
+
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, remove_max_segment,
+	CTLFLAG_RWTUN, &zfs_remove_max_segment, 0,
+	"Largest contiguous segment ZFS will attempt to allocate when removing"
+	" a device");
+
+extern int zfs_removal_suspend_progress;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress,
+	CTLFLAG_RWTUN, &zfs_removal_suspend_progress, 0,
+	"Ensures certain actions can happen while in the middle of a removal");
+
 /*
  * Minimum size which forces the dynamic allocator to change
  * it's allocation strategy.  Once the space map cannot satisfy
@@ -532,9 +765,18 @@ param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS)
 
 	zfs_vdev_min_auto_ashift = val;
 
+	if (arg2 != 0)
+		warn_deprecated_sysctl("min_auto_ashift",
+		    "vdev.min_auto_ashift");
+
 	return (0);
 }
 
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
+	CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1,
+	param_set_min_auto_ashift, "IU",
+	"Min ashift used when creating new top-level vdev. (LEGACY)");
+
 int
 param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS)
 {
@@ -551,9 +793,19 @@ param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS)
 
 	zfs_vdev_max_auto_ashift = val;
 
+	if (arg2 != 0)
+		warn_deprecated_sysctl("max_auto_ashift",
+		    "vdev.max_auto_ashift");
+
 	return (0);
 }
 
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
+	CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1,
+	param_set_max_auto_ashift, "IU",
+	"Max ashift used when optimizing for logical -> physical sector size on"
+	" new top-level vdevs. (LEGACY)");
+
 /*
  * Since the DTL space map of a vdev is not expected to have a lot of
  * entries, we default its block size to 4K.
@@ -575,6 +827,23 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz,
 	CTLFLAG_RDTUN, &zfs_vdev_standard_sm_blksz, 0,
 	"Block size for standard space map.  Power of 2 greater than 4096.");
 
+extern int vdev_validate_skip;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip,
+	CTLFLAG_RDTUN, &vdev_validate_skip, 0,
+	"Enable to bypass vdev_validate().");
+
+/* vdev_mirror.c */
+
+/* vdev_queue.c */
+
+extern uint_t zfs_vdev_max_active;
+
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight,
+	CTLFLAG_RWTUN, &zfs_vdev_max_active, 0,
+	"The maximum number of I/Os of all types active for each device."
+	" (LEGACY)");
+
 /* zio.c */
 
 SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata,
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
index 4de48e013ec4..d0a9c662e6f0 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
@@ -762,8 +762,7 @@ zfsctl_common_pathconf(struct vop_pathconf_args *ap)
 		return (0);
 
 	case _PC_MIN_HOLE_SIZE:
-		*ap->a_retval = (int)SPA_MINBLOCKSIZE;
-		return (0);
+		return (EINVAL);
 
 	case _PC_ACL_EXTENDED:
 		*ap->a_retval = 0;
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
index 411225786089..f34a2fd37a77 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -4116,6 +4116,7 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
 {
 	znode_t *zp;
 	zfsvfs_t *zfsvfs;
+	uint_t blksize, iosize;
 	int error;
 
 	switch (cmd) {
@@ -4127,8 +4128,20 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
 		*valp = 64;
 		return (0);
 	case _PC_MIN_HOLE_SIZE:
-		*valp = (int)SPA_MINBLOCKSIZE;
-		return (0);
+		iosize = vp->v_mount->mnt_stat.f_iosize;
+		if (vp->v_type == VREG) {
+			zp = VTOZ(vp);
+			blksize = zp->z_blksz;
+			if (zp->z_size <= blksize)
+				blksize = MAX(blksize, iosize);
+			*valp = (int)blksize;
+			return (0);
+		}
+		if (vp->v_type == VDIR) {
+			*valp = (int)iosize;
+			return (0);
+		}
+		return (EINVAL);
 	case _PC_ACL_EXTENDED:
 #if 0		/* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */
 		zp = VTOZ(vp);
@@ -4210,8 +4223,20 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
 
 			zfs_vmobject_wlock(object);
 			(void) vm_page_grab_pages(object, OFF_TO_IDX(start),
-			    VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_ZERO,
+			    VM_ALLOC_NORMAL | VM_ALLOC_WAITOK,
 			    ma, count);
+			if (!vm_page_all_valid(ma[count - 1])) {
+				/*
+				 * Later in this function, we copy DMU data to
+				 * invalid pages only. The last page may not be
+				 * entirely filled though, if the file does not
+				 * end on a page boundary. Therefore, we zero
+				 * that last page here to make sure it does not
+				 * contain garbage after the end of file.
+				 */
+				ASSERT(vm_page_none_valid(ma[count - 1]));
+				vm_page_zero_invalid(ma[count - 1], FALSE);
+			}
 			zfs_vmobject_wunlock(object);
 		}
 		if (blksz == zp->z_blksz)
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c
index 91cf38016e00..8562c42b3220 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c
@@ -437,6 +437,7 @@ zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv,
 
 	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
 
+	memset(&cuio_s, 0, sizeof (cuio_s));
 	zfs_uio_init(&cuio, &cuio_s);
 
 	keydata_len = zio_crypt_table[crypt].ci_keylen;
@@ -519,6 +520,7 @@ zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version,
 	keydata_len = zio_crypt_table[crypt].ci_keylen;
 	rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
 
+	memset(&cuio_s, 0, sizeof (cuio_s));
 	zfs_uio_init(&cuio, &cuio_s);
 
 	/*
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
index 0dd2ecd7fd8d..3ddbfcb97184 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
@@ -183,6 +183,7 @@ static struct filterops zvol_filterops_vnode = {
 	.f_isfd = 1,
 	.f_detach = zvol_filter_detach,
 	.f_event = zvol_filter_vnode,
+	.f_copy = knote_triv_copy,
 };
 
 extern uint_t zfs_geom_probe_vdev_key;
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c
index 45c2999a4bb1..b2eae5d00b10 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c
@@ -25,6 +25,10 @@
  * SUCH DAMAGE.
  */
 
+/*
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
+ */
+
 #include <sys/types.h>
 #include <sys/sysmacros.h>
 #include <sys/kmem.h>
@@ -56,6 +60,19 @@ typedef struct zone_dataset {
 } zone_dataset_t;
 
 #ifdef CONFIG_USER_NS
+
+/*
+ * Linux 6.18 moved the generic namespace type away from ns->ops->type onto
+ * ns_common itself (ns->ns_type); ns_is_newuser() hides the difference.
+ */
+#ifdef HAVE_NS_COMMON_TYPE
+#define	ns_is_newuser(ns)	\
+	((ns)->ns_type == CLONE_NEWUSER)
+#else
+#define	ns_is_newuser(ns)	\
+	((ns)->ops != NULL && (ns)->ops->type == CLONE_NEWUSER)
+#endif
+
 /*
  * Returns:
  * - 0 on success
@@ -84,7 +101,7 @@ user_ns_get(int fd, struct user_namespace **userns)
 		goto done;
 	}
 	ns = get_proc_ns(file_inode(nsfile));
-	if (ns->ops->type != CLONE_NEWUSER) {
+	if (!ns_is_newuser(ns)) {
 		error = ENOTTY;
 		goto done;
 	}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
index 8a8316f63c48..18f2426fbbfc 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
@@ -23,6 +23,7 @@
  * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
  * Copyright (c) 2019 by Delphix. All rights reserved.
  * Copyright (c) 2023, 2024, Klara Inc.
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
  */
 
 /*
@@ -1109,6 +1110,14 @@ abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
 #define	ABD_ITER_PAGE_SIZE(page)	(PAGESIZE)
 #endif
 
+#ifndef nth_page
+/*
+ * Linux 6.18 removed nth_page(); it is not needed to step through the pages
+ * of a single SG entry any more, so plain pointer addition stands in for it.
+ */
+#define	nth_page(p, n)	((p) + (n))
+#endif
+
 void
 abd_iter_page(struct abd_iter *aiter)
 {
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
index daa4b5776837..934d74a112fd 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
@@ -2524,7 +2524,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
 	 * Also note: DOS R/O is ignored for directories.
 	 */
 	if ((v4_mode & WRITE_MASK_DATA) &&
-	    S_ISDIR(ZTOI(zp)->i_mode) &&
+	    !S_ISDIR(ZTOI(zp)->i_mode) &&
 	    (zp->z_pflags & ZFS_READONLY)) {
 		return (SET_ERROR(EPERM));
 	}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
index 6106726651a3..e845ad69ad78 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
@@ -2033,10 +2033,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
 		goto out3;
 	}
 
-	if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
-		err = SET_ERROR(EPERM);
-		goto out3;
-	}
+	/* ZFS_READONLY will be handled in zfs_zaccess() */
 
 	/*
 	 * Verify timestamps doesn't overflow 32 bits.
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
index d07317b0d910..02965ac8cbee 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
@@ -23,6 +23,7 @@
  * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  * Copyright (c) 2025, Klara, Inc.
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
  */
 
 
@@ -478,6 +479,7 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
 	return (ret);
 }
 
+#ifdef HAVE_WRITE_CACHE_PAGES
 #ifdef HAVE_WRITEPAGE_T_FOLIO
 static int
 zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
@@ -499,6 +501,78 @@ zpl_write_cache_pages(struct address_space *mapping,
 #endif
 	return (result);
 }
+#else
+static inline int
+zpl_write_cache_pages(struct address_space *mapping,
+    struct writeback_control *wbc, void *data)
+{
+	pgoff_t start = wbc->range_start >> PAGE_SHIFT;
+	pgoff_t end = wbc->range_end >> PAGE_SHIFT;
+
+	struct folio_batch fbatch;
+	folio_batch_init(&fbatch);
+
+	/*
+	 * This atomically (-ish) tags all DIRTY pages in the range with
+	 * TOWRITE, allowing users to continue dirtying or undirtying pages
+	 * while we get on with writeback, without us treading on each other.
+	 */
+	tag_pages_for_writeback(mapping, start, end);
+
+	int err = 0;		/* first error from zpl_putpage(), if any */
+	unsigned int npages;
+
+	/*
+	 * Grab references to the TOWRITE pages just flagged. This may not get
+	 * all of them, so we do it in a loop until there are none left.
+	 */
+	while ((npages = filemap_get_folios_tag(mapping, &start, end,
+	    PAGECACHE_TAG_TOWRITE, &fbatch)) != 0) {
+
+		/* Loop over each folio in the batch and write it out. */
+		struct folio *folio;
+		while ((folio = folio_batch_next(&fbatch)) != NULL) {
+			folio_lock(folio);
+
+			/*
+			 * If the folio has been remapped, or is no longer
+			 * dirty, then there's nothing to do.
+			 */
+			if (folio->mapping != mapping ||
+			    !folio_test_dirty(folio)) {
+				folio_unlock(folio);
+				continue;
+			}
+
+			/*
+			 * If writeback is already in progress, wait for it to
+			 * finish. We continue after this even if the page
+			 * ends up clean; zfs_putpage() will skip it if no
+			 * further work is required.
+			 */
+			while (folio_test_writeback(folio))
+				folio_wait_bit(folio, PG_writeback);
+
+			/*
+			 * Write it out and collect any error. zfs_putpage()
+			 * will clear the TOWRITE and DIRTY flags, and return
+			 * with the page unlocked.
+			 */
+			int ferr = zpl_putpage(&folio->page, wbc, data);
+			if (err == 0 && ferr != 0)
+				err = ferr;
+
+			/* Housekeeping for the caller: count pages handled. */
+			wbc->nr_to_write -= folio_nr_pages(folio);
+		}
+
+		/* Release any remaining references on the batch. */
+		folio_batch_release(&fbatch);
+	}
+
+	return (err);
+}
+#endif
 
 static int
 zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
index 444948d03cb3..347b352506e5 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
@@ -23,6 +23,7 @@
  * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
  * Copyright (c) 2023, Datto Inc. All rights reserved.
  * Copyright (c) 2025, Klara, Inc.
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
  */
 
 
@@ -33,6 +34,7 @@
 #include <sys/zpl.h>
 #include <linux/iversion.h>
 #include <linux/version.h>
+#include <linux/vfs_compat.h>
 
 /*
  * What to do when the last reference to an inode is released. If 0, the kernel
@@ -104,7 +106,7 @@ zpl_dirty_inode(struct inode *ip, int flags)
  * reporting memory pressure and requests OpenZFS release some memory (see
  * zfs_prune()).
  *
- * When set to 1, we call generic_delete_node(), which always returns "destroy
+ * When set to 1, we call generic_delete_inode(), which always returns "destroy
  * immediately", resulting in inodes being destroyed immediately, releasing
  * their associated dnodes and dbufs to the dbuf cached and the ARC to be
  * evicted as normal.
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index bac166fcd89e..fe939150b641 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -21,7 +21,7 @@
  */
 /*
  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
- * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
+ * Copyright (c) 2024, 2025, Rob Norris <robn@despairlabs.com>
  * Copyright (c) 2024, 2025, Klara, Inc.
  */
 
@@ -337,16 +337,14 @@ zvol_discard(zv_request_t *zvr)
 	}
 
 	/*
-	 * Align the request to volume block boundaries when a secure erase is
-	 * not required.  This will prevent dnode_free_range() from zeroing out
-	 * the unaligned parts which is slow (read-modify-write) and useless
-	 * since we are not freeing any space by doing so.
+	 * Align the request to volume block boundaries. This will prevent
+	 * dnode_free_range() from zeroing out the unaligned parts which is
+	 * slow (read-modify-write) and useless since we are not freeing any
+	 * space by doing so.
 	 */
-	if (!io_is_secure_erase(bio, rq)) {
-		start = P2ROUNDUP(start, zv->zv_volblocksize);
-		end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
-		size = end - start;
-	}
+	start = P2ROUNDUP(start, zv->zv_volblocksize);
+	end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
+	size = end - start;
 
 	if (start >= end)
 		goto unlock;
@@ -467,6 +465,24 @@ zvol_read_task(void *arg)
 	zv_request_task_free(task);
 }
 
+/*
+ * The kernel uses different enum names for the IO opcode, depending on the
+ * kernel version ('req_opf', 'req_op').  To sidestep this, use macros rather
+ * than inline functions for these checks.  Arguments may be evaluated more
+ * than once and are parenthesized in the expansion, so any side-effect-free
+ * expression is safe to pass.
+ */
+/* Should this IO go down the zvol write path? */
+#define	ZVOL_OP_IS_WRITE(op) \
+	((op) == REQ_OP_WRITE || \
+	(op) == REQ_OP_FLUSH || \
+	(op) == REQ_OP_DISCARD)
+
+/* Is this IO type supported by zvols? */
+#define	ZVOL_OP_IS_SUPPORTED(op) ((op) == REQ_OP_READ || ZVOL_OP_IS_WRITE(op))
+
+/* Get the IO opcode */
+#define	ZVOL_OP(bio, rq) ((bio) != NULL ? bio_op(bio) : req_op(rq))
 
 /*
  * Process a BIO or request
@@ -484,7 +500,33 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	uint64_t offset = io_offset(bio, rq);
 	uint64_t size = io_size(bio, rq);
-	int rw = io_data_dir(bio, rq);
+	int rw;
+
+	if (unlikely(!ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)))) {
+		zfs_dbgmsg("Unsupported zvol %s, op=%d, flags=0x%x",
+		    rq != NULL ? "request" : "BIO",
+		    ZVOL_OP(bio, rq),
+		    rq != NULL ? rq->cmd_flags : bio->bi_opf);
+		ASSERT(ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)));
+		zvol_end_io(bio, rq, SET_ERROR(ENOTSUPP));
+		goto out;
+	}
+
+	if (ZVOL_OP_IS_WRITE(ZVOL_OP(bio, rq))) {
+		rw = WRITE;
+	} else {
+		rw = READ;
+	}
+
+	/*
+	 * Sanity check
+	 *
+	 * If we're a BIO, check our rw matches the kernel's
+	 * bio_data_dir(bio) rw.  We need to check because we support fewer
+	 * IO operations, and want to verify that what we think are reads and
+	 * writes from those operations match what the kernel thinks.
+	 */
+	ASSERT(rq != NULL || rw == bio_data_dir(bio));
 
 	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
 		zvol_end_io(bio, rq, SET_ERROR(ENXIO));
@@ -589,7 +631,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 		 * interfaces lack this functionality (they block waiting for
 		 * the i/o to complete).
 		 */
-		if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
+		if (io_is_discard(bio, rq)) {
 			if (force_sync) {
 				zvol_discard(&zvr);
 			} else {
@@ -990,12 +1032,12 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
  * tiny devices.  For devices over 1 Mib a standard head and sector count
  * is used to keep the cylinders count reasonable.
  */
-static int
-zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static inline int
+zvol_getgeo_impl(struct gendisk *disk, struct hd_geometry *geo)
 {
+	zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
 	sector_t sectors;
 
-	zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
 	ASSERT3P(zv, !=, NULL);
 	ASSERT3U(zv->zv_open_count, >, 0);
 
@@ -1015,6 +1057,20 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return (0);
 }
 
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK
+static int
+zvol_getgeo(struct gendisk *disk, struct hd_geometry *geo)
+{
+	return (zvol_getgeo_impl(disk, geo));
+}
+#else	/* getgeo is handed a struct block_device instead */
+static int
+zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	return (zvol_getgeo_impl(bdev->bd_disk, geo));
+}
+#endif	/* HAVE_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK */
+
 /*
  * Why have two separate block_device_operations structs?
  *
@@ -1458,7 +1514,7 @@ zvol_os_remove_minor(zvol_state_t *zv)
 	if (zso->use_blk_mq)
 		blk_mq_free_tag_set(&zso->tag_set);
 
-	ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);
+	ida_free(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);
 
 	kmem_free(zso, sizeof (struct zvol_state_os));
 
@@ -1613,7 +1669,7 @@ zvol_os_create_minor(const char *name)
 	if (zvol_inhibit_dev)
 		return (0);
 
-	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
+	idx = ida_alloc(&zvol_ida, kmem_flags_convert(KM_SLEEP));
 	if (idx < 0)
 		return (SET_ERROR(-idx));
 	minor = idx << ZVOL_MINOR_BITS;
@@ -1621,7 +1677,7 @@ zvol_os_create_minor(const char *name)
 		/* too many partitions can cause an overflow */
 		zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
 		    name, minor, MINOR(minor));
-		ida_simple_remove(&zvol_ida, idx);
+		ida_free(&zvol_ida, idx);
 		return (SET_ERROR(EINVAL));
 	}
 
@@ -1629,7 +1685,7 @@ zvol_os_create_minor(const char *name)
 	if (zv) {
 		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 		mutex_exit(&zv->zv_state_lock);
-		ida_simple_remove(&zvol_ida, idx);
+		ida_free(&zvol_ida, idx);
 		return (SET_ERROR(EEXIST));
 	}
 
@@ -1729,7 +1785,7 @@ out_doi:
 		rw_exit(&zvol_state_lock);
 		error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
 	} else {
-		ida_simple_remove(&zvol_ida, idx);
+		ida_free(&zvol_ida, idx);
 	}
 
 	return (error);
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
index 864e3898b365..9190ae0362ea 100644
--- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c
+++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
@@ -364,8 +364,8 @@ zfs_prop_init(void)
 
 	static const zprop_index_t xattr_table[] = {
 		{ "off",	ZFS_XATTR_OFF },
-		{ "on",		ZFS_XATTR_SA },
 		{ "sa",		ZFS_XATTR_SA },
+		{ "on",		ZFS_XATTR_SA },
 		{ "dir",	ZFS_XATTR_DIR },
 		{ NULL }
 	};
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c
index bd6dc8edd8ca..dbb5e942e2e6 100644
--- a/sys/contrib/openzfs/module/zfs/arc.c
+++ b/sys/contrib/openzfs/module/zfs/arc.c
@@ -486,13 +486,13 @@ static taskq_t *arc_flush_taskq;
 static uint_t zfs_arc_evict_threads = 0;
 
 /* The 7 states: */
-static arc_state_t ARC_anon;
-/*  */ arc_state_t ARC_mru;
-static arc_state_t ARC_mru_ghost;
-/*  */ arc_state_t ARC_mfu;
-static arc_state_t ARC_mfu_ghost;
-static arc_state_t ARC_l2c_only;
-static arc_state_t ARC_uncached;
+arc_state_t ARC_anon;
+arc_state_t ARC_mru;
+arc_state_t ARC_mru_ghost;
+arc_state_t ARC_mfu;
+arc_state_t ARC_mfu_ghost;
+arc_state_t ARC_l2c_only;
+arc_state_t ARC_uncached;
 
 arc_stats_t arc_stats = {
 	{ "hits",			KSTAT_DATA_UINT64 },
@@ -832,15 +832,15 @@ typedef struct arc_async_flush {
 #define	L2ARC_FEED_TYPES	4
 
 /* L2ARC Performance Tunables */
-static uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* def max write size */
-static uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra warmup write */
-static uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* # of dev writes */
-static uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
-static uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
-static uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval msecs */
-static int l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
-static int l2arc_feed_again = B_TRUE;		/* turbo warmup */
-static int l2arc_norw = B_FALSE;		/* no reads during writes */
+uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* def max write size */
+uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra warmup write */
+uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* # of dev writes */
+uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
+uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
+uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval msecs */
+int l2arc_noprefetch = B_TRUE;			/* don't cache prefetch bufs */
+int l2arc_feed_again = B_TRUE;			/* turbo warmup */
+int l2arc_norw = B_FALSE;			/* no reads during writes */
 static uint_t l2arc_meta_percent = 33;	/* limit on headers size */
 
 /*
@@ -1157,7 +1157,7 @@ buf_fini(void)
 #if defined(_KERNEL)
 	/*
 	 * Large allocations which do not require contiguous pages
-	 * should be using vmem_free() in the linux kernel\
+	 * should be using vmem_free() in the linux kernel.
 	 */
 	vmem_free(buf_hash_table.ht_table,
 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
@@ -1392,6 +1392,7 @@ arc_get_complevel(arc_buf_t *buf)
 	return (buf->b_hdr->b_complevel);
 }
 
+__maybe_unused
 static inline boolean_t
 arc_buf_is_shared(arc_buf_t *buf)
 {
@@ -4650,10 +4651,10 @@ arc_flush_task(void *arg)
 	arc_flush_impl(spa_guid, B_FALSE);
 	arc_async_flush_remove(spa_guid, af->af_cache_level);
 
-	uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time);
-	if (elaspsed > 0) {
+	uint64_t elapsed = NSEC2MSEC(gethrtime() - start_time);
+	if (elapsed > 0) {
 		zfs_dbgmsg("spa %llu arc flushed in %llu ms",
-		    (u_longlong_t)spa_guid, (u_longlong_t)elaspsed);
+		    (u_longlong_t)spa_guid, (u_longlong_t)elapsed);
 	}
 }
 
@@ -9151,7 +9152,7 @@ top:
 		if (dev->l2ad_first) {
 			/*
 			 * This is the first sweep through the device. There is
-			 * nothing to evict. We have already trimmmed the
+			 * nothing to evict. We have already trimmed the
 			 * whole device.
 			 */
 			goto out;
@@ -10085,12 +10086,12 @@ l2arc_device_teardown(void *arg)
 	kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize);
 	vmem_free(remdev, sizeof (l2arc_dev_t));
 
-	uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time);
-	if (elaspsed > 0) {
+	uint64_t elapsed = NSEC2MSEC(gethrtime() - start_time);
+	if (elapsed > 0) {
 		zfs_dbgmsg("spa %llu, vdev %llu removed in %llu ms",
 		    (u_longlong_t)rva->rva_spa_gid,
 		    (u_longlong_t)rva->rva_vdev_gid,
-		    (u_longlong_t)elaspsed);
+		    (u_longlong_t)elapsed);
 	}
 
 	if (rva->rva_async)
diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
index 3d3a9c713568..51165d0bf723 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
@@ -57,19 +57,19 @@ static unsigned int	zfetch_max_sec_reap = 2;
 /* min bytes to prefetch per stream (default 2MB) */
 static unsigned int	zfetch_min_distance = 2 * 1024 * 1024;
 /* max bytes to prefetch per stream (default 8MB) */
-static unsigned int	zfetch_max_distance = 8 * 1024 * 1024;
+unsigned int	zfetch_max_distance = 8 * 1024 * 1024;
 #else
 /* min bytes to prefetch per stream (default 4MB) */
 static unsigned int	zfetch_min_distance = 4 * 1024 * 1024;
 /* max bytes to prefetch per stream (default 64MB) */
-static unsigned int	zfetch_max_distance = 64 * 1024 * 1024;
+unsigned int	zfetch_max_distance = 64 * 1024 * 1024;
 #endif
 /* max bytes to prefetch indirects for per stream (default 128MB) */
-static unsigned int	zfetch_max_idistance = 128 * 1024 * 1024;
+unsigned int	zfetch_max_idistance = 128 * 1024 * 1024;
 /* max request reorder distance within a stream (default 16MB) */
-static unsigned int	zfetch_max_reorder = 16 * 1024 * 1024;
+unsigned int	zfetch_max_reorder = 16 * 1024 * 1024;
 /* Max log2 fraction of holes in a stream */
-static unsigned int	zfetch_hole_shift = 2;
+unsigned int	zfetch_hole_shift = 2;
 
 typedef struct zfetch_stats {
 	kstat_named_t zfetchstat_hits;
diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c
index 6c150d31c669..e88d394b5229 100644
--- a/sys/contrib/openzfs/module/zfs/dnode.c
+++ b/sys/contrib/openzfs/module/zfs/dnode.c
@@ -2656,6 +2656,32 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 }
 
 /*
+ * Adjust *offset to the next (or previous) block byte offset at lvl.
+ * Returns FALSE if *offset would overflow or underflow.
+ */
+static boolean_t
+dnode_next_block(dnode_t *dn, int flags, uint64_t *offset, int lvl)
+{
+	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+	int span = lvl * epbs + dn->dn_datablkshift;
+	uint64_t blkid, maxblkid;
+
+	if (span >= 8 * sizeof (uint64_t))
+		return (B_FALSE);
+
+	blkid = *offset >> span;
+	maxblkid = 1ULL << (8 * sizeof (*offset) - span);
+	if (!(flags & DNODE_FIND_BACKWARDS) && blkid + 1 < maxblkid)
+		*offset = (blkid + 1) << span;
+	else if ((flags & DNODE_FIND_BACKWARDS) && blkid > 0)
+		*offset = (blkid << span) - 1;
+	else
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
  * Find the next hole, data, or sparse region at or after *offset.
  * The value 'blkfill' tells us how many items we expect to find
  * in an L0 data block; this value is 1 for normal objects,
@@ -2682,7 +2708,7 @@ int
 dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
     int minlvl, uint64_t blkfill, uint64_t txg)
 {
-	uint64_t initial_offset = *offset;
+	uint64_t matched = *offset;
 	int lvl, maxlvl;
 	int error = 0;
 
@@ -2706,16 +2732,36 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
 
 	maxlvl = dn->dn_phys->dn_nlevels;
 
-	for (lvl = minlvl; lvl <= maxlvl; lvl++) {
+	for (lvl = minlvl; lvl <= maxlvl; ) {
 		error = dnode_next_offset_level(dn,
 		    flags, offset, lvl, blkfill, txg);
-		if (error != ESRCH)
+		if (error == 0 && lvl > minlvl) {
+			--lvl;
+			matched = *offset;
+		} else if (error == ESRCH && lvl < maxlvl &&
+		    dnode_next_block(dn, flags, &matched, lvl)) {
+			/*
+			 * Continue search at next/prev offset in lvl+1 block.
+			 *
+			 * Usually we only search upwards at the start of the
+			 * search as higher level blocks point at a matching
+			 * minlvl block in most cases, but we backtrack if not.
+			 *
+			 * This can happen for txg > 0 searches if the block
+			 * contains only BPs/dnodes freed at that txg. It also
+			 * happens if we are still syncing out the tree, and
+			 * some BPs at higher levels are not updated yet.
+			 *
+			 * We must adjust offset to avoid coming back to the
+			 * same offset and getting stuck looping forever. This
+			 * also deals with the case where offset is already at
+			 * the beginning or end of the object.
+			 */
+			++lvl;
+			*offset = matched;
+		} else {
 			break;
-	}
-
-	while (error == 0 && --lvl >= minlvl) {
-		error = dnode_next_offset_level(dn,
-		    flags, offset, lvl, blkfill, txg);
+		}
 	}
 
 	/*
@@ -2727,9 +2773,6 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
 		error = 0;
 	}
 
-	if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
-	    initial_offset < *offset : initial_offset > *offset))
-		error = SET_ERROR(ESRCH);
 out:
 	if (!(flags & DNODE_FIND_HAVELOCK))
 		rw_exit(&dn->dn_struct_rwlock);
diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c
index 7db72b9b04b0..fd46127b6068 100644
--- a/sys/contrib/openzfs/module/zfs/mmp.c
+++ b/sys/contrib/openzfs/module/zfs/mmp.c
@@ -446,7 +446,7 @@ mmp_write_uberblock(spa_t *spa)
 	uint64_t offset;
 
 	hrtime_t lock_acquire_time = gethrtime();
-	spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER);
+	spa_config_enter_priority(spa, SCL_STATE, mmp_tag, RW_READER);
 	lock_acquire_time = gethrtime() - lock_acquire_time;
 	if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
 		zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns "
diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c
index ea2d2c7227c8..d73195f1a21f 100644
--- a/sys/contrib/openzfs/module/zfs/range_tree.c
+++ b/sys/contrib/openzfs/module/zfs/range_tree.c
@@ -585,7 +585,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
 		 * the size, since we do not support removing partial segments
 		 * of range trees with gaps.
 		 */
-		zfs_zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) -
+		zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) -
 		    zfs_rs_get_start_raw(rs, rt));
 		zfs_range_tree_stat_incr(rt, &rs_tmp);
 
diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c
index cf28955b0c50..f615591e826b 100644
--- a/sys/contrib/openzfs/module/zfs/spa_config.c
+++ b/sys/contrib/openzfs/module/zfs/spa_config.c
@@ -372,6 +372,8 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg);
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata);
+	fnvlist_add_uint64(config, ZPOOL_CONFIG_MIN_ALLOC, spa->spa_min_alloc);
+	fnvlist_add_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, spa->spa_max_alloc);
 	if (spa->spa_comment != NULL)
 		fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT,
 		    spa->spa_comment);
diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c
index 6f7c060f97f8..0bead6d49666 100644
--- a/sys/contrib/openzfs/module/zfs/spa_misc.c
+++ b/sys/contrib/openzfs/module/zfs/spa_misc.c
@@ -510,7 +510,7 @@ spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw)
 
 static void
 spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw,
-    int mmp_flag)
+    int priority_flag)
 {
 	(void) tag;
 	int wlocks_held = 0;
@@ -526,7 +526,7 @@ spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw,
 		mutex_enter(&scl->scl_lock);
 		if (rw == RW_READER) {
 			while (scl->scl_writer ||
-			    (!mmp_flag && scl->scl_write_wanted)) {
+			    (!priority_flag && scl->scl_write_wanted)) {
 				cv_wait(&scl->scl_cv, &scl->scl_lock);
 			}
 		} else {
@@ -551,7 +551,7 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
 }
 
 /*
- * The spa_config_enter_mmp() allows the mmp thread to cut in front of
+ * The spa_config_enter_priority() allows the mmp thread to cut in front of
  * outstanding write lock requests. This is needed since the mmp updates are
  * time sensitive and failure to service them promptly will result in a
  * suspended pool. This pool suspension has been seen in practice when there is
@@ -560,7 +560,7 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
  */
 
 void
-spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw)
+spa_config_enter_priority(spa_t *spa, int locks, const void *tag, krw_t rw)
 {
 	spa_config_enter_impl(spa, locks, tag, rw, 1);
 }
@@ -806,6 +806,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 	spa->spa_min_ashift = INT_MAX;
 	spa->spa_max_ashift = 0;
 	spa->spa_min_alloc = INT_MAX;
+	spa->spa_max_alloc = 0;
 	spa->spa_gcd_alloc = INT_MAX;
 
 	/* Reset cached value */
@@ -1865,6 +1866,19 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
 }
 
 /*
+ * Return the range of minimum allocation sizes for the normal allocation
+ * class. This can be used by external consumers of the DMU to estimate
+ * potential wasted capacity when setting the recordsize for an object.
+ * This is mainly for dRAID pools which always pad to a full stripe width.
+ */
+void
+spa_get_min_alloc_range(spa_t *spa, uint64_t *min_alloc, uint64_t *max_alloc)
+{
+	*min_alloc = spa->spa_min_alloc;
+	*max_alloc = spa->spa_max_alloc;
+}
+
+/*
  * Return the amount of slop space in bytes.  It is typically 1/32 of the pool
  * (3.2%), minus the embedded log space.  On very small pools, it may be
  * slightly larger than this.  On very large pools, it will be capped to
@@ -3085,6 +3099,7 @@ EXPORT_SYMBOL(spa_version);
 EXPORT_SYMBOL(spa_state);
 EXPORT_SYMBOL(spa_load_state);
 EXPORT_SYMBOL(spa_freeze_txg);
+EXPORT_SYMBOL(spa_get_min_alloc_range); /* for Lustre */
 EXPORT_SYMBOL(spa_get_dspace);
 EXPORT_SYMBOL(spa_update_dspace);
 EXPORT_SYMBOL(spa_deflate);
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
index fc6d445f9785..c8d7280387a2 100644
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -100,7 +100,7 @@ static uint_t zfs_vdev_default_ms_shift = 29;
 /* upper limit for metaslab size (16G) */
 static uint_t zfs_vdev_max_ms_shift = 34;
 
-static int vdev_validate_skip = B_FALSE;
+int vdev_validate_skip = B_FALSE;
 
 /*
  * Since the DTL space map of a vdev is not expected to have a lot of
@@ -1497,12 +1497,14 @@ vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc)
 {
 	if (min_alloc < spa->spa_min_alloc)
 		spa->spa_min_alloc = min_alloc;
-	if (spa->spa_gcd_alloc == INT_MAX) {
+
+	if (min_alloc > spa->spa_max_alloc)
+		spa->spa_max_alloc = min_alloc;
+
+	if (spa->spa_gcd_alloc == INT_MAX)
 		spa->spa_gcd_alloc = min_alloc;
-	} else {
-		spa->spa_gcd_alloc = vdev_gcd(min_alloc,
-		    spa->spa_gcd_alloc);
-	}
+	else
+		spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc);
 }
 
 void
@@ -1560,8 +1562,7 @@ vdev_metaslab_group_create(vdev_t *vd)
 			if (vd->vdev_ashift < spa->spa_min_ashift)
 				spa->spa_min_ashift = vd->vdev_ashift;
 
-			uint64_t min_alloc = vdev_get_min_alloc(vd);
-			vdev_spa_set_alloc(spa, min_alloc);
+			vdev_spa_set_alloc(spa, vdev_get_min_alloc(vd));
 		}
 	}
 }
diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c
index c44f654b0261..0d4fdaa77ba0 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_label.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_label.c
@@ -511,6 +511,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
 		    vd->vdev_asize);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_MIN_ALLOC,
+		    vdev_get_min_alloc(vd));
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
 		if (vd->vdev_noalloc) {
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING,
diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c
index e69e5598939e..c12713b107bf 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_queue.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c
@@ -122,7 +122,7 @@
  * The maximum number of i/os active to each device.  Ideally, this will be >=
  * the sum of each queue's max_active.
  */
-static uint_t zfs_vdev_max_active = 1000;
+uint_t zfs_vdev_max_active = 1000;
 
 /*
  * Per-queue limits on the number of i/os active to each device.  If the
diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c
index 2ce0121324ad..abb71543e3ab 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_removal.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c
@@ -51,34 +51,70 @@
 #include <sys/trace_zfs.h>
 
 /*
- * This file contains the necessary logic to remove vdevs from a
- * storage pool.  Currently, the only devices that can be removed
- * are log, cache, and spare devices; and top level vdevs from a pool
- * w/o raidz or mirrors.  (Note that members of a mirror can be removed
- * by the detach operation.)
+ * This file contains the necessary logic to remove vdevs from a storage
+ * pool. Note that members of a mirror can be removed by the detach
+ * operation. Currently, the only devices that can be removed are:
  *
- * Log vdevs are removed by evacuating them and then turning the vdev
- * into a hole vdev while holding spa config locks.
+ * 1) Traditional hot spare and cache vdevs. Note that draid distributed
+ *    spares are fixed at creation time and cannot be removed.
  *
- * Top level vdevs are removed and converted into an indirect vdev via
- * a multi-step process:
+ * 2) Log vdevs are removed by evacuating them and then turning the vdev
+ *    into a hole vdev while holding spa config locks.
  *
- *  - Disable allocations from this device (spa_vdev_remove_top).
+ * 3) Top-level singleton and mirror vdevs, including dedup and special
+ *    vdevs, are removed and converted into an indirect vdev via a
+ *    multi-step process:
  *
- *  - From a new thread (spa_vdev_remove_thread), copy data from
- *    the removing vdev to a different vdev.  The copy happens in open
- *    context (spa_vdev_copy_impl) and issues a sync task
- *    (vdev_mapping_sync) so the sync thread can update the partial
- *    indirect mappings in core and on disk.
+ *    - Disable allocations from this device (spa_vdev_remove_top).
  *
- *  - If a free happens during a removal, it is freed from the
- *    removing vdev, and if it has already been copied, from the new
- *    location as well (free_from_removing_vdev).
+ *    - From a new thread (spa_vdev_remove_thread), copy data from the
+ *      removing vdev to a different vdev. The copy happens in open context
+ *      (spa_vdev_copy_impl) and issues a sync task (vdev_mapping_sync) so
+ *      the sync thread can update the partial indirect mappings in core
+ *      and on disk.
  *
- *  - After the removal is completed, the copy thread converts the vdev
- *    into an indirect vdev (vdev_remove_complete) before instructing
- *    the sync thread to destroy the space maps and finish the removal
- *    (spa_finish_removal).
+ *    - If a free happens during a removal, it is freed from the removing
+ *      vdev, and if it has already been copied, from the new location as
+ *      well (free_from_removing_vdev).
+ *
+ *    - After the removal is completed, the copy thread converts the vdev
+ *      into an indirect vdev (vdev_remove_complete) before instructing
+ *      the sync thread to destroy the space maps and finish the removal
+ *      (spa_finish_removal).
+ *
+ *   The following constraints currently apply to primary device removal:
+ *
+ *     - All vdevs must be online, healthy, and not be missing any data
+ *       according to the DTLs.
+ *
+ *     - When removing a singleton or mirror vdev, regardless of whether it's a
+ *       special, dedup, or primary device, it must have the same ashift
+ *       as the devices in the normal allocation class. Furthermore, all
+ *       vdevs in the normal allocation class must have the same ashift to
+ *       ensure the new allocations never include additional padding.
+ *
+ *     - The normal allocation class cannot contain any raidz or draid
+ *       top-level vdevs since segments are copied without regard for block
+ *       boundaries. This makes it impossible to calculate the required
+ *       parity columns when using these vdev types as the destination.
+ *
+ *     - The encryption keys must be loaded so the ZIL logs can be reset
+ *       in order to prevent writing to the device being removed.
+ *
+ * N.B. ashift and raidz/draid constraints for primary top-level device
+ * removal could be slightly relaxed if it were possible to request that
+ * DVAs from a mirror or singleton in the specified allocation class be
+ * used (metaslab_alloc_dva).
+ *
+ * This flexibility would be particularly useful for raidz/draid pools which
+ * often include a mirrored special device. If a top-level singleton were
+ * mistakenly added it could then still be removed at the cost of some
+ * special device capacity. This may be a worthwhile tradeoff depending on
+ * the pool capacity and expense (cost, complexity, time) of creating a new
+ * pool and copying all of the data to correct the configuration.
+ *
+ * Furthermore, while not currently supported it should be possible to allow
+ * vdevs of any type to be removed as long as they've never been written to.
  */
 
 typedef struct vdev_copy_arg {
@@ -105,7 +141,7 @@ static const uint_t zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
  *
  * See also the accessor function spa_remove_max_segment().
  */
-static uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
+uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
 
 /*
  * Ignore hard IO errors during device removal.  When set if a device
@@ -137,7 +173,7 @@ uint_t vdev_removal_max_span = 32 * 1024;
  * This is used by the test suite so that it can ensure that certain
  * actions happen while in the middle of a removal.
  */
-static int zfs_removal_suspend_progress = 0;
+int zfs_removal_suspend_progress = 0;
 
 #define	VDEV_REMOVAL_ZAP_OBJS	"lzap"
 
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
index 4cf8912d4269..aeea58bedfe4 100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -4574,8 +4574,29 @@ zio_vdev_io_start(zio_t *zio)
 	ASSERT0(zio->io_child_error[ZIO_CHILD_VDEV]);
 
 	if (vd == NULL) {
-		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
-			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
+		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) {
+			/*
+			 * A deadlock workaround. The ddt_prune_unique_entries()
+			 * -> prune_candidates_sync() code path takes the
+			 * SCL_ZIO reader lock and may request it again here.
+			 * If there is another thread who wants the SCL_ZIO
+			 * writer lock, then scl_write_wanted will be set.
+			 * Thus, the spa_config_enter_priority() is used to
+			 * ignore pending writer requests.
+			 *
+			 * The locking should be revised to remove the need
+			 * for this workaround.  If that's not workable then
+			 * it should only be applied to the zios involved in
+			 * the pruning process.  This impacts the read/write
+			 * I/O balance while pruning.
+			 */
+			if (spa->spa_active_ddt_prune)
+				spa_config_enter_priority(spa, SCL_ZIO, zio,
+				    RW_READER);
+			else
+				spa_config_enter(spa, SCL_ZIO, zio,
+				    RW_READER);
+		}
 
 		/*
 		 * The mirror_ops handle multiple DVAs in a single BP.
@@ -5305,6 +5326,16 @@ zio_ready(zio_t *zio)
 		return (NULL);
 	}
 
+	if (zio_injection_enabled) {
+		hrtime_t target = zio_handle_ready_delay(zio);
+		if (target != 0 && zio->io_target_timestamp == 0) {
+			zio->io_stage >>= 1;
+			zio->io_target_timestamp = target;
+			zio_delay_interrupt(zio);
+			return (NULL);
+		}
+	}
+
 	if (zio->io_ready) {
 		ASSERT(IO_IS_ALLOCATING(zio));
 		ASSERT(BP_GET_BIRTH(bp) == zio->io_txg ||
diff --git a/sys/contrib/openzfs/module/zfs/zio_inject.c b/sys/contrib/openzfs/module/zfs/zio_inject.c
index 981a1be4847c..287577018ed1 100644
--- a/sys/contrib/openzfs/module/zfs/zio_inject.c
+++ b/sys/contrib/openzfs/module/zfs/zio_inject.c
@@ -827,6 +827,44 @@ zio_handle_export_delay(spa_t *spa, hrtime_t elapsed)
 	zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT);
 }
 
+/*
+ * For testing, inject a delay before ready state.
+ */
+hrtime_t
+zio_handle_ready_delay(zio_t *zio)
+{
+	inject_handler_t *handler;
+	hrtime_t now = gethrtime();
+	hrtime_t target = 0;
+
+	/*
+	 * Ignore I/O not associated with any logical data.
+	 */
+	if (zio->io_logical == NULL)
+		return (0);
+
+	rw_enter(&inject_lock, RW_READER);
+
+	for (handler = list_head(&inject_handlers); handler != NULL;
+	    handler = list_next(&inject_handlers, handler)) {
+		if (zio->io_spa != handler->zi_spa ||
+		    handler->zi_record.zi_cmd != ZINJECT_DELAY_READY)
+			continue;
+
+		/* If this handler matches, inject the delay */
+		if (zio_match_iotype(zio, handler->zi_record.zi_iotype) &&
+		    zio_match_handler(&zio->io_logical->io_bookmark,
+		    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
+		    zio_match_dva(zio), &handler->zi_record, zio->io_error)) {
+			target = now + (hrtime_t)handler->zi_record.zi_timer;
+			break;
+		}
+	}
+
+	rw_exit(&inject_lock);
+	return (target);
+}
+
 static int
 zio_calculate_range(const char *pool, zinject_record_t *record)
 {
diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c
index faced0db7e9e..00f98168d3d8 100644
--- a/sys/contrib/openzfs/module/zfs/zvol.c
+++ b/sys/contrib/openzfs/module/zfs/zvol.c
@@ -410,7 +410,7 @@ zvol_set_volthreading(const char *name, boolean_t value)
 {
 	zvol_state_t *zv = zvol_find_by_name(name, RW_NONE);
 	if (zv == NULL)
-		return (SET_ERROR(ENOENT));
+		return (-1);
 	zv->zv_threading = value;
 	mutex_exit(&zv->zv_state_lock);
 	return (0);
diff --git a/sys/contrib/openzfs/module/zstd/zfs_zstd.c b/sys/contrib/openzfs/module/zstd/zfs_zstd.c
index 3db196953f74..c403c001086a 100644
--- a/sys/contrib/openzfs/module/zstd/zfs_zstd.c
+++ b/sys/contrib/openzfs/module/zstd/zfs_zstd.c
@@ -441,64 +441,6 @@ zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
 }
 
 #ifndef IN_LIBSA
-static size_t
-zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len,
-    int level)
-{
-	int16_t zstd_level;
-	if (zstd_enum_to_level(level, &zstd_level)) {
-		ZSTDSTAT_BUMP(zstd_stat_com_inval);
-		return (s_len);
-	}
-	/*
-	 * A zstd early abort heuristic.
-	 *
-	 * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
-	 *   128k), don't try any of this, just go.
-	 *   (because experimentally that was a reasonable cutoff for a perf win
-	 *   with tiny ratio change)
-	 * - First, we try LZ4 compression, and if it doesn't early abort, we
-	 *   jump directly to whatever compression level we intended to try.
-	 * - Second, we try zstd-1 - if that errors out (usually, but not
-	 *   exclusively, if it would overflow), we give up early.
-	 *
-	 *   If it works, instead we go on and compress anyway.
-	 *
-	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
-	 * compressible data, it was losing up to 8.5% of the compressed
-	 * savings versus no early abort, and all the zstd-fast levels are
-	 * worse indications on their own than LZ4, and don't improve the LZ4
-	 * pass noticably if stacked like this.
-	 */
-	size_t actual_abort_size = zstd_abort_size;
-	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
-	    s_len >= actual_abort_size) {
-		int pass_len = 1;
-		pass_len = zfs_lz4_compress(s_start, d_start, s_len, d_len, 0);
-		if (pass_len < d_len) {
-			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
-			goto keep_trying;
-		}
-		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);
-
-		pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len,
-		    ZIO_ZSTD_LEVEL_1);
-		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
-			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
-			return (s_len);
-		}
-		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
-	} else {
-		ZSTDSTAT_BUMP(zstd_stat_passignored);
-		if (s_len < actual_abort_size) {
-			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
-		}
-	}
-keep_trying:
-	return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level));
-
-}
-
 /* Compress block using zstd */
 static size_t
 zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len,