Diffstat (limited to 'sys/contrib/openzfs/module')
88 files changed, 4424 insertions, 1486 deletions
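
Most of the churn in module/icp below replaces the single gcm_use_avx boolean with a gcm_impl selector (generic, avx, avx2-vaes) and wires in a BoringSSL-derived VAES/AVX2 GCM implementation; the "cycle" option of icp_gcm_impl now rotates through every implementation the CPU supports instead of toggling one flag. A minimal standalone sketch of that rotation pattern, using C11 atomics and placeholder cpu_has_*() probes in place of the ICP's atomic_cas_32() and gcm_avx_will_work()/gcm_avx2_will_work() helpers (names other than the GCM_IMPL_* constants are illustrative only):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef enum {
	GCM_IMPL_GENERIC = 0,
	GCM_IMPL_AVX,
	GCM_IMPL_AVX2,
	GCM_IMPL_MAX
} gcm_impl;

static _Atomic uint32_t gcm_impl_used = GCM_IMPL_GENERIC;

/* Placeholder CPU probes; the real code checks AVX / AVX2+VAES+VPCLMULQDQ. */
static bool cpu_has_avx(void)       { return (true); }
static bool cpu_has_avx2_vaes(void) { return (false); }

static bool
impl_will_work(uint32_t impl)
{
	switch (impl) {
	case GCM_IMPL_AVX2:
		return (cpu_has_avx2_vaes());
	case GCM_IMPL_AVX:
		return (cpu_has_avx());
	default:
		return (true);		/* the generic code always works */
	}
}

static gcm_impl
gcm_toggle_impl_sketch(void)
{
	uint32_t cur, next;

	do {				/* retry if another thread raced us */
		cur = atomic_load(&gcm_impl_used);
		next = cur;
		do {			/* skip implementations the CPU lacks */
			next = (next + 1) % GCM_IMPL_MAX;
		} while (!impl_will_work(next));
	} while (!atomic_compare_exchange_strong(&gcm_impl_used, &cur, next));

	return ((gcm_impl)next);
}

int
main(void)
{
	printf("next impl: %u\n", (unsigned)gcm_toggle_impl_sketch());
	return (0);
}
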
diff --git a/sys/contrib/openzfs/module/Kbuild.in b/sys/contrib/openzfs/module/Kbuild.in index 3d6f288fa5da..95313c984178 100644 --- a/sys/contrib/openzfs/module/Kbuild.in +++ b/sys/contrib/openzfs/module/Kbuild.in @@ -4,7 +4,7 @@ ZFS_MODULE_CFLAGS += -std=gnu99 -Wno-declaration-after-statement ZFS_MODULE_CFLAGS += -Wmissing-prototypes -ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @NO_FORMAT_ZERO_LENGTH@ +ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @KERNEL_NO_FORMAT_ZERO_LENGTH@ ifneq ($(KBUILD_EXTMOD),) zfs_include = @abs_top_srcdir@/include @@ -135,6 +135,7 @@ ICP_OBJS_X86_64 := \ asm-x86_64/sha2/sha256-x86_64.o \ asm-x86_64/sha2/sha512-x86_64.o \ asm-x86_64/modes/aesni-gcm-x86_64.o \ + asm-x86_64/modes/aesni-gcm-avx2-vaes.o \ asm-x86_64/modes/gcm_pclmulqdq.o \ asm-x86_64/modes/ghash-x86_64.o @@ -292,10 +293,9 @@ ZSTD_UPSTREAM_OBJS := \ zfs-objs += $(addprefix zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) -# Disable aarch64 neon SIMD instructions for kernel mode $(addprefix $(obj)/zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) : ccflags-y += -I$(zstd_include) $(ZFS_ZSTD_FLAGS) $(addprefix $(obj)/zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) : asflags-y += -I$(zstd_include) -$(addprefix $(obj)/zstd/,$(ZSTD_UPSTREAM_OBJS)) : ccflags-y += -include $(zstd_include)/aarch64_compat.h -include $(zstd_include)/zstd_compat_wrapper.h -Wp,-w +$(addprefix $(obj)/zstd/,$(ZSTD_UPSTREAM_OBJS)) : ccflags-y += -include $(zstd_include)/zstd_compat_wrapper.h -Wp,-w $(obj)/zstd/zfs_zstd.o : ccflags-y += -include $(zstd_include)/zstd_compat_wrapper.h diff --git a/sys/contrib/openzfs/module/Makefile.bsd b/sys/contrib/openzfs/module/Makefile.bsd index 3ba38c43f25b..c20fdc0c483b 100644 --- a/sys/contrib/openzfs/module/Makefile.bsd +++ b/sys/contrib/openzfs/module/Makefile.bsd @@ -521,30 +521,6 @@ CFLAGS.zstd_ldm.c= -U__BMI__ -fno-tree-vectorize ${NO_WBITWISE_INSTEAD_OF_LOGICA CFLAGS.zstd_opt.c= -U__BMI__ -fno-tree-vectorize ${NO_WBITWISE_INSTEAD_OF_LOGICAL} .if ${MACHINE_ARCH} == "aarch64" -__ZFS_ZSTD_AARCH64_FLAGS= -include ${SRCDIR}/zstd/include/aarch64_compat.h -CFLAGS.zstd.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.entropy_common.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.error_private.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.fse_compress.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.fse_decompress.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.hist.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.huf_compress.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.huf_decompress.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.pool.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.xxhash.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.zstd_common.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.zstd_compress.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.zstd_compress_literals.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.zstd_compress_sequences.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.zstd_compress_superblock.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.zstd_ddict.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.zstd_decompress.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.zstd_decompress_block.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.zstd_double_fast.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.zstd_fast.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.zstd_lazy.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.zstd_ldm.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} -CFLAGS.zstd_opt.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} sha256-armv8.o: sha256-armv8.S ${CC} -c ${CFLAGS:N-mgeneral-regs-only} ${WERROR} ${.IMPSRC} \ diff --git a/sys/contrib/openzfs/module/icp/algs/modes/gcm.c b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c index c2a982b5a376..3cfa5b8165ce 100644 --- 
a/sys/contrib/openzfs/module/icp/algs/modes/gcm.c +++ b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c @@ -46,6 +46,9 @@ #define IMPL_CYCLE (UINT32_MAX-1) #ifdef CAN_USE_GCM_ASM #define IMPL_AVX (UINT32_MAX-2) +#if CAN_USE_GCM_ASM >= 2 +#define IMPL_AVX2 (UINT32_MAX-3) +#endif #endif #define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i)) static uint32_t icp_gcm_impl = IMPL_FASTEST; @@ -56,17 +59,16 @@ static uint32_t user_sel_impl = IMPL_FASTEST; boolean_t gcm_avx_can_use_movbe = B_FALSE; /* * Whether to use the optimized openssl gcm and ghash implementations. - * Set to true if module parameter icp_gcm_impl == "avx". */ -static boolean_t gcm_use_avx = B_FALSE; -#define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx) +static gcm_impl gcm_impl_used = GCM_IMPL_GENERIC; +#define GCM_IMPL_USED (*(volatile gcm_impl *)&gcm_impl_used) extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *); static inline boolean_t gcm_avx_will_work(void); -static inline void gcm_set_avx(boolean_t); -static inline boolean_t gcm_toggle_avx(void); -static inline size_t gcm_simd_get_htab_size(boolean_t); +static inline boolean_t gcm_avx2_will_work(void); +static inline void gcm_use_impl(gcm_impl impl); +static inline gcm_impl gcm_toggle_impl(void); static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t, crypto_data_t *, size_t); @@ -89,7 +91,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, void (*xor_block)(uint8_t *, uint8_t *)) { #ifdef CAN_USE_GCM_ASM - if (ctx->gcm_use_avx == B_TRUE) + if (ctx->impl != GCM_IMPL_GENERIC) return (gcm_mode_encrypt_contiguous_blocks_avx( ctx, data, length, out, block_size)); #endif @@ -208,7 +210,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, { (void) copy_block; #ifdef CAN_USE_GCM_ASM - if (ctx->gcm_use_avx == B_TRUE) + if (ctx->impl != GCM_IMPL_GENERIC) return (gcm_encrypt_final_avx(ctx, out, block_size)); #endif @@ -374,7 +376,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, void (*xor_block)(uint8_t *, uint8_t *)) { #ifdef CAN_USE_GCM_ASM - if (ctx->gcm_use_avx == B_TRUE) + if (ctx->impl != GCM_IMPL_GENERIC) return (gcm_decrypt_final_avx(ctx, out, block_size)); #endif @@ -631,23 +633,23 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap; if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { - gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; + gcm_ctx->impl = GCM_IMPL_USED; } else { /* - * Handle the "cycle" implementation by creating avx and - * non-avx contexts alternately. + * Handle the "cycle" implementation by creating different + * contexts, one per implementation. */ - gcm_ctx->gcm_use_avx = gcm_toggle_avx(); + gcm_ctx->impl = gcm_toggle_impl(); - /* The avx impl. doesn't handle byte swapped key schedules. */ - if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) { - gcm_ctx->gcm_use_avx = B_FALSE; + /* The AVX impl. doesn't handle byte swapped key schedules. */ + if (needs_bswap == B_TRUE) { + gcm_ctx->impl = GCM_IMPL_GENERIC; } /* - * If this is a GCM context, use the MOVBE and the BSWAP + * If this is an AVX context, use the MOVBE and the BSWAP * variants alternately. 
*/ - if (gcm_ctx->gcm_use_avx == B_TRUE && + if (gcm_ctx->impl == GCM_IMPL_AVX && zfs_movbe_available() == B_TRUE) { (void) atomic_toggle_boolean_nv( (volatile boolean_t *)&gcm_avx_can_use_movbe); @@ -658,12 +660,13 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, * still they could be created by the aes generic implementation. * Make sure not to use them since we'll corrupt data if we do. */ - if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) { - gcm_ctx->gcm_use_avx = B_FALSE; + if (gcm_ctx->impl != GCM_IMPL_GENERIC && needs_bswap == B_TRUE) { + gcm_ctx->impl = GCM_IMPL_GENERIC; cmn_err_once(CE_WARN, "ICP: Can't use the aes generic or cycle implementations " - "in combination with the gcm avx implementation!"); + "in combination with the gcm avx or avx2-vaes " + "implementation!"); cmn_err_once(CE_WARN, "ICP: Falling back to a compatible implementation, " "aes-gcm performance will likely be degraded."); @@ -672,36 +675,20 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, "restore performance."); } - /* Allocate Htab memory as needed. */ - if (gcm_ctx->gcm_use_avx == B_TRUE) { - size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx); - - if (htab_len == 0) { - return (CRYPTO_MECHANISM_PARAM_INVALID); - } - gcm_ctx->gcm_htab_len = htab_len; - gcm_ctx->gcm_Htable = - kmem_alloc(htab_len, KM_SLEEP); - - if (gcm_ctx->gcm_Htable == NULL) { - return (CRYPTO_HOST_MEMORY); - } + /* + * AVX implementations use Htable with sizes depending on + * implementation. + */ + if (gcm_ctx->impl != GCM_IMPL_GENERIC) { + rv = gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len, + block_size); } - /* Avx and non avx context initialization differs from here on. */ - if (gcm_ctx->gcm_use_avx == B_FALSE) { + else #endif /* ifdef CAN_USE_GCM_ASM */ - if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size, - encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) { - rv = CRYPTO_MECHANISM_PARAM_INVALID; - } -#ifdef CAN_USE_GCM_ASM - } else { - if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len, - block_size) != CRYPTO_SUCCESS) { - rv = CRYPTO_MECHANISM_PARAM_INVALID; - } + if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size, + encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) { + rv = CRYPTO_MECHANISM_PARAM_INVALID; } -#endif /* ifdef CAN_USE_GCM_ASM */ return (rv); } @@ -767,6 +754,9 @@ gcm_impl_get_ops(void) break; #ifdef CAN_USE_GCM_ASM case IMPL_AVX: +#if CAN_USE_GCM_ASM >= 2 + case IMPL_AVX2: +#endif /* * Make sure that we return a valid implementation while * switching to the avx implementation since there still @@ -828,6 +818,13 @@ gcm_impl_init(void) * Use the avx implementation if it's available and the implementation * hasn't changed from its default value of fastest on module load. */ +#if CAN_USE_GCM_ASM >= 2 + if (gcm_avx2_will_work()) { + if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) { + gcm_use_impl(GCM_IMPL_AVX2); + } + } else +#endif if (gcm_avx_will_work()) { #ifdef HAVE_MOVBE if (zfs_movbe_available() == B_TRUE) { @@ -835,7 +832,7 @@ gcm_impl_init(void) } #endif if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) { - gcm_set_avx(B_TRUE); + gcm_use_impl(GCM_IMPL_AVX); } } #endif @@ -852,6 +849,7 @@ static const struct { { "fastest", IMPL_FASTEST }, #ifdef CAN_USE_GCM_ASM { "avx", IMPL_AVX }, + { "avx2-vaes", IMPL_AVX2 }, #endif }; @@ -887,7 +885,13 @@ gcm_impl_set(const char *val) /* Check mandatory options */ for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { #ifdef CAN_USE_GCM_ASM +#if CAN_USE_GCM_ASM >= 2 /* Ignore avx implementation if it won't work. 
*/ + if (gcm_impl_opts[i].sel == IMPL_AVX2 && + !gcm_avx2_will_work()) { + continue; + } +#endif if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { continue; } @@ -915,11 +919,17 @@ gcm_impl_set(const char *val) * Use the avx implementation if available and the requested one is * avx or fastest. */ +#if CAN_USE_GCM_ASM >= 2 + if (gcm_avx2_will_work() == B_TRUE && + (impl == IMPL_AVX2 || impl == IMPL_FASTEST)) { + gcm_use_impl(GCM_IMPL_AVX2); + } else +#endif if (gcm_avx_will_work() == B_TRUE && (impl == IMPL_AVX || impl == IMPL_FASTEST)) { - gcm_set_avx(B_TRUE); + gcm_use_impl(GCM_IMPL_AVX); } else { - gcm_set_avx(B_FALSE); + gcm_use_impl(GCM_IMPL_GENERIC); } #endif @@ -952,6 +962,12 @@ icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp) for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { #ifdef CAN_USE_GCM_ASM /* Ignore avx implementation if it won't work. */ +#if CAN_USE_GCM_ASM >= 2 + if (gcm_impl_opts[i].sel == IMPL_AVX2 && + !gcm_avx2_will_work()) { + continue; + } +#endif if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { continue; } @@ -993,9 +1009,6 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation."); /* Clear the FPU registers since they hold sensitive internal state. */ #define clear_fpu_regs() clear_fpu_regs_avx() -#define GHASH_AVX(ctx, in, len) \ - gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \ - in, len) #define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1) @@ -1010,20 +1023,77 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation."); static uint32_t gcm_avx_chunk_size = ((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; +/* + * GCM definitions: uint128_t is copied from include/crypto/modes.h + * Avoiding u128 because it is already defined in kernel sources. 
+ */ +typedef struct { + uint64_t hi, lo; +} uint128_t; + extern void ASMABI clear_fpu_regs_avx(void); extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst); extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr, const uint32_t pt[4], uint32_t ct[4]); extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]); +#if CAN_USE_GCM_ASM >= 2 +extern void ASMABI gcm_init_vpclmulqdq_avx2(uint128_t Htable[16], + const uint64_t H[2]); +#endif extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable, const uint8_t *in, size_t len); +#if CAN_USE_GCM_ASM >= 2 +extern void ASMABI gcm_ghash_vpclmulqdq_avx2(uint64_t ghash[2], + const uint64_t *Htable, const uint8_t *in, size_t len); +#endif +static inline void GHASH_AVX(gcm_ctx_t *ctx, const uint8_t *in, size_t len) +{ + switch (ctx->impl) { +#if CAN_USE_GCM_ASM >= 2 + case GCM_IMPL_AVX2: + gcm_ghash_vpclmulqdq_avx2(ctx->gcm_ghash, + (const uint64_t *)ctx->gcm_Htable, in, len); + break; +#endif + + case GCM_IMPL_AVX: + gcm_ghash_avx(ctx->gcm_ghash, + (const uint64_t *)ctx->gcm_Htable, in, len); + break; + + default: + VERIFY(B_FALSE); + } +} +typedef size_t ASMABI aesni_gcm_encrypt_impl(const uint8_t *, uint8_t *, + size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *); extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t, const void *, uint64_t *, uint64_t *); +#if CAN_USE_GCM_ASM >= 2 +extern void ASMABI aes_gcm_enc_update_vaes_avx2(const uint8_t *in, + uint8_t *out, size_t len, const void *key, const uint8_t ivec[16], + const uint128_t Htable[16], uint8_t Xi[16]); +#endif +typedef size_t ASMABI aesni_gcm_decrypt_impl(const uint8_t *, uint8_t *, + size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *); extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t, const void *, uint64_t *, uint64_t *); +#if CAN_USE_GCM_ASM >= 2 +extern void ASMABI aes_gcm_dec_update_vaes_avx2(const uint8_t *in, + uint8_t *out, size_t len, const void *key, const uint8_t ivec[16], + const uint128_t Htable[16], uint8_t Xi[16]); +#endif + +static inline boolean_t +gcm_avx2_will_work(void) +{ + return (kfpu_allowed() && + zfs_avx2_available() && zfs_vaes_available() && + zfs_vpclmulqdq_available()); +} static inline boolean_t gcm_avx_will_work(void) @@ -1035,33 +1105,67 @@ gcm_avx_will_work(void) } static inline void -gcm_set_avx(boolean_t val) +gcm_use_impl(gcm_impl impl) { - if (gcm_avx_will_work() == B_TRUE) { - atomic_swap_32(&gcm_use_avx, val); + switch (impl) { +#if CAN_USE_GCM_ASM >= 2 + case GCM_IMPL_AVX2: + if (gcm_avx2_will_work() == B_TRUE) { + atomic_swap_32(&gcm_impl_used, impl); + return; + } + + zfs_fallthrough; +#endif + + case GCM_IMPL_AVX: + if (gcm_avx_will_work() == B_TRUE) { + atomic_swap_32(&gcm_impl_used, impl); + return; + } + + zfs_fallthrough; + + default: + atomic_swap_32(&gcm_impl_used, GCM_IMPL_GENERIC); } } static inline boolean_t -gcm_toggle_avx(void) +gcm_impl_will_work(gcm_impl impl) { - if (gcm_avx_will_work() == B_TRUE) { - return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX)); - } else { - return (B_FALSE); + switch (impl) { +#if CAN_USE_GCM_ASM >= 2 + case GCM_IMPL_AVX2: + return (gcm_avx2_will_work()); +#endif + + case GCM_IMPL_AVX: + return (gcm_avx_will_work()); + + default: + return (B_TRUE); } } -static inline size_t -gcm_simd_get_htab_size(boolean_t simd_mode) +static inline gcm_impl +gcm_toggle_impl(void) { - switch (simd_mode) { - case B_TRUE: - return (2 * 6 * 2 * sizeof (uint64_t)); + gcm_impl current_impl, 
new_impl; + do { /* handle races */ + current_impl = atomic_load_32(&gcm_impl_used); + new_impl = current_impl; + while (B_TRUE) { /* handle incompatble implementations */ + new_impl = (new_impl + 1) % GCM_IMPL_MAX; + if (gcm_impl_will_work(new_impl)) { + break; + } + } - default: - return (0); - } + } while (atomic_cas_32(&gcm_impl_used, current_impl, new_impl) != + current_impl); + + return (new_impl); } @@ -1077,6 +1181,50 @@ gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n) ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; } +static size_t aesni_gcm_encrypt_avx(const uint8_t *in, uint8_t *out, + size_t len, const void *key, uint64_t *iv, const uint64_t *Htable, + uint64_t *Xip) +{ + (void) Htable; + return (aesni_gcm_encrypt(in, out, len, key, iv, Xip)); +} + +#if CAN_USE_GCM_ASM >= 2 +// kSizeTWithoutLower4Bits is a mask that can be used to zero the lower four +// bits of a |size_t|. +// This is from boringssl/crypto/fipsmodule/aes/gcm.cc.inc +static const size_t kSizeTWithoutLower4Bits = (size_t)-16; + +/* The following CRYPTO methods are from boringssl/crypto/internal.h */ +static inline uint32_t CRYPTO_bswap4(uint32_t x) { + return (__builtin_bswap32(x)); +} + +static inline uint32_t CRYPTO_load_u32_be(const void *in) { + uint32_t v; + memcpy(&v, in, sizeof (v)); + return (CRYPTO_bswap4(v)); +} + +static inline void CRYPTO_store_u32_be(void *out, uint32_t v) { + v = CRYPTO_bswap4(v); + memcpy(out, &v, sizeof (v)); +} + +static size_t aesni_gcm_encrypt_avx2(const uint8_t *in, uint8_t *out, + size_t len, const void *key, uint64_t *iv, const uint64_t *Htable, + uint64_t *Xip) +{ + uint8_t *ivec = (uint8_t *)iv; + len &= kSizeTWithoutLower4Bits; + aes_gcm_enc_update_vaes_avx2(in, out, len, key, ivec, + (const uint128_t *)Htable, (uint8_t *)Xip); + CRYPTO_store_u32_be(&ivec[12], + CRYPTO_load_u32_be(&ivec[12]) + len / 16); + return (len); +} +#endif /* if CAN_USE_GCM_ASM >= 2 */ + /* * Encrypt multiple blocks of data in GCM mode. * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines @@ -1091,8 +1239,15 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, size_t done = 0; uint8_t *datap = (uint8_t *)data; size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; + aesni_gcm_encrypt_impl *encrypt_blocks = +#if CAN_USE_GCM_ASM >= 2 + ctx->impl == GCM_IMPL_AVX2 ? + aesni_gcm_encrypt_avx2 : +#endif + aesni_gcm_encrypt_avx; const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); uint64_t *ghash = ctx->gcm_ghash; + uint64_t *htable = ctx->gcm_Htable; uint64_t *cb = ctx->gcm_cb; uint8_t *ct_buf = NULL; uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; @@ -1156,8 +1311,8 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, /* Do the bulk encryption in chunk_size blocks. */ for (; bleft >= chunk_size; bleft -= chunk_size) { kfpu_begin(); - done = aesni_gcm_encrypt( - datap, ct_buf, chunk_size, key, cb, ghash); + done = encrypt_blocks( + datap, ct_buf, chunk_size, key, cb, htable, ghash); clear_fpu_regs(); kfpu_end(); @@ -1180,7 +1335,8 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, /* Bulk encrypt the remaining data. 
*/ kfpu_begin(); if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { - done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash); + done = encrypt_blocks(datap, ct_buf, bleft, key, cb, htable, + ghash); if (done == 0) { rv = CRYPTO_FAILED; goto out; @@ -1293,6 +1449,29 @@ gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) return (CRYPTO_SUCCESS); } +static size_t aesni_gcm_decrypt_avx(const uint8_t *in, uint8_t *out, + size_t len, const void *key, uint64_t *iv, const uint64_t *Htable, + uint64_t *Xip) +{ + (void) Htable; + return (aesni_gcm_decrypt(in, out, len, key, iv, Xip)); +} + +#if CAN_USE_GCM_ASM >= 2 +static size_t aesni_gcm_decrypt_avx2(const uint8_t *in, uint8_t *out, + size_t len, const void *key, uint64_t *iv, const uint64_t *Htable, + uint64_t *Xip) +{ + uint8_t *ivec = (uint8_t *)iv; + len &= kSizeTWithoutLower4Bits; + aes_gcm_dec_update_vaes_avx2(in, out, len, key, ivec, + (const uint128_t *)Htable, (uint8_t *)Xip); + CRYPTO_store_u32_be(&ivec[12], + CRYPTO_load_u32_be(&ivec[12]) + len / 16); + return (len); +} +#endif /* if CAN_USE_GCM_ASM >= 2 */ + /* * Finalize decryption: We just have accumulated crypto text, so now we * decrypt it here inplace. @@ -1306,10 +1485,17 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) B_FALSE); size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; + aesni_gcm_decrypt_impl *decrypt_blocks = +#if CAN_USE_GCM_ASM >= 2 + ctx->impl == GCM_IMPL_AVX2 ? + aesni_gcm_decrypt_avx2 : +#endif + aesni_gcm_decrypt_avx; size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; uint8_t *datap = ctx->gcm_pt_buf; const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); uint32_t *cb = (uint32_t *)ctx->gcm_cb; + uint64_t *htable = ctx->gcm_Htable; uint64_t *ghash = ctx->gcm_ghash; uint32_t *tmp = (uint32_t *)ctx->gcm_tmp; int rv = CRYPTO_SUCCESS; @@ -1322,8 +1508,8 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) */ for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) { kfpu_begin(); - done = aesni_gcm_decrypt(datap, datap, chunk_size, - (const void *)key, ctx->gcm_cb, ghash); + done = decrypt_blocks(datap, datap, chunk_size, + (const void *)key, ctx->gcm_cb, htable, ghash); clear_fpu_regs(); kfpu_end(); if (done != chunk_size) { @@ -1334,8 +1520,8 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) /* Decrypt remainder, which is less than chunk size, in one go. */ kfpu_begin(); if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) { - done = aesni_gcm_decrypt(datap, datap, bleft, - (const void *)key, ctx->gcm_cb, ghash); + done = decrypt_blocks(datap, datap, bleft, + (const void *)key, ctx->gcm_cb, htable, ghash); if (done == 0) { clear_fpu_regs(); kfpu_end(); @@ -1424,13 +1610,42 @@ gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len, ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==, B_FALSE); + size_t htab_len = 0; +#if CAN_USE_GCM_ASM >= 2 + if (ctx->impl == GCM_IMPL_AVX2) { + /* + * BoringSSL's API specifies uint128_t[16] for htab; but only + * uint128_t[12] are used. 
+ * See https://github.com/google/boringssl/blob/ + * 813840dd094f9e9c1b00a7368aa25e656554221f1/crypto/fipsmodule/ + * modes/asm/aes-gcm-avx2-x86_64.pl#L198-L200 + */ + htab_len = (2 * 8 * sizeof (uint128_t)); + } else +#endif /* CAN_USE_GCM_ASM >= 2 */ + { + htab_len = (2 * 6 * sizeof (uint128_t)); + } + + ctx->gcm_Htable = kmem_alloc(htab_len, KM_SLEEP); + if (ctx->gcm_Htable == NULL) { + return (CRYPTO_HOST_MEMORY); + } + /* Init H (encrypt zero block) and create the initial counter block. */ memset(H, 0, sizeof (ctx->gcm_H)); kfpu_begin(); aes_encrypt_intel(keysched, aes_rounds, (const uint32_t *)H, (uint32_t *)H); - gcm_init_htab_avx(ctx->gcm_Htable, H); +#if CAN_USE_GCM_ASM >= 2 + if (ctx->impl == GCM_IMPL_AVX2) { + gcm_init_vpclmulqdq_avx2((uint128_t *)ctx->gcm_Htable, H); + } else +#endif /* if CAN_USE_GCM_ASM >= 2 */ + { + gcm_init_htab_avx(ctx->gcm_Htable, H); + } if (iv_len == 12) { memcpy(cb, iv, 12); diff --git a/sys/contrib/openzfs/module/icp/algs/modes/modes.c b/sys/contrib/openzfs/module/icp/algs/modes/modes.c index 343591cd9691..ef3c1806e4b6 100644 --- a/sys/contrib/openzfs/module/icp/algs/modes/modes.c +++ b/sys/contrib/openzfs/module/icp/algs/modes/modes.c @@ -171,7 +171,7 @@ gcm_clear_ctx(gcm_ctx_t *ctx) explicit_memset(ctx->gcm_remainder, 0, sizeof (ctx->gcm_remainder)); explicit_memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H)); #if defined(CAN_USE_GCM_ASM) - if (ctx->gcm_use_avx == B_TRUE) { + if (ctx->impl != GCM_IMPL_GENERIC) { ASSERT3P(ctx->gcm_Htable, !=, NULL); explicit_memset(ctx->gcm_Htable, 0, ctx->gcm_htab_len); kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len); diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c index 6d3bcca9f995..dcb0a391dda4 100644 --- a/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c +++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c @@ -38,11 +38,14 @@ kfpu_begin(); E(s, d, b); kfpu_end(); \ } +#if defined(__x86_64) || defined(__aarch64__) || defined(__arm__) || \ + defined(__PPC64__) /* some implementation is always okay */ static inline boolean_t sha2_is_supported(void) { return (B_TRUE); } +#endif #if defined(__x86_64) diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c index d0fcca798fa9..ad707341eec7 100644 --- a/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c +++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c @@ -77,7 +77,8 @@ static const uint32_t SHA256_K[64] = { h = g, g = f, f = e, e = d + T1; \ d = c, c = b, b = a, a = T1 + T2; -static void sha256_generic(uint32_t state[8], const void *data, size_t num_blks) +static void +icp_sha256_generic(uint32_t state[8], const void *data, size_t num_blks) { uint64_t blk; @@ -173,7 +174,8 @@ static const uint64_t SHA512_K[80] = { 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 }; -static void sha512_generic(uint64_t state[8], const void *data, size_t num_blks) +static void +icp_sha512_generic(uint64_t state[8], const void *data, size_t num_blks) { uint64_t blk; @@ -226,7 +228,8 @@ static void sha512_generic(uint64_t state[8], const void *data, size_t num_blks) } } -static void sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len) +static void +icp_sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len) { uint64_t pos = ctx->count[0]; uint64_t total = ctx->count[1]; @@ -258,7 +261,8 @@ static void sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len) ctx->count[1] = total; } -static void 
sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len) +static void +icp_sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len) { uint64_t pos = ctx->count[0]; uint64_t total = ctx->count[1]; @@ -290,7 +294,8 @@ static void sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len) ctx->count[1] = total; } -static void sha256_final(sha256_ctx *ctx, uint8_t *result, int bits) +static void +icp_sha256_final(sha256_ctx *ctx, uint8_t *result, int bits) { uint64_t mlen, pos = ctx->count[0]; uint8_t *m = ctx->wbuf; @@ -334,7 +339,8 @@ static void sha256_final(sha256_ctx *ctx, uint8_t *result, int bits) memset(ctx, 0, sizeof (*ctx)); } -static void sha512_final(sha512_ctx *ctx, uint8_t *result, int bits) +static void +icp_sha512_final(sha512_ctx *ctx, uint8_t *result, int bits) { uint64_t mlen, pos = ctx->count[0]; uint8_t *m = ctx->wbuf, *r; @@ -461,14 +467,14 @@ SHA2Update(SHA2_CTX *ctx, const void *data, size_t len) switch (ctx->algotype) { case SHA256: - sha256_update(&ctx->sha256, data, len); + icp_sha256_update(&ctx->sha256, data, len); break; case SHA512: case SHA512_HMAC_MECH_INFO_TYPE: - sha512_update(&ctx->sha512, data, len); + icp_sha512_update(&ctx->sha512, data, len); break; case SHA512_256: - sha512_update(&ctx->sha512, data, len); + icp_sha512_update(&ctx->sha512, data, len); break; } } @@ -479,32 +485,33 @@ SHA2Final(void *digest, SHA2_CTX *ctx) { switch (ctx->algotype) { case SHA256: - sha256_final(&ctx->sha256, digest, 256); + icp_sha256_final(&ctx->sha256, digest, 256); break; case SHA512: case SHA512_HMAC_MECH_INFO_TYPE: - sha512_final(&ctx->sha512, digest, 512); + icp_sha512_final(&ctx->sha512, digest, 512); break; case SHA512_256: - sha512_final(&ctx->sha512, digest, 256); + icp_sha512_final(&ctx->sha512, digest, 256); break; } } /* the generic implementation is always okay */ -static boolean_t sha2_is_supported(void) +static boolean_t +icp_sha2_is_supported(void) { return (B_TRUE); } const sha256_ops_t sha256_generic_impl = { .name = "generic", - .transform = sha256_generic, - .is_supported = sha2_is_supported + .transform = icp_sha256_generic, + .is_supported = icp_sha2_is_supported }; const sha512_ops_t sha512_generic_impl = { .name = "generic", - .transform = sha512_generic, - .is_supported = sha2_is_supported + .transform = icp_sha512_generic, + .is_supported = icp_sha2_is_supported }; diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c index 2efd9fcf4c99..a85a71a83df4 100644 --- a/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c +++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c @@ -38,11 +38,14 @@ kfpu_begin(); E(s, d, b); kfpu_end(); \ } +#if defined(__x86_64) || defined(__aarch64__) || defined(__arm__) || \ + defined(__aarch64__) || defined(__arm__) || defined(__PPC64__) /* some implementation is always okay */ static inline boolean_t sha2_is_supported(void) { return (B_TRUE); } +#endif #if defined(__x86_64) diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl new file mode 100644 index 000000000000..04c03a37e0cb --- /dev/null +++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl @@ -0,0 +1,253 @@ +BoringSSL is a fork of OpenSSL. As such, large parts of it fall under OpenSSL +licensing. Files that are completely new have a Google copyright and an ISC +license. This license is reproduced at the bottom of this file. 
+ +Contributors to BoringSSL are required to follow the CLA rules for Chromium: +https://cla.developers.google.com/clas + +Files in third_party/ have their own licenses, as described therein. The MIT +license, for third_party/fiat, which, unlike other third_party directories, is +compiled into non-test libraries, is included below. + +The OpenSSL toolkit stays under a dual license, i.e. both the conditions of the +OpenSSL License and the original SSLeay license apply to the toolkit. See below +for the actual license texts. Actually both licenses are BSD-style Open Source +licenses. In case of any license issues related to OpenSSL please contact +openssl-core@openssl.org. + +The following are Google-internal bug numbers where explicit permission from +some authors is recorded for use of their work. (This is purely for our own +record keeping.) + 27287199 + 27287880 + 27287883 + 263291445 + + + OpenSSL License + --------------- + +/* ==================================================================== + * Copyright (c) 1998-2011 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). 
This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + + Original SSLeay License + ----------------------- + +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] 
+ */ + + +ISC license used for completely new code in BoringSSL: + +/* Copyright 2015 The BoringSSL Authors + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + + +The code in third_party/fiat carries the MIT license: + +Copyright (c) 2015-2016 the fiat-crypto authors (see +https://github.com/mit-plv/fiat-crypto/blob/master/AUTHORS). + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + +Licenses for support code +------------------------- + +Parts of the TLS test suite are under the Go license. This code is not included +in BoringSSL (i.e. libcrypto and libssl) when compiled, however, so +distributing code linked against BoringSSL does not trigger this license: + +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +BoringSSL uses the Chromium test infrastructure to run a continuous build, +trybots etc. The scripts which manage this, and the script for generating build +metadata, are under the Chromium license. Distributing code linked against +BoringSSL does not trigger this license. + +Copyright 2015 The Chromium Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl.descrip b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl.descrip new file mode 100644 index 000000000000..f63a67a4d2ae --- /dev/null +++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl.descrip @@ -0,0 +1 @@ +PORTIONS OF AES GCM and GHASH FUNCTIONALITY diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-avx2-vaes.S b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-avx2-vaes.S new file mode 100644 index 000000000000..3d1b045127e2 --- /dev/null +++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-avx2-vaes.S @@ -0,0 +1,1323 @@ +// SPDX-License-Identifier: Apache-2.0 +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#if defined(__x86_64__) && defined(HAVE_AVX) && \ + defined(HAVE_VAES) && defined(HAVE_VPCLMULQDQ) + +#define _ASM +#include <sys/asm_linkage.h> + +/* Windows userland links with OpenSSL */ +#if !defined (_WIN32) || defined (_KERNEL) + +.section .rodata +.balign 16 + + +.Lbswap_mask: +.quad 0x08090a0b0c0d0e0f, 0x0001020304050607 + + + + + + + + +.Lgfpoly: +.quad 1, 0xc200000000000000 + + +.Lgfpoly_and_internal_carrybit: +.quad 1, 0xc200000000000001 + +.balign 32 + +.Lctr_pattern: +.quad 0, 0 +.quad 1, 0 +.Linc_2blocks: +.quad 2, 0 +.quad 2, 0 + +ENTRY_ALIGN(gcm_init_vpclmulqdq_avx2, 32) +.cfi_startproc + +ENDBR + + + + + + vmovdqu (%rsi),%xmm3 + // KCF/ICP stores H in network byte order with the hi qword first + // so we need to swap all bytes, not the 2 qwords. + vmovdqu .Lbswap_mask(%rip),%xmm4 + vpshufb %xmm4,%xmm3,%xmm3 + + + + + + vpshufd $0xd3,%xmm3,%xmm0 + vpsrad $31,%xmm0,%xmm0 + vpaddq %xmm3,%xmm3,%xmm3 + vpand .Lgfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + + vbroadcasti128 .Lgfpoly(%rip),%ymm6 + + + vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0 + vpclmulqdq $0x11,%xmm3,%xmm3,%xmm5 + vpclmulqdq $0x01,%xmm0,%xmm6,%xmm1 + vpshufd $0x4e,%xmm0,%xmm0 + vpxor %xmm0,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm6,%xmm0 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm5,%xmm5 + vpxor %xmm0,%xmm5,%xmm5 + + + + vinserti128 $1,%xmm3,%ymm5,%ymm3 + vinserti128 $1,%xmm5,%ymm5,%ymm5 + + + vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + + + vmovdqu %ymm3,96(%rdi) + vmovdqu %ymm4,64(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128+32(%rdi) + + + vpclmulqdq $0x00,%ymm5,%ymm4,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm4,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm4,%ymm3 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm3,%ymm3 + vpxor %ymm0,%ymm3,%ymm3 + + vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,0(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128(%rdi) + + vzeroupper + RET + +.cfi_endproc +SET_SIZE(gcm_init_vpclmulqdq_avx2) +ENTRY_ALIGN(gcm_gmult_vpclmulqdq_avx2, 32) +.cfi_startproc + +ENDBR + + + + vmovdqu (%rdi),%xmm0 + vmovdqu .Lbswap_mask(%rip),%xmm1 + vmovdqu 128-16(%rsi),%xmm2 + vmovdqu .Lgfpoly(%rip),%xmm3 + vpshufb %xmm1,%xmm0,%xmm0 + + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm4 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm5 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm4,%xmm3,%xmm6 + vpshufd $0x4e,%xmm4,%xmm4 + vpxor %xmm4,%xmm5,%xmm5 + vpxor %xmm6,%xmm5,%xmm5 + 
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x01,%xmm5,%xmm3,%xmm4 + vpshufd $0x4e,%xmm5,%xmm5 + vpxor %xmm5,%xmm0,%xmm0 + vpxor %xmm4,%xmm0,%xmm0 + + + vpshufb %xmm1,%xmm0,%xmm0 + vmovdqu %xmm0,(%rdi) + + + RET + +.cfi_endproc +SET_SIZE(gcm_gmult_vpclmulqdq_avx2) +ENTRY_ALIGN(gcm_ghash_vpclmulqdq_avx2, 32) +.cfi_startproc + +ENDBR + + + + + + + vmovdqu .Lbswap_mask(%rip),%xmm6 + vmovdqu .Lgfpoly(%rip),%xmm7 + + + vmovdqu (%rdi),%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + + + cmpq $32,%rcx + jb .Lghash_lastblock + + + + vinserti128 $1,%xmm6,%ymm6,%ymm6 + vinserti128 $1,%xmm7,%ymm7,%ymm7 + + cmpq $127,%rcx + jbe .Lghash_loop_1x + + + vmovdqu 128(%rsi),%ymm8 + vmovdqu 128+32(%rsi),%ymm9 +.Lghash_loop_4x: + + vmovdqu 0(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 0(%rsi),%ymm2 + vpxor %ymm5,%ymm1,%ymm1 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x00,%ymm8,%ymm0,%ymm4 + + vmovdqu 32(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 32(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x10,%ymm8,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu 64(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 64(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x00,%ymm9,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + + vmovdqu 96(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 96(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x10,%ymm9,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + vpxor %ymm3,%ymm4,%ymm4 + vpxor %ymm5,%ymm4,%ymm4 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm3,%ymm2,%ymm0 + vpshufd $0x4e,%ymm3,%ymm3 + vpxor %ymm3,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vpclmulqdq $0x01,%ymm4,%ymm2,%ymm0 + vpshufd $0x4e,%ymm4,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm0,%ymm5,%ymm5 + vextracti128 $1,%ymm5,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + + subq $-128,%rdx + addq $-128,%rcx + cmpq $127,%rcx + ja .Lghash_loop_4x + + + cmpq $32,%rcx + jb .Lghash_loop_1x_done +.Lghash_loop_1x: + vmovdqu (%rdx),%ymm0 + vpshufb %ymm6,%ymm0,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vmovdqu 128-32(%rsi),%ymm0 + vpclmulqdq $0x00,%ymm0,%ymm5,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm5,%ymm2 + vpclmulqdq $0x10,%ymm0,%ymm5,%ymm3 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x01,%ymm1,%ymm7,%ymm3 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm2,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x11,%ymm0,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm7,%ymm1 + vpshufd $0x4e,%ymm2,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm1,%ymm5,%ymm5 + + vextracti128 $1,%ymm5,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + addq $32,%rdx + subq $32,%rcx + cmpq $32,%rcx + jae .Lghash_loop_1x +.Lghash_loop_1x_done: + + +.Lghash_lastblock: + testq %rcx,%rcx + jz .Lghash_done + vmovdqu (%rdx),%xmm0 + vpshufb %xmm6,%xmm0,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + vmovdqu 128-16(%rsi),%xmm0 + vpclmulqdq $0x00,%xmm0,%xmm5,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2 + vpclmulqdq $0x10,%xmm0,%xmm5,%xmm3 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x01,%xmm1,%xmm7,%xmm3 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm2,%xmm2 + 
vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm2,%xmm7,%xmm1 + vpshufd $0x4e,%xmm2,%xmm2 + vpxor %xmm2,%xmm5,%xmm5 + vpxor %xmm1,%xmm5,%xmm5 + + +.Lghash_done: + + vpshufb %xmm6,%xmm5,%xmm5 + vmovdqu %xmm5,(%rdi) + + vzeroupper + RET + +.cfi_endproc +SET_SIZE(gcm_ghash_vpclmulqdq_avx2) +ENTRY_ALIGN(aes_gcm_enc_update_vaes_avx2, 32) +.cfi_startproc + +ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + movq 16(%rsp),%r12 +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+6(%rip) +#endif + vbroadcasti128 .Lbswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 504(%rcx),%r10d // ICP has a larger offset for rounds. + leal -24(,%r10,4),%r10d // ICP uses 10,12,14 not 9,11,13 for rounds. + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe .Lcrypt_loop_4x_done__func1 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + leaq 16(%rcx),%rax +.Lvaesenc_loop_first_4_vecs__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_first_4_vecs__func1 + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + addq $-128,%rdx + cmpq $127,%rdx + jbe .Lghash_last_ciphertext_4x__func1 +.balign 16 +.Lcrypt_loop_4x__func1: + + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl .Laes128__func1 + je .Laes192__func1 + + vbroadcasti128 -208(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -192(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes192__func1: + vbroadcasti128 -176(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -160(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes128__func1: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rsi),%ymm3 + vpshufb 
%ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vbroadcasti128 -144(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vbroadcasti128 -128(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + subq $-128,%rsi + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + + addq $-128,%rdx + cmpq $127,%rdx + ja .Lcrypt_loop_4x__func1 +.Lghash_last_ciphertext_4x__func1: + + vmovdqu 0(%rsi),%ymm3 + 
vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + subq $-128,%rsi +.Lcrypt_loop_4x_done__func1: + + testq %rdx,%rdx + jz .Ldone__func1 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb .Llessthan64bytes__func1 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_1__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_1__func1 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %ymm0,%ymm13,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7 + vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz .Lreduce__func1 + + vpxor %xmm1,%xmm1,%xmm1 + + +.Llessthan64bytes__func1: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_2__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq 
$16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_2__func1 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + + + cmpq $32,%rdx + jb .Lxor_one_block__func1 + je .Lxor_two_blocks__func1 + +.Lxor_three_blocks__func1: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp .Lghash_mul_one_vec_unreduced__func1 + +.Lxor_two_blocks__func1: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp .Lghash_mul_one_vec_unreduced__func1 + +.Lxor_one_block__func1: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +.Lghash_mul_one_vec_unreduced__func1: + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + +.Lreduce__func1: + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +.Ldone__func1: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + RET + +.cfi_endproc +SET_SIZE(aes_gcm_enc_update_vaes_avx2) +ENTRY_ALIGN(aes_gcm_dec_update_vaes_avx2, 32) +.cfi_startproc + +ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + movq 16(%rsp),%r12 + vbroadcasti128 .Lbswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 504(%rcx),%r10d // ICP has a larger offset for rounds. + leal -24(,%r10,4),%r10d // ICP uses 10,12,14 not 9,11,13 for rounds. 
+ + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe .Lcrypt_loop_4x_done__func2 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 +.balign 16 +.Lcrypt_loop_4x__func2: + + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl .Laes128__func2 + je .Laes192__func2 + + vbroadcasti128 -208(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -192(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes192__func2: + vbroadcasti128 -176(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -160(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes128__func2: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vbroadcasti128 -144(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vbroadcasti128 -128(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 32(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 64(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq 
%ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + subq $-128,%rsi + addq $-128,%rdx + cmpq $127,%rdx + ja .Lcrypt_loop_4x__func2 +.Lcrypt_loop_4x_done__func2: + + testq %rdx,%rdx + jz .Ldone__func2 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb .Llessthan64bytes__func2 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_1__func2: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_1__func2 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %ymm0,%ymm3,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7 + vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz .Lreduce__func2 + + vpxor %xmm1,%xmm1,%xmm1 + + +.Llessthan64bytes__func2: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_2__func2: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_2__func2 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + + + cmpq $32,%rdx + jb .Lxor_one_block__func2 + je 
.Lxor_two_blocks__func2 + +.Lxor_three_blocks__func2: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %xmm0,%xmm3,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp .Lghash_mul_one_vec_unreduced__func2 + +.Lxor_two_blocks__func2: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm2,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp .Lghash_mul_one_vec_unreduced__func2 + +.Lxor_one_block__func2: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm2,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +.Lghash_mul_one_vec_unreduced__func2: + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + +.Lreduce__func2: + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +.Ldone__func2: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + RET + +.cfi_endproc +SET_SIZE(aes_gcm_dec_update_vaes_avx2) + +#endif /* !_WIN32 || _KERNEL */ + +/* Mark the stack non-executable. */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */ diff --git a/sys/contrib/openzfs/module/icp/include/modes/modes.h b/sys/contrib/openzfs/module/icp/include/modes/modes.h index ca734cf4f045..de11d9eafafb 100644 --- a/sys/contrib/openzfs/module/icp/include/modes/modes.h +++ b/sys/contrib/openzfs/module/icp/include/modes/modes.h @@ -42,7 +42,7 @@ extern "C" { */ #if defined(__x86_64__) && defined(HAVE_AVX) && \ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) -#define CAN_USE_GCM_ASM +#define CAN_USE_GCM_ASM (HAVE_VAES && HAVE_VPCLMULQDQ ? 2 : 1) extern boolean_t gcm_avx_can_use_movbe; #endif @@ -129,6 +129,15 @@ typedef struct ccm_ctx { #define ccm_copy_to ccm_common.cc_copy_to #define ccm_flags ccm_common.cc_flags +#ifdef CAN_USE_GCM_ASM +typedef enum gcm_impl { + GCM_IMPL_GENERIC = 0, + GCM_IMPL_AVX, + GCM_IMPL_AVX2, + GCM_IMPL_MAX, +} gcm_impl; +#endif + /* * gcm_tag_len: Length of authentication tag. 
* @@ -174,7 +183,7 @@ typedef struct gcm_ctx { uint64_t gcm_len_a_len_c[2]; uint8_t *gcm_pt_buf; #ifdef CAN_USE_GCM_ASM - boolean_t gcm_use_avx; + enum gcm_impl impl; #endif } gcm_ctx_t; diff --git a/sys/contrib/openzfs/module/icp/spi/kcf_spi.c b/sys/contrib/openzfs/module/icp/spi/kcf_spi.c index 806c0b028017..35fe55b2595d 100644 --- a/sys/contrib/openzfs/module/icp/spi/kcf_spi.c +++ b/sys/contrib/openzfs/module/icp/spi/kcf_spi.c @@ -31,7 +31,6 @@ */ -#include <sys/zfs_context.h> #include <sys/crypto/common.h> #include <sys/crypto/impl.h> #include <sys/crypto/sched_impl.h> diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c index 54d4029c5e6f..b92be3710f3c 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c @@ -238,7 +238,7 @@ zfs_uio_iov_step(struct iovec v, zfs_uio_t *uio, int *numpages) zfs_uio_rw(uio), &uio->uio_dio.pages[uio->uio_dio.npages]); if (res != n) - return (SET_ERROR(EFAULT)); + return (EFAULT); ASSERT3U(len, ==, res * PAGE_SIZE); *numpages = res; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c index 26cc7981bfcd..1990ec677d37 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c @@ -76,7 +76,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, return (0); err = dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp); + FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); @@ -147,7 +147,8 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, ASSERT3S(last_size, <=, PAGE_SIZE); err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex), - IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp); + IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp, + DMU_READ_PREFETCH); if (err != 0) return (err); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c index c114db14a916..b218c0da8125 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c @@ -112,7 +112,6 @@ static int zfs__fini(void); static void zfs_shutdown(void *, int); static eventhandler_tag zfs_shutdown_event_tag; -static eventhandler_tag zfs_mountroot_event_tag; #define ZFS_MIN_KSTACK_PAGES 4 @@ -311,9 +310,6 @@ zfs_modevent(module_t mod, int type, void *unused __unused) zfs_shutdown_event_tag = EVENTHANDLER_REGISTER( shutdown_post_sync, zfs_shutdown, NULL, SHUTDOWN_PRI_FIRST); - zfs_mountroot_event_tag = EVENTHANDLER_REGISTER( - mountroot, spa_boot_init, NULL, - SI_ORDER_ANY); } return (err); case MOD_UNLOAD: @@ -322,9 +318,6 @@ zfs_modevent(module_t mod, int type, void *unused __unused) if (zfs_shutdown_event_tag != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, zfs_shutdown_event_tag); - if (zfs_mountroot_event_tag != NULL) - EVENTHANDLER_DEREGISTER(mountroot, - zfs_mountroot_event_tag); } return (err); case MOD_SHUTDOWN: diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/spa_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/spa_os.c index 2d04ccf95fbf..d918b26521a7 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/spa_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/spa_os.c @@ -193,7 +193,7 @@ spa_import_rootpool(const char *name, bool checkpointrewind) */ config = 
spa_generate_rootconf(name); - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); if (config != NULL) { pname = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME); VERIFY0(strcmp(name, pname)); @@ -204,7 +204,7 @@ spa_import_rootpool(const char *name, bool checkpointrewind) * e.g., after reboot -r. */ if (spa->spa_state == POOL_STATE_ACTIVE) { - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); fnvlist_free(config); return (0); } @@ -226,7 +226,7 @@ spa_import_rootpool(const char *name, bool checkpointrewind) &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; } else if ((spa = spa_lookup(name)) == NULL) { - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); fnvlist_free(config); cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", name); @@ -249,7 +249,7 @@ spa_import_rootpool(const char *name, bool checkpointrewind) VDEV_ALLOC_ROOTPOOL); spa_config_exit(spa, SCL_ALL, FTAG); if (error) { - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); fnvlist_free(config); cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", name); @@ -259,7 +259,7 @@ spa_import_rootpool(const char *name, bool checkpointrewind) spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_free(rvd); spa_config_exit(spa, SCL_ALL, FTAG); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); fnvlist_free(config); return (0); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c index ace2360c032d..ebc2c0eeb6d2 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c @@ -163,6 +163,13 @@ param_set_arc_int(SYSCTL_HANDLER_ARGS) return (0); } +static void +warn_deprecated_sysctl(const char *old, const char *new) +{ + printf("WARNING: sysctl vfs.zfs.%s is deprecated. 
Use vfs.zfs.%s instead.\n", + old, new); +} + int param_set_arc_max(SYSCTL_HANDLER_ARGS) { @@ -185,12 +192,15 @@ param_set_arc_max(SYSCTL_HANDLER_ARGS) if (val != 0) zfs_arc_max = arc_c_max; + if (arg2 != 0) + warn_deprecated_sysctl("arc_max", "arc.max"); + return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - NULL, 0, param_set_arc_max, "LU", + NULL, 1, param_set_arc_max, "LU", "Maximum ARC size in bytes (LEGACY)"); int @@ -214,12 +224,15 @@ param_set_arc_min(SYSCTL_HANDLER_ARGS) if (val != 0) zfs_arc_min = arc_c_min; + if (arg2 != 0) + warn_deprecated_sysctl("arc_min", "arc.min"); + return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - NULL, 0, param_set_arc_min, "LU", + NULL, 1, param_set_arc_min, "LU", "Minimum ARC size in bytes (LEGACY)"); extern uint_t zfs_arc_free_target; @@ -242,6 +255,9 @@ param_set_arc_free_target(SYSCTL_HANDLER_ARGS) zfs_arc_free_target = val; + if (arg2 != 0) + warn_deprecated_sysctl("arc_free_target", "arc.free_target"); + return (0); } @@ -251,7 +267,7 @@ param_set_arc_free_target(SYSCTL_HANDLER_ARGS) */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, - NULL, 0, param_set_arc_free_target, "IU", + NULL, 1, param_set_arc_free_target, "IU", "Desired number of free pages below which ARC triggers reclaim" " (LEGACY)"); @@ -270,12 +286,15 @@ param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) arc_no_grow_shift = val; + if (arg2 != 0) + warn_deprecated_sysctl("arc_no_grow_shift", "arc.no_grow_shift"); + return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - NULL, 0, param_set_arc_no_grow_shift, "I", + NULL, 1, param_set_arc_no_grow_shift, "I", "log2(fraction of ARC which must be free to allow growing) (LEGACY)"); extern uint64_t l2arc_write_max; @@ -746,12 +765,15 @@ param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS) zfs_vdev_min_auto_ashift = val; + if (arg2 != 0) + warn_deprecated_sysctl("min_auto_ashift", + "vdev.min_auto_ashift"); + return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, - CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift), + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1, param_set_min_auto_ashift, "IU", "Min ashift used when creating new top-level vdev. (LEGACY)"); @@ -771,12 +793,15 @@ param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) zfs_vdev_max_auto_ashift = val; + if (arg2 != 0) + warn_deprecated_sysctl("max_auto_ashift", + "vdev.max_auto_ashift"); + return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, - CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift), + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1, param_set_max_auto_ashift, "IU", "Max ashift used when optimizing for logical -> physical sector size on" " new top-level vdevs. 
(LEGACY)"); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c index 11e93b800a54..9663f05cb354 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c @@ -42,7 +42,8 @@ vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size) spa_t *spa = vd->vdev_spa; zio_t *zio; abd_t *pad2; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_TRYHARD; int error; if (size > VDEV_PAD_SIZE) @@ -59,16 +60,11 @@ vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size) abd_copy_from_buf(pad2, buf, size); abd_zero_off(pad2, size, VDEV_PAD_SIZE - size); -retry: zio = zio_root(spa, NULL, NULL, flags); vdev_label_write(zio, vd, 0, pad2, offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, NULL, NULL, flags); error = zio_wait(zio); - if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { - flags |= ZIO_FLAG_TRYHARD; - goto retry; - } abd_free(pad2); return (error); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c index b15a3e6e38c0..c98ccd756405 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c @@ -1175,7 +1175,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) int count = 0; zfs_acl_phys_t acl_phys; - if (zp->z_zfsvfs->z_replay == B_FALSE) { + if (ZTOV(zp) != NULL && zp->z_zfsvfs->z_replay == B_FALSE) { ASSERT_VOP_IN_SEQC(ZTOV(zp)); } @@ -1262,7 +1262,8 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) if (aclnode->z_ace_count == 0) continue; dmu_write(zfsvfs->z_os, aoid, off, - aclnode->z_size, aclnode->z_acldata, tx); + aclnode->z_size, aclnode->z_acldata, tx, + DMU_READ_NO_PREFETCH); off += aclnode->z_size; } } else { diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c index 61d0bb26d1e5..d0a9c662e6f0 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c @@ -494,7 +494,7 @@ zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) vap->va_uid = 0; vap->va_gid = 0; - vap->va_rdev = 0; + vap->va_rdev = NODEV; /* * We are a purely virtual object, so we have no * blocksize or allocated blocks. @@ -688,6 +688,8 @@ zfsctl_root_readdir(struct vop_readdir_args *ap) * count to return is 0. 
*/ if (zfs_uio_offset(&uio) == 3 * sizeof (entry)) { + if (eofp != NULL) + *eofp = 1; return (0); } @@ -760,8 +762,7 @@ zfsctl_common_pathconf(struct vop_pathconf_args *ap) return (0); case _PC_MIN_HOLE_SIZE: - *ap->a_retval = (int)SPA_MINBLOCKSIZE; - return (0); + return (EINVAL); case _PC_ACL_EXTENDED: *ap->a_retval = 0; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c index 21e5f7938f9f..ca13569a1235 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c @@ -164,8 +164,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off, - ssize_t *resid) + uint8_t ashift, ssize_t *resid) { + (void) ashift; return (zfs_file_write_impl(fp, buf, count, &off, resid)); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c index dcdefae56639..29711fcf5d2c 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c @@ -108,11 +108,11 @@ zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) "command", &command) != 0) return (EINVAL); - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); spa = spa_by_guid(pool_guid, vdev_guid); if (spa != NULL) strcpy(name, spa_name(spa)); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); if (spa == NULL) return (ENOENT); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c index 1813c411b013..8a9d23d0d554 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c @@ -61,6 +61,7 @@ #include <sys/fs/zfs.h> #include <sys/dmu.h> #include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> #include <sys/spa.h> #include <sys/txg.h> #include <sys/dbuf.h> @@ -277,7 +278,7 @@ zfs_ioctl_getxattr(vnode_t *vp, zfsxattr_t *fsx) memset(fsx, 0, sizeof (*fsx)); fsx->fsx_xflags = (zp->z_pflags & ZFS_PROJINHERIT) ? 
- ZFS_PROJINHERIT_FL : 0; + FS_PROJINHERIT_FL : 0; fsx->fsx_projid = zp->z_projid; return (0); @@ -289,7 +290,7 @@ zfs_ioctl_setflags(vnode_t *vp, uint32_t ioctl_flags, xvattr_t *xva) uint64_t zfs_flags = VTOZ(vp)->z_pflags; xoptattr_t *xoap; - if (ioctl_flags & ~(ZFS_PROJINHERIT_FL)) + if (ioctl_flags & ~(FS_PROJINHERIT_FL)) return (SET_ERROR(EOPNOTSUPP)); xva_init(xva); @@ -303,7 +304,7 @@ zfs_ioctl_setflags(vnode_t *vp, uint32_t ioctl_flags, xvattr_t *xva) } \ } while (0) - FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT, + FLAG_CHANGE(FS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT, xoap->xoa_projinherit); #undef FLAG_CHANGE @@ -388,7 +389,9 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred, error = vn_lock(vp, LK_EXCLUSIVE); if (error) return (error); + vn_seqc_write_begin(vp); error = zfs_ioctl_setxattr(vp, fsx, cred); + vn_seqc_write_end(vp); VOP_UNLOCK(vp); return (error); } @@ -1735,7 +1738,7 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, /* * Quit if directory has been removed (posix) */ - if ((*eofp = zp->z_unlinked) != 0) { + if ((*eofp = (zp->z_unlinked != 0)) != 0) { zfs_exit(zfsvfs, FTAG); return (0); } @@ -2013,7 +2016,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) if (vp->v_type == VBLK || vp->v_type == VCHR) vap->va_rdev = zfs_cmpldev(rdev); else - vap->va_rdev = 0; + vap->va_rdev = NODEV; vap->va_gen = zp->z_gen; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ vap->va_filerev = zp->z_seq; @@ -2203,6 +2206,7 @@ zfs_setattr_dir(znode_t *dzp) if (err) break; + vn_seqc_write_begin(ZTOV(zp)); mutex_enter(&dzp->z_lock); if (zp->z_uid != dzp->z_uid) { @@ -2252,6 +2256,7 @@ sa_add_projid_err: dmu_tx_abort(tx); } tx = NULL; + vn_seqc_write_end(ZTOV(zp)); if (err != 0 && err != ENOENT) break; @@ -4111,6 +4116,7 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, { znode_t *zp; zfsvfs_t *zfsvfs; + uint_t blksize, iosize; int error; switch (cmd) { @@ -4122,8 +4128,20 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, *valp = 64; return (0); case _PC_MIN_HOLE_SIZE: - *valp = (int)SPA_MINBLOCKSIZE; - return (0); + iosize = vp->v_mount->mnt_stat.f_iosize; + if (vp->v_type == VREG) { + zp = VTOZ(vp); + blksize = zp->z_blksz; + if (zp->z_size <= blksize) + blksize = MAX(blksize, iosize); + *valp = (int)blksize; + return (0); + } + if (vp->v_type == VDIR) { + *valp = (int)iosize; + return (0); + } + return (EINVAL); case _PC_ACL_EXTENDED: #if 0 /* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */ zp = VTOZ(vp); @@ -4205,8 +4223,20 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, zfs_vmobject_wlock(object); (void) vm_page_grab_pages(object, OFF_TO_IDX(start), - VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_ZERO, + VM_ALLOC_NORMAL | VM_ALLOC_WAITOK, ma, count); + if (!vm_page_all_valid(ma[count - 1])) { + /* + * Later in this function, we copy DMU data to + * invalid pages only. The last page may not be + * entirely filled though, if the file does not + * end on a page boundary. Therefore, we zero + * that last page here to make sure it does not + * contain garbage after the end of file. 
+ */ + ASSERT(vm_page_none_valid(ma[count - 1])); + vm_page_zero_invalid(ma[count - 1], FALSE); + } zfs_vmobject_wunlock(object); } if (blksz == zp->z_blksz) @@ -4449,7 +4479,8 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) { tocopy = MIN(PAGE_SIZE, wlen); va = zfs_map_page(ma[i], &sf); - dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va, tx); + dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va, tx, + DMU_READ_PREFETCH); zfs_unmap_page(sf); } } else { @@ -5727,6 +5758,9 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap) { ulong_t val; int error; +#if defined(_PC_CLONE_BLKSIZE) || defined(_PC_CASE_INSENSITIVE) + zfsvfs_t *zfsvfs; +#endif error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); @@ -5773,6 +5807,30 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap) *ap->a_retval = 1; return (0); #endif +#ifdef _PC_CLONE_BLKSIZE + case _PC_CLONE_BLKSIZE: + zfsvfs = (zfsvfs_t *)ap->a_vp->v_mount->mnt_data; + if (zfs_bclone_enabled && + spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os), + SPA_FEATURE_BLOCK_CLONING)) + *ap->a_retval = dsl_dataset_feature_is_active( + zfsvfs->z_os->os_dsl_dataset, + SPA_FEATURE_LARGE_BLOCKS) ? + SPA_MAXBLOCKSIZE : + SPA_OLD_MAXBLOCKSIZE; + else + *ap->a_retval = 0; + return (0); +#endif +#ifdef _PC_CASE_INSENSITIVE + case _PC_CASE_INSENSITIVE: + zfsvfs = (zfsvfs_t *)ap->a_vp->v_mount->mnt_data; + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) + *ap->a_retval = 1; + else + *ap->a_retval = 0; + return (0); +#endif default: return (vop_stdpathconf(ap)); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c index 7cd0a153577c..649022ab5bcb 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c @@ -817,6 +817,10 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, (*zpp)->z_dnodesize = dnodesize; (*zpp)->z_projid = projid; + vnode_t *vp = ZTOV(*zpp); + if (!(flag & IS_ROOT_NODE)) + vn_seqc_write_begin(vp); + if (vap->va_mask & AT_XVATTR) zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); @@ -825,7 +829,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } if (!(flag & IS_ROOT_NODE)) { - vnode_t *vp = ZTOV(*zpp); + vn_seqc_write_end(vp); vp->v_vflag |= VV_FORCEINSMQ; int err = insmntque(vp, zfsvfs->z_vfs); vp->v_vflag &= ~VV_FORCEINSMQ; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c index 91cf38016e00..8562c42b3220 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c @@ -437,6 +437,7 @@ zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + memset(&cuio_s, 0, sizeof (cuio_s)); zfs_uio_init(&cuio, &cuio_s); keydata_len = zio_crypt_table[crypt].ci_keylen; @@ -519,6 +520,7 @@ zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, keydata_len = zio_crypt_table[crypt].ci_keylen; rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + memset(&cuio_s, 0, sizeof (cuio_s)); zfs_uio_init(&cuio, &cuio_s); /* diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c index 265dfd55fc4d..dc30f6dd939c 100644 --- 
a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c @@ -31,7 +31,7 @@ * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, 2025, Klara, Inc. */ /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */ @@ -183,6 +183,7 @@ static struct filterops zvol_filterops_vnode = { .f_isfd = 1, .f_detach = zvol_filter_detach, .f_event = zvol_filter_vnode, + .f_copy = knote_triv_copy, }; extern uint_t zfs_geom_probe_vdev_key; @@ -196,7 +197,6 @@ DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); static int zvol_geom_open(struct g_provider *pp, int flag, int count); static int zvol_geom_close(struct g_provider *pp, int flag, int count); -static void zvol_geom_destroy(zvol_state_t *zv); static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); static void zvol_geom_bio_start(struct bio *bp); static int zvol_geom_bio_getattr(struct bio *bp); @@ -226,25 +226,14 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) } retry: - rw_enter(&zvol_state_lock, ZVOL_RW_READER); - /* - * Obtain a copy of private under zvol_state_lock to make sure either - * the result of zvol free code setting private to NULL is observed, - * or the zv is protected from being freed because of the positive - * zv_open_count. - */ - zv = pp->private; - if (zv == NULL) { - rw_exit(&zvol_state_lock); - err = SET_ERROR(ENXIO); - goto out_locked; - } + zv = atomic_load_ptr(&pp->private); + if (zv == NULL) + return (SET_ERROR(ENXIO)); mutex_enter(&zv->zv_state_lock); if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) { - rw_exit(&zvol_state_lock); err = SET_ERROR(ENXIO); - goto out_zv_locked; + goto out_locked; } ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); @@ -257,8 +246,24 @@ retry: drop_suspend = B_TRUE; if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); + + /* + * Removal may happen while the locks are down, so + * we can't trust zv any longer; we have to start over. + */ + zv = atomic_load_ptr(&pp->private); + if (zv == NULL) + return (SET_ERROR(ENXIO)); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); + + if (zv->zv_zso->zso_dying || + zv->zv_flags & ZVOL_REMOVING) { + err = SET_ERROR(ENXIO); + goto out_locked; + } + /* Check to see if zv_suspend_lock is needed. */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); @@ -266,7 +271,6 @@ retry: } } } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -279,8 +283,8 @@ retry: * Take spa_namespace_lock to prevent lock inversion when * zvols from one pool are opened as vdevs in another. 
*/ - if (!mutex_owned(&spa_namespace_lock)) { - if (!mutex_tryenter(&spa_namespace_lock)) { + if (!spa_namespace_held()) { + if (!spa_namespace_tryenter(FTAG)) { mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; @@ -292,9 +296,9 @@ retry: } err = zvol_first_open(zv, !(flag & FWRITE)); if (drop_namespace) - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); if (err) - goto out_zv_locked; + goto out_locked; pp->mediasize = zv->zv_volsize; pp->stripeoffset = 0; pp->stripesize = zv->zv_volblocksize; @@ -329,9 +333,8 @@ out_opened: zvol_last_close(zv); wakeup(zv); } -out_zv_locked: - mutex_exit(&zv->zv_state_lock); out_locked: + mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (err); @@ -345,12 +348,9 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) boolean_t drop_suspend = B_TRUE; int new_open_count; - rw_enter(&zvol_state_lock, ZVOL_RW_READER); - zv = pp->private; - if (zv == NULL) { - rw_exit(&zvol_state_lock); + zv = atomic_load_ptr(&pp->private); + if (zv == NULL) return (SET_ERROR(ENXIO)); - } mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_EXCL) { @@ -377,6 +377,15 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); + + /* + * Unlike in zvol_geom_open(), we don't check if + * removal started here, because we might be one of the + * openers that needs to be thrown out! If we're the + * last, we need to call zvol_last_close() below to + * finish cleanup. So, no special treatment for us. + */ + /* Check to see if zv_suspend_lock is needed. */ new_open_count = zv->zv_open_count - count; if (new_open_count != 0) { @@ -387,7 +396,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) } else { drop_suspend = B_FALSE; } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -408,20 +416,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) return (0); } -static void -zvol_geom_destroy(zvol_state_t *zv) -{ - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp = zsg->zsg_provider; - - ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); - - g_topology_assert(); - - zsg->zsg_provider = NULL; - g_wither_geom(pp->geom, ENXIO); -} - void zvol_wait_close(zvol_state_t *zv) { @@ -454,7 +448,7 @@ zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).", pp->name, acr, acw, ace)); - if (pp->private == NULL) { + if (atomic_load_ptr(&pp->private) == NULL) { if (acr <= 0 && acw <= 0 && ace <= 0) return (0); return (pp->error); @@ -921,25 +915,14 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) boolean_t drop_suspend = B_FALSE; retry: - rw_enter(&zvol_state_lock, ZVOL_RW_READER); - /* - * Obtain a copy of si_drv2 under zvol_state_lock to make sure either - * the result of zvol free code setting si_drv2 to NULL is observed, - * or the zv is protected from being freed because of the positive - * zv_open_count. 
- */ - zv = dev->si_drv2; - if (zv == NULL) { - rw_exit(&zvol_state_lock); - err = SET_ERROR(ENXIO); - goto out_locked; - } + zv = atomic_load_ptr(&dev->si_drv2); + if (zv == NULL) + return (SET_ERROR(ENXIO)); mutex_enter(&zv->zv_state_lock); - if (zv->zv_zso->zso_dying) { - rw_exit(&zvol_state_lock); + if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) { err = SET_ERROR(ENXIO); - goto out_zv_locked; + goto out_locked; } ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); @@ -954,6 +937,13 @@ retry: mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); + + if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { + /* Removal started while locks were down. */ + err = SET_ERROR(ENXIO); + goto out_locked; + } + /* Check to see if zv_suspend_lock is needed. */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); @@ -961,7 +951,6 @@ retry: } } } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -974,8 +963,8 @@ retry: * Take spa_namespace_lock to prevent lock inversion when * zvols from one pool are opened as vdevs in another. */ - if (!mutex_owned(&spa_namespace_lock)) { - if (!mutex_tryenter(&spa_namespace_lock)) { + if (!spa_namespace_held()) { + if (!spa_namespace_tryenter(FTAG)) { mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; @@ -987,9 +976,9 @@ retry: } err = zvol_first_open(zv, !(flags & FWRITE)); if (drop_namespace) - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); if (err) - goto out_zv_locked; + goto out_locked; } ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -1016,9 +1005,8 @@ out_opened: zvol_last_close(zv); wakeup(zv); } -out_zv_locked: - mutex_exit(&zv->zv_state_lock); out_locked: + mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (err); @@ -1030,12 +1018,9 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; - rw_enter(&zvol_state_lock, ZVOL_RW_READER); - zv = dev->si_drv2; - if (zv == NULL) { - rw_exit(&zvol_state_lock); + zv = atomic_load_ptr(&dev->si_drv2); + if (zv == NULL) return (SET_ERROR(ENXIO)); - } mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_EXCL) { @@ -1060,6 +1045,15 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); + + /* + * Unlike in zvol_cdev_open(), we don't check if + * removal started here, because we might be one of the + * openers that needs to be thrown out! If we're the + * last, we need to call zvol_last_close() below to + * finish cleanup. So, no special treatment for us. + */ + /* Check to see if zv_suspend_lock is needed. 
*/ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); @@ -1069,7 +1063,6 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) } else { drop_suspend = B_FALSE; } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -1101,7 +1094,8 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, int error; boolean_t sync; - zv = dev->si_drv2; + zv = atomic_load_ptr(&dev->si_drv2); + ASSERT3P(zv, !=, NULL); error = 0; KASSERT(zv->zv_open_count > 0, @@ -1162,6 +1156,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, *(off_t *)data = 0; break; case DIOCGATTR: { + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); spa_t *spa = dmu_objset_spa(zv->zv_objset); struct diocgattr_arg *arg = (struct diocgattr_arg *)data; uint64_t refd, avail, usedobjs, availobjs; @@ -1186,6 +1181,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, arg->value.off = refd / DEV_BSIZE; } else error = SET_ERROR(ENOIOCTL); + rw_exit(&zv->zv_suspend_lock); break; } case FIOSEEKHOLE: @@ -1196,10 +1192,12 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, hole = (cmd == FIOSEEKHOLE); noff = *off; + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX, RL_READER); error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); zfs_rangelock_exit(lr); + rw_exit(&zv->zv_suspend_lock); *off = noff; break; } @@ -1400,42 +1398,65 @@ zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize, * Remove minor node for the specified volume. */ void -zvol_os_free(zvol_state_t *zv) +zvol_os_remove_minor(zvol_state_t *zv) { - ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); - ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT0(zv->zv_open_count); + ASSERT0(atomic_read(&zv->zv_suspend_ref)); + ASSERT(zv->zv_flags & ZVOL_REMOVING); - ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); - - rw_destroy(&zv->zv_suspend_lock); - zfs_rangelock_fini(&zv->zv_rangelock); + struct zvol_state_os *zso = zv->zv_zso; + zv->zv_zso = NULL; if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp __maybe_unused = zsg->zsg_provider; - - ASSERT0P(pp->private); + struct zvol_state_geom *zsg = &zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + atomic_store_ptr(&pp->private, NULL); + mutex_exit(&zv->zv_state_lock); g_topology_lock(); - zvol_geom_destroy(zv); + g_wither_geom(pp->geom, ENXIO); g_topology_unlock(); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct zvol_state_dev *zsd = &zso->zso_dev; struct cdev *dev = zsd->zsd_cdev; + if (dev != NULL) + atomic_store_ptr(&dev->si_drv2, NULL); + mutex_exit(&zv->zv_state_lock); + if (dev != NULL) { - ASSERT0P(dev->si_drv2); destroy_dev(dev); knlist_clear(&zsd->zsd_selinfo.si_note, 0); knlist_destroy(&zsd->zsd_selinfo.si_note); } } + kmem_free(zso, sizeof (struct zvol_state_os)); + + mutex_enter(&zv->zv_state_lock); +} + +void +zvol_os_free(zvol_state_t *zv) +{ + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT0(zv->zv_open_count); + ASSERT0P(zv->zv_zso); + + ASSERT0P(zv->zv_objset); + ASSERT0P(zv->zv_zilog); + ASSERT0P(zv->zv_dn); + + ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); + + rw_destroy(&zv->zv_suspend_lock); + zfs_rangelock_fini(&zv->zv_rangelock); + mutex_destroy(&zv->zv_state_lock); cv_destroy(&zv->zv_removing_cv); 
dataset_kstats_destroy(&zv->zv_kstat); - kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); zvol_minors--; } @@ -1538,28 +1559,6 @@ out_doi: return (error); } -void -zvol_os_clear_private(zvol_state_t *zv) -{ - ASSERT(RW_LOCK_HELD(&zvol_state_lock)); - if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp = zsg->zsg_provider; - - if (pp->private == NULL) /* already cleared */ - return; - - pp->private = NULL; - ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); - } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; - struct cdev *dev = zsd->zsd_cdev; - - if (dev != NULL) - dev->si_drv2 = NULL; - } -} - int zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) { diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c index 092f090d934b..00ff789265c6 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c @@ -32,7 +32,6 @@ #include <sys/taskq.h> #include <sys/kmem.h> #include <sys/tsd.h> -#include <sys/trace_spl.h> #include <sys/time.h> #include <sys/atomic.h> #include <sys/kstat.h> @@ -325,7 +324,6 @@ task_expire_impl(taskq_ent_t *t) } t->tqent_birth = jiffies; - DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t); /* * The priority list must be maintained in strict task id order @@ -713,9 +711,7 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) t->tqent_taskq = tq; t->tqent_timer.function = NULL; t->tqent_timer.expires = 0; - t->tqent_birth = jiffies; - DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t); ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); @@ -840,9 +836,7 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, t->tqent_func = func; t->tqent_arg = arg; t->tqent_taskq = tq; - t->tqent_birth = jiffies; - DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t); spin_unlock(&t->tqent_lock); @@ -1054,11 +1048,6 @@ taskq_thread(void *args) * A TQENT_FLAG_PREALLOC task may be reused or freed * during the task function call. Store tqent_id and * tqent_flags here. - * - * Also use an on stack taskq_ent_t for tqt_task - * assignment in this case; we want to make sure - * to duplicate all fields, so the values are - * correct when it's accessed via DTRACE_PROBE*. */ tqt->tqt_id = t->tqent_id; tqt->tqt_flags = t->tqent_flags; @@ -1074,13 +1063,10 @@ taskq_thread(void *args) spin_unlock_irqrestore(&tq->tq_lock, flags); TQSTAT_INC(tq, threads_active); - DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t); /* Perform the requested task */ t->tqent_func(t->tqent_arg); - DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t); - TQSTAT_DEC(tq, threads_active); if ((t->tqent_flags & TQENT_LIST_MASK) == TQENT_LIST_PENDING) diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c index 45c2999a4bb1..b2eae5d00b10 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c @@ -25,6 +25,10 @@ * SUCH DAMAGE. */ +/* + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> + */ + #include <sys/types.h> #include <sys/sysmacros.h> #include <sys/kmem.h> @@ -56,6 +60,19 @@ typedef struct zone_dataset { } zone_dataset_t; #ifdef CONFIG_USER_NS + +/* + * Linux 6.18 moved the generic namespace type away from ns->ops->type onto + * ns_common itself. 
+ */ +#ifdef HAVE_NS_COMMON_TYPE +#define ns_is_newuser(ns) \ + ((ns)->ns_type == CLONE_NEWUSER) +#else +#define ns_is_newuser(ns) \ + ((ns)->ops != NULL && (ns)->ops->type == CLONE_NEWUSER) +#endif + /* * Returns: * - 0 on success @@ -84,7 +101,7 @@ user_ns_get(int fd, struct user_namespace **userns) goto done; } ns = get_proc_ns(file_inode(nsfile)); - if (ns->ops->type != CLONE_NEWUSER) { + if (!ns_is_newuser(ns)) { error = ENOTTY; goto done; } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c index 8a8316f63c48..18f2426fbbfc 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c @@ -23,6 +23,7 @@ * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved. * Copyright (c) 2023, 2024, Klara Inc. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> */ /* @@ -1109,6 +1110,14 @@ abd_return_buf_copy(abd_t *abd, void *buf, size_t n) #define ABD_ITER_PAGE_SIZE(page) (PAGESIZE) #endif +#ifndef nth_page +/* + * Since 6.18 nth_page() no longer exists, and is no longer required to iterate + * within a single SG entry, so we replace it with a simple addition. + */ +#define nth_page(p, n) ((p)+(n)) +#endif + void abd_iter_page(struct abd_iter *aiter) { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c index 830fad7fe793..1bd3500e9f66 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c @@ -471,13 +471,17 @@ vdev_disk_close(vdev_t *v) if (v->vdev_reopening || vd == NULL) return; + rw_enter(&vd->vd_lock, RW_WRITER); + if (vd->vd_bdh != NULL) vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), zfs_vdev_holder); + v->vdev_tsd = NULL; + + rw_exit(&vd->vd_lock); rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); - v->vdev_tsd = NULL; } /* diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c index daa4b5776837..4c929a4642b1 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c @@ -1447,7 +1447,8 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) if (aclnode->z_ace_count == 0) continue; dmu_write(zfsvfs->z_os, aoid, off, - aclnode->z_size, aclnode->z_acldata, tx); + aclnode->z_size, aclnode->z_acldata, tx, + DMU_READ_NO_PREFETCH); off += aclnode->z_size; } } else { @@ -2524,7 +2525,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, * Also note: DOS R/O is ignored for directories. 
*/ if ((v4_mode & WRITE_MASK_DATA) && - S_ISDIR(ZTOI(zp)->i_mode) && + !S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_READONLY)) { return (SET_ERROR(EPERM)); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c index c729947369c2..3fdcdbac6f68 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c @@ -115,8 +115,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) */ int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off, - ssize_t *resid) + uint8_t ashift, ssize_t *resid) { + (void) ashift; ssize_t rc; rc = kernel_write(fp, buf, count, &off); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c index cd606e667bff..8a7d14ab6119 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -1556,6 +1556,12 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) sb->s_xattr = zpl_xattr_handlers; sb->s_export_op = &zpl_export_operations; +#ifdef HAVE_SET_DEFAULT_D_OP + set_default_d_op(sb, &zpl_dentry_operations); +#else + sb->s_d_op = &zpl_dentry_operations; +#endif + /* Set features for file system. */ zfs_set_fuid_feature(zfsvfs); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index 6106726651a3..02465adf36d5 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -2033,10 +2033,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) goto out3; } - if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { - err = SET_ERROR(EPERM); - goto out3; - } + /* ZFS_READONLY will be handled in zfs_zaccess() */ /* * Verify timestamps doesn't overflow 32 bits. @@ -3895,7 +3892,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, va = kmap(pp); ASSERT3U(pglen, <=, PAGE_SIZE); - dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); + dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx, + DMU_READ_PREFETCH); kunmap(pp); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c index 48dae79a2373..81ac26cb0c93 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c @@ -202,7 +202,7 @@ zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags) return (!!dentry->d_inode); } -static dentry_operations_t zpl_dops_snapdirs = { +static const struct dentry_operations zpl_dops_snapdirs = { /* * Auto mounting of snapshots is only supported for 2.6.37 and * newer kernels. Prior to this kernel the ops->follow_link() @@ -215,6 +215,51 @@ static dentry_operations_t zpl_dops_snapdirs = { .d_revalidate = zpl_snapdir_revalidate, }; +/* + * For the .zfs control directory to work properly we must be able to override + * the default operations table and register custom .d_automount and + * .d_revalidate callbacks. 
+ */ +static void +set_snapdir_dentry_ops(struct dentry *dentry, unsigned int extraflags) { + static const unsigned int op_flags = + DCACHE_OP_HASH | DCACHE_OP_COMPARE | + DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE | + DCACHE_OP_PRUNE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_REAL; + +#ifdef HAVE_D_SET_D_OP + /* + * d_set_d_op() will set the DCACHE_OP_ flags according to what it + * finds in the passed dentry_operations, so we don't have to. + * + * We clear the flags and the old op table before calling d_set_d_op() + * because it issues a warning when the dentry operations table is + * already set. + */ + dentry->d_op = NULL; + dentry->d_flags &= ~op_flags; + d_set_d_op(dentry, &zpl_dops_snapdirs); + dentry->d_flags |= extraflags; +#else + /* + * Since 6.17 there's no exported way to modify dentry ops, so we have + * to reach in and do it ourselves. This should be safe for our very + * narrow use case, which is to create or splice in an entry to give + * access to a snapshot. + * + * We need to set the op flags directly. We hardcode + * DCACHE_OP_REVALIDATE because that's the only operation we have; if + * we ever extend zpl_dops_snapdirs we will need to update the op flags + * to match. + */ + spin_lock(&dentry->d_lock); + dentry->d_op = &zpl_dops_snapdirs; + dentry->d_flags &= ~op_flags; + dentry->d_flags |= DCACHE_OP_REVALIDATE | extraflags; + spin_unlock(&dentry->d_lock); +#endif +} + static struct dentry * zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) @@ -236,10 +281,7 @@ zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, return (ERR_PTR(error)); ASSERT(error == 0 || ip == NULL); - d_clear_d_op(dentry); - d_set_d_op(dentry, &zpl_dops_snapdirs); - dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; - + set_snapdir_dentry_ops(dentry, DCACHE_NEED_AUTOMOUNT); return (d_splice_alias(ip, dentry)); } @@ -373,8 +415,7 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0); if (error == 0) { - d_clear_d_op(dentry); - d_set_d_op(dentry, &zpl_dops_snapdirs); + set_snapdir_dentry_ops(dentry, 0); d_instantiate(dentry, ip); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index d07317b0d910..f7691c02d163 100--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -23,6 +23,7 @@ * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright (c) 2025, Klara, Inc. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> */ @@ -478,6 +479,7 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) return (ret); } +#ifdef HAVE_WRITE_CACHE_PAGES #ifdef HAVE_WRITEPAGE_T_FOLIO static int zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data) @@ -499,6 +501,78 @@ zpl_write_cache_pages(struct address_space *mapping, #endif return (result); } +#else +static inline int +zpl_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, void *data) +{ + pgoff_t start = wbc->range_start >> PAGE_SHIFT; + pgoff_t end = wbc->range_end >> PAGE_SHIFT; + + struct folio_batch fbatch; + folio_batch_init(&fbatch); + + /* + * This atomically (-ish) tags all DIRTY pages in the range with + * TOWRITE, allowing users to continue dirtying or undirtying pages + * while we get on with writeback, without us treading on each other. 
+ */ + tag_pages_for_writeback(mapping, start, end); + + int err = 0; + unsigned int npages; + + /* + * Grab references to the TOWRITE pages just flagged. This may not get + * all of them, so we do it in a loop until there are none left. + */ + while ((npages = filemap_get_folios_tag(mapping, &start, end, + PAGECACHE_TAG_TOWRITE, &fbatch)) != 0) { + + /* Loop over each page and write it out. */ + struct folio *folio; + while ((folio = folio_batch_next(&fbatch)) != NULL) { + folio_lock(folio); + + /* + * If the folio has been remapped, or is no longer + * dirty, then there's nothing to do. + */ + if (folio->mapping != mapping || + !folio_test_dirty(folio)) { + folio_unlock(folio); + continue; + } + + /* + * If writeback is already in progress, wait for it to + * finish. We continue after this even if the page + * ends up clean; zfs_putpage() will skip it if no + * further work is required. + */ + while (folio_test_writeback(folio)) + folio_wait_bit(folio, PG_writeback); + + /* + * Write it out and collect any error. zfs_putpage() + * will clear the TOWRITE and DIRTY flags, and return + * with the page unlocked. + */ + int ferr = zpl_putpage(&folio->page, wbc, data); + if (err == 0 && ferr != 0) + err = ferr; + + /* Housekeeping for the caller. */ + wbc->nr_to_write -= folio_nr_pages(folio); + } + + /* Release any remaining references on the batch. */ + folio_batch_release(&fbatch); + } + + return (err); +} +#endif static int zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) @@ -737,28 +811,44 @@ zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice) return (error); } -#define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL) -#define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL) +#define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL) +#define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | FS_PROJINHERIT_FL) + + +static struct { + uint64_t zfs_flag; + uint32_t fs_flag; + uint32_t xflag; +} flags_lookup[] = { + {ZFS_IMMUTABLE, FS_IMMUTABLE_FL, FS_XFLAG_IMMUTABLE}, + {ZFS_APPENDONLY, FS_APPEND_FL, FS_XFLAG_APPEND}, + {ZFS_NODUMP, FS_NODUMP_FL, FS_XFLAG_NODUMP}, + {ZFS_PROJINHERIT, FS_PROJINHERIT_FL, FS_XFLAG_PROJINHERIT} +}; static uint32_t __zpl_ioctl_getflags(struct inode *ip) { uint64_t zfs_flags = ITOZ(ip)->z_pflags; uint32_t ioctl_flags = 0; + for (int i = 0; i < ARRAY_SIZE(flags_lookup); i++) + if (zfs_flags & flags_lookup[i].zfs_flag) + ioctl_flags |= flags_lookup[i].fs_flag; - if (zfs_flags & ZFS_IMMUTABLE) - ioctl_flags |= FS_IMMUTABLE_FL; - - if (zfs_flags & ZFS_APPENDONLY) - ioctl_flags |= FS_APPEND_FL; + return (ioctl_flags); +} - if (zfs_flags & ZFS_NODUMP) - ioctl_flags |= FS_NODUMP_FL; +static uint32_t +__zpl_ioctl_getxflags(struct inode *ip) +{ + uint64_t zfs_flags = ITOZ(ip)->z_pflags; + uint32_t ioctl_flags = 0; - if (zfs_flags & ZFS_PROJINHERIT) - ioctl_flags |= ZFS_PROJINHERIT_FL; + for (int i = 0; i < ARRAY_SIZE(flags_lookup); i++) + if (zfs_flags & flags_lookup[i].zfs_flag) + ioctl_flags |= flags_lookup[i].xflag; - return (ioctl_flags & ZFS_FL_USER_VISIBLE); + return (ioctl_flags); } /* @@ -772,6 +862,7 @@ zpl_ioctl_getflags(struct file *filp, void __user *arg) int err; flags = __zpl_ioctl_getflags(file_inode(filp)); + flags = flags & ZFS_FL_USER_VISIBLE; err = copy_to_user(arg, &flags, sizeof (flags)); return (err); @@ -795,7 +886,7 @@ __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva) xoptattr_t *xoap; if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | 
FS_NODUMP_FL | - ZFS_PROJINHERIT_FL)) + FS_PROJINHERIT_FL)) return (-EOPNOTSUPP); if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE) @@ -826,7 +917,51 @@ __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva) xoap->xoa_appendonly); FLAG_CHANGE(FS_NODUMP_FL, ZFS_NODUMP, XAT_NODUMP, xoap->xoa_nodump); - FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT, + FLAG_CHANGE(FS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT, + xoap->xoa_projinherit); + +#undef FLAG_CHANGE + + return (0); +} + +static int +__zpl_ioctl_setxflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva) +{ + uint64_t zfs_flags = ITOZ(ip)->z_pflags; + xoptattr_t *xoap; + + if (ioctl_flags & ~(FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND | + FS_XFLAG_NODUMP | FS_XFLAG_PROJINHERIT)) + return (-EOPNOTSUPP); + + if ((fchange(ioctl_flags, zfs_flags, FS_XFLAG_IMMUTABLE, + ZFS_IMMUTABLE) || + fchange(ioctl_flags, zfs_flags, FS_XFLAG_APPEND, ZFS_APPENDONLY)) && + !capable(CAP_LINUX_IMMUTABLE)) + return (-EPERM); + + if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip)) + return (-EACCES); + + xva_init(xva); + xoap = xva_getxoptattr(xva); + +#define FLAG_CHANGE(iflag, zflag, xflag, xfield) do { \ + if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) || \ + ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) { \ + XVA_SET_REQ(xva, (xflag)); \ + (xfield) = ((ioctl_flags & (iflag)) != 0); \ + } \ +} while (0) + + FLAG_CHANGE(FS_XFLAG_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, + xoap->xoa_immutable); + FLAG_CHANGE(FS_XFLAG_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, + xoap->xoa_appendonly); + FLAG_CHANGE(FS_XFLAG_NODUMP, ZFS_NODUMP, XAT_NODUMP, + xoap->xoa_nodump); + FLAG_CHANGE(FS_XFLAG_PROJINHERIT, ZFS_PROJINHERIT, XAT_PROJINHERIT, xoap->xoa_projinherit); #undef FLAG_CHANGE @@ -867,7 +1002,7 @@ zpl_ioctl_getxattr(struct file *filp, void __user *arg) struct inode *ip = file_inode(filp); int err; - fsx.fsx_xflags = __zpl_ioctl_getflags(ip); + fsx.fsx_xflags = __zpl_ioctl_getxflags(ip); fsx.fsx_projid = ITOZ(ip)->z_projid; err = copy_to_user(arg, &fsx, sizeof (fsx)); @@ -891,7 +1026,7 @@ zpl_ioctl_setxattr(struct file *filp, void __user *arg) if (!zpl_is_valid_projid(fsx.fsx_projid)) return (-EINVAL); - err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva); + err = __zpl_ioctl_setxflags(ip, fsx.fsx_xflags, &xva); if (err) return (err); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index 53819628627d..347b352506e5 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c @@ -22,6 +22,8 @@ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2023, Datto Inc. All rights reserved. + * Copyright (c) 2025, Klara, Inc. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> */ @@ -32,7 +34,22 @@ #include <sys/zpl.h> #include <linux/iversion.h> #include <linux/version.h> +#include <linux/vfs_compat.h> +/* + * What to do when the last reference to an inode is released. If 0, the kernel + * will cache it on the superblock. If 1, the inode will be freed immediately. + * See zpl_drop_inode(). + */ +int zfs_delete_inode = 0; + +/* + * What to do when the last reference to a dentry is released. If 0, the kernel + * will cache it until the entry (file) is destroyed. If 1, the dentry will be + * marked for cleanup, at which time its inode reference will be released. See + * zpl_dentry_delete(). 
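For context on the FS_*_FL / FS_XFLAG_* split above: the two helper pairs serve the two flag ioctls that reach zpl_ioctl_getflags()/zpl_ioctl_setflags() and zpl_ioctl_getxattr()/zpl_ioctl_setxattr(). A minimal userspace sketch (illustration only, not part of this change) of what each path reports:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int
main(int argc, char **argv)
{
	if (argc != 2)
		return (1);

	int fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return (1);

	unsigned int fl = 0;
	struct fsxattr fsx = { 0 };

	/* Legacy attribute bits (FS_*_FL), built by __zpl_ioctl_getflags(). */
	if (ioctl(fd, FS_IOC_GETFLAGS, &fl) == 0)
		printf("FS_*_FL:    0x%x (append=%d nodump=%d)\n", fl,
		    !!(fl & FS_APPEND_FL), !!(fl & FS_NODUMP_FL));

	/* Extended flags (FS_XFLAG_*), built by __zpl_ioctl_getxflags(). */
	if (ioctl(fd, FS_IOC_FSGETXATTR, &fsx) == 0)
		printf("FS_XFLAG_*: 0x%x (projid=%u)\n", fsx.fsx_xflags,
		    (unsigned int)fsx.fsx_projid);

	close(fd);
	return (0);
}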
+ */ +int zfs_delete_dentry = 0; static struct inode * zpl_inode_alloc(struct super_block *sb) @@ -77,11 +94,36 @@ zpl_dirty_inode(struct inode *ip, int flags) } /* - * When ->drop_inode() is called its return value indicates if the - * inode should be evicted from the inode cache. If the inode is - * unhashed and has no links the default policy is to evict it - * immediately. + * ->drop_inode() is called when the last reference to an inode is released. + * Its return value indicates if the inode should be destroyed immediately, or + * cached on the superblock structure. + * + * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns + * "destroy immediately" if the inode is unhashed and has no links (roughly: no + * longer exists on disk). On datasets with millions of rarely-accessed files, + * this can cause a large amount of memory to be "pinned" by cached inodes, + * which in turn pin their associated dnodes and dbufs, until the kernel starts + * reporting memory pressure and requests OpenZFS release some memory (see + * zfs_prune()). + * + * When set to 1, we call generic_delete_inode(), which always returns "destroy + * immediately", resulting in inodes being destroyed immediately, releasing + * their associated dnodes and dbufs to the dbuf cached and the ARC to be + * evicted as normal. * + * Note that the "last reference" doesn't always mean the last _userspace_ + * reference; the dentry cache also holds a reference, so "busy" inodes will + * still be kept alive that way (subject to dcache tuning). + */ +static int +zpl_drop_inode(struct inode *ip) +{ + if (zfs_delete_inode) + return (generic_delete_inode(ip)); + return (generic_drop_inode(ip)); +} + +/* * The ->evict_inode() callback must minimally truncate the inode pages, * and call clear_inode(). For 2.6.35 and later kernels this will * simply update the inode state, with the sync occurring before the @@ -470,6 +512,7 @@ const struct super_operations zpl_super_operations = { .destroy_inode = zpl_inode_destroy, .dirty_inode = zpl_dirty_inode, .write_inode = NULL, + .drop_inode = zpl_drop_inode, .evict_inode = zpl_evict_inode, .put_super = zpl_put_super, .sync_fs = zpl_sync_fs, @@ -480,6 +523,35 @@ const struct super_operations zpl_super_operations = { .show_stats = NULL, }; +/* + * ->d_delete() is called when the last reference to a dentry is released. Its + * return value indicates if the dentry should be destroyed immediately, or + * retained in the dentry cache. + * + * By default (zfs_delete_dentry=0) the kernel will always cache unused + * entries. Each dentry holds an inode reference, so cached dentries can hold + * the final inode reference indefinitely, leading to the inode and its related + * data being pinned (see zpl_drop_inode()). + * + * When set to 1, we signal that the dentry should be destroyed immediately and + * never cached. This reduces memory usage, at the cost of higher overheads to + * lookup a file, as the inode and its underlying data (dnode/dbuf) need to be + * reloaded and reinflated. + * + * Note that userspace does not have direct control over dentry references and + * reclaim; rather, this is part of the kernel's caching and reclaim subsystems + * (eg vm.vfs_cache_pressure). + */ +static int +zpl_dentry_delete(const struct dentry *dentry) +{ + return (zfs_delete_dentry ? 
1 : 0); +} + +const struct dentry_operations zpl_dentry_operations = { + .d_delete = zpl_dentry_delete, +}; + struct file_system_type zpl_fs_type = { .owner = THIS_MODULE, .name = ZFS_DRIVER, @@ -491,3 +563,10 @@ struct file_system_type zpl_fs_type = { .mount = zpl_mount, .kill_sb = zpl_kill_sb, }; + +ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW, + "Delete inodes as soon as the last reference is released."); + +ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW, + "Delete dentries from dentry cache as soon as the last reference is " + "released."); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index a73acdad34ae..89f9bc555fcf 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -21,8 +21,8 @@ */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. - * Copyright (c) 2024, Rob Norris <robn@despairlabs.com> - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, 2025, Rob Norris <robn@despairlabs.com> + * Copyright (c) 2024, 2025, Klara, Inc. */ #include <sys/dataset_kstats.h> @@ -337,16 +337,14 @@ zvol_discard(zv_request_t *zvr) } /* - * Align the request to volume block boundaries when a secure erase is - * not required. This will prevent dnode_free_range() from zeroing out - * the unaligned parts which is slow (read-modify-write) and useless - * since we are not freeing any space by doing so. + * Align the request to volume block boundaries. This will prevent + * dnode_free_range() from zeroing out the unaligned parts which is + * slow (read-modify-write) and useless since we are not freeing any + * space by doing so. */ - if (!io_is_secure_erase(bio, rq)) { - start = P2ROUNDUP(start, zv->zv_volblocksize); - end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); - size = end - start; - } + start = P2ROUNDUP(start, zv->zv_volblocksize); + end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); + size = end - start; if (start >= end) goto unlock; @@ -467,6 +465,24 @@ zvol_read_task(void *arg) zv_request_task_free(task); } +/* + * Note: + * + * The kernel uses different enum names for the IO opcode, depending on the + * kernel version ('req_opf', 'req_op'). To sidestep this, use macros rather + * than inline functions for these checks. + */ +/* Should this IO go down the zvol write path? */ +#define ZVOL_OP_IS_WRITE(op) \ + (op == REQ_OP_WRITE || \ + op == REQ_OP_FLUSH || \ + op == REQ_OP_DISCARD) + +/* Is this IO type supported by zvols? */ +#define ZVOL_OP_IS_SUPPORTED(op) (op == REQ_OP_READ || ZVOL_OP_IS_WRITE(op)) + +/* Get the IO opcode */ +#define ZVOL_OP(bio, rq) (bio != NULL ? bio_op(bio) : req_op(rq)) /* * Process a BIO or request @@ -484,7 +500,33 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, fstrans_cookie_t cookie = spl_fstrans_mark(); uint64_t offset = io_offset(bio, rq); uint64_t size = io_size(bio, rq); - int rw = io_data_dir(bio, rq); + int rw; + + if (unlikely(!ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)))) { + zfs_dbgmsg("Unsupported zvol %s, op=%d, flags=0x%x", + rq != NULL ? "request" : "BIO", + ZVOL_OP(bio, rq), + rq != NULL ? rq->cmd_flags : bio->bi_opf); + ASSERT(ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq))); + zvol_end_io(bio, rq, SET_ERROR(ENOTSUPP)); + goto out; + } + + if (ZVOL_OP_IS_WRITE(ZVOL_OP(bio, rq))) { + rw = WRITE; + } else { + rw = READ; + } + + /* + * Sanity check + * + * If we're a BIO, check our rw matches the kernel's + * bio_data_dir(bio) rw. 
We need to check because we support fewer + * IO operations, and want to verify that what we think are reads and + * writes from those operations match what the kernel thinks. + */ + ASSERT(rq != NULL || rw == bio_data_dir(bio)); if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { zvol_end_io(bio, rq, SET_ERROR(ENXIO)); @@ -589,7 +631,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, * interfaces lack this functionality (they block waiting for * the i/o to complete). */ - if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { + if (io_is_discard(bio, rq)) { if (force_sync) { zvol_discard(&zvr); } else { @@ -679,28 +721,19 @@ zvol_open(struct block_device *bdev, fmode_t flag) retry: #endif - rw_enter(&zvol_state_lock, RW_READER); - /* - * Obtain a copy of private_data under the zvol_state_lock to make - * sure that either the result of zvol free code path setting - * disk->private_data to NULL is observed, or zvol_os_free() - * is not called on this zv because of the positive zv_open_count. - */ + #ifdef HAVE_BLK_MODE_T - zv = disk->private_data; + zv = atomic_load_ptr(&disk->private_data); #else - zv = bdev->bd_disk->private_data; + zv = atomic_load_ptr(&bdev->bd_disk->private_data); #endif if (zv == NULL) { - rw_exit(&zvol_state_lock); return (-SET_ERROR(ENXIO)); } mutex_enter(&zv->zv_state_lock); - if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { mutex_exit(&zv->zv_state_lock); - rw_exit(&zvol_state_lock); return (-SET_ERROR(ENXIO)); } @@ -712,8 +745,28 @@ retry: if (zv->zv_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); + + /* + * Removal may happen while the locks are down, so + * we can't trust zv any longer; we have to start over. + */ +#ifdef HAVE_BLK_MODE_T + zv = atomic_load_ptr(&disk->private_data); +#else + zv = atomic_load_ptr(&bdev->bd_disk->private_data); +#endif + if (zv == NULL) + return (-SET_ERROR(ENXIO)); + rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); + + if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + return (-SET_ERROR(ENXIO)); + } + /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); @@ -724,7 +777,6 @@ retry: drop_suspend = B_TRUE; } } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -757,8 +809,8 @@ retry: * the kernel so the only option is to return the error for * the caller to handle it. 
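The retry handling above follows a common locking pattern: if the outer lock cannot be taken with try-lock semantics while inner locks are held, everything is dropped and an error is returned so the caller restarts the open from scratch. A generic userspace sketch of that shape (the names and the pthread API here are illustrative assumptions, not the ZFS code):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t namespace_lock = PTHREAD_MUTEX_INITIALIZER;

static int
do_open_locked(void)
{
	/* ... work that needs both locks ... */
	return (0);
}

static int
open_with_backoff(void)
{
	pthread_mutex_lock(&state_lock);
	if (pthread_mutex_trylock(&namespace_lock) != 0) {
		/*
		 * Blocking on namespace_lock while holding state_lock would
		 * invert the usual lock order, so drop out and let the
		 * caller retry the whole operation instead.
		 */
		pthread_mutex_unlock(&state_lock);
		return (-EINTR);
	}

	int err = do_open_locked();

	pthread_mutex_unlock(&namespace_lock);
	pthread_mutex_unlock(&state_lock);
	return (err);
}

int
main(void)
{
	int err;
	while ((err = open_with_backoff()) == -EINTR)
		;	/* a real caller would back off before retrying */
	printf("open: %d\n", err);
	return (0);
}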
*/ - if (!mutex_owned(&spa_namespace_lock)) { - if (!mutex_tryenter(&spa_namespace_lock)) { + if (!spa_namespace_held()) { + if (!spa_namespace_tryenter(FTAG)) { mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; @@ -782,7 +834,7 @@ retry: error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); if (drop_namespace) - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); } if (error == 0) { @@ -821,11 +873,11 @@ zvol_release(struct gendisk *disk, fmode_t unused) #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) (void) unused; #endif - zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; - rw_enter(&zvol_state_lock, RW_READER); - zv = disk->private_data; + zvol_state_t *zv = atomic_load_ptr(&disk->private_data); + if (zv == NULL) + return; mutex_enter(&zv->zv_state_lock); ASSERT3U(zv->zv_open_count, >, 0); @@ -839,6 +891,15 @@ zvol_release(struct gendisk *disk, fmode_t unused) mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); + + /* + * Unlike in zvol_open(), we don't check if removal + * started here, because we might be one of the openers + * that needs to be thrown out! If we're the last, we + * need to call zvol_last_close() below to finish + * cleanup. So, no special treatment for us. + */ + /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); @@ -848,7 +909,6 @@ zvol_release(struct gendisk *disk, fmode_t unused) } else { drop_suspend = B_FALSE; } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -868,9 +928,10 @@ static int zvol_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - zvol_state_t *zv = bdev->bd_disk->private_data; int error = 0; + zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data); + ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); switch (cmd) { @@ -923,9 +984,8 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing) { unsigned int mask = 0; - rw_enter(&zvol_state_lock, RW_READER); + zvol_state_t *zv = atomic_load_ptr(&disk->private_data); - zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; @@ -933,17 +993,14 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing) mutex_exit(&zv->zv_state_lock); } - rw_exit(&zvol_state_lock); - return (mask); } static int zvol_revalidate_disk(struct gendisk *disk) { - rw_enter(&zvol_state_lock, RW_READER); + zvol_state_t *zv = atomic_load_ptr(&disk->private_data); - zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); set_capacity(zv->zv_zso->zvo_disk, @@ -951,8 +1008,6 @@ zvol_revalidate_disk(struct gendisk *disk) mutex_exit(&zv->zv_state_lock); } - rw_exit(&zvol_state_lock); - return (0); } @@ -971,28 +1026,19 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) return (0); } -void -zvol_os_clear_private(zvol_state_t *zv) -{ - /* - * Cleared while holding zvol_state_lock as a writer - * which will prevent zvol_open() from opening it. - */ - zv->zv_zso->zvo_disk->private_data = NULL; -} - /* * Provide a simple virtual geometry for legacy compatibility. For devices * smaller than 1 MiB a small head and sector count is used to allow very * tiny devices. For devices over 1 Mib a standard head and sector count * is used to keep the cylinders count reasonable. 
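To make the "keep the cylinders count reasonable" remark concrete, this is the kind of arithmetic involved; the 16-head/63-sector geometry below is an assumption for the example, not necessarily the values zvol_getgeo_impl() picks:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* A 1 GiB zvol expressed in 512-byte sectors. */
	uint64_t sectors = (1ULL << 30) / 512;

	/* Assumed "standard" fake geometry for a device this size. */
	uint32_t heads = 16, sectors_per_track = 63;

	uint64_t cylinders = sectors / (heads * sectors_per_track);
	printf("%llu sectors -> C/H/S = %llu/%u/%u\n",
	    (unsigned long long)sectors,
	    (unsigned long long)cylinders, heads, sectors_per_track);
	/* Prints: 2097152 sectors -> C/H/S = 2080/16/63 */
	return (0);
}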
*/ -static int -zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static inline int +zvol_getgeo_impl(struct gendisk *disk, struct hd_geometry *geo) { - zvol_state_t *zv = bdev->bd_disk->private_data; + zvol_state_t *zv = atomic_load_ptr(&disk->private_data); sector_t sectors; + ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); sectors = get_capacity(zv->zv_zso->zvo_disk); @@ -1011,6 +1057,20 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) return (0); } +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK +static int +zvol_getgeo(struct gendisk *disk, struct hd_geometry *geo) +{ + return (zvol_getgeo_impl(disk, geo)); +} +#else +static int +zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + return (zvol_getgeo_impl(bdev->bd_disk, geo)); +} +#endif + /* * Why have two separate block_device_operations structs? * @@ -1417,53 +1477,70 @@ out_kmem: return (ret); } -/* - * Cleanup then free a zvol_state_t which was created by zvol_alloc(). - * At this time, the structure is not opened by anyone, is taken off - * the zvol_state_list, and has its private data set to NULL. - * The zvol_state_lock is dropped. - * - * This function may take many milliseconds to complete (e.g. we've seen - * it take over 256ms), due to the calls to "blk_cleanup_queue" and - * "del_gendisk". Thus, consumers need to be careful to account for this - * latency when calling this function. - */ void -zvol_os_free(zvol_state_t *zv) +zvol_os_remove_minor(zvol_state_t *zv) { - - ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); - ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT0(zv->zv_open_count); - ASSERT0P(zv->zv_zso->zvo_disk->private_data); + ASSERT0(atomic_read(&zv->zv_suspend_ref)); + ASSERT(zv->zv_flags & ZVOL_REMOVING); - rw_destroy(&zv->zv_suspend_lock); - zfs_rangelock_fini(&zv->zv_rangelock); + struct zvol_state_os *zso = zv->zv_zso; + zv->zv_zso = NULL; - del_gendisk(zv->zv_zso->zvo_disk); + /* Clearing private_data will make new callers return immediately. */ + atomic_store_ptr(&zso->zvo_disk->private_data, NULL); + + /* + * Drop the state lock before calling del_gendisk(). There may be + * callers waiting to acquire it, but del_gendisk() will block until + * they exit, which would deadlock. 
+ */ + mutex_exit(&zv->zv_state_lock); + + del_gendisk(zso->zvo_disk); #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) #if defined(HAVE_BLK_CLEANUP_DISK) - blk_cleanup_disk(zv->zv_zso->zvo_disk); + blk_cleanup_disk(zso->zvo_disk); #else - put_disk(zv->zv_zso->zvo_disk); + put_disk(zso->zvo_disk); #endif #else - blk_cleanup_queue(zv->zv_zso->zvo_queue); - put_disk(zv->zv_zso->zvo_disk); + blk_cleanup_queue(zso->zvo_queue); + put_disk(zso->zvo_disk); #endif - if (zv->zv_zso->use_blk_mq) - blk_mq_free_tag_set(&zv->zv_zso->tag_set); + if (zso->use_blk_mq) + blk_mq_free_tag_set(&zso->tag_set); + + ida_free(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS); + + kmem_free(zso, sizeof (struct zvol_state_os)); + + mutex_enter(&zv->zv_state_lock); +} + +void +zvol_os_free(zvol_state_t *zv) +{ + + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT0(zv->zv_open_count); + ASSERT0P(zv->zv_zso); + + ASSERT0P(zv->zv_objset); + ASSERT0P(zv->zv_zilog); + ASSERT0P(zv->zv_dn); - ida_simple_remove(&zvol_ida, - MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); + rw_destroy(&zv->zv_suspend_lock); + zfs_rangelock_fini(&zv->zv_rangelock); cv_destroy(&zv->zv_removing_cv); mutex_destroy(&zv->zv_state_lock); dataset_kstats_destroy(&zv->zv_kstat); - kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); } @@ -1592,7 +1669,7 @@ zvol_os_create_minor(const char *name) if (zvol_inhibit_dev) return (0); - idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); + idx = ida_alloc(&zvol_ida, kmem_flags_convert(KM_SLEEP)); if (idx < 0) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; @@ -1600,7 +1677,7 @@ zvol_os_create_minor(const char *name) /* too many partitions can cause an overflow */ zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", name, minor, MINOR(minor)); - ida_simple_remove(&zvol_ida, idx); + ida_free(&zvol_ida, idx); return (SET_ERROR(EINVAL)); } @@ -1608,7 +1685,7 @@ zvol_os_create_minor(const char *name) if (zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock)); mutex_exit(&zv->zv_state_lock); - ida_simple_remove(&zvol_ida, idx); + ida_free(&zvol_ida, idx); return (SET_ERROR(EEXIST)); } @@ -1708,7 +1785,7 @@ out_doi: rw_exit(&zvol_state_lock); error = zvol_os_add_disk(zv->zv_zso->zvo_disk); } else { - ida_simple_remove(&zvol_ida, idx); + ida_free(&zvol_ida, idx); } return (error); diff --git a/sys/contrib/openzfs/module/zcommon/simd_stat.c b/sys/contrib/openzfs/module/zcommon/simd_stat.c index 11e2080ff9f2..007ae9e4fbbc 100644 --- a/sys/contrib/openzfs/module/zcommon/simd_stat.c +++ b/sys/contrib/openzfs/module/zcommon/simd_stat.c @@ -118,6 +118,10 @@ simd_stat_kstat_data(char *buf, size_t size, void *data) "pclmulqdq", zfs_pclmulqdq_available()); off += SIMD_STAT_PRINT(simd_stat_kstat_payload, "movbe", zfs_movbe_available()); + off += SIMD_STAT_PRINT(simd_stat_kstat_payload, + "vaes", zfs_vaes_available()); + off += SIMD_STAT_PRINT(simd_stat_kstat_payload, + "vpclmulqdq", zfs_vpclmulqdq_available()); off += SIMD_STAT_PRINT(simd_stat_kstat_payload, "osxsave", boot_cpu_has(X86_FEATURE_OSXSAVE)); diff --git a/sys/contrib/openzfs/module/zcommon/zfs_deleg.c b/sys/contrib/openzfs/module/zcommon/zfs_deleg.c index 49bb534ca26c..87596558c9a1 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_deleg.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_deleg.c @@ -59,6 +59,7 @@ const zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = { 
{ZFS_DELEG_PERM_SNAPSHOT}, {ZFS_DELEG_PERM_SHARE}, {ZFS_DELEG_PERM_SEND}, + {ZFS_DELEG_PERM_SEND_RAW}, {ZFS_DELEG_PERM_USERPROP}, {ZFS_DELEG_PERM_USERQUOTA}, {ZFS_DELEG_PERM_GROUPQUOTA}, diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c index 864e3898b365..9190ae0362ea 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c @@ -364,8 +364,8 @@ zfs_prop_init(void) static const zprop_index_t xattr_table[] = { { "off", ZFS_XATTR_OFF }, - { "on", ZFS_XATTR_SA }, { "sa", ZFS_XATTR_SA }, + { "on", ZFS_XATTR_SA }, { "dir", ZFS_XATTR_DIR }, { NULL } }; diff --git a/sys/contrib/openzfs/module/zcommon/zpool_prop.c b/sys/contrib/openzfs/module/zcommon/zpool_prop.c index 04ae9f986d8f..4826237b23e8 100644 --- a/sys/contrib/openzfs/module/zcommon/zpool_prop.c +++ b/sys/contrib/openzfs/module/zcommon/zpool_prop.c @@ -467,14 +467,23 @@ vdev_prop_init(void) zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0, PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING", boolean_table, sfeatures); + zprop_register_index(VDEV_PROP_SIT_OUT, "sit_out", 0, + PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "SIT_OUT", boolean_table, + sfeatures); zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0, PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP", boolean_table, sfeatures); + zprop_register_index(VDEV_PROP_AUTOSIT, "autosit", 0, + PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "AUTOSIT", boolean_table, + sfeatures); /* default index properties */ zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE, PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "FAILFAST", boolean_table, sfeatures); + zprop_register_index(VDEV_PROP_SLOW_IO_EVENTS, "slow_io_events", + B_TRUE, PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", + "SLOW_IO_EVENTS", boolean_table, sfeatures); /* hidden properties */ zprop_register_hidden(VDEV_PROP_NAME, "name", PROP_TYPE_STRING, diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index df41e3b49204..48bf99f1aeb7 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -1157,7 +1157,7 @@ buf_fini(void) #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages - * should be using vmem_free() in the linux kernel\ + * should be using vmem_free() in the linux kernel. */ vmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); @@ -1392,6 +1392,7 @@ arc_get_complevel(arc_buf_t *buf) return (buf->b_hdr->b_complevel); } +__maybe_unused static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { @@ -4650,10 +4651,10 @@ arc_flush_task(void *arg) arc_flush_impl(spa_guid, B_FALSE); arc_async_flush_remove(spa_guid, af->af_cache_level); - uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time); - if (elaspsed > 0) { + uint64_t elapsed = NSEC2MSEC(gethrtime() - start_time); + if (elapsed > 0) { zfs_dbgmsg("spa %llu arc flushed in %llu ms", - (u_longlong_t)spa_guid, (u_longlong_t)elaspsed); + (u_longlong_t)spa_guid, (u_longlong_t)elapsed); } } @@ -8547,7 +8548,7 @@ l2arc_dev_get_next(void) * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. 
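Regarding the xattr_table reordering above: both "on" and "sa" map to the same index value, and assuming value-to-name lookups return the first matching table entry (the usual behaviour of these zprop index tables), listing "sa" first changes which name is reported back for that value. A standalone model of that first-match lookup (local enum values are stand-ins, not the ZFS definitions):

#include <stdio.h>

enum { XATTR_OFF, XATTR_DIR, XATTR_SA };	/* example values only */

struct idx_entry { const char *name; int value; };

static const struct idx_entry xattr_table[] = {
	{ "off", XATTR_OFF },
	{ "sa",  XATTR_SA },	/* listed before "on", so it wins for XATTR_SA */
	{ "on",  XATTR_SA },
	{ "dir", XATTR_DIR },
	{ NULL,  0 }
};

static const char *
index_to_string(int value)
{
	for (int i = 0; xattr_table[i].name != NULL; i++)
		if (xattr_table[i].value == value)
			return (xattr_table[i].name);
	return ("?");
}

int
main(void)
{
	/* With "sa" first, a dataset using SA xattrs reports "sa", not "on". */
	printf("%s\n", index_to_string(XATTR_SA));
	return (0);
}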
*/ - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); mutex_enter(&l2arc_dev_mtx); /* if there are no vdevs, there is nothing to do */ @@ -8590,7 +8591,7 @@ out: */ if (next != NULL) spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (next); } @@ -9151,7 +9152,7 @@ top: if (dev->l2ad_first) { /* * This is the first sweep through the device. There is - * nothing to evict. We have already trimmmed the + * nothing to evict. We have already trimmed the * whole device. */ goto out; @@ -10085,12 +10086,12 @@ l2arc_device_teardown(void *arg) kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); vmem_free(remdev, sizeof (l2arc_dev_t)); - uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time); - if (elaspsed > 0) { + uint64_t elapsed = NSEC2MSEC(gethrtime() - start_time); + if (elapsed > 0) { zfs_dbgmsg("spa %llu, vdev %llu removed in %llu ms", (u_longlong_t)rva->rva_spa_gid, (u_longlong_t)rva->rva_vdev_gid, - (u_longlong_t)elaspsed); + (u_longlong_t)elapsed); } if (rva->rva_async) @@ -10230,7 +10231,7 @@ l2arc_stop(void) void l2arc_spa_rebuild_start(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); /* * Locate the spa's l2arc devices and kick off rebuild threads. @@ -10255,7 +10256,7 @@ l2arc_spa_rebuild_start(spa_t *spa) void l2arc_spa_rebuild_stop(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock) || + ASSERT(spa_namespace_held() || spa->spa_export_thread == curthread); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { diff --git a/sys/contrib/openzfs/module/zfs/bpobj.c b/sys/contrib/openzfs/module/zfs/bpobj.c index ea9fbd036c6e..afcb2374f824 100644 --- a/sys/contrib/openzfs/module/zfs/bpobj.c +++ b/sys/contrib/openzfs/module/zfs/bpobj.c @@ -752,7 +752,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) } dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - numsubsub * sizeof (subobj), subdb->db_data, tx); + numsubsub * sizeof (subobj), subdb->db_data, tx, + DMU_READ_NO_PREFETCH); dmu_buf_rele(subdb, FTAG); bpo->bpo_phys->bpo_num_subobjs += numsubsub; @@ -777,7 +778,7 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) dmu_write(bpo->bpo_os, bpo->bpo_object, bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), numbps * sizeof (blkptr_t), - bps->db_data, tx); + bps->db_data, tx, DMU_READ_NO_PREFETCH); dmu_buf_rele(bps, FTAG); bpo->bpo_phys->bpo_num_blkptrs += numbps; @@ -794,7 +795,7 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - sizeof (subobj), &subobj, tx); + sizeof (subobj), &subobj, tx, DMU_READ_NO_PREFETCH); bpo->bpo_phys->bpo_num_subobjs++; } diff --git a/sys/contrib/openzfs/module/zfs/bptree.c b/sys/contrib/openzfs/module/zfs/bptree.c index a98bba3eb259..1274278e8e91 100644 --- a/sys/contrib/openzfs/module/zfs/bptree.c +++ b/sys/contrib/openzfs/module/zfs/bptree.c @@ -137,7 +137,8 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, bte = kmem_zalloc(sizeof (*bte), KM_SLEEP); bte->be_birth_txg = birth_txg; bte->be_bp = *bp; - dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx); + dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx, + DMU_READ_NO_PREFETCH); kmem_free(bte, sizeof (*bte)); dmu_buf_will_dirty(db, tx); @@ -247,7 +248,8 @@ bptree_iterate(objset_t *os, uint64_t obj, 
boolean_t free, bptree_itor_t func, ZB_DESTROYED_OBJSET); ASSERT0(bte.be_zb.zb_level); dmu_write(os, obj, i * sizeof (bte), - sizeof (bte), &bte, tx); + sizeof (bte), &bte, tx, + DMU_READ_NO_PREFETCH); if (err == EIO || err == ECKSUM || err == ENXIO) { /* @@ -269,7 +271,8 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, */ bte.be_birth_txg = UINT64_MAX; dmu_write(os, obj, i * sizeof (bte), - sizeof (bte), &bte, tx); + sizeof (bte), &bte, tx, + DMU_READ_NO_PREFETCH); } if (!ioerr) { diff --git a/sys/contrib/openzfs/module/zfs/brt.c b/sys/contrib/openzfs/module/zfs/brt.c index 40664354aa73..08a6bd52ab31 100644 --- a/sys/contrib/openzfs/module/zfs/brt.c +++ b/sys/contrib/openzfs/module/zfs/brt.c @@ -260,8 +260,8 @@ static int brt_zap_prefetch = 1; #define BRT_DEBUG(...) do { } while (0) #endif -static int brt_zap_default_bs = 12; -static int brt_zap_default_ibs = 12; +static int brt_zap_default_bs = 13; +static int brt_zap_default_ibs = 13; static kstat_t *brt_ksp; @@ -454,6 +454,7 @@ brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) VERIFY(mos_entries != 0); VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd, &brtvd->bv_mos_entries_dnode)); + dnode_set_storage_type(brtvd->bv_mos_entries_dnode, DMU_OT_DDT_ZAP); rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); brtvd->bv_mos_entries = mos_entries; rw_exit(&brtvd->bv_mos_entries_lock); @@ -508,8 +509,8 @@ brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd) size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1; spa_config_exit(spa, SCL_VDEV, FTAG); - entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); + entcount = vmem_zalloc(nblocks * BRT_BLOCKSIZE, KM_SLEEP); bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP); if (!brtvd->bv_initiated) { @@ -530,9 +531,8 @@ brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd) memcpy(entcount, brtvd->bv_entcount, sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); - vmem_free(brtvd->bv_entcount, - sizeof (entcount[0]) * brtvd->bv_size); onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); + vmem_free(brtvd->bv_entcount, onblocks * BRT_BLOCKSIZE); memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), BT_SIZEOFMAP(onblocks))); kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks)); @@ -581,13 +581,14 @@ brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd) */ error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t), - brtvd->bv_entcount, DMU_READ_NO_PREFETCH); + brtvd->bv_entcount, DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO); if (error != 0) return (error); ASSERT(bvphys->bvp_mos_entries != 0); VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd, &brtvd->bv_mos_entries_dnode)); + dnode_set_storage_type(brtvd->bv_mos_entries_dnode, DMU_OT_DDT_ZAP); rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); brtvd->bv_mos_entries = bvphys->bvp_mos_entries; rw_exit(&brtvd->bv_mos_entries_lock); @@ -613,9 +614,9 @@ brt_vdev_dealloc(brt_vdev_t *brtvd) ASSERT(brtvd->bv_initiated); ASSERT0(avl_numnodes(&brtvd->bv_tree)); - vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); - brtvd->bv_entcount = NULL; uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); + vmem_free(brtvd->bv_entcount, nblocks * BRT_BLOCKSIZE); + brtvd->bv_entcount = NULL; kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks)); brtvd->bv_bitmap = NULL; @@ -807,10 +808,10 @@ brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) /* * TODO: Walk 
brtvd->bv_bitmap and write only the dirty blocks. */ - dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, - brtvd->bv_size * sizeof (brtvd->bv_entcount[0]), - brtvd->bv_entcount, tx); uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); + dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, + nblocks * BRT_BLOCKSIZE, brtvd->bv_entcount, tx, + DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO); memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks)); brtvd->bv_entcount_dirty = FALSE; } @@ -1510,6 +1511,31 @@ brt_load(spa_t *spa) } void +brt_prefetch_all(spa_t *spa) +{ + /* + * Load all BRT entries for each vdev. This is intended to perform + * a prefetch on all such blocks. For the same reason that brt_prefetch + * (called from brt_pending_add) isn't locked, this is also not locked. + */ + brt_rlock(spa); + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + brt_unlock(spa); + + rw_enter(&brtvd->bv_mos_entries_lock, RW_READER); + if (brtvd->bv_mos_entries != 0) { + (void) zap_prefetch_object(spa->spa_meta_objset, + brtvd->bv_mos_entries); + } + rw_exit(&brtvd->bv_mos_entries_lock); + + brt_rlock(spa); + } + brt_unlock(spa); +} + +void brt_unload(spa_t *spa) { if (spa->spa_brt_rangesize == 0) diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index 3d0f88b36336..72c597609ade 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -446,7 +446,10 @@ static boolean_t dbuf_include_in_metadata_cache(dmu_buf_impl_t *db) { DB_DNODE_ENTER(db); - dmu_object_type_t type = DB_DNODE(db)->dn_type; + dnode_t *dn = DB_DNODE(db); + dmu_object_type_t type = dn->dn_storage_type; + if (type == DMU_OT_NONE) + type = dn->dn_type; DB_DNODE_EXIT(db); /* Check if this dbuf is one of the types we care about */ @@ -2270,14 +2273,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (dn->dn_objset->os_dsl_dataset != NULL) rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); #endif - /* - * We make this assert for private objects as well, but after we - * check if we're already dirty. They are allowed to re-dirty - * in syncing context. - */ - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); mutex_enter(&db->db_mtx); /* @@ -2289,12 +2284,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_state == DB_CACHED || db->db_state == DB_FILL || db->db_state == DB_NOFILL); - mutex_enter(&dn->dn_mtx); - dnode_set_dirtyctx(dn, tx, db); - if (tx->tx_txg > dn->dn_dirty_txg) - dn->dn_dirty_txg = tx->tx_txg; - mutex_exit(&dn->dn_mtx); - if (db->db_blkid == DMU_SPILL_BLKID) dn->dn_have_spill = B_TRUE; @@ -2313,13 +2302,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (dr_next); } - /* - * Only valid if not already dirty. - */ - ASSERT(dn->dn_object == 0 || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); - ASSERT3U(dn->dn_nlevels, >, db->db_level); /* @@ -2557,12 +2539,13 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * Due to our use of dn_nlevels below, this can only be called - * in open context, unless we are operating on the MOS. - * From syncing context, dn_nlevels may be different from the - * dn_nlevels used when dbuf was dirtied. + * in open context, unless we are operating on the MOS or it's + * a special object. 
From syncing context, dn_nlevels may be + * different from the dn_nlevels used when dbuf was dirtied. */ ASSERT(db->db_objset == dmu_objset_pool(db->db_objset)->dp_meta_objset || + DMU_OBJECT_IS_SPECIAL(db->db.db_object) || txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT0(db->db_level); diff --git a/sys/contrib/openzfs/module/zfs/ddt.c b/sys/contrib/openzfs/module/zfs/ddt.c index d6658375f810..0dc9adc7fd4f 100644 --- a/sys/contrib/openzfs/module/zfs/ddt.c +++ b/sys/contrib/openzfs/module/zfs/ddt.c @@ -1701,9 +1701,11 @@ ddt_load(spa_t *spa) } } - error = ddt_log_load(ddt); - if (error != 0 && error != ENOENT) - return (error); + if (ddt->ddt_flags & DDT_FLAG_LOG) { + error = ddt_log_load(ddt); + if (error != 0 && error != ENOENT) + return (error); + } DDT_KSTAT_SET(ddt, dds_log_active_entries, avl_numnodes(&ddt->ddt_log_active->ddl_tree)); diff --git a/sys/contrib/openzfs/module/zfs/ddt_log.c b/sys/contrib/openzfs/module/zfs/ddt_log.c index 3d30e244c1f7..3d42c51365a8 100644 --- a/sys/contrib/openzfs/module/zfs/ddt_log.c +++ b/sys/contrib/openzfs/module/zfs/ddt_log.c @@ -176,11 +176,13 @@ ddt_log_update_stats(ddt_t *ddt) * that's reasonable to expect anyway. */ dmu_object_info_t doi; - uint64_t nblocks; - dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi); - nblocks = doi.doi_physical_blocks_512; - dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi); - nblocks += doi.doi_physical_blocks_512; + uint64_t nblocks = 0; + if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, + &doi) == 0) + nblocks += doi.doi_physical_blocks_512; + if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, + &doi) == 0) + nblocks += doi.doi_physical_blocks_512; ddt_object_t *ddo = &ddt->ddt_log_stats; ddo->ddo_count = @@ -220,7 +222,7 @@ ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu) VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length, B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp, - DMU_READ_NO_PREFETCH)); + DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO)); dlu->dlu_tx = tx; dlu->dlu_block = dlu->dlu_offset = 0; @@ -243,6 +245,13 @@ ddt_log_alloc_entry(ddt_t *ddt) } static void +ddt_log_free_entry(ddt_t *ddt, ddt_log_entry_t *ddle) +{ + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); +} + +static void ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) { /* Create the log tree entry from a live or stored entry */ @@ -289,7 +298,8 @@ ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu) * we will fill it, and zero it out. */ if (dlu->dlu_offset == 0) { - dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE); + dmu_buf_will_fill_flags(db, dlu->dlu_tx, B_FALSE, + DMU_UNCACHEDIO); memset(db->db_data, 0, db->db_size); } @@ -347,8 +357,7 @@ ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); avl_remove(&ddl->ddl_tree, ddle); - kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? - ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + ddt_log_free_entry(ddt, ddle); return (B_TRUE); } @@ -365,8 +374,7 @@ ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk) ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); avl_remove(&ddl->ddl_tree, ddle); - kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? 
- ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + ddt_log_free_entry(ddt, ddle); return (B_TRUE); } @@ -527,8 +535,7 @@ ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl) IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree)); while ((ddle = avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) { - kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? - ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + ddt_log_free_entry(ddt, ddle); } ASSERT(avl_is_empty(&ddl->ddl_tree)); } @@ -591,7 +598,7 @@ ddt_log_load_one(ddt_t *ddt, uint_t n) for (uint64_t offset = 0; offset < hdr.dlh_length; offset += dn->dn_datablksz) { err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db, - DMU_READ_PREFETCH); + DMU_READ_PREFETCH | DMU_UNCACHEDIO); if (err != 0) { dnode_rele(dn, FTAG); ddt_log_empty(ddt, ddl); @@ -727,7 +734,7 @@ ddt_log_load(ddt_t *ddt) ddle = fe; fe = AVL_NEXT(fl, fe); avl_remove(fl, ddle); - + ddt_log_free_entry(ddt, ddle); ddle = ae; ae = AVL_NEXT(al, ae); } diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index f7f808d5b8f7..5690f8afad00 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -635,7 +635,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, const void *tag, int *numbufsp, - dmu_buf_t ***dbpp) + dmu_buf_t ***dbpp, dmu_flags_t flags) { dnode_t *dn; int err; @@ -645,7 +645,7 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp, DMU_READ_PREFETCH); + numbufsp, dbpp, flags); dnode_rele(dn, FTAG); @@ -655,14 +655,14 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, - dmu_buf_t ***dbpp) + dmu_buf_t ***dbpp, dmu_flags_t flags) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; int err; DB_DNODE_ENTER(db); err = dmu_buf_hold_array_by_dnode(DB_DNODE(db), offset, length, read, - tag, numbufsp, dbpp, DMU_READ_PREFETCH); + tag, numbufsp, dbpp, flags); DB_DNODE_EXIT(db); return (err); @@ -759,6 +759,8 @@ dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, */ uint8_t ibps = ibs - SPA_BLKPTRSHIFT; limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs; + if (limit == 0) + end2 = start2; do { level2++; start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps; @@ -848,12 +850,15 @@ dmu_prefetch_wait(objset_t *os, uint64_t object, uint64_t offset, uint64_t size) return (err); /* - * Chunk the requests (16 indirects worth) so that we can be interrupted + * Chunk the requests (16 indirects worth) so that we can be + * interrupted. Prefetch at least SPA_MAXBLOCKSIZE at a time + * to better utilize pools with smaller block sizes. 
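A back-of-the-envelope check of when the SPA_MAXBLOCKSIZE floor matters, using the same formula as the code below; the 128 KiB indirect block and 512-byte data block sizes are assumptions for the example, while SPA_BLKPTRSHIFT and SPA_MAXBLOCKSIZE mirror the in-tree values:

#include <stdio.h>
#include <stdint.h>

#define SPA_BLKPTRSHIFT		7		/* a block pointer is 128 bytes */
#define SPA_MAXBLOCKSIZE	(1ULL << 24)	/* 16 MiB */

int
main(void)
{
	int indblkshift = 17;	/* assumed 128 KiB indirect blocks */
	int datablkshift = 9;	/* assumed 512-byte data blocks */

	/* One L1 indirect block covers this many L0 data blocks. */
	uint64_t nbps = 1ULL << (indblkshift - SPA_BLKPTRSHIFT);

	/* 16 indirects worth of data, as in dmu_prefetch_wait(). */
	uint64_t chunksize = (nbps * 16) << datablkshift;
	printf("16 indirects cover %llu MiB\n",
	    (unsigned long long)(chunksize >> 20));	/* 8 MiB */

	if (chunksize < SPA_MAXBLOCKSIZE)
		chunksize = SPA_MAXBLOCKSIZE;
	printf("chunk actually used: %llu MiB\n",
	    (unsigned long long)(chunksize >> 20));	/* 16 MiB */
	return (0);
}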
*/ uint64_t chunksize; if (dn->dn_indblkshift) { uint64_t nbps = bp_span_in_blocks(dn->dn_indblkshift, 1); chunksize = (nbps * 16) << dn->dn_datablkshift; + chunksize = MAX(chunksize, SPA_MAXBLOCKSIZE); } else { chunksize = dn->dn_datablksz; } @@ -1291,7 +1296,7 @@ dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) + const void *buf, dmu_tx_t *tx, dmu_flags_t flags) { dmu_buf_t **dbp; int numbufs; @@ -1300,8 +1305,8 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, return; VERIFY0(dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); - dmu_write_impl(dbp, numbufs, offset, size, buf, tx, DMU_READ_PREFETCH); + FALSE, FTAG, &numbufs, &dbp, flags)); + dmu_write_impl(dbp, numbufs, offset, size, buf, tx, flags); dmu_buf_rele_array(dbp, numbufs, FTAG); } @@ -1344,7 +1349,7 @@ dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, return; VERIFY0(dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); + FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); for (i = 0; i < numbufs; i++) { dmu_buf_t *db = dbp[i]; @@ -1381,7 +1386,7 @@ dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_buf_t **dbp; VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, - &numbufs, &dbp)); + &numbufs, &dbp, DMU_READ_PREFETCH)); for (i = 0; i < numbufs; i++) dmu_buf_redact(dbp[i], tx); dmu_buf_rele_array(dbp, numbufs, FTAG); @@ -1689,8 +1694,8 @@ dmu_object_cached_size(objset_t *os, uint64_t object, dmu_object_info_from_dnode(dn, &doi); - for (uint64_t off = 0; off < doi.doi_max_offset; - off += dmu_prefetch_max) { + for (uint64_t off = 0; off < doi.doi_max_offset && + dmu_prefetch_max > 0; off += dmu_prefetch_max) { /* dbuf_read doesn't prefetch L1 blocks. 
*/ dmu_prefetch_by_dnode(dn, 1, off, dmu_prefetch_max, ZIO_PRIORITY_SYNC_READ); @@ -2590,7 +2595,7 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int error, numbufs; error = dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG, - &numbufs, &dbp); + &numbufs, &dbp, DMU_READ_PREFETCH); if (error != 0) { if (error == ESRCH) { error = SET_ERROR(ENXIO); @@ -2691,7 +2696,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, spa = os->os_spa; VERIFY0(dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG, - &numbufs, &dbp)); + &numbufs, &dbp, DMU_READ_PREFETCH)); ASSERT3U(nbps, ==, numbufs); /* diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c index a77f338bdfd3..8e6b569c2100 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_objset.c +++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c @@ -2037,6 +2037,8 @@ userquota_updates_task(void *arg) dn->dn_id_flags |= DN_ID_CHKED_BONUS; } dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); + ASSERT3U(dn->dn_dirtycnt, >, 0); + dn->dn_dirtycnt--; mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); @@ -2070,6 +2072,10 @@ dnode_rele_task(void *arg) dnode_t *dn; while ((dn = multilist_sublist_head(list)) != NULL) { + mutex_enter(&dn->dn_mtx); + ASSERT3U(dn->dn_dirtycnt, >, 0); + dn->dn_dirtycnt--; + mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); dnode_rele(dn, &os->os_synced_dnodes); } diff --git a/sys/contrib/openzfs/module/zfs/dmu_redact.c b/sys/contrib/openzfs/module/zfs/dmu_redact.c index 5a22ed71a5fe..c087be4c811d 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_redact.c +++ b/sys/contrib/openzfs/module/zfs/dmu_redact.c @@ -544,7 +544,8 @@ redaction_list_update_sync(void *arg, dmu_tx_t *tx) if (index == bufsize) { dmu_write(mos, rl->rl_object, rl->rl_phys->rlp_num_entries * sizeof (*buf), - bufsize * sizeof (*buf), buf, tx); + bufsize * sizeof (*buf), buf, tx, + DMU_READ_NO_PREFETCH); rl->rl_phys->rlp_num_entries += bufsize; index = 0; } @@ -552,7 +553,8 @@ redaction_list_update_sync(void *arg, dmu_tx_t *tx) } if (index > 0) { dmu_write(mos, rl->rl_object, rl->rl_phys->rlp_num_entries * - sizeof (*buf), index * sizeof (*buf), buf, tx); + sizeof (*buf), index * sizeof (*buf), buf, tx, + DMU_READ_NO_PREFETCH); rl->rl_phys->rlp_num_entries += index; } kmem_free(buf, bufsize * sizeof (*buf)); diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c index 963ff41232a3..e0cc4a7e13e0 100644 --- a/sys/contrib/openzfs/module/zfs/dnode.c +++ b/sys/contrib/openzfs/module/zfs/dnode.c @@ -173,9 +173,7 @@ dnode_cons(void *arg, void *unused, int kmflag) dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; - dn->dn_dirty_txg = 0; - dn->dn_dirtyctx = 0; - dn->dn_dirtyctx_firstset = NULL; + dn->dn_dirtycnt = 0; dn->dn_bonus = NULL; dn->dn_have_spill = B_FALSE; dn->dn_zio = NULL; @@ -229,9 +227,7 @@ dnode_dest(void *arg, void *unused) ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_free_txg); ASSERT0(dn->dn_assigned_txg); - ASSERT0(dn->dn_dirty_txg); - ASSERT0(dn->dn_dirtyctx); - ASSERT0P(dn->dn_dirtyctx_firstset); + ASSERT0(dn->dn_dirtycnt); ASSERT0P(dn->dn_bonus); ASSERT(!dn->dn_have_spill); ASSERT0P(dn->dn_zio); @@ -692,10 +688,8 @@ dnode_destroy(dnode_t *dn) dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; - dn->dn_dirty_txg = 0; + dn->dn_dirtycnt = 0; - dn->dn_dirtyctx = 0; - dn->dn_dirtyctx_firstset = NULL; if (dn->dn_bonus != NULL) { 
mutex_enter(&dn->dn_bonus->db_mtx); dbuf_destroy(dn->dn_bonus); @@ -800,11 +794,9 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dn->dn_bonuslen = bonuslen; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; - dn->dn_dirtyctx = 0; dn->dn_free_txg = 0; - dn->dn_dirtyctx_firstset = NULL; - dn->dn_dirty_txg = 0; + dn->dn_dirtycnt = 0; dn->dn_allocated_txg = tx->tx_txg; dn->dn_id_flags = 0; @@ -955,9 +947,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ndn->dn_allocated_txg = odn->dn_allocated_txg; ndn->dn_free_txg = odn->dn_free_txg; ndn->dn_assigned_txg = odn->dn_assigned_txg; - ndn->dn_dirty_txg = odn->dn_dirty_txg; - ndn->dn_dirtyctx = odn->dn_dirtyctx; - ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; + ndn->dn_dirtycnt = odn->dn_dirtycnt; ASSERT0(zfs_refcount_count(&odn->dn_tx_holds)); zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds); ASSERT(avl_is_empty(&ndn->dn_dbufs)); @@ -1020,9 +1010,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) odn->dn_allocated_txg = 0; odn->dn_free_txg = 0; odn->dn_assigned_txg = 0; - odn->dn_dirty_txg = 0; - odn->dn_dirtyctx = 0; - odn->dn_dirtyctx_firstset = NULL; + odn->dn_dirtycnt = 0; odn->dn_have_spill = B_FALSE; odn->dn_zio = NULL; odn->dn_oldused = 0; @@ -1273,8 +1261,8 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots) } else if (DN_SLOT_IS_PTR(dn)) { mutex_enter(&dn->dn_mtx); boolean_t can_free = (dn->dn_type == DMU_OT_NONE && - zfs_refcount_is_zero(&dn->dn_holds) && - !DNODE_IS_DIRTY(dn)); + dn->dn_dirtycnt == 0 && + zfs_refcount_is_zero(&dn->dn_holds)); mutex_exit(&dn->dn_mtx); if (!can_free) @@ -1757,17 +1745,23 @@ dnode_hold(objset_t *os, uint64_t object, const void *tag, dnode_t **dnp) * reference on the dnode. Returns FALSE if unable to add a * new reference. */ +static boolean_t +dnode_add_ref_locked(dnode_t *dn, const void *tag) +{ + ASSERT(MUTEX_HELD(&dn->dn_mtx)); + if (zfs_refcount_is_zero(&dn->dn_holds)) + return (FALSE); + VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag)); + return (TRUE); +} + boolean_t dnode_add_ref(dnode_t *dn, const void *tag) { mutex_enter(&dn->dn_mtx); - if (zfs_refcount_is_zero(&dn->dn_holds)) { - mutex_exit(&dn->dn_mtx); - return (FALSE); - } - VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag)); + boolean_t r = dnode_add_ref_locked(dn, tag); mutex_exit(&dn->dn_mtx); - return (TRUE); + return (r); } void @@ -1830,31 +1824,20 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots) } /* - * Checks if the dnode itself is dirty, or is carrying any uncommitted records. - * It is important to check both conditions, as some operations (eg appending - * to a file) can dirty both as a single logical unit, but they are not synced - * out atomically, so checking one and not the other can result in an object - * appearing to be clean mid-way through a commit. + * Test if the dnode is dirty, or carrying uncommitted records. * - * Do not change this lightly! If you get it wrong, dmu_offset_next() can - * detect a hole where there is really data, leading to silent corruption. + * dn_dirtycnt is the number of txgs this dnode is dirty on. It's incremented + * in dnode_setdirty() the first time the dnode is dirtied on a txg, and + * decremented in either dnode_rele_task() or userquota_updates_task() when the + * txg is synced out. 
*/ boolean_t dnode_is_dirty(dnode_t *dn) { mutex_enter(&dn->dn_mtx); - - for (int i = 0; i < TXG_SIZE; i++) { - if (multilist_link_active(&dn->dn_dirty_link[i]) || - !list_is_empty(&dn->dn_dirty_records[i])) { - mutex_exit(&dn->dn_mtx); - return (B_TRUE); - } - } - + boolean_t dirty = (dn->dn_dirtycnt != 0); mutex_exit(&dn->dn_mtx); - - return (B_FALSE); + return (dirty); } void @@ -1916,7 +1899,11 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) * dnode will hang around after we finish processing its * children. */ - VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); + mutex_enter(&dn->dn_mtx); + VERIFY(dnode_add_ref_locked(dn, (void *)(uintptr_t)tx->tx_txg)); + dn->dn_dirtycnt++; + ASSERT3U(dn->dn_dirtycnt, <=, 3); + mutex_exit(&dn->dn_mtx); (void) dbuf_dirty(dn->dn_dbuf, tx); @@ -2221,32 +2208,6 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, mutex_exit(&dn->dn_dbufs_mtx); } -void -dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, const void *tag) -{ - /* - * Don't set dirtyctx to SYNC if we're just modifying this as we - * initialize the objset. - */ - if (dn->dn_dirtyctx == DN_UNDIRTIED) { - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - - if (ds != NULL) { - rrw_enter(&ds->ds_bp_rwlock, RW_READER, tag); - } - if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { - if (dmu_tx_is_syncing(tx)) - dn->dn_dirtyctx = DN_DIRTY_SYNC; - else - dn->dn_dirtyctx = DN_DIRTY_OPEN; - dn->dn_dirtyctx_firstset = tag; - } - if (ds != NULL) { - rrw_exit(&ds->ds_bp_rwlock, tag); - } - } -} - static void dnode_partial_zero(dnode_t *dn, uint64_t off, uint64_t blkoff, uint64_t len, dmu_tx_t *tx) @@ -2535,26 +2496,27 @@ dnode_diduse_space(dnode_t *dn, int64_t delta) } /* - * Scans a block at the indicated "level" looking for a hole or data, - * depending on 'flags'. + * Scans the block at the indicated "level" looking for a hole or data, + * depending on 'flags' starting from array position given by *index. * - * If level > 0, then we are scanning an indirect block looking at its - * pointers. If level == 0, then we are looking at a block of dnodes. + * If lvl > 0, then we are scanning an indirect block looking at its + * pointers. If lvl == 0, then we are looking at a block of dnodes. * * If we don't find what we are looking for in the block, we return ESRCH. - * Otherwise, return with *offset pointing to the beginning (if searching - * forwards) or end (if searching backwards) of the range covered by the - * block pointer we matched on (or dnode). + * Otherwise, return with *index set to the matching array position. * - * The basic search algorithm used below by dnode_next_offset() is to - * use this function to search up the block tree (widen the search) until - * we find something (i.e., we don't return ESRCH) and then search back - * down the tree (narrow the search) until we reach our original search - * level. + * In both cases, *offset is updated to point at the matched BP/dnode or + * the next offset to search (unless at the limit of possible offsets). + * + * The basic search algorithm used below by dnode_next_offset() uses this + * function to perform a block-order tree traversal. We search up the block + * tree (widen the search) until we find something (i.e., we don't return + * ESRCH) and then search back down the tree (narrow the search) until we + * reach our original search level or backtrack up because nothing matches. 
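The widen/narrow traversal described above can be hard to picture from the code alone. The following is a deliberately tiny two-level model (made-up names, not the ZFS implementation; it omits deeper levels, txg filtering and backwards search) showing the same shape: consult the upper level to skip empty regions, descend to scan, and advance when the lower level has nothing at or after the starting index.

#include <stdio.h>
#include <stdbool.h>

#define SLOTS_PER_GROUP	8
#define NGROUPS		4

/* Lower level: individual slots (L0 blocks or dnodes in the real code). */
static bool slot[NGROUPS][SLOTS_PER_GROUP];
/* Upper level: per-group summary ("does anything exist below me?"). */
static bool group_nonempty[NGROUPS];

/* Return the first occupied slot number >= start, or -1 if none. */
static int
next_occupied(int start)
{
	int group = start / SLOTS_PER_GROUP;
	int idx = start % SLOTS_PER_GROUP;

	while (group < NGROUPS) {
		/* Widen: let the upper level skip whole empty groups. */
		if (!group_nonempty[group]) {
			group++;
			idx = 0;
			continue;
		}

		/* Narrow: scan the group itself, starting at idx. */
		for (; idx < SLOTS_PER_GROUP; idx++)
			if (slot[group][idx])
				return (group * SLOTS_PER_GROUP + idx);

		/*
		 * The group has data, but none at or after idx: go back up
		 * and continue from the next group, loosely analogous to
		 * the index adjustment in dnode_next_offset() when a lower
		 * level returns ESRCH.
		 */
		group++;
		idx = 0;
	}
	return (-1);
}

int
main(void)
{
	slot[2][5] = true;
	group_nonempty[2] = true;
	printf("%d\n", next_occupied(3));	/* prints 21 */
	return (0);
}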
*/ static int -dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, - int lvl, uint64_t blkfill, uint64_t txg) +dnode_next_offset_level(dnode_t *dn, int flags, int lvl, uint64_t blkid, + int *index, uint64_t blkfill, uint64_t txg, uint64_t *offset) { dmu_buf_impl_t *db = NULL; void *data = NULL; @@ -2580,20 +2542,12 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, rrw_enter(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock, RW_READER, FTAG); } else { - uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); if (error) { if (error != ENOENT) return (error); if (hole) return (0); - /* - * This can only happen when we are searching up - * the block tree for data. We don't really need to - * adjust the offset, as we will just end up looking - * at the pointer to this block in its parent, and its - * going to be unallocated, so we will skip over it. - */ return (SET_ERROR(ESRCH)); } error = dbuf_read(db, NULL, @@ -2621,8 +2575,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, ASSERT(dn->dn_type == DMU_OT_DNODE); ASSERT(!(flags & DNODE_FIND_BACKWARDS)); - for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1); - i < blkfill; i += dnp[i].dn_extra_slots + 1) { + for (i = *index; i < blkfill; i += dnp[i].dn_extra_slots + 1) { if ((dnp[i].dn_type == DMU_OT_NONE) == hole) break; } @@ -2630,11 +2583,11 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, if (i == blkfill) error = SET_ERROR(ESRCH); + *index = i; *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) + (i << DNODE_SHIFT); } else { blkptr_t *bp = data; - uint64_t start = *offset; span = (lvl - 1) * epbs + dn->dn_datablkshift; minfill = 0; maxfill = blkfill << ((lvl - 1) * epbs); @@ -2644,40 +2597,27 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, else minfill++; - if (span >= 8 * sizeof (*offset)) { - /* This only happens on the highest indirection level */ - ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1); - *offset = 0; - } else { - *offset = *offset >> span; - } - - for (i = BF64_GET(*offset, 0, epbs); - i >= 0 && i < epb; i += inc) { + for (i = *index; i >= 0 && i < epb; i += inc) { if (BP_GET_FILL(&bp[i]) >= minfill && BP_GET_FILL(&bp[i]) <= maxfill && (hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg)) break; - if (inc > 0 || *offset > 0) - *offset += inc; } - if (span >= 8 * sizeof (*offset)) { - *offset = start; - } else { - *offset = *offset << span; - } - - if (inc < 0) { - /* traversing backwards; position offset at the end */ - if (span < 8 * sizeof (*offset)) - *offset = MIN(*offset + (1ULL << span) - 1, - start); - } else if (*offset < start) { - *offset = start; - } if (i < 0 || i >= epb) error = SET_ERROR(ESRCH); + + *index = i; + if (span < 8 * sizeof (*offset)) { + uint64_t nblk = blkid << epbs; + if (i >= 0 || blkid != 0) + nblk += i; + if ((nblk >> (8 * sizeof (*offset) - span)) == 0) + *offset = (flags & DNODE_FIND_BACKWARDS) ? + /* backwards: position offset at the end */ + MIN(*offset, ((nblk + 1) << span) - 1) : + MAX(*offset, nblk << span); + } } if (db != NULL) { @@ -2701,6 +2641,18 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, * DNODES_PER_BLOCK for the meta dnode, and some fraction of * DNODES_PER_BLOCK when searching for sparse regions thereof. * + * If minlvl == 0, this searches for dnodes or unallocated dnodes. + * If found, *offset points to the first offset of the matched dnode. + * Backwards search is not allowed for dnodes. 
+ * + * If minlvl > 0, this searches for blocks at the given level. + * If found, *offset points to the first L0 offset of the block + * (or for backwards search, the last offset, inclusive). + * + * If not found, in both cases, *offset is set to the first (or last) + * offset of the unallocated indirect block where the search ended or + * the initial offset if no such block was encountered. + * * Examples: * * dnode_next_offset(dn, flags, offset, 1, 1, 0); @@ -2721,7 +2673,8 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, int minlvl, uint64_t blkfill, uint64_t txg) { - uint64_t initial_offset = *offset; + uint64_t blkid; + int index, epbs; int lvl, maxlvl; int error = 0; @@ -2743,18 +2696,52 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, goto out; } + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; maxlvl = dn->dn_phys->dn_nlevels; - for (lvl = minlvl; lvl <= maxlvl; lvl++) { - error = dnode_next_offset_level(dn, - flags, offset, lvl, blkfill, txg); - if (error != ESRCH) - break; + if (minlvl > 0) { + uint64_t n = dbuf_whichblock(dn, minlvl - 1, *offset); + blkid = n >> epbs; + index = BF64_GET(n, 0, epbs); + } else { + blkid = dbuf_whichblock(dn, 0, *offset); + index = (*offset >> DNODE_SHIFT) & (blkfill - 1); + ASSERT3U(BF64_GET(*offset, 0, DNODE_SHIFT), ==, 0); } - while (error == 0 && --lvl >= minlvl) { + for (lvl = minlvl; lvl <= maxlvl; ) { error = dnode_next_offset_level(dn, - flags, offset, lvl, blkfill, txg); + flags, lvl, blkid, &index, blkfill, txg, offset); + + if (error == 0 && lvl > minlvl) { + /* Continue search at matched block in lvl-1. */ + blkid = (blkid << epbs) + index; + index = 0; + --lvl; + } else if (error == ESRCH && lvl < maxlvl) { + /* + * Continue search at next/prev index in lvl+1 block. + * + * Usually we only search upwards at the start of the + * search as higher level blocks point at a matching + * minlvl block in most cases, but we backtrack if not. + * + * This can happen for txg > 0 searches if the block + * contains only BPs/dnodes freed at that txg. It also + * happens if we are still syncing out the tree, and + * some BP's at higher levels are not updated yet. + * + * We must adjust index to avoid coming back to the + * same offset and getting stuck looping forever. The + * next loop goes up again if index is -1 or (1<<epbs). + */ + index = BF64_GET(blkid, 0, epbs) + + ((flags & DNODE_FIND_BACKWARDS) ? -1 : 1); + blkid = blkid >> epbs; + ++lvl; + } else { + break; + } } /* @@ -2766,9 +2753,6 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, error = 0; } - if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? 
- initial_offset < *offset : initial_offset > *offset)) - error = SET_ERROR(ESRCH); out: if (!(flags & DNODE_FIND_HAVELOCK)) rw_exit(&dn->dn_struct_rwlock); diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c index 9f4399af56bd..3f649ffb44e4 100644 --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -3966,7 +3966,8 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) object = space_map_object(msp->ms_sm); dmu_write(spa->spa_meta_objset, msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * - msp->ms_id, sizeof (uint64_t), &object, tx); + msp->ms_id, sizeof (uint64_t), &object, tx, + DMU_READ_NO_PREFETCH); } /* @@ -4292,7 +4293,8 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) VERIFY3U(new_object, !=, 0); dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * - msp->ms_id, sizeof (uint64_t), &new_object, tx); + msp->ms_id, sizeof (uint64_t), &new_object, tx, + DMU_READ_NO_PREFETCH); VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, msp->ms_start, msp->ms_size, vd->vdev_ashift)); @@ -6328,7 +6330,7 @@ metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) } dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size, - &entry, tx); + &entry, tx, DMU_READ_NO_PREFETCH); } void diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c index 7db72b9b04b0..b8ba40ecdc9d 100644 --- a/sys/contrib/openzfs/module/zfs/mmp.c +++ b/sys/contrib/openzfs/module/zfs/mmp.c @@ -446,7 +446,7 @@ mmp_write_uberblock(spa_t *spa) uint64_t offset; hrtime_t lock_acquire_time = gethrtime(); - spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER); + spa_config_enter_priority(spa, SCL_STATE, mmp_tag, RW_READER); lock_acquire_time = gethrtime() - lock_acquire_time; if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10)) zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns " @@ -729,12 +729,12 @@ mmp_signal_all_threads(void) { spa_t *spa = NULL; - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); while ((spa = spa_next(spa))) { if (spa->spa_state == POOL_STATE_ACTIVE) mmp_signal_thread(spa); } - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); } ZFS_MODULE_PARAM_CALL(zfs_multihost, zfs_multihost_, interval, diff --git a/sys/contrib/openzfs/module/zfs/multilist.c b/sys/contrib/openzfs/module/zfs/multilist.c index 7b85d19e19ee..46fb79269310 100644 --- a/sys/contrib/openzfs/module/zfs/multilist.c +++ b/sys/contrib/openzfs/module/zfs/multilist.c @@ -81,7 +81,7 @@ multilist_create_impl(multilist_t *ml, size_t size, size_t offset, ml->ml_num_sublists = num; ml->ml_index_func = index_func; - ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) * + ml->ml_sublists = vmem_zalloc(sizeof (multilist_sublist_t) * ml->ml_num_sublists, KM_SLEEP); ASSERT3P(ml->ml_sublists, !=, NULL); @@ -134,7 +134,7 @@ multilist_destroy(multilist_t *ml) } ASSERT3P(ml->ml_sublists, !=, NULL); - kmem_free(ml->ml_sublists, + vmem_free(ml->ml_sublists, sizeof (multilist_sublist_t) * ml->ml_num_sublists); ml->ml_num_sublists = 0; diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c index ea2d2c7227c8..d73195f1a21f 100644 --- a/sys/contrib/openzfs/module/zfs/range_tree.c +++ b/sys/contrib/openzfs/module/zfs/range_tree.c @@ -585,7 +585,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size, * the size, since we do not support removing partial segments * of range trees with gaps. 
*/ - zfs_zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) - + zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) - zfs_rs_get_start_raw(rs, rt)); zfs_range_tree_stat_incr(rt, &rs_tmp); diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index b3bb46da263b..34de3f1d9525 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -141,7 +141,7 @@ typedef enum zti_modes { #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } -#define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } +#define ZTI_SCALE(min) { ZTI_MODE_SCALE, (min), 1 } #define ZTI_SYNC { ZTI_MODE_SYNC, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } @@ -180,13 +180,13 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ - { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ + { ZTI_N(8), ZTI_NULL, ZTI_SCALE(0), ZTI_NULL }, /* READ */ #ifdef illumos - { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ + { ZTI_SYNC, ZTI_N(5), ZTI_SCALE(0), ZTI_N(5) }, /* WRITE */ #else - { ZTI_SYNC, ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* WRITE */ + { ZTI_SYNC, ZTI_NULL, ZTI_SCALE(0), ZTI_NULL }, /* WRITE */ #endif - { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ + { ZTI_SCALE(32), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FLUSH */ { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ @@ -1082,7 +1082,7 @@ spa_change_guid(spa_t *spa, const uint64_t *guidp) int error; mutex_enter(&spa->spa_vdev_top_lock); - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); if (guidp != NULL) { guid = *guidp; @@ -1117,7 +1117,7 @@ spa_change_guid(spa_t *spa, const uint64_t *guidp) } out: - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); mutex_exit(&spa->spa_vdev_top_lock); return (error); @@ -1170,7 +1170,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) uint_t value = ztip->zti_value; uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - uint_t cpus, flags = TASKQ_DYNAMIC; + uint_t cpus, threads, flags = TASKQ_DYNAMIC; switch (mode) { case ZTI_MODE_FIXED: @@ -1183,8 +1183,8 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs, * not to exceed the number of spa allocators, and align to it. */ - cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); - count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq)); + threads = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); + count = MAX(1, threads / MAX(1, zio_taskq_write_tpq)); count = MAX(count, (zio_taskq_batch_pct + 99) / 100); count = MIN(count, spa->spa_alloc_count); while (spa->spa_alloc_count % count != 0 && @@ -1201,14 +1201,14 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) break; case ZTI_MODE_SCALE: - flags |= TASKQ_THREADS_CPU_PCT; /* * We want more taskqs to reduce lock contention, but we want * less for better request ordering and CPU utilization. 
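 *
 * Worked example (a sketch assuming the defaults zio_taskq_batch_pct = 80
 * and zio_taskq_batch_tpq = 0): on a 128-CPU system threads = 102, count
 * settles at 10 (the largest value with count * count <= 102), and
 * value = (102 * 100 + 640) / 1280 = 8, i.e. 10 taskqs at 8% of the CPUs
 * each, matching the "128 10 8%" row of the table further below.  With
 * ZTI_SCALE(32) on a 4-CPU system, threads is raised from 3 to the
 * 32-thread minimum, which works out to 2 taskqs of 16 threads in
 * absolute (non-percentage) mode.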
*/ - cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); + threads = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); + threads = MAX(threads, value); if (zio_taskq_batch_tpq > 0) { - count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) / + count = MAX(1, (threads + zio_taskq_batch_tpq / 2) / zio_taskq_batch_tpq); } else { /* @@ -1228,13 +1228,23 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) * 128 10 8% 10 100 * 256 14 6% 15 210 */ - count = 1 + cpus / 6; + cpus = MIN(threads, boot_ncpus); + count = 1 + threads / 6; while (count * count > cpus) count--; } - /* Limit each taskq within 100% to not trigger assertion. */ - count = MAX(count, (zio_taskq_batch_pct + 99) / 100); - value = (zio_taskq_batch_pct + count / 2) / count; + + /* + * Try to represent the number of threads per taskq as percent + * of online CPUs to allow scaling with later online/offline. + * Fall back to absolute numbers if can't. + */ + value = (threads * 100 + boot_ncpus * count / 2) / + (boot_ncpus * count); + if (value < 5 || value > 100) + value = MAX(1, (threads + count / 2) / count); + else + flags |= TASKQ_THREADS_CPU_PCT; break; case ZTI_MODE_NULL: @@ -1433,8 +1443,30 @@ spa_taskq_param_set(zio_type_t t, char *cfg) break; } + /* + * SCALE is optionally parameterised by minimum number of + * threads. + */ case ZTI_MODE_SCALE: { - const zio_taskq_info_t zti = ZTI_SCALE; + unsigned long long mint = 0; + if (c != NULL && *c != '\0') { + /* Need a number */ + if (!(isdigit(*c))) + break; + tok = c; + + /* Take digits */ + err = ddi_strtoull(tok, &tok, 10, &mint); + /* Must succeed, and moved forward */ + if (err != 0 || tok == c || *tok != '\0') + break; + + /* Sanity check */ + if (mint >= 16384) + break; + } + + const zio_taskq_info_t zti = ZTI_SCALE(mint); row[q] = zti; break; } @@ -1501,6 +1533,9 @@ spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline) pos += sprintf(&buf[pos], "%s%s,%u,%u", sep, modes[zti->zti_mode], zti->zti_count, zti->zti_value); + else if (zti->zti_mode == ZTI_MODE_SCALE && zti->zti_value > 0) + pos += sprintf(&buf[pos], "%s%s,%u", sep, + modes[zti->zti_mode], zti->zti_value); else pos += sprintf(&buf[pos], "%s%s", sep, modes[zti->zti_mode]); @@ -1520,9 +1555,10 @@ spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) { char *cfg = kmem_strdup(val); int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg); - kmem_free(cfg, strlen(val)+1); + kmem_strfree(cfg); return (-err); } + static int spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) { @@ -1534,14 +1570,30 @@ spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) { char *cfg = kmem_strdup(val); int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg); - kmem_free(cfg, strlen(val)+1); + kmem_strfree(cfg); return (-err); } + static int spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) { return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE)); } + +static int +spa_taskq_free_param_set(const char *val, zfs_kernel_param_t *kp) +{ + char *cfg = kmem_strdup(val); + int err = spa_taskq_param_set(ZIO_TYPE_FREE, cfg); + kmem_strfree(cfg); + return (-err); +} + +static int +spa_taskq_free_param_get(char *buf, zfs_kernel_param_t *kp) +{ + return (spa_taskq_param_get(ZIO_TYPE_FREE, buf, TRUE)); +} #else /* * On FreeBSD load-time parameters can be set up before malloc() is available, @@ -1574,6 +1626,19 @@ spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) return (err); return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); } + +static int +spa_taskq_free_param(ZFS_MODULE_PARAM_ARGS) +{ + char 
buf[SPA_TASKQ_PARAM_MAX]; + int err; + + (void) spa_taskq_param_get(ZIO_TYPE_FREE, buf, FALSE); + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err || req->newptr == NULL) + return (err); + return (spa_taskq_param_set(ZIO_TYPE_FREE, buf)); +} #endif #endif /* _KERNEL */ @@ -2187,7 +2252,7 @@ spa_should_sync_time_logger_on_unload(spa_t *spa) static void spa_unload(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock) || + ASSERT(spa_namespace_held() || spa->spa_export_thread == curthread); ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); @@ -5260,7 +5325,7 @@ spa_ld_read_checkpoint_txg(spa_t *spa) int error = 0; ASSERT0(spa->spa_checkpoint_txg); - ASSERT(MUTEX_HELD(&spa_namespace_lock) || + ASSERT(spa_namespace_held() || spa->spa_load_thread == curthread); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, @@ -5287,7 +5352,7 @@ spa_ld_mos_init(spa_t *spa, spa_import_type_t type) { int error = 0; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); /* @@ -5363,7 +5428,7 @@ spa_ld_checkpoint_rewind(spa_t *spa) uberblock_t checkpoint; int error = 0; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, @@ -5510,7 +5575,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) boolean_t update_config_cache = B_FALSE; hrtime_t load_start = gethrtime(); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); spa_load_note(spa, "LOADING"); @@ -5557,7 +5622,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * Drop the namespace lock for the rest of the function. */ spa->spa_load_thread = curthread; - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); /* * Retrieve the checkpoint txg if the pool has a checkpoint. @@ -5796,9 +5861,9 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_load_note(spa, "LOADED"); fail: - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); spa->spa_load_thread = NULL; - cv_broadcast(&spa_namespace_cv); + spa_namespace_broadcast(); return (error); @@ -5960,14 +6025,14 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, * up calling spa_open() again. The real fix is to figure out how to * avoid dsl_dir_open() calling this in the first place. 
*/ - if (MUTEX_NOT_HELD(&spa_namespace_lock)) { - mutex_enter(&spa_namespace_lock); + if (!spa_namespace_held()) { + spa_namespace_enter(FTAG); locked = B_TRUE; } if ((spa = spa_lookup(pool)) == NULL) { if (locked) - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (SET_ERROR(ENOENT)); } @@ -6004,7 +6069,7 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); spa_remove(spa); if (locked) - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (SET_ERROR(ENOENT)); } @@ -6024,7 +6089,7 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, spa_deactivate(spa); spa->spa_last_open_failed = error; if (locked) - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); *spapp = NULL; return (error); } @@ -6048,7 +6113,7 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, spa->spa_last_open_failed = 0; spa->spa_last_ubsync_txg = 0; spa->spa_load_txg = 0; - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); } if (firstopen) @@ -6081,13 +6146,13 @@ spa_inject_addref(char *name) { spa_t *spa; - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); if ((spa = spa_lookup(name)) == NULL) { - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (NULL); } spa->spa_inject_ref++; - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (spa); } @@ -6095,9 +6160,9 @@ spa_inject_addref(char *name) void spa_inject_delref(spa_t *spa) { - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); spa->spa_inject_ref--; - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); } /* @@ -6341,14 +6406,14 @@ spa_get_stats(const char *name, nvlist_t **config, */ if (altroot) { if (spa == NULL) { - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); spa = spa_lookup(name); if (spa) spa_altroot(spa, altroot, buflen); else altroot[0] = '\0'; spa = NULL; - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); } else { spa_altroot(spa, altroot, buflen); } @@ -6568,9 +6633,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, /* * If this pool already exists, return failure. 
*/ - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); if (spa_lookup(poolname) != NULL) { - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (SET_ERROR(EEXIST)); } @@ -6588,7 +6653,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (props && (error = spa_prop_validate(spa, props))) { spa_deactivate(spa); spa_remove(spa); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (error); } @@ -6621,14 +6686,14 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (error != 0) { spa_deactivate(spa); spa_remove(spa); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (error); } } if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { spa_deactivate(spa); spa_remove(spa); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (ENOTSUP); } @@ -6694,7 +6759,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_unload(spa); spa_deactivate(spa); spa_remove(spa); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (error); } @@ -6847,7 +6912,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_import_os(spa); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (0); } @@ -6872,9 +6937,9 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) /* * If a pool with this name exists, return failure. */ - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); if (spa_lookup(pool) != NULL) { - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (SET_ERROR(EEXIST)); } @@ -6901,7 +6966,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); zfs_dbgmsg("spa_import: verbatim import of %s", pool); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (0); } @@ -6960,7 +7025,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) spa_unload(spa); spa_deactivate(spa); spa_remove(spa); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (error); } @@ -7028,7 +7093,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); zvol_create_minors(pool); @@ -7060,7 +7125,7 @@ spa_tryimport(nvlist_t *tryconfig) (void) snprintf(name, MAXPATHLEN, "%s-%llx-%s", TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname); - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); spa = spa_add(name, tryconfig, NULL); spa_activate(spa, SPA_MODE_READ); kmem_free(name, MAXPATHLEN); @@ -7158,7 +7223,7 @@ spa_tryimport(nvlist_t *tryconfig) spa_unload(spa); spa_deactivate(spa); spa_remove(spa); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (config); } @@ -7186,15 +7251,15 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, if (!(spa_mode_global & SPA_MODE_WRITE)) return (SET_ERROR(EROFS)); - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); if ((spa = spa_lookup(pool)) == NULL) { - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (SET_ERROR(ENOENT)); } if (spa->spa_is_exporting) { /* the pool is being exported by another thread */ - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); } spa->spa_is_exporting = B_TRUE; @@ -7204,18 
+7269,18 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, * and see if we can export. */ spa_open_ref(spa, FTAG); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); spa_async_suspend(spa); if (spa->spa_zvol_taskq) { zvol_remove_minors(spa, spa_name(spa), B_TRUE); taskq_wait(spa->spa_zvol_taskq); } - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); spa->spa_export_thread = curthread; spa_close(spa, FTAG); if (spa->spa_state == POOL_STATE_UNINITIALIZED) { - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); goto export_spa; } @@ -7239,7 +7304,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, goto fail; } - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); /* * At this point we no longer hold the spa_namespace_lock and * there were no references on the spa. Future spa_lookups will @@ -7258,7 +7323,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { error = SET_ERROR(EXDEV); - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); goto fail; } @@ -7333,7 +7398,7 @@ export_spa: /* * Take the namespace lock for the actual spa_t removal */ - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); @@ -7351,8 +7416,8 @@ export_spa: /* * Wake up any waiters in spa_lookup() */ - cv_broadcast(&spa_namespace_cv); - mutex_exit(&spa_namespace_lock); + spa_namespace_broadcast(); + spa_namespace_exit(FTAG); return (0); fail: @@ -7363,8 +7428,8 @@ fail: /* * Wake up any waiters in spa_lookup() */ - cv_broadcast(&spa_namespace_cv); - mutex_exit(&spa_namespace_lock); + spa_namespace_broadcast(); + spa_namespace_exit(FTAG); return (error); } @@ -7574,10 +7639,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) */ (void) spa_vdev_exit(spa, vd, txg, 0); - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (0); } @@ -7694,7 +7759,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; @@ -8078,7 +8143,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) * as spa_vdev_resilver_done() calls this function everything * should be fine as the resilver will return right away. */ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? 
ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; @@ -8282,28 +8347,28 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) if (unspare) { spa_t *altspa = NULL; - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); while ((altspa = spa_next(altspa)) != NULL) { if (altspa->spa_state != POOL_STATE_ACTIVE || altspa == spa) continue; spa_open_ref(altspa, FTAG); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); spa_close(altspa, FTAG); } - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); /* search the rest of the vdevs for spares to remove */ spa_vdev_resilver_done(spa); } /* all done with the spa; OK to release */ - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); spa_close(spa, FTAG); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (error); } @@ -8312,7 +8377,7 @@ static int spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, list_t *vd_list) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); @@ -8396,7 +8461,7 @@ spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, * we can properly assess the vdev state before we commit to * the initializing operation. */ - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { @@ -8419,7 +8484,7 @@ spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, /* Sync out the initializing state */ txg_wait_synced(spa->spa_dsl_pool, 0); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); list_destroy(&vd_list); @@ -8430,7 +8495,7 @@ static int spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); @@ -8517,7 +8582,7 @@ spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, * we can properly assess the vdev state before we commit to * the TRIM operation. */ - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { @@ -8540,7 +8605,7 @@ spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, /* Sync out the TRIM state */ txg_wait_synced(spa->spa_dsl_pool, 0); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); list_destroy(&vd_list); @@ -8568,7 +8633,7 @@ spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, txg = spa_vdev_enter(spa); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? 
ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; @@ -9242,7 +9307,7 @@ spa_async_thread(void *arg) if (tasks & SPA_ASYNC_CONFIG_UPDATE) { uint64_t old_space, new_space; - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); old_space = metaslab_class_get_space(spa_normal_class(spa)); old_space += metaslab_class_get_space(spa_special_class(spa)); old_space += metaslab_class_get_space(spa_dedup_class(spa)); @@ -9260,7 +9325,7 @@ spa_async_thread(void *arg) spa_embedded_log_class(spa)); new_space += metaslab_class_get_space( spa_special_embedded_log_class(spa)); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); /* * If the pool grew as a result of the config update, @@ -9329,49 +9394,49 @@ spa_async_thread(void *arg) dsl_scan_restart_resilver(dp, 0); if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); } if (tasks & SPA_ASYNC_TRIM_RESTART) { - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); } if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); } /* * Kick off L2 cache whole device TRIM. */ if (tasks & SPA_ASYNC_L2CACHE_TRIM) { - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_l2arc(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); } /* * Kick off L2 cache rebuilding. */ if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); l2arc_spa_rebuild_start(spa); spa_config_exit(spa, SCL_L2ARC, FTAG); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); } /* @@ -9601,7 +9666,8 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) KM_SLEEP)); memset(packed + nvsize, 0, bufsize - nvsize); - dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); + dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx, + DMU_READ_NO_PREFETCH); vmem_free(packed, bufsize); @@ -10522,18 +10588,18 @@ void spa_sync_allpools(void) { spa_t *spa = NULL; - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); while ((spa = spa_next(spa)) != NULL) { if (spa_state(spa) != POOL_STATE_ACTIVE || !spa_writeable(spa) || spa_suspended(spa)) continue; spa_open_ref(spa, FTAG); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); txg_wait_synced(spa_get_dsl(spa), 0); - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); spa_close(spa, FTAG); } - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); } taskq_t * @@ -10680,7 +10746,7 @@ spa_evict_all(void) * Remove all cached state. All pools should be closed now, * so every spa in the AVL tree should be unreferenced. */ - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); while ((spa = spa_next(NULL)) != NULL) { /* * Stop async tasks. 
The async thread may need to detach @@ -10688,9 +10754,9 @@ spa_evict_all(void) * spa_namespace_lock, so we must drop it here. */ spa_open_ref(spa, FTAG); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); spa_async_suspend(spa); - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); spa_close(spa, FTAG); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { @@ -10699,7 +10765,7 @@ spa_evict_all(void) } spa_remove(spa); } - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); } vdev_t * @@ -11272,6 +11338,9 @@ ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW, "Configure IO queues for write IO"); +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_free, + spa_taskq_free_param_set, spa_taskq_free_param_get, ZMOD_RW, + "Configure IO queues for free IO"); #endif ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW, diff --git a/sys/contrib/openzfs/module/zfs/spa_checkpoint.c b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c index e07756c46748..a42aa62e6599 100644 --- a/sys/contrib/openzfs/module/zfs/spa_checkpoint.c +++ b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c @@ -427,7 +427,7 @@ spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) */ int error = dmu_buf_hold_array_by_bonus( checkpoint_sm->sm_dbuf, offset, size, - B_TRUE, FTAG, &numbufs, &dbp); + B_TRUE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (error != 0) { zfs_panic_recover("zfs: error %d was returned " "while prefetching checkpoint space map " diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c index 7d4d06659146..31216e9a7ccc 100644 --- a/sys/contrib/openzfs/module/zfs/spa_config.c +++ b/sys/contrib/openzfs/module/zfs/spa_config.c @@ -48,18 +48,17 @@ /* * Pool configuration repository. * - * Pool configuration is stored as a packed nvlist on the filesystem. By - * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot - * (when the ZFS module is loaded). Pools can also have the 'cachefile' - * property set that allows them to be stored in an alternate location until - * the control of external software. + * Pool configuration is stored as a packed nvlist on the filesystem. When + * pools are imported they are added to the /etc/zfs/zpool.cache file and + * removed from it when exported. For each cache file, we have a single nvlist + * which holds all the configuration information. Pools can also have the + * 'cachefile' property set which allows this config to be stored in an + * alternate location under the control of external software. + * - * For each cache file, we have a single nvlist which holds all the - * configuration information. When the module loads, we read this information - * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is - * maintained independently in spa.c. Whenever the namespace is modified, or - * the configuration of a pool is changed, we call spa_write_cachefile(), which - * walks through all the active pools and writes the configuration to disk. + * The kernel independently maintains an AVL tree of imported pools. See the + * "SPA locking" comment in spa.c. Whenever a pool configuration is modified + * we call spa_write_cachefile() which walks through all the active pools and + * writes the updated configuration to the /etc/zfs/zpool.cache file.
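+ *
+ * As a layout sketch (pool names are made up), the packed nvlist holds
+ * one nested nvlist per pool, keyed by pool name:
+ *
+ *	"tank"	-> { ZPOOL_CONFIG_POOL_GUID, ZPOOL_CONFIG_VDEV_TREE, ... }
+ *	"dozer"	-> { ZPOOL_CONFIG_POOL_GUID, ZPOOL_CONFIG_VDEV_TREE, ... }
+ *
+ * where each value is the config nvlist produced by spa_config_generate().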
*/ static uint64_t spa_config_generation = 1; @@ -69,94 +68,6 @@ static uint64_t spa_config_generation = 1; * userland pools when doing testing. */ char *spa_config_path = (char *)ZPOOL_CACHE; -#ifdef _KERNEL -static int zfs_autoimport_disable = B_TRUE; -#endif - -/* - * Called when the module is first loaded, this routine loads the configuration - * file into the SPA namespace. It does not actually open or load the pools; it - * only populates the namespace. - */ -void -spa_config_load(void) -{ - void *buf = NULL; - nvlist_t *nvlist, *child; - nvpair_t *nvpair; - char *pathname; - zfs_file_t *fp; - zfs_file_attr_t zfa; - uint64_t fsize; - int err; - -#ifdef _KERNEL - if (zfs_autoimport_disable) - return; -#endif - - /* - * Open the configuration file. - */ - pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); - - (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path); - - err = zfs_file_open(pathname, O_RDONLY, 0, &fp); - -#ifdef __FreeBSD__ - if (err) - err = zfs_file_open(ZPOOL_CACHE_BOOT, O_RDONLY, 0, &fp); -#endif - kmem_free(pathname, MAXPATHLEN); - - if (err) - return; - - if (zfs_file_getattr(fp, &zfa)) - goto out; - - fsize = zfa.zfa_size; - buf = kmem_alloc(fsize, KM_SLEEP); - - /* - * Read the nvlist from the file. - */ - if (zfs_file_read(fp, buf, fsize, NULL) < 0) - goto out; - - /* - * Unpack the nvlist. - */ - if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) - goto out; - - /* - * Iterate over all elements in the nvlist, creating a new spa_t for - * each one with the specified configuration. - */ - mutex_enter(&spa_namespace_lock); - nvpair = NULL; - while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { - if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) - continue; - - child = fnvpair_value_nvlist(nvpair); - - if (spa_lookup(nvpair_name(nvpair)) != NULL) - continue; - (void) spa_add(nvpair_name(nvpair), child, NULL); - } - mutex_exit(&spa_namespace_lock); - - nvlist_free(nvlist); - -out: - if (buf != NULL) - kmem_free(buf, fsize); - - zfs_file_close(fp); -} static int spa_config_remove(spa_config_dirent_t *dp) @@ -250,7 +161,7 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent, boolean_t ccw_failure; int error = 0; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); if (!(spa_mode_global & SPA_MODE_WRITE)) return; @@ -376,7 +287,7 @@ spa_all_configs(uint64_t *generation, nvlist_t **pools) if (*generation == spa_config_generation) return (SET_ERROR(EEXIST)); - int error = mutex_enter_interruptible(&spa_namespace_lock); + int error = spa_namespace_enter_interruptible(FTAG); if (error) return (SET_ERROR(EINTR)); @@ -391,7 +302,7 @@ spa_all_configs(uint64_t *generation, nvlist_t **pools) } } *generation = spa_config_generation; - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (0); } @@ -461,6 +372,8 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata); + fnvlist_add_uint64(config, ZPOOL_CONFIG_MIN_ALLOC, spa->spa_min_alloc); + fnvlist_add_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, spa->spa_max_alloc); if (spa->spa_comment != NULL) fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, spa->spa_comment); @@ -570,7 +483,7 @@ spa_config_update(spa_t *spa, int what) uint64_t txg; int c; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); spa_config_enter(spa, SCL_ALL, FTAG, 
RW_WRITER); txg = spa_last_synced_txg(spa) + 1; @@ -623,7 +536,6 @@ spa_config_update(spa_t *spa, int what) spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); } -EXPORT_SYMBOL(spa_config_load); EXPORT_SYMBOL(spa_all_configs); EXPORT_SYMBOL(spa_config_set); EXPORT_SYMBOL(spa_config_generate); @@ -634,8 +546,3 @@ EXPORT_SYMBOL(spa_config_update); ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD, "SPA config file (/etc/zfs/zpool.cache)"); #endif - -#ifdef _KERNEL -ZFS_MODULE_PARAM(zfs, zfs_, autoimport_disable, INT, ZMOD_RW, - "Disable pool import at module load"); -#endif diff --git a/sys/contrib/openzfs/module/zfs/spa_history.c b/sys/contrib/openzfs/module/zfs/spa_history.c index 60ab07944d72..b9d0c9656726 100644 --- a/sys/contrib/openzfs/module/zfs/spa_history.c +++ b/sys/contrib/openzfs/module/zfs/spa_history.c @@ -169,13 +169,14 @@ spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof); shpp->sh_eof += len; - dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx); + dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx, + DMU_READ_NO_PREFETCH); len -= firstwrite; if (len > 0) { /* write out the rest at the beginning of physical file */ dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len, - len, (char *)buf + firstwrite, tx); + len, (char *)buf + firstwrite, tx, DMU_READ_NO_PREFETCH); } return (0); diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index cce772eae598..bf22d2eb68e7 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -28,7 +28,7 @@ * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. - * Copyright (c) 2023, 2024, Klara Inc. + * Copyright (c) 2023, 2024, 2025, Klara, Inc. */ #include <sys/zfs_context.h> @@ -237,9 +237,10 @@ * locking is, always, based on spa_namespace_lock and spa_config_lock[]. */ -avl_tree_t spa_namespace_avl; -kmutex_t spa_namespace_lock; -kcondvar_t spa_namespace_cv; +static avl_tree_t spa_namespace_avl; +static kmutex_t spa_namespace_lock; +static kcondvar_t spa_namespace_cv; + static const int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; @@ -251,11 +252,11 @@ spa_mode_t spa_mode_global = SPA_MODE_UNINIT; #ifdef ZFS_DEBUG /* - * Everything except dprintf, set_error, spa, and indirect_remap is on - * by default in debug builds. + * Everything except dprintf, set_error, indirect_remap, and raidz_reconstruct + * is on by default in debug builds. 
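+ * ZFS_DEBUG_RAIDZ_RECONSTRUCT (like the others) can still be enabled at
+ * runtime by setting its bit in the zfs_flags module parameter.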
*/ int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR | - ZFS_DEBUG_INDIRECT_REMAP); + ZFS_DEBUG_INDIRECT_REMAP | ZFS_DEBUG_RAIDZ_RECONSTRUCT); #else int zfs_flags = 0; #endif @@ -510,7 +511,7 @@ spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw) static void spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, - int mmp_flag) + int priority_flag) { (void) tag; int wlocks_held = 0; @@ -526,7 +527,7 @@ spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, mutex_enter(&scl->scl_lock); if (rw == RW_READER) { while (scl->scl_writer || - (!mmp_flag && scl->scl_write_wanted)) { + (!priority_flag && scl->scl_write_wanted)) { cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { @@ -551,7 +552,7 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) } /* - * The spa_config_enter_mmp() allows the mmp thread to cut in front of + * The spa_config_enter_priority() allows the mmp thread to cut in front of * outstanding write lock requests. This is needed since the mmp updates are * time sensitive and failure to service them promptly will result in a * suspended pool. This pool suspension has been seen in practice when there is @@ -560,7 +561,7 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) */ void -spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw) +spa_config_enter_priority(spa_t *spa, int locks, const void *tag, krw_t rw) { spa_config_enter_impl(spa, locks, tag, rw, 1); } @@ -608,6 +609,58 @@ spa_config_held(spa_t *spa, int locks, krw_t rw) * ========================================================================== */ +void +spa_namespace_enter(const void *tag) +{ + (void) tag; + ASSERT(!MUTEX_HELD(&spa_namespace_lock)); + mutex_enter(&spa_namespace_lock); +} + +boolean_t +spa_namespace_tryenter(const void *tag) +{ + (void) tag; + ASSERT(!MUTEX_HELD(&spa_namespace_lock)); + return (mutex_tryenter(&spa_namespace_lock)); +} + +int +spa_namespace_enter_interruptible(const void *tag) +{ + (void) tag; + ASSERT(!MUTEX_HELD(&spa_namespace_lock)); + return (mutex_enter_interruptible(&spa_namespace_lock)); +} + +void +spa_namespace_exit(const void *tag) +{ + (void) tag; + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + mutex_exit(&spa_namespace_lock); +} + +boolean_t +spa_namespace_held(void) +{ + return (MUTEX_HELD(&spa_namespace_lock)); +} + +void +spa_namespace_wait(void) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + cv_wait(&spa_namespace_cv, &spa_namespace_lock); +} + +void +spa_namespace_broadcast(void) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + cv_broadcast(&spa_namespace_cv); +} + /* * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. * Returns NULL if no matching spa_t is found. 
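A minimal usage sketch of the spa_namespace_*() wrappers added above (illustrative only; the tag is currently discarded with (void) tag, but callers pass FTAG to match the convention used by spa_config_enter()):

	spa_namespace_enter(FTAG);
	spa_t *spa = spa_lookup(name);
	if (spa != NULL)
		spa_open_ref(spa, FTAG);
	spa_namespace_exit(FTAG);

Assertions that previously used MUTEX_HELD(&spa_namespace_lock) become spa_namespace_held(), since the lock variable itself is now static to spa_misc.c.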
@@ -620,7 +673,7 @@ spa_lookup(const char *name) avl_index_t where; char *cp; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); retry: (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); @@ -645,7 +698,7 @@ retry: spa->spa_load_thread != curthread) || (spa->spa_export_thread != NULL && spa->spa_export_thread != curthread)) { - cv_wait(&spa_namespace_cv, &spa_namespace_lock); + spa_namespace_wait(); goto retry; } @@ -697,7 +750,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa_t *spa; spa_config_dirent_t *dp; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); @@ -747,7 +800,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa_config_lock_init(spa); spa_stats_init(spa); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); avl_add(&spa_namespace_avl, spa); /* @@ -806,6 +859,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; spa->spa_min_alloc = INT_MAX; + spa->spa_max_alloc = 0; spa->spa_gcd_alloc = INT_MAX; /* Reset cached value */ @@ -836,7 +890,7 @@ spa_remove(spa_t *spa) { spa_config_dirent_t *dp; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); ASSERT(spa_state(spa) == POOL_STATE_UNINITIALIZED); ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0); ASSERT0(spa->spa_waiters); @@ -915,7 +969,7 @@ spa_remove(spa_t *spa) spa_t * spa_next(spa_t *prev) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); if (prev) return (AVL_NEXT(&spa_namespace_avl, prev)); @@ -937,7 +991,7 @@ void spa_open_ref(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref || - MUTEX_HELD(&spa_namespace_lock) || + spa_namespace_held() || spa->spa_load_thread == curthread); (void) zfs_refcount_add(&spa->spa_refcount, tag); } @@ -950,7 +1004,7 @@ void spa_close(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || - MUTEX_HELD(&spa_namespace_lock) || + spa_namespace_held() || spa->spa_load_thread == curthread || spa->spa_export_thread == curthread); (void) zfs_refcount_remove(&spa->spa_refcount, tag); @@ -979,7 +1033,7 @@ spa_async_close(spa_t *spa, const void *tag) boolean_t spa_refcount_zero(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock) || + ASSERT(spa_namespace_held() || spa->spa_export_thread == curthread); return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref); @@ -1226,7 +1280,7 @@ uint64_t spa_vdev_enter(spa_t *spa) { mutex_enter(&spa->spa_vdev_top_lock); - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); ASSERT0(spa->spa_export_thread); @@ -1245,7 +1299,7 @@ uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid) { mutex_enter(&spa->spa_vdev_top_lock); - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); ASSERT0(spa->spa_export_thread); @@ -1269,7 +1323,7 @@ spa_vdev_detach_enter(spa_t *spa, uint64_t guid) uint64_t spa_vdev_config_enter(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); @@ -1284,7 +1338,7 @@ void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, const char *tag) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); int config_changed = B_FALSE; @@ -1373,7 +1427,7 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) 
vdev_rebuild_restart(spa); spa_vdev_config_exit(spa, vd, txg, error, FTAG); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); mutex_exit(&spa->spa_vdev_top_lock); return (error); @@ -1451,9 +1505,9 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) * If the config changed, update the config cache. */ if (config_changed) { - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); } return (error); @@ -1500,7 +1554,7 @@ spa_by_guid(uint64_t pool_guid, uint64_t device_guid) spa_t *spa; avl_tree_t *t = &spa_namespace_avl; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { if (spa->spa_state == POOL_STATE_UNINITIALIZED) @@ -1582,7 +1636,7 @@ spa_load_guid_exists(uint64_t guid) { avl_tree_t *t = &spa_namespace_avl; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); for (spa_t *spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { if (spa_load_guid(spa) == guid) @@ -1865,6 +1919,19 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) } /* + * Return the range of minimum allocation sizes for the normal allocation + * class. This can be used by external consumers of the DMU to estimate + * potential wasted capacity when setting the recordsize for an object. + * This is mainly for dRAID pools which always pad to a full stripe width. + */ +void +spa_get_min_alloc_range(spa_t *spa, uint64_t *min_alloc, uint64_t *max_alloc) +{ + *min_alloc = spa->spa_min_alloc; + *max_alloc = spa->spa_max_alloc; +} + +/* * Return the amount of slop space in bytes. It is typically 1/32 of the pool * (3.2%), minus the embedded log space. On very small pools, it may be * slightly larger than this. 
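 * (For example, a 1 TiB pool normally reserves about 1/32 of its size,
 * i.e. roughly 32 GiB, as slop.)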
On very large pools, it will be capped to @@ -2186,10 +2253,10 @@ spa_set_deadman_ziotime(hrtime_t ns) spa_t *spa = NULL; if (spa_mode_global != SPA_MODE_UNINIT) { - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); while ((spa = spa_next(spa)) != NULL) spa->spa_deadman_ziotime = ns; - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); } } @@ -2199,10 +2266,10 @@ spa_set_deadman_synctime(hrtime_t ns) spa_t *spa = NULL; if (spa_mode_global != SPA_MODE_UNINIT) { - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); while ((spa = spa_next(spa)) != NULL) spa->spa_deadman_synctime = ns; - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); } } @@ -2548,13 +2615,6 @@ spa_name_compare(const void *a1, const void *a2) } void -spa_boot_init(void *unused) -{ - (void) unused; - spa_config_load(); -} - -void spa_init(spa_mode_t mode) { mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); @@ -2607,7 +2667,6 @@ spa_init(spa_mode_t mode) chksum_init(); zpool_prop_init(); zpool_feature_init(); - spa_config_load(); vdev_prop_init(); l2arc_start(); scan_init(); @@ -3042,10 +3101,10 @@ param_set_deadman_failmode_common(const char *val) return (SET_ERROR(EINVAL)); if (spa_mode_global != SPA_MODE_UNINIT) { - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); while ((spa = spa_next(spa)) != NULL) spa_set_deadman_failmode(spa, val); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); } return (0); @@ -3093,6 +3152,7 @@ EXPORT_SYMBOL(spa_version); EXPORT_SYMBOL(spa_state); EXPORT_SYMBOL(spa_load_state); EXPORT_SYMBOL(spa_freeze_txg); +EXPORT_SYMBOL(spa_get_min_alloc_range); /* for Lustre */ EXPORT_SYMBOL(spa_get_dspace); EXPORT_SYMBOL(spa_update_dspace); EXPORT_SYMBOL(spa_deflate); @@ -3128,7 +3188,6 @@ EXPORT_SYMBOL(spa_has_slogs); EXPORT_SYMBOL(spa_is_root); EXPORT_SYMBOL(spa_writeable); EXPORT_SYMBOL(spa_mode); -EXPORT_SYMBOL(spa_namespace_lock); EXPORT_SYMBOL(spa_trust_config); EXPORT_SYMBOL(spa_missing_tvds_allowed); EXPORT_SYMBOL(spa_set_missing_tvds); diff --git a/sys/contrib/openzfs/module/zfs/space_map.c b/sys/contrib/openzfs/module/zfs/space_map.c index 5f24963f2291..f20c49ebb6de 100644 --- a/sys/contrib/openzfs/module/zfs/space_map.c +++ b/sys/contrib/openzfs/module/zfs/space_map.c @@ -537,7 +537,7 @@ space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx) SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length, - sizeof (dentry), &dentry, tx); + sizeof (dentry), &dentry, tx, DMU_READ_NO_PREFETCH); sm->sm_phys->smp_length += sizeof (dentry); } diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index 9cf35e379000..2a4d1876251f 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -29,7 +29,7 @@ * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. - * Copyright (c) 2021, Klara Inc. + * Copyright (c) 2021, 2025, Klara, Inc. * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. 
*/ @@ -449,32 +449,53 @@ vdev_get_nparity(vdev_t *vd) } static int -vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value) +vdev_prop_get_objid(vdev_t *vd, uint64_t *objid) { - spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; - uint64_t objid; - int err; if (vd->vdev_root_zap != 0) { - objid = vd->vdev_root_zap; + *objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { - objid = vd->vdev_top_zap; + *objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { - objid = vd->vdev_leaf_zap; + *objid = vd->vdev_leaf_zap; } else { return (EINVAL); } + return (0); +} + +static int +vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + uint64_t objid; + int err; + + if (vdev_prop_get_objid(vd, &objid) != 0) + return (EINVAL); + err = zap_lookup(mos, objid, vdev_prop_to_name(prop), sizeof (uint64_t), 1, value); - if (err == ENOENT) *value = vdev_prop_default_numeric(prop); return (err); } +static int +vdev_prop_get_bool(vdev_t *vd, vdev_prop_t prop, boolean_t *bvalue) +{ + int err; + uint64_t ivalue; + + err = vdev_prop_get_int(vd, prop, &ivalue); + *bvalue = ivalue != 0; + + return (err); +} + /* * Get the number of data disks for a top-level vdev. */ @@ -737,8 +758,12 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) */ vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N); vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T); + vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N); vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T); + + vd->vdev_slow_io_events = vdev_prop_default_numeric( + VDEV_PROP_SLOW_IO_EVENTS); vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N); vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T); @@ -1086,6 +1111,10 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } } + if (top_level && (ops == &vdev_raidz_ops || ops == &vdev_draid_ops)) + vd->vdev_autosit = + vdev_prop_default_numeric(VDEV_PROP_AUTOSIT); + /* * Add ourselves to the parent's list of children. 
*/ @@ -1187,6 +1216,9 @@ vdev_free(vdev_t *vd) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); + if (vd->vdev_prev_histo) + kmem_free(vd->vdev_prev_histo, + sizeof (uint64_t) * VDEV_L_HISTO_BUCKETS); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); @@ -1490,12 +1522,14 @@ vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) { if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; - if (spa->spa_gcd_alloc == INT_MAX) { + + if (min_alloc > spa->spa_max_alloc) + spa->spa_max_alloc = min_alloc; + + if (spa->spa_gcd_alloc == INT_MAX) spa->spa_gcd_alloc = min_alloc; - } else { - spa->spa_gcd_alloc = vdev_gcd(min_alloc, - spa->spa_gcd_alloc); - } + else + spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc); } void @@ -1553,8 +1587,7 @@ vdev_metaslab_group_create(vdev_t *vd) if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; - uint64_t min_alloc = vdev_get_min_alloc(vd); - vdev_spa_set_alloc(spa, min_alloc); + vdev_spa_set_alloc(spa, vdev_get_min_alloc(vd)); } } } @@ -3857,6 +3890,26 @@ vdev_load(vdev_t *vd) } } + if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { + spa_t *spa = vd->vdev_spa; + uint64_t autosit; + + error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + vdev_prop_to_name(VDEV_PROP_AUTOSIT), sizeof (autosit), + 1, &autosit); + if (error == 0) { + vd->vdev_autosit = autosit == 1; + } else if (error == ENOENT) { + vd->vdev_autosit = vdev_prop_default_numeric( + VDEV_PROP_AUTOSIT); + } else { + vdev_dbgmsg(vd, + "vdev_load: zap_lookup(top_zap=%llu) " + "failed [error=%d]", + (u_longlong_t)vd->vdev_top_zap, error); + } + } + /* * Load any rebuild state from the top-level vdev zap. */ @@ -3903,6 +3956,11 @@ vdev_load(vdev_t *vd) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); + error = vdev_prop_get_bool(vd, VDEV_PROP_SLOW_IO_EVENTS, + &vd->vdev_slow_io_events); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N, &vd->vdev_slow_io_n); if (error && error != ENOENT) @@ -4616,6 +4674,8 @@ vdev_clear(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_dio_verify_errors = 0; vd->vdev_stat.vs_slow_ios = 0; + atomic_store_64(&vd->vdev_outlier_count, 0); + vd->vdev_read_sit_out_expire = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); @@ -5950,15 +6010,8 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx) /* * Set vdev property values in the vdev props mos object. 
*/ - if (vd->vdev_root_zap != 0) { - objid = vd->vdev_root_zap; - } else if (vd->vdev_top_zap != 0) { - objid = vd->vdev_top_zap; - } else if (vd->vdev_leaf_zap != 0) { - objid = vd->vdev_leaf_zap; - } else { + if (vdev_prop_get_objid(vd, &objid) != 0) panic("unexpected vdev type"); - } mutex_enter(&spa->spa_props_lock); @@ -6107,6 +6160,56 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) } vd->vdev_failfast = intval & 1; break; + case VDEV_PROP_SIT_OUT: + /* Only expose this for a draid or raidz leaf */ + if (!vd->vdev_ops->vdev_op_leaf || + vd->vdev_top == NULL || + (vd->vdev_top->vdev_ops != &vdev_raidz_ops && + vd->vdev_top->vdev_ops != &vdev_draid_ops)) { + error = ENOTSUP; + break; + } + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + if (intval == 1) { + vdev_t *ancestor = vd; + while (ancestor->vdev_parent != vd->vdev_top) + ancestor = ancestor->vdev_parent; + vdev_t *pvd = vd->vdev_top; + uint_t sitouts = 0; + for (int i = 0; i < pvd->vdev_children; i++) { + if (pvd->vdev_child[i] == ancestor) + continue; + if (vdev_sit_out_reads( + pvd->vdev_child[i], 0)) { + sitouts++; + } + } + if (sitouts >= vdev_get_nparity(pvd)) { + error = ZFS_ERR_TOO_MANY_SITOUTS; + break; + } + if (error == 0) + vdev_raidz_sit_child(vd, + INT64_MAX - gethrestime_sec()); + } else { + vdev_raidz_unsit_child(vd); + } + break; + case VDEV_PROP_AUTOSIT: + if (vd->vdev_ops != &vdev_raidz_ops && + vd->vdev_ops != &vdev_draid_ops) { + error = ENOTSUP; + break; + } + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_autosit = intval == 1; + break; case VDEV_PROP_CHECKSUM_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; @@ -6135,6 +6238,13 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) } vd->vdev_io_t = intval; break; + case VDEV_PROP_SLOW_IO_EVENTS: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_slow_io_events = intval != 0; + break; case VDEV_PROP_SLOW_IO_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; @@ -6176,6 +6286,7 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) nvpair_t *elem = NULL; nvlist_t *nvprops = NULL; uint64_t intval = 0; + boolean_t boolval = 0; char *strval = NULL; const char *propname = NULL; vdev_prop_t prop; @@ -6189,15 +6300,8 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops); - if (vd->vdev_root_zap != 0) { - objid = vd->vdev_root_zap; - } else if (vd->vdev_top_zap != 0) { - objid = vd->vdev_top_zap; - } else if (vd->vdev_leaf_zap != 0) { - objid = vd->vdev_leaf_zap; - } else { + if (vdev_prop_get_objid(vd, &objid) != 0) return (SET_ERROR(EINVAL)); - } ASSERT(objid != 0); mutex_enter(&spa->spa_props_lock); @@ -6456,6 +6560,19 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) ZPROP_SRC_NONE); } continue; + case VDEV_PROP_SIT_OUT: + /* Only expose this for a draid or raidz leaf */ + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_top != NULL && + (vd->vdev_top->vdev_ops == + &vdev_raidz_ops || + vd->vdev_top->vdev_ops == + &vdev_draid_ops)) { + vdev_prop_add_list(outnvl, propname, + NULL, vdev_sit_out_reads(vd, 0), + ZPROP_SRC_NONE); + } + continue; case VDEV_PROP_TRIM_SUPPORT: /* only valid for leaf vdevs */ if (vd->vdev_ops->vdev_op_leaf) { @@ -6506,6 +6623,41 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) vdev_prop_add_list(outnvl, propname, strval, intval, src); break; + case 
VDEV_PROP_AUTOSIT: + /* Only raidz vdevs cannot have this property */ + if (vd->vdev_ops != &vdev_raidz_ops && + vd->vdev_ops != &vdev_draid_ops) { + src = ZPROP_SRC_NONE; + intval = ZPROP_BOOLEAN_NA; + } else { + err = vdev_prop_get_int(vd, prop, + &intval); + if (err && err != ENOENT) + break; + + if (intval == + vdev_prop_default_numeric(prop)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + } + + vdev_prop_add_list(outnvl, propname, NULL, + intval, src); + break; + + case VDEV_PROP_SLOW_IO_EVENTS: + err = vdev_prop_get_bool(vd, prop, &boolval); + if (err && err != ENOENT) + break; + + src = ZPROP_SRC_LOCAL; + if (boolval == vdev_prop_default_numeric(prop)) + src = ZPROP_SRC_DEFAULT; + + vdev_prop_add_list(outnvl, propname, NULL, + boolval, src); + break; case VDEV_PROP_CHECKSUM_N: case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c index a05289102af2..8588cfee3f7d 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_draid.c +++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2018 Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + * Copyright (c) 2025, Klara, Inc. */ #include <sys/zfs_context.h> @@ -1996,6 +1997,33 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) rc->rc_allow_repair = 1; } } + + if (vdev_sit_out_reads(cvd, zio->io_flags)) { + rr->rr_outlier_cnt++; + ASSERT0(rc->rc_latency_outlier); + rc->rc_latency_outlier = 1; + } + } + + /* + * When the row contains a latency outlier and sufficient parity + * exists to reconstruct the column data, then skip reading the + * known slow child vdev as a performance optimization. + */ + if (rr->rr_outlier_cnt > 0 && + (rr->rr_firstdatacol - rr->rr_missingparity) >= + (rr->rr_missingdata + 1)) { + + for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error == 0 && rc->rc_latency_outlier) { + rr->rr_missingdata++; + rc->rc_error = SET_ERROR(EAGAIN); + rc->rc_skipped = 1; + break; + } + } } /* diff --git a/sys/contrib/openzfs/module/zfs/vdev_file.c b/sys/contrib/openzfs/module/zfs/vdev_file.c index f457669bc809..20b4db65ec06 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_file.c +++ b/sys/contrib/openzfs/module/zfs/vdev_file.c @@ -228,7 +228,8 @@ vdev_file_io_strategy(void *arg) abd_return_buf_copy(zio->io_abd, buf, size); } else { buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); - err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); + err = zfs_file_pwrite(vf->vf_file, buf, size, off, + vd->vdev_ashift, &resid); abd_return_buf(zio->io_abd, buf, size); } zio->io_error = err; diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c b/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c index c0127829c26c..ab7069f44b37 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c @@ -147,7 +147,7 @@ vdev_indirect_births_add_entry(vdev_indirect_births_t *vib, old_size = vdev_indirect_births_size_impl(vib); dmu_write(vib->vib_objset, vib->vib_object, old_size, sizeof (vibe), - &vibe, tx); + &vibe, tx, DMU_READ_NO_PREFETCH); vib->vib_phys->vib_count++; new_size = vdev_indirect_births_size_impl(vib); diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c b/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c index 1515ddc1baa2..da90a8de016f 100644 --- 
a/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c @@ -459,13 +459,14 @@ vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, dmu_write(vim->vim_objset, vim->vim_object, vim->vim_phys->vimp_num_entries * sizeof (*mapbuf), i * sizeof (*mapbuf), - mapbuf, tx); + mapbuf, tx, DMU_READ_NO_PREFETCH); if (vim->vim_havecounts) { dmu_write(vim->vim_objset, vim->vim_phys->vimp_counts_object, vim->vim_phys->vimp_num_entries * sizeof (*countbuf), - i * sizeof (*countbuf), countbuf, tx); + i * sizeof (*countbuf), countbuf, tx, + DMU_READ_NO_PREFETCH); } vim->vim_phys->vimp_num_entries += i; } diff --git a/sys/contrib/openzfs/module/zfs/vdev_initialize.c b/sys/contrib/openzfs/module/zfs/vdev_initialize.c index 27188c46e561..d13da1e5a663 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_initialize.c +++ b/sys/contrib/openzfs/module/zfs/vdev_initialize.c @@ -685,7 +685,7 @@ vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list) (void) spa; vdev_t *vd; - ASSERT(MUTEX_HELD(&spa_namespace_lock) || + ASSERT(spa_namespace_held() || spa->spa_export_thread == curthread); while ((vd = list_remove_head(vd_list)) != NULL) { @@ -728,7 +728,7 @@ vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state, if (vd_list == NULL) { vdev_initialize_stop_wait_impl(vd); } else { - ASSERT(MUTEX_HELD(&spa_namespace_lock) || + ASSERT(spa_namespace_held() || vd->vdev_spa->spa_export_thread == curthread); list_insert_tail(vd_list, vd); } @@ -761,7 +761,7 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) spa_t *spa = vd->vdev_spa; list_t vd_list; - ASSERT(MUTEX_HELD(&spa_namespace_lock) || + ASSERT(spa_namespace_held() || spa->spa_export_thread == curthread); list_create(&vd_list, sizeof (vdev_t), @@ -781,7 +781,7 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) void vdev_initialize_restart(vdev_t *vd) { - ASSERT(MUTEX_HELD(&spa_namespace_lock) || + ASSERT(spa_namespace_held() || vd->vdev_spa->spa_load_thread == curthread); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c index c44f654b0261..7e222eac5edc 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_label.c +++ b/sys/contrib/openzfs/module/zfs/vdev_label.c @@ -511,6 +511,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_MIN_ALLOC, + vdev_get_min_alloc(vd)); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); if (vd->vdev_noalloc) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, @@ -860,8 +862,8 @@ retry: } } - if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) { - flags |= ZIO_FLAG_TRYHARD; + if (config == NULL && !(flags & ZIO_FLAG_IO_RETRY)) { + flags |= ZIO_FLAG_IO_RETRY; goto retry; } @@ -1077,7 +1079,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) size_t buflen; int error; uint64_t spare_guid = 0, l2cache_guid = 0; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_TRYHARD; boolean_t reason_spare = (reason == VDEV_LABEL_SPARE || (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)); boolean_t reason_l2cache = (reason == VDEV_LABEL_L2CACHE || (reason == @@ -1221,7 +1224,6 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t 
reason) /* * Write everything in parallel. */ -retry: zio = zio_root(spa, NULL, NULL, flags); for (int l = 0; l < VDEV_LABELS; l++) { @@ -1246,11 +1248,6 @@ retry: error = zio_wait(zio); - if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { - flags |= ZIO_FLAG_TRYHARD; - goto retry; - } - nvlist_free(label); abd_free(bootenv); abd_free(ub_abd); @@ -1396,7 +1393,8 @@ vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env) zio_t *zio; spa_t *spa = vd->vdev_spa; vdev_boot_envblock_t *bootenv; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_TRYHARD; int error; size_t nvsize; char *nvbuf; @@ -1464,7 +1462,6 @@ vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env) return (SET_ERROR(error)); } -retry: zio = zio_root(spa, NULL, NULL, flags); for (int l = 0; l < VDEV_LABELS; l++) { vdev_label_write(zio, vd, l, abd, @@ -1473,10 +1470,6 @@ retry: } error = zio_wait(zio); - if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { - flags |= ZIO_FLAG_TRYHARD; - goto retry; - } abd_free(abd); return (error); @@ -2054,13 +2047,13 @@ retry: * Normally, we don't want to try too hard to write every label and * uberblock. If there is a flaky disk, we don't want the rest of the * sync process to block while we retry. But if we can't write a - * single label out, we should retry with ZIO_FLAG_TRYHARD before + * single label out, we should retry with ZIO_FLAG_IO_RETRY before * bailing out and declaring the pool faulted. */ if (error != 0) { - if ((flags & ZIO_FLAG_TRYHARD) != 0) + if ((flags & ZIO_FLAG_IO_RETRY) != 0) return (error); - flags |= ZIO_FLAG_TRYHARD; + flags |= ZIO_FLAG_IO_RETRY; } ASSERT(ub->ub_txg <= txg); @@ -2111,7 +2104,7 @@ retry: * are committed to stable storage before the uberblock update. */ if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) { - if ((flags & ZIO_FLAG_TRYHARD) != 0) { + if ((flags & ZIO_FLAG_IO_RETRY) != 0) { zfs_dbgmsg("vdev_label_sync_list() returned error %d " "for pool '%s' when syncing out the even labels " "of dirty vdevs", error, spa_name(spa)); @@ -2135,7 +2128,7 @@ retry: * to the new uberblocks. */ if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) { - if ((flags & ZIO_FLAG_TRYHARD) != 0) { + if ((flags & ZIO_FLAG_IO_RETRY) != 0) { zfs_dbgmsg("vdev_uberblock_sync_list() returned error " "%d for pool '%s'", error, spa_name(spa)); } @@ -2156,7 +2149,7 @@ retry: * stable storage before the next transaction group begins. */ if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) { - if ((flags & ZIO_FLAG_TRYHARD) != 0) { + if ((flags & ZIO_FLAG_IO_RETRY) != 0) { zfs_dbgmsg("vdev_label_sync_list() returned error %d " "for pool '%s' when syncing out the odd labels of " "dirty vdevs", error, spa_name(spa)); diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c index b597d6daefde..5fe70ec2b1d5 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c @@ -24,6 +24,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved. + * Copyright (c) 2025, Klara, Inc. */ #include <sys/zfs_context.h> @@ -356,6 +357,32 @@ unsigned long raidz_expand_max_reflow_bytes = 0; uint_t raidz_expand_pause_point = 0; /* + * This represents the duration for a slow drive read sit out. 
+ */ +static unsigned long vdev_read_sit_out_secs = 600; + +/* + * How often each RAID-Z and dRAID vdev will check for slow disk outliers. + * Increasing this interval will reduce the sensitivity of detection (since all + * I/Os since the last check are included in the statistics), but will slow the + * response to a disk developing a problem. + * + * Defaults to once per second; setting extremely small values may cause + * negative performance effects. + */ +static hrtime_t vdev_raidz_outlier_check_interval_ms = 1000; + +/* + * When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is + * used to determine how far out an outlier must be before it counts as an event + * worth considering. + * + * Smaller values will result in more aggressive sitting out of disks that may + * have problems, but may significantly increase the rate of spurious sit-outs. + */ +static uint32_t vdev_raidz_outlier_insensitivity = 50; + +/* * Maximum amount of copy io's outstanding at once. */ #ifdef _ILP32 @@ -2311,6 +2338,41 @@ vdev_raidz_min_asize(vdev_t *vd) vd->vdev_children); } +/* + * Return B_TRUE if a read should be skipped due to being too slow. + * + * vdev_child_slow_outlier() looks for outliers based on disk + * latency from the most recent child reads. Here we're checking if, + * over time, a disk has been an outlier too many times and is + * now in a sit out period. + */ +boolean_t +vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags) +{ + if (vdev_read_sit_out_secs == 0) + return (B_FALSE); + + /* Avoid skipping a data column read when scrubbing */ + if (io_flags & ZIO_FLAG_SCRUB) + return (B_FALSE); + + if (!vd->vdev_ops->vdev_op_leaf) { + boolean_t sitting = B_FALSE; + for (int c = 0; c < vd->vdev_children; c++) { + sitting |= vdev_sit_out_reads(vd->vdev_child[c], + io_flags); + } + return (sitting); + } + + if (vd->vdev_read_sit_out_expire >= gethrestime_sec()) + return (B_TRUE); + + vd->vdev_read_sit_out_expire = 0; + + return (B_FALSE); +} + void vdev_raidz_child_done(zio_t *zio) { @@ -2475,6 +2537,45 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) rc->rc_skipped = 1; continue; } + + if (vdev_sit_out_reads(cvd, zio->io_flags)) { + rr->rr_outlier_cnt++; + ASSERT0(rc->rc_latency_outlier); + rc->rc_latency_outlier = 1; + } + } + + /* + * When the row contains a latency outlier and sufficient parity + * exists to reconstruct the column data, then skip reading the + * known slow child vdev as a performance optimization.
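A minimal restatement of that skip condition, under the assumption that rr_firstdatacol equals the number of parity columns in the row; the demo_ helper below is an illustrative sketch, not code from the change: a latency outlier may only be sat out while enough healthy parity remains to reconstruct it.

/*
 * Illustrative sketch. For a raidz2 row (nparity = 2) with one parity
 * column already missing and no missing data, (2 - 1) >= (0 + 1) holds,
 * so at most one slow data column may still be skipped; a second outlier
 * in the same row must be read normally.
 */
static int
demo_can_skip_outlier(int nparity, int missingparity, int missingdata)
{
        return ((nparity - missingparity) >= (missingdata + 1));
}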
+ */ + if (rr->rr_outlier_cnt > 0 && + (rr->rr_firstdatacol - rr->rr_missingparity) >= + (rr->rr_missingdata + 1)) { + + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error == 0 && rc->rc_latency_outlier) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(EAGAIN); + rc->rc_skipped = 1; + break; + } + } + } + + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error || rc->rc_size == 0) + continue; + if (forceparity || c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { @@ -2498,6 +2599,7 @@ vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) ASSERT3U(prc->rc_devidx, ==, i); vdev_t *cvd = vd->vdev_child[i]; + if (!vdev_readable(cvd)) { prc->rc_error = SET_ERROR(ENXIO); prc->rc_tried = 1; /* don't even try */ @@ -2774,6 +2876,239 @@ vdev_raidz_worst_error(raidz_row_t *rr) return (error); } +/* + * Find the median value from a set of n values + */ +static uint64_t +latency_median_value(const uint64_t *data, size_t n) +{ + uint64_t m; + + if (n % 2 == 0) + m = (data[(n >> 1) - 1] + data[n >> 1]) >> 1; + else + m = data[((n + 1) >> 1) - 1]; + + return (m); +} + +/* + * Calculate the outlier fence from a set of n latency values + * + * fence = Q3 + vdev_raidz_outlier_insensitivity x (Q3 - Q1) + */ +static uint64_t +latency_quartiles_fence(const uint64_t *data, size_t n, uint64_t *iqr) +{ + uint64_t q1 = latency_median_value(&data[0], n >> 1); + uint64_t q3 = latency_median_value(&data[(n + 1) >> 1], n >> 1); + + /* + * To avoid detecting false positive outliers when N is small + * and the latency values are very close, make sure the IQR + * is at least 25% of Q1. + */ + *iqr = MAX(q3 - q1, q1 / 4); + + return (q3 + (*iqr * vdev_raidz_outlier_insensitivity)); +} +#define LAT_CHILDREN_MIN 5 +#define LAT_OUTLIER_LIMIT 20 + +static int +latency_compare(const void *arg1, const void *arg2) +{ + const uint64_t *l1 = (uint64_t *)arg1; + const uint64_t *l2 = (uint64_t *)arg2; + + return (TREE_CMP(*l1, *l2)); +} + +void +vdev_raidz_sit_child(vdev_t *svd, uint64_t secs) +{ + for (int c = 0; c < svd->vdev_children; c++) + vdev_raidz_sit_child(svd->vdev_child[c], secs); + + if (!svd->vdev_ops->vdev_op_leaf) + return; + + /* Begin a sit out period for this slow drive */ + svd->vdev_read_sit_out_expire = gethrestime_sec() + + secs; + + /* Count each slow io period */ + mutex_enter(&svd->vdev_stat_lock); + svd->vdev_stat.vs_slow_ios++; + mutex_exit(&svd->vdev_stat_lock); +} + +void +vdev_raidz_unsit_child(vdev_t *vd) +{ + for (int c = 0; c < vd->vdev_children; c++) + vdev_raidz_unsit_child(vd->vdev_child[c]); + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + vd->vdev_read_sit_out_expire = 0; +} + +/* + * Check for any latency outlier from the latest set of child reads. + * + * Uses Tukey's fence, with K = 50, for detecting extreme outliers. This + * rule defines extreme outliers as data points outside the fence of the + * third quartile plus fifty times the Interquartile Range (IQR). This range + * is the distance between the first and third quartile. + * + * Fifty is an extremely large value for Tukey's fence, but the outliers we're + * attempting to detect here are orders of magnitude larger than the + * median. This large value should capture any truly faulty disk quickly, + * without causing spurious sit-outs.
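For reference, a self-contained sketch of the quartile and fence arithmetic implemented by latency_median_value() and latency_quartiles_fence() above, with a worked example in the comment; the demo_ names are illustrative, not part of the change.

#include <stdint.h>
#include <stddef.h>

/*
 * With seven sorted per-child mean latencies {1000, 1100, 1150, 1200,
 * 1250, 1300, 90000} (usec) and an insensitivity of 50: Q1 = 1100,
 * Q3 = 1300, IQR = MAX(200, Q1 / 4) = 275, and the fence is
 * 1300 + 50 * 275 = 15050, so only the 90000 sample lands outside it.
 */
static uint64_t
demo_median(const uint64_t *data, size_t n)
{
        if (n % 2 == 0)
                return ((data[(n >> 1) - 1] + data[n >> 1]) >> 1);
        return (data[((n + 1) >> 1) - 1]);
}

static uint64_t
demo_fence(const uint64_t *sorted, size_t n, uint64_t insensitivity)
{
        uint64_t q1 = demo_median(&sorted[0], n >> 1);
        uint64_t q3 = demo_median(&sorted[(n + 1) >> 1], n >> 1);
        uint64_t iqr = (q3 - q1 > q1 / 4) ? (q3 - q1) : (q1 / 4);

        return (q3 + iqr * insensitivity);
}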
+ * + * To further avoid spurious sit-outs, vdevs must be detected multiple times + * as an outlier before they are sat, and outlier counts will gradually decay. + * Every nchildren times we have detected an outlier, we subtract 2 from the + * outlier count of all children. If detected outliers are close to uniformly + * distributed, this will result in the outlier count remaining close to 0 + * (in expectation; over long enough time-scales, spurious sit-outs are still + * possible). + */ +static void +vdev_child_slow_outlier(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + if (!vd->vdev_autosit || vdev_read_sit_out_secs == 0 || + vd->vdev_children < LAT_CHILDREN_MIN) + return; + + hrtime_t now = getlrtime(); + uint64_t last = atomic_load_64(&vd->vdev_last_latency_check); + + if ((now - last) < MSEC2NSEC(vdev_raidz_outlier_check_interval_ms)) + return; + + /* Allow a single winner when there are racing callers. */ + if (atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last) + return; + + int children = vd->vdev_children; + uint64_t *lat_data = kmem_alloc(sizeof (uint64_t) * children, KM_SLEEP); + + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + if (cvd->vdev_prev_histo == NULL) { + mutex_enter(&cvd->vdev_stat_lock); + size_t size = + sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); + cvd->vdev_prev_histo = kmem_zalloc(size, KM_SLEEP); + memcpy(cvd->vdev_prev_histo, + cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ], + size); + mutex_exit(&cvd->vdev_stat_lock); + } + } + uint64_t max = 0; + vdev_t *svd = NULL; + uint_t sitouts = 0; + boolean_t skip = B_FALSE, svd_sitting = B_FALSE; + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + boolean_t sitting = vdev_sit_out_reads(cvd, 0) || + cvd->vdev_state != VDEV_STATE_HEALTHY; + + /* We can't sit out more disks than we have parity */ + if (sitting && ++sitouts >= vdev_get_nparity(vd)) + skip = B_TRUE; + + mutex_enter(&cvd->vdev_stat_lock); + + uint64_t *prev_histo = cvd->vdev_prev_histo; + uint64_t *histo = + cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ]; + if (skip) { + size_t size = + sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); + memcpy(prev_histo, histo, size); + mutex_exit(&cvd->vdev_stat_lock); + continue; + } + uint64_t count = 0; + lat_data[c] = 0; + for (int i = 0; i < VDEV_L_HISTO_BUCKETS; i++) { + uint64_t this_count = histo[i] - prev_histo[i]; + lat_data[c] += (1ULL << i) * this_count; + count += this_count; + } + size_t size = sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); + memcpy(prev_histo, histo, size); + mutex_exit(&cvd->vdev_stat_lock); + lat_data[c] /= MAX(1, count); + + /* Wait until all disks have been read from */ + if (lat_data[c] == 0 && !sitting) { + skip = B_TRUE; + continue; + } + + /* Keep track of the vdev with largest value */ + if (lat_data[c] > max) { + max = lat_data[c]; + svd = cvd; + svd_sitting = sitting; + } + } + + if (skip) { + kmem_free(lat_data, sizeof (uint64_t) * children); + return; + } + + qsort((void *)lat_data, children, sizeof (uint64_t), latency_compare); + + uint64_t iqr; + uint64_t fence = latency_quartiles_fence(lat_data, children, &iqr); + + ASSERT3U(lat_data[children - 1], ==, max); + if (max > fence && !svd_sitting) { + ASSERT3U(iqr, >, 0); + uint64_t incr = MAX(1, MIN((max - fence) / iqr, + LAT_OUTLIER_LIMIT / 4)); + vd->vdev_outlier_count += incr; + if (vd->vdev_outlier_count >= children) { + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + cvd->vdev_outlier_count -= 2; + cvd->vdev_outlier_count = MAX(0, + 
cvd->vdev_outlier_count); + } + vd->vdev_outlier_count = 0; + } + /* + * Keep track of how many times this child has had + * an outlier read. A disk that persitently has a + * higher than peers outlier count will be considered + * a slow disk. + */ + svd->vdev_outlier_count += incr; + if (svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) { + ASSERT0(svd->vdev_read_sit_out_expire); + vdev_raidz_sit_child(svd, vdev_read_sit_out_secs); + (void) zfs_ereport_post(FM_EREPORT_ZFS_SITOUT, + zio->io_spa, svd, NULL, NULL, 0); + vdev_dbgmsg(svd, "begin read sit out for %d secs", + (int)vdev_read_sit_out_secs); + + for (int c = 0; c < vd->vdev_children; c++) + vd->vdev_child[c]->vdev_outlier_count = 0; + } + } + + kmem_free(lat_data, sizeof (uint64_t) * children); +} + static void vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) { @@ -3515,6 +3850,9 @@ vdev_raidz_io_done(zio_t *zio) raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_verified(zio, rr); } + /* Periodically check for a read outlier */ + if (zio->io_type == ZIO_TYPE_READ) + vdev_child_slow_outlier(zio); zio_checksum_verified(zio); } else { /* @@ -4534,7 +4872,7 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr) else vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); - /* Reflow the begining portion using the scratch area */ + /* Reflow the beginning portion using the scratch area */ if (vre->vre_offset == 0) { VERIFY0(dsl_sync_task(spa_name(spa), NULL, raidz_reflow_scratch_sync, @@ -5155,3 +5493,10 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, "For expanded RAIDZ, automatically start a pool scrub when expansion " "completes"); +ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW, + "Raidz/draid slow disk sit out time period in seconds"); +ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_check_interval_ms, U64, + ZMOD_RW, "Interval to check for slow raidz/draid children"); +ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_insensitivity, UINT, + ZMOD_RW, "How insensitive the slow raidz/draid child check should be"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c index 47b3b9921abe..30be1f851eb3 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c +++ b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c @@ -1079,7 +1079,7 @@ vdev_rebuild_restart_impl(vdev_t *vd) void vdev_rebuild_restart(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock) || + ASSERT(spa_namespace_held() || spa->spa_load_thread == curthread); vdev_rebuild_restart_impl(spa->spa_root_vdev); @@ -1094,7 +1094,7 @@ vdev_rebuild_stop_wait(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - ASSERT(MUTEX_HELD(&spa_namespace_lock) || + ASSERT(spa_namespace_held() || spa->spa_export_thread == curthread); if (vd == spa->spa_root_vdev) { diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c index 2f7a739da241..81e6ecb68ff1 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_removal.c +++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c @@ -51,34 +51,70 @@ #include <sys/trace_zfs.h> /* - * This file contains the necessary logic to remove vdevs from a - * storage pool. Currently, the only devices that can be removed - * are log, cache, and spare devices; and top level vdevs from a pool - * w/o raidz or mirrors. (Note that members of a mirror can be removed - * by the detach operation.) 
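Stepping back to vdev_child_slow_outlier() above: each child's latency estimate is the delta between two snapshots of its read-latency histogram, with 2^i taken as the representative latency of bucket i. A simplified standalone version of that step (the bucket count parameter and demo_ name are assumptions for illustration):

#include <stdint.h>

/*
 * Approximate a child's mean read latency from the change between two
 * histogram snapshots, as vdev_child_slow_outlier() does with
 * vsx_disk_histo[ZIO_TYPE_READ]. Illustrative only.
 */
static uint64_t
demo_mean_latency(const uint64_t *prev, const uint64_t *cur, int nbuckets)
{
        uint64_t sum = 0, count = 0;

        for (int i = 0; i < nbuckets; i++) {
                uint64_t d = cur[i] - prev[i];
                sum += (1ULL << i) * d;         /* 2^i ~ latency of bucket i */
                count += d;
        }

        return (sum / (count ? count : 1));
}

A child whose estimate repeatedly lands beyond the fence accumulates outlier counts until it exceeds LAT_OUTLIER_LIMIT, at which point it is sat out for vdev_read_sit_out_secs and an FM_EREPORT_ZFS_SITOUT event is posted.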
+ * This file contains the necessary logic to remove vdevs from a storage + * pool. Note that members of a mirror can be removed by the detach + * operation. Currently, the only devices that can be removed are: * - * Log vdevs are removed by evacuating them and then turning the vdev - * into a hole vdev while holding spa config locks. + * 1) Traditional hot spare and cache vdevs. Note that draid distributed + * spares are fixed at creation time and cannot be removed. * - * Top level vdevs are removed and converted into an indirect vdev via - * a multi-step process: + * 2) Log vdevs are removed by evacuating them and then turning the vdev + * into a hole vdev while holding spa config locks. * - * - Disable allocations from this device (spa_vdev_remove_top). + * 3) Top-level singleton and mirror vdevs, including dedup and special + * vdevs, are removed and converted into an indirect vdev via a + * multi-step process: * - * - From a new thread (spa_vdev_remove_thread), copy data from - * the removing vdev to a different vdev. The copy happens in open - * context (spa_vdev_copy_impl) and issues a sync task - * (vdev_mapping_sync) so the sync thread can update the partial - * indirect mappings in core and on disk. + * - Disable allocations from this device (spa_vdev_remove_top). * - * - If a free happens during a removal, it is freed from the - * removing vdev, and if it has already been copied, from the new - * location as well (free_from_removing_vdev). + * - From a new thread (spa_vdev_remove_thread), copy data from the + * removing vdev to a different vdev. The copy happens in open context + * (spa_vdev_copy_impl) and issues a sync task (vdev_mapping_sync) so + * the sync thread can update the partial indirect mappings in core + * and on disk. * - * - After the removal is completed, the copy thread converts the vdev - * into an indirect vdev (vdev_remove_complete) before instructing - * the sync thread to destroy the space maps and finish the removal - * (spa_finish_removal). + * - If a free happens during a removal, it is freed from the removing + * vdev, and if it has already been copied, from the new location as + * well (free_from_removing_vdev). + * + * - After the removal is completed, the copy thread converts the vdev + * into an indirect vdev (vdev_remove_complete) before instructing + * the sync thread to destroy the space maps and finish the removal + * (spa_finish_removal). + * + * The following constraints currently apply primary device removal: + * + * - All vdevs must be online, healthy, and not be missing any data + * according to the DTLs. + * + * - When removing a singleton or mirror vdev, regardless of it's a + * special, dedup, or primary device, it must have the same ashift + * as the devices in the normal allocation class. Furthermore, all + * vdevs in the normal allocation class must have the same ashift to + * ensure the new allocations never includes additional padding. + * + * - The normal allocation class cannot contain any raidz or draid + * top-level vdevs since segments are copied without regard for block + * boundaries. This makes it impossible to calculate the required + * parity columns when using these vdev types as the destination. + * + * - The encryption keys must be loaded so the ZIL logs can be reset + * in order to prevent writing to the device being removed. + * + * N.B. 
ashift and raidz/draid constraints for primary top-level device + * removal could be slightly relaxed if it were possible to request that + * DVAs from a mirror or singleton in the specified allocation class be + * used (metaslab_alloc_dva). + * + * This flexibility would be particularly useful for raidz/draid pools which + * often include a mirrored special device. If a mistakenly added top-level + * singleton were added it could then still be removed at the cost of some + * special device capacity. This may be a worthwhile tradeoff depending on + * the pool capacity and expense (cost, complexity, time) of creating a new + * pool and copying all of the data to correct the configuration. + * + * Furthermore, while not currently supported it should be possible to allow + * vdevs of any type to be removed as long as they've never been written to. */ typedef struct vdev_copy_arg { @@ -273,12 +309,12 @@ spa_vdev_noalloc(spa_t *spa, uint64_t guid) uint64_t txg; int error = 0; - ASSERT(!MUTEX_HELD(&spa_namespace_lock)); + ASSERT(!spa_namespace_held()); ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); vd = spa_lookup_by_guid(spa, guid, B_FALSE); @@ -306,12 +342,12 @@ spa_vdev_alloc(spa_t *spa, uint64_t guid) uint64_t txg; int error = 0; - ASSERT(!MUTEX_HELD(&spa_namespace_lock)); + ASSERT(!spa_namespace_held()); ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); vd = spa_lookup_by_guid(spa, guid, B_FALSE); @@ -2049,7 +2085,7 @@ vdev_remove_make_hole_and_free(vdev_t *vd) spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); vdev_free(vd); @@ -2077,7 +2113,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT0P(vd->vdev_log_mg); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); /* * Stop allocating from this vdev. @@ -2104,7 +2140,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) * spa_namespace_lock held. Once this completes the device * should no longer have any blocks allocated on it. */ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); if (vd->vdev_stat.vs_alloc != 0) error = spa_reset_logs(spa); @@ -2153,7 +2189,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) sysevent_t *ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* The top ZAP should have been destroyed by vdev_remove_empty. */ @@ -2397,7 +2433,7 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) uint64_t txg = 0; uint_t nspares, nl2cache; int error = 0, error_log; - boolean_t locked = MUTEX_HELD(&spa_namespace_lock); + boolean_t locked = spa_namespace_held(); sysevent_t *ev = NULL; const char *vd_type = NULL; char *vd_path = NULL; @@ -2407,7 +2443,7 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) if (!locked) txg = spa_vdev_enter(spa); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? 
ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; diff --git a/sys/contrib/openzfs/module/zfs/vdev_trim.c b/sys/contrib/openzfs/module/zfs/vdev_trim.c index eee18b367909..a97f6650a81c 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_trim.c +++ b/sys/contrib/openzfs/module/zfs/vdev_trim.c @@ -1045,7 +1045,7 @@ vdev_trim_stop_wait(spa_t *spa, list_t *vd_list) (void) spa; vdev_t *vd; - ASSERT(MUTEX_HELD(&spa_namespace_lock) || + ASSERT(spa_namespace_held() || spa->spa_export_thread == curthread); while ((vd = list_remove_head(vd_list)) != NULL) { @@ -1085,7 +1085,7 @@ vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list) if (vd_list == NULL) { vdev_trim_stop_wait_impl(vd); } else { - ASSERT(MUTEX_HELD(&spa_namespace_lock) || + ASSERT(spa_namespace_held() || vd->vdev_spa->spa_export_thread == curthread); list_insert_tail(vd_list, vd); } @@ -1122,7 +1122,7 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) list_t vd_list; vdev_t *vd_l2cache; - ASSERT(MUTEX_HELD(&spa_namespace_lock) || + ASSERT(spa_namespace_held() || spa->spa_export_thread == curthread); list_create(&vd_list, sizeof (vdev_t), @@ -1156,7 +1156,7 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) void vdev_trim_restart(vdev_t *vd) { - ASSERT(MUTEX_HELD(&spa_namespace_lock) || + ASSERT(spa_namespace_held() || vd->vdev_spa->spa_load_thread == curthread); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); @@ -1582,7 +1582,7 @@ vdev_autotrim_stop_all(spa_t *spa) void vdev_autotrim_restart(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock) || + ASSERT(spa_namespace_held() || spa->spa_load_thread == curthread); if (spa->spa_autotrim) vdev_autotrim(spa); @@ -1689,7 +1689,7 @@ vdev_trim_l2arc_thread(void *arg) void vdev_trim_l2arc(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); /* * Locate the spa's l2arc devices and kick off TRIM threads. diff --git a/sys/contrib/openzfs/module/zfs/zap_micro.c b/sys/contrib/openzfs/module/zfs/zap_micro.c index ea4e3117a8b9..7e9e625a193e 100644 --- a/sys/contrib/openzfs/module/zfs/zap_micro.c +++ b/sys/contrib/openzfs/module/zfs/zap_micro.c @@ -625,12 +625,10 @@ zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx, ASSERT0(db->db_offset); objset_t *os = dmu_buf_get_objset(db); uint64_t obj = db->db_object; - dmu_object_info_t doi; *zapp = NULL; - dmu_object_info_from_dnode(dn, &doi); - if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) + if (DMU_OT_BYTESWAP(dn->dn_type) != DMU_BSWAP_ZAP) return (SET_ERROR(EINVAL)); zap_t *zap = dmu_buf_get_user(db); diff --git a/sys/contrib/openzfs/module/zfs/zfeature.c b/sys/contrib/openzfs/module/zfs/zfeature.c index 0816ea134bf3..4cf9e0dbb405 100644 --- a/sys/contrib/openzfs/module/zfs/zfeature.c +++ b/sys/contrib/openzfs/module/zfs/zfeature.c @@ -308,6 +308,7 @@ feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount, ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature)); uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? 
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + ASSERT(MUTEX_HELD(&spa->spa_feat_stats_lock)); VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid, sizeof (uint64_t), 1, &refcount, tx)); @@ -360,7 +361,9 @@ feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) feature->fi_guid, 1, strlen(feature->fi_desc) + 1, feature->fi_desc, tx)); + mutex_enter(&spa->spa_feat_stats_lock); feature_sync(spa, feature, initial_refcount, tx); + mutex_exit(&spa->spa_feat_stats_lock); if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) { uint64_t enabling_txg = dmu_tx_get_txg(tx); @@ -416,6 +419,7 @@ feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, ASSERT(dmu_tx_is_syncing(tx)); ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + mutex_enter(&spa->spa_feat_stats_lock); VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP); switch (action) { @@ -433,6 +437,7 @@ feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, } feature_sync(spa, feature, refcount, tx); + mutex_exit(&spa->spa_feat_stats_lock); } void diff --git a/sys/contrib/openzfs/module/zfs/zfs_crrd.c b/sys/contrib/openzfs/module/zfs/zfs_crrd.c index f9267ed41d71..30d4c7c36897 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_crrd.c +++ b/sys/contrib/openzfs/module/zfs/zfs_crrd.c @@ -162,9 +162,9 @@ dbrrd_add(dbrrd_t *db, hrtime_t time, uint64_t txg) daydiff = time - rrd_tail(&db->dbr_days); monthdiff = time - rrd_tail(&db->dbr_months); - if (monthdiff >= 0 && monthdiff >= SEC2NSEC(30 * 24 * 60 * 60)) + if (monthdiff >= 0 && monthdiff >= 30 * 24 * 60 * 60) rrd_add(&db->dbr_months, time, txg); - else if (daydiff >= 0 && daydiff >= SEC2NSEC(24 * 60 * 60)) + else if (daydiff >= 0 && daydiff >= 24 * 60 * 60) rrd_add(&db->dbr_days, time, txg); else if (minutedif >= 0) rrd_add(&db->dbr_minutes, time, txg); @@ -208,7 +208,8 @@ dbrrd_closest(hrtime_t tv, const rrd_data_t *r1, const rrd_data_t *r2) if (r2 == NULL) return (r1); - return (ABS(tv - r1->rrdd_time) < ABS(tv - r2->rrdd_time) ? r1 : r2); + return (ABS(tv - (hrtime_t)r1->rrdd_time) < + ABS(tv - (hrtime_t)r2->rrdd_time) ? 
r1 : r2); } uint64_t diff --git a/sys/contrib/openzfs/module/zfs/zfs_fm.c b/sys/contrib/openzfs/module/zfs/zfs_fm.c index 221f24e381dc..4a0d41c24eed 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_fm.c +++ b/sys/contrib/openzfs/module/zfs/zfs_fm.c @@ -223,6 +223,9 @@ vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop) case VDEV_PROP_IO_T: propval = vd->vdev_io_t; break; + case VDEV_PROP_SLOW_IO_EVENTS: + propval = vd->vdev_slow_io_events; + break; case VDEV_PROP_SLOW_IO_N: propval = vd->vdev_slow_io_n; break; @@ -1580,10 +1583,10 @@ zfs_ereport_zvol_post(const char *subclass, const char *name, nvlist_t *aux; char *r; - boolean_t locked = mutex_owned(&spa_namespace_lock); - if (!locked) mutex_enter(&spa_namespace_lock); + boolean_t locked = spa_namespace_held(); + if (!locked) spa_namespace_enter(FTAG); spa_t *spa = spa_lookup(name); - if (!locked) mutex_exit(&spa_namespace_lock); + if (!locked) spa_namespace_exit(FTAG); if (spa == NULL) return; diff --git a/sys/contrib/openzfs/module/zfs/zfs_fuid.c b/sys/contrib/openzfs/module/zfs/zfs_fuid.c index 2af1efe82e62..aa10741ba870 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_fuid.c +++ b/sys/contrib/openzfs/module/zfs/zfs_fuid.c @@ -28,8 +28,8 @@ #include <sys/avl.h> #include <sys/zap.h> #include <sys/nvpair.h> -#ifdef _KERNEL #include <sys/sid.h> +#ifdef _KERNEL #include <sys/zfs_vfsops.h> #include <sys/zfs_znode.h> #endif @@ -268,7 +268,7 @@ zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx) nvlist_free(nvp); zfsvfs->z_fuid_size = nvsize; dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, - zfsvfs->z_fuid_size, packed, tx); + zfsvfs->z_fuid_size, packed, tx, DMU_READ_NO_PREFETCH); kmem_free(packed, zfsvfs->z_fuid_size); VERIFY0(dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index 121b966b9864..1b2392aeaa85 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -212,6 +212,8 @@ #include <sys/vdev_impl.h> #include <sys/vdev_initialize.h> #include <sys/vdev_trim.h> +#include <sys/brt.h> +#include <sys/ddt.h> #include "zfs_namecheck.h" #include "zfs_prop.h" @@ -683,6 +685,7 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) dsl_dataset_t *ds; const char *cp; int error; + boolean_t rawok = (zc->zc_flags & 0x8); /* * Generate the current snapshot name from the given objsetid, then @@ -705,6 +708,10 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); + if (error != 0 && rawok == B_TRUE) { + error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, + ZFS_DELEG_PERM_SEND_RAW, cr); + } dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); @@ -714,9 +721,17 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + boolean_t rawok = nvlist_exists(innvl, "rawok"); + int error; + (void) innvl; - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_SEND, cr)); + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_SEND, cr); + if (error != 0 && rawok == B_TRUE) { + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_SEND_RAW, cr); + } + return (error); } static int @@ -3109,12 +3124,12 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) if (pair != NULL && strcmp(nvpair_name(pair), zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && nvlist_next_nvpair(props, 
pair) == NULL) { - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); } - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); if (spa != NULL) { nvlist_free(props); return (0); @@ -3163,14 +3178,14 @@ zfs_ioc_pool_get_props(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) * get (such as altroot and cachefile), so attempt to get them * anyway. */ - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); if ((spa = spa_lookup(pool)) != NULL) { error = spa_prop_get(spa, outnvl); if (error == 0 && props != NULL) error = spa_prop_get_nvlist(spa, props, n_props, outnvl); } - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); } else { error = spa_prop_get(spa, outnvl); if (error == 0 && props != NULL) @@ -4263,13 +4278,11 @@ zfs_ioc_pool_prefetch(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) spa_t *spa; int32_t type; - /* - * Currently, only ZPOOL_PREFETCH_DDT is supported - */ - if (nvlist_lookup_int32(innvl, ZPOOL_PREFETCH_TYPE, &type) != 0 || - type != ZPOOL_PREFETCH_DDT) { + if (nvlist_lookup_int32(innvl, ZPOOL_PREFETCH_TYPE, &type) != 0) + return (EINVAL); + + if (type != ZPOOL_PREFETCH_DDT && type != ZPOOL_PREFETCH_BRT) return (EINVAL); - } error = spa_open(poolname, &spa, FTAG); if (error != 0) @@ -4277,10 +4290,17 @@ zfs_ioc_pool_prefetch(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) hrtime_t start_time = gethrtime(); - ddt_prefetch_all(spa); - - zfs_dbgmsg("pool '%s': loaded ddt into ARC in %llu ms", spa->spa_name, - (u_longlong_t)NSEC2MSEC(gethrtime() - start_time)); + if (type == ZPOOL_PREFETCH_DDT) { + ddt_prefetch_all(spa); + zfs_dbgmsg("pool '%s': loaded ddt into ARC in %llu ms", + spa->spa_name, + (u_longlong_t)NSEC2MSEC(gethrtime() - start_time)); + } else { + brt_prefetch_all(spa); + zfs_dbgmsg("pool '%s': loaded brt into ARC in %llu ms", + spa->spa_name, + (u_longlong_t)NSEC2MSEC(gethrtime() - start_time)); + } spa_close(spa, FTAG); @@ -4726,7 +4746,7 @@ zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) error = error ? error : resume_err; } zfs_vfs_rele(zfsvfs); - } else if ((zv = zvol_suspend(fsname)) != NULL) { + } else if (zvol_suspend(fsname, &zv) == 0) { error = dsl_dataset_rollback(fsname, target, zvol_tag(zv), outnvl); zvol_resume(zv); @@ -5448,7 +5468,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, const char *origin, } error = error ? 
error : end_err; zfs_vfs_rele(zfsvfs); - } else if ((zv = zvol_suspend(tofs)) != NULL) { + } else if (zvol_suspend(tofs, &zv) == 0) { error = dmu_recv_end(&drc, zvol_tag(zv)); zvol_resume(zv); } else { @@ -6108,10 +6128,10 @@ zfs_ioc_clear(zfs_cmd_t *zc) /* * On zpool clear we also fix up missing slogs */ - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); spa = spa_lookup(zc->zc_name); if (spa == NULL) { - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (SET_ERROR(EIO)); } if (spa_get_log_state(spa) == SPA_LOG_MISSING) { @@ -6119,7 +6139,7 @@ zfs_ioc_clear(zfs_cmd_t *zc) spa_set_log_state(spa, SPA_LOG_CLEAR); } spa->spa_last_open_failed = 0; - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); if (zc->zc_cookie & ZPOOL_NO_REWIND) { error = spa_open(zc->zc_name, &spa, FTAG); @@ -7619,7 +7639,7 @@ zfs_ioctl_init(void) zfs_ioctl_register("scrub", ZFS_IOC_POOL_SCRUB, zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, - POOL_CHECK_NONE, B_TRUE, B_TRUE, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub)); zfs_ioctl_register("get_props", ZFS_IOC_POOL_GET_PROPS, diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index 31b59c55f17b..0307df55aa21 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -819,34 +819,37 @@ zil_lwb_vdev_compare(const void *x1, const void *x2) * we choose them here and later make the block allocation match. */ static lwb_t * -zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog, - uint64_t txg, lwb_state_t state) +zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, int min_sz, int sz, + boolean_t slog, uint64_t txg) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); + lwb->lwb_flags = 0; lwb->lwb_zilog = zilog; if (bp) { lwb->lwb_blk = *bp; - lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2); + if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) + lwb->lwb_flags |= LWB_FLAG_SLIM; sz = BP_GET_LSIZE(bp); + lwb->lwb_min_sz = sz; } else { BP_ZERO(&lwb->lwb_blk); - lwb->lwb_slim = (spa_version(zilog->zl_spa) >= - SPA_VERSION_SLIM_ZIL); + if (spa_version(zilog->zl_spa) >= SPA_VERSION_SLIM_ZIL) + lwb->lwb_flags |= LWB_FLAG_SLIM; + lwb->lwb_min_sz = min_sz; } - lwb->lwb_slog = slog; + if (slog) + lwb->lwb_flags |= LWB_FLAG_SLOG; lwb->lwb_error = 0; - if (lwb->lwb_slim) { - lwb->lwb_nmax = sz; - lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); - } else { - lwb->lwb_nmax = sz - sizeof (zil_chain_t); - lwb->lwb_nused = lwb->lwb_nfilled = 0; - } + /* + * Buffer allocation and capacity setup will be done in + * zil_lwb_write_open() when the LWB is opened for ITX assignment. 
+ */ + lwb->lwb_nmax = lwb->lwb_nused = lwb->lwb_nfilled = 0; lwb->lwb_sz = sz; - lwb->lwb_state = state; - lwb->lwb_buf = zio_buf_alloc(sz); + lwb->lwb_buf = NULL; + lwb->lwb_state = LWB_STATE_NEW; lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; @@ -857,8 +860,6 @@ zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog, mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); - if (state != LWB_STATE_NEW) - zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); return (lwb); @@ -878,7 +879,7 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb) VERIFY(list_is_empty(&lwb->lwb_itxs)); VERIFY(list_is_empty(&lwb->lwb_waiters)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); - ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); + ASSERT(!MUTEX_HELD(&lwb->lwb_lock)); /* * Clear the zilog's field to indicate this lwb is no longer @@ -1019,7 +1020,7 @@ zil_create(zilog_t *zilog) } error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk, - ZIL_MIN_BLKSZ, &slog); + ZIL_MIN_BLKSZ, ZIL_MIN_BLKSZ, &slog, B_TRUE); if (error == 0) zil_init_log_chain(zilog, &blk); } @@ -1028,7 +1029,7 @@ zil_create(zilog_t *zilog) * Allocate a log write block (lwb) for the first log block. */ if (error == 0) - lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW); + lwb = zil_alloc_lwb(zilog, &blk, 0, 0, slog, txg); /* * If we just allocated the first log block, commit our transaction @@ -1324,10 +1325,12 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) * zil_commit() is racing with spa_sync(). */ static void -zil_commit_waiter_skip(zil_commit_waiter_t *zcw) +zil_commit_waiter_done(zil_commit_waiter_t *zcw, int err) { mutex_enter(&zcw->zcw_lock); ASSERT3B(zcw->zcw_done, ==, B_FALSE); + zcw->zcw_lwb = NULL; + zcw->zcw_error = err; zcw->zcw_done = B_TRUE; cv_broadcast(&zcw->zcw_cv); mutex_exit(&zcw->zcw_lock); @@ -1389,7 +1392,7 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) if (zil_nocacheflush) return; - mutex_enter(&lwb->lwb_vdev_lock); + mutex_enter(&lwb->lwb_lock); for (i = 0; i < ndvas; i++) { zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); if (avl_find(t, &zvsearch, &where) == NULL) { @@ -1398,7 +1401,7 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) avl_insert(t, zv, where); } } - mutex_exit(&lwb->lwb_vdev_lock); + mutex_exit(&lwb->lwb_lock); } static void @@ -1415,12 +1418,12 @@ zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) /* * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does - * not need the protection of lwb_vdev_lock (it will only be modified + * not need the protection of lwb_lock (it will only be modified * while holding zilog->zl_lock) as its writes and those of its * children have all completed. The younger 'nlwb' may be waiting on * future writes to additional vdevs. */ - mutex_enter(&nlwb->lwb_vdev_lock); + mutex_enter(&nlwb->lwb_lock); /* * Tear down the 'lwb' vdev tree, ensuring that entries which do not * exist in 'nlwb' are moved to it, freeing any would-be duplicates. 
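The separate lwb_slim and lwb_slog booleans are folded into a single lwb_flags word in this change, alongside the new LWB_FLAG_CRASHED bit tested in zil_lwb_write_done(); a minimal sketch of the idiom, with hypothetical demo_ names standing in for the LWB_FLAG_* bits:

#include <stdint.h>

#define DEMO_FLAG_SLIM          (1u << 0)       /* zil_chain_t header at front */
#define DEMO_FLAG_SLOG          (1u << 1)       /* block allocated on a slog device */
#define DEMO_FLAG_CRASHED       (1u << 2)       /* waiters already cleaned up by zil_crash() */

static int
demo_is_slog(uint32_t flags)
{
        return ((flags & DEMO_FLAG_SLOG) != 0);
}

static uint32_t
demo_set_slim(uint32_t flags)
{
        return (flags | DEMO_FLAG_SLIM);
}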
@@ -1434,7 +1437,7 @@ zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) kmem_free(zv, sizeof (*zv)); } } - mutex_exit(&nlwb->lwb_vdev_lock); + mutex_exit(&nlwb->lwb_lock); } void @@ -1491,10 +1494,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio) zil_itx_destroy(itx, 0); while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { - mutex_enter(&zcw->zcw_lock); - - ASSERT3P(zcw->zcw_lwb, ==, lwb); - zcw->zcw_lwb = NULL; /* * We expect any ZIO errors from child ZIOs to have been * propagated "up" to this specific LWB's root ZIO, in @@ -1509,14 +1508,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio) * errors not being handled correctly here. See the * comment above the call to "zio_flush" for details. */ - - zcw->zcw_zio_error = zio->io_error; - - ASSERT3B(zcw->zcw_done, ==, B_FALSE); - zcw->zcw_done = B_TRUE; - cv_broadcast(&zcw->zcw_cv); - - mutex_exit(&zcw->zcw_lock); + zil_commit_waiter_done(zcw, zio->io_error); } uint64_t txg = lwb->lwb_issued_txg; @@ -1588,7 +1580,7 @@ zil_lwb_write_done(zio_t *zio) avl_tree_t *t = &lwb->lwb_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; - lwb_t *nlwb; + lwb_t *nlwb = NULL; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); @@ -1608,9 +1600,11 @@ zil_lwb_write_done(zio_t *zio) * its write ZIO a parent this ZIO. In such case we can not defer * our flushes or below may be a race between the done callbacks. */ - nlwb = list_next(&zilog->zl_lwb_list, lwb); - if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED) - nlwb = NULL; + if (!(lwb->lwb_flags & LWB_FLAG_CRASHED)) { + nlwb = list_next(&zilog->zl_lwb_list, lwb); + if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED) + nlwb = NULL; + } mutex_exit(&zilog->zl_lock); if (avl_numnodes(t) == 0) @@ -1624,12 +1618,17 @@ zil_lwb_write_done(zio_t *zio) * written out. * * Additionally, we don't perform any further error handling at - * this point (e.g. setting "zcw_zio_error" appropriately), as - * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus, - * we expect any error seen here, to have been propagated to - * that function). + * this point (e.g. setting "zcw_error" appropriately), as we + * expect that to occur in "zil_lwb_flush_vdevs_done" (thus, we + * expect any error seen here, to have been propagated to that + * function). + * + * Note that we treat a "crashed" LWB as though it was in error, + * even if it did appear to succeed, because we've already + * signaled error and cleaned up waiters and committers in + * zil_crash(); we just want to clean up and get out of here. */ - if (zio->io_error != 0) { + if (zio->io_error != 0 || (lwb->lwb_flags & LWB_FLAG_CRASHED)) { while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zv, sizeof (*zv)); return; @@ -1742,10 +1741,26 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) return; } + mutex_enter(&lwb->lwb_lock); mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_OPENED; zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); + mutex_exit(&lwb->lwb_lock); + + /* + * Allocate buffer and set up LWB capacities. 
+ */ + ASSERT0P(lwb->lwb_buf); + ASSERT3U(lwb->lwb_sz, >, 0); + lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz); + if (lwb->lwb_flags & LWB_FLAG_SLIM) { + lwb->lwb_nmax = lwb->lwb_sz; + lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); + } else { + lwb->lwb_nmax = lwb->lwb_sz - sizeof (zil_chain_t); + lwb->lwb_nused = lwb->lwb_nfilled = 0; + } } /* @@ -1762,6 +1777,8 @@ static uint_t zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) { uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t); + uint_t waste = zil_max_waste_space(zilog); + waste = MAX(waste, zilog->zl_cur_max); if (size <= md) { /* @@ -1772,9 +1789,10 @@ zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) } else if (size > 8 * md) { /* * Big bursts use maximum blocks. The first block size - * is hard to predict, but it does not really matter. + * is hard to predict, but we need at least enough space + * to make reasonable progress. */ - *minsize = 0; + *minsize = waste; return (md); } @@ -1787,57 +1805,52 @@ zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) uint_t s = size; uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t)); uint_t chunk = DIV_ROUND_UP(s, n); - uint_t waste = zil_max_waste_space(zilog); - waste = MAX(waste, zilog->zl_cur_max); if (chunk <= md - waste) { *minsize = MAX(s - (md - waste) * (n - 1), waste); return (chunk); } else { - *minsize = 0; + *minsize = waste; return (md); } } /* * Try to predict next block size based on previous history. Make prediction - * sufficient for 7 of 8 previous bursts. Don't try to save if the saving is - * less then 50%, extra writes may cost more, but we don't want single spike - * to badly affect our predictions. + * sufficient for 7 of 8 previous bursts, but don't try to save if the saving + * is less then 50%. Extra writes may cost more, but we don't want single + * spike to badly affect our predictions. */ -static uint_t -zil_lwb_predict(zilog_t *zilog) +static void +zil_lwb_predict(zilog_t *zilog, uint64_t *min_predict, uint64_t *max_predict) { - uint_t m, o; + uint_t m1 = 0, m2 = 0, o; - /* If we are in the middle of a burst, take it into account also. */ - if (zilog->zl_cur_size > 0) { - o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m); - } else { + /* If we are in the middle of a burst, take it as another data point. */ + if (zilog->zl_cur_size > 0) + o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m1); + else o = UINT_MAX; - m = 0; - } - /* Find minimum optimal size. We don't need to go below that. */ - for (int i = 0; i < ZIL_BURSTS; i++) - o = MIN(o, zilog->zl_prev_opt[i]); - - /* Find two biggest minimal first block sizes above the optimal. */ - uint_t m1 = MAX(m, o), m2 = o; + /* Find two largest minimal first block sizes. */ for (int i = 0; i < ZIL_BURSTS; i++) { - m = zilog->zl_prev_min[i]; - if (m >= m1) { + uint_t cur = zilog->zl_prev_min[i]; + if (cur >= m1) { m2 = m1; - m1 = m; - } else if (m > m2) { - m2 = m; + m1 = cur; + } else if (cur > m2) { + m2 = cur; } } - /* - * If second minimum size gives 50% saving -- use it. It may cost us - * one additional write later, but the space saving is just too big. - */ - return ((m1 < m2 * 2) ? m1 : m2); + /* Minimum should guarantee progress in most cases. */ + *min_predict = (m1 < m2 * 2) ? m1 : m2; + + /* Maximum doesn't need to go below the minimum optimal size. */ + for (int i = 0; i < ZIL_BURSTS; i++) + o = MIN(o, zilog->zl_prev_opt[i]); + m1 = MAX(m1, o); + m2 = MAX(m2, o); + *max_predict = (m1 < m2 * 2) ? 
m1 : m2; } /* @@ -1845,12 +1858,13 @@ zil_lwb_predict(zilog_t *zilog) * Has to be called under zl_issuer_lock to chain more lwbs. */ static lwb_t * -zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) +zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb) { - uint64_t blksz, plan, plan2; + uint64_t minbs, maxbs; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + membar_producer(); lwb->lwb_state = LWB_STATE_CLOSED; /* @@ -1875,27 +1889,34 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) * Try to predict what can it be and plan for the worst case. */ uint_t m; - plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m); + maxbs = zil_lwb_plan(zilog, zilog->zl_cur_left, &m); + minbs = m; if (zilog->zl_parallel) { - plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left + - zil_lwb_predict(zilog), &m); - if (plan < plan2) - plan = plan2; + uint64_t minp, maxp; + zil_lwb_predict(zilog, &minp, &maxp); + maxp = zil_lwb_plan(zilog, zilog->zl_cur_left + maxp, + &m); + if (maxbs < maxp) + maxbs = maxp; } } else { /* * The previous burst is done and we can only predict what * will come next. */ - plan = zil_lwb_predict(zilog); + zil_lwb_predict(zilog, &minbs, &maxbs); } - blksz = plan + sizeof (zil_chain_t); - blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t); - blksz = MIN(blksz, zilog->zl_max_block_size); - DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz, - uint64_t, plan); - return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state)); + minbs += sizeof (zil_chain_t); + maxbs += sizeof (zil_chain_t); + minbs = P2ROUNDUP_TYPED(minbs, ZIL_MIN_BLKSZ, uint64_t); + maxbs = P2ROUNDUP_TYPED(maxbs, ZIL_MIN_BLKSZ, uint64_t); + maxbs = MIN(maxbs, zilog->zl_max_block_size); + minbs = MIN(minbs, maxbs); + DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, minbs, + uint64_t, maxbs); + + return (zil_alloc_lwb(zilog, NULL, minbs, maxbs, 0, 0)); } /* @@ -1944,14 +1965,16 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) mutex_exit(&zilog->zl_lock); next_lwb: - if (lwb->lwb_slim) + if (lwb->lwb_flags & LWB_FLAG_SLIM) zilc = (zil_chain_t *)lwb->lwb_buf; else zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax); - int wsz = lwb->lwb_sz; + uint64_t alloc_size = BP_GET_LSIZE(&lwb->lwb_blk); + int wsz = alloc_size; if (lwb->lwb_error == 0) { abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); - if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk) + if (!(lwb->lwb_flags & LWB_FLAG_SLOG) || + zilog->zl_cur_size <= zil_slog_bulk) prio = ZIO_PRIORITY_SYNC_WRITE; else prio = ZIO_PRIORITY_ASYNC_WRITE; @@ -1959,16 +1982,17 @@ next_lwb: ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0, - &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done, + &lwb->lwb_blk, lwb_abd, alloc_size, zil_lwb_write_done, lwb, prio, ZIO_FLAG_CANFAIL, &zb); zil_lwb_add_block(lwb, &lwb->lwb_blk); - if (lwb->lwb_slim) { + if (lwb->lwb_flags & LWB_FLAG_SLIM) { /* For Slim ZIL only write what is used. 
*/ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int); - ASSERT3S(wsz, <=, lwb->lwb_sz); - zio_shrink(lwb->lwb_write_zio, wsz); + ASSERT3S(wsz, <=, alloc_size); + if (wsz < alloc_size) + zio_shrink(lwb->lwb_write_zio, wsz); wsz = lwb->lwb_write_zio->io_size; } memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); @@ -2004,13 +2028,53 @@ next_lwb: BP_ZERO(bp); error = lwb->lwb_error; if (error == 0) { - error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz, - &slog); + /* + * Allocation flexibility depends on LWB state: + * if NEW: allow range allocation and larger sizes; + * if OPENED: use fixed predetermined allocation size; + * if CLOSED + Slim: allocate precisely for actual usage. + */ + boolean_t flexible = (nlwb->lwb_state == LWB_STATE_NEW); + if (flexible) { + /* We need to prevent opening till we update lwb_sz. */ + mutex_enter(&nlwb->lwb_lock); + flexible = (nlwb->lwb_state == LWB_STATE_NEW); + if (!flexible) + mutex_exit(&nlwb->lwb_lock); /* We lost. */ + } + boolean_t closed_slim = (nlwb->lwb_state == LWB_STATE_CLOSED && + (lwb->lwb_flags & LWB_FLAG_SLIM)); + + uint64_t min_size, max_size; + if (closed_slim) { + /* This transition is racy, but only one way. */ + membar_consumer(); + min_size = max_size = P2ROUNDUP_TYPED(nlwb->lwb_nused, + ZIL_MIN_BLKSZ, uint64_t); + } else if (flexible) { + min_size = nlwb->lwb_min_sz; + max_size = nlwb->lwb_sz; + } else { + min_size = max_size = nlwb->lwb_sz; + } + + error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, + min_size, max_size, &slog, flexible); + if (error == 0) { + if (closed_slim) + ASSERT3U(BP_GET_LSIZE(bp), ==, max_size); + else if (flexible) + nlwb->lwb_sz = BP_GET_LSIZE(bp); + else + ASSERT3U(BP_GET_LSIZE(bp), ==, nlwb->lwb_sz); + } + if (flexible) + mutex_exit(&nlwb->lwb_lock); } if (error == 0) { ASSERT3U(BP_GET_BIRTH(bp), ==, txg); - BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 : - ZIO_CHECKSUM_ZILOG); + BP_SET_CHECKSUM(bp, (nlwb->lwb_flags & LWB_FLAG_SLIM) ? + ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); bp->blk_cksum = lwb->lwb_blk.blk_cksum; bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; } @@ -2039,14 +2103,15 @@ next_lwb: if (nlwb) { nlwb->lwb_blk = *bp; nlwb->lwb_error = error; - nlwb->lwb_slog = slog; + if (slog) + nlwb->lwb_flags |= LWB_FLAG_SLOG; nlwb->lwb_alloc_txg = txg; if (nlwb->lwb_state != LWB_STATE_READY) nlwb = NULL; } mutex_exit(&zilog->zl_lock); - if (lwb->lwb_slog) { + if (lwb->lwb_flags & LWB_FLAG_SLOG) { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, lwb->lwb_nused); @@ -2220,7 +2285,6 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); - ASSERT3P(lwb->lwb_buf, !=, NULL); zil_lwb_write_open(zilog, lwb); @@ -2262,9 +2326,10 @@ cont: (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { list_insert_tail(ilwbs, lwb); - lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED); + lwb = zil_lwb_write_close(zilog, lwb); if (lwb == NULL) return (NULL); + zil_lwb_write_open(zilog, lwb); lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; } @@ -2554,7 +2619,7 @@ zil_itxg_clean(void *arg) * called) we will hit this case. */ if (itx->itx_lr.lrc_txtype == TX_COMMIT) - zil_commit_waiter_skip(itx->itx_private); + zil_commit_waiter_done(itx->itx_private, 0); zil_itx_destroy(itx, 0); } @@ -2742,6 +2807,7 @@ zil_crash_clean(zilog_t *zilog, uint64_t synced_txg) } /* This LWB is from the past, so we can clean it up now. 
*/ + ASSERT(lwb->lwb_flags & LWB_FLAG_CRASHED); list_remove(&zilog->zl_lwb_crash_list, lwb); if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); @@ -2981,7 +3047,7 @@ zil_prune_commit_list(zilog_t *zilog) * never any itx's for it to wait on), so it's * safe to skip this waiter and mark it done. */ - zil_commit_waiter_skip(itx->itx_private); + zil_commit_waiter_done(itx->itx_private, 0); } else { zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); } @@ -3212,15 +3278,21 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * "next" lwb on-disk. When this happens, we must stall * the ZIL write pipeline; see the comment within * zil_commit_writer_stall() for more details. + * + * ESHUTDOWN has to be handled carefully here. If we get it, + * then the pool suspended and zil_crash() was called, so we + * need to stop trying and just get an error back to the + * callers. */ int err = 0; while ((lwb = list_remove_head(ilwbs)) != NULL) { - err = zil_lwb_write_issue(zilog, lwb); - if (err != 0) - break; + if (err == 0) + err = zil_lwb_write_issue(zilog, lwb); } - if (err == 0) + if (err != ESHUTDOWN) err = zil_commit_writer_stall(zilog); + if (err == ESHUTDOWN) + err = SET_ERROR(EIO); /* * Additionally, we have to signal and mark the "nolwb" @@ -3230,7 +3302,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) */ zil_commit_waiter_t *zcw; while ((zcw = list_remove_head(&nolwb_waiters)) != NULL) - zil_commit_waiter_skip(zcw); + zil_commit_waiter_done(zcw, err); /* * And finally, we have to destroy the itx's that @@ -3238,7 +3310,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * the itx's callback if one exists for the itx. */ while ((itx = list_remove_head(&nolwb_itxs)) != NULL) - zil_itx_destroy(itx, 0); + zil_itx_destroy(itx, err); } else { ASSERT(list_is_empty(&nolwb_waiters)); ASSERT3P(lwb, !=, NULL); @@ -3292,17 +3364,17 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) (!zilog->zl_parallel || zilog->zl_suspend > 0)) { zil_burst_done(zilog); list_insert_tail(ilwbs, lwb); - lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); + lwb = zil_lwb_write_close(zilog, lwb); if (lwb == NULL) { int err = 0; while ((lwb = list_remove_head(ilwbs)) != NULL) { - err = zil_lwb_write_issue(zilog, lwb); - if (err != 0) - break; + if (err == 0) + err = zil_lwb_write_issue( + zilog, lwb); } - if (err == 0) - zil_commit_writer_stall(zilog); + if (err != ESHUTDOWN) + (void) zil_commit_writer_stall(zilog); } } } @@ -3470,7 +3542,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * hasn't been issued. */ zil_burst_done(zilog); - lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); + lwb_t *nlwb = zil_lwb_write_close(zilog, lwb); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); @@ -3546,7 +3618,7 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) * commit itxs. When this occurs, the commit waiters linked * off of these commit itxs will not be committed to an * lwb. Additionally, these commit waiters will not be - * marked done until zil_commit_waiter_skip() is called via + * marked done until zil_commit_waiter_done() is called via * zil_itxg_clean(). * * Thus, it's possible for this commit waiter (i.e. 
the @@ -3624,7 +3696,7 @@ zil_alloc_commit_waiter(void) list_link_init(&zcw->zcw_node); zcw->zcw_lwb = NULL; zcw->zcw_done = B_FALSE; - zcw->zcw_zio_error = 0; + zcw->zcw_error = 0; return (zcw); } @@ -3728,6 +3800,9 @@ zil_crash(zilog_t *zilog) */ for (lwb_t *lwb = list_head(&zilog->zl_lwb_crash_list); lwb != NULL; lwb = list_next(&zilog->zl_lwb_crash_list, lwb)) { + ASSERT(!(lwb->lwb_flags & LWB_FLAG_CRASHED)); + lwb->lwb_flags |= LWB_FLAG_CRASHED; + itx_t *itx; while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) zil_itx_destroy(itx, EIO); @@ -3736,7 +3811,7 @@ zil_crash(zilog_t *zilog) while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { mutex_enter(&zcw->zcw_lock); zcw->zcw_lwb = NULL; - zcw->zcw_zio_error = EIO; + zcw->zcw_error = EIO; zcw->zcw_done = B_TRUE; cv_broadcast(&zcw->zcw_cv); mutex_exit(&zcw->zcw_lock); @@ -4014,7 +4089,7 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid) zil_commit_waiter(zilog, zcw); int err = 0; - if (zcw->zcw_zio_error != 0) { + if (zcw->zcw_error != 0) { /* * If there was an error writing out the ZIL blocks that * this thread is waiting on, then we fallback to @@ -4149,7 +4224,7 @@ zil_lwb_cons(void *vbuf, void *unused, int kmflag) offsetof(zil_commit_waiter_t, zcw_node)); avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); - mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&lwb->lwb_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } @@ -4158,7 +4233,7 @@ zil_lwb_dest(void *vbuf, void *unused) { (void) unused; lwb_t *lwb = vbuf; - mutex_destroy(&lwb->lwb_vdev_lock); + mutex_destroy(&lwb->lwb_lock); avl_destroy(&lwb->lwb_vdev_tree); list_destroy(&lwb->lwb_waiters); list_destroy(&lwb->lwb_itxs); @@ -4381,7 +4456,7 @@ zil_close(zilog_t *zilog) if (lwb != NULL) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW); - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + ASSERT0P(lwb->lwb_buf); zil_free_lwb(zilog, lwb); } mutex_exit(&zilog->zl_lock); @@ -4472,16 +4547,16 @@ zil_suspend(const char *osname, void **cookiep) cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); mutex_exit(&zilog->zl_lock); - if (cookiep == NULL) + if (zilog->zl_restart_txg > 0) { + /* ZIL crashed while we were waiting. */ + zil_resume(os); + error = SET_ERROR(EBUSY); + } else if (cookiep == NULL) zil_resume(os); else *cookiep = os; - if (zilog->zl_restart_txg > 0) - /* ZIL crashed while we were waiting. 
*/ - return (SET_ERROR(EBUSY)); - - return (0); + return (error); } /* diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index 3f0ddb63249d..74373f759cec 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -3318,8 +3318,8 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) } else if (any_failed && candidate > SPA_OLD_GANGBLOCKSIZE && spa_feature_is_enabled(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER) && !spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) { - dmu_tx_t *tx = - dmu_tx_create_assigned(spa->spa_dsl_pool, txg + 1); + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, + MAX(txg, spa_syncing_txg(spa) + 1)); dsl_sync_task_nowait(spa->spa_dsl_pool, zio_update_feature, (void *)SPA_FEATURE_DYNAMIC_GANG_HEADER, tx); @@ -4434,12 +4434,15 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) */ int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, - uint64_t size, boolean_t *slog) + uint64_t min_size, uint64_t max_size, boolean_t *slog, + boolean_t allow_larger) { int error; zio_alloc_list_t io_alloc_list; + uint64_t alloc_size = 0; ASSERT(txg > spa_syncing_txg(spa)); + ASSERT3U(min_size, <=, max_size); metaslab_trace_init(&io_alloc_list); @@ -4448,7 +4451,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, * Fill in the obvious ones before calling into metaslab_alloc(). */ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); - BP_SET_PSIZE(new_bp, size); + BP_SET_PSIZE(new_bp, max_size); BP_SET_LEVEL(new_bp, 0); /* @@ -4463,43 +4466,51 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, ZIOSTAT_BUMP(ziostat_total_allocations); /* Try log class (dedicated slog devices) first */ - error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, - txg, NULL, flags, &io_alloc_list, allocator, NULL); + error = metaslab_alloc_range(spa, spa_log_class(spa), min_size, + max_size, new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, + NULL, &alloc_size); *slog = (error == 0); /* Try special_embedded_log class (reserved on special vdevs) */ if (error != 0) { - error = metaslab_alloc(spa, spa_special_embedded_log_class(spa), - size, new_bp, 1, txg, NULL, flags, &io_alloc_list, - allocator, NULL); + error = metaslab_alloc_range(spa, + spa_special_embedded_log_class(spa), min_size, max_size, + new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, + NULL, &alloc_size); } /* Try special class (general special vdev allocation) */ if (error != 0) { - error = metaslab_alloc(spa, spa_special_class(spa), size, - new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, - NULL); + error = metaslab_alloc_range(spa, spa_special_class(spa), + min_size, max_size, new_bp, 1, txg, NULL, flags, + &io_alloc_list, allocator, NULL, &alloc_size); } /* Try embedded_log class (reserved on normal vdevs) */ if (error != 0) { - error = metaslab_alloc(spa, spa_embedded_log_class(spa), size, - new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, - NULL); + error = metaslab_alloc_range(spa, spa_embedded_log_class(spa), + min_size, max_size, new_bp, 1, txg, NULL, flags, + &io_alloc_list, allocator, NULL, &alloc_size); } /* Finally fall back to normal class */ if (error != 0) { ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks); - error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, - NULL); + error = metaslab_alloc_range(spa, spa_normal_class(spa), + min_size, max_size, new_bp, 1, txg, NULL, flags, + 
&io_alloc_list, allocator, NULL, &alloc_size); } metaslab_trace_fini(&io_alloc_list); if (error == 0) { - BP_SET_LSIZE(new_bp, size); - BP_SET_PSIZE(new_bp, size); + if (!allow_larger) + alloc_size = MIN(alloc_size, max_size); + else if (max_size <= SPA_OLD_MAXBLOCKSIZE) + alloc_size = MIN(alloc_size, SPA_OLD_MAXBLOCKSIZE); + alloc_size = P2ALIGN_TYPED(alloc_size, ZIL_MIN_BLKSZ, uint64_t); + + BP_SET_LSIZE(new_bp, alloc_size); + BP_SET_PSIZE(new_bp, alloc_size); BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(new_bp, spa_version(spa) >= SPA_VERSION_SLIM_ZIL @@ -4527,8 +4538,8 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, } } else { zfs_dbgmsg("%s: zil block allocation failure: " - "size %llu, error %d", spa_name(spa), (u_longlong_t)size, - error); + "min_size %llu, max_size %llu, error %d", spa_name(spa), + (u_longlong_t)min_size, (u_longlong_t)max_size, error); } return (error); @@ -4563,8 +4574,29 @@ zio_vdev_io_start(zio_t *zio) ASSERT0(zio->io_child_error[ZIO_CHILD_VDEV]); if (vd == NULL) { - if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) - spa_config_enter(spa, SCL_ZIO, zio, RW_READER); + if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) { + /* + * A deadlock workaround. The ddt_prune_unique_entries() + * -> prune_candidates_sync() code path takes the + * SCL_ZIO reader lock and may request it again here. + * If there is another thread who wants the SCL_ZIO + * writer lock, then scl_write_wanted will be set. + * Thus, the spa_config_enter_priority() is used to + * ignore pending writer requests. + * + * The locking should be revised to remove the need + * for this workaround. If that's not workable then + * it should only be applied to the zios involved in + * the pruning process. This impacts the read/write + * I/O balance while pruning. + */ + if (spa->spa_active_ddt_prune) + spa_config_enter_priority(spa, SCL_ZIO, zio, + RW_READER); + else + spa_config_enter(spa, SCL_ZIO, zio, + RW_READER); + } /* * The mirror_ops handle multiple DVAs in a single BP. @@ -5294,6 +5326,16 @@ zio_ready(zio_t *zio) return (NULL); } + if (zio_injection_enabled) { + hrtime_t target = zio_handle_ready_delay(zio); + if (target != 0 && zio->io_target_timestamp == 0) { + zio->io_stage >>= 1; + zio->io_target_timestamp = target; + zio_delay_interrupt(zio); + return (NULL); + } + } + if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(BP_GET_BIRTH(bp) == zio->io_txg || @@ -5527,9 +5569,12 @@ zio_done(zio_t *zio) zio->io_vd->vdev_stat.vs_slow_ios++; mutex_exit(&zio->io_vd->vdev_stat_lock); - (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, - zio->io_spa, zio->io_vd, &zio->io_bookmark, - zio, 0); + if (zio->io_vd->vdev_slow_io_events) { + (void) zfs_ereport_post( + FM_EREPORT_ZFS_DELAY, + zio->io_spa, zio->io_vd, + &zio->io_bookmark, zio, 0); + } } } } diff --git a/sys/contrib/openzfs/module/zfs/zio_inject.c b/sys/contrib/openzfs/module/zfs/zio_inject.c index 981a1be4847c..c3adfdab54ce 100644 --- a/sys/contrib/openzfs/module/zfs/zio_inject.c +++ b/sys/contrib/openzfs/module/zfs/zio_inject.c @@ -827,6 +827,44 @@ zio_handle_export_delay(spa_t *spa, hrtime_t elapsed) zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT); } +/* + * For testing, inject a delay before ready state. + */ +hrtime_t +zio_handle_ready_delay(zio_t *zio) +{ + inject_handler_t *handler; + hrtime_t now = gethrtime(); + hrtime_t target = 0; + + /* + * Ignore I/O not associated with any logical data. 
+ */ + if (zio->io_logical == NULL) + return (0); + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + if (zio->io_spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_DELAY_READY) + continue; + + /* If this handler matches, inject the delay */ + if (zio_match_iotype(zio, handler->zi_record.zi_iotype) && + zio_match_handler(&zio->io_logical->io_bookmark, + zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, + zio_match_dva(zio), &handler->zi_record, zio->io_error)) { + target = now + (hrtime_t)handler->zi_record.zi_timer; + break; + } + } + + rw_exit(&inject_lock); + return (target); +} + static int zio_calculate_range(const char *pool, zinject_record_t *record) { @@ -970,9 +1008,9 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) if (zio_pool_handler_exists(name, record->zi_cmd)) return (SET_ERROR(EEXIST)); - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); boolean_t has_spa = spa_lookup(name) != NULL; - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); if (record->zi_cmd == ZINJECT_DELAY_IMPORT && has_spa) return (SET_ERROR(EEXIST)); @@ -1057,7 +1095,7 @@ zio_inject_list_next(int *id, char *name, size_t buflen, inject_handler_t *handler; int ret; - mutex_enter(&spa_namespace_lock); + spa_namespace_enter(FTAG); rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; @@ -1079,7 +1117,7 @@ zio_inject_list_next(int *id, char *name, size_t buflen, } rw_exit(&inject_lock); - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); return (ret); } diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index 29f51e230a37..407758641580 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -38,25 +38,36 @@ * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, 2025, Klara, Inc. */ /* * Note on locking of zvol state structures. * - * These structures are used to maintain internal state used to emulate block - * devices on top of zvols. In particular, management of device minor number - * operations - create, remove, rename, and set_snapdev - involves access to - * these structures. The zvol_state_lock is primarily used to protect the - * zvol_state_list. The zv->zv_state_lock is used to protect the contents - * of the zvol_state_t structures, as well as to make sure that when the - * time comes to remove the structure from the list, it is not in use, and - * therefore, it can be taken off zvol_state_list and freed. + * zvol_state_t represents the connection between a single dataset + * (DMU_OST_ZVOL) and the device "minor" (some OS-specific representation of a + * "disk" or "device" or "volume", eg, a /dev/zdXX node, a GEOM object, etc). * - * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol, - * e.g. for the duration of receive and rollback operations. This lock can be - * held for significant periods of time. Given that it is undesirable to hold - * mutexes for long periods of time, the following lock ordering applies: + * The global zvol_state_lock is used to protect access to zvol_state_list and + * zvol_htable, which are the primary way to obtain a zvol_state_t from a name. 
+ * It should not be used for anything not name-related, and you should avoid + * sleeping or waiting while it's held. See zvol_find_by_name(), zvol_insert(), + * zvol_remove(). + * + * The zv_state_lock is used to protect the contents of the associated + * zvol_state_t. Most of the zvol_state_t is dedicated to control and + * configuration; almost none of it is needed for data operations (that is, + * read, write, flush) so this lock is rarely taken during general IO. It + * should be released quickly; you should avoid sleeping or waiting while it's + * held. + * + * zv_suspend_lock is used to suspend IO/data operations to a zvol. The read + * half should be held for the duration of an IO operation. The write half should + * be taken when something needs to wait for IO to complete and then block further IO, + * eg for the duration of receive and rollback operations. This lock can be + * held for long periods of time. + * + * Thus, the following lock ordering applies. * - take zvol_state_lock if necessary, to protect zvol_state_list * - take zv_suspend_lock if necessary, by the code path in question * - take zv_state_lock to protect zvol_state_t @@ -67,9 +78,8 @@ * these operations are serialized per pool. Consequently, we can be certain * that for a given zvol, there is only one operation at a time in progress. * That is why one can be sure that first, zvol_state_t for a given zvol is - * allocated and placed on zvol_state_list, and then other minor operations - * for this zvol are going to proceed in the order of issue. - * + * allocated and placed on zvol_state_list, and then other minor operations for + * this zvol are going to proceed in the order of issue. */ #include <sys/dataset_kstats.h> @@ -400,7 +410,7 @@ zvol_set_volthreading(const char *name, boolean_t value) { zvol_state_t *zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL) - return (SET_ERROR(ENOENT)); + return (-1); zv->zv_threading = value; mutex_exit(&zv->zv_state_lock); return (0); } @@ -537,7 +547,8 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) if (error) { dmu_tx_abort(tx); } else { - dmu_write(os, ZVOL_OBJ, offset, length, data, tx); + dmu_write(os, ZVOL_OBJ, offset, length, data, tx, + DMU_READ_PREFETCH); (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); } @@ -1135,20 +1146,34 @@ zvol_tag(zvol_state_t *zv) /* * Suspend the zvol for recv and rollback. */ -zvol_state_t * -zvol_suspend(const char *name) +int +zvol_suspend(const char *name, zvol_state_t **zvp) { zvol_state_t *zv; zv = zvol_find_by_name(name, RW_WRITER); if (zv == NULL) - return (NULL); + return (SET_ERROR(ENOENT)); /* block all I/O, release in zvol_resume. */ ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); + /* + * If it's being removed, unlock and return error. It doesn't make any + * sense to try to suspend a zvol being removed, but being here also + * means that zvol_remove_minors_impl() is about to call zvol_remove() + * and then destroy the zvol_state_t, so returning a pointer to it for + * the caller to mess with would be a disaster anyway. 
+ */ + if (zv->zv_flags & ZVOL_REMOVING) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + /* NB: Returning EIO here to match zfsvfs_teardown() */ + return (SET_ERROR(EIO)); + } + atomic_inc(&zv->zv_suspend_ref); if (zv->zv_open_count > 0) @@ -1161,7 +1186,8 @@ zvol_suspend(const char *name) mutex_exit(&zv->zv_state_lock); /* zv_suspend_lock is released in zvol_resume() */ - return (zv); + *zvp = zv; + return (0); } int @@ -1207,7 +1233,7 @@ zvol_first_open(zvol_state_t *zv, boolean_t readonly) ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); - ASSERT(mutex_owned(&spa_namespace_lock)); + ASSERT(spa_namespace_held()); boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL)); error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os); @@ -1277,7 +1303,7 @@ zvol_create_snap_minor_cb(const char *dsname, void *arg) list_t *minors_list = j->list; const char *name = j->name; - ASSERT0(MUTEX_HELD(&spa_namespace_lock)); + ASSERT0(spa_namespace_held()); /* skip the designated dataset */ if (name && strcmp(dsname, name) == 0) @@ -1377,7 +1403,7 @@ zvol_create_minors_cb(const char *dsname, void *arg) int error; list_t *minors_list = arg; - ASSERT0(MUTEX_HELD(&spa_namespace_lock)); + ASSERT0(spa_namespace_held()); error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL); if (error) @@ -1570,184 +1596,156 @@ zvol_create_minors_impl(zvol_task_t *task) } /* - * Remove minors for specified dataset including children and snapshots. + * Remove minors for specified dataset and, optionally, its children and + * snapshots. */ - -/* - * Remove the minor for a given zvol. This will do it all: - * - flag the zvol for removal, so new requests are rejected - * - wait until outstanding requests are completed - * - remove it from lists - * - free it - * It's also usable as a taskq task, and smells nice too. - */ -static void -zvol_remove_minor_task(void *arg) -{ - zvol_state_t *zv = (zvol_state_t *)arg; - - ASSERT(!RW_LOCK_HELD(&zvol_state_lock)); - ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); - - mutex_enter(&zv->zv_state_lock); - while (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { - zv->zv_flags |= ZVOL_REMOVING; - cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock); - } - mutex_exit(&zv->zv_state_lock); - - rw_enter(&zvol_state_lock, RW_WRITER); - mutex_enter(&zv->zv_state_lock); - - zvol_remove(zv); - zvol_os_clear_private(zv); - - mutex_exit(&zv->zv_state_lock); - rw_exit(&zvol_state_lock); - - zvol_os_free(zv); -} - -static void -zvol_free_task(void *arg) -{ - zvol_os_free(arg); -} - static void zvol_remove_minors_impl(zvol_task_t *task) { zvol_state_t *zv, *zv_next; const char *name = task ? task->zt_name1 : NULL; int namelen = ((name) ? strlen(name) : 0); - taskqid_t t; - list_t delay_list, free_list; + boolean_t children = task ? !!task->zt_value : B_TRUE; if (zvol_inhibit_dev) return; - list_create(&delay_list, sizeof (zvol_state_t), - offsetof(zvol_state_t, zv_next)); - list_create(&free_list, sizeof (zvol_state_t), - offsetof(zvol_state_t, zv_next)); + /* + * We collect up zvols that we want to remove on a separate list, so + * that we don't have to hold zvol_state_lock for the whole time. + * + * We can't remove them from the global lists until we're completely + * done with them, because that would make them appear to ZFS-side ops + * that they don't exist, and the name might be reused, which can't be + * good. 
+ */ + list_t remove_list; + list_create(&remove_list, sizeof (zvol_state_t), + offsetof(zvol_state_t, zv_remove_node)); - rw_enter(&zvol_state_lock, RW_WRITER); + rw_enter(&zvol_state_lock, RW_READER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); + if (zv->zv_flags & ZVOL_REMOVING) { + /* Another thread is handling shutdown, skip it. */ + mutex_exit(&zv->zv_state_lock); + continue; + } + + /* + * This zvol should be removed if: + * - no name was offered (ie removing all at shutdown); or + * - name matches exactly; or + * - we were asked to remove children, and + * - the start of the name matches, and + * - there is a '/' immediately after the matched name; or + * - there is a '@' immediately after the matched name */ if (name == NULL || strcmp(zv->zv_name, name) == 0 || - (strncmp(zv->zv_name, name, namelen) == 0 && + (children && strncmp(zv->zv_name, name, namelen) == 0 && (zv->zv_name[namelen] == '/' || zv->zv_name[namelen] == '@'))) { - /* - * By holding zv_state_lock here, we guarantee that no - * one is currently using this zv - */ /* - * If in use, try to throw everyone off and try again - * later. + * Matched, so mark it for removal. We want to take the + * write half of the suspend lock to make sure that + * the zvol is not suspended, and give any data ops + * a chance to finish. */ - if (zv->zv_open_count > 0 || - atomic_read(&zv->zv_suspend_ref)) { - zv->zv_flags |= ZVOL_REMOVING; - t = taskq_dispatch( - zv->zv_objset->os_spa->spa_zvol_taskq, - zvol_remove_minor_task, zv, TQ_SLEEP); - if (t == TASKQID_INVALID) { - /* - * Couldn't create the task, so we'll - * do it in place once the loop is - * finished. - */ - list_insert_head(&delay_list, zv); - } + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, RW_WRITER); + mutex_enter(&zv->zv_state_lock); + + if (zv->zv_flags & ZVOL_REMOVING) { + /* Another thread has taken it, let them. */ mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); continue; } - zvol_remove(zv); - /* - * Cleared while holding zvol_state_lock as a writer - * which will prevent zvol_open() from opening it. + * Mark it and unlock. New entries will see the flag + * and return ENXIO. */ - zvol_os_clear_private(zv); - - /* Drop zv_state_lock before zvol_free() */ + zv->zv_flags |= ZVOL_REMOVING; mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); - /* Try parallel zv_free, if failed do it in place */ - t = taskq_dispatch(system_taskq, zvol_free_task, zv, - TQ_SLEEP); - if (t == TASKQID_INVALID) - list_insert_head(&free_list, zv); - } else { + /* Put it on the list for the next stage. */ + list_insert_head(&remove_list, zv); + } else mutex_exit(&zv->zv_state_lock); - } } - rw_exit(&zvol_state_lock); - - /* Wait for zvols that we couldn't create a remove task for */ - while ((zv = list_remove_head(&delay_list)) != NULL) - zvol_remove_minor_task(zv); - /* Free any that we couldn't free in parallel earlier */ - while ((zv = list_remove_head(&free_list)) != NULL) - zvol_os_free(zv); -} - -/* Remove minor for this specific volume only */ -static int -zvol_remove_minor_impl(const char *name) -{ - zvol_state_t *zv = NULL, *zv_next; + rw_exit(&zvol_state_lock); - if (zvol_inhibit_dev) - return (0); + /* Didn't match any, nothing to do! 
*/ + if (list_is_empty(&remove_list)) { + if (task) + task->zt_error = SET_ERROR(ENOENT); + return; + } - rw_enter(&zvol_state_lock, RW_WRITER); - - for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { - zv_next = list_next(&zvol_state_list, zv); + /* Actually shut them all down. */ + for (zv = list_head(&remove_list); zv != NULL; zv = zv_next) { + zv_next = list_next(&remove_list, zv); mutex_enter(&zv->zv_state_lock); - if (strcmp(zv->zv_name, name) == 0) - /* Found, leave the the loop with zv_lock held */ - break; - mutex_exit(&zv->zv_state_lock); - } - if (zv == NULL) { - rw_exit(&zvol_state_lock); - return (SET_ERROR(ENOENT)); - } - - ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + /* + * Still open or suspended, just wait. This can happen if, for + * example, we managed to acquire zv_state_lock in the moments + * where zvol_open() or zvol_release() are trading locks to + * call zvol_first_open() or zvol_last_close(). + */ + while (zv->zv_open_count > 0 || + atomic_read(&zv->zv_suspend_ref)) + cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock); - if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { /* - * In use, so try to throw everyone off, then wait - * until finished. + * No users, shut down the OS side. This may not remove the + * minor from view immediately, depending on the kernel + * specifics, but it will ensure that it is unusable and that + * this zvol_state_t can never again be reached from an OS-side + * operation. */ - zv->zv_flags |= ZVOL_REMOVING; + zvol_os_remove_minor(zv); mutex_exit(&zv->zv_state_lock); + + /* Remove it from the name lookup lists */ + rw_enter(&zvol_state_lock, RW_WRITER); + zvol_remove(zv); rw_exit(&zvol_state_lock); - zvol_remove_minor_task(zv); - return (0); } - zvol_remove(zv); - zvol_os_clear_private(zv); + /* + * Our own references on remove_list is the last one, free them and + * we're done. + */ + while ((zv = list_remove_head(&remove_list)) != NULL) + zvol_os_free(zv); - mutex_exit(&zv->zv_state_lock); - rw_exit(&zvol_state_lock); + list_destroy(&remove_list); +} - zvol_os_free(zv); +/* Remove minor for this specific volume only */ +static int +zvol_remove_minor_impl(const char *name) +{ + if (zvol_inhibit_dev) + return (0); - return (0); + zvol_task_t task; + memset(&task, 0, sizeof (zvol_task_t)); + strlcpy(task.zt_name1, name, sizeof (task.zt_name1)); + task.zt_value = B_FALSE; + + zvol_remove_minors_impl(&task); + + return (task.zt_error); } /* @@ -2067,6 +2065,7 @@ zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); task->zt_op = ZVOL_ASYNC_REMOVE_MINORS; strlcpy(task->zt_name1, name, sizeof (task->zt_name1)); + task->zt_value = B_TRUE; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); @@ -2188,14 +2187,6 @@ zvol_fini_impl(void) zvol_remove_minors_impl(NULL); - /* - * The call to "zvol_remove_minors_impl" may dispatch entries to - * the system_taskq, but it doesn't wait for those entries to - * complete before it returns. Thus, we must wait for all of the - * removals to finish, before we can continue. 
- */ - taskq_wait_outstanding(system_taskq, 0); - kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); list_destroy(&zvol_state_list); rw_destroy(&zvol_state_lock); diff --git a/sys/contrib/openzfs/module/zstd/include/aarch64_compat.h b/sys/contrib/openzfs/module/zstd/include/aarch64_compat.h deleted file mode 100644 index 9500a832b81c..000000000000 --- a/sys/contrib/openzfs/module/zstd/include/aarch64_compat.h +++ /dev/null @@ -1,38 +0,0 @@ -// SPDX-License-Identifier: BSD-3-Clause -/* - * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -/* - * Copyright (c) 2018-2020, Sebastian Gottschall - */ - -#ifdef _KERNEL -#undef __aarch64__ -#endif diff --git a/sys/contrib/openzfs/module/zstd/lib/common/compiler.h b/sys/contrib/openzfs/module/zstd/lib/common/compiler.h index d0f588e2ec3c..c8d65a201212 100644 --- a/sys/contrib/openzfs/module/zstd/lib/common/compiler.h +++ b/sys/contrib/openzfs/module/zstd/lib/common/compiler.h @@ -115,9 +115,6 @@ # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ # define PREFETCH_L1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) # define PREFETCH_L2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1) -# elif defined(__aarch64__) -# define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))) -# define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))) # elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) # define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) # define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) diff --git a/sys/contrib/openzfs/module/zstd/lib/common/zstd_internal.h b/sys/contrib/openzfs/module/zstd/lib/common/zstd_internal.h index 6b1fc44cf9f6..9650af77bcea 100644 --- a/sys/contrib/openzfs/module/zstd/lib/common/zstd_internal.h +++ b/sys/contrib/openzfs/module/zstd/lib/common/zstd_internal.h @@ -12,6 +12,15 @@ #ifndef ZSTD_CCOMMON_H_MODULE #define ZSTD_CCOMMON_H_MODULE +/* + * Disable the aarch64 NEON SIMD intrinsics for kernel builds. Safely + * using them in the kernel context requires saving/restoring the FPU + * registers which is not currently done. + */ +#ifdef _KERNEL +#define ZSTD_NO_INTRINSICS +#endif + /* this module contains definitions which must be identical * across compression, decompression and dictBuilder. * It also contains a few functions useful to at least 2 of them diff --git a/sys/contrib/openzfs/module/zstd/zfs_zstd.c b/sys/contrib/openzfs/module/zstd/zfs_zstd.c index 391216d6e263..c403c001086a 100644 --- a/sys/contrib/openzfs/module/zstd/zfs_zstd.c +++ b/sys/contrib/openzfs/module/zstd/zfs_zstd.c @@ -441,64 +441,6 @@ zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level) } #ifndef IN_LIBSA -static size_t -zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len, - int level) -{ - int16_t zstd_level; - if (zstd_enum_to_level(level, &zstd_level)) { - ZSTDSTAT_BUMP(zstd_stat_com_inval); - return (s_len); - } - /* - * A zstd early abort heuristic. - * - * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently - * 128k), don't try any of this, just go. - * (because experimentally that was a reasonable cutoff for a perf win - * with tiny ratio change) - * - First, we try LZ4 compression, and if it doesn't early abort, we - * jump directly to whatever compression level we intended to try. - * - Second, we try zstd-1 - if that errors out (usually, but not - * exclusively, if it would overflow), we give up early. - * - * If it works, instead we go on and compress anyway. - * - * Why two passes? LZ4 alone gets you a lot of the way, but on highly - * compressible data, it was losing up to 8.5% of the compressed - * savings versus no early abort, and all the zstd-fast levels are - * worse indications on their own than LZ4, and don't improve the LZ4 - * pass noticably if stacked like this. 
- */ - size_t actual_abort_size = zstd_abort_size; - if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level && - s_len >= actual_abort_size) { - int pass_len = 1; - pass_len = zfs_lz4_compress(s_start, d_start, s_len, d_len, 0); - if (pass_len < d_len) { - ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed); - goto keep_trying; - } - ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected); - - pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len, - ZIO_ZSTD_LEVEL_1); - if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) { - ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected); - return (s_len); - } - ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed); - } else { - ZSTDSTAT_BUMP(zstd_stat_passignored); - if (s_len < actual_abort_size) { - ZSTDSTAT_BUMP(zstd_stat_passignored_size); - } - } -keep_trying: - return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level)); - -} - /* Compress block using zstd */ static size_t zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len, @@ -876,9 +818,9 @@ static void __init zstd_mempool_init(void) { zstd_mempool_cctx = - kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); + vmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); zstd_mempool_dctx = - kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); + vmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); for (int i = 0; i < ZSTD_POOL_MAX; i++) { mutex_init(&zstd_mempool_cctx[i].barrier, NULL, @@ -924,8 +866,8 @@ zstd_mempool_deinit(void) release_pool(&zstd_mempool_dctx[i]); } - kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); - kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); + vmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); + vmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); zstd_mempool_dctx = NULL; zstd_mempool_cctx = NULL; } |
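The reworked zil_lwb_predict() in the zil.c hunks above now produces two bounds instead of a single block size: a minimum that should guarantee forward progress and a maximum sized for most observed bursts. Both come from the same "two largest values" rule over the per-burst history. The sketch below is a minimal userspace illustration of that rule only, not part of the change; ZIL_BURSTS = 8 and the sample history are assumptions for the example.

/*
 * Illustrative sketch of the selection rule applied by the new
 * zil_lwb_predict(): prefer the largest previous per-burst minimum,
 * unless falling back to the second largest saves at least half.
 */
#include <stdio.h>
#include <stdint.h>

#define	ZIL_BURSTS	8	/* assumed history depth for the example */

static uint64_t
pick_prediction(const uint64_t *prev_min)
{
	uint64_t m1 = 0, m2 = 0;

	/* Find the two largest values in the history. */
	for (int i = 0; i < ZIL_BURSTS; i++) {
		uint64_t cur = prev_min[i];
		if (cur >= m1) {
			m2 = m1;
			m1 = cur;
		} else if (cur > m2) {
			m2 = cur;
		}
	}

	/* Use m2 only if it is at least a 50% saving over m1. */
	return ((m1 < m2 * 2) ? m1 : m2);
}

int
main(void)
{
	uint64_t hist[ZIL_BURSTS] = { 8192, 12288, 98304, 16384,
	    12288, 8192, 20480, 16384 };

	/* 98304 >= 2 * 20480, so the rule falls back to 20480. */
	printf("predicted minimum: %llu\n",
	    (unsigned long long)pick_prediction(hist));
	return (0);
}

Falling back to the second-largest value only when it halves the block size keeps one oversized burst from inflating every later prediction, at the cost of an occasional extra write.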
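Similarly, zio_alloc_zil() in the zio.c hunk above now takes a min_size/max_size pair plus an allow_larger flag, and trims whatever metaslab_alloc_range() hands back before publishing it in the block pointer. The following is a minimal userspace sketch of just that trimming step; the helper name and the constant values (ZIL_MIN_BLKSZ = 4096, SPA_OLD_MAXBLOCKSIZE = 128K) are assumptions for illustration.

/*
 * Illustrative sketch of how the new zio_alloc_zil() clamps and
 * aligns the size returned by the range allocator.
 */
#include <stdio.h>
#include <stdint.h>

#define	ZIL_MIN_BLKSZ		4096ULL		/* assumed */
#define	SPA_OLD_MAXBLOCKSIZE	(128ULL * 1024)	/* assumed */
#define	MIN(a, b)		((a) < (b) ? (a) : (b))

static uint64_t
trim_zil_alloc(uint64_t alloc_size, uint64_t max_size, int allow_larger)
{
	if (!allow_larger)
		alloc_size = MIN(alloc_size, max_size);
	else if (max_size <= SPA_OLD_MAXBLOCKSIZE)
		alloc_size = MIN(alloc_size, SPA_OLD_MAXBLOCKSIZE);

	/* Round down to ZIL block size granularity (power of two). */
	return (alloc_size & ~(ZIL_MIN_BLKSZ - 1));
}

int
main(void)
{
	/* Allocator returned 150000 bytes in each case. */
	printf("%llu\n", (unsigned long long)
	    trim_zil_alloc(150000, 131072, 0));	/* clamped: 131072 */
	printf("%llu\n", (unsigned long long)
	    trim_zil_alloc(150000, 65536, 1));	/* grows to 131072 */
	printf("%llu\n", (unsigned long long)
	    trim_zil_alloc(150000, 200000, 1));	/* aligned: 147456 */
	return (0);
}

Because an allow_larger caller may receive more than it asked for, the ZIL write path above now derives the on-disk write size from BP_GET_LSIZE() of the allocated block rather than from lwb_sz.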
