Diffstat (limited to 'sys/contrib/openzfs/module/zfs')
132 files changed, 25472 insertions, 11066 deletions
diff --git a/sys/contrib/openzfs/module/zfs/Makefile.in b/sys/contrib/openzfs/module/zfs/Makefile.in deleted file mode 100644 index 653ea0da9bcc..000000000000 --- a/sys/contrib/openzfs/module/zfs/Makefile.in +++ /dev/null @@ -1,157 +0,0 @@ -ifneq ($(KBUILD_EXTMOD),) -src = @abs_srcdir@ -obj = @abs_builddir@ -mfdir = $(obj) -else -mfdir = $(srctree)/$(src) -endif - -MODULE := zfs - -obj-$(CONFIG_ZFS) := $(MODULE).o - -# Suppress unused-value warnings in sparc64 architecture headers -ccflags-$(CONFIG_SPARC64) += -Wno-unused-value - -$(MODULE)-objs += abd.o -$(MODULE)-objs += aggsum.o -$(MODULE)-objs += arc.o -$(MODULE)-objs += blkptr.o -$(MODULE)-objs += bplist.o -$(MODULE)-objs += bpobj.o -$(MODULE)-objs += bptree.o -$(MODULE)-objs += btree.o -$(MODULE)-objs += bqueue.o -$(MODULE)-objs += dataset_kstats.o -$(MODULE)-objs += dbuf.o -$(MODULE)-objs += dbuf_stats.o -$(MODULE)-objs += ddt.o -$(MODULE)-objs += ddt_zap.o -$(MODULE)-objs += dmu.o -$(MODULE)-objs += dmu_diff.o -$(MODULE)-objs += dmu_object.o -$(MODULE)-objs += dmu_objset.o -$(MODULE)-objs += dmu_recv.o -$(MODULE)-objs += dmu_redact.o -$(MODULE)-objs += dmu_send.o -$(MODULE)-objs += dmu_traverse.o -$(MODULE)-objs += dmu_tx.o -$(MODULE)-objs += dmu_zfetch.o -$(MODULE)-objs += dnode.o -$(MODULE)-objs += dnode_sync.o -$(MODULE)-objs += dsl_bookmark.o -$(MODULE)-objs += dsl_crypt.o -$(MODULE)-objs += dsl_dataset.o -$(MODULE)-objs += dsl_deadlist.o -$(MODULE)-objs += dsl_deleg.o -$(MODULE)-objs += dsl_destroy.o -$(MODULE)-objs += dsl_dir.o -$(MODULE)-objs += dsl_pool.o -$(MODULE)-objs += dsl_prop.o -$(MODULE)-objs += dsl_scan.o -$(MODULE)-objs += dsl_synctask.o -$(MODULE)-objs += dsl_userhold.o -$(MODULE)-objs += edonr_zfs.o -$(MODULE)-objs += fm.o -$(MODULE)-objs += gzip.o -$(MODULE)-objs += hkdf.o -$(MODULE)-objs += lz4.o -$(MODULE)-objs += lzjb.o -$(MODULE)-objs += metaslab.o -$(MODULE)-objs += mmp.o -$(MODULE)-objs += multilist.o -$(MODULE)-objs += objlist.o -$(MODULE)-objs += pathname.o -$(MODULE)-objs += range_tree.o -$(MODULE)-objs += refcount.o -$(MODULE)-objs += rrwlock.o -$(MODULE)-objs += sa.o -$(MODULE)-objs += sha256.o -$(MODULE)-objs += skein_zfs.o -$(MODULE)-objs += spa.o -$(MODULE)-objs += spa_boot.o -$(MODULE)-objs += spa_checkpoint.o -$(MODULE)-objs += spa_config.o -$(MODULE)-objs += spa_errlog.o -$(MODULE)-objs += spa_history.o -$(MODULE)-objs += spa_log_spacemap.o -$(MODULE)-objs += spa_misc.o -$(MODULE)-objs += spa_stats.o -$(MODULE)-objs += space_map.o -$(MODULE)-objs += space_reftree.o -$(MODULE)-objs += txg.o -$(MODULE)-objs += uberblock.o -$(MODULE)-objs += unique.o -$(MODULE)-objs += vdev.o -$(MODULE)-objs += vdev_cache.o -$(MODULE)-objs += vdev_draid.o -$(MODULE)-objs += vdev_draid_rand.o -$(MODULE)-objs += vdev_indirect.o -$(MODULE)-objs += vdev_indirect_births.o -$(MODULE)-objs += vdev_indirect_mapping.o -$(MODULE)-objs += vdev_initialize.o -$(MODULE)-objs += vdev_label.o -$(MODULE)-objs += vdev_mirror.o -$(MODULE)-objs += vdev_missing.o -$(MODULE)-objs += vdev_queue.o -$(MODULE)-objs += vdev_raidz.o -$(MODULE)-objs += vdev_raidz_math.o -$(MODULE)-objs += vdev_raidz_math_scalar.o -$(MODULE)-objs += vdev_rebuild.o -$(MODULE)-objs += vdev_removal.o -$(MODULE)-objs += vdev_root.o -$(MODULE)-objs += vdev_trim.o -$(MODULE)-objs += zap.o -$(MODULE)-objs += zap_leaf.o -$(MODULE)-objs += zap_micro.o -$(MODULE)-objs += zcp.o -$(MODULE)-objs += zcp_get.o -$(MODULE)-objs += zcp_global.o -$(MODULE)-objs += zcp_iter.o -$(MODULE)-objs += zcp_set.o -$(MODULE)-objs += zcp_synctask.o -$(MODULE)-objs += zfeature.o 
-$(MODULE)-objs += zfs_byteswap.o -$(MODULE)-objs += zfs_fm.o -$(MODULE)-objs += zfs_fuid.o -$(MODULE)-objs += zfs_ioctl.o -$(MODULE)-objs += zfs_log.o -$(MODULE)-objs += zfs_onexit.o -$(MODULE)-objs += zfs_quota.o -$(MODULE)-objs += zfs_ratelimit.o -$(MODULE)-objs += zfs_replay.o -$(MODULE)-objs += zfs_rlock.o -$(MODULE)-objs += zfs_sa.o -$(MODULE)-objs += zfs_vnops.o -$(MODULE)-objs += zil.o -$(MODULE)-objs += zio.o -$(MODULE)-objs += zio_checksum.o -$(MODULE)-objs += zio_compress.o -$(MODULE)-objs += zio_inject.o -$(MODULE)-objs += zle.o -$(MODULE)-objs += zrlock.o -$(MODULE)-objs += zthr.o -$(MODULE)-objs += zvol.o - -# Suppress incorrect warnings from versions of objtool which are not -# aware of x86 EVEX prefix instructions used for AVX512. -OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512bw.o := y -OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512f.o := y - -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_sse2.o -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_ssse3.o -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx2.o -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512f.o -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512bw.o - -$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neon.o -$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neonx2.o - -$(MODULE)-$(CONFIG_PPC) += vdev_raidz_math_powerpc_altivec.o -$(MODULE)-$(CONFIG_PPC64) += vdev_raidz_math_powerpc_altivec.o - -ifeq ($(CONFIG_ALTIVEC),y) -$(obj)/vdev_raidz_math_powerpc_altivec.o: c_flags += -maltivec -endif - -include $(mfdir)/../os/linux/zfs/Makefile diff --git a/sys/contrib/openzfs/module/zfs/abd.c b/sys/contrib/openzfs/module/zfs/abd.c index bf39cd613330..2c0cda25dbc6 100644 --- a/sys/contrib/openzfs/module/zfs/abd.c +++ b/sys/contrib/openzfs/module/zfs/abd.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -109,7 +109,6 @@ void abd_verify(abd_t *abd) { #ifdef ZFS_DEBUG - ASSERT3U(abd->abd_size, >, 0); ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | @@ -118,6 +117,7 @@ abd_verify(abd_t *abd) IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) { + ASSERT3U(abd->abd_size, >, 0); ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL); } else if (abd_is_gang(abd)) { uint_t child_sizes = 0; @@ -130,6 +130,7 @@ abd_verify(abd_t *abd) } ASSERT3U(abd->abd_size, ==, child_sizes); } else { + ASSERT3U(abd->abd_size, >, 0); abd_verify_scatter(abd); } #endif @@ -369,7 +370,20 @@ abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) * will retain all the free_on_free settings after being * added to the parents list. */ +#ifdef ZFS_DEBUG + /* + * If cabd had abd_parent, we have to drop it here. We can't + * transfer it to pabd, nor we can clear abd_size leaving it. 
+ */ + if (cabd->abd_parent != NULL) { + (void) zfs_refcount_remove_many( + &cabd->abd_parent->abd_children, + cabd->abd_size, cabd); + cabd->abd_parent = NULL; + } +#endif pabd->abd_size += cabd->abd_size; + cabd->abd_size = 0; list_move_tail(&ABD_GANG(pabd).abd_gang_chain, &ABD_GANG(cabd).abd_gang_chain); ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); @@ -407,7 +421,6 @@ abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) */ if (abd_is_gang(cabd)) { ASSERT(!list_link_active(&cabd->abd_gang_link)); - ASSERT(!list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); return (abd_gang_add_gang(pabd, cabd, free_on_free)); } ASSERT(!abd_is_gang(cabd)); @@ -667,15 +680,15 @@ abd_return_buf(abd_t *abd, void *buf, size_t n) { abd_verify(abd); ASSERT3U(abd->abd_size, >=, n); +#ifdef ZFS_DEBUG + (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); +#endif if (abd_is_linear(abd)) { ASSERT3P(buf, ==, abd_to_buf(abd)); } else { ASSERT0(abd_cmp_buf(abd, buf, n)); zio_buf_free(buf, n); } -#ifdef ZFS_DEBUG - (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); -#endif } void @@ -789,13 +802,10 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); - boolean_t gang = abd_is_gang(abd); abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); while (size > 0) { - /* If we are at the end of the gang ABD we are done */ - if (gang && !c_abd) - break; + IMPLY(abd_is_gang(abd), c_abd != NULL); abd_iter_map(&aiter); @@ -816,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, return (ret); } +#if defined(__linux__) && defined(_KERNEL) +int +abd_iterate_page_func(abd_t *abd, size_t off, size_t size, + abd_iter_page_func_t *func, void *private) +{ + struct abd_iter aiter; + int ret = 0; + + if (size == 0) + return (0); + + abd_verify(abd); + ASSERT3U(off + size, <=, abd->abd_size); + + abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); + + while (size > 0) { + IMPLY(abd_is_gang(abd), c_abd != NULL); + + abd_iter_page(&aiter); + + size_t len = MIN(aiter.iter_page_dsize, size); + ASSERT3U(len, >, 0); + + ret = func(aiter.iter_page, aiter.iter_page_doff, + len, private); + + aiter.iter_page = NULL; + aiter.iter_page_doff = 0; + aiter.iter_page_dsize = 0; + + if (ret != 0) + break; + + size -= len; + c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len); + } + + return (ret); +} +#endif + struct buf_arg { void *arg_buf; }; @@ -889,10 +941,10 @@ abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) &ba_ptr); } -/*ARGSUSED*/ static int abd_zero_off_cb(void *buf, size_t size, void *private) { + (void) private; (void) memset(buf, 0, size); return (0); } @@ -917,7 +969,6 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, { int ret = 0; struct abd_iter daiter, saiter; - boolean_t dabd_is_gang_abd, sabd_is_gang_abd; abd_t *c_dabd, *c_sabd; if (size == 0) @@ -929,16 +980,12 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, ASSERT3U(doff + size, <=, dabd->abd_size); ASSERT3U(soff + size, <=, sabd->abd_size); - dabd_is_gang_abd = abd_is_gang(dabd); - sabd_is_gang_abd = abd_is_gang(sabd); c_dabd = abd_init_abd_iter(dabd, &daiter, doff); c_sabd = abd_init_abd_iter(sabd, &saiter, soff); while (size > 0) { - /* if we are at the end of the gang ABD we are done */ - if ((dabd_is_gang_abd && !c_dabd) || - (sabd_is_gang_abd && !c_sabd)) - break; + IMPLY(abd_is_gang(dabd), c_dabd != NULL); + IMPLY(abd_is_gang(sabd), c_sabd != NULL); abd_iter_map(&daiter); abd_iter_map(&saiter); 
@@ -967,10 +1014,10 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, return (ret); } -/*ARGSUSED*/ static int abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) { + (void) private; (void) memcpy(dbuf, sbuf, size); return (0); } @@ -985,10 +1032,10 @@ abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) abd_copy_off_cb, NULL); } -/*ARGSUSED*/ static int abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) { + (void) private; return (memcmp(bufa, bufb, size)); } @@ -1012,87 +1059,63 @@ abd_cmp(abd_t *dabd, abd_t *sabd) * is the same when taking linear and when taking scatter */ void -abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, - ssize_t csize, ssize_t dsize, const unsigned parity, +abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off, + size_t csize, size_t dsize, const unsigned parity, void (*func_raidz_gen)(void **, const void *, size_t, size_t)) { int i; - ssize_t len, dlen; + size_t len, dlen; struct abd_iter caiters[3]; - struct abd_iter daiter = {0}; - void *caddrs[3]; + struct abd_iter daiter; + void *caddrs[3], *daddr; unsigned long flags __maybe_unused = 0; abd_t *c_cabds[3]; abd_t *c_dabd = NULL; - boolean_t cabds_is_gang_abd[3]; - boolean_t dabd_is_gang_abd = B_FALSE; ASSERT3U(parity, <=, 3); - for (i = 0; i < parity; i++) { - cabds_is_gang_abd[i] = abd_is_gang(cabds[i]); - c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0); + abd_verify(cabds[i]); + ASSERT3U(off + csize, <=, cabds[i]->abd_size); + c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], off); } - if (dabd) { - dabd_is_gang_abd = abd_is_gang(dabd); - c_dabd = abd_init_abd_iter(dabd, &daiter, 0); + if (dsize > 0) { + ASSERT(dabd); + abd_verify(dabd); + ASSERT3U(off + dsize, <=, dabd->abd_size); + c_dabd = abd_init_abd_iter(dabd, &daiter, off); } - ASSERT3S(dsize, >=, 0); - abd_enter_critical(flags); while (csize > 0) { - /* if we are at the end of the gang ABD we are done */ - if (dabd_is_gang_abd && !c_dabd) - break; - + len = csize; for (i = 0; i < parity; i++) { - /* - * If we are at the end of the gang ABD we are - * done. - */ - if (cabds_is_gang_abd[i] && !c_cabds[i]) - break; + IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL); abd_iter_map(&caiters[i]); caddrs[i] = caiters[i].iter_mapaddr; + len = MIN(caiters[i].iter_mapsize, len); } - len = csize; - - if (dabd && dsize > 0) + if (dsize > 0) { + IMPLY(abd_is_gang(dabd), c_dabd != NULL); abd_iter_map(&daiter); - - switch (parity) { - case 3: - len = MIN(caiters[2].iter_mapsize, len); - fallthrough; - case 2: - len = MIN(caiters[1].iter_mapsize, len); - fallthrough; - case 1: - len = MIN(caiters[0].iter_mapsize, len); - } - - /* must be progressive */ - ASSERT3S(len, >, 0); - - if (dabd && dsize > 0) { - /* this needs precise iter.length */ + daddr = daiter.iter_mapaddr; len = MIN(daiter.iter_mapsize, len); dlen = len; - } else + } else { + daddr = NULL; dlen = 0; + } /* must be progressive */ - ASSERT3S(len, >, 0); + ASSERT3U(len, >, 0); /* * The iterated function likely will not do well if each * segment except the last one is not multiple of 512 (raidz). 
*/ ASSERT3U(((uint64_t)len & 511ULL), ==, 0); - func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen); + func_raidz_gen(caddrs, daddr, len, dlen); for (i = parity-1; i >= 0; i--) { abd_iter_unmap(&caiters[i]); @@ -1101,7 +1124,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, &caiters[i], len); } - if (dabd && dsize > 0) { + if (dsize > 0) { abd_iter_unmap(&daiter); c_dabd = abd_advance_abd_iter(dabd, c_dabd, &daiter, @@ -1110,9 +1133,6 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, } csize -= len; - - ASSERT3S(dsize, >=, 0); - ASSERT3S(csize, >=, 0); } abd_exit_critical(flags); } @@ -1129,27 +1149,27 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, */ void abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, - ssize_t tsize, const unsigned parity, + size_t tsize, const unsigned parity, void (*func_raidz_rec)(void **t, const size_t tsize, void **c, const unsigned *mul), const unsigned *mul) { int i; - ssize_t len; + size_t len; struct abd_iter citers[3]; struct abd_iter xiters[3]; void *caddrs[3], *xaddrs[3]; unsigned long flags __maybe_unused = 0; - boolean_t cabds_is_gang_abd[3]; - boolean_t tabds_is_gang_abd[3]; abd_t *c_cabds[3]; abd_t *c_tabds[3]; ASSERT3U(parity, <=, 3); for (i = 0; i < parity; i++) { - cabds_is_gang_abd[i] = abd_is_gang(cabds[i]); - tabds_is_gang_abd[i] = abd_is_gang(tabds[i]); + abd_verify(cabds[i]); + abd_verify(tabds[i]); + ASSERT3U(tsize, <=, cabds[i]->abd_size); + ASSERT3U(tsize, <=, tabds[i]->abd_size); c_cabds[i] = abd_init_abd_iter(cabds[i], &citers[i], 0); c_tabds[i] = @@ -1158,36 +1178,18 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, abd_enter_critical(flags); while (tsize > 0) { - + len = tsize; for (i = 0; i < parity; i++) { - /* - * If we are at the end of the gang ABD we - * are done. - */ - if (cabds_is_gang_abd[i] && !c_cabds[i]) - break; - if (tabds_is_gang_abd[i] && !c_tabds[i]) - break; + IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL); + IMPLY(abd_is_gang(tabds[i]), c_tabds[i] != NULL); abd_iter_map(&citers[i]); abd_iter_map(&xiters[i]); caddrs[i] = citers[i].iter_mapaddr; xaddrs[i] = xiters[i].iter_mapaddr; + len = MIN(citers[i].iter_mapsize, len); + len = MIN(xiters[i].iter_mapsize, len); } - len = tsize; - switch (parity) { - case 3: - len = MIN(xiters[2].iter_mapsize, len); - len = MIN(citers[2].iter_mapsize, len); - fallthrough; - case 2: - len = MIN(xiters[1].iter_mapsize, len); - len = MIN(citers[1].iter_mapsize, len); - fallthrough; - case 1: - len = MIN(xiters[0].iter_mapsize, len); - len = MIN(citers[0].iter_mapsize, len); - } /* must be progressive */ ASSERT3S(len, >, 0); /* diff --git a/sys/contrib/openzfs/module/zfs/aggsum.c b/sys/contrib/openzfs/module/zfs/aggsum.c index c4ea4f86fc5f..488c6ef3b6fc 100644 --- a/sys/contrib/openzfs/module/zfs/aggsum.c +++ b/sys/contrib/openzfs/module/zfs/aggsum.c @@ -87,7 +87,7 @@ static uint_t aggsum_borrow_shift = 4; void aggsum_init(aggsum_t *as, uint64_t value) { - bzero(as, sizeof (*as)); + memset(as, 0, sizeof (*as)); as->as_lower_bound = as->as_upper_bound = value; mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL); /* diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index 79e2d4381830..30d30b98a6c6 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
+ * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -108,12 +108,11 @@ * the active state mutex must be held before the ghost state mutex. * * It as also possible to register a callback which is run when the - * arc_meta_limit is reached and no buffers can be safely evicted. In + * metadata limit is reached and no buffers can be safely evicted. In * this case the arc user should drop a reference on some arc buffers so - * they can be reclaimed and the arc_meta_limit honored. For example, - * when using the ZPL each dentry holds a references on a znode. These - * dentries must be pruned before the arc buffer holding the znode can - * be safely evicted. + * they can be reclaimed. For example, when using the ZPL each dentry + * holds a references on a znode. These dentries must be pruned before + * the arc buffer holding the znode can be safely evicted. * * Note that the majority of the performance stats are manipulated * with atomic operations. @@ -250,7 +249,7 @@ * since the physical block is about to be rewritten. The new data contents * will be contained in the arc_buf_t. As the I/O pipeline performs the write, * it may compress the data before writing it to disk. The ARC will be called - * with the transformed data and will bcopy the transformed on-disk block into + * with the transformed data and will memcpy the transformed on-disk block into * a newly allocated b_pabd. Writes are always done into buffers which have * either been loaned (and hence are new and don't have other readers) or * buffers which have been released (and hence have their own hdr, if there @@ -328,9 +327,12 @@ static zthr_t *arc_reap_zthr; * arc_evict(), which improves arc_is_overflowing(). */ static zthr_t *arc_evict_zthr; +static arc_buf_hdr_t **arc_state_evict_markers; +static int arc_state_evict_marker_count; static kmutex_t arc_evict_lock; static boolean_t arc_evict_needed = B_FALSE; +static clock_t arc_last_uncached_flush; /* * Count of bytes evicted since boot. @@ -352,7 +354,7 @@ static list_t arc_evict_waiters; * can still happen, even during the potentially long time that arc_size is * more than arc_c. */ -int zfs_arc_eviction_pct = 200; +static uint_t zfs_arc_eviction_pct = 200; /* * The number of headers to evict in arc_evict_state_impl() before @@ -361,24 +363,21 @@ int zfs_arc_eviction_pct = 200; * oldest header in the arc state), but comes with higher overhead * (i.e. more invocations of arc_evict_state_impl()). */ -int zfs_arc_evict_batch_limit = 10; +static uint_t zfs_arc_evict_batch_limit = 10; /* number of seconds before growing cache again */ -int arc_grow_retry = 5; +uint_t arc_grow_retry = 5; /* * Minimum time between calls to arc_kmem_reap_soon(). */ -int arc_kmem_cache_reap_retry_ms = 1000; +static const int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ -int zfs_arc_overflow_shift = 8; - -/* shift of arc_c for calculating both min and max arc_p */ -int arc_p_min_shift = 4; +static int zfs_arc_overflow_shift = 8; /* log2(fraction of arc to reclaim) */ -int arc_shrink_shift = 7; +uint_t arc_shrink_shift = 7; /* percent of pagecache to reclaim arc to */ #ifdef _KERNEL @@ -394,20 +393,20 @@ uint_t zfs_arc_pc_percent = 0; * This must be less than arc_shrink_shift, so that when we shrink the ARC, * we will still not allow it to grow. 
*/ -int arc_no_grow_shift = 5; +uint_t arc_no_grow_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ -static int arc_min_prefetch_ms; -static int arc_min_prescient_prefetch_ms; +static uint_t arc_min_prefetch_ms; +static uint_t arc_min_prescient_prefetch_ms; /* * If this percent of memory is free, don't throttle. */ -int arc_lotsfree_percent = 10; +uint_t arc_lotsfree_percent = 10; /* * The arc has filled available memory and has now warmed up. @@ -417,23 +416,23 @@ boolean_t arc_warm; /* * These tunables are for performance analysis. */ -unsigned long zfs_arc_max = 0; -unsigned long zfs_arc_min = 0; -unsigned long zfs_arc_meta_limit = 0; -unsigned long zfs_arc_meta_min = 0; -unsigned long zfs_arc_dnode_limit = 0; -unsigned long zfs_arc_dnode_reduce_percent = 10; -int zfs_arc_grow_retry = 0; -int zfs_arc_shrink_shift = 0; -int zfs_arc_p_min_shift = 0; -int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ +uint64_t zfs_arc_max = 0; +uint64_t zfs_arc_min = 0; +static uint64_t zfs_arc_dnode_limit = 0; +static uint_t zfs_arc_dnode_reduce_percent = 10; +static uint_t zfs_arc_grow_retry = 0; +static uint_t zfs_arc_shrink_shift = 0; +uint_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ /* - * ARC dirty data constraints for arc_tempreserve_space() throttle. + * ARC dirty data constraints for arc_tempreserve_space() throttle: + * * total dirty data limit + * * anon block dirty limit + * * each pool's anon allowance */ -unsigned long zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */ -unsigned long zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */ -unsigned long zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */ +static const unsigned long zfs_arc_dirty_limit_percent = 50; +static const unsigned long zfs_arc_anon_limit_percent = 25; +static const unsigned long zfs_arc_pool_dirty_percent = 20; /* * Enable or disable compressed arc buffers. @@ -441,51 +440,60 @@ unsigned long zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */ int zfs_compressed_arc_enabled = B_TRUE; /* - * ARC will evict meta buffers that exceed arc_meta_limit. This - * tunable make arc_meta_limit adjustable for different workloads. + * Balance between metadata and data on ghost hits. Values above 100 + * increase metadata caching by proportionally reducing effect of ghost + * data hits on target data/metadata rate. */ -unsigned long zfs_arc_meta_limit_percent = 75; +static uint_t zfs_arc_meta_balance = 500; /* * Percentage that can be consumed by dnodes of ARC meta buffers. 
*/ -unsigned long zfs_arc_dnode_limit_percent = 10; +static uint_t zfs_arc_dnode_limit_percent = 10; + +/* + * These tunables are Linux-specific + */ +static uint64_t zfs_arc_sys_free = 0; +static uint_t zfs_arc_min_prefetch_ms = 0; +static uint_t zfs_arc_min_prescient_prefetch_ms = 0; +static uint_t zfs_arc_lotsfree_percent = 10; /* - * These tunables are Linux specific + * Number of arc_prune threads */ -unsigned long zfs_arc_sys_free = 0; -int zfs_arc_min_prefetch_ms = 0; -int zfs_arc_min_prescient_prefetch_ms = 0; -int zfs_arc_p_dampener_disable = 1; -int zfs_arc_meta_prune = 10000; -int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED; -int zfs_arc_meta_adjust_restarts = 4096; -int zfs_arc_lotsfree_percent = 10; +static int zfs_arc_prune_task_threads = 1; -/* The 6 states: */ +/* The 7 states: */ arc_state_t ARC_anon; arc_state_t ARC_mru; arc_state_t ARC_mru_ghost; arc_state_t ARC_mfu; arc_state_t ARC_mfu_ghost; arc_state_t ARC_l2c_only; +arc_state_t ARC_uncached; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, + { "iohits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, + { "demand_data_iohits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, + { "demand_metadata_iohits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, + { "prefetch_data_iohits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, + { "prefetch_metadata_iohits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, + { "uncached_hits", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "access_skip", KSTAT_DATA_UINT64 }, @@ -502,7 +510,9 @@ arc_stats_t arc_stats = { { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, - { "p", KSTAT_DATA_UINT64 }, + { "meta", KSTAT_DATA_UINT64 }, + { "pd", KSTAT_DATA_UINT64 }, + { "pm", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, @@ -520,20 +530,35 @@ arc_stats_t arc_stats = { { "other_size", KSTAT_DATA_UINT64 }, #endif { "anon_size", KSTAT_DATA_UINT64 }, + { "anon_data", KSTAT_DATA_UINT64 }, + { "anon_metadata", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_size", KSTAT_DATA_UINT64 }, + { "mru_data", KSTAT_DATA_UINT64 }, + { "mru_metadata", KSTAT_DATA_UINT64 }, { "mru_evictable_data", KSTAT_DATA_UINT64 }, { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_size", KSTAT_DATA_UINT64 }, + { "mru_ghost_data", KSTAT_DATA_UINT64 }, + { "mru_ghost_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_size", KSTAT_DATA_UINT64 }, + { "mfu_data", KSTAT_DATA_UINT64 }, + { "mfu_metadata", KSTAT_DATA_UINT64 }, { "mfu_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_size", KSTAT_DATA_UINT64 }, + { "mfu_ghost_data", KSTAT_DATA_UINT64 }, + { "mfu_ghost_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_metadata", 
KSTAT_DATA_UINT64 }, + { "uncached_size", KSTAT_DATA_UINT64 }, + { "uncached_data", KSTAT_DATA_UINT64 }, + { "uncached_metadata", KSTAT_DATA_UINT64 }, + { "uncached_evictable_data", KSTAT_DATA_UINT64 }, + { "uncached_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_prefetch_asize", KSTAT_DATA_UINT64 }, @@ -586,13 +611,14 @@ arc_stats_t arc_stats = { { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, { "arc_prune", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, - { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_dnode_limit", KSTAT_DATA_UINT64 }, - { "arc_meta_max", KSTAT_DATA_UINT64 }, - { "arc_meta_min", KSTAT_DATA_UINT64 }, { "async_upgrade_sync", KSTAT_DATA_UINT64 }, + { "predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, + { "demand_iohit_predictive_prefetch", KSTAT_DATA_UINT64 }, + { "prescient_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, + { "demand_iohit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 }, { "arc_sys_free", KSTAT_DATA_UINT64 }, { "arc_raw_size", KSTAT_DATA_UINT64 }, @@ -646,7 +672,7 @@ arc_sums_t arc_sums; ARCSTAT(stat) = x; \ } while (0) -kstat_t *arc_ksp; +static kstat_t *arc_ksp; /* * There are several ARC variables that are critical to export as kstats -- @@ -658,10 +684,7 @@ kstat_t *arc_ksp; */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) -#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ -/* max size for dnodes */ -#define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit) -#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ +#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ #define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */ hrtime_t arc_growtime; @@ -683,6 +706,7 @@ taskq_t *arc_prune_taskq; ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) +#define HDR_UNCACHED(hdr) ((hdr)->b_flags & ARC_FLAG_UNCACHED) #define HDR_L2_READING(hdr) \ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) @@ -724,8 +748,7 @@ taskq_t *arc_prune_taskq; * Other sizes */ -#define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) -#define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr)) +#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* @@ -753,8 +776,8 @@ uint64_t zfs_crc64_table[256]; * Level 2 ARC */ -#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ -#define L2ARC_HEADROOM 2 /* num of writes */ +#define L2ARC_WRITE_SIZE (32 * 1024 * 1024) /* initial write max */ +#define L2ARC_HEADROOM 8 /* num of writes */ /* * If we discover during ARC scan any buffers to be compressed, we boost @@ -771,16 +794,16 @@ uint64_t zfs_crc64_table[256]; #define L2ARC_FEED_TYPES 4 /* L2ARC Performance Tunables */ -unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ -unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ -unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ -unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; -unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ -unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ 
+uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ +uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ +uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ +uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; +uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ int l2arc_feed_again = B_TRUE; /* turbo warmup */ int l2arc_norw = B_FALSE; /* no reads during writes */ -int l2arc_meta_percent = 33; /* limit on headers size */ +static uint_t l2arc_meta_percent = 33; /* limit on headers size */ /* * L2ARC Internals @@ -833,21 +856,24 @@ static kcondvar_t l2arc_rebuild_thr_cv; enum arc_hdr_alloc_flags { ARC_HDR_ALLOC_RDATA = 0x1, - ARC_HDR_DO_ADAPT = 0x2, ARC_HDR_USE_RESERVE = 0x4, + ARC_HDR_ALLOC_LINEAR = 0x8, }; -static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, int); -static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); -static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, int); -static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); -static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); -static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); +static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, const void *, int); +static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, const void *); +static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, const void *, int); +static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, const void *); +static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, const void *); +static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, + const void *tag); static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t); static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int); -static void arc_access(arc_buf_hdr_t *, kmutex_t *); +static void arc_hdr_destroy(arc_buf_hdr_t *); +static void arc_access(arc_buf_hdr_t *, arc_flags_t, boolean_t); static void arc_buf_watch(arc_buf_t *); +static void arc_change_state(arc_state_t *, arc_buf_hdr_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); @@ -860,6 +886,8 @@ static void l2arc_do_free_on_write(void); static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only); +static void arc_prune_async(uint64_t adjust); + #define l2arc_hdr_arcstats_increment(hdr) \ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE) #define l2arc_hdr_arcstats_decrement(hdr) \ @@ -881,7 +909,7 @@ int l2arc_exclude_special = 0; * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU * metadata and data are cached from ARC into L2ARC. */ -int l2arc_mfuonly = 0; +static int l2arc_mfuonly = 0; /* * L2ARC TRIM @@ -898,7 +926,7 @@ int l2arc_mfuonly = 0; * will vary depending of how well the specific device handles * these commands. */ -unsigned long l2arc_trim_ahead = 0; +static uint64_t l2arc_trim_ahead = 0; /* * Performance tuning of L2ARC persistence: @@ -913,12 +941,12 @@ unsigned long l2arc_trim_ahead = 0; * data. In this case do not write log blocks in L2ARC in order * not to waste space. 
*/ -int l2arc_rebuild_enabled = B_TRUE; -unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; +static int l2arc_rebuild_enabled = B_TRUE; +static uint64_t l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; /* L2ARC persistence rebuild control routines. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); -static void l2arc_dev_rebuild_thread(void *arg); +static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg); static int l2arc_rebuild(l2arc_dev_t *dev); /* L2ARC persistence read I/O routines. */ @@ -938,7 +966,7 @@ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. */ -static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, +static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); /* L2ARC persistence auxiliary routines. */ @@ -986,7 +1014,7 @@ static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); - uint64_t birth = BP_PHYSICAL_BIRTH(bp); + uint64_t birth = BP_GET_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *hdr; @@ -1086,15 +1114,12 @@ buf_hash_remove(arc_buf_hdr_t *hdr) */ static kmem_cache_t *hdr_full_cache; -static kmem_cache_t *hdr_full_crypt_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void buf_fini(void) { - int i; - #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages @@ -1106,10 +1131,9 @@ buf_fini(void) kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #endif - for (i = 0; i < BUF_LOCKS; i++) + for (int i = 0; i < BUF_LOCKS; i++) mutex_destroy(BUF_HASH_LOCK(i)); kmem_cache_destroy(hdr_full_cache); - kmem_cache_destroy(hdr_full_crypt_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } @@ -1118,58 +1142,44 @@ buf_fini(void) * Constructor callback - called when the cache is empty * and a new buf is requested. 
*/ -/* ARGSUSED */ static int hdr_full_cons(void *vbuf, void *unused, int kmflag) { + (void) unused, (void) kmflag; arc_buf_hdr_t *hdr = vbuf; - bzero(hdr, HDR_FULL_SIZE); + memset(hdr, 0, HDR_FULL_SIZE); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); +#ifdef ZFS_DEBUG mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - list_link_init(&hdr->b_l1hdr.b_arc_node); - list_link_init(&hdr->b_l2hdr.b_l2node); +#endif multilist_link_init(&hdr->b_l1hdr.b_arc_node); + list_link_init(&hdr->b_l2hdr.b_l2node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); } -/* ARGSUSED */ -static int -hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag) -{ - arc_buf_hdr_t *hdr = vbuf; - - hdr_full_cons(vbuf, unused, kmflag); - bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr)); - arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); - - return (0); -} - -/* ARGSUSED */ static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { + (void) unused, (void) kmflag; arc_buf_hdr_t *hdr = vbuf; - bzero(hdr, HDR_L2ONLY_SIZE); + memset(hdr, 0, HDR_L2ONLY_SIZE); arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } -/* ARGSUSED */ static int buf_cons(void *vbuf, void *unused, int kmflag) { + (void) unused, (void) kmflag; arc_buf_t *buf = vbuf; - bzero(buf, sizeof (arc_buf_t)); - mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); + memset(buf, 0, sizeof (arc_buf_t)); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); @@ -1179,47 +1189,37 @@ buf_cons(void *vbuf, void *unused, int kmflag) * Destructor callback - called when a cached buf is * no longer required. */ -/* ARGSUSED */ static void hdr_full_dest(void *vbuf, void *unused) { + (void) unused; arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); - cv_destroy(&hdr->b_l1hdr.b_cv); zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt); +#ifdef ZFS_DEBUG mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); +#endif ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } -/* ARGSUSED */ -static void -hdr_full_crypt_dest(void *vbuf, void *unused) -{ - arc_buf_hdr_t *hdr = vbuf; - - hdr_full_dest(vbuf, unused); - arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); -} - -/* ARGSUSED */ static void hdr_l2only_dest(void *vbuf, void *unused) { - arc_buf_hdr_t *hdr __maybe_unused = vbuf; + (void) unused; + arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } -/* ARGSUSED */ static void buf_dest(void *vbuf, void *unused) { - arc_buf_t *buf = vbuf; + (void) unused; + (void) vbuf; - mutex_destroy(&buf->b_evict_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } @@ -1259,9 +1259,6 @@ retry: hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); - hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt", - HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest, - NULL, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, NULL, NULL, 0); @@ -1324,9 +1321,9 @@ arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, ASSERT(HDR_PROTECTED(hdr)); - bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); - bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); - bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); + 
memcpy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); + memcpy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); + memcpy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; } @@ -1369,7 +1366,7 @@ arc_buf_is_shared(arc_buf_t *buf) abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); - IMPLY(shared, ARC_BUF_SHARED(buf)); + EQUIV(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); /* @@ -1387,6 +1384,7 @@ arc_buf_is_shared(arc_buf_t *buf) static inline void arc_cksum_free(arc_buf_hdr_t *hdr) { +#ifdef ZFS_DEBUG ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); @@ -1395,6 +1393,7 @@ arc_cksum_free(arc_buf_hdr_t *hdr) hdr->b_l1hdr.b_freeze_cksum = NULL; } mutex_exit(&hdr->b_l1hdr.b_freeze_lock); +#endif } /* @@ -1423,6 +1422,7 @@ arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) static void arc_cksum_verify(arc_buf_t *buf) { +#ifdef ZFS_DEBUG arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; @@ -1445,6 +1445,7 @@ arc_cksum_verify(arc_buf_t *buf) if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); +#endif } /* @@ -1485,14 +1486,13 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) static void arc_cksum_compute(arc_buf_t *buf) { - arc_buf_hdr_t *hdr = buf->b_hdr; - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; +#ifdef ZFS_DEBUG + arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_HAS_L1HDR(hdr)); - - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; @@ -1505,6 +1505,7 @@ arc_cksum_compute(arc_buf_t *buf) fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, hdr->b_l1hdr.b_freeze_cksum); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); +#endif arc_buf_watch(buf); } @@ -1512,11 +1513,11 @@ arc_cksum_compute(arc_buf_t *buf) void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) { + (void) sig, (void) unused; panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr); } #endif -/* ARGSUSED */ static void arc_buf_unwatch(arc_buf_t *buf) { @@ -1525,10 +1526,11 @@ arc_buf_unwatch(arc_buf_t *buf) ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ | PROT_WRITE)); } +#else + (void) buf; #endif } -/* ARGSUSED */ static void arc_buf_watch(arc_buf_t *buf) { @@ -1536,6 +1538,8 @@ arc_buf_watch(arc_buf_t *buf) if (arc_watch) ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ)); +#else + (void) buf; #endif } @@ -1681,18 +1685,20 @@ arc_buf_try_copy_decompressed_data(arc_buf_t *buf) } if (!ARC_BUF_COMPRESSED(from)) { - bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); + memcpy(buf->b_data, from->b_data, arc_buf_size(buf)); copied = B_TRUE; break; } } +#ifdef ZFS_DEBUG /* * There were no decompressed bufs, so there should not be a * checksum on the hdr either. 
*/ if (zfs_flags & ZFS_DEBUG_MODIFY) EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); +#endif return (copied); } @@ -1778,12 +1784,13 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { - tmpbuf = zio_buf_alloc(lsize); - abd = abd_get_from_buf(tmpbuf, lsize); - abd_take_ownership_of_buf(abd, B_TRUE); + csize = zio_compress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pabd, tmpbuf, lsize, hdr->b_complevel); + hdr->b_l1hdr.b_pabd, &tmpbuf, lsize, hdr->b_complevel); + ASSERT3P(tmpbuf, !=, NULL); ASSERT3U(csize, <=, psize); + abd = abd_get_from_buf(tmpbuf, lsize); + abd_take_ownership_of_buf(abd, B_TRUE); abd_zero_off(abd, csize, psize - csize); } @@ -1836,7 +1843,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_ENCRYPTED(hdr)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + arc_hdr_alloc_abd(hdr, 0); ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, @@ -1863,8 +1870,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. */ - cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - ARC_HDR_DO_ADAPT); + cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0); tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), @@ -1947,20 +1953,19 @@ error: * arc_buf_fill(). */ static void -arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock) +arc_buf_untransform_in_place(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_ENCRYPTED(hdr)); ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + ASSERT3PF(hdr->b_l1hdr.b_pabd, !=, NULL, "hdr %px buf %px", hdr, buf); zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; - hdr->b_crypt_hdr.b_ebufcnt -= 1; } /* @@ -1995,7 +2000,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, IMPLY(encrypted, HDR_ENCRYPTED(hdr)); IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf)); IMPLY(encrypted, ARC_BUF_COMPRESSED(buf)); - IMPLY(encrypted, !ARC_BUF_SHARED(buf)); + IMPLY(encrypted, !arc_buf_is_shared(buf)); /* * If the caller wanted encrypted data we just need to copy it from @@ -2051,7 +2056,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, if (hash_lock != NULL) mutex_enter(hash_lock); - arc_buf_untransform_in_place(buf, hash_lock); + arc_buf_untransform_in_place(buf); if (hash_lock != NULL) mutex_exit(hash_lock); @@ -2063,21 +2068,23 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, } if (hdr_compressed == compressed) { - if (!arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { + ASSERT(arc_buf_is_shared(buf)); + } else { abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } } else { ASSERT(hdr_compressed); ASSERT(!compressed); - ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); /* * If the buf is sharing its data with the hdr, unlink it and * allocate a new data buffer for the buf. 
*/ - if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_COMPRESSED(buf)); + if (ARC_BUF_SHARED(buf)) { + ASSERTF(ARC_BUF_COMPRESSED(buf), + "buf %p was uncompressed", buf); /* We need to give the buf its own b_data */ buf->b_flags &= ~ARC_BUF_FLAG_SHARED; @@ -2088,6 +2095,8 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, /* Previously overhead was 0; just add new overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } else if (ARC_BUF_COMPRESSED(buf)) { + ASSERT(!arc_buf_is_shared(buf)); + /* We need to reallocate the buf's b_data */ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), buf); @@ -2175,7 +2184,7 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, * (and generate an ereport) before leaving the ARC. */ ret = SET_ERROR(EIO); - spa_log_error(spa, zb); + spa_log_error(spa, zb, buf->b_hdr->b_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } @@ -2196,7 +2205,6 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); @@ -2216,7 +2224,7 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_buf_size(buf), buf); @@ -2236,7 +2244,6 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); @@ -2256,7 +2263,7 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); @@ -2270,33 +2277,22 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) * it is not evictable. */ static void -add_reference(arc_buf_hdr_t *hdr, void *tag) +add_reference(arc_buf_hdr_t *hdr, const void *tag) { - arc_state_t *state; + arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { - ASSERT(hdr->b_l1hdr.b_state == arc_anon); + ASSERT(state == arc_anon); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); } - state = hdr->b_l1hdr.b_state; - if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && - (state != arc_anon)) { + state != arc_anon && state != arc_l2c_only) { /* We don't use the L2-only state list. */ - if (state != arc_l2c_only) { - multilist_remove(&state->arcs_list[arc_buf_type(hdr)], - hdr); - arc_evictable_space_decrement(hdr, state); - } - /* remove the prefetch flag if we get a reference */ - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); + multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); + arc_evictable_space_decrement(hdr, state); } } @@ -2306,26 +2302,30 @@ add_reference(arc_buf_hdr_t *hdr, void *tag) * list making it eligible for eviction. 
*/ static int -remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) +remove_reference(arc_buf_hdr_t *hdr, const void *tag) { int cnt; arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); - ASSERT(!GHOST_STATE(state)); + ASSERT(state == arc_anon || MUTEX_HELD(HDR_LOCK(hdr))); + ASSERT(!GHOST_STATE(state)); /* arc_l2c_only counts as a ghost. */ - /* - * arc_l2c_only counts as a ghost state so we don't need to explicitly - * check to prevent usage of the arc_l2c_only list. - */ - if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && - (state != arc_anon)) { - multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); - arc_evictable_space_increment(hdr, state); + if ((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) != 0) + return (cnt); + + if (state == arc_anon) { + arc_hdr_destroy(hdr); + return (0); + } + if (state == arc_uncached && !HDR_PREFETCH(hdr)) { + arc_change_state(arc_anon, hdr); + arc_hdr_destroy(hdr); + return (0); } - return (cnt); + multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); + arc_evictable_space_increment(hdr, state); + return (0); } /* @@ -2338,6 +2338,7 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) void arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) { + (void) state_index; arc_buf_hdr_t *hdr = ab->b_hdr; l1arc_buf_hdr_t *l1hdr = NULL; l2arc_buf_hdr_t *l2hdr = NULL; @@ -2358,7 +2359,9 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) l2hdr = &hdr->b_l2hdr; if (l1hdr) { - abi->abi_bufcnt = l1hdr->b_bufcnt; + abi->abi_bufcnt = 0; + for (arc_buf_t *buf = l1hdr->b_buf; buf; buf = buf->b_next) + abi->abi_bufcnt++; abi->abi_access = l1hdr->b_arc_access; abi->abi_mru_hits = l1hdr->b_mru_hits; abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; @@ -2382,14 +2385,12 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) * for the buffer must be held by the caller. 
*/ static void -arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, - kmutex_t *hash_lock) +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) { arc_state_t *old_state; int64_t refcnt; - uint32_t bufcnt; boolean_t update_old, update_new; - arc_buf_contents_t buftype = arc_buf_type(hdr); + arc_buf_contents_t type = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() @@ -2401,21 +2402,26 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); - bufcnt = hdr->b_l1hdr.b_bufcnt; - update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL || - HDR_HAS_RABD(hdr)); + update_old = (hdr->b_l1hdr.b_buf != NULL || + hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); + + IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL); + IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL); + IMPLY(old_state == arc_anon, hdr->b_l1hdr.b_buf == NULL || + ARC_BUF_LAST(hdr->b_l1hdr.b_buf)); } else { old_state = arc_l2c_only; refcnt = 0; - bufcnt = 0; update_old = B_FALSE; } update_new = update_old; + if (GHOST_STATE(old_state)) + update_old = B_TRUE; + if (GHOST_STATE(new_state)) + update_new = B_TRUE; - ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(new_state, !=, old_state); - ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); - ASSERT(old_state != arc_anon || bufcnt <= 1); /* * If this buffer is evictable, transfer it from the @@ -2424,14 +2430,12 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); - multilist_remove(&old_state->arcs_list[buftype], hdr); - - if (GHOST_STATE(old_state)) { - ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - update_old = B_TRUE; + /* remove_reference() saves on insert. */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + multilist_remove(&old_state->arcs_list[type], + hdr); + arc_evictable_space_decrement(hdr, old_state); } - arc_evictable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { /* @@ -2441,13 +2445,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); - multilist_insert(&new_state->arcs_list[buftype], hdr); - - if (GHOST_STATE(new_state)) { - ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - update_new = B_TRUE; - } + multilist_insert(&new_state->arcs_list[type], hdr); arc_evictable_space_increment(hdr, new_state); } } @@ -2461,21 +2459,19 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { - ASSERT0(bufcnt); /* * When moving a header to a ghost state, we first - * remove all arc buffers. Thus, we'll have a - * bufcnt of zero, and no arc buffer to use for - * the reference. As a result, we use the arc - * header pointer for the reference. + * remove all arc buffers. Thus, we'll have no arc + * buffer to use for the reference. As a result, we + * use the arc header pointer for the reference. 
*/ - (void) zfs_refcount_add_many(&new_state->arcs_size, + (void) zfs_refcount_add_many( + &new_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { - uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -2484,8 +2480,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - ASSERT3U(bufcnt, !=, 0); - buffers++; /* * When the arc_buf_t is sharing the data @@ -2494,24 +2488,23 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * add to the refcount if the arc_buf_t is * not shared. */ - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_add_many( - &new_state->arcs_size, + &new_state->arcs_size[type], arc_buf_size(buf), buf); } - ASSERT3U(bufcnt, ==, buffers); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many( - &new_state->arcs_size, + &new_state->arcs_size[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many( - &new_state->arcs_size, + &new_state->arcs_size[type], HDR_GET_PSIZE(hdr), hdr); } } @@ -2520,7 +2513,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { - ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); @@ -2532,10 +2524,10 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * header on the ghost state. */ - (void) zfs_refcount_remove_many(&old_state->arcs_size, + (void) zfs_refcount_remove_many( + &old_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); } else { - uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -2544,8 +2536,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - ASSERT3U(bufcnt, !=, 0); - buffers++; /* * When the arc_buf_t is sharing the data @@ -2554,27 +2544,26 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * add to the refcount if the arc_buf_t is * not shared. 
*/ - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_remove_many( - &old_state->arcs_size, arc_buf_size(buf), - buf); + &old_state->arcs_size[type], + arc_buf_size(buf), buf); } - ASSERT3U(bufcnt, ==, buffers); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many( - &old_state->arcs_size, arc_hdr_size(hdr), - hdr); + &old_state->arcs_size[type], + arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many( - &old_state->arcs_size, HDR_GET_PSIZE(hdr), - hdr); + &old_state->arcs_size[type], + HDR_GET_PSIZE(hdr), hdr); } } } @@ -2608,7 +2597,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type) ARCSTAT_INCR(arcstat_bonus_size, space); break; case ARC_SPACE_DNODE: - aggsum_add(&arc_sums.arcstat_dnode_size, space); + ARCSTAT_INCR(arcstat_dnode_size, space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, space); @@ -2631,7 +2620,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type) } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) - aggsum_add(&arc_sums.arcstat_meta_used, space); + ARCSTAT_INCR(arcstat_meta_used, space); aggsum_add(&arc_sums.arcstat_size, space); } @@ -2654,7 +2643,7 @@ arc_space_return(uint64_t space, arc_space_type_t type) ARCSTAT_INCR(arcstat_bonus_size, -space); break; case ARC_SPACE_DNODE: - aggsum_add(&arc_sums.arcstat_dnode_size, -space); + ARCSTAT_INCR(arcstat_dnode_size, -space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, -space); @@ -2670,13 +2659,8 @@ arc_space_return(uint64_t space, arc_space_type_t type) break; } - if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) { - ASSERT(aggsum_compare(&arc_sums.arcstat_meta_used, - space) >= 0); - ARCSTAT_MAX(arcstat_meta_max, - aggsum_upper_bound(&arc_sums.arcstat_meta_used)); - aggsum_add(&arc_sums.arcstat_meta_used, -space); - } + if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) + ARCSTAT_INCR(arcstat_meta_used, -space); ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0); aggsum_add(&arc_sums.arcstat_size, -space); @@ -2729,8 +2713,8 @@ arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) */ static int arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, - void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth, - boolean_t fill, arc_buf_t **ret) + const void *tag, boolean_t encrypted, boolean_t compressed, + boolean_t noauth, boolean_t fill, arc_buf_t **ret) { arc_buf_t *buf; arc_fill_flags_t flags = ARC_FILL_LOCKED; @@ -2814,9 +2798,6 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; - hdr->b_l1hdr.b_bufcnt += 1; - if (encrypted) - hdr->b_crypt_hdr.b_ebufcnt += 1; /* * If the user wants the data from the hdr, we need to either copy or @@ -2830,7 +2811,7 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, return (0); } -static char *arc_onloan_tag = "onloan"; +static const char *arc_onloan_tag = "onloan"; static inline void arc_loaned_bytes_update(int64_t delta) @@ -2889,7 +2870,7 @@ arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, * Return a loaned arc buffer to the arc. 
*/ void -arc_return_buf(arc_buf_t *buf, void *tag) +arc_return_buf(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; @@ -2903,7 +2884,7 @@ arc_return_buf(arc_buf_t *buf, void *tag) /* Detach an arc_buf from a dbuf (tag) */ void -arc_loan_inuse_buf(arc_buf_t *buf, void *tag) +arc_loan_inuse_buf(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; @@ -2943,7 +2924,7 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata) (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, hdr); } - (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr); + (void) zfs_refcount_remove_many(&state->arcs_size[type], size, hdr); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { @@ -2976,7 +2957,8 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) * refcount ownership to the hdr since it always owns * the refcount whenever an arc_buf_t is shared. */ - zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, + zfs_refcount_transfer_ownership_many( + &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), buf, hdr); hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, @@ -3005,7 +2987,8 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) * We are no longer sharing this buffer so we need * to transfer its ownership to the rightful owner. */ - zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, + zfs_refcount_transfer_ownership_many( + &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); @@ -3056,8 +3039,6 @@ arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) } buf->b_next = NULL; ASSERT3P(lastbuf, !=, buf); - IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); - IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); return (lastbuf); @@ -3087,31 +3068,30 @@ arc_buf_destroy_impl(arc_buf_t *buf) arc_cksum_verify(buf); arc_buf_unwatch(buf); - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { + ASSERT(!arc_buf_is_shared(buf)); uint64_t size = arc_buf_size(buf); arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); } buf->b_data = NULL; - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); - hdr->b_l1hdr.b_bufcnt -= 1; - - if (ARC_BUF_ENCRYPTED(buf)) { - hdr->b_crypt_hdr.b_ebufcnt -= 1; - - /* - * If we have no more encrypted buffers and we've - * already gotten a copy of the decrypted data we can - * free b_rabd to save some space. - */ - if (hdr->b_crypt_hdr.b_ebufcnt == 0 && - HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL && - !HDR_IO_IN_PROGRESS(hdr)) { - arc_hdr_free_abd(hdr, B_TRUE); + /* + * If we have no more encrypted buffers and we've already + * gotten a copy of the decrypted data we can free b_rabd + * to save some space. 
+ */ + if (ARC_BUF_ENCRYPTED(buf) && HDR_HAS_RABD(hdr) && + hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) { + arc_buf_t *b; + for (b = hdr->b_l1hdr.b_buf; b; b = b->b_next) { + if (b != buf && ARC_BUF_ENCRYPTED(b)) + break; } + if (b == NULL) + arc_hdr_free_abd(hdr, B_TRUE); } } @@ -3132,9 +3112,9 @@ arc_buf_destroy_impl(arc_buf_t *buf) */ if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) { /* Only one buf can be shared at once */ - VERIFY(!arc_buf_is_shared(lastbuf)); + ASSERT(!arc_buf_is_shared(lastbuf)); /* hdr is uncompressed so can't have compressed buf */ - VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); + ASSERT(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); arc_hdr_free_abd(hdr, B_FALSE); @@ -3272,14 +3252,12 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, arc_buf_hdr_t *hdr; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); - if (protected) { - hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE); - } else { - hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - } + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(HDR_EMPTY(hdr)); +#ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); +#endif HDR_SET_PSIZE(hdr, psize); HDR_SET_LSIZE(hdr, lsize); hdr->b_spa = spa; @@ -3297,7 +3275,6 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; - hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_buf = NULL; ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); @@ -3323,24 +3300,14 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); - /* - * if the caller wanted a new full header and the header is to be - * encrypted we will actually allocate the header from the full crypt - * cache instead. The same applies to freeing from the old cache. - */ - if (HDR_PROTECTED(hdr) && new == hdr_full_cache) - new = hdr_full_crypt_cache; - if (HDR_PROTECTED(hdr) && old == hdr_full_cache) - old = hdr_full_crypt_cache; - nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); buf_hash_remove(hdr); - bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); + memcpy(nhdr, hdr, HDR_L2ONLY_SIZE); - if (new == hdr_full_cache || new == hdr_full_crypt_cache) { + if (new == hdr_full_cache) { arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* * arc_access and arc_change_state need to be aware that a @@ -3354,8 +3321,9 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) ASSERT(!HDR_HAS_RABD(hdr)); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(hdr->b_l1hdr.b_bufcnt); +#ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); +#endif /* * If we've reached here, We must have been called from @@ -3419,125 +3387,6 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) } /* - * This function allows an L1 header to be reallocated as a crypt - * header and vice versa. If we are going to a crypt header, the - * new fields will be zeroed out. - */ -static arc_buf_hdr_t * -arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) -{ - arc_buf_hdr_t *nhdr; - arc_buf_t *buf; - kmem_cache_t *ncache, *ocache; - - /* - * This function requires that hdr is in the arc_anon state. - * Therefore it won't have any L2ARC data for us to worry - * about copying. 
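Illustration (not from the patch; names are hypothetical): the hunk above drops the b_crypt_hdr.b_ebufcnt counter and instead walks the header's buffer list to decide whether any other encrypted buffer is still attached before freeing the raw copy. A minimal standalone sketch of that counter-free check:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-ins for arc_buf_t and ARC_BUF_ENCRYPTED(). */
typedef struct buf {
        struct buf *next;
        bool encrypted;
} buf_t;

/*
 * True if no encrypted buffer other than 'dying' remains on the list,
 * i.e. the raw (encrypted) copy of the data may now be dropped.
 */
static bool
no_other_encrypted(const buf_t *head, const buf_t *dying)
{
        for (const buf_t *b = head; b != NULL; b = b->next) {
                if (b != dying && b->encrypted)
                        return (false);
        }
        return (true);
}

int
main(void)
{
        buf_t plain = { NULL, false };
        buf_t crypt = { &plain, true };         /* list: crypt -> plain */

        printf("%d\n", no_other_encrypted(&crypt, &crypt));    /* 1 */
        printf("%d\n", no_other_encrypted(&crypt, &plain));     /* 0 */
        return (0);
}

Presumably the buffer list is short enough that a scan like this is cheaper to keep correct than a counter every realloc path had to copy and zero.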
- */ - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt); - ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node)); - ASSERT3P(hdr->b_hash_next, ==, NULL); - - if (need_crypt) { - ncache = hdr_full_crypt_cache; - ocache = hdr_full_cache; - } else { - ncache = hdr_full_cache; - ocache = hdr_full_crypt_cache; - } - - nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE); - - /* - * Copy all members that aren't locks or condvars to the new header. - * No lists are pointing to us (as we asserted above), so we don't - * need to worry about the list nodes. - */ - nhdr->b_dva = hdr->b_dva; - nhdr->b_birth = hdr->b_birth; - nhdr->b_type = hdr->b_type; - nhdr->b_flags = hdr->b_flags; - nhdr->b_psize = hdr->b_psize; - nhdr->b_lsize = hdr->b_lsize; - nhdr->b_spa = hdr->b_spa; - nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum; - nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt; - nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap; - nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state; - nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access; - nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits; - nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; - nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits; - nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; - nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb; - nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd; - - /* - * This zfs_refcount_add() exists only to ensure that the individual - * arc buffers always point to a header that is referenced, avoiding - * a small race condition that could trigger ASSERTs. - */ - (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG); - nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf; - for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - mutex_enter(&buf->b_evict_lock); - buf->b_hdr = nhdr; - mutex_exit(&buf->b_evict_lock); - } - - zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt); - (void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG); - ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); - - if (need_crypt) { - arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED); - } else { - arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED); - } - - /* unset all members of the original hdr */ - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_type = ARC_BUFC_INVALID; - hdr->b_flags = 0; - hdr->b_psize = 0; - hdr->b_lsize = 0; - hdr->b_spa = 0; - hdr->b_l1hdr.b_freeze_cksum = NULL; - hdr->b_l1hdr.b_buf = NULL; - hdr->b_l1hdr.b_bufcnt = 0; - hdr->b_l1hdr.b_byteswap = 0; - hdr->b_l1hdr.b_state = NULL; - hdr->b_l1hdr.b_arc_access = 0; - hdr->b_l1hdr.b_mru_hits = 0; - hdr->b_l1hdr.b_mru_ghost_hits = 0; - hdr->b_l1hdr.b_mfu_hits = 0; - hdr->b_l1hdr.b_mfu_ghost_hits = 0; - hdr->b_l1hdr.b_acb = NULL; - hdr->b_l1hdr.b_pabd = NULL; - - if (ocache == hdr_full_crypt_cache) { - ASSERT(!HDR_HAS_RABD(hdr)); - hdr->b_crypt_hdr.b_ot = DMU_OT_NONE; - hdr->b_crypt_hdr.b_ebufcnt = 0; - hdr->b_crypt_hdr.b_dsobj = 0; - bzero(hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); - bzero(hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); - bzero(hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); - } - - buf_discard_identity(hdr); - kmem_cache_free(ocache, hdr); - - return (nhdr); -} - -/* * This function is used by the send / receive code to convert a newly * allocated arc_buf_t to one that is suitable for a raw encrypted write. 
It * is also used to allow the root objset block to be updated without altering @@ -3556,8 +3405,7 @@ arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED); - if (!HDR_PROTECTED(hdr)) - hdr = arc_hdr_realloc_crypt(hdr, B_TRUE); + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? @@ -3566,11 +3414,11 @@ arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, arc_cksum_free(hdr); if (salt != NULL) - bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); + memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); if (iv != NULL) - bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); + memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); if (mac != NULL) - bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); + memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); } /* @@ -3578,7 +3426,8 @@ arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, * The buf is returned thawed since we expect the consumer to modify it. */ arc_buf_t * -arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) +arc_alloc_buf(spa_t *spa, const void *tag, arc_buf_contents_t type, + int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, B_FALSE, ZIO_COMPRESS_OFF, 0, type); @@ -3596,8 +3445,8 @@ arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) * for bufs containing metadata. */ arc_buf_t * -arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type, uint8_t complevel) +arc_alloc_compressed_buf(spa_t *spa, const void *tag, uint64_t psize, + uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); @@ -3611,7 +3460,6 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); /* * To ensure that the hdr has the correct data in it if we call @@ -3624,9 +3472,9 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, } arc_buf_t * -arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, - const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, - dmu_object_type_t ot, uint64_t psize, uint64_t lsize, +arc_alloc_raw_buf(spa_t *spa, const void *tag, uint64_t dsobj, + boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, + const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_hdr_t *hdr; @@ -3646,9 +3494,9 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? 
DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); - bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); - bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); - bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); + memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); + memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); + memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); /* * This buffer will be considered encrypted even if the ot is not an @@ -3659,7 +3507,6 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); return (buf); } @@ -3759,8 +3606,6 @@ static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { - ASSERT(hdr->b_l1hdr.b_buf == NULL || - hdr->b_l1hdr.b_bufcnt > 0); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } @@ -3821,27 +3666,25 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) if (HDR_HAS_L1HDR(hdr)) { ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - - if (!HDR_PROTECTED(hdr)) { - kmem_cache_free(hdr_full_cache, hdr); - } else { - kmem_cache_free(hdr_full_crypt_cache, hdr); - } +#ifdef ZFS_DEBUG + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); +#endif + kmem_cache_free(hdr_full_cache, hdr); } else { kmem_cache_free(hdr_l2only_cache, hdr); } } void -arc_buf_destroy(arc_buf_t *buf, void* tag) +arc_buf_destroy(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; if (hdr->b_l1hdr.b_state == arc_anon) { - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); + ASSERT(ARC_BUF_LAST(buf)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - VERIFY0(remove_reference(hdr, NULL, tag)); - arc_hdr_destroy(hdr); + VERIFY0(remove_reference(hdr, tag)); return; } @@ -3849,13 +3692,13 @@ arc_buf_destroy(arc_buf_t *buf, void* tag) mutex_enter(hash_lock); ASSERT3P(hdr, ==, buf->b_hdr); - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); - (void) remove_reference(hdr, hash_lock, tag); arc_buf_destroy_impl(buf); + (void) remove_reference(hdr, tag); mutex_exit(hash_lock); } @@ -3870,6 +3713,7 @@ arc_buf_destroy(arc_buf_t *buf, void* tag) * - arc_mru_ghost -> deleted * - arc_mfu_ghost -> arc_l2c_only * - arc_mfu_ghost -> deleted + * - arc_uncached -> deleted * * Return total size of evicted data buffers for eviction progress tracking. * When evicting from ghost states return logical buffer size to make eviction @@ -3881,21 +3725,22 @@ arc_buf_destroy(arc_buf_t *buf, void* tag) * only the evicted headers size. */ static int64_t -arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) +arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; - int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? + uint_t min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? 
arc_min_prescient_prefetch_ms : arc_min_prefetch_ms; - ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); *real_evicted = 0; state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* * l2arc_write_buffers() relies on a header's L1 portion @@ -3921,49 +3766,34 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ - arc_change_state(arc_l2c_only, hdr, hash_lock); + arc_change_state(arc_l2c_only, hdr); /* * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header. */ - hdr = arc_hdr_realloc(hdr, hdr_full_cache, + (void) arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); *real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE; } else { - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); *real_evicted += HDR_FULL_SIZE; } return (bytes_evicted); } - ASSERT(state == arc_mru || state == arc_mfu); - evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; + ASSERT(state == arc_mru || state == arc_mfu || state == arc_uncached); + evicted_state = (state == arc_uncached) ? arc_anon : + ((state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost); /* prefetch buffers have a minimum lifespan */ - if (HDR_IO_IN_PROGRESS(hdr) || - ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && + if ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < - MSEC_TO_TICK(min_lifetime))) { + MSEC_TO_TICK(min_lifetime)) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } - ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); - while (hdr->b_l1hdr.b_buf) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; - if (!mutex_tryenter(&buf->b_evict_lock)) { - ARCSTAT_BUMP(arcstat_mutex_miss); - break; - } - if (buf->b_data != NULL) { - bytes_evicted += HDR_GET_LSIZE(hdr); - *real_evicted += HDR_GET_LSIZE(hdr); - } - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy_impl(buf); - } - if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { @@ -3991,28 +3821,27 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) } } - if (hdr->b_l1hdr.b_bufcnt == 0) { - arc_cksum_free(hdr); - - bytes_evicted += arc_hdr_size(hdr); - *real_evicted += arc_hdr_size(hdr); + bytes_evicted += arc_hdr_size(hdr); + *real_evicted += arc_hdr_size(hdr); - /* - * If this hdr is being evicted and has a compressed - * buffer then we discard it here before we change states. - * This ensures that the accounting is updated correctly - * in arc_free_data_impl(). - */ - if (hdr->b_l1hdr.b_pabd != NULL) - arc_hdr_free_abd(hdr, B_FALSE); + /* + * If this hdr is being evicted and has a compressed buffer then we + * discard it here before we change states. This ensures that the + * accounting is updated correctly in arc_free_data_impl(). 
+ */ + if (hdr->b_l1hdr.b_pabd != NULL) + arc_hdr_free_abd(hdr, B_FALSE); - if (HDR_HAS_RABD(hdr)) - arc_hdr_free_abd(hdr, B_TRUE); + if (HDR_HAS_RABD(hdr)) + arc_hdr_free_abd(hdr, B_TRUE); - arc_change_state(evicted_state, hdr, hash_lock); + arc_change_state(evicted_state, hdr); + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); + if (evicted_state == arc_anon) { + arc_hdr_destroy(hdr); + *real_evicted += HDR_FULL_SIZE; + } else { ASSERT(HDR_IN_HASH_TABLE(hdr)); - arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); - DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } return (bytes_evicted); @@ -4040,15 +3869,15 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, uint64_t bytes_evicted = 0, real_evicted = 0; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; - int evict_count = zfs_arc_evict_batch_limit; + uint_t evict_count = zfs_arc_evict_batch_limit; ASSERT3P(marker, !=, NULL); - mls = multilist_sublist_lock(ml, idx); + mls = multilist_sublist_lock_idx(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL); hdr = multilist_sublist_prev(mls, marker)) { - if ((evict_count <= 0) || (bytes_evicted >= bytes)) + if ((evict_count == 0) || (bytes_evicted >= bytes)) break; /* @@ -4097,8 +3926,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, if (mutex_tryenter(hash_lock)) { uint64_t revicted; - uint64_t evicted = arc_evict_hdr(hdr, hash_lock, - &revicted); + uint64_t evicted = arc_evict_hdr(hdr, &revicted); mutex_exit(hash_lock); bytes_evicted += evicted; @@ -4152,11 +3980,54 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * this CPU are able to make progress, make a voluntary preemption * call here. */ - cond_resched(); + kpreempt(KPREEMPT_SYNC); return (bytes_evicted); } +static arc_buf_hdr_t * +arc_state_alloc_marker(void) +{ + arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); + + /* + * A b_spa of 0 is used to indicate that this header is + * a marker. This fact is used in arc_evict_state_impl(). + */ + marker->b_spa = 0; + + return (marker); +} + +static void +arc_state_free_marker(arc_buf_hdr_t *marker) +{ + kmem_cache_free(hdr_full_cache, marker); +} + +/* + * Allocate an array of buffer headers used as placeholders during arc state + * eviction. + */ +static arc_buf_hdr_t ** +arc_state_alloc_markers(int count) +{ + arc_buf_hdr_t **markers; + + markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP); + for (int i = 0; i < count; i++) + markers[i] = arc_state_alloc_marker(); + return (markers); +} + +static void +arc_state_free_markers(arc_buf_hdr_t **markers, int count) +{ + for (int i = 0; i < count; i++) + arc_state_free_marker(markers[i]); + kmem_free(markers, sizeof (*markers) * count); +} + /* * Evict buffers from the given arc state, until we've removed the * specified number of bytes. Move the removed buffers to the @@ -4171,8 +4042,8 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * the given arc state; which is used by arc_flush(). */ static uint64_t -arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes, - arc_buf_contents_t type) +arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, + uint64_t bytes) { uint64_t total_evicted = 0; multilist_t *ml = &state->arcs_list[type]; @@ -4188,20 +4059,16 @@ arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes, * pick up where we left off for each individual sublist, rather * than starting from the tail each time. 
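Illustration (not from the patch; names are hypothetical): arc_state_alloc_marker() above allocates placeholder headers whose b_spa of 0 tells the eviction code to skip them, so a later pass can resume from where a marker was left in a sublist. A rough userland analogue of that traversal convention:

#include <stdio.h>
#include <stddef.h>

/*
 * Hypothetical stand-in for a header on a sublist: spa == 0 marks a
 * placeholder, the same convention arc_state_alloc_marker() uses.
 */
typedef struct node {
        unsigned long long spa;
        struct node *prev;              /* toward the list head */
} node_t;

/* Step past any markers to the previous real header, if one exists. */
static node_t *
prev_real(node_t *from)
{
        node_t *n = from->prev;
        while (n != NULL && n->spa == 0)
                n = n->prev;
        return (n);
}

int
main(void)
{
        node_t a = { 17ULL, NULL };     /* real header for pool 17 */
        node_t m = { 0ULL, &a };        /* someone else's marker */
        node_t tail = { 0ULL, &m };     /* our marker at the tail */

        node_t *n = prev_real(&tail);
        printf("%llu\n", n ? n->spa : 0ULL);    /* prints 17 */
        return (0);
}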
*/ - markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); + if (zthr_iscurthread(arc_evict_zthr)) { + markers = arc_state_evict_markers; + ASSERT3S(num_sublists, <=, arc_state_evict_marker_count); + } else { + markers = arc_state_alloc_markers(num_sublists); + } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls; - markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); - - /* - * A b_spa of 0 is used to indicate that this header is - * a marker. This fact is used in arc_evict_type() and - * arc_evict_state_impl(). - */ - markers[i]->b_spa = 0; - - mls = multilist_sublist_lock(ml, i); + mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } @@ -4215,19 +4082,6 @@ arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes, uint64_t scan_evicted = 0; /* - * Try to reduce pinned dnodes with a floor of arc_dnode_limit. - * Request that 10% of the LRUs be scanned by the superblock - * shrinker. - */ - if (type == ARC_BUFC_DATA && aggsum_compare( - &arc_sums.arcstat_dnode_size, arc_dnode_size_limit) > 0) { - arc_prune_async((aggsum_upper_bound( - &arc_sums.arcstat_dnode_size) - - arc_dnode_size_limit) / sizeof (dnode_t) / - zfs_arc_dnode_reduce_percent); - } - - /* * Start eviction using a randomly selected sublist, * this is to try and evenly balance eviction across all * sublists. Always starting at the same sublist @@ -4279,13 +4133,12 @@ arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes, } for (int i = 0; i < num_sublists; i++) { - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); - - kmem_cache_free(hdr_full_cache, markers[i]); } - kmem_free(markers, sizeof (*markers) * num_sublists); + if (markers != arc_state_evict_markers) + arc_state_free_markers(markers, num_sublists); return (total_evicted); } @@ -4312,7 +4165,7 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, uint64_t evicted = 0; while (zfs_refcount_count(&state->arcs_esize[type]) != 0) { - evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); + evicted += arc_evict_state(state, type, spa, ARC_EVICT_ALL); if (!retry) break; @@ -4322,252 +4175,64 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, } /* - * Evict the specified number of bytes from the state specified, - * restricting eviction to the spa and type given. This function - * prevents us from trying to evict more from a state's list than - * is "evictable", and to skip evicting altogether when passed a + * Evict the specified number of bytes from the state specified. This + * function prevents us from trying to evict more from a state's list + * than is "evictable", and to skip evicting altogether when passed a * negative value for "bytes". In contrast, arc_evict_state() will * evict everything it can, when passed a negative value for "bytes". 
*/ static uint64_t -arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes, - arc_buf_contents_t type) +arc_evict_impl(arc_state_t *state, arc_buf_contents_t type, int64_t bytes) { uint64_t delta; if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), bytes); - return (arc_evict_state(state, spa, delta, type)); + return (arc_evict_state(state, type, 0, delta)); } return (0); } /* - * The goal of this function is to evict enough meta data buffers from the - * ARC in order to enforce the arc_meta_limit. Achieving this is slightly - * more complicated than it appears because it is common for data buffers - * to have holds on meta data buffers. In addition, dnode meta data buffers - * will be held by the dnodes in the block preventing them from being freed. - * This means we can't simply traverse the ARC and expect to always find - * enough unheld meta data buffer to release. - * - * Therefore, this function has been updated to make alternating passes - * over the ARC releasing data buffers and then newly unheld meta data - * buffers. This ensures forward progress is maintained and meta_used - * will decrease. Normally this is sufficient, but if required the ARC - * will call the registered prune callbacks causing dentry and inodes to - * be dropped from the VFS cache. This will make dnode meta data buffers - * available for reclaim. + * Adjust specified fraction, taking into account initial ghost state(s) size, + * ghost hit bytes towards increasing the fraction, ghost hit bytes towards + * decreasing it, plus a balance factor, controlling the decrease rate, used + * to balance metadata vs data. */ static uint64_t -arc_evict_meta_balanced(uint64_t meta_used) +arc_evict_adj(uint64_t frac, uint64_t total, uint64_t up, uint64_t down, + uint_t balance) { - int64_t delta, prune = 0, adjustmnt; - uint64_t total_evicted = 0; - arc_buf_contents_t type = ARC_BUFC_DATA; - int restarts = MAX(zfs_arc_meta_adjust_restarts, 0); + if (total < 8 || up + down == 0) + return (frac); -restart: /* - * This slightly differs than the way we evict from the mru in - * arc_evict because we don't have a "target" value (i.e. no - * "meta" arc_p). As a result, I think we can completely - * cannibalize the metadata in the MRU before we evict the - * metadata from the MFU. I think we probably need to implement a - * "metadata arc_p" value to do this properly. + * We should not have more ghost hits than ghost size, but they + * may get close. Restrict maximum adjustment in that case. */ - adjustmnt = meta_used - arc_meta_limit; - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) { - delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]), - adjustmnt); - total_evicted += arc_evict_impl(arc_mru, 0, delta, type); - adjustmnt -= delta; + if (up + down >= total / 4) { + uint64_t scale = (up + down) / (total / 8); + up /= scale; + down /= scale; } - /* - * We can't afford to recalculate adjustmnt here. If we do, - * new metadata buffers can sneak into the MRU or ANON lists, - * thus penalize the MFU metadata. Although the fudge factor is - * small, it has been empirically shown to be significant for - * certain workloads (e.g. creating many empty directories). As - * such, we use the original calculation for adjustmnt, and - * simply decrement the amount of data evicted from the MRU. - */ + /* Get maximal dynamic range by choosing optimal shifts. 
*/ + int s = highbit64(total); + s = MIN(64 - s, 32); - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) { - delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]), - adjustmnt); - total_evicted += arc_evict_impl(arc_mfu, 0, delta, type); - } + uint64_t ofrac = (1ULL << 32) - frac; - adjustmnt = meta_used - arc_meta_limit; + if (frac >= 4 * ofrac) + up /= frac / (2 * ofrac + 1); + up = (up << s) / (total >> (32 - s)); + if (ofrac >= 4 * frac) + down /= ofrac / (2 * frac + 1); + down = (down << s) / (total >> (32 - s)); + down = down * 100 / balance; - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) { - delta = MIN(adjustmnt, - zfs_refcount_count(&arc_mru_ghost->arcs_esize[type])); - total_evicted += arc_evict_impl(arc_mru_ghost, 0, delta, type); - adjustmnt -= delta; - } - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) { - delta = MIN(adjustmnt, - zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type])); - total_evicted += arc_evict_impl(arc_mfu_ghost, 0, delta, type); - } - - /* - * If after attempting to make the requested adjustment to the ARC - * the meta limit is still being exceeded then request that the - * higher layers drop some cached objects which have holds on ARC - * meta buffers. Requests to the upper layers will be made with - * increasingly large scan sizes until the ARC is below the limit. - */ - if (meta_used > arc_meta_limit) { - if (type == ARC_BUFC_DATA) { - type = ARC_BUFC_METADATA; - } else { - type = ARC_BUFC_DATA; - - if (zfs_arc_meta_prune) { - prune += zfs_arc_meta_prune; - arc_prune_async(prune); - } - } - - if (restarts > 0) { - restarts--; - goto restart; - } - } - return (total_evicted); -} - -/* - * Evict metadata buffers from the cache, such that arcstat_meta_used is - * capped by the arc_meta_limit tunable. - */ -static uint64_t -arc_evict_meta_only(uint64_t meta_used) -{ - uint64_t total_evicted = 0; - int64_t target; - - /* - * If we're over the meta limit, we want to evict enough - * metadata to get back under the meta limit. We don't want to - * evict so much that we drop the MRU below arc_p, though. If - * we're over the meta limit more than we're over arc_p, we - * evict some from the MRU here, and some from the MFU below. - */ - target = MIN((int64_t)(meta_used - arc_meta_limit), - (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) - arc_p)); - - total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - - /* - * Similar to the above, we want to evict enough bytes to get us - * below the meta limit, but not so much as to drop us below the - * space allotted to the MFU (which is defined as arc_c - arc_p). - */ - target = MIN((int64_t)(meta_used - arc_meta_limit), - (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) - - (arc_c - arc_p))); - - total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - - return (total_evicted); -} - -static uint64_t -arc_evict_meta(uint64_t meta_used) -{ - if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY) - return (arc_evict_meta_only(meta_used)); - else - return (arc_evict_meta_balanced(meta_used)); -} - -/* - * Return the type of the oldest buffer in the given arc state - * - * This function will select a random sublist of type ARC_BUFC_DATA and - * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist - * is compared, and the type which contains the "older" buffer will be - * returned. 
- */ -static arc_buf_contents_t -arc_evict_type(arc_state_t *state) -{ - multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; - multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; - int data_idx = multilist_get_random_index(data_ml); - int meta_idx = multilist_get_random_index(meta_ml); - multilist_sublist_t *data_mls; - multilist_sublist_t *meta_mls; - arc_buf_contents_t type; - arc_buf_hdr_t *data_hdr; - arc_buf_hdr_t *meta_hdr; - - /* - * We keep the sublist lock until we're finished, to prevent - * the headers from being destroyed via arc_evict_state(). - */ - data_mls = multilist_sublist_lock(data_ml, data_idx); - meta_mls = multilist_sublist_lock(meta_ml, meta_idx); - - /* - * These two loops are to ensure we skip any markers that - * might be at the tail of the lists due to arc_evict_state(). - */ - - for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; - data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { - if (data_hdr->b_spa != 0) - break; - } - - for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; - meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { - if (meta_hdr->b_spa != 0) - break; - } - - if (data_hdr == NULL && meta_hdr == NULL) { - type = ARC_BUFC_DATA; - } else if (data_hdr == NULL) { - ASSERT3P(meta_hdr, !=, NULL); - type = ARC_BUFC_METADATA; - } else if (meta_hdr == NULL) { - ASSERT3P(data_hdr, !=, NULL); - type = ARC_BUFC_DATA; - } else { - ASSERT3P(data_hdr, !=, NULL); - ASSERT3P(meta_hdr, !=, NULL); - - /* The headers can't be on the sublist without an L1 header */ - ASSERT(HDR_HAS_L1HDR(data_hdr)); - ASSERT(HDR_HAS_L1HDR(meta_hdr)); - - if (data_hdr->b_l1hdr.b_arc_access < - meta_hdr->b_l1hdr.b_arc_access) { - type = ARC_BUFC_DATA; - } else { - type = ARC_BUFC_METADATA; - } - } - - multilist_sublist_unlock(meta_mls); - multilist_sublist_unlock(data_mls); - - return (type); + return (frac + up - down); } /* @@ -4576,150 +4241,128 @@ arc_evict_type(arc_state_t *state) static uint64_t arc_evict(void) { - uint64_t total_evicted = 0; - uint64_t bytes; - int64_t target; - uint64_t asize = aggsum_value(&arc_sums.arcstat_size); - uint64_t ameta = aggsum_value(&arc_sums.arcstat_meta_used); - - /* - * If we're over arc_meta_limit, we want to correct that before - * potentially evicting data buffers below. - */ - total_evicted += arc_evict_meta(ameta); - - /* - * Adjust MRU size - * - * If we're over the target cache size, we want to evict enough - * from the list to get back to our target size. We don't want - * to evict too much from the MRU, such that it drops below - * arc_p. So, if we're over our target cache size more than - * the MRU is over arc_p, we'll evict enough to get back to - * arc_p here, and then evict more from the MFU below. - */ - target = MIN((int64_t)(asize - arc_c), - (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p)); - - /* - * If we're below arc_meta_min, always prefer to evict data. - * Otherwise, try to satisfy the requested number of bytes to - * evict from the type which contains older buffers; in an - * effort to keep newer buffers in the cache regardless of their - * type. If we cannot satisfy the number of bytes from this - * type, spill over into the next type. 
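Illustration (not from the patch; inputs are made up): arc_evict_adj() above keeps each balance fraction in 32-bit fixed point, and the shift-by-s arithmetic is essentially overflow avoidance; stripped of it, the update is frac += up/total - (down/total) * (100/balance). A minimal floating-point model of that step:

#include <stdio.h>

/*
 * Simplified model of arc_evict_adj(): 'frac' is one class's allowed
 * share of the cache, 'total' the ghost size it is measured against,
 * 'up'/'down' the ghost-hit bytes arguing for growing or shrinking
 * that share, and 'balance' (percent) damps the shrink side.  The
 * kernel version does the same in 32-bit fixed point, picking shifts
 * with highbit64() and clamping near the extremes to avoid overflow.
 */
static double
evict_adj_model(double frac, double total, double up, double down,
    double balance)
{
        if (total < 8 || up + down == 0)
                return (frac);          /* too little history to act on */
        return (frac + up / total - (down / total) * (100.0 / balance));
}

int
main(void)
{
        /*
         * Made-up numbers: metadata currently allowed 25% of the cache,
         * ghost lists total 8 units, 1 unit of metadata ghost hits,
         * 0.4 units of data ghost hits, a balance factor of 500.
         */
        printf("%.3f\n", evict_adj_model(0.25, 8.0, 1.0, 0.4, 500.0));
        /* Prints 0.365: the metadata share is nudged toward demand. */
        return (0);
}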
- */ - if (arc_evict_type(arc_mru) == ARC_BUFC_METADATA && - ameta > arc_meta_min) { - bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * metadata, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); - } else { - bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * data, we try to get the rest from metadata. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - } + uint64_t asize, bytes, total_evicted = 0; + int64_t e, mrud, mrum, mfud, mfum, w; + static uint64_t ogrd, ogrm, ogfd, ogfm; + static uint64_t gsrd, gsrm, gsfd, gsfm; + uint64_t ngrd, ngrm, ngfd, ngfm; + + /* Get current size of ARC states we can evict from. */ + mrud = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_DATA]) + + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]); + mrum = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]); + mfud = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_DATA]); + mfum = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); + uint64_t d = mrud + mfud; + uint64_t m = mrum + mfum; + uint64_t t = d + m; + + /* Get ARC ghost hits since last eviction. */ + ngrd = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); + uint64_t grd = ngrd - ogrd; + ogrd = ngrd; + ngrm = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); + uint64_t grm = ngrm - ogrm; + ogrm = ngrm; + ngfd = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); + uint64_t gfd = ngfd - ogfd; + ogfd = ngfd; + ngfm = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); + uint64_t gfm = ngfm - ogfm; + ogfm = ngfm; + + /* Adjust ARC states balance based on ghost hits. */ + arc_meta = arc_evict_adj(arc_meta, gsrd + gsrm + gsfd + gsfm, + grm + gfm, grd + gfd, zfs_arc_meta_balance); + arc_pd = arc_evict_adj(arc_pd, gsrd + gsfd, grd, gfd, 100); + arc_pm = arc_evict_adj(arc_pm, gsrm + gsfm, grm, gfm, 100); - /* - * Re-sum ARC stats after the first round of evictions. - */ asize = aggsum_value(&arc_sums.arcstat_size); - ameta = aggsum_value(&arc_sums.arcstat_meta_used); - - - /* - * Adjust MFU size - * - * Now that we've tried to evict enough from the MRU to get its - * size back to arc_p, if we're still above the target cache - * size, we evict the rest from the MFU. - */ - target = asize - arc_c; - - if (arc_evict_type(arc_mfu) == ARC_BUFC_METADATA && - ameta > arc_meta_min) { - bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * metadata, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); - } else { - bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * data, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - } - - /* - * Adjust ghost lists - * - * In addition to the above, the ARC also defines target values - * for the ghost lists. 
The sum of the mru list and mru ghost - * list should never exceed the target size of the cache, and - * the sum of the mru list, mfu list, mru ghost list, and mfu - * ghost list should never exceed twice the target size of the - * cache. The following logic enforces these limits on the ghost - * caches, and evicts from them as needed. - */ - target = zfs_refcount_count(&arc_mru->arcs_size) + - zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c; - - bytes = arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); + int64_t wt = t - (asize - arc_c); + + /* + * Try to reduce pinned dnodes if more than 3/4 of wanted metadata + * target is not evictable or if they go over arc_dnode_limit. + */ + int64_t prune = 0; + int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size); + w = wt * (int64_t)(arc_meta >> 16) >> 16; + if (zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) - + zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) - + zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]) > + w * 3 / 4) { + prune = dn / sizeof (dnode_t) * + zfs_arc_dnode_reduce_percent / 100; + } else if (dn > arc_dnode_limit) { + prune = (dn - arc_dnode_limit) / sizeof (dnode_t) * + zfs_arc_dnode_reduce_percent / 100; + } + if (prune > 0) + arc_prune_async(prune); + + /* Evict MRU metadata. */ + w = wt * (int64_t)(arc_meta * arc_pm >> 48) >> 16; + e = MIN((int64_t)(asize - arc_c), (int64_t)(mrum - w)); + bytes = arc_evict_impl(arc_mru, ARC_BUFC_METADATA, e); total_evicted += bytes; + mrum -= bytes; + asize -= bytes; - target -= bytes; + /* Evict MFU metadata. */ + w = wt * (int64_t)(arc_meta >> 16) >> 16; + e = MIN((int64_t)(asize - arc_c), (int64_t)(m - w)); + bytes = arc_evict_impl(arc_mfu, ARC_BUFC_METADATA, e); + total_evicted += bytes; + mfum -= bytes; + asize -= bytes; + + /* Evict MRU data. */ + wt -= m - total_evicted; + w = wt * (int64_t)(arc_pd >> 16) >> 16; + e = MIN((int64_t)(asize - arc_c), (int64_t)(mrud - w)); + bytes = arc_evict_impl(arc_mru, ARC_BUFC_DATA, e); + total_evicted += bytes; + mrud -= bytes; + asize -= bytes; - total_evicted += - arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); + /* Evict MFU data. */ + e = asize - arc_c; + bytes = arc_evict_impl(arc_mfu, ARC_BUFC_DATA, e); + mfud -= bytes; + total_evicted += bytes; /* - * We assume the sum of the mru list and mfu list is less than - * or equal to arc_c (we enforced this above), which means we - * can use the simpler of the two equations below: + * Evict ghost lists * - * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c - * mru ghost + mfu ghost <= arc_c - */ - target = zfs_refcount_count(&arc_mru_ghost->arcs_size) + - zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; - - bytes = arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); + * Size of each state's ghost list represents how much that state + * may grow by shrinking the other states. Would it need to shrink + * other states to zero (that is unlikely), its ghost size would be + * equal to sum of other three state sizes. But excessive ghost + * size may result in false ghost hits (too far back), that may + * never result in real cache hits if several states are competing. + * So choose some arbitraty point of 1/2 of other state sizes. 
+ */ + gsrd = (mrum + mfud + mfum) / 2; + e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]) - + gsrd; + (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_DATA, e); + + gsrm = (mrud + mfud + mfum) / 2; + e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]) - + gsrm; + (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_METADATA, e); + + gsfd = (mrud + mrum + mfum) / 2; + e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]) - + gsfd; + (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_DATA, e); + + gsfm = (mrud + mrum + mfud) / 2; + e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]) - + gsfm; + (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_METADATA, e); return (total_evicted); } @@ -4734,7 +4377,7 @@ arc_flush(spa_t *spa, boolean_t retry) * no good way to determine if all of a spa's buffers have been * evicted from an arc state. */ - ASSERT(!retry || spa == 0); + ASSERT(!retry || spa == NULL); if (spa != NULL) guid = spa_load_guid(spa); @@ -4750,12 +4393,18 @@ arc_flush(spa_t *spa, boolean_t retry) (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); + + (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_METADATA, retry); } void arc_reduce_target_size(int64_t to_free) { - uint64_t asize = aggsum_value(&arc_sums.arcstat_size); + uint64_t c = arc_c; + + if (c <= arc_c_min) + return; /* * All callers want the ARC to actually evict (at least) this much @@ -4765,26 +4414,16 @@ arc_reduce_target_size(int64_t to_free) * immediately have arc_c < arc_size and therefore the arc_evict_zthr * will evict. */ - uint64_t c = MIN(arc_c, asize); - - if (c > to_free && c - to_free > arc_c_min) { - arc_c = c - to_free; - atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); - if (arc_p > arc_c) - arc_p = (arc_c >> 1); - ASSERT(arc_c >= arc_c_min); - ASSERT((int64_t)arc_p >= 0); - } else { - arc_c = arc_c_min; - } + uint64_t asize = aggsum_value(&arc_sums.arcstat_size); + if (asize < c) + to_free += c - asize; + arc_c = MAX((int64_t)c - to_free, (int64_t)arc_c_min); - if (asize > arc_c) { - /* See comment in arc_evict_cb_check() on why lock+flag */ - mutex_enter(&arc_evict_lock); - arc_evict_needed = B_TRUE; - mutex_exit(&arc_evict_lock); - zthr_wakeup(arc_evict_zthr); - } + /* See comment in arc_evict_cb_check() on why lock+flag */ + mutex_enter(&arc_evict_lock); + arc_evict_needed = B_TRUE; + mutex_exit(&arc_evict_lock); + zthr_wakeup(arc_evict_zthr); } /* @@ -4804,18 +4443,8 @@ arc_kmem_reap_soon(void) size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; - extern kmem_cache_t *zio_buf_cache[]; - extern kmem_cache_t *zio_data_buf_cache[]; #ifdef _KERNEL - if ((aggsum_compare(&arc_sums.arcstat_meta_used, - arc_meta_limit) >= 0) && zfs_arc_meta_prune) { - /* - * We are exceeding our meta-data cache limit. - * Prune some entries to release holds on meta-data. - */ - arc_prune_async(zfs_arc_meta_prune); - } #if defined(_ILP32) /* * Reclaim unused memory from all kmem caches. 
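Worked example (not from the patch; sizes are made up): the ghost-list targets computed above cap each ghost list at half the combined size of the other three resident classes, so ghost history cannot grow far beyond what could plausibly be cached:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int
main(void)
{
        /*
         * Made-up resident sizes, in GiB, for the four evictable
         * classes used by arc_evict(): MRU data/metadata and MFU
         * data/metadata.
         */
        uint64_t mrud = 6, mrum = 2, mfud = 8, mfum = 2;

        /*
         * Each ghost list is trimmed to half of what the other three
         * classes currently occupy, as in the hunk above.
         */
        uint64_t gsrd = (mrum + mfud + mfum) / 2;       /* MRU ghost, data */
        uint64_t gsrm = (mrud + mfud + mfum) / 2;       /* MRU ghost, meta */
        uint64_t gsfd = (mrud + mrum + mfum) / 2;       /* MFU ghost, data */
        uint64_t gsfm = (mrud + mrum + mfud) / 2;       /* MFU ghost, meta */

        printf("%" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
            gsrd, gsrm, gsfd, gsfm);    /* prints "6 8 5 8" */
        return (0);
}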
@@ -4846,10 +4475,11 @@ arc_kmem_reap_soon(void) abd_cache_reap_now(); } -/* ARGSUSED */ static boolean_t arc_evict_cb_check(void *arg, zthr_t *zthr) { + (void) arg, (void) zthr; + #ifdef ZFS_DEBUG /* * This is necessary in order to keep the kstat information @@ -4882,22 +4512,38 @@ arc_evict_cb_check(void *arg, zthr_t *zthr) * which is held before this function is called, and is held by * arc_wait_for_eviction() when it calls zthr_wakeup(). */ - return (arc_evict_needed); + if (arc_evict_needed) + return (B_TRUE); + + /* + * If we have buffers in uncached state, evict them periodically. + */ + return ((zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_DATA]) + + zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]) && + ddi_get_lbolt() - arc_last_uncached_flush > + MSEC_TO_TICK(arc_min_prefetch_ms / 2))); } /* * Keep arc_size under arc_c by running arc_evict which evicts data * from the ARC. */ -/* ARGSUSED */ static void arc_evict_cb(void *arg, zthr_t *zthr) { + (void) arg; + uint64_t evicted = 0; fstrans_cookie_t cookie = spl_fstrans_mark(); - /* Evict from cache */ - evicted = arc_evict(); + /* Always try to evict from uncached state. */ + arc_last_uncached_flush = ddi_get_lbolt(); + evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_DATA, B_FALSE); + evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_METADATA, B_FALSE); + + /* Evict from other states only if told to. */ + if (arc_evict_needed) + evicted += arc_evict(); /* * If evicted is zero, we couldn't evict anything @@ -4909,9 +4555,13 @@ arc_evict_cb(void *arg, zthr_t *zthr) * infinite loop. Additionally, zthr_iscancelled() is * checked here so that if the arc is shutting down, the * broadcast will wake any remaining arc evict waiters. + * + * Note we cancel using zthr instead of arc_evict_zthr + * because the latter may not yet be initializd when the + * callback is first invoked. */ mutex_enter(&arc_evict_lock); - arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) && + arc_evict_needed = !zthr_iscancelled(zthr) && evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0; if (!arc_evict_needed) { /* @@ -4929,10 +4579,11 @@ arc_evict_cb(void *arg, zthr_t *zthr) spl_fstrans_unmark(cookie); } -/* ARGSUSED */ static boolean_t arc_reap_cb_check(void *arg, zthr_t *zthr) { + (void) arg, (void) zthr; + int64_t free_memory = arc_available_memory(); static int reap_cb_check_counter = 0; @@ -4976,10 +4627,11 @@ arc_reap_cb_check(void *arg, zthr_t *zthr) * target size of the cache (arc_c), causing the arc_evict_cb() * to free more buffers. */ -/* ARGSUSED */ static void arc_reap_cb(void *arg, zthr_t *zthr) { + (void) arg, (void) zthr; + int64_t free_memory; fstrans_cookie_t cookie = spl_fstrans_mark(); @@ -5009,10 +4661,11 @@ arc_reap_cb(void *arg, zthr_t *zthr) */ free_memory = arc_available_memory(); - int64_t to_free = - (arc_c >> arc_shrink_shift) - free_memory; - if (to_free > 0) { - arc_reduce_target_size(to_free); + int64_t can_free = arc_c - arc_c_min; + if (can_free > 0) { + int64_t to_free = (can_free >> arc_shrink_shift) - free_memory; + if (to_free > 0) + arc_reduce_target_size(to_free); } spl_fstrans_unmark(cookie); } @@ -5072,40 +4725,8 @@ arc_reap_cb(void *arg, zthr_t *zthr) * when we are adding new content to the cache. 
*/ static void -arc_adapt(int bytes, arc_state_t *state) +arc_adapt(uint64_t bytes) { - int mult; - uint64_t arc_p_min = (arc_c >> arc_p_min_shift); - int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size); - int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size); - - ASSERT(bytes > 0); - /* - * Adapt the target size of the MRU list: - * - if we just hit in the MRU ghost list, then increase - * the target size of the MRU list. - * - if we just hit in the MFU ghost list, then increase - * the target size of the MFU list by decreasing the - * target size of the MRU list. - */ - if (state == arc_mru_ghost) { - mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); - if (!zfs_arc_p_dampener_disable) - mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ - - arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); - } else if (state == arc_mfu_ghost) { - uint64_t delta; - - mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); - if (!zfs_arc_p_dampener_disable) - mult = MIN(mult, 10); - - delta = MIN(bytes * mult, arc_p); - arc_p = MAX(arc_p_min, arc_p - delta); - } - ASSERT((int64_t)arc_p >= 0); - /* * Wake reap thread if we do not have any available memory */ @@ -5124,18 +4745,12 @@ arc_adapt(int bytes, arc_state_t *state) * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ - ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT); - if (aggsum_upper_bound(&arc_sums.arcstat_size) >= - arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { - atomic_add_64(&arc_c, (int64_t)bytes); - if (arc_c > arc_c_max) + if (aggsum_upper_bound(&arc_sums.arcstat_size) + + 2 * SPA_MAXBLOCKSIZE >= arc_c) { + uint64_t dc = MAX(bytes, SPA_OLD_MAXBLOCKSIZE); + if (atomic_add_64_nv(&arc_c, dc) > arc_c_max) arc_c = arc_c_max; - else if (state == arc_anon) - atomic_add_64(&arc_p, (int64_t)bytes); - if (arc_p > arc_c) - arc_p = arc_c; } - ASSERT((int64_t)arc_p >= 0); } /* @@ -5167,26 +4782,24 @@ arc_is_overflowing(boolean_t use_reserve) } static abd_t * -arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag, +arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, alloc_flags); - if (type == ARC_BUFC_METADATA) { - return (abd_alloc(size, B_TRUE)); - } else { - ASSERT(type == ARC_BUFC_DATA); - return (abd_alloc(size, B_FALSE)); - } + if (alloc_flags & ARC_HDR_ALLOC_LINEAR) + return (abd_alloc_linear(size, type == ARC_BUFC_METADATA)); + else + return (abd_alloc(size, type == ARC_BUFC_METADATA)); } static void * -arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); - arc_get_data_impl(hdr, size, tag, ARC_HDR_DO_ADAPT); + arc_get_data_impl(hdr, size, tag, 0); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { @@ -5281,14 +4894,10 @@ arc_wait_for_eviction(uint64_t amount, boolean_t use_reserve) * limit, we'll only signal the reclaim thread and continue on. 
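Sketch (not from the patch; constants are stand-ins): the rewritten arc_adapt() above no longer tunes arc_p and only grows arc_c when the reported cache size is already within two maximum-sized blocks of the target, stepping by at least SPA_OLD_MAXBLOCKSIZE so tiny allocations do not inch the target up one buffer at a time. A rough standalone model:

#include <stdio.h>
#include <stdint.h>

/* Stand-ins for SPA_MAXBLOCKSIZE (16 MiB) and SPA_OLD_MAXBLOCKSIZE (128 KiB). */
#define MAXBLOCK        (16ULL << 20)
#define OLD_MAXBLOCK    (128ULL << 10)

static uint64_t
grow_target(uint64_t target, uint64_t target_max, uint64_t used,
    uint64_t alloc_bytes)
{
        if (used + 2 * MAXBLOCK < target)
                return (target);        /* still plenty of headroom */
        uint64_t step = alloc_bytes > OLD_MAXBLOCK ? alloc_bytes :
            OLD_MAXBLOCK;
        target += step;
        return (target > target_max ? target_max : target);
}

int
main(void)
{
        uint64_t c = 1ULL << 30, c_max = 4ULL << 30;    /* 1 GiB, 4 GiB */

        /* Far below target: a 4 KiB allocation changes nothing. */
        c = grow_target(c, c_max, 512ULL << 20, 4096);
        /* Nearly full: the same allocation bumps the target by 128 KiB. */
        c = grow_target(c, c_max, c - (1ULL << 20), 4096);
        printf("%llu\n", (unsigned long long)c);        /* 1073872896 */
        return (0);
}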
*/ static void -arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, +arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { - arc_state_t *state = hdr->b_l1hdr.b_state; - arc_buf_contents_t type = arc_buf_type(hdr); - - if (alloc_flags & ARC_HDR_DO_ADAPT) - arc_adapt(size, state); + arc_adapt(size); /* * If arc_size is currently overflowing, we must be adding data @@ -5306,7 +4915,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100, alloc_flags & ARC_HDR_USE_RESERVE); - VERIFY3U(hdr->b_type, ==, type); + arc_buf_contents_t type = arc_buf_type(hdr); if (type == ARC_BUFC_METADATA) { arc_space_consume(size, ARC_SPACE_META); } else { @@ -5317,9 +4926,11 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ + arc_state_t *state = hdr->b_l1hdr.b_state; if (!GHOST_STATE(state)) { - (void) zfs_refcount_add_many(&state->arcs_size, size, tag); + (void) zfs_refcount_add_many(&state->arcs_size[type], size, + tag); /* * If this is reached via arc_read, the link is @@ -5335,28 +4946,19 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, (void) zfs_refcount_add_many(&state->arcs_esize[type], size, tag); } - - /* - * If we are growing the cache, and we are adding anonymous - * data, and we have outgrown arc_p, update arc_p - */ - if (aggsum_upper_bound(&arc_sums.arcstat_size) < arc_c && - hdr->b_l1hdr.b_state == arc_anon && - (zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) > arc_p)) - arc_p = MIN(arc_c, arc_p + size); } } static void -arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) +arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, + const void *tag) { arc_free_data_impl(hdr, size, tag); abd_free(abd); } static void -arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) +arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); @@ -5373,7 +4975,7 @@ arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) * Free the arc data buffer. */ static void -arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); @@ -5386,7 +4988,7 @@ arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, tag); } - (void) zfs_refcount_remove_many(&state->arcs_size, size, tag); + (void) zfs_refcount_remove_many(&state->arcs_size[type], size, tag); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { @@ -5399,150 +5001,155 @@ arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) /* * This routine is called whenever a buffer is accessed. - * NOTE: the hash lock is dropped in this function. */ static void -arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) +arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit) { - clock_t now; - - ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); + /* + * Update buffer prefetch status. 
+ */ + boolean_t was_prefetch = HDR_PREFETCH(hdr); + boolean_t now_prefetch = arc_flags & ARC_FLAG_PREFETCH; + if (was_prefetch != now_prefetch) { + if (was_prefetch) { + ARCSTAT_CONDSTAT(hit, demand_hit, demand_iohit, + HDR_PRESCIENT_PREFETCH(hdr), prescient, predictive, + prefetch); + } + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); + if (was_prefetch) { + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + } + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); + } + if (now_prefetch) { + if (arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) { + arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); + ARCSTAT_BUMP(arcstat_prescient_prefetch); + } else { + ARCSTAT_BUMP(arcstat_predictive_prefetch); + } + } + if (arc_flags & ARC_FLAG_L2CACHE) + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + + clock_t now = ddi_get_lbolt(); if (hdr->b_l1hdr.b_state == arc_anon) { + arc_state_t *new_state; /* - * This buffer is not in the cache, and does not - * appear in our "ghost" list. Add the new buffer - * to the MRU state. + * This buffer is not in the cache, and does not appear in + * our "ghost" lists. Add it to the MRU or uncached state. */ - ASSERT0(hdr->b_l1hdr.b_arc_access); - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mru, hdr, hash_lock); - + hdr->b_l1hdr.b_arc_access = now; + if (HDR_UNCACHED(hdr)) { + new_state = arc_uncached; + DTRACE_PROBE1(new_state__uncached, arc_buf_hdr_t *, + hdr); + } else { + new_state = arc_mru; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); + } + arc_change_state(new_state, hdr); } else if (hdr->b_l1hdr.b_state == arc_mru) { - now = ddi_get_lbolt(); + /* + * This buffer has been accessed once recently and either + * its read is still in progress or it is in the cache. + */ + if (HDR_IO_IN_PROGRESS(hdr)) { + hdr->b_l1hdr.b_arc_access = now; + return; + } + hdr->b_l1hdr.b_mru_hits++; + ARCSTAT_BUMP(arcstat_mru_hits); /* - * If this buffer is here because of a prefetch, then either: - * - clear the flag if this is a "referencing" read - * (any subsequent access will bump this into the MFU state). - * or - * - move the buffer to the head of the list if this is - * another prefetch (to make it less likely to be evicted). + * If the previous access was a prefetch, then it already + * handled possible promotion, so nothing more to do for now. */ - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { - if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - /* link protected by hash lock */ - ASSERT(multilist_link_active( - &hdr->b_l1hdr.b_arc_node)); - } else { - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREFETCH | - ARC_FLAG_PRESCIENT_PREFETCH); - hdr->b_l1hdr.b_mru_hits++; - ARCSTAT_BUMP(arcstat_mru_hits); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); - } + if (was_prefetch) { hdr->b_l1hdr.b_arc_access = now; return; } /* - * This buffer has been "accessed" only once so far, - * but it is still in the cache. Move it to the MFU - * state. + * If more than ARC_MINTIME have passed from the previous + * hit, promote the buffer to the MFU state. */ if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access + ARC_MINTIME)) { - /* - * More than 125ms have passed since we - * instantiated this buffer. Move it to the - * most frequently used state. 
- */ hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mfu, hdr, hash_lock); + arc_change_state(arc_mfu, hdr); } - hdr->b_l1hdr.b_mru_hits++; - ARCSTAT_BUMP(arcstat_mru_hits); } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* - * This buffer has been "accessed" recently, but - * was evicted from the cache. Move it to the - * MFU state. + * This buffer has been accessed once recently, but was + * evicted from the cache. Would we have bigger MRU, it + * would be an MRU hit, so handle it the same way, except + * we don't need to check the previous access time. */ - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { + hdr->b_l1hdr.b_mru_ghost_hits++; + ARCSTAT_BUMP(arcstat_mru_ghost_hits); + hdr->b_l1hdr.b_arc_access = now; + wmsum_add(&arc_mru_ghost->arcs_hits[arc_buf_type(hdr)], + arc_hdr_size(hdr)); + if (was_prefetch) { new_state = arc_mru; - if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREFETCH | - ARC_FLAG_PRESCIENT_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); - } DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - arc_change_state(new_state, hdr, hash_lock); - - hdr->b_l1hdr.b_mru_ghost_hits++; - ARCSTAT_BUMP(arcstat_mru_ghost_hits); + arc_change_state(new_state, hdr); } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* - * This buffer has been accessed more than once and is - * still in the cache. Keep it in the MFU state. - * - * NOTE: an add_reference() that occurred when we did - * the arc_read() will have kicked this off the list. - * If it was a prefetch, we will explicitly move it to - * the head of the list now. + * This buffer has been accessed more than once and either + * still in the cache or being restored from one of ghosts. */ - - hdr->b_l1hdr.b_mfu_hits++; - ARCSTAT_BUMP(arcstat_mfu_hits); - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + if (!HDR_IO_IN_PROGRESS(hdr)) { + hdr->b_l1hdr.b_mfu_hits++; + ARCSTAT_BUMP(arcstat_mfu_hits); + } + hdr->b_l1hdr.b_arc_access = now; } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { - arc_state_t *new_state = arc_mfu; /* - * This buffer has been accessed more than once but has - * been evicted from the cache. Move it back to the - * MFU state. + * This buffer has been accessed more than once recently, but + * has been evicted from the cache. Would we have bigger MFU + * it would stay in cache, so move it back to MFU state. */ - - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { - /* - * This is a prefetch access... - * move this block back to the MRU state. - */ - new_state = arc_mru; - } - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(new_state, hdr, hash_lock); - hdr->b_l1hdr.b_mfu_ghost_hits++; ARCSTAT_BUMP(arcstat_mfu_ghost_hits); + hdr->b_l1hdr.b_arc_access = now; + wmsum_add(&arc_mfu_ghost->arcs_hits[arc_buf_type(hdr)], + arc_hdr_size(hdr)); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mfu, hdr); + } else if (hdr->b_l1hdr.b_state == arc_uncached) { + /* + * This buffer is uncacheable, but we got a hit. Probably + * a demand read after prefetch. Nothing more to do here. 
+ */ + if (!HDR_IO_IN_PROGRESS(hdr)) + ARCSTAT_BUMP(arcstat_uncached_hits); + hdr->b_l1hdr.b_arc_access = now; } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* - * This buffer is on the 2nd Level ARC. + * This buffer is on the 2nd Level ARC and was not accessed + * for a long time, so treat it as new and put into MRU. */ - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mfu, hdr, hash_lock); + hdr->b_l1hdr.b_arc_access = now; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mru, hdr); } else { cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_l1hdr.b_state); @@ -5556,7 +5163,6 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) void arc_buf_access(arc_buf_t *buf) { - mutex_enter(&buf->b_evict_lock); arc_buf_hdr_t *hdr = buf->b_hdr; /* @@ -5564,54 +5170,51 @@ arc_buf_access(arc_buf_t *buf) * The header must be checked again under the hash_lock in order * to handle the case where it is concurrently being released. */ - if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { - mutex_exit(&buf->b_evict_lock); + if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) return; - } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { mutex_exit(hash_lock); - mutex_exit(&buf->b_evict_lock); ARCSTAT_BUMP(arcstat_access_skip); return; } - mutex_exit(&buf->b_evict_lock); - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); + hdr->b_l1hdr.b_state == arc_mfu || + hdr->b_l1hdr.b_state == arc_uncached); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); + arc_access(hdr, 0, B_TRUE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr) && !HDR_PRESCIENT_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); + ARCSTAT_CONDSTAT(B_TRUE /* demand */, demand, prefetch, + !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } /* a generic arc_read_done_func_t which you can use */ -/* ARGSUSED */ void arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { + (void) zio, (void) zb, (void) bp; + if (buf == NULL) return; - bcopy(buf->b_data, arg, arc_buf_size(buf)); + memcpy(arg, buf->b_data, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } /* a generic arc_read_done_func_t */ -/* ARGSUSED */ void arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { + (void) zb, (void) bp; arc_buf_t **bufp = arg; if (buf == NULL) { @@ -5649,7 +5252,6 @@ arc_read_done(zio_t *zio) kmutex_t *hash_lock = NULL; arc_callback_t *callback_list; arc_callback_t *acb; - boolean_t freeable = B_FALSE; /* * The hdr was inserted into hash-table and removed from lists @@ -5662,7 +5264,7 @@ arc_read_done(zio_t *zio) if (HDR_IN_HASH_TABLE(hdr)) { arc_buf_hdr_t *found; - ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); + ASSERT3U(hdr->b_birth, ==, BP_GET_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, @@ -5682,17 +5284,20 @@ arc_read_done(zio_t *zio) zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); - if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { - void *tmpbuf; - - tmpbuf = abd_borrow_buf_copy(zio->io_abd, - sizeof (zil_chain_t)); - zio_crypt_decode_mac_zil(tmpbuf, - hdr->b_crypt_hdr.b_mac); - abd_return_buf(zio->io_abd, tmpbuf, - sizeof 
(zil_chain_t)); - } else { - zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); + if (zio->io_error == 0) { + if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { + void *tmpbuf; + + tmpbuf = abd_borrow_buf_copy(zio->io_abd, + sizeof (zil_chain_t)); + zio_crypt_decode_mac_zil(tmpbuf, + hdr->b_crypt_hdr.b_mac); + abd_return_buf(zio->io_abd, tmpbuf, + sizeof (zil_chain_t)); + } else { + zio_crypt_decode_mac_bp(bp, + hdr->b_crypt_hdr.b_mac); + } } } @@ -5719,17 +5324,7 @@ arc_read_done(zio_t *zio) callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); - - if (hash_lock && zio->io_error == 0 && - hdr->b_l1hdr.b_state == arc_anon) { - /* - * Only call arc_access on anonymous buffers. This is because - * if we've issued an I/O for an evicted buffer, we've already - * called arc_access (to prevent any simultaneous readers from - * getting confused). - */ - arc_access(hdr, hash_lock); - } + hdr->b_l1hdr.b_acb = NULL; /* * If a read request has a callback (i.e. acb_done is not NULL), then we @@ -5739,6 +5334,10 @@ arc_read_done(zio_t *zio) */ int callback_cnt = 0; for (acb = callback_list; acb != NULL; acb = acb->acb_next) { + + /* We need the last one to call below in original order. */ + callback_list = acb; + if (!acb->acb_done || acb->acb_nobuf) continue; @@ -5767,7 +5366,8 @@ arc_read_done(zio_t *zio) ASSERT(BP_IS_PROTECTED(bp)); error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { - spa_log_error(zio->io_spa, &acb->acb_zb); + spa_log_error(zio->io_spa, &acb->acb_zb, + BP_GET_LOGICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, zio->io_spa, NULL, &acb->acb_zb, zio, 0); @@ -5802,44 +5402,21 @@ arc_read_done(zio_t *zio) */ ASSERT(callback_cnt < 2 || hash_lock != NULL); - hdr->b_l1hdr.b_acb = NULL; - arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - if (callback_cnt == 0) - ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); - - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || - callback_list != NULL); - if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); - freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } - /* - * Broadcast before we drop the hash_lock to avoid the possibility - * that the hdr (and hence the cv) might be freed before we get to - * the cv_broadcast(). - */ - cv_broadcast(&hdr->b_l1hdr.b_cv); + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + (void) remove_reference(hdr, hdr); - if (hash_lock != NULL) { + if (hash_lock != NULL) mutex_exit(hash_lock); - } else { - /* - * This block was freed while we waited for the read to - * complete. It has been removed from the hash table and - * moved to the anonymous state (so that it won't show up - * in the cache). 
- */ - ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); - freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); - } /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { @@ -5863,12 +5440,18 @@ arc_read_done(zio_t *zio) zio_nowait(acb->acb_zio_dummy); } - callback_list = acb->acb_next; - kmem_free(acb, sizeof (arc_callback_t)); + callback_list = acb->acb_prev; + if (acb->acb_wait) { + mutex_enter(&acb->acb_wait_lock); + acb->acb_wait_error = zio->io_error; + acb->acb_wait = B_FALSE; + cv_signal(&acb->acb_wait_cv); + mutex_exit(&acb->acb_wait_lock); + /* acb will be freed by the waiting thread. */ + } else { + kmem_free(acb, sizeof (arc_callback_t)); + } } - - if (freeable) - arc_hdr_destroy(hdr); } /* @@ -5905,6 +5488,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp); boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF; + arc_buf_t *buf = NULL; int rc = 0; ASSERT(!embedded_bp || @@ -5931,10 +5515,10 @@ top: * and treat it as a checksum error. This allows an alternate blkptr * to be tried when one is available (e.g. ditto blocks). */ - if (!zfs_blkptr_verify(spa, bp, zio_flags & ZIO_FLAG_CONFIG_WRITER, - BLK_VERIFY_LOG)) { + if (!zfs_blkptr_verify(spa, bp, (zio_flags & ZIO_FLAG_CONFIG_WRITER) ? + BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { rc = SET_ERROR(ECKSUM); - goto out; + goto done; } if (!embedded_bp) { @@ -5954,19 +5538,17 @@ top: */ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) || (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) { - arc_buf_t *buf = NULL; - *arc_flags |= ARC_FLAG_CACHED; + boolean_t is_data = !HDR_ISTYPE_METADATA(hdr); if (HDR_IO_IN_PROGRESS(hdr)) { - zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; - if (*arc_flags & ARC_FLAG_CACHED_ONLY) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_cached_only_in_progress); rc = SET_ERROR(ENOENT); - goto out; + goto done; } + zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { @@ -5980,21 +5562,28 @@ top: arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_async_upgrade_sync); } - if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREDICTIVE_PREFETCH); - } - if (*arc_flags & ARC_FLAG_WAIT) { - cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); - mutex_exit(hash_lock); - goto top; - } - ASSERT(*arc_flags & ARC_FLAG_NOWAIT); - - if (done) { - arc_callback_t *acb = NULL; + DTRACE_PROBE1(arc__iohit, arc_buf_hdr_t *, hdr); + arc_access(hdr, *arc_flags, B_FALSE); + /* + * If there are multiple threads reading the same block + * and that block is not yet in the ARC, then only one + * thread will do the physical I/O and all other + * threads will wait until that I/O completes. + * Synchronous reads use the acb_wait_cv whereas nowait + * reads register a callback. Both are signalled/called + * in arc_read_done. + * + * Errors of the physical I/O may need to be propagated. + * Synchronous read errors are returned here from + * arc_read_done via acb_wait_error. Nowait reads + * attach the acb_zio_dummy zio to pio and + * arc_read_done propagates the physical I/O's io_error + * to acb_zio_dummy, and thereby to pio. 
+ */ + arc_callback_t *acb = NULL; + if (done || pio || *arc_flags & ARC_FLAG_WAIT) { acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; @@ -6003,46 +5592,52 @@ top: acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_nobuf = no_buf; + if (*arc_flags & ARC_FLAG_WAIT) { + acb->acb_wait = B_TRUE; + mutex_init(&acb->acb_wait_lock, NULL, + MUTEX_DEFAULT, NULL); + cv_init(&acb->acb_wait_cv, NULL, + CV_DEFAULT, NULL); + } acb->acb_zb = *zb; - if (pio != NULL) + if (pio != NULL) { acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); - - ASSERT3P(acb->acb_done, !=, NULL); + } acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; + hdr->b_l1hdr.b_acb->acb_prev = acb; hdr->b_l1hdr.b_acb = acb; } mutex_exit(hash_lock); + + ARCSTAT_BUMP(arcstat_iohits); + ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), + demand, prefetch, is_data, data, metadata, iohits); + + if (*arc_flags & ARC_FLAG_WAIT) { + mutex_enter(&acb->acb_wait_lock); + while (acb->acb_wait) { + cv_wait(&acb->acb_wait_cv, + &acb->acb_wait_lock); + } + rc = acb->acb_wait_error; + mutex_exit(&acb->acb_wait_lock); + mutex_destroy(&acb->acb_wait_lock); + cv_destroy(&acb->acb_wait_cv); + kmem_free(acb, sizeof (arc_callback_t)); + } goto out; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); + hdr->b_l1hdr.b_state == arc_mfu || + hdr->b_l1hdr.b_state == arc_uncached); - if (done && !no_buf) { - if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - /* - * This is a demand read which does not have to - * wait for i/o because we did a predictive - * prefetch i/o for it, which has completed. - */ - DTRACE_PROBE1( - arc__demand__hit__predictive__prefetch, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP( - arcstat_demand_hit_predictive_prefetch); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREDICTIVE_PREFETCH); - } - - if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { - ARCSTAT_BUMP( - arcstat_demand_hit_prescient_prefetch); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PRESCIENT_PREFETCH); - } + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); + arc_access(hdr, *arc_flags, B_TRUE); + if (done && !no_buf) { ASSERT(!embedded_bp || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. 
*/ @@ -6057,44 +5652,28 @@ top: */ rc = SET_ERROR(EIO); if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { - spa_log_error(spa, zb); + spa_log_error(spa, zb, hdr->b_birth); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } } if (rc != 0) { - (void) remove_reference(hdr, hash_lock, - private); arc_buf_destroy_impl(buf); buf = NULL; + (void) remove_reference(hdr, private); } /* assert any errors weren't due to unloaded keys */ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc != EACCES); - } else if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); } - DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); - if (*arc_flags & ARC_FLAG_L2CACHE) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), - data, metadata, hits); - - if (done) - done(NULL, zb, bp, buf, private); + ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), + demand, prefetch, is_data, data, metadata, hits); + *arc_flags |= ARC_FLAG_CACHED; + goto done; } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); @@ -6105,12 +5684,13 @@ top: uint64_t size; abd_t *hdr_abd; int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; + arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); if (*arc_flags & ARC_FLAG_CACHED_ONLY) { - rc = SET_ERROR(ENOENT); if (hash_lock != NULL) mutex_exit(hash_lock); - goto out; + rc = SET_ERROR(ENOENT); + goto done; } if (hdr == NULL) { @@ -6119,13 +5699,12 @@ top: * embedded data. */ arc_buf_hdr_t *exists = NULL; - arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); - hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, + hdr = arc_hdr_alloc(guid, psize, lsize, BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type); if (!embedded_bp) { hdr->b_dva = *BP_IDENTITY(bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(bp); + hdr->b_birth = BP_GET_BIRTH(bp); exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { @@ -6135,7 +5714,6 @@ top: arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } - alloc_flags |= ARC_HDR_DO_ADAPT; } else { /* * This block is in the ghost cache or encrypted data @@ -6155,7 +5733,9 @@ top: ASSERT0(zfs_refcount_count( &hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); +#ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); +#endif } else if (HDR_IO_IN_PROGRESS(hdr)) { /* * If this header already had an IO in progress @@ -6166,25 +5746,47 @@ top: * and so the performance impact shouldn't * matter. 
*/ - cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); + arc_callback_t *acb = kmem_zalloc( + sizeof (arc_callback_t), KM_SLEEP); + acb->acb_wait = B_TRUE; + mutex_init(&acb->acb_wait_lock, NULL, + MUTEX_DEFAULT, NULL); + cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT, + NULL); + acb->acb_zio_head = + hdr->b_l1hdr.b_acb->acb_zio_head; + acb->acb_next = hdr->b_l1hdr.b_acb; + hdr->b_l1hdr.b_acb->acb_prev = acb; + hdr->b_l1hdr.b_acb = acb; mutex_exit(hash_lock); + mutex_enter(&acb->acb_wait_lock); + while (acb->acb_wait) { + cv_wait(&acb->acb_wait_cv, + &acb->acb_wait_lock); + } + mutex_exit(&acb->acb_wait_lock); + mutex_destroy(&acb->acb_wait_lock); + cv_destroy(&acb->acb_wait_cv); + kmem_free(acb, sizeof (arc_callback_t)); goto top; } - - /* - * This is a delicate dance that we play here. - * This hdr might be in the ghost list so we access - * it to move it out of the ghost list before we - * initiate the read. If it's a prefetch then - * it won't have a callback so we'll remove the - * reference that arc_buf_alloc_impl() created. We - * do this after we've called arc_access() to - * avoid hitting an assert in remove_reference(). - */ - arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); - arc_access(hdr, hash_lock); + } + if (*arc_flags & ARC_FLAG_UNCACHED) { + arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED); + if (!encrypted_read) + alloc_flags |= ARC_HDR_ALLOC_LINEAR; } + /* + * Take additional reference for IO_IN_PROGRESS. It stops + * arc_access() from putting this header without any buffers + * and so other references but obviously nonevictable onto + * the evictable list of MRU or MFU state. + */ + add_reference(hdr, hdr); + if (!embedded_bp) + arc_access(hdr, *arc_flags, B_FALSE); + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); arc_hdr_alloc_abd(hdr, alloc_flags); if (encrypted_read) { ASSERT(HDR_HAS_RABD(hdr)); @@ -6211,24 +5813,10 @@ top: zio_flags |= ZIO_FLAG_RAW_ENCRYPT; } - if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); - } - if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); - if (*arc_flags & ARC_FLAG_L2CACHE) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (BP_IS_AUTHENTICATED(bp)) arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); - if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); @@ -6241,7 +5829,6 @@ top: ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; - arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { @@ -6282,7 +5869,7 @@ top: blkptr_t *, bp, uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); zfs_racct_read(size, 1); @@ -6300,11 +5887,9 @@ top: * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. - * 5. This isn't prefetch or l2arc_noprefetch is 0. 
*/ if (HDR_HAS_L2HDR(hdr) && - !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && - !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { + !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { l2arc_read_callback_t *cb; abd_t *abd; uint64_t asize; @@ -6356,8 +5941,7 @@ top: asize, abd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | + zio_flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); acb->acb_zio_head = rzio; @@ -6436,6 +6020,16 @@ out: spa_read_history_add(spa, zb, *arc_flags); spl_fstrans_unmark(cookie); return (rc); + +done: + if (done) + done(NULL, zb, bp, buf, private); + if (pio && rc != 0) { + zio_t *zio = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); + zio->io_error = rc; + zio_nowait(zio); + } + goto out; } arc_prune_t * @@ -6476,6 +6070,56 @@ arc_remove_prune_callback(arc_prune_t *p) } /* + * Helper function for arc_prune_async() it is responsible for safely + * handling the execution of a registered arc_prune_func_t. + */ +static void +arc_prune_task(void *ptr) +{ + arc_prune_t *ap = (arc_prune_t *)ptr; + arc_prune_func_t *func = ap->p_pfunc; + + if (func != NULL) + func(ap->p_adjust, ap->p_private); + + (void) zfs_refcount_remove(&ap->p_refcnt, func); +} + +/* + * Notify registered consumers they must drop holds on a portion of the ARC + * buffers they reference. This provides a mechanism to ensure the ARC can + * honor the metadata limit and reclaim otherwise pinned ARC buffers. + * + * This operation is performed asynchronously so it may be safely called + * in the context of the arc_reclaim_thread(). A reference is taken here + * for each registered arc_prune_t and the arc_prune_task() is responsible + * for releasing it once the registered arc_prune_func_t has completed. + */ +static void +arc_prune_async(uint64_t adjust) +{ + arc_prune_t *ap; + + mutex_enter(&arc_prune_mtx); + for (ap = list_head(&arc_prune_list); ap != NULL; + ap = list_next(&arc_prune_list, ap)) { + + if (zfs_refcount_count(&ap->p_refcnt) >= 2) + continue; + + zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); + ap->p_adjust = adjust; + if (taskq_dispatch(arc_prune_taskq, arc_prune_task, + ap, TQ_SLEEP) == TASKQID_INVALID) { + (void) zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); + continue; + } + ARCSTAT_BUMP(arcstat_prune); + } + mutex_exit(&arc_prune_mtx); +} + +/* * Notify the arc that a block was freed, and thus will never be used again. */ void @@ -6493,10 +6137,8 @@ arc_freed(spa_t *spa, const blkptr_t *bp) /* * We might be trying to free a block that is still doing I/O - * (i.e. prefetch) or has a reference (i.e. a dedup-ed, - * dmu_sync-ed block). If this block is being prefetched, then it - * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr - * until the I/O completes. A block may also have a reference if it is + * (i.e. prefetch) or has some other reference (i.e. a dedup-ed, + * dmu_sync-ed block). A block may also have a reference if it is * part of a dedup-ed, dmu_synced write. The dmu_sync() function would * have written the new block to its final resting place on disk but * without the dedup flag set. This would have left the hdr in the MRU @@ -6513,9 +6155,9 @@ arc_freed(spa_t *spa, const blkptr_t *bp) * freed. So if we have an I/O in progress, or a reference to * this hdr, then we don't destroy the hdr. 
*/ - if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { - arc_change_state(arc_anon, hdr, hash_lock); + if (!HDR_HAS_L1HDR(hdr) || + zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); mutex_exit(hash_lock); } else { @@ -6531,7 +6173,7 @@ arc_freed(spa_t *spa, const blkptr_t *bp) * a new hdr for the buffer. */ void -arc_release(arc_buf_t *buf, void *tag) +arc_release(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; @@ -6541,8 +6183,6 @@ arc_release(arc_buf_t *buf, void *tag) * But we don't know that information at this level. */ - mutex_enter(&buf->b_evict_lock); - ASSERT(HDR_HAS_L1HDR(hdr)); /* @@ -6551,14 +6191,14 @@ arc_release(arc_buf_t *buf, void *tag) * linked into the hash table. */ if (hdr->b_l1hdr.b_state == arc_anon) { - mutex_exit(&buf->b_evict_lock); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); + ASSERT(ARC_BUF_LAST(buf)); ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); hdr->b_l1hdr.b_arc_access = 0; @@ -6607,7 +6247,7 @@ arc_release(arc_buf_t *buf, void *tag) /* * Do we have more than one buf? */ - if (hdr->b_l1hdr.b_bufcnt > 1) { + if (hdr->b_l1hdr.b_buf != buf || !ARC_BUF_LAST(buf)) { arc_buf_hdr_t *nhdr; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); @@ -6618,9 +6258,9 @@ arc_release(arc_buf_t *buf, void *tag) VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); - (void) remove_reference(hdr, hash_lock, tag); + VERIFY3S(remove_reference(hdr, tag), >, 0); - if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { + if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(buf)); } @@ -6637,9 +6277,9 @@ arc_release(arc_buf_t *buf, void *tag) * If the current arc_buf_t and the hdr are sharing their data * buffer, then we must stop sharing that block. */ - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); - VERIFY(!arc_buf_is_shared(lastbuf)); + ASSERT(!arc_buf_is_shared(lastbuf)); /* * First, sever the block sharing relationship between @@ -6656,7 +6296,7 @@ arc_release(arc_buf_t *buf, void *tag) if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + arc_hdr_alloc_abd(hdr, 0); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } @@ -6672,13 +6312,13 @@ arc_release(arc_buf_t *buf, void *tag) */ ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); - ASSERT(!ARC_BUF_SHARED(buf)); + ASSERT(!arc_buf_is_shared(buf)); } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT3P(state, !=, arc_l2c_only); - (void) zfs_refcount_remove_many(&state->arcs_size, + (void) zfs_refcount_remove_many(&state->arcs_size[type], arc_buf_size(buf), buf); if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { @@ -6688,10 +6328,6 @@ arc_release(arc_buf_t *buf, void *tag) arc_buf_size(buf), buf); } - hdr->b_l1hdr.b_bufcnt -= 1; - if (ARC_BUF_ENCRYPTED(buf)) - hdr->b_crypt_hdr.b_ebufcnt -= 1; - arc_cksum_verify(buf); arc_buf_unwatch(buf); @@ -6701,30 +6337,20 @@ arc_release(arc_buf_t *buf, void *tag) mutex_exit(hash_lock); - /* - * Allocate a new hdr. 
The new hdr will contain a b_pabd - * buffer which will be freed in arc_write(). - */ nhdr = arc_hdr_alloc(spa, psize, lsize, protected, compress, hdr->b_complevel, type); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); VERIFY3U(nhdr->b_type, ==, type); ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; - nhdr->b_l1hdr.b_bufcnt = 1; - if (ARC_BUF_ENCRYPTED(buf)) - nhdr->b_crypt_hdr.b_ebufcnt = 1; (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; - mutex_exit(&buf->b_evict_lock); - (void) zfs_refcount_add_many(&arc_anon->arcs_size, + (void) zfs_refcount_add_many(&arc_anon->arcs_size[type], arc_buf_size(buf), buf); } else { - mutex_exit(&buf->b_evict_lock); ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); /* protected by hash lock, or hdr is on arc_anon */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); @@ -6733,7 +6359,7 @@ arc_release(arc_buf_t *buf, void *tag) hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); @@ -6745,25 +6371,15 @@ arc_release(arc_buf_t *buf, void *tag) int arc_released(arc_buf_t *buf) { - int released; - - mutex_enter(&buf->b_evict_lock); - released = (buf->b_data != NULL && + return (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_state == arc_anon); - mutex_exit(&buf->b_evict_lock); - return (released); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { - int referenced; - - mutex_enter(&buf->b_evict_lock); - referenced = (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); - mutex_exit(&buf->b_evict_lock); - return (referenced); + return (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); } #endif @@ -6779,7 +6395,7 @@ arc_write_ready(zio_t *zio) ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); /* * If we're reexecuting this zio because the pool suspended, then @@ -6790,9 +6406,10 @@ arc_write_ready(zio_t *zio) arc_cksum_free(hdr); arc_buf_unwatch(buf); if (hdr->b_l1hdr.b_pabd != NULL) { - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { arc_unshare_buf(hdr, buf); } else { + ASSERT(!arc_buf_is_shared(buf)); arc_hdr_free_abd(hdr, B_FALSE); } } @@ -6807,18 +6424,16 @@ arc_write_ready(zio_t *zio) callback->awcb_ready(zio, buf, callback->awcb_private); - if (HDR_IO_IN_PROGRESS(hdr)) + if (HDR_IO_IN_PROGRESS(hdr)) { ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); - - arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - - if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr)) - hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp)); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + add_reference(hdr, hdr); /* For IO_IN_PROGRESS. 
*/ + } if (BP_IS_PROTECTED(bp)) { /* ZIL blocks are written through zio_rewrite */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); - ASSERT(HDR_PROTECTED(hdr)); if (BP_SHOULD_BYTESWAP(bp)) { if (BP_GET_LEVEL(bp) > 0) { @@ -6831,11 +6446,14 @@ arc_write_ready(zio_t *zio) hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); + } else { + arc_hdr_clear_flags(hdr, ARC_FLAG_PROTECTED); } /* @@ -6886,10 +6504,11 @@ arc_write_ready(zio_t *zio) if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_ALLOC_RDATA | + arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); - } else if (!abd_size_alloc_linear(arc_buf_size(buf)) || + } else if (!(HDR_UNCACHED(hdr) || + abd_size_alloc_linear(arc_buf_size(buf))) || !arc_can_share(hdr, buf)) { /* * Ideally, we would always copy the io_abd into b_pabd, but the @@ -6898,26 +6517,25 @@ arc_write_ready(zio_t *zio) */ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | - ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); + arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | + ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | - ARC_HDR_USE_RESERVE); + arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | - ARC_HDR_USE_RESERVE); + arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } } else { ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); + ASSERT(ARC_BUF_LAST(buf)); arc_share_buf(hdr, buf); } @@ -6936,18 +6554,6 @@ arc_write_children_ready(zio_t *zio) callback->awcb_children_ready(zio, buf, callback->awcb_private); } -/* - * The SPA calls this callback for each physical write that happens on behalf - * of a logical write. See the comment in dbuf_write_physdone() for details. 
- */ -static void -arc_write_physdone(zio_t *zio) -{ - arc_write_callback_t *cb = zio->io_private; - if (cb->awcb_physdone != NULL) - cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); -} - static void arc_write_done(zio_t *zio) { @@ -6964,7 +6570,7 @@ arc_write_done(zio_t *zio) buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); + hdr->b_birth = BP_GET_BIRTH(zio->io_bp); } } else { ASSERT(HDR_EMPTY(hdr)); @@ -6997,7 +6603,7 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); ASSERT(zfs_refcount_is_zero( &exists->b_l1hdr.b_refcnt)); - arc_change_state(arc_anon, exists, hash_lock); + arc_change_state(arc_anon, exists); arc_hdr_destroy(exists); mutex_exit(hash_lock); exists = buf_hash_insert(hdr, &hash_lock); @@ -7010,22 +6616,24 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); } else { /* Dedup */ - ASSERT(hdr->b_l1hdr.b_bufcnt == 1); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); + ASSERT(ARC_BUF_LAST(hdr->b_l1hdr.b_buf)); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + VERIFY3S(remove_reference(hdr, hdr), >, 0); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) - arc_access(hdr, hash_lock); + arc_access(hdr, 0, B_FALSE); mutex_exit(hash_lock); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + VERIFY3S(remove_reference(hdr, hdr), >, 0); } - ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); abd_free(zio->io_abd); @@ -7034,11 +6642,11 @@ arc_write_done(zio_t *zio) zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, + blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, - arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, - arc_write_done_func_t *done, void *private, zio_priority_t priority, - int zio_flags, const zbookmark_phys_t *zb) + arc_write_done_func_t *children_ready, arc_write_done_func_t *done, + void *private, zio_priority_t priority, int zio_flags, + const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; @@ -7050,8 +6658,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); - if (l2arc) + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); + if (uncached) + arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED); + else if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (ARC_BUF_ENCRYPTED(buf)) { @@ -7062,11 +6672,11 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, localprop.zp_byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? 
ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; - bcopy(hdr->b_crypt_hdr.b_salt, localprop.zp_salt, + memcpy(localprop.zp_salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); - bcopy(hdr->b_crypt_hdr.b_iv, localprop.zp_iv, + memcpy(localprop.zp_iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); - bcopy(hdr->b_crypt_hdr.b_mac, localprop.zp_mac, + memcpy(localprop.zp_mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) { localprop.zp_nopwrite = B_FALSE; @@ -7083,7 +6693,6 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; - callback->awcb_physdone = physdone; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; @@ -7099,9 +6708,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, * The hdr will remain with a NULL data pointer and the * buf will take sole ownership of the block. */ - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { arc_unshare_buf(hdr, buf); } else { + ASSERT(!arc_buf_is_shared(buf)); arc_hdr_free_abd(hdr, B_FALSE); } VERIFY3P(buf->b_data, !=, NULL); @@ -7120,8 +6730,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, - arc_write_physdone, arc_write_done, callback, - priority, zio_flags, zb); + arc_write_done, callback, priority, zio_flags, zb); return (zio); } @@ -7162,7 +6771,9 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) /* assert that it has not wrapped around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); - anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) - + anon_size = MAX((int64_t) + (zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]) + + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]) - arc_loaned_bytes), 0); /* @@ -7218,9 +6829,14 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, + kstat_named_t *data, kstat_named_t *metadata, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { - size->value.ui64 = zfs_refcount_count(&state->arcs_size); + data->value.ui64 = + zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]); + metadata->value.ui64 = + zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); + size->value.ui64 = data->value.ui64 + metadata->value.ui64; evict_data->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); evict_metadata->value.ui64 = @@ -7237,22 +6853,32 @@ arc_kstat_update(kstat_t *ksp, int rw) as->arcstat_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_hits); + as->arcstat_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_iohits); as->arcstat_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_misses); as->arcstat_demand_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_hits); + as->arcstat_demand_data_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_data_iohits); as->arcstat_demand_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_misses); as->arcstat_demand_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_hits); + as->arcstat_demand_metadata_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_metadata_iohits); as->arcstat_demand_metadata_misses.value.ui64 = 
wmsum_value(&arc_sums.arcstat_demand_metadata_misses); as->arcstat_prefetch_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_hits); + as->arcstat_prefetch_data_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_prefetch_data_iohits); as->arcstat_prefetch_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_misses); as->arcstat_prefetch_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits); + as->arcstat_prefetch_metadata_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_prefetch_metadata_iohits); as->arcstat_prefetch_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses); as->arcstat_mru_hits.value.ui64 = @@ -7263,6 +6889,8 @@ arc_kstat_update(kstat_t *ksp, int rw) wmsum_value(&arc_sums.arcstat_mfu_hits); as->arcstat_mfu_ghost_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mfu_ghost_hits); + as->arcstat_uncached_hits.value.ui64 = + wmsum_value(&arc_sums.arcstat_uncached_hits); as->arcstat_deleted.value.ui64 = wmsum_value(&arc_sums.arcstat_deleted); as->arcstat_mutex_miss.value.ui64 = @@ -7308,33 +6936,49 @@ arc_kstat_update(kstat_t *ksp, int rw) #if defined(COMPAT_FREEBSD11) as->arcstat_other_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size) + - aggsum_value(&arc_sums.arcstat_dnode_size) + + wmsum_value(&arc_sums.arcstat_dnode_size) + wmsum_value(&arc_sums.arcstat_dbuf_size); #endif arc_kstat_update_state(arc_anon, &as->arcstat_anon_size, + &as->arcstat_anon_data, + &as->arcstat_anon_metadata, &as->arcstat_anon_evictable_data, &as->arcstat_anon_evictable_metadata); arc_kstat_update_state(arc_mru, &as->arcstat_mru_size, + &as->arcstat_mru_data, + &as->arcstat_mru_metadata, &as->arcstat_mru_evictable_data, &as->arcstat_mru_evictable_metadata); arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size, + &as->arcstat_mru_ghost_data, + &as->arcstat_mru_ghost_metadata, &as->arcstat_mru_ghost_evictable_data, &as->arcstat_mru_ghost_evictable_metadata); arc_kstat_update_state(arc_mfu, &as->arcstat_mfu_size, + &as->arcstat_mfu_data, + &as->arcstat_mfu_metadata, &as->arcstat_mfu_evictable_data, &as->arcstat_mfu_evictable_metadata); arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size, + &as->arcstat_mfu_ghost_data, + &as->arcstat_mfu_ghost_metadata, &as->arcstat_mfu_ghost_evictable_data, &as->arcstat_mfu_ghost_evictable_metadata); + arc_kstat_update_state(arc_uncached, + &as->arcstat_uncached_size, + &as->arcstat_uncached_data, + &as->arcstat_uncached_metadata, + &as->arcstat_uncached_evictable_data, + &as->arcstat_uncached_evictable_metadata); as->arcstat_dnode_size.value.ui64 = - aggsum_value(&arc_sums.arcstat_dnode_size); + wmsum_value(&arc_sums.arcstat_dnode_size); as->arcstat_bonus_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size); as->arcstat_l2_hits.value.ui64 = @@ -7432,13 +7076,21 @@ arc_kstat_update(kstat_t *ksp, int rw) as->arcstat_prune.value.ui64 = wmsum_value(&arc_sums.arcstat_prune); as->arcstat_meta_used.value.ui64 = - aggsum_value(&arc_sums.arcstat_meta_used); + wmsum_value(&arc_sums.arcstat_meta_used); as->arcstat_async_upgrade_sync.value.ui64 = wmsum_value(&arc_sums.arcstat_async_upgrade_sync); + as->arcstat_predictive_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_predictive_prefetch); as->arcstat_demand_hit_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch); + as->arcstat_demand_iohit_predictive_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_iohit_predictive_prefetch); + 
as->arcstat_prescient_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_prescient_prefetch); as->arcstat_demand_hit_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch); + as->arcstat_demand_iohit_prescient_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_iohit_prescient_prefetch); as->arcstat_raw_size.value.ui64 = wmsum_value(&arc_sums.arcstat_raw_size); as->arcstat_cached_only_in_progress.value.ui64 = @@ -7510,7 +7162,6 @@ void arc_tuning_update(boolean_t verbose) { uint64_t allmem = arc_all_memory(); - unsigned long limit; /* Valid range: 32M - <arc_c_max> */ if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) && @@ -7527,44 +7178,15 @@ arc_tuning_update(boolean_t verbose) (zfs_arc_max > arc_c_min)) { arc_c_max = zfs_arc_max; arc_c = MIN(arc_c, arc_c_max); - arc_p = (arc_c >> 1); - if (arc_meta_limit > arc_c_max) - arc_meta_limit = arc_c_max; - if (arc_dnode_size_limit > arc_meta_limit) - arc_dnode_size_limit = arc_meta_limit; + if (arc_dnode_limit > arc_c_max) + arc_dnode_limit = arc_c_max; } WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose); - /* Valid range: 16M - <arc_c_max> */ - if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) && - (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) && - (zfs_arc_meta_min <= arc_c_max)) { - arc_meta_min = zfs_arc_meta_min; - if (arc_meta_limit < arc_meta_min) - arc_meta_limit = arc_meta_min; - if (arc_dnode_size_limit < arc_meta_min) - arc_dnode_size_limit = arc_meta_min; - } - WARN_IF_TUNING_IGNORED(zfs_arc_meta_min, arc_meta_min, verbose); - - /* Valid range: <arc_meta_min> - <arc_c_max> */ - limit = zfs_arc_meta_limit ? zfs_arc_meta_limit : - MIN(zfs_arc_meta_limit_percent, 100) * arc_c_max / 100; - if ((limit != arc_meta_limit) && - (limit >= arc_meta_min) && - (limit <= arc_c_max)) - arc_meta_limit = limit; - WARN_IF_TUNING_IGNORED(zfs_arc_meta_limit, arc_meta_limit, verbose); - - /* Valid range: <arc_meta_min> - <arc_meta_limit> */ - limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit : - MIN(zfs_arc_dnode_limit_percent, 100) * arc_meta_limit / 100; - if ((limit != arc_dnode_size_limit) && - (limit >= arc_meta_min) && - (limit <= arc_meta_limit)) - arc_dnode_size_limit = limit; - WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_size_limit, - verbose); + /* Valid range: 0 - <all physical memory> */ + arc_dnode_limit = zfs_arc_dnode_limit ? 
zfs_arc_dnode_limit : + MIN(zfs_arc_dnode_limit_percent, 100) * arc_c_max / 100; + WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_limit, verbose); /* Valid range: 1 - N */ if (zfs_arc_grow_retry) @@ -7576,10 +7198,6 @@ arc_tuning_update(boolean_t verbose) arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1); } - /* Valid range: 1 - N */ - if (zfs_arc_p_min_shift) - arc_p_min_shift = zfs_arc_p_min_shift; - /* Valid range: 1 - N ms */ if (zfs_arc_min_prefetch_ms) arc_min_prefetch_ms = zfs_arc_min_prefetch_ms; @@ -7591,65 +7209,67 @@ arc_tuning_update(boolean_t verbose) } /* Valid range: 0 - 100 */ - if ((zfs_arc_lotsfree_percent >= 0) && - (zfs_arc_lotsfree_percent <= 100)) + if (zfs_arc_lotsfree_percent <= 100) arc_lotsfree_percent = zfs_arc_lotsfree_percent; WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent, verbose); /* Valid range: 0 - <all physical memory> */ if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free)) - arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), allmem); + arc_sys_free = MIN(zfs_arc_sys_free, allmem); WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose); } static void +arc_state_multilist_init(multilist_t *ml, + multilist_sublist_index_func_t *index_func, int *maxcountp) +{ + multilist_create(ml, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), index_func); + *maxcountp = MAX(*maxcountp, multilist_get_num_sublists(ml)); +} + +static void arc_state_init(void) { - multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); + int num_sublists = 0; + + arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_METADATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_DATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_DATA], + arc_state_multilist_index_func, &num_sublists); + 
arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_METADATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_DATA], + arc_state_multilist_index_func, &num_sublists); + /* * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. Special index function asserts that. */ - multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_l2c_multilist_index_func); - multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_l2c_multilist_index_func); + arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + arc_state_l2c_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + arc_state_l2c_multilist_index_func, &num_sublists); + + /* + * Keep track of the number of markers needed to reclaim buffers from + * any ARC state. The markers will be pre-allocated so as to minimize + * the number of memory allocations performed by the eviction thread. + */ + arc_state_evict_marker_count = num_sublists; zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); @@ -7663,28 +7283,49 @@ arc_state_init(void) zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); - - zfs_refcount_create(&arc_anon->arcs_size); - zfs_refcount_create(&arc_mru->arcs_size); - zfs_refcount_create(&arc_mru_ghost->arcs_size); - zfs_refcount_create(&arc_mfu->arcs_size); - zfs_refcount_create(&arc_mfu_ghost->arcs_size); - zfs_refcount_create(&arc_l2c_only->arcs_size); + zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); + + zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); + + wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA], 0); + wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA], 0); + wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA], 0); + wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA], 0); wmsum_init(&arc_sums.arcstat_hits, 0); + wmsum_init(&arc_sums.arcstat_iohits, 0); wmsum_init(&arc_sums.arcstat_misses, 0); 
wmsum_init(&arc_sums.arcstat_demand_data_hits, 0); + wmsum_init(&arc_sums.arcstat_demand_data_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_data_misses, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0); + wmsum_init(&arc_sums.arcstat_demand_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0); + wmsum_init(&arc_sums.arcstat_prefetch_data_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0); + wmsum_init(&arc_sums.arcstat_prefetch_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_mru_hits, 0); wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0); wmsum_init(&arc_sums.arcstat_mfu_hits, 0); wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0); + wmsum_init(&arc_sums.arcstat_uncached_hits, 0); wmsum_init(&arc_sums.arcstat_deleted, 0); wmsum_init(&arc_sums.arcstat_mutex_miss, 0); wmsum_init(&arc_sums.arcstat_access_skip, 0); @@ -7706,7 +7347,7 @@ arc_state_init(void) wmsum_init(&arc_sums.arcstat_data_size, 0); wmsum_init(&arc_sums.arcstat_metadata_size, 0); wmsum_init(&arc_sums.arcstat_dbuf_size, 0); - aggsum_init(&arc_sums.arcstat_dnode_size, 0); + wmsum_init(&arc_sums.arcstat_dnode_size, 0); wmsum_init(&arc_sums.arcstat_bonus_size, 0); wmsum_init(&arc_sums.arcstat_l2_hits, 0); wmsum_init(&arc_sums.arcstat_l2_misses, 0); @@ -7751,10 +7392,14 @@ arc_state_init(void) wmsum_init(&arc_sums.arcstat_memory_direct_count, 0); wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0); wmsum_init(&arc_sums.arcstat_prune, 0); - aggsum_init(&arc_sums.arcstat_meta_used, 0); + wmsum_init(&arc_sums.arcstat_meta_used, 0); wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0); + wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0); + wmsum_init(&arc_sums.arcstat_demand_iohit_predictive_prefetch, 0); + wmsum_init(&arc_sums.arcstat_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0); + wmsum_init(&arc_sums.arcstat_demand_iohit_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_raw_size, 0); wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0); wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0); @@ -7765,6 +7410,7 @@ arc_state_init(void) arc_mfu->arcs_state = ARC_STATE_MFU; arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; + arc_uncached->arcs_state = ARC_STATE_UNCACHED; } static void @@ -7782,13 +7428,23 @@ arc_state_fini(void) zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); - - zfs_refcount_destroy(&arc_anon->arcs_size); - zfs_refcount_destroy(&arc_mru->arcs_size); - zfs_refcount_destroy(&arc_mru_ghost->arcs_size); - zfs_refcount_destroy(&arc_mfu->arcs_size); - zfs_refcount_destroy(&arc_mfu_ghost->arcs_size); - zfs_refcount_destroy(&arc_l2c_only->arcs_size); + zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); + + zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); + 
zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); @@ -7800,21 +7456,34 @@ arc_state_fini(void) multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_DATA]); + + wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); + wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); + wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); + wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); wmsum_fini(&arc_sums.arcstat_hits); + wmsum_fini(&arc_sums.arcstat_iohits); wmsum_fini(&arc_sums.arcstat_misses); wmsum_fini(&arc_sums.arcstat_demand_data_hits); + wmsum_fini(&arc_sums.arcstat_demand_data_iohits); wmsum_fini(&arc_sums.arcstat_demand_data_misses); wmsum_fini(&arc_sums.arcstat_demand_metadata_hits); + wmsum_fini(&arc_sums.arcstat_demand_metadata_iohits); wmsum_fini(&arc_sums.arcstat_demand_metadata_misses); wmsum_fini(&arc_sums.arcstat_prefetch_data_hits); + wmsum_fini(&arc_sums.arcstat_prefetch_data_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_data_misses); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits); + wmsum_fini(&arc_sums.arcstat_prefetch_metadata_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses); wmsum_fini(&arc_sums.arcstat_mru_hits); wmsum_fini(&arc_sums.arcstat_mru_ghost_hits); wmsum_fini(&arc_sums.arcstat_mfu_hits); wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits); + wmsum_fini(&arc_sums.arcstat_uncached_hits); wmsum_fini(&arc_sums.arcstat_deleted); wmsum_fini(&arc_sums.arcstat_mutex_miss); wmsum_fini(&arc_sums.arcstat_access_skip); @@ -7836,7 +7505,7 @@ arc_state_fini(void) wmsum_fini(&arc_sums.arcstat_data_size); wmsum_fini(&arc_sums.arcstat_metadata_size); wmsum_fini(&arc_sums.arcstat_dbuf_size); - aggsum_fini(&arc_sums.arcstat_dnode_size); + wmsum_fini(&arc_sums.arcstat_dnode_size); wmsum_fini(&arc_sums.arcstat_bonus_size); wmsum_fini(&arc_sums.arcstat_l2_hits); wmsum_fini(&arc_sums.arcstat_l2_misses); @@ -7881,10 +7550,14 @@ arc_state_fini(void) wmsum_fini(&arc_sums.arcstat_memory_direct_count); wmsum_fini(&arc_sums.arcstat_memory_indirect_count); wmsum_fini(&arc_sums.arcstat_prune); - aggsum_fini(&arc_sums.arcstat_meta_used); + wmsum_fini(&arc_sums.arcstat_meta_used); wmsum_fini(&arc_sums.arcstat_async_upgrade_sync); + wmsum_fini(&arc_sums.arcstat_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch); + wmsum_fini(&arc_sums.arcstat_demand_iohit_predictive_prefetch); + wmsum_fini(&arc_sums.arcstat_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch); + wmsum_fini(&arc_sums.arcstat_demand_iohit_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_raw_size); 
wmsum_fini(&arc_sums.arcstat_cached_only_in_progress); wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size); @@ -7949,18 +7622,16 @@ arc_init(void) #endif arc_c = arc_c_min; - arc_p = (arc_c >> 1); - - /* Set min to 1/2 of arc_c_min */ - arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT; /* - * Set arc_meta_limit to a percent of arc_c_max with a floor of - * arc_meta_min, and a ceiling of arc_c_max. + * 32-bit fixed point fractions of metadata from total ARC size, + * MRU data from all data and MRU metadata from all metadata. */ - percent = MIN(zfs_arc_meta_limit_percent, 100); - arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100); + arc_meta = (1ULL << 32) / 4; /* Metadata is 25% of arc_c. */ + arc_pd = (1ULL << 32) / 2; /* Data MRU is 50% of data. */ + arc_pm = (1ULL << 32) / 2; /* Metadata MRU is 50% of metadata. */ + percent = MIN(zfs_arc_dnode_limit_percent, 100); - arc_dnode_size_limit = (percent * arc_meta_limit) / 100; + arc_dnode_limit = arc_c_max * percent / 100; /* Apply user specified tunings */ arc_tuning_update(B_TRUE); @@ -7981,9 +7652,8 @@ arc_init(void) offsetof(arc_prune_t, p_node)); mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); - arc_prune_taskq = taskq_create("arc_prune", 100, defclsyspri, - boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | - TASKQ_THREADS_CPU_PCT); + arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads, + defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); @@ -7994,8 +7664,10 @@ arc_init(void) kstat_install(arc_ksp); } - arc_evict_zthr = zthr_create("arc_evict", - arc_evict_cb_check, arc_evict_cb, NULL, defclsyspri); + arc_state_evict_markers = + arc_state_alloc_markers(arc_state_evict_marker_count); + arc_evict_zthr = zthr_create_timer("arc_evict", + arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1), defclsyspri); arc_reap_zthr = zthr_create_timer("arc_reap", arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri); @@ -8060,9 +7732,8 @@ arc_fini(void) taskq_destroy(arc_prune_taskq); mutex_enter(&arc_prune_mtx); - while ((p = list_head(&arc_prune_list)) != NULL) { - list_remove(&arc_prune_list, p); - zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); + while ((p = list_remove_head(&arc_prune_list)) != NULL) { + (void) zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } @@ -8073,6 +7744,8 @@ arc_fini(void) (void) zthr_cancel(arc_evict_zthr); (void) zthr_cancel(arc_reap_zthr); + arc_state_free_markers(arc_state_evict_markers, + arc_state_evict_marker_count); mutex_destroy(&arc_evict_lock); list_destroy(&arc_evict_waiters); @@ -8367,7 +8040,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) static uint64_t l2arc_write_size(l2arc_dev_t *dev) { - uint64_t size, dev_size, tsize; + uint64_t size; /* * Make sure our globals have meaningful values in case the user @@ -8375,38 +8048,33 @@ l2arc_write_size(l2arc_dev_t *dev) */ size = l2arc_write_max; if (size == 0) { - cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " - "be greater than zero, resetting it to the default (%d)", - L2ARC_WRITE_SIZE); + cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, " + "resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } if (arc_warm == B_FALSE) size += l2arc_write_boost; + /* We need to add in the worst case scenario of log block overhead. 
*/ + size += l2arc_log_blk_overhead(size, dev); + if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { + /* + * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) + * times the writesize, whichever is greater. + */ + size += MAX(64 * 1024 * 1024, + (size * l2arc_trim_ahead) / 100); + } + /* * Make sure the write size does not exceed the size of the cache * device. This is important in l2arc_evict(), otherwise infinite * iteration can occur. */ - dev_size = dev->l2ad_end - dev->l2ad_start; - tsize = size + l2arc_log_blk_overhead(size, dev); - if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) - tsize += MAX(64 * 1024 * 1024, - (tsize * l2arc_trim_ahead) / 100); + size = MIN(size, (dev->l2ad_end - dev->l2ad_start) / 4); - if (tsize >= dev_size) { - cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " - "plus the overhead of log blocks (persistent L2ARC, " - "%llu bytes) exceeds the size of the cache device " - "(guid %llu), resetting them to the default (%d)", - (u_longlong_t)l2arc_log_blk_overhead(size, dev), - (u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); - size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; - - if (arc_warm == B_FALSE) - size += l2arc_write_boost; - } + size = P2ROUNDUP(size, 1ULL << dev->l2ad_vdev->vdev_ashift); return (size); @@ -8473,12 +8141,13 @@ l2arc_dev_get_next(void) else if (next == first) break; + ASSERT3P(next, !=, NULL); } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || - next->l2ad_trim_all); + next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || - next->l2ad_trim_all) + next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting) next = NULL; l2arc_dev_last = next; @@ -8503,20 +8172,14 @@ out: static void l2arc_do_free_on_write(void) { - list_t *buflist; - l2arc_data_free_t *df, *df_prev; + l2arc_data_free_t *df; mutex_enter(&l2arc_free_on_write_mtx); - buflist = l2arc_free_on_write; - - for (df = list_tail(buflist); df; df = df_prev) { - df_prev = list_prev(buflist, df); + while ((df = list_remove_head(l2arc_free_on_write)) != NULL) { ASSERT3P(df->l2df_abd, !=, NULL); abd_free(df->l2df_abd); - list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } - mutex_exit(&l2arc_free_on_write_mtx); } @@ -8651,7 +8314,8 @@ top: ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); - zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); + (void) zfs_refcount_remove(&dev->l2ad_lb_count, + lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); @@ -8676,14 +8340,15 @@ top: * block pointer in the header. 
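The rewritten l2arc_write_size() above folds the log-block and TRIM-ahead overheads into the target up front, caps the result at a quarter of the cache device, and rounds it up to the device's allocation size. A small standalone sketch of that arithmetic (the helper and constants below are illustrative, not the kernel macros):

#include <stdint.h>
#include <stdio.h>

/* Round x up to a multiple of align (align must be a power of two). */
static uint64_t
roundup_pow2(uint64_t x, uint64_t align)
{
	return ((x + align - 1) & ~(align - 1));
}

static uint64_t
toy_l2arc_write_size(uint64_t write_max, uint64_t write_boost,
    uint64_t log_blk_overhead, uint64_t trim_ahead_pct,
    uint64_t dev_size, uint64_t ashift, int arc_warm)
{
	uint64_t size = write_max;

	if (!arc_warm)
		size += write_boost;		/* extra warmup writes */
	size += log_blk_overhead;		/* persistent L2ARC log blocks */
	if (trim_ahead_pct > 0) {
		uint64_t trim = (size * trim_ahead_pct) / 100;
		size += (trim > (64ULL << 20)) ? trim : (64ULL << 20);
	}
	/* Never plan to write more than 1/4 of the device per pass. */
	if (size > dev_size / 4)
		size = dev_size / 4;
	return (roundup_pow2(size, 1ULL << ashift));
}

int
main(void)
{
	/* 32 MiB max write, 8 MiB boost, 1 MiB overhead, 1 GiB device. */
	printf("%llu\n", (unsigned long long)toy_l2arc_write_size(
	    32ULL << 20, 8ULL << 20, 1ULL << 20, 0, 1ULL << 30, 12, 0));
	return (0);
}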
*/ if (i == 0) { - bzero(l2dhdr, dev->l2ad_dev_hdr_asize); + memset(l2dhdr, 0, + dev->l2ad_dev_hdr_asize); } else { - bzero(&l2dhdr->dh_start_lbps[i], + memset(&l2dhdr->dh_start_lbps[i], 0, sizeof (l2arc_log_blkptr_t)); } break; } - bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i], + memcpy(&l2dhdr->dh_start_lbps[i], lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); lb_ptr_buf = list_next(&dev->l2ad_lbptr_list, lb_ptr_buf); @@ -8732,7 +8397,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) */ if (BP_IS_ENCRYPTED(bp)) { abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); + ARC_HDR_USE_RESERVE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); @@ -8769,7 +8434,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); + ARC_HDR_USE_RESERVE); void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), @@ -8981,7 +8646,7 @@ l2arc_sublist_lock(int list_num) * sublists being selected. */ idx = multilist_get_random_index(ml); - return (multilist_sublist_lock(ml, idx)); + return (multilist_sublist_lock_idx(ml, idx)); } /* @@ -9026,22 +8691,9 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) buflist = &dev->l2ad_buflist; - /* - * We need to add in the worst case scenario of log block overhead. - */ - distance += l2arc_log_blk_overhead(distance, dev); - if (vd->vdev_has_trim && l2arc_trim_ahead > 0) { - /* - * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) - * times the write size, whichever is greater. - */ - distance += MAX(64 * 1024 * 1024, - (distance * l2arc_trim_ahead) / 100); - } - top: rerun = B_FALSE; - if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { + if (dev->l2ad_hand + distance > dev->l2ad_end) { /* * When there is no space to accommodate upcoming writes, * evict to the end. Then bump the write and evict hands @@ -9134,7 +8786,8 @@ retry: ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); - zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); + (void) zfs_refcount_remove(&dev->l2ad_lb_count, + lb_ptr_buf); list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); @@ -9190,7 +8843,7 @@ retry: * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_lsize. */ - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); } else { ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); @@ -9233,9 +8886,9 @@ out: * assertions may be violated without functional consequences * as the device is about to be removed. */ - ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); + ASSERT3U(dev->l2ad_hand + distance, <=, dev->l2ad_end); if (!dev->l2ad_first) - ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); + ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); } } @@ -9249,7 +8902,6 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, abd_t **abd_out) { int ret; - void *tmp = NULL; abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd; enum zio_compress compress = HDR_GET_COMPRESS(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); @@ -9270,12 +8922,11 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, * and copy the data. 
This may be done to eliminate a dependency on a * shared buffer or to reallocate the buffer to match asize. */ - if (HDR_HAS_RABD(hdr) && asize != psize) { - ASSERT3U(asize, >=, psize); + if (HDR_HAS_RABD(hdr)) { + ASSERT3U(asize, >, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize); - if (psize != asize) - abd_zero_off(to_write, psize, asize - psize); + abd_zero_off(to_write, psize, asize - psize); goto out; } @@ -9284,36 +8935,31 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, ASSERT3U(size, ==, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); - if (size != asize) + if (asize > size) abd_zero_off(to_write, size, asize - size); goto out; } if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { - cabd = abd_alloc_for_io(asize, ismd); - tmp = abd_borrow_buf(cabd, asize); - - psize = zio_compress_data(compress, to_write, tmp, size, - hdr->b_complevel); - - if (psize >= size) { - abd_return_buf(cabd, tmp, asize); - HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); - to_write = cabd; - abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); - if (size != asize) - abd_zero_off(to_write, size, asize - size); - goto encrypt; + size_t bufsize = MAX(size, asize); + void *buf = zio_buf_alloc(bufsize); + uint64_t csize = zio_compress_data(compress, to_write, &buf, + size, hdr->b_complevel); + if (csize > psize) { + /* + * We can't re-compress the block into the original + * psize. Even if it fits into asize, it does not + * matter, since checksum will never match on read. + */ + zio_buf_free(buf, bufsize); + return (SET_ERROR(EIO)); } - ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); - if (psize < asize) - bzero((char *)tmp + psize, asize - psize); - psize = HDR_GET_PSIZE(hdr); - abd_return_buf_copy(cabd, tmp, asize); - to_write = cabd; + if (asize > csize) + memset((char *)buf + csize, 0, asize - csize); + to_write = cabd = abd_get_from_buf(buf, bufsize); + abd_take_ownership_of_buf(cabd, B_TRUE); } -encrypt: if (HDR_ENCRYPTED(hdr)) { eabd = abd_alloc_for_io(asize, ismd); @@ -9342,7 +8988,7 @@ encrypt: abd_zero_off(eabd, psize, asize - psize); /* assert that the MAC we got here matches the one we saved */ - ASSERT0(bcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN)); + ASSERT0(memcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN)); spa_keystore_dsl_key_rele(spa, dck, FTAG); if (to_write == cabd) @@ -9394,9 +9040,9 @@ l2arc_blk_fetch_done(zio_t *zio) static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { - arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_psize, write_lsize, headroom; - boolean_t full; + arc_buf_hdr_t *hdr, *head, *marker; + uint64_t write_asize, write_psize, headroom; + boolean_t full, from_head = !arc_warm; l2arc_write_callback_t *cb = NULL; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); @@ -9405,10 +9051,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; - write_lsize = write_asize = write_psize = 0; + write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); + marker = arc_state_alloc_marker(); /* * Copy buffers for L2ARC writing. 
@@ -9423,40 +9070,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) continue; } - multilist_sublist_t *mls = l2arc_sublist_lock(pass); uint64_t passed_sz = 0; - - VERIFY3P(mls, !=, NULL); + headroom = target_sz * l2arc_headroom; + if (zfs_compressed_arc_enabled) + headroom = (headroom * l2arc_headroom_boost) / 100; /* - * L2ARC fast warmup. - * * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ - if (arc_warm == B_FALSE) + multilist_sublist_t *mls = l2arc_sublist_lock(pass); + ASSERT3P(mls, !=, NULL); + if (from_head) hdr = multilist_sublist_head(mls); else hdr = multilist_sublist_tail(mls); - headroom = target_sz * l2arc_headroom; - if (zfs_compressed_arc_enabled) - headroom = (headroom * l2arc_headroom_boost) / 100; - - for (; hdr; hdr = hdr_prev) { + while (hdr != NULL) { kmutex_t *hash_lock; abd_t *to_write = NULL; - if (arc_warm == B_FALSE) - hdr_prev = multilist_sublist_next(mls, hdr); - else - hdr_prev = multilist_sublist_prev(mls, hdr); - hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { - /* - * Skip this buffer rather than waiting. - */ +skip: + /* Skip this buffer rather than waiting. */ + if (from_head) + hdr = multilist_sublist_next(mls, hdr); + else + hdr = multilist_sublist_prev(mls, hdr); continue; } @@ -9471,17 +9112,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); - continue; + goto skip; } - /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. - */ ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3U(arc_hdr_size(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || @@ -9490,25 +9124,31 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); - if ((write_asize + asize) > target_sz) { + /* + * If the allocated size of this buffer plus the max + * size for the pending log block exceeds the evicted + * target size, terminate writing buffers for this run. + */ + if (write_asize + asize + + sizeof (l2arc_log_blk_phys_t) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; } /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. + * We should not sleep with sublist lock held or it + * may block ARC eviction. Insert a marker to save + * the position and drop the lock. 
*/ - arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); - ASSERT(HDR_HAS_L1HDR(hdr)); - - ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); - ASSERT(hdr->b_l1hdr.b_pabd != NULL || - HDR_HAS_RABD(hdr)); - ASSERT3U(arc_hdr_size(hdr), >, 0); + if (from_head) { + multilist_sublist_insert_after(mls, hdr, + marker); + } else { + multilist_sublist_insert_before(mls, hdr, + marker); + } + multilist_sublist_unlock(mls); /* * If this header has b_rabd, we can use this since it @@ -9539,32 +9179,45 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) &to_write); if (ret != 0) { arc_hdr_clear_flags(hdr, - ARC_FLAG_L2_WRITING); + ARC_FLAG_L2CACHE); mutex_exit(hash_lock); - continue; + goto next; } l2arc_free_abd_on_write(to_write, asize, type); } + hdr->b_l2hdr.b_dev = dev; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + hdr->b_l2hdr.b_hits = 0; + hdr->b_l2hdr.b_arcs_state = + hdr->b_l1hdr.b_state->arcs_state; + mutex_enter(&dev->l2ad_mtx); if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ - mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); - mutex_exit(&dev->l2ad_mtx); + } + list_insert_head(&dev->l2ad_buflist, hdr); + mutex_exit(&dev->l2ad_mtx); + arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR | + ARC_FLAG_L2_WRITING); + + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(hdr), hdr); + l2arc_hdr_arcstats_increment(hdr); + boolean_t commit = l2arc_log_blk_insert(dev, hdr); + mutex_exit(hash_lock); + + if (pio == NULL) { cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; - /* - * Create a list to save allocated abd buffers - * for l2arc_log_blk_commit(). - */ list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); @@ -9572,48 +9225,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ZIO_FLAG_CANFAIL); } - hdr->b_l2hdr.b_dev = dev; - hdr->b_l2hdr.b_hits = 0; - - hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - hdr->b_l2hdr.b_arcs_state = - hdr->b_l1hdr.b_state->arcs_state; - arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); - - mutex_enter(&dev->l2ad_mtx); - list_insert_head(&dev->l2ad_buflist, hdr); - mutex_exit(&dev->l2ad_mtx); - - (void) zfs_refcount_add_many(&dev->l2ad_alloc, - arc_hdr_size(hdr), hdr); - wzio = zio_write_phys(pio, dev->l2ad_vdev, - hdr->b_l2hdr.b_daddr, asize, to_write, + dev->l2ad_hand, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); - write_lsize += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); + zio_nowait(wzio); write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; - l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); - mutex_exit(hash_lock); - - /* - * Append buf info to current log and commit if full. - * arcstat_l2_{size,asize} kstats are updated - * internally. - */ - if (l2arc_log_blk_insert(dev, hdr)) - l2arc_log_blk_commit(dev, pio, cb); + if (commit) { + /* l2ad_hand will be adjusted inside. 
*/ + write_asize += + l2arc_log_blk_commit(dev, pio, cb); + } - zio_nowait(wzio); +next: + multilist_sublist_lock(mls); + if (from_head) + hdr = multilist_sublist_next(mls, marker); + else + hdr = multilist_sublist_prev(mls, marker); + multilist_sublist_remove(mls, marker); } multilist_sublist_unlock(mls); @@ -9622,9 +9261,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } + arc_state_free_marker(marker); + /* No buffers selected for writing? */ if (pio == NULL) { - ASSERT0(write_lsize); + ASSERT0(write_psize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); @@ -9664,7 +9305,7 @@ l2arc_hdr_limit_reached(void) { int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size); - return (arc_reclaim_needed() || (s > arc_meta_limit * 3 / 4) || + return (arc_reclaim_needed() || (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100)); } @@ -9672,10 +9313,10 @@ l2arc_hdr_limit_reached(void) * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ -/* ARGSUSED */ -static void +static __attribute__((noreturn)) void l2arc_feed_thread(void *unused) { + (void) unused; callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; @@ -9863,7 +9504,7 @@ l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen) if (l2arc_trim_ahead > 0) { dev->l2ad_trim_all = B_TRUE; } else { - bzero(l2dhdr, l2dhdr_asize); + memset(l2dhdr, 0, l2dhdr_asize); l2arc_dev_hdr_update(dev); } } @@ -10111,7 +9752,7 @@ l2arc_spa_rebuild_start(spa_t *spa) /* * Main entry point for L2ARC rebuilding. */ -static void +static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg) { l2arc_dev_t *dev = arg; @@ -10184,7 +9825,7 @@ l2arc_rebuild(l2arc_dev_t *dev) goto out; /* Prepare the rebuild process */ - bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps)); + memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps)); /* Start the rebuild process */ for (;;) { @@ -10230,7 +9871,7 @@ l2arc_rebuild(l2arc_dev_t *dev) lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); - bcopy(&lbps[0], lb_ptr_buf->lb_ptr, + memcpy(lb_ptr_buf->lb_ptr, &lbps[0], sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); @@ -10268,7 +9909,7 @@ l2arc_rebuild(l2arc_dev_t *dev) !dev->l2ad_first) goto out; - cond_resched(); + kpreempt(KPREEMPT_SYNC); for (;;) { mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { @@ -10328,7 +9969,7 @@ out: */ spa_history_log_internal(spa, "L2ARC rebuild", NULL, "no valid log blocks"); - bzero(l2dhdr, dev->l2ad_dev_hdr_asize); + memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize); l2arc_dev_hdr_update(dev); } else if (err == ECANCELED) { /* @@ -10370,8 +10011,7 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev) err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); abd_free(abd); @@ -10554,7 +10194,7 @@ l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, * since we may allocate significant amount of memory here, let ARC * grow its arc_c. 
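The reworked write loop above no longer holds the multilist sublist lock for the whole pass: it parks a marker node at the current position, drops the lock while the buffer is transformed and handed to the write zio, then relocks and resumes from wherever the marker ended up (the markers themselves are pre-allocated via arc_state_alloc_marker()). The pattern, reduced to a plain doubly linked list with made-up names rather than the kernel's list primitives:

#include <stdio.h>

struct node {
	struct node *prev, *next;
	int payload;		/* 0 for the marker, nonzero for real entries */
};

static void
insert_after(struct node *pos, struct node *n)
{
	n->prev = pos;
	n->next = pos->next;
	pos->next->prev = n;
	pos->next = n;
}

static void
remove_node(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

int
main(void)
{
	struct node head = { &head, &head, 0 };	/* circular sentinel */
	struct node elem[5];
	struct node marker = { NULL, NULL, 0 };

	for (int i = 4; i >= 0; i--) {
		elem[i].payload = i + 1;
		insert_after(&head, &elem[i]);
	}

	/* "lock" */
	struct node *cur = head.next;
	while (cur != &head) {
		/* Park the marker so our place survives dropping the lock. */
		insert_after(cur, &marker);
		/* "unlock"; long-running work (e.g. issuing a write zio)... */
		printf("processing %d\n", cur->payload);
		/* "relock"; resume from the marker, which other walkers skip. */
		cur = marker.next;
		remove_node(&marker);
	}
	return (0);
}

In the kernel the point of the marker is that other threads may remove the node we were looking at while the lock is dropped; nothing concurrent happens in this sketch, it only shows the bookkeeping.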
*/ - arc_adapt(log_entries * HDR_L2ONLY_SIZE, arc_l2c_only); + arc_adapt(log_entries * HDR_L2ONLY_SIZE); for (int i = log_entries - 1; i >= 0; i--) { /* @@ -10691,11 +10331,10 @@ l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_abd = abd_get_from_buf(lb, asize); pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY); + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); return (pio); @@ -10761,7 +10400,7 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev) * This function allocates some memory to temporarily hold the serialized * buffer to be written. This is then released in l2arc_write_done. */ -static void +static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; @@ -10769,12 +10408,11 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) uint64_t psize, asize; zio_t *wzio; l2arc_lb_abd_buf_t *abd_buf; - uint8_t *tmpbuf; + uint8_t *tmpbuf = NULL; l2arc_lb_ptr_buf_t *lb_ptr_buf; VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); - tmpbuf = zio_buf_alloc(sizeof (*lb)); abd_buf = zio_buf_alloc(sizeof (*abd_buf)); abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb)); lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); @@ -10793,7 +10431,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) /* try to compress the buffer */ psize = zio_compress_data(ZIO_COMPRESS_LZ4, - abd_buf->abd, tmpbuf, sizeof (*lb), 0); + abd_buf->abd, (void **) &tmpbuf, sizeof (*lb), 0); /* a log block is never entirely zero */ ASSERT(psize != 0); @@ -10819,13 +10457,13 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) ZIO_CHECKSUM_FLETCHER_4); if (asize < sizeof (*lb)) { /* compression succeeded */ - bzero(tmpbuf + psize, asize - psize); + memset(tmpbuf + psize, 0, asize - psize); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_LZ4); } else { /* compression failed */ - bcopy(lb, tmpbuf, sizeof (*lb)); + memcpy(tmpbuf, lb, sizeof (*lb)); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_OFF); @@ -10851,7 +10489,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) * Include the committed log block's pointer in the list of pointers * to log blocks present in the L2ARC device. 
*/ - bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr, + memcpy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[0], sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); @@ -10873,6 +10511,8 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; + + return (asize); } /* @@ -10940,7 +10580,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) ASSERT(HDR_HAS_L2HDR(hdr)); le = &lb->lb_entries[index]; - bzero(le, sizeof (*le)); + memset(le, 0, sizeof (*le)); le->le_dva = hdr->b_dva; le->le_birth = hdr->b_birth; le->le_daddr = hdr->b_l2hdr.b_daddr; @@ -10953,7 +10593,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); - L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state); + L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); @@ -11009,79 +10649,56 @@ EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min, - param_get_long, ZMOD_RW, "Min arc size"); + spl_param_get_u64, ZMOD_RW, "Minimum ARC size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max, - param_get_long, ZMOD_RW, "Max arc size"); - -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_long, - param_get_long, ZMOD_RW, "Metadata limit for arc size"); - -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent, - param_set_arc_long, param_get_long, ZMOD_RW, - "Percent of arc size for arc meta limit"); - -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_long, - param_get_long, ZMOD_RW, "Min arc metadata"); + spl_param_get_u64, ZMOD_RW, "Maximum ARC size in bytes"); -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW, - "Meta objects to scan for prune"); - -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_adjust_restarts, INT, ZMOD_RW, - "Limit number of restarts in arc_evict_meta"); - -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, INT, ZMOD_RW, - "Meta reclaim strategy"); +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_balance, UINT, ZMOD_RW, + "Balance between metadata and data on ghost hits."); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, - param_get_int, ZMOD_RW, "Seconds before growing arc size"); - -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW, - "Disable arc_p adapt dampener"); + param_get_uint, ZMOD_RW, "Seconds before growing ARC size"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, - param_get_int, ZMOD_RW, "log2(fraction of arc to reclaim)"); + param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, - "Percent of pagecache to reclaim arc to"); + "Percent of pagecache to reclaim ARC to"); -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int, - param_get_int, ZMOD_RW, "arc_c shift to calc min/max arc_p"); - -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, INT, ZMOD_RD, +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, UINT, ZMOD_RD, "Target average block size"); 
ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW, - "Disable compressed arc buffers"); + "Disable compressed ARC buffers"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int, - param_get_int, ZMOD_RW, "Min life of prefetch block in ms"); + param_get_uint, ZMOD_RW, "Min life of prefetch block in ms"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms, - param_set_arc_int, param_get_int, ZMOD_RW, + param_set_arc_int, param_get_uint, ZMOD_RW, "Min life of prescient prefetched block in ms"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, U64, ZMOD_RW, "Max write bytes per interval"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, U64, ZMOD_RW, "Extra write bytes during device warmup"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, U64, ZMOD_RW, "Number of max device writes to precache"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, U64, ZMOD_RW, "Compressed l2arc_headroom multiplier"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, U64, ZMOD_RW, "TRIM ahead L2ARC write size multiplier"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, U64, ZMOD_RW, "Seconds between L2ARC writing"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, U64, ZMOD_RW, "Min feed interval in milliseconds"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW, @@ -11093,41 +10710,42 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW, "No reads during writes"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, UINT, ZMOD_RW, "Percent of ARC size allowed for L2ARC-only headers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW, "Rebuild the L2ARC when importing a pool"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, U64, ZMOD_RW, "Min size in bytes to write rebuild log blocks in L2ARC"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW, "Cache only MFU data from ARC into L2ARC"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW, - "If set to 1 exclude dbufs on special vdevs from being cached to " - "L2ARC."); + "Exclude dbufs on special vdevs from being cached to L2ARC if set."); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, - param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes"); + param_get_uint, ZMOD_RW, "System free memory I/O throttle in bytes"); -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_long, - param_get_long, ZMOD_RW, "System free memory target size in bytes"); +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_u64, + spl_param_get_u64, ZMOD_RW, "System free memory target size in bytes"); -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_long, - param_get_long, ZMOD_RW, "Minimum bytes of dnodes in arc"); +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_u64, + spl_param_get_u64, ZMOD_RW, 
"Minimum bytes of dnodes in ARC"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent, - param_set_arc_long, param_get_long, ZMOD_RW, + param_set_arc_int, param_get_uint, ZMOD_RW, "Percent of ARC meta buffers for dnodes"); -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, UINT, ZMOD_RW, "Percentage of excess dnodes to try to unpin"); -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW, "When full, ARC allocation waits for eviction of this % of alloc size"); -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW, "The number of headers to evict per sublist before moving to the next"); -/* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW, + "Number of arc_prune threads"); diff --git a/sys/contrib/openzfs/module/zfs/blake3_zfs.c b/sys/contrib/openzfs/module/zfs/blake3_zfs.c new file mode 100644 index 000000000000..7783282b671a --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/blake3_zfs.c @@ -0,0 +1,120 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2022 Tino Reichardt <milky-zfs@mcmilk.de> + */ + +#include <sys/zfs_context.h> +#include <sys/zio_checksum.h> +#include <sys/blake3.h> +#include <sys/abd.h> + +static int +blake3_incremental(void *buf, size_t size, void *arg) +{ + BLAKE3_CTX *ctx = arg; + + Blake3_Update(ctx, buf, size); + + return (0); +} + +/* + * Computes a native 256-bit BLAKE3 MAC checksum. Please note that this + * function requires the presence of a ctx_template that should be allocated + * using abd_checksum_blake3_tmpl_init. + */ +void +abd_checksum_blake3_native(abd_t *abd, uint64_t size, const void *ctx_template, + zio_cksum_t *zcp) +{ + ASSERT(ctx_template != NULL); + +#if defined(_KERNEL) + kpreempt_disable(); + BLAKE3_CTX *ctx = blake3_per_cpu_ctx[CPU_SEQID]; +#else + BLAKE3_CTX *ctx = kmem_alloc(sizeof (*ctx), KM_SLEEP); +#endif + + memcpy(ctx, ctx_template, sizeof (*ctx)); + (void) abd_iterate_func(abd, 0, size, blake3_incremental, ctx); + Blake3_Final(ctx, (uint8_t *)zcp); + +#if defined(_KERNEL) + kpreempt_enable(); +#else + memset(ctx, 0, sizeof (*ctx)); + kmem_free(ctx, sizeof (*ctx)); +#endif +} + +/* + * Byteswapped version of abd_checksum_blake3_native. This just invokes + * the native checksum function and byteswaps the resulting checksum (since + * BLAKE3 is internally endian-insensitive). 
+ */ +void +abd_checksum_blake3_byteswap(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + zio_cksum_t tmp; + + ASSERT(ctx_template != NULL); + + abd_checksum_blake3_native(abd, size, ctx_template, &tmp); + zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); + zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); + zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); + zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]); +} + +/* + * Allocates a BLAKE3 MAC template suitable for using in BLAKE3 MAC checksum + * computations and returns a pointer to it. + */ +void * +abd_checksum_blake3_tmpl_init(const zio_cksum_salt_t *salt) +{ + BLAKE3_CTX *ctx; + + ASSERT(sizeof (salt->zcs_bytes) == 32); + + /* init reference object */ + ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); + Blake3_InitKeyed(ctx, salt->zcs_bytes); + + return (ctx); +} + +/* + * Frees a BLAKE3 context template previously allocated using + * zio_checksum_blake3_tmpl_init. + */ +void +abd_checksum_blake3_tmpl_free(void *ctx_template) +{ + BLAKE3_CTX *ctx = ctx_template; + + memset(ctx, 0, sizeof (*ctx)); + kmem_free(ctx, sizeof (*ctx)); +} diff --git a/sys/contrib/openzfs/module/zfs/blkptr.c b/sys/contrib/openzfs/module/zfs/blkptr.c index aa09ded8dba3..d85f0737f6f6 100644 --- a/sys/contrib/openzfs/module/zfs/blkptr.c +++ b/sys/contrib/openzfs/module/zfs/blkptr.c @@ -58,7 +58,7 @@ encode_embedded_bp_compressed(blkptr_t *bp, void *data, ASSERT3U(comp, >=, ZIO_COMPRESS_OFF); ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); - bzero(bp, sizeof (*bp)); + memset(bp, 0, sizeof (*bp)); BP_SET_EMBEDDED(bp, B_TRUE); BP_SET_COMPRESS(bp, comp); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); diff --git a/sys/contrib/openzfs/module/zfs/bplist.c b/sys/contrib/openzfs/module/zfs/bplist.c index 47ea364ef26f..da7360f8ce10 100644 --- a/sys/contrib/openzfs/module/zfs/bplist.c +++ b/sys/contrib/openzfs/module/zfs/bplist.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -65,9 +65,8 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) bplist_entry_t *bpe; mutex_enter(&bpl->bpl_lock); - while ((bpe = list_head(&bpl->bpl_list))) { + while ((bpe = list_remove_head(&bpl->bpl_list))) { bplist_iterate_last_removed = bpe; - list_remove(&bpl->bpl_list, bpe); mutex_exit(&bpl->bpl_lock); func(arg, &bpe->bpe_blk, tx); kmem_free(bpe, sizeof (*bpe)); @@ -82,10 +81,7 @@ bplist_clear(bplist_t *bpl) bplist_entry_t *bpe; mutex_enter(&bpl->bpl_lock); - while ((bpe = list_head(&bpl->bpl_list))) { - bplist_iterate_last_removed = bpe; - list_remove(&bpl->bpl_list, bpe); + while ((bpe = list_remove_head(&bpl->bpl_list))) kmem_free(bpe, sizeof (*bpe)); - } mutex_exit(&bpl->bpl_lock); } diff --git a/sys/contrib/openzfs/module/zfs/bpobj.c b/sys/contrib/openzfs/module/zfs/bpobj.c index e75ba5cccde6..96e1601c4e9c 100644 --- a/sys/contrib/openzfs/module/zfs/bpobj.c +++ b/sys/contrib/openzfs/module/zfs/bpobj.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -156,7 +156,7 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) if (err) return (err); - bzero(bpo, sizeof (*bpo)); + memset(bpo, 0, sizeof (*bpo)); mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); ASSERT(bpo->bpo_dbuf == NULL); @@ -284,7 +284,17 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, dmu_buf_t *dbuf = NULL; bpobj_t *bpo = bpi->bpi_bpo; - for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) { + int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; + uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) * + sizeof (blkptr_t); + uint64_t ps = start * sizeof (blkptr_t); + uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0, + ps); + if (pe > pb) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb, + ZIO_PRIORITY_ASYNC_READ); + } + for (; i >= start; i--) { uint64_t offset = i * sizeof (blkptr_t); uint64_t blkoff = P2PHASE(i, bpo->bpo_epb); @@ -292,9 +302,16 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, if (dbuf) dmu_buf_rele(dbuf, FTAG); err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, - offset, FTAG, &dbuf, 0); + offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH); if (err) break; + pe = pb; + pb = MAX((dbuf->db_offset > dmu_prefetch_max) ? + dbuf->db_offset - dmu_prefetch_max : 0, ps); + if (pe > pb) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, + pb, pe - pb, ZIO_PRIORITY_ASYNC_READ); + } } ASSERT3U(offset, >=, dbuf->db_offset); @@ -466,22 +483,30 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, int64_t i = bpi->bpi_unprocessed_subobjs - 1; uint64_t offset = i * sizeof (uint64_t); - uint64_t obj_from_sublist; + uint64_t subobj; err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, - offset, sizeof (uint64_t), &obj_from_sublist, - DMU_READ_PREFETCH); + offset, sizeof (uint64_t), &subobj, + DMU_READ_NO_PREFETCH); if (err) break; - bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t), - KM_SLEEP); - err = bpobj_open(sublist, bpo->bpo_os, - obj_from_sublist); - if (err) + bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t), + KM_SLEEP); + err = bpobj_open(subbpo, bpo->bpo_os, subobj); + if (err) { + kmem_free(subbpo, sizeof (bpobj_t)); break; + } - list_insert_head(&stack, bpi_alloc(sublist, bpi, i)); - mutex_enter(&sublist->bpo_lock); + if (subbpo->bpo_havesubobj && + subbpo->bpo_phys->bpo_subobjs != 0) { + dmu_prefetch(subbpo->bpo_os, + subbpo->bpo_phys->bpo_subobjs, 0, 0, 0, + ZIO_PRIORITY_ASYNC_READ); + } + + list_insert_head(&stack, bpi_alloc(subbpo, bpi, i)); + mutex_enter(&subbpo->bpo_lock); bpi->bpi_unprocessed_subobjs--; } } @@ -663,14 +688,13 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) } VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); - VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); - if (bpobj_is_empty(&subbpo)) { /* No point in having an empty subobj. */ bpobj_close(&subbpo); bpobj_free(bpo->bpo_os, subobj, tx); return; } + VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); mutex_enter(&bpo->bpo_lock); dmu_buf_will_dirty(bpo->bpo_dbuf, tx); @@ -780,6 +804,68 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) } +/* + * Prefetch metadata required for bpobj_enqueue_subobj(). 
+ */ +void +bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj) +{ + dmu_object_info_t doi; + bpobj_t subbpo; + uint64_t subsubobjs; + boolean_t copy_subsub = B_TRUE; + boolean_t copy_bps = B_TRUE; + + ASSERT(bpobj_is_open(bpo)); + ASSERT(subobj != 0); + + if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) + return; + + if (bpobj_open(&subbpo, bpo->bpo_os, subobj) != 0) + return; + if (bpobj_is_empty(&subbpo)) { + bpobj_close(&subbpo); + return; + } + subsubobjs = subbpo.bpo_phys->bpo_subobjs; + bpobj_close(&subbpo); + + if (subsubobjs != 0) { + if (dmu_object_info(bpo->bpo_os, subsubobjs, &doi) != 0) + return; + if (doi.doi_max_offset > doi.doi_data_block_size) + copy_subsub = B_FALSE; + } + + if (dmu_object_info(bpo->bpo_os, subobj, &doi) != 0) + return; + if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) + copy_bps = B_FALSE; + + if (copy_subsub && subsubobjs != 0) { + if (bpo->bpo_phys->bpo_subobjs) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1, + ZIO_PRIORITY_ASYNC_READ); + } + dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1, + ZIO_PRIORITY_ASYNC_READ); + } + + if (copy_bps) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, + bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1, + ZIO_PRIORITY_ASYNC_READ); + dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1, + ZIO_PRIORITY_ASYNC_READ); + } else if (bpo->bpo_phys->bpo_subobjs) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1, + ZIO_PRIORITY_ASYNC_READ); + } +} + void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) @@ -805,12 +891,12 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, * set of BP's stored, and bpobj_iterate() wouldn't visit * all the space accounted for in the bpobj. 
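bpobj_iterate_blkptrs() above walks the block-pointer array backwards, so the new code keeps a bounded window of dmu_prefetch() read-ahead behind the cursor, limited by dmu_prefetch_max, and re-arms it as the scan reaches the edge of what was already prefetched. A rough, self-contained picture of that sliding window (indices stand in for byte offsets; the constants are arbitrary):

#include <stdio.h>

#define NBLOCKS		32
#define PREFETCH_MAX	8	/* max blocks of read-ahead in flight */

static void
prefetch_range(int lo, int hi)		/* [lo, hi) */
{
	for (int b = lo; b < hi; b++)
		printf("  prefetch block %d\n", b);
}

int
main(void)
{
	int prefetched_down_to = NBLOCKS;	/* nothing prefetched yet */

	/* Walk the object backwards with a bounded read-ahead window. */
	for (int i = NBLOCKS - 1; i >= 0; i--) {
		if (i < prefetched_down_to) {
			int lo = i - PREFETCH_MAX + 1;
			if (lo < 0)
				lo = 0;
			prefetch_range(lo, prefetched_down_to);
			prefetched_down_to = lo;
		}
		printf("read block %d\n", i);
	}
	return (0);
}

The real code re-arms the window when it holds a new dbuf, so the prefetch stays strictly ahead of the reads; the toy only shows why the window is bounded rather than prefetching the whole object at once.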
*/ - bzero(&stored_bp, sizeof (stored_bp)); + memset(&stored_bp, 0, sizeof (stored_bp)); stored_bp.blk_prop = bp->blk_prop; - stored_bp.blk_birth = bp->blk_birth; + BP_SET_LOGICAL_BIRTH(&stored_bp, BP_GET_LOGICAL_BIRTH(bp)); } else if (!BP_GET_DEDUP(bp)) { /* The bpobj will compress better without the checksum */ - bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); + memset(&stored_bp.blk_cksum, 0, sizeof (stored_bp.blk_cksum)); } stored_bp.blk_fill = 0; @@ -829,6 +915,7 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, bpo, &bpo->bpo_cached_dbuf, 0)); + ASSERT3P(bpo->bpo_cached_dbuf, !=, NULL); } dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); @@ -860,13 +947,14 @@ struct space_range_arg { uint64_t uncomp; }; -/* ARGSUSED */ static int space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { + (void) bp_freed, (void) tx; struct space_range_arg *sra = arg; - if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { + if (BP_GET_LOGICAL_BIRTH(bp) > sra->mintxg && + BP_GET_LOGICAL_BIRTH(bp) <= sra->maxtxg) { if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) sra->used += bp_get_dsize_sync(sra->spa, bp); else @@ -898,7 +986,7 @@ bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) /* * Return the amount of space in the bpobj which is: - * mintxg < blk_birth <= maxtxg + * mintxg < logical birth <= maxtxg */ int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, @@ -932,11 +1020,11 @@ bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, * bpobj are designated as free or allocated that information is not preserved * in bplists. */ -/* ARGSUSED */ int bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { + (void) bp_freed, (void) tx; bplist_t *bpl = arg; bplist_append(bpl, bp); return (0); diff --git a/sys/contrib/openzfs/module/zfs/bptree.c b/sys/contrib/openzfs/module/zfs/bptree.c index 1827a3c4e326..1f5d8e77bcc0 100644 --- a/sys/contrib/openzfs/module/zfs/bptree.c +++ b/sys/contrib/openzfs/module/zfs/bptree.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -147,11 +147,11 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, dmu_buf_rele(db, FTAG); } -/* ARGSUSED */ static int bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { + (void) zilog, (void) dnp; int err; struct bptree_args *ba = arg; diff --git a/sys/contrib/openzfs/module/zfs/bqueue.c b/sys/contrib/openzfs/module/zfs/bqueue.c index 22539efc4e23..a7fa516975de 100644 --- a/sys/contrib/openzfs/module/zfs/bqueue.c +++ b/sys/contrib/openzfs/module/zfs/bqueue.c @@ -27,34 +27,46 @@ obj2node(bqueue_t *q, void *data) /* * Initialize a blocking queue The maximum capacity of the queue is set to - * size. Types that are stored in a bqueue must contain a bqueue_node_t, - * and node_offset must be its offset from the start of the struct. - * fill_fraction is a performance tuning value; when the queue is full, any - * threads attempting to enqueue records will block. 
They will block until - * they're signaled, which will occur when the queue is at least 1/fill_fraction + * size. Types that are stored in a bqueue must contain a bqueue_node_t, and + * node_offset must be its offset from the start of the struct. fill_fraction + * is a performance tuning value; when the queue is full, any threads + * attempting to enqueue records will block. They will block until they're + * signaled, which will occur when the queue is at least 1/fill_fraction * empty. Similar behavior occurs on dequeue; if the queue is empty, threads - * block. They will be signalled when the queue has 1/fill_fraction full, or - * when bqueue_flush is called. As a result, you must call bqueue_flush when - * you enqueue your final record on a thread, in case the dequeueing threads are - * currently blocked and that enqueue does not cause them to be awoken. - * Alternatively, this behavior can be disabled (causing signaling to happen - * immediately) by setting fill_fraction to any value larger than size. - * Return 0 on success, or -1 on failure. + * block. They will be signalled when the queue has 1/fill_fraction full. + * As a result, you must call bqueue_enqueue_flush() when you enqueue your + * final record on a thread, in case the dequeuing threads are currently + * blocked and that enqueue does not cause them to be woken. Alternatively, + * this behavior can be disabled (causing signaling to happen immediately) by + * setting fill_fraction to any value larger than size. Return 0 on success, + * or -1 on failure. + * + * Note: The caller must ensure that for a given bqueue_t, there's only a + * single call to bqueue_enqueue() running at a time (e.g. by calling only + * from a single thread, or with locking around the call). Similarly, the + * caller must ensure that there's only a single call to bqueue_dequeue() + * running at a time. However, the one call to bqueue_enqueue() may be + * invoked concurrently with the one call to bqueue_dequeue(). 
*/ int -bqueue_init(bqueue_t *q, uint64_t fill_fraction, uint64_t size, - size_t node_offset) +bqueue_init(bqueue_t *q, uint_t fill_fraction, size_t size, size_t node_offset) { if (fill_fraction == 0) { return (-1); } list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t), node_offset + offsetof(bqueue_node_t, bqn_node)); + list_create(&q->bq_dequeuing_list, node_offset + sizeof (bqueue_node_t), + node_offset + offsetof(bqueue_node_t, bqn_node)); + list_create(&q->bq_enqueuing_list, node_offset + sizeof (bqueue_node_t), + node_offset + offsetof(bqueue_node_t, bqn_node)); cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL); cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL); mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL); q->bq_node_offset = node_offset; q->bq_size = 0; + q->bq_dequeuing_size = 0; + q->bq_enqueuing_size = 0; q->bq_maxsize = size; q->bq_fill_fraction = fill_fraction; return (0); @@ -70,31 +82,40 @@ bqueue_destroy(bqueue_t *q) { mutex_enter(&q->bq_lock); ASSERT0(q->bq_size); + ASSERT0(q->bq_dequeuing_size); + ASSERT0(q->bq_enqueuing_size); cv_destroy(&q->bq_add_cv); cv_destroy(&q->bq_pop_cv); list_destroy(&q->bq_list); + list_destroy(&q->bq_dequeuing_list); + list_destroy(&q->bq_enqueuing_list); mutex_exit(&q->bq_lock); mutex_destroy(&q->bq_lock); } static void -bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size, - boolean_t flush) +bqueue_enqueue_impl(bqueue_t *q, void *data, size_t item_size, boolean_t flush) { ASSERT3U(item_size, >, 0); ASSERT3U(item_size, <=, q->bq_maxsize); - mutex_enter(&q->bq_lock); + obj2node(q, data)->bqn_size = item_size; - while (q->bq_size + item_size > q->bq_maxsize) { - cv_wait_sig(&q->bq_add_cv, &q->bq_lock); - } - q->bq_size += item_size; - list_insert_tail(&q->bq_list, data); - if (q->bq_size >= q->bq_maxsize / q->bq_fill_fraction) - cv_signal(&q->bq_pop_cv); - if (flush) + q->bq_enqueuing_size += item_size; + list_insert_tail(&q->bq_enqueuing_list, data); + + if (flush || + q->bq_enqueuing_size >= q->bq_maxsize / q->bq_fill_fraction) { + /* Append the enquing list to the shared list. */ + mutex_enter(&q->bq_lock); + while (q->bq_size > q->bq_maxsize) { + cv_wait_sig(&q->bq_add_cv, &q->bq_lock); + } + q->bq_size += q->bq_enqueuing_size; + list_move_tail(&q->bq_list, &q->bq_enqueuing_list); + q->bq_enqueuing_size = 0; cv_broadcast(&q->bq_pop_cv); - mutex_exit(&q->bq_lock); + mutex_exit(&q->bq_lock); + } } /* @@ -103,7 +124,7 @@ bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size, * > 0. */ void -bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) +bqueue_enqueue(bqueue_t *q, void *data, size_t item_size) { bqueue_enqueue_impl(q, data, item_size, B_FALSE); } @@ -112,12 +133,12 @@ bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) * Enqueue an entry, and then flush the queue. This forces the popping threads * to wake up, even if we're below the fill fraction. We have this in a single * function, rather than having a separate call, because it prevents race - * conditions between the enqueuing thread and the dequeueing thread, where the - * enqueueing thread will wake up the dequeueing thread, that thread will + * conditions between the enqueuing thread and the dequeuing thread, where the + * enqueueing thread will wake up the dequeuing thread, that thread will * destroy the condvar before the enqueuing thread is done. 
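The bqueue rework visible above stops taking bq_lock on every enqueue: producers stage entries on a private bq_enqueuing_list and graft the whole batch onto the shared list (one lock acquisition, one broadcast) only when the staged size reaches 1/fill_fraction of the queue or on an explicit flush; bqueue_dequeue() mirrors this with bq_dequeuing_list. A condensed userspace sketch of the producer side, with invented toy_* names standing in for the kernel list, mutex and condvar primitives:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define QUEUE_MAX	1024	/* bytes the shared queue may hold */
#define FILL_FRACTION	4	/* publish when staged >= QUEUE_MAX/4 */

struct toy_bqueue {
	pthread_mutex_t	lock;
	size_t		shared_size;	/* bytes on the shared list */
	size_t		staged_size;	/* bytes on the producer's list */
	/* real code: list_t bq_list, bq_enqueuing_list, bq_dequeuing_list */
};

static void
toy_enqueue(struct toy_bqueue *q, size_t item_size, int flush)
{
	/* Stage locally: no lock, no wakeup, just append to a private list. */
	q->staged_size += item_size;

	if (!flush && q->staged_size < QUEUE_MAX / FILL_FRACTION)
		return;

	/* Publish the whole batch with a single lock acquisition. */
	pthread_mutex_lock(&q->lock);
	/* real code also waits here while the shared list is over maxsize */
	q->shared_size += q->staged_size;	/* list_move_tail() in the diff */
	q->staged_size = 0;
	/* real code: cv_broadcast() to wake the consumer */
	pthread_mutex_unlock(&q->lock);
}

int
main(void)
{
	struct toy_bqueue q;

	memset(&q, 0, sizeof (q));
	pthread_mutex_init(&q.lock, NULL);
	for (int i = 0; i < 10; i++)
		toy_enqueue(&q, 100, 0);	/* publishes every few items */
	toy_enqueue(&q, 100, 1);		/* final record: force a flush */
	printf("shared=%zu staged=%zu\n", q.shared_size, q.staged_size);
	pthread_mutex_destroy(&q.lock);
	return (0);
}

The consumer side is symmetric: bqueue_dequeue() moves the entire shared list onto bq_dequeuing_list under one lock hold and then pops entries from it without touching the lock again until it runs dry.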
*/ void -bqueue_enqueue_flush(bqueue_t *q, void *data, uint64_t item_size) +bqueue_enqueue_flush(bqueue_t *q, void *data, size_t item_size) { bqueue_enqueue_impl(q, data, item_size, B_TRUE); } @@ -129,27 +150,26 @@ bqueue_enqueue_flush(bqueue_t *q, void *data, uint64_t item_size) void * bqueue_dequeue(bqueue_t *q) { - void *ret = NULL; - uint64_t item_size; - mutex_enter(&q->bq_lock); - while (q->bq_size == 0) { - cv_wait_sig(&q->bq_pop_cv, &q->bq_lock); + void *ret = list_remove_head(&q->bq_dequeuing_list); + if (ret == NULL) { + /* + * Dequeuing list is empty. Wait for there to be something on + * the shared list, then move the entire shared list to the + * dequeuing list. + */ + mutex_enter(&q->bq_lock); + while (q->bq_size == 0) { + cv_wait_sig(&q->bq_pop_cv, &q->bq_lock); + } + ASSERT0(q->bq_dequeuing_size); + ASSERT(list_is_empty(&q->bq_dequeuing_list)); + list_move_tail(&q->bq_dequeuing_list, &q->bq_list); + q->bq_dequeuing_size = q->bq_size; + q->bq_size = 0; + cv_broadcast(&q->bq_add_cv); + mutex_exit(&q->bq_lock); + ret = list_remove_head(&q->bq_dequeuing_list); } - ret = list_remove_head(&q->bq_list); - ASSERT3P(ret, !=, NULL); - item_size = obj2node(q, ret)->bqn_size; - q->bq_size -= item_size; - if (q->bq_size <= q->bq_maxsize - (q->bq_maxsize / q->bq_fill_fraction)) - cv_signal(&q->bq_add_cv); - mutex_exit(&q->bq_lock); + q->bq_dequeuing_size -= obj2node(q, ret)->bqn_size; return (ret); } - -/* - * Returns true if the space used is 0. - */ -boolean_t -bqueue_empty(bqueue_t *q) -{ - return (q->bq_size == 0); -} diff --git a/sys/contrib/openzfs/module/zfs/brt.c b/sys/contrib/openzfs/module/zfs/brt.c new file mode 100644 index 000000000000..ea8c0735c4b7 --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/brt.c @@ -0,0 +1,1673 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zio.h> +#include <sys/brt.h> +#include <sys/brt_impl.h> +#include <sys/ddt.h> +#include <sys/bitmap.h> +#include <sys/zap.h> +#include <sys/dmu_tx.h> +#include <sys/arc.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_scan.h> +#include <sys/vdev_impl.h> +#include <sys/kstat.h> +#include <sys/wmsum.h> + +/* + * Block Cloning design. + * + * Block Cloning allows to manually clone a file (or a subset of its blocks) + * into another (or the same) file by just creating additional references to + * the data blocks without copying the data itself. Those references are kept + * in the Block Reference Tables (BRTs). 
+ *
+ * In many ways this is similar to the existing deduplication, but there are
+ * some important differences:
+ *
+ * - Deduplication is automatic and Block Cloning is not - one has to use
+ * dedicated system call(s) to clone the given file/blocks.
+ * - Deduplication keeps all data blocks in its table, even those referenced
+ * just once. Block Cloning creates an entry in its tables only when there
+ * are at least two references to the given data block. If the block was
+ * never explicitly cloned or the second to last reference was dropped,
+ * there will be neither space nor performance overhead.
+ * - Deduplication needs data to work - one needs to pass real data to the
+ * write(2) syscall, so a hash can be calculated. Block Cloning doesn't require
+ * data, just block pointers to the data, so it is extremely fast, as we pay
+ * neither the cost of reading the data, nor the cost of writing the data -
+ * we operate exclusively on metadata.
+ * - If the D (dedup) bit is not set in the block pointer, it means that
+ * the block is not in the dedup table (DDT) and we won't consult the DDT
+ * when we need to free the block. With Block Cloning, the BRT must be
+ * consulted on every free, because we cannot modify the source BP (e.g. by
+ * setting something similar to the D bit), thus we have no hint whether the
+ * block is in the Block Reference Table (BRT), so we need to look into the BRT.
+ * There is an optimization in place that allows us to eliminate the majority
+ * of BRT lookups, which is described below in the "Minimizing free penalty"
+ * section.
+ * - The BRT entry is much smaller than the DDT entry - for the BRT we store
+ * only a 64bit offset and a 64bit reference counter.
+ * - Dedup keys are cryptographic hashes, so two blocks that are close to each
+ * other on disk are most likely in totally different parts of the DDT.
+ * The BRT entry keys are offsets into a single top-level VDEV, so data blocks
+ * from one file should have BRT entries close to each other.
+ * - Scrub will only do a single pass over a block that is referenced multiple
+ * times in the DDT. Unfortunately it is not currently (if at all) possible
+ * with Block Cloning, and a block referenced multiple times will be scrubbed
+ * multiple times. The new, sorted scrub should be able to eliminate
+ * duplicated reads given enough memory.
+ * - Deduplication requires a cryptographically strong hash as a checksum or
+ * additional data verification. Block Cloning works with any checksum
+ * algorithm or even with checksumming disabled.
+ *
+ * As mentioned above, the BRT entries are much smaller than the DDT entries.
+ * To uniquely identify a block we just need its vdev id and offset. We also
+ * need to maintain a reference counter. The vdev id will often repeat, as there
+ * is a small number of top-level VDEVs and a large number of blocks stored in
+ * each VDEV. We take advantage of that to reduce the BRT entry size further by
+ * maintaining one BRT for each top-level VDEV, so we can then have only the
+ * offset and the counter as the BRT entry.
+ *
+ * Minimizing free penalty.
+ *
+ * Block Cloning allows creating additional references to any existing block.
+ * When we free a block there is no hint in the block pointer whether the block
+ * was cloned or not, so on each free we have to check if there is a
+ * corresponding entry in the BRT or not. If there is, we need to decrease
+ * the reference counter. Doing a BRT lookup on every free can potentially be
+ * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
+ * This is the main problem with deduplication, so we've learned our lesson and
+ * try not to repeat the same mistake here. How do we do that? We divide each
+ * top-level VDEV into 16MB regions. For each region we maintain a counter that
+ * is a sum of all the BRT entries that have offsets within the region. This
+ * creates the entries count array of 16bit numbers for each top-level VDEV.
+ * The entries count array is always kept in memory and updated on disk in the
+ * same transaction group as the BRT updates to keep everything in sync. We can
+ * keep the array in memory because it is very small. With 16MB regions and
+ * a 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
+ * the region size even further in the future). Now, when we want to free
+ * a block, we first consult the array. If the counter for the whole region is
+ * zero, there is no need to look for the BRT entry, as there isn't one for
+ * sure. If the counter for the region is greater than zero, only then will we
+ * do a BRT lookup, and if an entry is found we will decrease the reference
+ * counter in the BRT entry and in the entry counters array.
+ *
+ * The entry counters array is small, but can potentially be larger for very
+ * large VDEVs or smaller regions. In this case we don't want to rewrite the
+ * entire array on every change. We then divide the array into 32kB blocks and
+ * keep a bitmap of dirty blocks within a transaction group. When we sync the
+ * transaction group we can update only the parts of the entry counters array
+ * that were modified. Note: Keeping track of the dirty parts of the entry
+ * counters array is implemented, but updating only parts of the array on disk
+ * is not yet implemented - for now we will update the entire array if there
+ * was any change.
+ *
+ * The implementation tries to be economical: if the BRT is not used, or is no
+ * longer used, there will be no entries in the MOS and no additional memory
+ * used (e.g. the entry counters array is only allocated if needed).
+ *
+ * Interaction between Deduplication and Block Cloning.
+ *
+ * If both functionalities are in use, we could end up with a block that is
+ * referenced multiple times in both the DDT and the BRT. When we free one of
+ * the references we couldn't tell which table it belongs to, so we would have
+ * to decide which table takes precedence: do we first clear DDT references or
+ * BRT references? To avoid this dilemma the BRT cooperates with the DDT - if a
+ * given block is being cloned using the BRT and the BP has the D (dedup) bit
+ * set, the BRT will look up the DDT entry instead and increase the counter
+ * there. No BRT entry will be created for a block which has the D (dedup) bit
+ * set. The BRT may be more efficient for manual deduplication, but if the
+ * block is already in the DDT, then creating an additional BRT entry would be
+ * less efficient. This clever idea was proposed by Allan Jude.
+ *
+ * Block Cloning across datasets.
+ *
+ * Block Cloning is not limited to cloning blocks within the same dataset.
+ * It is possible (and very useful) to clone blocks between different datasets.
+ * One use case is recovering files from snapshots. By cloning the files into
+ * the dataset we need no additional storage. Without Block Cloning we would
+ * need additional space for those files.
+ * Another interesting use case is moving the files between datasets
+ * (copying the file content to the new dataset and removing the source file).
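A standalone sketch of the per-region entry-count check from the "Minimizing free penalty" section above (editorial illustration only: the ex_* names and the flat in-memory array are made up here; the real code keeps this state in brt_vdev_t and persists it in the MOS):

#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>

#define EX_RANGESIZE	(16ULL * 1024 * 1024)	/* 16MB regions */

typedef struct ex_brt_vdev {
	uint16_t *ev_entcount;	/* one 16-bit counter per region */
	uint64_t ev_nregions;
} ex_brt_vdev_t;

/* Round the vdev size up to whole regions and allocate the counters. */
static int
ex_brt_vdev_init(ex_brt_vdev_t *ev, uint64_t vdev_asize)
{
	ev->ev_nregions = (vdev_asize + EX_RANGESIZE - 1) / EX_RANGESIZE;
	ev->ev_entcount = calloc(ev->ev_nregions, sizeof (uint16_t));
	return (ev->ev_entcount == NULL ? -1 : 0);
}

/* Called when a BRT entry covering this offset is created. */
static void
ex_brt_vdev_addref(ex_brt_vdev_t *ev, uint64_t offset)
{
	ev->ev_entcount[offset / EX_RANGESIZE]++;
}

/* Called when a BRT entry covering this offset goes away. */
static void
ex_brt_vdev_decref(ex_brt_vdev_t *ev, uint64_t offset)
{
	ev->ev_entcount[offset / EX_RANGESIZE]--;
}

/*
 * The cheap check done on every free: a zero region counter guarantees there
 * is no BRT entry for this offset, so the (possibly I/O-bound) BRT lookup can
 * be skipped.  A non-zero counter may still be a false positive for this
 * particular offset.
 */
static bool
ex_brt_maybe_exists(const ex_brt_vdev_t *ev, uint64_t offset)
{
	return (ev->ev_entcount[offset / EX_RANGESIZE] != 0);
}

With the 16MB region size used here, a 1TB vdev needs 65536 such counters, i.e. the 128kB figure quoted in the comment above.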
+ * In that case Block Cloning will only be used briefly, because the BRT entries
+ * will be removed when the source is removed.
+ * Block Cloning across encrypted datasets is supported as long as both
+ * datasets share the same master key (e.g. snapshots and clones).
+ *
+ * Block Cloning flow through ZFS layers.
+ *
+ * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
+ * blocks. As of this writing no interface is implemented that allows for block
+ * cloning within a ZVOL.
+ * FreeBSD and Linux provide the copy_file_range(2) system call and we use it
+ * for block cloning.
+ *
+ * ssize_t
+ * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
+ * size_t len, unsigned int flags);
+ *
+ * Even though offsets and length represent bytes, they have to be
+ * block-aligned or we will return an error so the upper layer can
+ * fall back to the generic mechanism that will just copy the data.
+ * Using copy_file_range(2) will call the OS-independent zfs_clone_range()
+ * function. This function was implemented based on zfs_write(), but instead of
+ * writing the given data we first read block pointers from the source file
+ * using the new dmu_read_l0_bps() function. Once we have BPs from the source
+ * file we call the dmu_brt_clone() function on the destination file. This
+ * function allocates BPs for us. We iterate over all source BPs. If the given
+ * BP is a hole or an embedded block, we just copy the BP as-is. If it points
+ * to real data, we place this BP on a BRT pending list using the
+ * brt_pending_add() function.
+ *
+ * We use this pending list to keep track of all BPs that got new references
+ * within this transaction group.
+ *
+ * Some special cases to consider and how we address them:
+ * - The block we want to clone may have been created within the same
+ * transaction group that we are trying to clone. Such a block has no BP
+ * allocated yet, so it cannot be cloned immediately. We return EAGAIN.
+ * - The block we want to clone may have been modified within the same
+ * transaction group. We return EAGAIN.
+ * - A block may be cloned multiple times during one transaction group (that's
+ * why the pending list is actually a tree and not an append-only list - this
+ * way we can figure out faster whether this block is cloned for the first time
+ * in this txg or a subsequent time).
+ * - A block may be cloned and freed within the same transaction group
+ * (see dbuf_undirty()).
+ * - A block may be cloned and within the same transaction group the clone
+ * can be cloned again (see dmu_read_l0_bps()).
+ * - A file might have been deleted, but the caller still has a file descriptor
+ * open to this file and clones it.
+ *
+ * When we free a block we have an additional step in the ZIO pipeline where we
+ * call the zio_brt_free() function. We then call brt_entry_decref(), which
+ * loads the corresponding BRT entry (if one exists) and decreases its
+ * reference counter. If this is not the last reference we will stop the ZIO
+ * pipeline here. If this is the last reference or the block is not in the
+ * BRT, we continue the pipeline and free the block as usual.
+ *
+ * At the beginning of spa_sync(), when there can be no more block cloning
+ * but before issuing frees, we call brt_pending_apply(). This function applies
+ * all the new clones to the BRT table - we load BRT entries and update
+ * reference counters. To sync new BRT entries to disk, we use brt_sync().
This function will sync all dirty per-top-level-vdev BRTs, + * the entry counters arrays, etc. + * + * Block Cloning and ZIL. + * + * Every clone operation is divided into chunks (similar to write) and each + * chunk is cloned in a separate transaction. The chunk size is determined by + * how many BPs we can fit into a single ZIL entry. + * Replaying clone operation is different from the regular clone operation, + * as when we log clone operations we cannot use the source object - it may + * reside on a different dataset, so we log BPs we want to clone. + * The ZIL is replayed when we mount the given dataset, not when the pool is + * imported. Taking this into account it is possible that the pool is imported + * without mounting datasets and the source dataset is destroyed before the + * destination dataset is mounted and its ZIL replayed. + * To address this situation we leverage zil_claim() mechanism where ZFS will + * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE + * entries, we will bump reference counters for their BPs in the BRT. Then + * on mount and ZIL replay we bump the reference counters once more, while the + * first references are dropped during ZIL destroy by zil_free_clone_range(). + * It is possible that after zil_claim() we never mount the destination, so + * we never replay its ZIL and just destroy it. In this case the only taken + * references will be dropped by zil_free_clone_range(), since the cloning is + * not going to ever take place. + */ + +static kmem_cache_t *brt_entry_cache; +static kmem_cache_t *brt_pending_entry_cache; + +/* + * Enable/disable prefetching of BRT entries that we are going to modify. + */ +static int brt_zap_prefetch = 1; + +#ifdef ZFS_DEBUG +#define BRT_DEBUG(...) do { \ + if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \ + __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \ + } \ +} while (0) +#else +#define BRT_DEBUG(...) 
do { } while (0) +#endif + +static int brt_zap_default_bs = 12; +static int brt_zap_default_ibs = 12; + +static kstat_t *brt_ksp; + +typedef struct brt_stats { + kstat_named_t brt_addref_entry_in_memory; + kstat_named_t brt_addref_entry_not_on_disk; + kstat_named_t brt_addref_entry_on_disk; + kstat_named_t brt_addref_entry_read_lost_race; + kstat_named_t brt_decref_entry_in_memory; + kstat_named_t brt_decref_entry_loaded_from_disk; + kstat_named_t brt_decref_entry_not_in_memory; + kstat_named_t brt_decref_entry_not_on_disk; + kstat_named_t brt_decref_entry_read_lost_race; + kstat_named_t brt_decref_entry_still_referenced; + kstat_named_t brt_decref_free_data_later; + kstat_named_t brt_decref_free_data_now; + kstat_named_t brt_decref_no_entry; +} brt_stats_t; + +static brt_stats_t brt_stats = { + { "addref_entry_in_memory", KSTAT_DATA_UINT64 }, + { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 }, + { "addref_entry_on_disk", KSTAT_DATA_UINT64 }, + { "addref_entry_read_lost_race", KSTAT_DATA_UINT64 }, + { "decref_entry_in_memory", KSTAT_DATA_UINT64 }, + { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 }, + { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 }, + { "decref_entry_not_on_disk", KSTAT_DATA_UINT64 }, + { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 }, + { "decref_entry_still_referenced", KSTAT_DATA_UINT64 }, + { "decref_free_data_later", KSTAT_DATA_UINT64 }, + { "decref_free_data_now", KSTAT_DATA_UINT64 }, + { "decref_no_entry", KSTAT_DATA_UINT64 } +}; + +struct { + wmsum_t brt_addref_entry_in_memory; + wmsum_t brt_addref_entry_not_on_disk; + wmsum_t brt_addref_entry_on_disk; + wmsum_t brt_addref_entry_read_lost_race; + wmsum_t brt_decref_entry_in_memory; + wmsum_t brt_decref_entry_loaded_from_disk; + wmsum_t brt_decref_entry_not_in_memory; + wmsum_t brt_decref_entry_not_on_disk; + wmsum_t brt_decref_entry_read_lost_race; + wmsum_t brt_decref_entry_still_referenced; + wmsum_t brt_decref_free_data_later; + wmsum_t brt_decref_free_data_now; + wmsum_t brt_decref_no_entry; +} brt_sums; + +#define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1) + +static int brt_entry_compare(const void *x1, const void *x2); +static int brt_pending_entry_compare(const void *x1, const void *x2); + +static void +brt_rlock(brt_t *brt) +{ + rw_enter(&brt->brt_lock, RW_READER); +} + +static void +brt_wlock(brt_t *brt) +{ + rw_enter(&brt->brt_lock, RW_WRITER); +} + +static void +brt_unlock(brt_t *brt) +{ + rw_exit(&brt->brt_lock); +} + +static uint16_t +brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx) +{ + + ASSERT3U(idx, <, brtvd->bv_size); + + if (unlikely(brtvd->bv_need_byteswap)) { + return (BSWAP_16(brtvd->bv_entcount[idx])); + } else { + return (brtvd->bv_entcount[idx]); + } +} + +static void +brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt) +{ + + ASSERT3U(idx, <, brtvd->bv_size); + + if (unlikely(brtvd->bv_need_byteswap)) { + brtvd->bv_entcount[idx] = BSWAP_16(entcnt); + } else { + brtvd->bv_entcount[idx] = entcnt; + } +} + +static void +brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx) +{ + uint16_t entcnt; + + ASSERT3U(idx, <, brtvd->bv_size); + + entcnt = brt_vdev_entcount_get(brtvd, idx); + ASSERT(entcnt < UINT16_MAX); + + brt_vdev_entcount_set(brtvd, idx, entcnt + 1); +} + +static void +brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx) +{ + uint16_t entcnt; + + ASSERT3U(idx, <, brtvd->bv_size); + + entcnt = brt_vdev_entcount_get(brtvd, idx); + ASSERT(entcnt > 0); + + brt_vdev_entcount_set(brtvd, idx, entcnt - 1); +} + +#ifdef ZFS_DEBUG +static 
void +brt_vdev_dump(brt_vdev_t *brtvd) +{ + uint64_t idx; + + zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d " + "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n", + (u_longlong_t)brtvd->bv_vdevid, + brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, + (u_longlong_t)brtvd->bv_size, + (u_longlong_t)brtvd->bv_totalcount, + (u_longlong_t)brtvd->bv_nblocks, + (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks)); + if (brtvd->bv_totalcount > 0) { + zfs_dbgmsg(" entcounts:"); + for (idx = 0; idx < brtvd->bv_size; idx++) { + uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx); + if (entcnt > 0) { + zfs_dbgmsg(" [%04llu] %hu", + (u_longlong_t)idx, entcnt); + } + } + } + if (brtvd->bv_entcount_dirty) { + char *bitmap; + + bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP); + for (idx = 0; idx < brtvd->bv_nblocks; idx++) { + bitmap[idx] = + BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; + } + bitmap[idx] = '\0'; + zfs_dbgmsg(" dirty: %s", bitmap); + kmem_free(bitmap, brtvd->bv_nblocks + 1); + } +} +#endif + +static brt_vdev_t * +brt_vdev(brt_t *brt, uint64_t vdevid) +{ + brt_vdev_t *brtvd; + + ASSERT(RW_LOCK_HELD(&brt->brt_lock)); + + if (vdevid < brt->brt_nvdevs) { + brtvd = &brt->brt_vdevs[vdevid]; + } else { + brtvd = NULL; + } + + return (brtvd); +} + +static void +brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) +{ + char name[64]; + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT0(brtvd->bv_mos_brtvdev); + ASSERT0(brtvd->bv_mos_entries); + ASSERT(brtvd->bv_entcount != NULL); + ASSERT(brtvd->bv_size > 0); + ASSERT(brtvd->bv_bitmap != NULL); + ASSERT(brtvd->bv_nblocks > 0); + + brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0, + ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, + brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx); + VERIFY(brtvd->bv_mos_entries != 0); + BRT_DEBUG("MOS entries created, object=%llu", + (u_longlong_t)brtvd->bv_mos_entries); + + /* + * We allocate DMU buffer to store the bv_entcount[] array. + * We will keep array size (bv_size) and cummulative count for all + * bv_entcount[]s (bv_totalcount) in the bonus buffer. 
+ */ + brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos, + DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE, + DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx); + VERIFY(brtvd->bv_mos_brtvdev != 0); + BRT_DEBUG("MOS BRT VDEV created, object=%llu", + (u_longlong_t)brtvd->bv_mos_brtvdev); + + snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, + (u_longlong_t)brtvd->bv_vdevid); + VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx)); + BRT_DEBUG("Pool directory object created, object=%s", name); + + spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); +} + +static void +brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) +{ + vdev_t *vd; + uint16_t *entcount; + ulong_t *bitmap; + uint64_t nblocks, size; + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + + spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER); + vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid); + size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1; + spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG); + + entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); + nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); + bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP); + + if (!brtvd->bv_initiated) { + ASSERT0(brtvd->bv_size); + ASSERT(brtvd->bv_entcount == NULL); + ASSERT(brtvd->bv_bitmap == NULL); + ASSERT0(brtvd->bv_nblocks); + + avl_create(&brtvd->bv_tree, brt_entry_compare, + sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); + } else { + ASSERT(brtvd->bv_size > 0); + ASSERT(brtvd->bv_entcount != NULL); + ASSERT(brtvd->bv_bitmap != NULL); + ASSERT(brtvd->bv_nblocks > 0); + /* + * TODO: Allow vdev shrinking. We only need to implement + * shrinking the on-disk BRT VDEV object. + * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset, + * size, tx); + */ + ASSERT3U(brtvd->bv_size, <=, size); + + memcpy(entcount, brtvd->bv_entcount, + sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); + memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), + BT_SIZEOFMAP(brtvd->bv_nblocks))); + vmem_free(brtvd->bv_entcount, + sizeof (entcount[0]) * brtvd->bv_size); + kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); + } + + brtvd->bv_size = size; + brtvd->bv_entcount = entcount; + brtvd->bv_bitmap = bitmap; + brtvd->bv_nblocks = nblocks; + if (!brtvd->bv_initiated) { + brtvd->bv_need_byteswap = FALSE; + brtvd->bv_initiated = TRUE; + BRT_DEBUG("BRT VDEV %llu initiated.", + (u_longlong_t)brtvd->bv_vdevid); + } +} + +static void +brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd) +{ + char name[64]; + dmu_buf_t *db; + brt_vdev_phys_t *bvphys; + int error; + + snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, + (u_longlong_t)brtvd->bv_vdevid); + error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev); + if (error != 0) + return; + ASSERT(brtvd->bv_mos_brtvdev != 0); + + error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db); + ASSERT0(error); + if (error != 0) + return; + + bvphys = db->db_data; + if (brt->brt_rangesize == 0) { + brt->brt_rangesize = bvphys->bvp_rangesize; + } else { + ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize); + } + + ASSERT(!brtvd->bv_initiated); + brt_vdev_realloc(brt, brtvd); + + /* TODO: We don't support VDEV shrinking. */ + ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size); + + /* + * If VDEV grew, we will leave new bv_entcount[] entries zeroed out. 
+ */ + error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, + MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t), + brtvd->bv_entcount, DMU_READ_NO_PREFETCH); + ASSERT0(error); + + brtvd->bv_mos_entries = bvphys->bvp_mos_entries; + ASSERT(brtvd->bv_mos_entries != 0); + brtvd->bv_need_byteswap = + (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER); + brtvd->bv_totalcount = bvphys->bvp_totalcount; + brtvd->bv_usedspace = bvphys->bvp_usedspace; + brtvd->bv_savedspace = bvphys->bvp_savedspace; + brt->brt_usedspace += brtvd->bv_usedspace; + brt->brt_savedspace += brtvd->bv_savedspace; + + dmu_buf_rele(db, FTAG); + + BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu", + name, (u_longlong_t)brtvd->bv_mos_brtvdev, + (u_longlong_t)brtvd->bv_mos_entries); +} + +static void +brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd) +{ + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(brtvd->bv_initiated); + + vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); + brtvd->bv_entcount = NULL; + kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); + brtvd->bv_bitmap = NULL; + ASSERT0(avl_numnodes(&brtvd->bv_tree)); + avl_destroy(&brtvd->bv_tree); + + brtvd->bv_size = 0; + brtvd->bv_nblocks = 0; + + brtvd->bv_initiated = FALSE; + BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid); +} + +static void +brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) +{ + char name[64]; + uint64_t count; + dmu_buf_t *db; + brt_vdev_phys_t *bvphys; + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(brtvd->bv_mos_brtvdev != 0); + ASSERT(brtvd->bv_mos_entries != 0); + + VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count)); + VERIFY0(count); + VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx)); + BRT_DEBUG("MOS entries destroyed, object=%llu", + (u_longlong_t)brtvd->bv_mos_entries); + brtvd->bv_mos_entries = 0; + + VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); + bvphys = db->db_data; + ASSERT0(bvphys->bvp_totalcount); + ASSERT0(bvphys->bvp_usedspace); + ASSERT0(bvphys->bvp_savedspace); + dmu_buf_rele(db, FTAG); + + VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx)); + BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu", + (u_longlong_t)brtvd->bv_mos_brtvdev); + brtvd->bv_mos_brtvdev = 0; + + snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, + (u_longlong_t)brtvd->bv_vdevid); + VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + BRT_DEBUG("Pool directory object removed, object=%s", name); + + brt_vdev_dealloc(brt, brtvd); + + spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); +} + +static void +brt_vdevs_expand(brt_t *brt, uint64_t nvdevs) +{ + brt_vdev_t *brtvd, *vdevs; + uint64_t vdevid; + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT3U(nvdevs, >, brt->brt_nvdevs); + + vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP); + if (brt->brt_nvdevs > 0) { + ASSERT(brt->brt_vdevs != NULL); + + memcpy(vdevs, brt->brt_vdevs, + sizeof (brt_vdev_t) * brt->brt_nvdevs); + kmem_free(brt->brt_vdevs, + sizeof (brt_vdev_t) * brt->brt_nvdevs); + } + for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) { + brtvd = &vdevs[vdevid]; + + brtvd->bv_vdevid = vdevid; + brtvd->bv_initiated = FALSE; + } + + BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.", + (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs); + + brt->brt_vdevs = vdevs; + brt->brt_nvdevs = nvdevs; +} + +static boolean_t +brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t 
*bre) +{ + uint64_t idx; + + ASSERT(RW_LOCK_HELD(&brt->brt_lock)); + + idx = bre->bre_offset / brt->brt_rangesize; + if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) { + /* VDEV wasn't expanded. */ + return (brt_vdev_entcount_get(brtvd, idx) > 0); + } + + return (FALSE); +} + +static void +brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, + uint64_t dsize) +{ + uint64_t idx; + + ASSERT(RW_LOCK_HELD(&brt->brt_lock)); + ASSERT(brtvd != NULL); + ASSERT(brtvd->bv_entcount != NULL); + + brt->brt_savedspace += dsize; + brtvd->bv_savedspace += dsize; + brtvd->bv_meta_dirty = TRUE; + + if (bre->bre_refcount > 1) { + return; + } + + brt->brt_usedspace += dsize; + brtvd->bv_usedspace += dsize; + + idx = bre->bre_offset / brt->brt_rangesize; + if (idx >= brtvd->bv_size) { + /* VDEV has been expanded. */ + brt_vdev_realloc(brt, brtvd); + } + + ASSERT3U(idx, <, brtvd->bv_size); + + brtvd->bv_totalcount++; + brt_vdev_entcount_inc(brtvd, idx); + brtvd->bv_entcount_dirty = TRUE; + idx = idx / BRT_BLOCKSIZE / 8; + BT_SET(brtvd->bv_bitmap, idx); + +#ifdef ZFS_DEBUG + if (zfs_flags & ZFS_DEBUG_BRT) + brt_vdev_dump(brtvd); +#endif +} + +static void +brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, + uint64_t dsize) +{ + uint64_t idx; + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(brtvd != NULL); + ASSERT(brtvd->bv_entcount != NULL); + + brt->brt_savedspace -= dsize; + brtvd->bv_savedspace -= dsize; + brtvd->bv_meta_dirty = TRUE; + + if (bre->bre_refcount > 0) { + return; + } + + brt->brt_usedspace -= dsize; + brtvd->bv_usedspace -= dsize; + + idx = bre->bre_offset / brt->brt_rangesize; + ASSERT3U(idx, <, brtvd->bv_size); + + ASSERT(brtvd->bv_totalcount > 0); + brtvd->bv_totalcount--; + brt_vdev_entcount_dec(brtvd, idx); + brtvd->bv_entcount_dirty = TRUE; + idx = idx / BRT_BLOCKSIZE / 8; + BT_SET(brtvd->bv_bitmap, idx); + +#ifdef ZFS_DEBUG + if (zfs_flags & ZFS_DEBUG_BRT) + brt_vdev_dump(brtvd); +#endif +} + +static void +brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) +{ + dmu_buf_t *db; + brt_vdev_phys_t *bvphys; + + ASSERT(brtvd->bv_meta_dirty); + ASSERT(brtvd->bv_mos_brtvdev != 0); + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); + + if (brtvd->bv_entcount_dirty) { + /* + * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks. 
+ */ + dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, + brtvd->bv_size * sizeof (brtvd->bv_entcount[0]), + brtvd->bv_entcount, tx); + memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks)); + brtvd->bv_entcount_dirty = FALSE; + } + + dmu_buf_will_dirty(db, tx); + bvphys = db->db_data; + bvphys->bvp_mos_entries = brtvd->bv_mos_entries; + bvphys->bvp_size = brtvd->bv_size; + if (brtvd->bv_need_byteswap) { + bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER; + } else { + bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER; + } + bvphys->bvp_totalcount = brtvd->bv_totalcount; + bvphys->bvp_rangesize = brt->brt_rangesize; + bvphys->bvp_usedspace = brtvd->bv_usedspace; + bvphys->bvp_savedspace = brtvd->bv_savedspace; + dmu_buf_rele(db, FTAG); + + brtvd->bv_meta_dirty = FALSE; +} + +static void +brt_vdevs_alloc(brt_t *brt, boolean_t load) +{ + brt_vdev_t *brtvd; + uint64_t vdevid; + + brt_wlock(brt); + + brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children); + + if (load) { + for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { + brtvd = &brt->brt_vdevs[vdevid]; + ASSERT(brtvd->bv_entcount == NULL); + + brt_vdev_load(brt, brtvd); + } + } + + if (brt->brt_rangesize == 0) { + brt->brt_rangesize = BRT_RANGESIZE; + } + + brt_unlock(brt); +} + +static void +brt_vdevs_free(brt_t *brt) +{ + brt_vdev_t *brtvd; + uint64_t vdevid; + + brt_wlock(brt); + + for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { + brtvd = &brt->brt_vdevs[vdevid]; + if (brtvd->bv_initiated) + brt_vdev_dealloc(brt, brtvd); + } + kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs); + + brt_unlock(brt); +} + +static void +brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp) +{ + + bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); + bre->bre_refcount = 0; + + *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]); +} + +static int +brt_entry_compare(const void *x1, const void *x2) +{ + const brt_entry_t *bre1 = x1; + const brt_entry_t *bre2 = x2; + + return (TREE_CMP(bre1->bre_offset, bre2->bre_offset)); +} + +static int +brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) +{ + uint64_t mos_entries; + int error; + + ASSERT(RW_LOCK_HELD(&brt->brt_lock)); + + if (!brt_vdev_lookup(brt, brtvd, bre)) + return (SET_ERROR(ENOENT)); + + /* + * Remember mos_entries object number. After we reacquire the BRT lock, + * the brtvd pointer may be invalid. + */ + mos_entries = brtvd->bv_mos_entries; + if (mos_entries == 0) + return (SET_ERROR(ENOENT)); + + brt_unlock(brt); + + error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, + BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount); + + brt_wlock(brt); + + return (error); +} + +static void +brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) +{ + brt_vdev_t *brtvd; + uint64_t mos_entries = 0; + + brt_rlock(brt); + brtvd = brt_vdev(brt, vdevid); + if (brtvd != NULL) + mos_entries = brtvd->bv_mos_entries; + brt_unlock(brt); + + if (mos_entries == 0) + return; + + (void) zap_prefetch_uint64(brt->brt_mos, mos_entries, + (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS); +} + +/* + * Return TRUE if we _can_ have BRT entry for this bp. It might be false + * positive, but gives us quick answer if we should look into BRT, which + * may require reads and thus will be more expensive. 
+ */ +boolean_t +brt_maybe_exists(spa_t *spa, const blkptr_t *bp) +{ + brt_t *brt = spa->spa_brt; + brt_vdev_t *brtvd; + brt_entry_t bre_search; + boolean_t mayexists = FALSE; + uint64_t vdevid; + + brt_entry_fill(bp, &bre_search, &vdevid); + + brt_rlock(brt); + + brtvd = brt_vdev(brt, vdevid); + if (brtvd != NULL && brtvd->bv_initiated) { + if (!avl_is_empty(&brtvd->bv_tree) || + brt_vdev_lookup(brt, brtvd, &bre_search)) { + mayexists = TRUE; + } + } + + brt_unlock(brt); + + return (mayexists); +} + +uint64_t +brt_get_dspace(spa_t *spa) +{ + brt_t *brt = spa->spa_brt; + + if (brt == NULL) + return (0); + + return (brt->brt_savedspace); +} + +uint64_t +brt_get_used(spa_t *spa) +{ + brt_t *brt = spa->spa_brt; + + if (brt == NULL) + return (0); + + return (brt->brt_usedspace); +} + +uint64_t +brt_get_saved(spa_t *spa) +{ + brt_t *brt = spa->spa_brt; + + if (brt == NULL) + return (0); + + return (brt->brt_savedspace); +} + +uint64_t +brt_get_ratio(spa_t *spa) +{ + brt_t *brt = spa->spa_brt; + + if (brt->brt_usedspace == 0) + return (100); + + return ((brt->brt_usedspace + brt->brt_savedspace) * 100 / + brt->brt_usedspace); +} + +static int +brt_kstats_update(kstat_t *ksp, int rw) +{ + brt_stats_t *bs = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + bs->brt_addref_entry_in_memory.value.ui64 = + wmsum_value(&brt_sums.brt_addref_entry_in_memory); + bs->brt_addref_entry_not_on_disk.value.ui64 = + wmsum_value(&brt_sums.brt_addref_entry_not_on_disk); + bs->brt_addref_entry_on_disk.value.ui64 = + wmsum_value(&brt_sums.brt_addref_entry_on_disk); + bs->brt_addref_entry_read_lost_race.value.ui64 = + wmsum_value(&brt_sums.brt_addref_entry_read_lost_race); + bs->brt_decref_entry_in_memory.value.ui64 = + wmsum_value(&brt_sums.brt_decref_entry_in_memory); + bs->brt_decref_entry_loaded_from_disk.value.ui64 = + wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk); + bs->brt_decref_entry_not_in_memory.value.ui64 = + wmsum_value(&brt_sums.brt_decref_entry_not_in_memory); + bs->brt_decref_entry_not_on_disk.value.ui64 = + wmsum_value(&brt_sums.brt_decref_entry_not_on_disk); + bs->brt_decref_entry_read_lost_race.value.ui64 = + wmsum_value(&brt_sums.brt_decref_entry_read_lost_race); + bs->brt_decref_entry_still_referenced.value.ui64 = + wmsum_value(&brt_sums.brt_decref_entry_still_referenced); + bs->brt_decref_free_data_later.value.ui64 = + wmsum_value(&brt_sums.brt_decref_free_data_later); + bs->brt_decref_free_data_now.value.ui64 = + wmsum_value(&brt_sums.brt_decref_free_data_now); + bs->brt_decref_no_entry.value.ui64 = + wmsum_value(&brt_sums.brt_decref_no_entry); + + return (0); +} + +static void +brt_stat_init(void) +{ + + wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0); + wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0); + wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0); + wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0); + wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0); + wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0); + wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0); + wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0); + wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0); + wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0); + wmsum_init(&brt_sums.brt_decref_free_data_later, 0); + wmsum_init(&brt_sums.brt_decref_free_data_now, 0); + wmsum_init(&brt_sums.brt_decref_no_entry, 0); + + brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED, + sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if 
(brt_ksp != NULL) { + brt_ksp->ks_data = &brt_stats; + brt_ksp->ks_update = brt_kstats_update; + kstat_install(brt_ksp); + } +} + +static void +brt_stat_fini(void) +{ + if (brt_ksp != NULL) { + kstat_delete(brt_ksp); + brt_ksp = NULL; + } + + wmsum_fini(&brt_sums.brt_addref_entry_in_memory); + wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk); + wmsum_fini(&brt_sums.brt_addref_entry_on_disk); + wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race); + wmsum_fini(&brt_sums.brt_decref_entry_in_memory); + wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk); + wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory); + wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk); + wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race); + wmsum_fini(&brt_sums.brt_decref_entry_still_referenced); + wmsum_fini(&brt_sums.brt_decref_free_data_later); + wmsum_fini(&brt_sums.brt_decref_free_data_now); + wmsum_fini(&brt_sums.brt_decref_no_entry); +} + +void +brt_init(void) +{ + brt_entry_cache = kmem_cache_create("brt_entry_cache", + sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache", + sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + brt_stat_init(); +} + +void +brt_fini(void) +{ + brt_stat_fini(); + + kmem_cache_destroy(brt_entry_cache); + kmem_cache_destroy(brt_pending_entry_cache); +} + +static brt_entry_t * +brt_entry_alloc(const brt_entry_t *bre_init) +{ + brt_entry_t *bre; + + bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); + bre->bre_offset = bre_init->bre_offset; + bre->bre_refcount = bre_init->bre_refcount; + + return (bre); +} + +static void +brt_entry_free(brt_entry_t *bre) +{ + + kmem_cache_free(brt_entry_cache, bre); +} + +static void +brt_entry_addref(brt_t *brt, const blkptr_t *bp) +{ + brt_vdev_t *brtvd; + brt_entry_t *bre, *racebre; + brt_entry_t bre_search; + avl_index_t where; + uint64_t vdevid; + int error; + + ASSERT(!RW_WRITE_HELD(&brt->brt_lock)); + + brt_entry_fill(bp, &bre_search, &vdevid); + + brt_wlock(brt); + + brtvd = brt_vdev(brt, vdevid); + if (brtvd == NULL) { + ASSERT3U(vdevid, >=, brt->brt_nvdevs); + + /* New VDEV was added. */ + brt_vdevs_expand(brt, vdevid + 1); + brtvd = brt_vdev(brt, vdevid); + } + ASSERT(brtvd != NULL); + if (!brtvd->bv_initiated) + brt_vdev_realloc(brt, brtvd); + + bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); + if (bre != NULL) { + BRTSTAT_BUMP(brt_addref_entry_in_memory); + } else { + /* + * brt_entry_lookup() may drop the BRT (read) lock and + * reacquire it (write). + */ + error = brt_entry_lookup(brt, brtvd, &bre_search); + /* bre_search now contains correct bre_refcount */ + ASSERT(error == 0 || error == ENOENT); + if (error == 0) + BRTSTAT_BUMP(brt_addref_entry_on_disk); + else + BRTSTAT_BUMP(brt_addref_entry_not_on_disk); + /* + * When the BRT lock was dropped, brt_vdevs[] may have been + * expanded and reallocated, we need to update brtvd's pointer. + */ + brtvd = brt_vdev(brt, vdevid); + ASSERT(brtvd != NULL); + + racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); + if (racebre == NULL) { + bre = brt_entry_alloc(&bre_search); + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + avl_insert(&brtvd->bv_tree, bre, where); + brt->brt_nentries++; + } else { + /* + * The entry was added when the BRT lock was dropped in + * brt_entry_lookup(). 
+ */ + BRTSTAT_BUMP(brt_addref_entry_read_lost_race); + bre = racebre; + } + } + bre->bre_refcount++; + brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); + + brt_unlock(brt); +} + +/* Return TRUE if block should be freed immediately. */ +boolean_t +brt_entry_decref(spa_t *spa, const blkptr_t *bp) +{ + brt_t *brt = spa->spa_brt; + brt_vdev_t *brtvd; + brt_entry_t *bre, *racebre; + brt_entry_t bre_search; + avl_index_t where; + uint64_t vdevid; + int error; + + brt_entry_fill(bp, &bre_search, &vdevid); + + brt_wlock(brt); + + brtvd = brt_vdev(brt, vdevid); + ASSERT(brtvd != NULL); + + bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); + if (bre != NULL) { + BRTSTAT_BUMP(brt_decref_entry_in_memory); + goto out; + } else { + BRTSTAT_BUMP(brt_decref_entry_not_in_memory); + } + + /* + * brt_entry_lookup() may drop the BRT lock and reacquire it. + */ + error = brt_entry_lookup(brt, brtvd, &bre_search); + /* bre_search now contains correct bre_refcount */ + ASSERT(error == 0 || error == ENOENT); + /* + * When the BRT lock was dropped, brt_vdevs[] may have been expanded + * and reallocated, we need to update brtvd's pointer. + */ + brtvd = brt_vdev(brt, vdevid); + ASSERT(brtvd != NULL); + + if (error == ENOENT) { + BRTSTAT_BUMP(brt_decref_entry_not_on_disk); + bre = NULL; + goto out; + } + + racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); + if (racebre != NULL) { + /* + * The entry was added when the BRT lock was dropped in + * brt_entry_lookup(). + */ + BRTSTAT_BUMP(brt_decref_entry_read_lost_race); + bre = racebre; + goto out; + } + + BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk); + bre = brt_entry_alloc(&bre_search); + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + avl_insert(&brtvd->bv_tree, bre, where); + brt->brt_nentries++; + +out: + if (bre == NULL) { + /* + * This is a free of a regular (not cloned) block. 
+ */ + brt_unlock(brt); + BRTSTAT_BUMP(brt_decref_no_entry); + return (B_TRUE); + } + if (bre->bre_refcount == 0) { + brt_unlock(brt); + BRTSTAT_BUMP(brt_decref_free_data_now); + return (B_TRUE); + } + + ASSERT(bre->bre_refcount > 0); + bre->bre_refcount--; + if (bre->bre_refcount == 0) + BRTSTAT_BUMP(brt_decref_free_data_later); + else + BRTSTAT_BUMP(brt_decref_entry_still_referenced); + brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); + + brt_unlock(brt); + + return (B_FALSE); +} + +uint64_t +brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) +{ + brt_t *brt = spa->spa_brt; + brt_vdev_t *brtvd; + brt_entry_t bre_search, *bre; + uint64_t vdevid, refcnt; + int error; + + brt_entry_fill(bp, &bre_search, &vdevid); + + brt_rlock(brt); + + brtvd = brt_vdev(brt, vdevid); + ASSERT(brtvd != NULL); + + bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); + if (bre == NULL) { + error = brt_entry_lookup(brt, brtvd, &bre_search); + ASSERT(error == 0 || error == ENOENT); + if (error == ENOENT) + refcnt = 0; + else + refcnt = bre_search.bre_refcount; + } else + refcnt = bre->bre_refcount; + + brt_unlock(brt); + return (refcnt); +} + +static void +brt_prefetch(brt_t *brt, const blkptr_t *bp) +{ + brt_entry_t bre; + uint64_t vdevid; + + ASSERT(bp != NULL); + + if (!brt_zap_prefetch) + return; + + brt_entry_fill(bp, &bre, &vdevid); + + brt_entry_prefetch(brt, vdevid, &bre); +} + +static int +brt_pending_entry_compare(const void *x1, const void *x2) +{ + const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2; + const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp; + int cmp; + + cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), + DVA_GET_VDEV(&bp2->blk_dva[0])); + if (cmp == 0) { + cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), + DVA_GET_OFFSET(&bp2->blk_dva[0])); + if (unlikely(cmp == 0)) { + cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2)); + } + } + + return (cmp); +} + +void +brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) +{ + brt_t *brt; + avl_tree_t *pending_tree; + kmutex_t *pending_lock; + brt_pending_entry_t *bpe, *newbpe; + avl_index_t where; + uint64_t txg; + + brt = spa->spa_brt; + txg = dmu_tx_get_txg(tx); + ASSERT3U(txg, !=, 0); + pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; + pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; + + newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP); + newbpe->bpe_bp = *bp; + newbpe->bpe_count = 1; + + mutex_enter(pending_lock); + + bpe = avl_find(pending_tree, newbpe, &where); + if (bpe == NULL) { + avl_insert(pending_tree, newbpe, where); + newbpe = NULL; + } else { + bpe->bpe_count++; + } + + mutex_exit(pending_lock); + + if (newbpe != NULL) { + ASSERT(bpe != NULL); + ASSERT(bpe != newbpe); + kmem_cache_free(brt_pending_entry_cache, newbpe); + } else { + ASSERT(bpe == NULL); + + /* Prefetch BRT entry for the syncing context. */ + brt_prefetch(brt, bp); + } +} + +void +brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) +{ + brt_t *brt; + avl_tree_t *pending_tree; + kmutex_t *pending_lock; + brt_pending_entry_t *bpe, bpe_search; + uint64_t txg; + + brt = spa->spa_brt; + txg = dmu_tx_get_txg(tx); + ASSERT3U(txg, !=, 0); + pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; + pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; + + bpe_search.bpe_bp = *bp; + + mutex_enter(pending_lock); + + bpe = avl_find(pending_tree, &bpe_search, NULL); + /* I believe we should always find bpe when this function is called. 
*/ + if (bpe != NULL) { + ASSERT(bpe->bpe_count > 0); + + bpe->bpe_count--; + if (bpe->bpe_count == 0) { + avl_remove(pending_tree, bpe); + kmem_cache_free(brt_pending_entry_cache, bpe); + } + } + + mutex_exit(pending_lock); +} + +void +brt_pending_apply(spa_t *spa, uint64_t txg) +{ + brt_t *brt = spa->spa_brt; + brt_pending_entry_t *bpe; + avl_tree_t *pending_tree; + void *c; + + ASSERT3U(txg, !=, 0); + + /* + * We are in syncing context, so no other brt_pending_tree accesses + * are possible for the TXG. Don't need to acquire brt_pending_lock. + */ + pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; + + c = NULL; + while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) { + boolean_t added_to_ddt; + + for (int i = 0; i < bpe->bpe_count; i++) { + /* + * If the block has DEDUP bit set, it means that it + * already exists in the DEDUP table, so we can just + * use that instead of creating new entry in + * the BRT table. + */ + if (BP_GET_DEDUP(&bpe->bpe_bp)) { + added_to_ddt = ddt_addref(spa, &bpe->bpe_bp); + } else { + added_to_ddt = B_FALSE; + } + if (!added_to_ddt) + brt_entry_addref(brt, &bpe->bpe_bp); + } + + kmem_cache_free(brt_pending_entry_cache, bpe); + } +} + +static void +brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) +{ + if (bre->bre_refcount == 0) { + int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset, + BRT_KEY_WORDS, tx); + VERIFY(error == 0 || error == ENOENT); + } else { + VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset, + BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), + &bre->bre_refcount, tx)); + } +} + +static void +brt_sync_table(brt_t *brt, dmu_tx_t *tx) +{ + brt_vdev_t *brtvd; + brt_entry_t *bre; + dnode_t *dn; + uint64_t vdevid; + void *c; + + brt_wlock(brt); + + for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { + brtvd = &brt->brt_vdevs[vdevid]; + + if (!brtvd->bv_initiated) + continue; + + if (!brtvd->bv_meta_dirty) { + ASSERT(!brtvd->bv_entcount_dirty); + ASSERT0(avl_numnodes(&brtvd->bv_tree)); + continue; + } + + ASSERT(!brtvd->bv_entcount_dirty || + avl_numnodes(&brtvd->bv_tree) != 0); + + if (brtvd->bv_mos_brtvdev == 0) + brt_vdev_create(brt, brtvd, tx); + + VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries, + FTAG, &dn)); + + c = NULL; + while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { + brt_sync_entry(dn, bre, tx); + brt_entry_free(bre); + ASSERT(brt->brt_nentries > 0); + brt->brt_nentries--; + } + + dnode_rele(dn, FTAG); + + brt_vdev_sync(brt, brtvd, tx); + + if (brtvd->bv_totalcount == 0) + brt_vdev_destroy(brt, brtvd, tx); + } + + ASSERT0(brt->brt_nentries); + + brt_unlock(brt); +} + +void +brt_sync(spa_t *spa, uint64_t txg) +{ + dmu_tx_t *tx; + brt_t *brt; + + ASSERT(spa_syncing_txg(spa) == txg); + + brt = spa->spa_brt; + brt_rlock(brt); + if (brt->brt_nentries == 0) { + /* No changes. 
*/ + brt_unlock(brt); + return; + } + brt_unlock(brt); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + brt_sync_table(brt, tx); + + dmu_tx_commit(tx); +} + +static void +brt_table_alloc(brt_t *brt) +{ + + for (int i = 0; i < TXG_SIZE; i++) { + avl_create(&brt->brt_pending_tree[i], + brt_pending_entry_compare, + sizeof (brt_pending_entry_t), + offsetof(brt_pending_entry_t, bpe_node)); + mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT, + NULL); + } +} + +static void +brt_table_free(brt_t *brt) +{ + + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT(avl_is_empty(&brt->brt_pending_tree[i])); + + avl_destroy(&brt->brt_pending_tree[i]); + mutex_destroy(&brt->brt_pending_lock[i]); + } +} + +static void +brt_alloc(spa_t *spa) +{ + brt_t *brt; + + ASSERT(spa->spa_brt == NULL); + + brt = kmem_zalloc(sizeof (*brt), KM_SLEEP); + rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL); + brt->brt_spa = spa; + brt->brt_rangesize = 0; + brt->brt_nentries = 0; + brt->brt_vdevs = NULL; + brt->brt_nvdevs = 0; + brt_table_alloc(brt); + + spa->spa_brt = brt; +} + +void +brt_create(spa_t *spa) +{ + + brt_alloc(spa); + brt_vdevs_alloc(spa->spa_brt, B_FALSE); +} + +int +brt_load(spa_t *spa) +{ + + brt_alloc(spa); + brt_vdevs_alloc(spa->spa_brt, B_TRUE); + + return (0); +} + +void +brt_unload(spa_t *spa) +{ + brt_t *brt = spa->spa_brt; + + if (brt == NULL) + return; + + brt_vdevs_free(brt); + brt_table_free(brt); + rw_destroy(&brt->brt_lock); + kmem_free(brt, sizeof (*brt)); + spa->spa_brt = NULL; +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW, + "Enable prefetching of BRT ZAP entries"); +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW, + "BRT ZAP leaf blockshift"); +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW, + "BRT ZAP indirect blockshift"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/btree.c b/sys/contrib/openzfs/module/zfs/btree.c index 57b9dbbb2b50..9c52083603f1 100644 --- a/sys/contrib/openzfs/module/zfs/btree.c +++ b/sys/contrib/openzfs/module/zfs/btree.c @@ -53,18 +53,30 @@ kmem_cache_t *zfs_btree_leaf_cache; * (while the asymptotic complexity of the other steps is the same, the * importance of the constant factors cannot be denied). */ -int zfs_btree_verify_intensity = 0; +uint_t zfs_btree_verify_intensity = 0; /* - * A convenience function to silence warnings from memmove's return value and - * change argument order to src, dest. + * Convenience functions to silence warnings from memcpy/memmove's + * return values and change argument order to src, dest. 
*/ static void +bcpy(const void *src, void *dest, size_t size) +{ + (void) memcpy(dest, src, size); +} + +static void bmov(const void *src, void *dest, size_t size) { (void) memmove(dest, src, size); } +static boolean_t +zfs_btree_is_core(struct zfs_btree_hdr *hdr) +{ + return (hdr->bth_first == -1); +} + #ifdef _ILP32 #define BTREE_POISON 0xabadb10c #else @@ -76,59 +88,74 @@ zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { #ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; - if (!hdr->bth_core) { - zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; - (void) memset(leaf->btl_elems + hdr->bth_count * size, 0x0f, - BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t) - - hdr->bth_count * size); - } else { + if (zfs_btree_is_core(hdr)) { zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; - for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { + for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; + i++) { node->btc_children[i] = (zfs_btree_hdr_t *)BTREE_POISON; } (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f, (BTREE_CORE_ELEMS - hdr->bth_count) * size); + } else { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + (void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size); + (void) memset(leaf->btl_elems + + (hdr->bth_first + hdr->bth_count) * size, 0x0f, + tree->bt_leaf_size - offsetof(zfs_btree_leaf_t, btl_elems) - + (hdr->bth_first + hdr->bth_count) * size); } #endif } static inline void zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, - uint64_t offset) + uint32_t idx, uint32_t count) { #ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; - ASSERT3U(offset, >=, hdr->bth_count); - if (!hdr->bth_core) { - zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; - (void) memset(leaf->btl_elems + offset * size, 0x0f, size); - } else { + if (zfs_btree_is_core(hdr)) { + ASSERT3U(idx, >=, hdr->bth_count); + ASSERT3U(idx, <=, BTREE_CORE_ELEMS); + ASSERT3U(idx + count, <=, BTREE_CORE_ELEMS); zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; - node->btc_children[offset + 1] = - (zfs_btree_hdr_t *)BTREE_POISON; - (void) memset(node->btc_elems + offset * size, 0x0f, size); + for (uint32_t i = 1; i <= count; i++) { + node->btc_children[idx + i] = + (zfs_btree_hdr_t *)BTREE_POISON; + } + (void) memset(node->btc_elems + idx * size, 0x0f, count * size); + } else { + ASSERT3U(idx, <=, tree->bt_leaf_cap); + ASSERT3U(idx + count, <=, tree->bt_leaf_cap); + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + (void) memset(leaf->btl_elems + + (hdr->bth_first + idx) * size, 0x0f, count * size); } #endif } static inline void zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, - uint64_t offset) + uint32_t idx) { #ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; - uint8_t eval = 0x0f; - if (hdr->bth_core) { + if (zfs_btree_is_core(hdr)) { + ASSERT3U(idx, <, BTREE_CORE_ELEMS); zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON; - VERIFY3P(node->btc_children[offset + 1], ==, cval); - for (int i = 0; i < size; i++) - VERIFY3U(node->btc_elems[offset * size + i], ==, eval); + VERIFY3P(node->btc_children[idx + 1], ==, cval); + for (size_t i = 0; i < size; i++) + VERIFY3U(node->btc_elems[idx * size + i], ==, 0x0f); } else { + ASSERT3U(idx, <, tree->bt_leaf_cap); zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; - for (int i = 0; i < size; i++) - VERIFY3U(leaf->btl_elems[offset * size + i], ==, eval); + if (idx >= tree->bt_leaf_cap - hdr->bth_first) + return; + for (size_t i = 0; i < size; i++) { + 
VERIFY3U(leaf->btl_elems[(hdr->bth_first + idx) + * size + i], ==, 0x0f); + } } #endif } @@ -137,8 +164,7 @@ void zfs_btree_init(void) { zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache", - BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, - NULL, 0); + BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); } void @@ -147,21 +173,52 @@ zfs_btree_fini(void) kmem_cache_destroy(zfs_btree_leaf_cache); } +static void * +zfs_btree_leaf_alloc(zfs_btree_t *tree) +{ + if (tree->bt_leaf_size == BTREE_LEAF_SIZE) + return (kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP)); + else + return (kmem_alloc(tree->bt_leaf_size, KM_SLEEP)); +} + +static void +zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr) +{ + if (tree->bt_leaf_size == BTREE_LEAF_SIZE) + return (kmem_cache_free(zfs_btree_leaf_cache, ptr)); + else + return (kmem_free(ptr, tree->bt_leaf_size)); +} + void zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), - size_t size) + bt_find_in_buf_f bt_find_in_buf, size_t size) { - /* - * We need a minimmum of 4 elements so that when we split a node we - * always have at least two elements in each node. This simplifies the - * logic in zfs_btree_bulk_finish, since it means the last leaf will - * always have a left sibling to share with (unless it's the root). - */ - ASSERT3U(size, <=, (BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / 4); + zfs_btree_create_custom(tree, compar, bt_find_in_buf, size, + BTREE_LEAF_SIZE); +} + +static void * +zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, + const void *value, zfs_btree_index_t *where); + +void +zfs_btree_create_custom(zfs_btree_t *tree, + int (*compar) (const void *, const void *), + bt_find_in_buf_f bt_find_in_buf, + size_t size, size_t lsize) +{ + size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems); - bzero(tree, sizeof (*tree)); + ASSERT3U(size, <=, esize / 2); + memset(tree, 0, sizeof (*tree)); tree->bt_compar = compar; + tree->bt_find_in_buf = (bt_find_in_buf == NULL) ? + zfs_btree_find_in_buf : bt_find_in_buf; tree->bt_elem_size = size; + tree->bt_leaf_size = lsize; + tree->bt_leaf_cap = P2ALIGN_TYPED(esize / size, 2, size_t); tree->bt_height = -1; tree->bt_bulk = NULL; } @@ -170,21 +227,20 @@ zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), * Find value in the array of elements provided. Uses a simple binary search. */ static void * -zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint64_t nelems, +zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, const void *value, zfs_btree_index_t *where) { - uint64_t max = nelems; - uint64_t min = 0; + uint32_t max = nelems; + uint32_t min = 0; while (max > min) { - uint64_t idx = (min + max) / 2; + uint32_t idx = (min + max) / 2; uint8_t *cur = buf + idx * tree->bt_elem_size; int comp = tree->bt_compar(cur, value); - if (comp == -1) { + if (comp < 0) { min = idx + 1; - } else if (comp == 1) { + } else if (comp > 0) { max = idx; } else { - ASSERT0(comp); where->bti_offset = idx; where->bti_before = B_FALSE; return (cur); @@ -219,12 +275,13 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) * bulk-insert mode are to insert new elements. 
*/ zfs_btree_index_t idx; + size_t size = tree->bt_elem_size; if (tree->bt_bulk != NULL) { zfs_btree_leaf_t *last_leaf = tree->bt_bulk; - int compar = tree->bt_compar(last_leaf->btl_elems + - ((last_leaf->btl_hdr.bth_count - 1) * tree->bt_elem_size), - value); - if (compar < 0) { + int comp = tree->bt_compar(last_leaf->btl_elems + + (last_leaf->btl_hdr.bth_first + + last_leaf->btl_hdr.bth_count - 1) * size, value); + if (comp < 0) { /* * If what they're looking for is after the last * element, it's not in the tree. @@ -236,7 +293,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) where->bti_before = B_TRUE; } return (NULL); - } else if (compar == 0) { + } else if (comp == 0) { if (where != NULL) { where->bti_node = (zfs_btree_hdr_t *)last_leaf; where->bti_offset = @@ -244,18 +301,20 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) where->bti_before = B_FALSE; } return (last_leaf->btl_elems + - ((last_leaf->btl_hdr.bth_count - 1) * - tree->bt_elem_size)); + (last_leaf->btl_hdr.bth_first + + last_leaf->btl_hdr.bth_count - 1) * size); } - if (tree->bt_compar(last_leaf->btl_elems, value) <= 0) { + if (tree->bt_compar(last_leaf->btl_elems + + last_leaf->btl_hdr.bth_first * size, value) <= 0) { /* * If what they're looking for is after the first * element in the last leaf, it's in the last leaf or * it's not in the tree. */ - void *d = zfs_btree_find_in_buf(tree, - last_leaf->btl_elems, last_leaf->btl_hdr.bth_count, - value, &idx); + void *d = tree->bt_find_in_buf(tree, + last_leaf->btl_elems + + last_leaf->btl_hdr.bth_first * size, + last_leaf->btl_hdr.bth_count, value, &idx); if (where != NULL) { idx.bti_node = (zfs_btree_hdr_t *)last_leaf; @@ -266,8 +325,8 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) } zfs_btree_core_t *node = NULL; - uint64_t child = 0; - uint64_t depth = 0; + uint32_t child = 0; + uint32_t depth = 0; /* * Iterate down the tree, finding which child the value should be in @@ -276,7 +335,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height; node = (zfs_btree_core_t *)node->btc_children[child], depth++) { ASSERT3P(node, !=, NULL); - void *d = zfs_btree_find_in_buf(tree, node->btc_elems, + void *d = tree->bt_find_in_buf(tree, node->btc_elems, node->btc_hdr.bth_count, value, &idx); EQUIV(d != NULL, !idx.bti_before); if (d != NULL) { @@ -296,7 +355,8 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) */ zfs_btree_leaf_t *leaf = (depth == 0 ? (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node); - void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems, + void *d = tree->bt_find_in_buf(tree, leaf->btl_elems + + leaf->btl_hdr.bth_first * size, leaf->btl_hdr.bth_count, value, &idx); if (where != NULL) { @@ -366,24 +426,23 @@ enum bt_shift_direction { * shift is determined by shape. The direction is determined by dir. */ static inline void -bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, - uint64_t count, uint64_t off, enum bt_shift_shape shape, +bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, + uint32_t count, uint32_t off, enum bt_shift_shape shape, enum bt_shift_direction dir) { size_t size = tree->bt_elem_size; - ASSERT(node->btc_hdr.bth_core); + ASSERT(zfs_btree_is_core(&node->btc_hdr)); uint8_t *e_start = node->btc_elems + idx * size; - int sign = (dir == BSD_LEFT ? 
-1 : +1); - uint8_t *e_out = e_start + sign * off * size; - uint64_t e_count = count; - bmov(e_start, e_out, e_count * size); + uint8_t *e_out = (dir == BSD_LEFT ? e_start - off * size : + e_start + off * size); + bmov(e_start, e_out, count * size); zfs_btree_hdr_t **c_start = node->btc_children + idx + (shape == BSS_TRAPEZOID ? 0 : 1); zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off : c_start + off); - uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); + uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); bmov(c_start, c_out, c_count * sizeof (*c_start)); } @@ -394,8 +453,8 @@ bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, * false if it is a parallelogram. */ static inline void -bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, - uint64_t count, enum bt_shift_shape shape) +bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, + uint32_t count, enum bt_shift_shape shape) { bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT); } @@ -405,8 +464,8 @@ bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, * Starts with elements[idx] and children[idx] and one more child than element. */ static inline void -bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, - uint64_t count, enum bt_shift_shape shape) +bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, + uint32_t count, enum bt_shift_shape shape) { bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT); } @@ -417,30 +476,78 @@ bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, * is determined by left. */ static inline void -bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint64_t idx, - uint64_t count, uint64_t off, enum bt_shift_direction dir) +bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint32_t idx, + uint32_t count, uint32_t off, enum bt_shift_direction dir) { size_t size = tree->bt_elem_size; - ASSERT(!node->btl_hdr.bth_core); + zfs_btree_hdr_t *hdr = &node->btl_hdr; + ASSERT(!zfs_btree_is_core(hdr)); - uint8_t *start = node->btl_elems + idx * size; - int sign = (dir == BSD_LEFT ? -1 : +1); - uint8_t *out = start + sign * off * size; + if (count == 0) + return; + uint8_t *start = node->btl_elems + (hdr->bth_first + idx) * size; + uint8_t *out = (dir == BSD_LEFT ? start - off * size : + start + off * size); bmov(start, out, count * size); } -static inline void -bt_shift_leaf_right(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx, - uint64_t count) +/* + * Grow leaf for n new elements before idx. + */ +static void +bt_grow_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx, + uint32_t n) { - bt_shift_leaf(tree, leaf, idx, count, 1, BSD_RIGHT); + zfs_btree_hdr_t *hdr = &leaf->btl_hdr; + ASSERT(!zfs_btree_is_core(hdr)); + ASSERT3U(idx, <=, hdr->bth_count); + uint32_t capacity = tree->bt_leaf_cap; + ASSERT3U(hdr->bth_count + n, <=, capacity); + boolean_t cl = (hdr->bth_first >= n); + boolean_t cr = (hdr->bth_first + hdr->bth_count + n <= capacity); + + if (cl && (!cr || idx <= hdr->bth_count / 2)) { + /* Grow left. */ + hdr->bth_first -= n; + bt_shift_leaf(tree, leaf, n, idx, n, BSD_LEFT); + } else if (cr) { + /* Grow right. */ + bt_shift_leaf(tree, leaf, idx, hdr->bth_count - idx, n, + BSD_RIGHT); + } else { + /* Grow both ways. 
*/ + uint32_t fn = hdr->bth_first - + (capacity - (hdr->bth_count + n)) / 2; + hdr->bth_first -= fn; + bt_shift_leaf(tree, leaf, fn, idx, fn, BSD_LEFT); + bt_shift_leaf(tree, leaf, fn + idx, hdr->bth_count - idx, + n - fn, BSD_RIGHT); + } + hdr->bth_count += n; } -static inline void -bt_shift_leaf_left(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx, - uint64_t count) +/* + * Shrink leaf for count elements starting from idx. + */ +static void +bt_shrink_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx, + uint32_t n) { - bt_shift_leaf(tree, leaf, idx, count, 1, BSD_LEFT); + zfs_btree_hdr_t *hdr = &leaf->btl_hdr; + ASSERT(!zfs_btree_is_core(hdr)); + ASSERT3U(idx, <=, hdr->bth_count); + ASSERT3U(idx + n, <=, hdr->bth_count); + + if (idx <= (hdr->bth_count - n) / 2) { + bt_shift_leaf(tree, leaf, 0, idx, n, BSD_RIGHT); + zfs_btree_poison_node_at(tree, hdr, 0, n); + hdr->bth_first += n; + } else { + bt_shift_leaf(tree, leaf, idx + n, hdr->bth_count - idx - n, n, + BSD_LEFT); + zfs_btree_poison_node_at(tree, hdr, hdr->bth_count - n, n); + } + hdr->bth_count -= n; } /* @@ -448,32 +555,33 @@ bt_shift_leaf_left(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx, * parameter behaves the same as it does in the shift logic. */ static inline void -bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint64_t sidx, - uint64_t count, zfs_btree_core_t *dest, uint64_t didx, +bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint32_t sidx, + uint32_t count, zfs_btree_core_t *dest, uint32_t didx, enum bt_shift_shape shape) { size_t size = tree->bt_elem_size; - ASSERT(source->btc_hdr.bth_core); - ASSERT(dest->btc_hdr.bth_core); + ASSERT(zfs_btree_is_core(&source->btc_hdr)); + ASSERT(zfs_btree_is_core(&dest->btc_hdr)); - bmov(source->btc_elems + sidx * size, dest->btc_elems + didx * size, + bcpy(source->btc_elems + sidx * size, dest->btc_elems + didx * size, count * size); - uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); - bmov(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1), + uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); + bcpy(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1), dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1), c_count * sizeof (*source->btc_children)); } static inline void -bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint64_t sidx, - uint64_t count, zfs_btree_leaf_t *dest, uint64_t didx) +bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint32_t sidx, + uint32_t count, zfs_btree_leaf_t *dest, uint32_t didx) { size_t size = tree->bt_elem_size; - ASSERT(!source->btl_hdr.bth_core); - ASSERT(!dest->btl_hdr.bth_core); + ASSERT(!zfs_btree_is_core(&source->btl_hdr)); + ASSERT(!zfs_btree_is_core(&dest->btl_hdr)); - bmov(source->btl_elems + sidx * size, dest->btl_elems + didx * size, + bcpy(source->btl_elems + (source->btl_hdr.bth_first + sidx) * size, + dest->btl_elems + (dest->btl_hdr.bth_first + didx) * size, count * size); } @@ -482,30 +590,31 @@ bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint64_t sidx, * put its location in where if non-null. 
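The bt_grow_leaf()/bt_shrink_leaf() pair above exists because leaves now behave like double-ended buffers: bth_first records where the live elements begin, so making or reclaiming room near either end only shifts the shorter side (or simply moves bth_first). Every leaf access therefore uses the same address arithmetic repeated throughout this file; an equivalent helper (the name is illustrative) would be:

static inline uint8_t *
leaf_elem(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t i)
{
	/* Element i lives bth_first slots past the start of btl_elems. */
	return (leaf->btl_elems +
	    (leaf->btl_hdr.bth_first + i) * tree->bt_elem_size);
}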
*/ static void * -zfs_btree_first_helper(zfs_btree_hdr_t *hdr, zfs_btree_index_t *where) +zfs_btree_first_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, + zfs_btree_index_t *where) { zfs_btree_hdr_t *node; - for (node = hdr; node->bth_core; node = - ((zfs_btree_core_t *)node)->btc_children[0]) + for (node = hdr; zfs_btree_is_core(node); + node = ((zfs_btree_core_t *)node)->btc_children[0]) ; - ASSERT(!node->bth_core); + ASSERT(!zfs_btree_is_core(node)); zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; if (where != NULL) { where->bti_node = node; where->bti_offset = 0; where->bti_before = B_FALSE; } - return (&leaf->btl_elems[0]); + return (&leaf->btl_elems[node->bth_first * tree->bt_elem_size]); } /* Insert an element and a child into a core node at the given offset. */ static void zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent, - uint64_t offset, zfs_btree_hdr_t *new_node, void *buf) + uint32_t offset, zfs_btree_hdr_t *new_node, void *buf) { - uint64_t size = tree->bt_elem_size; + size_t size = tree->bt_elem_size; zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; ASSERT3P(par_hdr, ==, new_node->bth_parent); ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS); @@ -515,13 +624,13 @@ zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent, par_hdr->bth_count); } /* Shift existing elements and children */ - uint64_t count = par_hdr->bth_count - offset; + uint32_t count = par_hdr->bth_count - offset; bt_shift_core_right(tree, parent, offset, count, BSS_PARALLELOGRAM); /* Insert new values */ parent->btc_children[offset + 1] = new_node; - bmov(buf, parent->btc_elems + offset * size, size); + bcpy(buf, parent->btc_elems + offset * size, size); par_hdr->bth_count++; } @@ -534,9 +643,8 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, zfs_btree_hdr_t *new_node, void *buf) { ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent); - uint64_t size = tree->bt_elem_size; + size_t size = tree->bt_elem_size; zfs_btree_core_t *parent = old_node->bth_parent; - zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; /* * If this is the root node we were splitting, we create a new root @@ -550,13 +658,13 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, size, KM_SLEEP); zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr; new_root_hdr->bth_parent = NULL; - new_root_hdr->bth_core = B_TRUE; + new_root_hdr->bth_first = -1; new_root_hdr->bth_count = 1; old_node->bth_parent = new_node->bth_parent = new_root; new_root->btc_children[0] = old_node; new_root->btc_children[1] = new_node; - bmov(buf, new_root->btc_elems, size); + bcpy(buf, new_root->btc_elems, size); tree->bt_height++; tree->bt_root = new_root_hdr; @@ -568,12 +676,13 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, * Since we have the new separator, binary search for where to put * new_node. 
*/ + zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; zfs_btree_index_t idx; - ASSERT(par_hdr->bth_core); - VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, + ASSERT(zfs_btree_is_core(par_hdr)); + VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems, par_hdr->bth_count, buf, &idx), ==, NULL); ASSERT(idx.bti_before); - uint64_t offset = idx.bti_offset; + uint32_t offset = idx.bti_offset; ASSERT3U(offset, <=, par_hdr->bth_count); ASSERT3P(parent->btc_children[offset], ==, old_node); @@ -604,16 +713,16 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, * We do this in two stages: first we split into two nodes, and then we * reuse our existing logic to insert the new element and child. */ - uint64_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ? + uint32_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ? 2 : 4)) - 1, 2); - uint64_t keep_count = BTREE_CORE_ELEMS - move_count - 1; + uint32_t keep_count = BTREE_CORE_ELEMS - move_count - 1; ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2); tree->bt_num_nodes++; zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * size, KM_SLEEP); zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr; new_par_hdr->bth_parent = par_hdr->bth_parent; - new_par_hdr->bth_core = B_TRUE; + new_par_hdr->bth_first = -1; new_par_hdr->bth_count = move_count; zfs_btree_poison_node(tree, new_par_hdr); @@ -624,7 +733,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, /* Store the new separator in a buffer. */ uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); - bmov(parent->btc_elems + keep_count * size, tmp_buf, + bcpy(parent->btc_elems + keep_count * size, tmp_buf, size); zfs_btree_poison_node(tree, par_hdr); @@ -636,7 +745,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, /* * Move the new separator to the existing buffer. */ - bmov(tmp_buf, buf, size); + bcpy(tmp_buf, buf, size); } else if (offset > keep_count) { /* Insert the new node into the right half */ new_node->bth_parent = new_parent; @@ -646,7 +755,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, /* * Move the new separator to the existing buffer. */ - bmov(tmp_buf, buf, size); + bcpy(tmp_buf, buf, size); } else { /* * Move the new separator into the right half, and replace it @@ -656,16 +765,16 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, bt_shift_core_right(tree, new_parent, 0, move_count, BSS_TRAPEZOID); new_parent->btc_children[0] = new_node; - bmov(tmp_buf, new_parent->btc_elems, size); + bcpy(tmp_buf, new_parent->btc_elems, size); new_par_hdr->bth_count++; } kmem_free(tmp_buf, size); zfs_btree_poison_node(tree, par_hdr); - for (int i = 0; i <= new_parent->btc_hdr.bth_count; i++) + for (uint32_t i = 0; i <= new_parent->btc_hdr.bth_count; i++) new_parent->btc_children[i]->bth_parent = new_parent; - for (int i = 0; i <= parent->btc_hdr.bth_count; i++) + for (uint32_t i = 0; i <= parent->btc_hdr.bth_count; i++) ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent); /* @@ -679,34 +788,32 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, /* Insert an element into a leaf node at the given offset. 
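The split sizing in the hunk above keeps bulk-loaded trees dense: a regular insert splits a full core node roughly in half, while a bulk-mode split leaves about three quarters of the elements in place, since bulk insertion only ever appends at the right edge. Taking BTREE_CORE_ELEMS as 128 purely for illustration (the real value is whatever btree.h defines):

	/* regular insert: move_count = 128/2 - 1 = 63, keep_count = 64 */
	/* bulk insert:    move_count = 128/4 - 1 = 31, keep_count = 96 */
	/*
	 * In both cases keep_count + move_count + 1 == BTREE_CORE_ELEMS;
	 * the leftover element becomes the separator pushed into the parent.
	 */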
*/ static void zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, - uint64_t idx, const void *value) + uint32_t idx, const void *value) { - uint64_t size = tree->bt_elem_size; - uint8_t *start = leaf->btl_elems + (idx * size); + size_t size = tree->bt_elem_size; zfs_btree_hdr_t *hdr = &leaf->btl_hdr; - uint64_t capacity __maybe_unused = P2ALIGN((BTREE_LEAF_SIZE - - sizeof (zfs_btree_hdr_t)) / size, 2); - uint64_t count = leaf->btl_hdr.bth_count - idx; - ASSERT3U(leaf->btl_hdr.bth_count, <, capacity); + ASSERT3U(leaf->btl_hdr.bth_count, <, tree->bt_leaf_cap); if (zfs_btree_verify_intensity >= 5) { zfs_btree_verify_poison_at(tree, &leaf->btl_hdr, leaf->btl_hdr.bth_count); } - bt_shift_leaf_right(tree, leaf, idx, count); - bmov(value, start, size); - hdr->bth_count++; + bt_grow_leaf(tree, leaf, idx, 1); + uint8_t *start = leaf->btl_elems + (hdr->bth_first + idx) * size; + bcpy(value, start, size); } +static void +zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr); + /* Helper function for inserting a new value into leaf at the given index. */ static void zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, - const void *value, uint64_t idx) + const void *value, uint32_t idx) { - uint64_t size = tree->bt_elem_size; - uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - - sizeof (zfs_btree_hdr_t)) / size, 2); + size_t size = tree->bt_elem_size; + uint32_t capacity = tree->bt_leaf_cap; /* * If the leaf isn't full, shift the elements after idx and insert @@ -731,32 +838,35 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, * In either case, we're left with one extra element. The leftover * element will become the new dividing element between the two nodes. */ - uint64_t move_count = MAX(capacity / (tree->bt_bulk == NULL ? 2 : 4) - - 1, 2); - uint64_t keep_count = capacity - move_count - 1; - ASSERT3U(capacity - move_count, >=, 2); + uint32_t move_count = MAX(capacity / (tree->bt_bulk ? 4 : 2), 1) - 1; + uint32_t keep_count = capacity - move_count - 1; + ASSERT3U(keep_count, >=, 1); + /* If we insert on left. move one more to keep leaves balanced. */ + if (idx < keep_count) { + keep_count--; + move_count++; + } tree->bt_num_nodes++; - zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache, - KM_SLEEP); + zfs_btree_leaf_t *new_leaf = zfs_btree_leaf_alloc(tree); zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr; new_hdr->bth_parent = leaf->btl_hdr.bth_parent; - new_hdr->bth_core = B_FALSE; + new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) + + (idx >= keep_count && idx <= keep_count + move_count / 2); new_hdr->bth_count = move_count; zfs_btree_poison_node(tree, new_hdr); - leaf->btl_hdr.bth_count = keep_count; - if (tree->bt_bulk != NULL && leaf == tree->bt_bulk) tree->bt_bulk = new_leaf; /* Copy the back part to the new leaf. */ - bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, - 0); + bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, 0); /* We store the new separator in a buffer we control for simplicity. */ uint8_t *buf = kmem_alloc(size, KM_SLEEP); - bmov(leaf->btl_elems + (keep_count * size), buf, size); - zfs_btree_poison_node(tree, &leaf->btl_hdr); + bcpy(leaf->btl_elems + (leaf->btl_hdr.bth_first + keep_count) * size, + buf, size); + + bt_shrink_leaf(tree, leaf, keep_count, 1 + move_count); if (idx < keep_count) { /* Insert into the existing leaf. 
*/ @@ -767,13 +877,11 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, 1, value); } else { /* - * Shift the elements in the new leaf to make room for the - * separator, and use the new value as the new separator. + * Insert planned separator into the new leaf, and use + * the new value as the new separator. */ - bt_shift_leaf_right(tree, new_leaf, 0, move_count); - bmov(buf, new_leaf->btl_elems, size); - bmov(value, buf, size); - new_hdr->bth_count++; + zfs_btree_insert_leaf_impl(tree, new_leaf, 0, buf); + bcpy(value, buf, size); } /* @@ -785,18 +893,19 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, kmem_free(buf, size); } -static uint64_t +static uint32_t zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { void *buf; - if (hdr->bth_core) { + if (zfs_btree_is_core(hdr)) { buf = ((zfs_btree_core_t *)hdr)->btc_elems; } else { - buf = ((zfs_btree_leaf_t *)hdr)->btl_elems; + buf = ((zfs_btree_leaf_t *)hdr)->btl_elems + + hdr->bth_first * tree->bt_elem_size; } zfs_btree_index_t idx; zfs_btree_core_t *parent = hdr->bth_parent; - VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, + VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems, parent->btc_hdr.bth_count, buf, &idx), ==, NULL); ASSERT(idx.bti_before); ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count); @@ -821,9 +930,8 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) zfs_btree_leaf_t *leaf = tree->bt_bulk; zfs_btree_hdr_t *hdr = &leaf->btl_hdr; zfs_btree_core_t *parent = hdr->bth_parent; - uint64_t size = tree->bt_elem_size; - uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - - sizeof (zfs_btree_hdr_t)) / size, 2); + size_t size = tree->bt_elem_size; + uint32_t capacity = tree->bt_leaf_cap; /* * The invariant doesn't apply to the root node, if that's the only @@ -848,56 +956,54 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) .bti_offset = 0 }; VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); - ASSERT(idx.bti_node->bth_core); + ASSERT(zfs_btree_is_core(idx.bti_node)); zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node; - uint64_t common_idx = idx.bti_offset; + uint32_t common_idx = idx.bti_offset; VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); - ASSERT(!idx.bti_node->bth_core); + ASSERT(!zfs_btree_is_core(idx.bti_node)); zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node; zfs_btree_hdr_t *l_hdr = idx.bti_node; - uint64_t move_count = (capacity / 2) - hdr->bth_count; + uint32_t move_count = (capacity / 2) - hdr->bth_count; ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=, capacity / 2); if (zfs_btree_verify_intensity >= 5) { - for (int i = 0; i < move_count; i++) { + for (uint32_t i = 0; i < move_count; i++) { zfs_btree_verify_poison_at(tree, hdr, leaf->btl_hdr.bth_count + i); } } /* First, shift elements in leaf back. */ - bt_shift_leaf(tree, leaf, 0, hdr->bth_count, move_count, - BSD_RIGHT); + bt_grow_leaf(tree, leaf, 0, move_count); /* Next, move the separator from the common ancestor to leaf. */ - uint8_t *separator = common->btc_elems + (common_idx * size); - uint8_t *out = leaf->btl_elems + ((move_count - 1) * size); - bmov(separator, out, size); - move_count--; + uint8_t *separator = common->btc_elems + common_idx * size; + uint8_t *out = leaf->btl_elems + + (hdr->bth_first + move_count - 1) * size; + bcpy(separator, out, size); /* * Now we move elements from the tail of the left neighbor to * fill the remaining spots in leaf. 
*/ bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count - - move_count, move_count, leaf, 0); + (move_count - 1), move_count - 1, leaf, 0); /* * Finally, move the new last element in the left neighbor to * the separator. */ - bmov(l_neighbor->btl_elems + (l_hdr->bth_count - - move_count - 1) * size, separator, size); + bcpy(l_neighbor->btl_elems + (l_hdr->bth_first + + l_hdr->bth_count - move_count) * size, separator, size); /* Adjust the node's counts, and we're done. */ - l_hdr->bth_count -= move_count + 1; - hdr->bth_count += move_count + 1; + bt_shrink_leaf(tree, l_neighbor, l_hdr->bth_count - move_count, + move_count); ASSERT3U(l_hdr->bth_count, >=, capacity / 2); ASSERT3U(hdr->bth_count, >=, capacity / 2); - zfs_btree_poison_node(tree, l_hdr); } /* @@ -921,16 +1027,16 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) * splitting is 2, we never need to worry about not having a * left sibling (a sibling is a neighbor with the same parent). */ - uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); + uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); ASSERT3U(parent_idx, >, 0); zfs_btree_core_t *l_neighbor = (zfs_btree_core_t *)parent->btc_children[parent_idx - 1]; - uint64_t move_count = (capacity / 2) - hdr->bth_count; + uint32_t move_count = (capacity / 2) - hdr->bth_count; ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=, capacity / 2); if (zfs_btree_verify_intensity >= 5) { - for (int i = 0; i < move_count; i++) { + for (uint32_t i = 0; i < move_count; i++) { zfs_btree_verify_poison_at(tree, hdr, hdr->bth_count + i); } @@ -943,14 +1049,14 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) uint8_t *separator = parent->btc_elems + ((parent_idx - 1) * size); uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size); - bmov(separator, e_out, size); + bcpy(separator, e_out, size); /* * Now, move elements and children from the left node to the * right. We move one more child than elements. */ move_count--; - uint64_t move_idx = l_neighbor->btc_hdr.bth_count - move_count; + uint32_t move_idx = l_neighbor->btc_hdr.bth_count - move_count; bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0, BSS_TRAPEZOID); @@ -959,7 +1065,7 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) * separator's position. */ move_idx--; - bmov(l_neighbor->btc_elems + move_idx * size, separator, size); + bcpy(l_neighbor->btc_elems + move_idx * size, separator, size); l_neighbor->btc_hdr.bth_count -= move_count + 1; hdr->bth_count += move_count + 1; @@ -969,11 +1075,12 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) zfs_btree_poison_node(tree, &l_neighbor->btc_hdr); - for (int i = 0; i <= hdr->bth_count; i++) + for (uint32_t i = 0; i <= hdr->bth_count; i++) cur->btc_children[i]->bth_parent = cur; } tree->bt_bulk = NULL; + zfs_btree_verify(tree); } /* @@ -1006,20 +1113,19 @@ zfs_btree_add_idx(zfs_btree_t *tree, const void *value, ASSERT0(where->bti_offset); tree->bt_num_nodes++; - zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache, - KM_SLEEP); + zfs_btree_leaf_t *leaf = zfs_btree_leaf_alloc(tree); tree->bt_root = &leaf->btl_hdr; tree->bt_height++; zfs_btree_hdr_t *hdr = &leaf->btl_hdr; hdr->bth_parent = NULL; - hdr->bth_core = B_FALSE; + hdr->bth_first = 0; hdr->bth_count = 0; zfs_btree_poison_node(tree, hdr); zfs_btree_insert_into_leaf(tree, leaf, value, 0); tree->bt_bulk = leaf; - } else if (!where->bti_node->bth_core) { + } else if (!zfs_btree_is_core(where->bti_node)) { /* * If we're inserting into a leaf, go directly to the helper * function. 
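A typical caller-side pattern for the insertion path above (a usage sketch, not code from this change) is to search once and reuse the miss position, since zfs_btree_add_idx() requires that the value not already be in the tree:

	/* bt and key stand in for an existing tree and element. */
	zfs_btree_index_t where;
	if (zfs_btree_find(&bt, &key, &where) == NULL)
		zfs_btree_add_idx(&bt, &key, &where);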
@@ -1035,28 +1141,28 @@ zfs_btree_add_idx(zfs_btree_t *tree, const void *value, * value in the node at that spot and then insert the old * separator into the first slot in the subtree to the right. */ - ASSERT(where->bti_node->bth_core); zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node; /* * We can ignore bti_before, because either way the value * should end up in bti_offset. */ - uint64_t off = where->bti_offset; + uint32_t off = where->bti_offset; zfs_btree_hdr_t *subtree = node->btc_children[off + 1]; size_t size = tree->bt_elem_size; uint8_t *buf = kmem_alloc(size, KM_SLEEP); - bmov(node->btc_elems + off * size, buf, size); - bmov(value, node->btc_elems + off * size, size); + bcpy(node->btc_elems + off * size, buf, size); + bcpy(value, node->btc_elems + off * size, size); /* * Find the first slot in the subtree to the right, insert * there. */ zfs_btree_index_t new_idx; - VERIFY3P(zfs_btree_first_helper(subtree, &new_idx), !=, NULL); + VERIFY3P(zfs_btree_first_helper(tree, subtree, &new_idx), !=, + NULL); ASSERT0(new_idx.bti_offset); - ASSERT(!new_idx.bti_node->bth_core); + ASSERT(!zfs_btree_is_core(new_idx.bti_node)); zfs_btree_insert_into_leaf(tree, (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0); kmem_free(buf, size); @@ -1075,7 +1181,7 @@ zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where) ASSERT0(tree->bt_num_elems); return (NULL); } - return (zfs_btree_first_helper(tree->bt_root, where)); + return (zfs_btree_first_helper(tree, tree->bt_root, where)); } /* @@ -1088,7 +1194,7 @@ zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr, { zfs_btree_hdr_t *node; - for (node = hdr; node->bth_core; node = + for (node = hdr; zfs_btree_is_core(node); node = ((zfs_btree_core_t *)node)->btc_children[node->bth_count]) ; @@ -1098,7 +1204,8 @@ zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr, where->bti_offset = node->bth_count - 1; where->bti_before = B_FALSE; } - return (leaf->btl_elems + (node->bth_count - 1) * btree->bt_elem_size); + return (leaf->btl_elems + (node->bth_first + node->bth_count - 1) * + btree->bt_elem_size); } /* @@ -1131,8 +1238,8 @@ zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx, return (NULL); } - uint64_t offset = idx->bti_offset; - if (!idx->bti_node->bth_core) { + uint32_t offset = idx->bti_offset; + if (!zfs_btree_is_core(idx->bti_node)) { /* * When finding the next element of an element in a leaf, * there are two cases. If the element isn't the last one in @@ -1143,20 +1250,21 @@ zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx, * separator after our ancestor in its parent. */ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; - uint64_t new_off = offset + (idx->bti_before ? 0 : 1); + uint32_t new_off = offset + (idx->bti_before ? 
0 : 1); if (leaf->btl_hdr.bth_count > new_off) { out_idx->bti_node = &leaf->btl_hdr; out_idx->bti_offset = new_off; out_idx->bti_before = B_FALSE; - return (leaf->btl_elems + new_off * tree->bt_elem_size); + return (leaf->btl_elems + (leaf->btl_hdr.bth_first + + new_off) * tree->bt_elem_size); } zfs_btree_hdr_t *prev = &leaf->btl_hdr; for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; node != NULL; node = node->btc_hdr.bth_parent) { zfs_btree_hdr_t *hdr = &node->btc_hdr; - ASSERT(hdr->bth_core); - uint64_t i = zfs_btree_find_parent_idx(tree, prev); + ASSERT(zfs_btree_is_core(hdr)); + uint32_t i = zfs_btree_find_parent_idx(tree, prev); if (done_func != NULL) done_func(tree, prev); if (i == hdr->bth_count) { @@ -1178,7 +1286,7 @@ zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx, } /* If we were before an element in a core node, return that element. */ - ASSERT(idx->bti_node->bth_core); + ASSERT(zfs_btree_is_core(idx->bti_node)); zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; if (idx->bti_before) { out_idx->bti_before = B_FALSE; @@ -1190,7 +1298,7 @@ zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx, * the subtree just to the right of the separator. */ zfs_btree_hdr_t *child = node->btc_children[offset + 1]; - return (zfs_btree_first_helper(child, out_idx)); + return (zfs_btree_first_helper(tree, child, out_idx)); } /* @@ -1217,8 +1325,8 @@ zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx, return (NULL); } - uint64_t offset = idx->bti_offset; - if (!idx->bti_node->bth_core) { + uint32_t offset = idx->bti_offset; + if (!zfs_btree_is_core(idx->bti_node)) { /* * When finding the previous element of an element in a leaf, * there are two cases. If the element isn't the first one in @@ -1233,15 +1341,15 @@ zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx, out_idx->bti_node = &leaf->btl_hdr; out_idx->bti_offset = offset - 1; out_idx->bti_before = B_FALSE; - return (leaf->btl_elems + (offset - 1) * - tree->bt_elem_size); + return (leaf->btl_elems + (leaf->btl_hdr.bth_first + + offset - 1) * tree->bt_elem_size); } zfs_btree_hdr_t *prev = &leaf->btl_hdr; for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; node != NULL; node = node->btc_hdr.bth_parent) { zfs_btree_hdr_t *hdr = &node->btc_hdr; - ASSERT(hdr->bth_core); - uint64_t i = zfs_btree_find_parent_idx(tree, prev); + ASSERT(zfs_btree_is_core(hdr)); + uint32_t i = zfs_btree_find_parent_idx(tree, prev); if (i == 0) { prev = hdr; continue; @@ -1262,7 +1370,7 @@ zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx, * The previous element from one in a core node is the last element in * the subtree just to the left of the separator. 
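These prev/next helpers back the public iteration interface; an ascending walk (sketch only, assuming the usual zfs_btree_next() wrapper and with process() as a stand-in) passes the same index as both the input and output cursor:

	zfs_btree_index_t idx;
	for (void *e = zfs_btree_first(&bt, &idx); e != NULL;
	    e = zfs_btree_next(&bt, &idx, &idx))
		process(e);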
*/ - ASSERT(idx->bti_node->bth_core); + ASSERT(zfs_btree_is_core(idx->bti_node)); zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; zfs_btree_hdr_t *child = node->btc_children[offset]; return (zfs_btree_last_helper(tree, child, out_idx)); @@ -1279,13 +1387,14 @@ void * zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx) { ASSERT(!idx->bti_before); - if (!idx->bti_node->bth_core) { + size_t size = tree->bt_elem_size; + if (!zfs_btree_is_core(idx->bti_node)) { zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; - return (leaf->btl_elems + idx->bti_offset * tree->bt_elem_size); + return (leaf->btl_elems + (leaf->btl_hdr.bth_first + + idx->bti_offset) * size); } - ASSERT(idx->bti_node->bth_core); zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; - return (node->btc_elems + idx->bti_offset * tree->bt_elem_size); + return (node->btc_elems + idx->bti_offset * size); } /* Add the given value to the tree. Must not already be in the tree. */ @@ -1302,8 +1411,8 @@ static void zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node) { tree->bt_num_nodes--; - if (!node->bth_core) { - kmem_cache_free(zfs_btree_leaf_cache, node); + if (!zfs_btree_is_core(node)) { + zfs_btree_leaf_free(tree, node); } else { kmem_free(node, sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * tree->bt_elem_size); @@ -1320,7 +1429,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, zfs_btree_hdr_t *rm_hdr) { size_t size = tree->bt_elem_size; - uint64_t min_count = (BTREE_CORE_ELEMS / 2) - 1; + uint32_t min_count = (BTREE_CORE_ELEMS / 2) - 1; zfs_btree_hdr_t *hdr = &node->btc_hdr; /* * If the node is the root node and rm_hdr is one of two children, @@ -1337,7 +1446,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, return; } - uint64_t idx; + uint32_t idx; for (idx = 0; idx <= hdr->bth_count; idx++) { if (node->btc_children[idx] == rm_hdr) break; @@ -1357,7 +1466,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, BSS_PARALLELOGRAM); hdr->bth_count--; - zfs_btree_poison_node_at(tree, hdr, hdr->bth_count); + zfs_btree_poison_node_at(tree, hdr, hdr->bth_count, 1); return; } @@ -1378,13 +1487,13 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, * implementing in the future for completeness' sake. */ zfs_btree_core_t *parent = hdr->bth_parent; - uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); + uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : parent->btc_children[parent_idx - 1]); if (l_hdr != NULL && l_hdr->bth_count > min_count) { /* We can take a node from the left neighbor. */ - ASSERT(l_hdr->bth_core); + ASSERT(zfs_btree_is_core(l_hdr)); zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr; /* @@ -1399,20 +1508,19 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, */ uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; - bmov(separator, node->btc_elems, size); + bcpy(separator, node->btc_elems, size); /* Move the last child of neighbor to our first child slot. */ - zfs_btree_hdr_t **take_child = neighbor->btc_children + - l_hdr->bth_count; - bmov(take_child, node->btc_children, sizeof (*take_child)); + node->btc_children[0] = + neighbor->btc_children[l_hdr->bth_count]; node->btc_children[0]->bth_parent = node; /* Move the last element of neighbor to the separator spot. 
*/ uint8_t *take_elem = neighbor->btc_elems + (l_hdr->bth_count - 1) * size; - bmov(take_elem, separator, size); + bcpy(take_elem, separator, size); l_hdr->bth_count--; - zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); + zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count, 1); return; } @@ -1420,7 +1528,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, NULL : parent->btc_children[parent_idx + 1]); if (r_hdr != NULL && r_hdr->bth_count > min_count) { /* We can take a node from the right neighbor. */ - ASSERT(r_hdr->bth_core); + ASSERT(zfs_btree_is_core(r_hdr)); zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr; /* @@ -1435,21 +1543,19 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, * element spot in node. */ uint8_t *separator = parent->btc_elems + parent_idx * size; - bmov(separator, node->btc_elems + (hdr->bth_count - 1) * size, + bcpy(separator, node->btc_elems + (hdr->bth_count - 1) * size, size); /* * Move the first child of neighbor to the last child spot in * node. */ - zfs_btree_hdr_t **take_child = neighbor->btc_children; - bmov(take_child, node->btc_children + hdr->bth_count, - sizeof (*take_child)); + node->btc_children[hdr->bth_count] = neighbor->btc_children[0]; node->btc_children[hdr->bth_count]->bth_parent = node; /* Move the first element of neighbor to the separator spot. */ uint8_t *take_elem = neighbor->btc_elems; - bmov(take_elem, separator, size); + bcpy(take_elem, separator, size); r_hdr->bth_count--; /* @@ -1458,7 +1564,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, */ bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count, BSS_TRAPEZOID); - zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); + zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count, 1); return; } @@ -1473,7 +1579,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, * merging. */ zfs_btree_hdr_t *new_rm_hdr, *keep_hdr; - uint64_t new_idx = idx; + uint32_t new_idx = idx; if (l_hdr != NULL) { keep_hdr = l_hdr; new_rm_hdr = hdr; @@ -1485,14 +1591,14 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, parent_idx++; } - ASSERT(keep_hdr->bth_core); - ASSERT(new_rm_hdr->bth_core); + ASSERT(zfs_btree_is_core(keep_hdr)); + ASSERT(zfs_btree_is_core(new_rm_hdr)); zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr; zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr; if (zfs_btree_verify_intensity >= 5) { - for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) { + for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) { zfs_btree_verify_poison_at(tree, keep_hdr, keep_hdr->bth_count + i); } @@ -1502,14 +1608,14 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size; uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; - bmov(separator, e_out, size); + bcpy(separator, e_out, size); keep_hdr->bth_count++; /* Move all our elements and children into the left node. */ bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep, keep_hdr->bth_count, BSS_TRAPEZOID); - uint64_t old_count = keep_hdr->bth_count; + uint32_t old_count = keep_hdr->bth_count; /* Update bookkeeping */ keep_hdr->bth_count += new_rm_hdr->bth_count; @@ -1527,17 +1633,17 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, /* Reparent all our children to point to the left node. 
*/ zfs_btree_hdr_t **new_start = keep->btc_children + old_count - 1; - for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) + for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) new_start[i]->bth_parent = keep; - for (int i = 0; i <= keep_hdr->bth_count; i++) { + for (uint32_t i = 0; i <= keep_hdr->bth_count; i++) { ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep); ASSERT3P(keep->btc_children[i], !=, rm_hdr); } - zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count); + zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1); new_rm_hdr->bth_count = 0; - zfs_btree_node_destroy(tree, new_rm_hdr); zfs_btree_remove_from_node(tree, parent, new_rm_hdr); + zfs_btree_node_destroy(tree, new_rm_hdr); } /* Remove the element at the specific location. */ @@ -1546,9 +1652,7 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) { size_t size = tree->bt_elem_size; zfs_btree_hdr_t *hdr = where->bti_node; - uint64_t idx = where->bti_offset; - uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - - sizeof (zfs_btree_hdr_t)) / size, 2); + uint32_t idx = where->bti_offset; ASSERT(!where->bti_before); if (tree->bt_bulk != NULL) { @@ -1560,7 +1664,7 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) */ uint8_t *value = zfs_btree_get(tree, where); uint8_t *tmp = kmem_alloc(size, KM_SLEEP); - bmov(value, tmp, size); + bcpy(value, tmp, size); zfs_btree_bulk_finish(tree); VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL); kmem_free(tmp, size); @@ -1575,14 +1679,14 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) * makes the rebalance logic not need to be recursive both upwards and * downwards. */ - if (hdr->bth_core) { + if (zfs_btree_is_core(hdr)) { zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; zfs_btree_hdr_t *left_subtree = node->btc_children[idx]; void *new_value = zfs_btree_last_helper(tree, left_subtree, where); ASSERT3P(new_value, !=, NULL); - bmov(new_value, node->btc_elems + idx * size, size); + bcpy(new_value, node->btc_elems + idx * size, size); hdr = where->bti_node; idx = where->bti_offset; @@ -1594,19 +1698,18 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) * elements after the idx to the left. After that, we rebalance if * needed. */ - ASSERT(!hdr->bth_core); + ASSERT(!zfs_btree_is_core(hdr)); zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; ASSERT3U(hdr->bth_count, >, 0); - uint64_t min_count = (capacity / 2) - 1; + uint32_t min_count = (tree->bt_leaf_cap / 2) - 1; /* * If we're over the minimum size or this is the root, just overwrite * the value and return. */ if (hdr->bth_count > min_count || hdr->bth_parent == NULL) { - hdr->bth_count--; - bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx); + bt_shrink_leaf(tree, leaf, idx, 1); if (hdr->bth_parent == NULL) { ASSERT0(tree->bt_height); if (hdr->bth_count == 0) { @@ -1615,8 +1718,6 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) zfs_btree_node_destroy(tree, &leaf->btl_hdr); } } - if (tree->bt_root != NULL) - zfs_btree_poison_node_at(tree, hdr, hdr->bth_count); zfs_btree_verify(tree); return; } @@ -1636,33 +1737,33 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) * worth implementing in the future for completeness' sake. */ zfs_btree_core_t *parent = hdr->bth_parent; - uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); + uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? 
NULL : parent->btc_children[parent_idx - 1]); if (l_hdr != NULL && l_hdr->bth_count > min_count) { /* We can take a node from the left neighbor. */ - ASSERT(!l_hdr->bth_core); + ASSERT(!zfs_btree_is_core(l_hdr)); + zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)l_hdr; /* * Move our elements back by one spot to make room for the * stolen element and overwrite the element being removed. */ - bt_shift_leaf_right(tree, leaf, 0, idx); + bt_shift_leaf(tree, leaf, 0, idx, 1, BSD_RIGHT); + + /* Move the separator to our first spot. */ uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; - uint8_t *take_elem = ((zfs_btree_leaf_t *)l_hdr)->btl_elems + - (l_hdr->bth_count - 1) * size; - /* Move the separator to our first spot. */ - bmov(separator, leaf->btl_elems, size); + bcpy(separator, leaf->btl_elems + hdr->bth_first * size, size); /* Move our neighbor's last element to the separator. */ - bmov(take_elem, separator, size); - - /* Update the bookkeeping. */ - l_hdr->bth_count--; - zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); + uint8_t *take_elem = neighbor->btl_elems + + (l_hdr->bth_first + l_hdr->bth_count - 1) * size; + bcpy(take_elem, separator, size); + /* Delete our neighbor's last element. */ + bt_shrink_leaf(tree, neighbor, l_hdr->bth_count - 1, 1); zfs_btree_verify(tree); return; } @@ -1671,7 +1772,7 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) NULL : parent->btc_children[parent_idx + 1]); if (r_hdr != NULL && r_hdr->bth_count > min_count) { /* We can take a node from the right neighbor. */ - ASSERT(!r_hdr->bth_core); + ASSERT(!zfs_btree_is_core(r_hdr)); zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr; /* @@ -1679,96 +1780,81 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) * by one spot to make room for the stolen element and * overwrite the element being removed. */ - bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx - - 1); + bt_shift_leaf(tree, leaf, idx + 1, hdr->bth_count - idx - 1, + 1, BSD_LEFT); - uint8_t *separator = parent->btc_elems + parent_idx * size; - uint8_t *take_elem = ((zfs_btree_leaf_t *)r_hdr)->btl_elems; /* Move the separator between us to our last spot. */ - bmov(separator, leaf->btl_elems + (hdr->bth_count - 1) * size, - size); + uint8_t *separator = parent->btc_elems + parent_idx * size; + bcpy(separator, leaf->btl_elems + (hdr->bth_first + + hdr->bth_count - 1) * size, size); /* Move our neighbor's first element to the separator. */ - bmov(take_elem, separator, size); + uint8_t *take_elem = neighbor->btl_elems + + r_hdr->bth_first * size; + bcpy(take_elem, separator, size); - /* Update the bookkeeping. */ - r_hdr->bth_count--; - - /* - * Move our neighbors elements forwards to overwrite the - * stolen element. - */ - bt_shift_leaf_left(tree, neighbor, 1, r_hdr->bth_count); - zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); + /* Delete our neighbor's first element. */ + bt_shrink_leaf(tree, neighbor, 0, 1); zfs_btree_verify(tree); return; } /* * In this case, neither of our neighbors can spare an element, so we - * need to merge with one of them. We prefer the left one, - * arbitrarily. Move the separator into the leftmost merging node + * need to merge with one of them. We prefer the left one, arbitrarily. + * After remove we move the separator into the leftmost merging node * (which may be us or the left neighbor), and then move the right * merging node's elements. Once that's done, we go back and delete * the element we're removing. 
Finally, go into the parent and delete * the right merging node and the separator. This may cause further * merging. */ - zfs_btree_hdr_t *rm_hdr, *keep_hdr; - uint64_t new_idx = idx; + zfs_btree_hdr_t *rm_hdr, *k_hdr; if (l_hdr != NULL) { - keep_hdr = l_hdr; + k_hdr = l_hdr; rm_hdr = hdr; - new_idx += keep_hdr->bth_count + 1; // 449 } else { ASSERT3P(r_hdr, !=, NULL); - keep_hdr = hdr; + k_hdr = hdr; rm_hdr = r_hdr; parent_idx++; } - - ASSERT(!keep_hdr->bth_core); - ASSERT(!rm_hdr->bth_core); - ASSERT3U(keep_hdr->bth_count, ==, min_count); + ASSERT(!zfs_btree_is_core(k_hdr)); + ASSERT(!zfs_btree_is_core(rm_hdr)); + ASSERT3U(k_hdr->bth_count, ==, min_count); ASSERT3U(rm_hdr->bth_count, ==, min_count); - - zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)keep_hdr; + zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)k_hdr; zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr; if (zfs_btree_verify_intensity >= 5) { - for (int i = 0; i < rm_hdr->bth_count + 1; i++) { - zfs_btree_verify_poison_at(tree, keep_hdr, - keep_hdr->bth_count + i); + for (uint32_t i = 0; i < rm_hdr->bth_count + 1; i++) { + zfs_btree_verify_poison_at(tree, k_hdr, + k_hdr->bth_count + i); } } + /* - * Move the separator into the first open spot in the left - * neighbor. + * Remove the value from the node. It will go below the minimum, + * but we'll fix it in no time. */ - uint8_t *out = keep->btl_elems + keep_hdr->bth_count * size; - uint8_t *separator = parent->btc_elems + (parent_idx - 1) * - size; - bmov(separator, out, size); - keep_hdr->bth_count++; + bt_shrink_leaf(tree, leaf, idx, 1); - /* Move our elements to the left neighbor. */ - bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, - keep_hdr->bth_count); + /* Prepare space for elements to be moved from the right. */ + uint32_t k_count = k_hdr->bth_count; + bt_grow_leaf(tree, keep, k_count, 1 + rm_hdr->bth_count); + ASSERT3U(k_hdr->bth_count, ==, min_count * 2); - /* Update the bookkeeping. */ - keep_hdr->bth_count += rm_hdr->bth_count; - ASSERT3U(keep_hdr->bth_count, ==, min_count * 2 + 1); + /* Move the separator into the first open spot. */ + uint8_t *out = keep->btl_elems + (k_hdr->bth_first + k_count) * size; + uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; + bcpy(separator, out, size); - /* Remove the value from the node */ - keep_hdr->bth_count--; - bt_shift_leaf_left(tree, keep, new_idx + 1, keep_hdr->bth_count - - new_idx); - zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count); + /* Move our elements to the left neighbor. */ + bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1); - rm_hdr->bth_count = 0; - zfs_btree_node_destroy(tree, rm_hdr); /* Remove the emptied node from the parent. 
*/ zfs_btree_remove_from_node(tree, parent, rm_hdr); + zfs_btree_node_destroy(tree, rm_hdr); zfs_btree_verify(tree); } @@ -1831,11 +1917,10 @@ zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie) static void zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { - if (hdr->bth_core) { + if (zfs_btree_is_core(hdr)) { zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr; - for (int i = 0; i <= hdr->bth_count; i++) { + for (uint32_t i = 0; i <= hdr->bth_count; i++) zfs_btree_clear_helper(tree, btc->btc_children[i]); - } } zfs_btree_node_destroy(tree, hdr); @@ -1868,11 +1953,11 @@ zfs_btree_destroy(zfs_btree_t *tree) static void zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { - if (!hdr->bth_core) + if (!zfs_btree_is_core(hdr)) return; zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; - for (int i = 0; i <= hdr->bth_count; i++) { + for (uint32_t i = 0; i <= hdr->bth_count; i++) { VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr); zfs_btree_verify_pointers_helper(tree, node->btc_children[i]); } @@ -1897,11 +1982,10 @@ zfs_btree_verify_pointers(zfs_btree_t *tree) static uint64_t zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { - if (!hdr->bth_core) { - if (tree->bt_root != hdr && hdr != &tree->bt_bulk->btl_hdr) { - uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - - sizeof (zfs_btree_hdr_t)) / tree->bt_elem_size, 2); - VERIFY3U(hdr->bth_count, >=, (capacity / 2) - 1); + if (!zfs_btree_is_core(hdr)) { + if (tree->bt_root != hdr && tree->bt_bulk && + hdr != &tree->bt_bulk->btl_hdr) { + VERIFY3U(hdr->bth_count, >=, tree->bt_leaf_cap / 2 - 1); } return (hdr->bth_count); @@ -1911,7 +1995,7 @@ zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) uint64_t ret = hdr->bth_count; if (tree->bt_root != hdr && tree->bt_bulk == NULL) VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1); - for (int i = 0; i <= hdr->bth_count; i++) { + for (uint32_t i = 0; i <= hdr->bth_count; i++) { ret += zfs_btree_verify_counts_helper(tree, node->btc_children[i]); } @@ -1941,17 +2025,16 @@ zfs_btree_verify_counts(zfs_btree_t *tree) */ static uint64_t zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, - int64_t height) + int32_t height) { - if (!hdr->bth_core) { + if (!zfs_btree_is_core(hdr)) { VERIFY0(height); return (1); } - VERIFY(hdr->bth_core); zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; uint64_t ret = 1; - for (int i = 0; i <= hdr->bth_count; i++) { + for (uint32_t i = 0; i <= hdr->bth_count; i++) { ret += zfs_btree_verify_height_helper(tree, node->btc_children[i], height - 1); } @@ -1983,24 +2066,26 @@ static void zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { size_t size = tree->bt_elem_size; - if (!hdr->bth_core) { + if (!zfs_btree_is_core(hdr)) { zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; - for (int i = 1; i < hdr->bth_count; i++) { - VERIFY3S(tree->bt_compar(leaf->btl_elems + (i - 1) * - size, leaf->btl_elems + i * size), ==, -1); + for (uint32_t i = 1; i < hdr->bth_count; i++) { + VERIFY3S(tree->bt_compar(leaf->btl_elems + + (hdr->bth_first + i - 1) * size, + leaf->btl_elems + + (hdr->bth_first + i) * size), ==, -1); } return; } zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; - for (int i = 1; i < hdr->bth_count; i++) { + for (uint32_t i = 1; i < hdr->bth_count; i++) { VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size, node->btc_elems + i * size), ==, -1); } - for (int i = 0; i < hdr->bth_count; i++) { + for (uint32_t i = 0; i < hdr->bth_count; i++) 
{ uint8_t *left_child_last = NULL; zfs_btree_hdr_t *left_child_hdr = node->btc_children[i]; - if (left_child_hdr->bth_core) { + if (zfs_btree_is_core(left_child_hdr)) { zfs_btree_core_t *left_child = (zfs_btree_core_t *)left_child_hdr; left_child_last = left_child->btc_elems + @@ -2009,40 +2094,39 @@ zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) zfs_btree_leaf_t *left_child = (zfs_btree_leaf_t *)left_child_hdr; left_child_last = left_child->btl_elems + - (left_child_hdr->bth_count - 1) * size; + (left_child_hdr->bth_first + + left_child_hdr->bth_count - 1) * size; } - if (tree->bt_compar(node->btc_elems + i * size, - left_child_last) != 1) { + int comp = tree->bt_compar(node->btc_elems + i * size, + left_child_last); + if (comp <= 0) { panic("btree: compar returned %d (expected 1) at " - "%px %d: compar(%px, %px)", tree->bt_compar( - node->btc_elems + i * size, left_child_last), - (void *)node, i, (void *)(node->btc_elems + i * - size), (void *)left_child_last); + "%px %d: compar(%px, %px)", comp, node, i, + node->btc_elems + i * size, left_child_last); } uint8_t *right_child_first = NULL; zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1]; - if (right_child_hdr->bth_core) { + if (zfs_btree_is_core(right_child_hdr)) { zfs_btree_core_t *right_child = (zfs_btree_core_t *)right_child_hdr; right_child_first = right_child->btc_elems; } else { zfs_btree_leaf_t *right_child = (zfs_btree_leaf_t *)right_child_hdr; - right_child_first = right_child->btl_elems; + right_child_first = right_child->btl_elems + + right_child_hdr->bth_first * size; } - if (tree->bt_compar(node->btc_elems + i * size, - right_child_first) != -1) { + comp = tree->bt_compar(node->btc_elems + i * size, + right_child_first); + if (comp >= 0) { panic("btree: compar returned %d (expected -1) at " - "%px %d: compar(%px, %px)", tree->bt_compar( - node->btc_elems + i * size, right_child_first), - (void *)node, i, (void *)(node->btc_elems + i * - size), (void *)right_child_first); + "%px %d: compar(%px, %px)", comp, node, i, + node->btc_elems + i * size, right_child_first); } } - for (int i = 0; i <= hdr->bth_count; i++) { + for (uint32_t i = 0; i <= hdr->bth_count; i++) zfs_btree_verify_order_helper(tree, node->btc_children[i]); - } } /* Check that all elements in the tree are in sorted order. 
*/ @@ -2063,27 +2147,28 @@ static void zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { size_t size = tree->bt_elem_size; - if (!hdr->bth_core) { + if (!zfs_btree_is_core(hdr)) { zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; - uint8_t val = 0x0f; - for (int i = hdr->bth_count * size; i < BTREE_LEAF_SIZE - - sizeof (zfs_btree_hdr_t); i++) { - VERIFY3U(leaf->btl_elems[i], ==, val); - } + for (size_t i = 0; i < hdr->bth_first * size; i++) + VERIFY3U(leaf->btl_elems[i], ==, 0x0f); + size_t esize = tree->bt_leaf_size - + offsetof(zfs_btree_leaf_t, btl_elems); + for (size_t i = (hdr->bth_first + hdr->bth_count) * size; + i < esize; i++) + VERIFY3U(leaf->btl_elems[i], ==, 0x0f); } else { zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; - uint8_t val = 0x0f; - for (int i = hdr->bth_count * size; i < BTREE_CORE_ELEMS * size; - i++) { - VERIFY3U(node->btc_elems[i], ==, val); - } + for (size_t i = hdr->bth_count * size; + i < BTREE_CORE_ELEMS * size; i++) + VERIFY3U(node->btc_elems[i], ==, 0x0f); - for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { + for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; + i++) { VERIFY3P(node->btc_children[i], ==, (zfs_btree_hdr_t *)BTREE_POISON); } - for (int i = 0; i <= hdr->bth_count; i++) { + for (uint32_t i = 0; i <= hdr->bth_count; i++) { zfs_btree_verify_poison_helper(tree, node->btc_children[i]); } @@ -2122,3 +2207,9 @@ zfs_btree_verify(zfs_btree_t *tree) return; zfs_btree_verify_poison(tree); } + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW, + "Enable btree verification. Levels above 4 require ZFS be built " + "with debugging"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dataset_kstats.c b/sys/contrib/openzfs/module/zfs/dataset_kstats.c index 3fbb24ddef5e..2ac058fd2c93 100644 --- a/sys/contrib/openzfs/module/zfs/dataset_kstats.c +++ b/sys/contrib/openzfs/module/zfs/dataset_kstats.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -37,18 +37,37 @@ static dataset_kstat_values_t empty_dataset_kstats = { { "nread", KSTAT_DATA_UINT64 }, { "nunlinks", KSTAT_DATA_UINT64 }, { "nunlinked", KSTAT_DATA_UINT64 }, + { + { "zil_commit_count", KSTAT_DATA_UINT64 }, + { "zil_commit_writer_count", KSTAT_DATA_UINT64 }, + { "zil_itx_count", KSTAT_DATA_UINT64 }, + { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, + { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_copied_count", KSTAT_DATA_UINT64 }, + { "zil_itx_copied_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_needcopy_count", KSTAT_DATA_UINT64 }, + { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 } + } }; static int dataset_kstats_update(kstat_t *ksp, int rw) { dataset_kstats_t *dk = ksp->ks_private; - ASSERT3P(dk->dk_kstats->ks_data, ==, ksp->ks_data); + dataset_kstat_values_t *dkv = ksp->ks_data; + ASSERT3P(dk->dk_kstats->ks_data, ==, dkv); if (rw == KSTAT_WRITE) return (EACCES); - dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; dkv->dkv_writes.value.ui64 = wmsum_value(&dk->dk_sums.dss_writes); dkv->dkv_nwritten.value.ui64 = @@ -62,10 +81,12 @@ dataset_kstats_update(kstat_t *ksp, int rw) dkv->dkv_nunlinked.value.ui64 = wmsum_value(&dk->dk_sums.dss_nunlinked); + zil_kstat_values_update(&dkv->dkv_zil_stats, &dk->dk_zil_sums); + return (0); } -void +int dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset) { /* @@ -75,7 +96,7 @@ dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset) * a filesystem with many snapshots, we skip them for now. 
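dataset_kstats_create() now returns an int instead of void so that failures (seen in the following hunks: a bad snprintf() result, an over-long kstat name, or kstat_create() returning NULL) can propagate to the caller. A hedged sketch of the checking this enables at call sites (the embedding structure is assumed, not shown in this hunk):

	dataset_kstats_t dk;	/* normally embedded in zvol/zfsvfs state */
	int error = dataset_kstats_create(&dk, os);
	if (error != 0)
		return (error);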
*/ if (dmu_objset_is_snapshot(objset)) - return; + return (0); /* * At the time of this writing, KSTAT_STRLEN is 255 in Linux, @@ -94,13 +115,13 @@ dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset) zfs_dbgmsg("failed to create dataset kstat for objset %lld: " " snprintf() for kstat module name returned %d", (unsigned long long)dmu_objset_id(objset), n); - return; + return (SET_ERROR(EINVAL)); } else if (n >= KSTAT_STRLEN) { zfs_dbgmsg("failed to create dataset kstat for objset %lld: " "kstat module name length (%d) exceeds limit (%d)", (unsigned long long)dmu_objset_id(objset), n, KSTAT_STRLEN); - return; + return (SET_ERROR(ENAMETOOLONG)); } char kstat_name[KSTAT_STRLEN]; @@ -110,20 +131,25 @@ dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset) zfs_dbgmsg("failed to create dataset kstat for objset %lld: " " snprintf() for kstat name returned %d", (unsigned long long)dmu_objset_id(objset), n); - return; + return (SET_ERROR(EINVAL)); + } else if (n >= KSTAT_STRLEN) { + zfs_dbgmsg("failed to create dataset kstat for objset %lld: " + "kstat name length (%d) exceeds limit (%d)", + (unsigned long long)dmu_objset_id(objset), + n, KSTAT_STRLEN); + return (SET_ERROR(ENAMETOOLONG)); } - ASSERT3U(n, <, KSTAT_STRLEN); kstat_t *kstat = kstat_create(kstat_module_name, 0, kstat_name, "dataset", KSTAT_TYPE_NAMED, sizeof (empty_dataset_kstats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (kstat == NULL) - return; + return (SET_ERROR(ENOMEM)); dataset_kstat_values_t *dk_kstats = kmem_alloc(sizeof (empty_dataset_kstats), KM_SLEEP); - bcopy(&empty_dataset_kstats, dk_kstats, + memcpy(dk_kstats, &empty_dataset_kstats, sizeof (empty_dataset_kstats)); char *ds_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); @@ -137,15 +163,17 @@ dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset) kstat->ks_private = dk; kstat->ks_data_size += ZFS_MAX_DATASET_NAME_LEN; - kstat_install(kstat); - dk->dk_kstats = kstat; - wmsum_init(&dk->dk_sums.dss_writes, 0); wmsum_init(&dk->dk_sums.dss_nwritten, 0); wmsum_init(&dk->dk_sums.dss_reads, 0); wmsum_init(&dk->dk_sums.dss_nread, 0); wmsum_init(&dk->dk_sums.dss_nunlinks, 0); wmsum_init(&dk->dk_sums.dss_nunlinked, 0); + zil_sums_init(&dk->dk_zil_sums); + + dk->dk_kstats = kstat; + kstat_install(kstat); + return (0); } void @@ -155,19 +183,31 @@ dataset_kstats_destroy(dataset_kstats_t *dk) return; dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; + kstat_delete(dk->dk_kstats); + dk->dk_kstats = NULL; kmem_free(KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name), KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name)); kmem_free(dkv, sizeof (empty_dataset_kstats)); - kstat_delete(dk->dk_kstats); - dk->dk_kstats = NULL; - wmsum_fini(&dk->dk_sums.dss_writes); wmsum_fini(&dk->dk_sums.dss_nwritten); wmsum_fini(&dk->dk_sums.dss_reads); wmsum_fini(&dk->dk_sums.dss_nread); wmsum_fini(&dk->dk_sums.dss_nunlinks); wmsum_fini(&dk->dk_sums.dss_nunlinked); + zil_sums_fini(&dk->dk_zil_sums); +} + +void +dataset_kstats_rename(dataset_kstats_t *dk, const char *name) +{ + dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; + char *ds_name; + + ds_name = KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name); + ASSERT3S(ds_name, !=, NULL); + (void) strlcpy(ds_name, name, + KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name)); } void diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index fe54da425286..56fe2c4dbe30 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -6,7 +6,7 @@ * You may not use this file except in 
compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -26,6 +26,7 @@ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude + * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ #include <sys/zfs_context.h> @@ -49,13 +50,14 @@ #include <sys/trace_zfs.h> #include <sys/callb.h> #include <sys/abd.h> +#include <sys/brt.h> #include <sys/vdev.h> #include <cityhash.h> #include <sys/spa_impl.h> #include <sys/wmsum.h> #include <sys/vdev_impl.h> -kstat_t *dbuf_ksp; +static kstat_t *dbuf_ksp; typedef struct dbuf_stats { /* @@ -100,6 +102,11 @@ typedef struct dbuf_stats { */ kstat_named_t hash_insert_race; /* + * Number of entries in the hash table dbuf and mutex arrays. + */ + kstat_named_t hash_table_count; + kstat_named_t hash_mutex_count; + /* * Statistics about the size of the metadata dbuf cache. */ kstat_named_t metadata_cache_count; @@ -131,6 +138,8 @@ dbuf_stats_t dbuf_stats = { { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "hash_insert_race", KSTAT_DATA_UINT64 }, + { "hash_table_count", KSTAT_DATA_UINT64 }, + { "hash_mutex_count", KSTAT_DATA_UINT64 }, { "metadata_cache_count", KSTAT_DATA_UINT64 }, { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 }, { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 }, @@ -152,13 +161,13 @@ struct { } dbuf_sums; #define DBUF_STAT_INCR(stat, val) \ - wmsum_add(&dbuf_sums.stat, val); + wmsum_add(&dbuf_sums.stat, val) #define DBUF_STAT_DECR(stat, val) \ - DBUF_STAT_INCR(stat, -(val)); + DBUF_STAT_INCR(stat, -(val)) #define DBUF_STAT_BUMP(stat) \ - DBUF_STAT_INCR(stat, 1); + DBUF_STAT_INCR(stat, 1) #define DBUF_STAT_BUMPDOWN(stat) \ - DBUF_STAT_INCR(stat, -1); + DBUF_STAT_INCR(stat, -1) #define DBUF_STAT_MAX(stat, v) { \ uint64_t _m; \ while ((v) > (_m = dbuf_stats.stat.value.ui64) && \ @@ -166,15 +175,8 @@ struct { continue; \ } -static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr); -static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags); - -extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, - dmu_buf_evict_func_t *evict_func_sync, - dmu_buf_evict_func_t *evict_func_async, - dmu_buf_t **clear_on_evict_dbufp); /* * Global data structures and functions for the dbuf cache. 
@@ -225,12 +227,15 @@ typedef struct dbuf_cache { dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; /* Size limits for the caches */ -unsigned long dbuf_cache_max_bytes = ULONG_MAX; -unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX; +static uint64_t dbuf_cache_max_bytes = UINT64_MAX; +static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX; /* Set the default sizes of the caches to log2 fraction of arc size */ -int dbuf_cache_shift = 5; -int dbuf_metadata_cache_shift = 6; +static uint_t dbuf_cache_shift = 5; +static uint_t dbuf_metadata_cache_shift = 6; + +/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */ +static uint_t dbuf_mutex_cache_shift = 0; static unsigned long dbuf_cache_target_bytes(void); static unsigned long dbuf_metadata_cache_target_bytes(void); @@ -277,18 +282,18 @@ static unsigned long dbuf_metadata_cache_target_bytes(void); /* * The percentage above and below the maximum cache size. */ -uint_t dbuf_cache_hiwater_pct = 10; -uint_t dbuf_cache_lowater_pct = 10; +static uint_t dbuf_cache_hiwater_pct = 10; +static uint_t dbuf_cache_lowater_pct = 10; -/* ARGSUSED */ static int dbuf_cons(void *vdb, void *unused, int kmflag) { + (void) unused, (void) kmflag; dmu_buf_impl_t *db = vdb; - bzero(db, sizeof (dmu_buf_impl_t)); + memset(db, 0, sizeof (dmu_buf_impl_t)); - mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); - rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&db->db_mtx, NULL, MUTEX_NOLOCKDEP, NULL); + rw_init(&db->db_rwlock, NULL, RW_NOLOCKDEP, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); multilist_link_init(&db->db_cache_link); zfs_refcount_create(&db->db_holds); @@ -296,10 +301,10 @@ dbuf_cons(void *vdb, void *unused, int kmflag) return (0); } -/* ARGSUSED */ static void dbuf_dest(void *vdb, void *unused) { + (void) unused; dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); rw_destroy(&db->db_rwlock); @@ -334,7 +339,8 @@ dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) (dbuf)->db_blkid == (blkid)) dmu_buf_impl_t * -dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) +dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid, + uint64_t *hash_out) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv; @@ -356,6 +362,8 @@ dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) } } mutex_exit(DBUF_HASH_MUTEX(h, idx)); + if (hash_out != NULL) + *hash_out = hv; return (NULL); } @@ -390,13 +398,13 @@ dbuf_hash_insert(dmu_buf_impl_t *db) objset_t *os = db->db_objset; uint64_t obj = db->db.db_object; int level = db->db_level; - uint64_t blkid, hv, idx; + uint64_t blkid, idx; dmu_buf_impl_t *dbf; uint32_t i; blkid = db->db_blkid; - hv = dbuf_hash(os, obj, level, blkid); - idx = hv & h->hash_table_mask; + ASSERT3U(dbuf_hash(os, obj, level, blkid), ==, db->db_hash); + idx = db->db_hash & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (dbf = h->hash_table[idx], i = 0; dbf != NULL; @@ -470,12 +478,12 @@ static void dbuf_hash_remove(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; - uint64_t hv, idx; + uint64_t idx; dmu_buf_impl_t *dbf, **dbp; - hv = dbuf_hash(db->db_objset, db->db.db_object, - db->db_level, db->db_blkid); - idx = hv & h->hash_table_mask; + ASSERT3U(dbuf_hash(db->db_objset, db->db.db_object, db->db_level, + db->db_blkid), ==, db->db_hash); + idx = db->db_hash & h->hash_table_mask; /* * We mustn't hold db_mtx to maintain lock ordering: @@ -560,6 +568,21 @@ dbuf_evict_user(dmu_buf_impl_t *db) *dbu->dbu_clear_on_evict_dbufp = NULL; #endif + 
if (db->db_caching_status != DB_NO_CACHE) { + /* + * This is a cached dbuf, so the size of the user data is + * included in its cached amount. We adjust it here because the + * user data has already been detached from the dbuf, and the + * sync functions are not supposed to touch it (the dbuf might + * not exist anymore by the time the sync functions run. + */ + uint64_t size = dbu->dbu_size; + (void) zfs_refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, size, dbu); + if (db->db_caching_status == DB_DBUF_CACHE) + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); + } + /* * There are two eviction callbacks - one that we call synchronously * and one that we invoke via a taskq. The async one is useful for @@ -607,58 +630,58 @@ dbuf_is_metadata(dmu_buf_impl_t *db) boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db) { - vdev_t *vd = NULL; - zfs_cache_type_t cache = db->db_objset->os_secondary_cache; - blkptr_t *bp = db->db_blkptr; - - if (bp != NULL && !BP_IS_HOLE(bp)) { + if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL || + (db->db_objset->os_secondary_cache == + ZFS_CACHE_METADATA && dbuf_is_metadata(db))) { + if (l2arc_exclude_special == 0) + return (B_TRUE); + + blkptr_t *bp = db->db_blkptr; + if (bp == NULL || BP_IS_HOLE(bp)) + return (B_FALSE); uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev; + vdev_t *vd = NULL; if (vdev < rvd->vdev_children) vd = rvd->vdev_child[vdev]; - if (cache == ZFS_CACHE_ALL || - (dbuf_is_metadata(db) && cache == ZFS_CACHE_METADATA)) { - if (vd == NULL) - return (B_TRUE); + if (vd == NULL) + return (B_TRUE); - if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && - vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || - l2arc_exclude_special == 0) - return (B_TRUE); - } + if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && + vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) + return (B_TRUE); } - return (B_FALSE); } static inline boolean_t dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level) { - vdev_t *vd = NULL; - zfs_cache_type_t cache = dn->dn_objset->os_secondary_cache; - - if (bp != NULL && !BP_IS_HOLE(bp)) { + if (dn->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || + (dn->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA && + (level > 0 || + DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)))) { + if (l2arc_exclude_special == 0) + return (B_TRUE); + + if (bp == NULL || BP_IS_HOLE(bp)) + return (B_FALSE); uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev; + vdev_t *vd = NULL; if (vdev < rvd->vdev_children) vd = rvd->vdev_child[vdev]; - if (cache == ZFS_CACHE_ALL || ((level > 0 || - DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)) && - cache == ZFS_CACHE_METADATA)) { - if (vd == NULL) - return (B_TRUE); + if (vd == NULL) + return (B_TRUE); - if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && - vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || - l2arc_exclude_special == 0) - return (B_TRUE); - } + if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && + vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) + return (B_TRUE); } - return (B_FALSE); } @@ -745,7 +768,7 @@ static void dbuf_evict_one(void) { int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache); - multilist_sublist_t *mls = multilist_sublist_lock( + multilist_sublist_t *mls = multilist_sublist_lock_idx( &dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); @@ -761,12 +784,15 @@ dbuf_evict_one(void) if (db != NULL) { multilist_sublist_remove(mls, db); 
multilist_sublist_unlock(mls); + uint64_t size = db->db.db_size; + uint64_t usize = dmu_buf_user_size(&db->db); + (void) zfs_refcount_remove_many( + &dbuf_caches[DB_DBUF_CACHE].size, size, db); (void) zfs_refcount_remove_many( - &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db); + &dbuf_caches[DB_DBUF_CACHE].size, usize, db->db_user); DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size + usize); ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); @@ -783,10 +809,10 @@ dbuf_evict_one(void) * of the dbuf cache is at or below the maximum size. Once the dbuf is aged * out of the cache it is destroyed and becomes eligible for arc eviction. */ -/* ARGSUSED */ -static void +static __attribute__((noreturn)) void dbuf_evict_thread(void *unused) { + (void) unused; callb_cpr_t cpr; CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); @@ -822,7 +848,7 @@ dbuf_evict_thread(void *unused) /* * Wake up the dbuf eviction thread if the dbuf cache is at its max size. * If the dbuf cache is at its high water mark, then evict a dbuf from the - * dbuf cache using the callers context. + * dbuf cache using the caller's context. */ static void dbuf_evict_notify(uint64_t size) @@ -843,6 +869,7 @@ static int dbuf_kstat_update(kstat_t *ksp, int rw) { dbuf_stats_t *ds = ksp->ks_data; + dbuf_hash_table_t *h = &dbuf_hash_table; if (rw == KSTAT_WRITE) return (SET_ERROR(EACCES)); @@ -872,6 +899,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw) wmsum_value(&dbuf_sums.hash_chains); ds->hash_insert_race.value.ui64 = wmsum_value(&dbuf_sums.hash_insert_race); + ds->hash_table_count.value.ui64 = h->hash_table_mask + 1; + ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1; ds->metadata_cache_count.value.ui64 = wmsum_value(&dbuf_sums.metadata_cache_count); ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count( @@ -884,9 +913,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw) void dbuf_init(void) { - uint64_t hsize = 1ULL << 16; + uint64_t hmsize, hsize = 1ULL << 16; dbuf_hash_table_t *h = &dbuf_hash_table; - int i; /* * The hash table is big enough to fill one eighth of physical memory @@ -897,30 +925,43 @@ dbuf_init(void) while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8) hsize <<= 1; -retry: - h->hash_table_mask = hsize - 1; -#if defined(_KERNEL) + h->hash_table = NULL; + while (h->hash_table == NULL) { + h->hash_table_mask = hsize - 1; + + h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP); + if (h->hash_table == NULL) + hsize >>= 1; + + ASSERT3U(hsize, >=, 1ULL << 10); + } + /* - * Large allocations which do not require contiguous pages - * should be using vmem_alloc() in the linux kernel + * The hash table buckets are protected by an array of mutexes where + * each mutex is reponsible for protecting 128 buckets. A minimum + * array size of 8192 is targeted to avoid contention. 
*/ - h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP); -#else - h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); -#endif - if (h->hash_table == NULL) { - /* XXX - we should really return an error instead of assert */ - ASSERT(hsize > (1ULL << 10)); - hsize >>= 1; - goto retry; + if (dbuf_mutex_cache_shift == 0) + hmsize = MAX(hsize >> 7, 1ULL << 13); + else + hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24); + + h->hash_mutexes = NULL; + while (h->hash_mutexes == NULL) { + h->hash_mutex_mask = hmsize - 1; + + h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t), + KM_SLEEP); + if (h->hash_mutexes == NULL) + hmsize >>= 1; } dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); - for (i = 0; i < DBUF_MUTEXES; i++) - mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); + for (int i = 0; i < hmsize; i++) + mutex_init(&h->hash_mutexes[i], NULL, MUTEX_NOLOCKDEP, NULL); dbuf_stats_init(h); @@ -946,7 +987,7 @@ retry: wmsum_init(&dbuf_sums.cache_count, 0); wmsum_init(&dbuf_sums.cache_total_evicts, 0); - for (i = 0; i < DN_MAX_LEVELS; i++) { + for (int i = 0; i < DN_MAX_LEVELS; i++) { wmsum_init(&dbuf_sums.cache_levels[i], 0); wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0); } @@ -962,7 +1003,7 @@ retry: KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (dbuf_ksp != NULL) { - for (i = 0; i < DN_MAX_LEVELS; i++) { + for (int i = 0; i < DN_MAX_LEVELS; i++) { snprintf(dbuf_stats.cache_levels[i].name, KSTAT_STRLEN, "cache_level_%d", i); dbuf_stats.cache_levels[i].data_type = @@ -982,21 +1023,16 @@ void dbuf_fini(void) { dbuf_hash_table_t *h = &dbuf_hash_table; - int i; dbuf_stats_destroy(); - for (i = 0; i < DBUF_MUTEXES; i++) + for (int i = 0; i < (h->hash_mutex_mask + 1); i++) mutex_destroy(&h->hash_mutexes[i]); -#if defined(_KERNEL) - /* - * Large allocations which do not require contiguous pages - * should be using vmem_free() in the linux kernel - */ + vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); -#else - kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); -#endif + vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) * + sizeof (kmutex_t)); + kmem_cache_destroy(dbuf_kmem_cache); taskq_destroy(dbu_evict_taskq); @@ -1023,7 +1059,7 @@ dbuf_fini(void) wmsum_fini(&dbuf_sums.cache_count); wmsum_fini(&dbuf_sums.cache_total_evicts); - for (i = 0; i < DN_MAX_LEVELS; i++) { + for (int i = 0; i < DN_MAX_LEVELS; i++) { wmsum_fini(&dbuf_sums.cache_levels[i]); wmsum_fini(&dbuf_sums.cache_levels_bytes[i]); } @@ -1137,7 +1173,7 @@ dbuf_verify(dmu_buf_impl_t *db) if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && - db->db_state != DB_FILL && !dn->dn_free_txg) { + db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) { /* * If the blkptr isn't set but they have nonzero data, * it had better be dirty, otherwise we'll lose that @@ -1183,7 +1219,7 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT0(bp->blk_pad[1]); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(BP_IS_HOLE(bp)); - ASSERT0(bp->blk_phys_birth); + ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); } } } @@ -1240,7 +1276,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) mutex_exit(&db->db_mtx); abuf = arc_loan_buf(spa, B_FALSE, blksz); - bcopy(db->db.db_data, abuf->b_data, blksz); + memcpy(abuf->b_data, db->db.db_data, blksz); } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); @@ -1302,7 
+1338,7 @@ dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset) * used when modifying or reading db_blkptr. */ db_lock_type_t -dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag) +dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, const void *tag) { enum db_lock_type ret = DLT_NONE; if (db->db_parent != NULL) { @@ -1327,7 +1363,7 @@ dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag) * panic if we didn't pass the lock type in. */ void -dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag) +dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, const void *tag) { if (type == DLT_PARENT) rw_exit(&db->db_parent->db_rwlock); @@ -1339,6 +1375,7 @@ static void dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *vdb) { + (void) zb, (void) bp; dmu_buf_impl_t *db = vdb; mutex_enter(&db->db_mtx); @@ -1360,7 +1397,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, /* freed in flight */ ASSERT(zio == NULL || zio->io_error == 0); arc_release(buf, db); - bzero(buf->b_data, db->db.db_size); + memset(buf->b_data, 0, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); @@ -1383,13 +1420,9 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, * a decrypted block. Otherwise success. */ static int -dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) +dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn) { - int bonuslen, max_bonuslen, err; - - err = dbuf_read_verify_dnode_crypt(db, flags); - if (err) - return (err); + int bonuslen, max_bonuslen; bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); @@ -1399,16 +1432,16 @@ dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP); arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); if (bonuslen < max_bonuslen) - bzero(db->db.db_data, max_bonuslen); + memset(db->db.db_data, 0, max_bonuslen); if (bonuslen) - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + memcpy(db->db.db_data, DN_BONUS(dn->dn_phys), bonuslen); db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "bonus buffer filled"); return (0); } static void -dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn) +dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp) { blkptr_t *bps = db->db.db_data; uint32_t indbs = 1ULL << dn->dn_indblkshift; @@ -1417,12 +1450,12 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn) for (int i = 0; i < n_bps; i++) { blkptr_t *bp = &bps[i]; - ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs); - BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ? - dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr)); - BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); - BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1); - BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); + ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs); + BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ? + dn->dn_datablksz : BP_GET_LSIZE(dbbp)); + BP_SET_TYPE(bp, BP_GET_TYPE(dbbp)); + BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1); + BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0); } } @@ -1432,30 +1465,27 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn) * was taken, ENOENT if no action was taken. 
*/ static int -dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) +dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp) { ASSERT(MUTEX_HELD(&db->db_mtx)); - int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr); + int is_hole = bp == NULL || BP_IS_HOLE(bp); /* * For level 0 blocks only, if the above check fails: * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() * processes the delete record and clears the bp while we are waiting * for the dn_mtx (resulting in a "no" from block_freed). */ - if (!is_hole && db->db_level == 0) { - is_hole = dnode_block_freed(dn, db->db_blkid) || - BP_IS_HOLE(db->db_blkptr); - } + if (!is_hole && db->db_level == 0) + is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp); if (is_hole) { dbuf_set_data(db, dbuf_alloc_arcbuf(db)); - bzero(db->db.db_data, db->db.db_size); + memset(db->db.db_data, 0, db->db.db_size); - if (db->db_blkptr != NULL && db->db_level > 0 && - BP_IS_HOLE(db->db_blkptr) && - db->db_blkptr->blk_birth != 0) { - dbuf_handle_indirect_hole(db, dn); + if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) && + BP_GET_LOGICAL_BIRTH(bp) != 0) { + dbuf_handle_indirect_hole(db, dn, bp); } db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "hole read satisfied"); @@ -1477,32 +1507,46 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) * decrypt / authenticate them when we need to read an encrypted bonus buffer. */ static int -dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) +dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) { - int err = 0; objset_t *os = db->db_objset; - arc_buf_t *dnode_abuf; - dnode_t *dn; + dmu_buf_impl_t *dndb; + arc_buf_t *dnbuf; zbookmark_phys_t zb; + int err; - ASSERT(MUTEX_HELD(&db->db_mtx)); + if ((flags & DB_RF_NO_DECRYPT) != 0 || + !os->os_encrypted || os->os_raw_receive || + (dndb = dn->dn_dbuf) == NULL) + return (0); - if (!os->os_encrypted || os->os_raw_receive || - (flags & DB_RF_NO_DECRYPT) != 0) + dnbuf = dndb->db_buf; + if (!arc_is_encrypted(dnbuf)) return (0); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL; + mutex_enter(&dndb->db_mtx); - if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) { - DB_DNODE_EXIT(db); - return (0); - } + /* + * Since dnode buffer is modified by sync process, there can be only + * one copy of it. It means we can not modify (decrypt) it while it + * is being written. I don't see how this may happen now, since + * encrypted dnode writes by receive should be completed before any + * plain-text reads due to txg wait, but better be safe than sorry. + */ + while (1) { + if (!arc_is_encrypted(dnbuf)) { + mutex_exit(&dndb->db_mtx); + return (0); + } + dbuf_dirty_record_t *dr = dndb->db_data_pending; + if (dr == NULL || dr->dt.dl.dr_data != dnbuf) + break; + cv_wait(&dndb->db_changed, &dndb->db_mtx); + }; SET_BOOKMARK(&zb, dmu_objset_id(os), - DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid); - err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE); + DMU_META_DNODE_OBJECT, 0, dndb->db_blkid); + err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE); /* * An error code of EACCES tells us that the key is still not @@ -1515,7 +1559,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype)))) err = 0; - DB_DNODE_EXIT(db); + mutex_exit(&dndb->db_mtx); return (err); } @@ -1525,39 +1569,63 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) * returning. 
*/ static int -dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, - db_lock_type_t dblt, void *tag) +dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, + db_lock_type_t dblt, const void *tag) { - dnode_t *dn; zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; + blkptr_t bp, *bpp = NULL; - err = zio_flags = 0; - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_state == DB_UNCACHED); + ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_buf == NULL); ASSERT(db->db_parent == NULL || RW_LOCK_HELD(&db->db_parent->db_rwlock)); if (db->db_blkid == DMU_BONUS_BLKID) { - err = dbuf_read_bonus(db, dn, flags); + err = dbuf_read_bonus(db, dn); goto early_unlock; } - err = dbuf_read_hole(db, dn, flags); + /* + * If we have a pending block clone, we don't want to read the + * underlying block, but the content of the block being cloned, + * pointed by the dirty record, so we have the most recent data. + * If there is no dirty record, then we hit a race in a sync + * process when the dirty record is already removed, while the + * dbuf is not yet destroyed. Such case is equivalent to uncached. + */ + if (db->db_state == DB_NOFILL) { + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); + if (dr != NULL) { + if (!dr->dt.dl.dr_brtwrite) { + err = EIO; + goto early_unlock; + } + bp = dr->dt.dl.dr_overridden_by; + bpp = &bp; + } + } + + if (bpp == NULL && db->db_blkptr != NULL) { + bp = *db->db_blkptr; + bpp = &bp; + } + + err = dbuf_read_hole(db, dn, bpp); if (err == 0) goto early_unlock; + ASSERT(bpp != NULL); + /* * Any attempt to read a redacted block should result in an error. This * will never happen under normal conditions, but can be useful for * debugging purposes. */ - if (BP_IS_REDACTED(db->db_blkptr)) { + if (BP_IS_REDACTED(bpp)) { ASSERT(dsl_dataset_feature_is_active( db->db_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); @@ -1572,25 +1640,20 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, * All bps of an encrypted os should have the encryption bit set. * If this is not true it indicates tampering and we report an error. */ - if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) { - spa_log_error(db->db_objset->os_spa, &zb); - zfs_panic_recover("unencrypted block in encrypted " - "object set %llu", dmu_objset_id(db->db_objset)); + if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) { + spa_log_error(db->db_objset->os_spa, &zb, + BP_GET_LOGICAL_BIRTH(bpp)); err = SET_ERROR(EIO); goto early_unlock; } - err = dbuf_read_verify_dnode_crypt(db, flags); - if (err != 0) - goto early_unlock; - - DB_DNODE_EXIT(db); - db->db_state = DB_READ; DTRACE_SET_STATE(db, "read issued"); mutex_exit(&db->db_mtx); - if (dbuf_is_l2cacheable(db)) + if (!DBUF_IS_CACHEABLE(db)) + aflags |= ARC_FLAG_UNCACHED; + else if (dbuf_is_l2cacheable(db)) aflags |= ARC_FLAG_L2CACHE; dbuf_add_ref(db, NULL); @@ -1601,20 +1664,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr)) zio_flags |= ZIO_FLAG_RAW; /* - * The zio layer will copy the provided blkptr later, but we need to - * do this now so that we can release the parent's rwlock. 
We have to - * do that now so that if dbuf_read_done is called synchronously (on + * The zio layer will copy the provided blkptr later, but we have our + * own copy so that we can release the parent's rwlock. We have to + * do that so that if dbuf_read_done is called synchronously (on * an l1 cache hit) we don't acquire the db_mtx while holding the * parent's rwlock, which would be a lock ordering violation. */ - blkptr_t bp = *db->db_blkptr; dmu_buf_unlock_parent(db, dblt, tag); - (void) arc_read(zio, db->db_objset->os_spa, &bp, + return (arc_read(zio, db->db_objset->os_spa, bpp, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, - &aflags, &zb); - return (err); + &aflags, &zb)); + early_unlock: - DB_DNODE_EXIT(db); mutex_exit(&db->db_mtx); dmu_buf_unlock_parent(db, dblt, tag); return (err); @@ -1661,7 +1722,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP); arc_space_consume(bonuslen, ARC_SPACE_BONUS); - bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen); + memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen); } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) { dnode_t *dn = DB_DNODE(db); int size = arc_buf_size(db->db_buf); @@ -1691,7 +1752,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) } else { dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); } - bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); + memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size); } else { db->db_buf = NULL; dbuf_clear_data(db); @@ -1699,38 +1760,65 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) } int -dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) +dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) { - int err = 0; - boolean_t prefetch; dnode_t *dn; + boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch; + int err; - /* - * We don't have to hold the mutex to check db_state because it - * can't be freed while we have a hold on the buffer. - */ ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - if (db->db_state == DB_NOFILL) - return (SET_ERROR(EIO)); - DB_DNODE_ENTER(db); dn = DB_DNODE(db); + /* + * Ensure that this block's dnode has been decrypted if the caller + * has requested decrypted data. + */ + err = dbuf_read_verify_dnode_crypt(db, dn, flags); + if (err != 0) + goto done; + prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && - DBUF_IS_CACHEABLE(db); + (flags & DB_RF_NOPREFETCH) == 0; mutex_enter(&db->db_mtx); - if (db->db_state == DB_CACHED) { - spa_t *spa = dn->dn_objset->os_spa; + if (flags & DB_RF_PARTIAL_FIRST) + db->db_partial_read = B_TRUE; + else if (!(flags & DB_RF_PARTIAL_MORE)) + db->db_partial_read = B_FALSE; + miss = (db->db_state != DB_CACHED); + if (db->db_state == DB_READ || db->db_state == DB_FILL) { /* - * Ensure that this block's dnode has been decrypted if - * the caller has requested decrypted data. + * Another reader came in while the dbuf was in flight between + * UNCACHED and CACHED. Either a writer will finish filling + * the buffer, sending the dbuf to CACHED, or the first reader's + * request will reach the read_done callback and send the dbuf + * to CACHED. Otherwise, a failure occurred and the dbuf will + * be sent to UNCACHED. 
*/ - err = dbuf_read_verify_dnode_crypt(db, flags); + if (flags & DB_RF_NEVERWAIT) { + mutex_exit(&db->db_mtx); + DB_DNODE_EXIT(db); + goto done; + } + do { + ASSERT(db->db_state == DB_READ || + (flags & DB_RF_HAVESTRUCT) == 0); + DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db, + zio_t *, pio); + cv_wait(&db->db_changed, &db->db_mtx); + } while (db->db_state == DB_READ || db->db_state == DB_FILL); + if (db->db_state == DB_UNCACHED) { + err = SET_ERROR(EIO); + mutex_exit(&db->db_mtx); + DB_DNODE_EXIT(db); + goto done; + } + } + if (db->db_state == DB_CACHED) { /* * If the arc buf is compressed or encrypted and the caller * requested uncompressed data, we need to untransform it @@ -1738,11 +1826,11 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) * unauthenticated blocks, which will verify their MAC if * the key is now available. */ - if (err == 0 && db->db_buf != NULL && - (flags & DB_RF_NO_DECRYPT) == 0 && + if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL && (arc_is_encrypted(db->db_buf) || arc_is_unauthenticated(db->db_buf) || arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) { + spa_t *spa = dn->dn_objset->os_spa; zbookmark_phys_t zb; SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), @@ -1752,80 +1840,49 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) dbuf_set_data(db, db->db_buf); } mutex_exit(&db->db_mtx); - if (err == 0 && prefetch) { - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, - B_FALSE, flags & DB_RF_HAVESTRUCT); - } - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_hits); - } else if (db->db_state == DB_UNCACHED) { - spa_t *spa = dn->dn_objset->os_spa; - boolean_t need_wait = B_FALSE; - + } else { + ASSERT(db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL); db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); - - if (zio == NULL && - db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { - zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + if (pio == NULL && (db->db_state == DB_NOFILL || + (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) { + spa_t *spa = dn->dn_objset->os_spa; + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } - err = dbuf_read_impl(db, zio, flags, dblt, FTAG); - /* - * dbuf_read_impl has dropped db_mtx and our parent's rwlock - * for us - */ - if (!err && prefetch) { - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, - db->db_state != DB_CACHED, - flags & DB_RF_HAVESTRUCT); - } + err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG); + /* dbuf_read_impl drops db_mtx and parent's rwlock. */ + miss = (db->db_state != DB_CACHED); + } - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_misses); + if (err == 0 && prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss, + flags & DB_RF_HAVESTRUCT); + } + DB_DNODE_EXIT(db); - /* - * If we created a zio_root we must execute it to avoid - * leaking it, even if it isn't attached to any work due - * to an error in dbuf_read_impl(). - */ - if (need_wait) { - if (err == 0) - err = zio_wait(zio); - else - VERIFY0(zio_wait(zio)); - } - } else { - /* - * Another reader came in while the dbuf was in flight - * between UNCACHED and CACHED. Either a writer will finish - * writing the buffer (sending the dbuf to CACHED) or the - * first reader's request will reach the read_done callback - * and send the dbuf to CACHED. Otherwise, a failure - * occurred and the dbuf went to UNCACHED. 
- */ - mutex_exit(&db->db_mtx); - if (prefetch) { - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, - B_TRUE, flags & DB_RF_HAVESTRUCT); - } - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_misses); + /* + * If we created a zio we must execute it to avoid leaking it, even if + * it isn't attached to any work due to an error in dbuf_read_impl(). + */ + if (need_wait) { + if (err == 0) + err = zio_wait(pio); + else + (void) zio_wait(pio); + pio = NULL; + } - /* Skip the wait per the caller's request. */ - if ((flags & DB_RF_NEVERWAIT) == 0) { - mutex_enter(&db->db_mtx); - while (db->db_state == DB_READ || - db->db_state == DB_FILL) { - ASSERT(db->db_state == DB_READ || - (flags & DB_RF_HAVESTRUCT) == 0); - DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, - db, zio_t *, zio); - cv_wait(&db->db_changed, &db->db_mtx); - } - if (db->db_state == DB_UNCACHED) - err = SET_ERROR(EIO); - mutex_exit(&db->db_mtx); - } +done: + if (miss) + DBUF_STAT_BUMP(hash_misses); + else + DBUF_STAT_BUMP(hash_hits); + if (pio && err != 0) { + zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL, + ZIO_FLAG_CANFAIL); + zio->io_error = err; + zio_nowait(zio); } return (err); @@ -1879,8 +1936,13 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) zio_free(db->db_objset->os_spa, txg, bp); + if (dr->dt.dl.dr_brtwrite) { + ASSERT0P(dr->dt.dl.dr_data); + dr->dt.dl.dr_data = db->db_buf; + } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; + dr->dt.dl.dr_brtwrite = B_FALSE; dr->dt.dl.dr_has_raw_params = B_FALSE; /* @@ -1891,7 +1953,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) * the buf thawed to save the effort of freezing & * immediately re-thawing it. */ - arc_release(dr->dt.dl.dr_data, db); + if (dr->dt.dl.dr_data) + arc_release(dr->dt.dl.dr_data, db); } /* @@ -1989,7 +2052,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); rw_enter(&db->db_rwlock, RW_WRITER); - bzero(db->db.db_data, db->db.db_size); + memset(db->db.db_data, 0, db->db.db_size); rw_exit(&db->db_rwlock); arc_buf_freeze(db->db_buf); } @@ -1997,8 +2060,8 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, mutex_exit(&db->db_mtx); } - kmem_free(db_search, sizeof (dmu_buf_impl_t)); mutex_exit(&dn->dn_dbufs_mtx); + kmem_free(db_search, sizeof (dmu_buf_impl_t)); } void @@ -2026,10 +2089,10 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) /* copy old block data to the new block */ old_buf = db->db_buf; - bcopy(old_buf->b_data, buf->b_data, MIN(osize, size)); + memcpy(buf->b_data, old_buf->b_data, MIN(osize, size)); /* zero the remainder */ if (size > osize) - bzero((uint8_t *)buf->b_data + osize, size - osize); + memset((uint8_t *)buf->b_data + osize, 0, size - osize); mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); @@ -2110,7 +2173,8 @@ dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) * Otherwise the buffer contents could be inconsistent between the * dbuf and the lightweight dirty record. 
*/ - ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid)); + ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid, + NULL)); mutex_enter(&dn->dn_mtx); int txgoff = tx->tx_txg & TXG_MASK; @@ -2262,7 +2326,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); - if (db->db_blkid != DMU_BONUS_BLKID) { + if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) { dmu_objset_willuse_space(os, db->db.db_size, tx); } @@ -2305,8 +2369,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } - if (db->db_blkid != DMU_BONUS_BLKID) + if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) { dr->dr_accounted = db->db.db_size; + } dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; list_insert_before(&db->db_dirty_records, dr_next, dr); @@ -2462,10 +2527,11 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr) * Undirty a buffer in the transaction group referenced by the given * transaction. Return whether this evicted the dbuf. */ -static boolean_t +boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { uint64_t txg = tx->tx_txg; + boolean_t brtwrite; ASSERT(txg != 0); @@ -2490,6 +2556,16 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (B_FALSE); ASSERT(dr->dr_dbuf == db); + brtwrite = dr->dt.dl.dr_brtwrite; + if (brtwrite) { + /* + * We are freeing a block that we cloned in the same + * transaction group. + */ + brt_pending_remove(dmu_objset_spa(db->db_objset), + &dr->dt.dl.dr_overridden_by, tx); + } + dnode_t *dn = dr->dr_dnode; dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); @@ -2519,7 +2595,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); } - if (db->db_state != DB_NOFILL) { + if (db->db_state != DB_NOFILL && !brtwrite) { dbuf_unoverride(dr); ASSERT(db->db_buf != NULL); @@ -2534,7 +2610,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_dirtycnt -= 1; if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { - ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); + ASSERT(db->db_state == DB_NOFILL || brtwrite || + arc_released(db->db_buf)); dbuf_destroy(db); return (B_TRUE); } @@ -2546,30 +2623,40 @@ static void dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + boolean_t undirty = B_FALSE; ASSERT(tx->tx_txg != 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); /* - * Quick check for dirtiness. For already dirty blocks, this - * reduces runtime of this function by >90%, and overall performance - * by 50% for some workloads (e.g. file deletion with indirect blocks - * cached). + * Quick check for dirtiness to improve performance for some workloads + * (e.g. file deletion with indirect blocks cached). */ mutex_enter(&db->db_mtx); - - if (db->db_state == DB_CACHED) { - dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); + if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) { /* - * It's possible that it is already dirty but not cached, + * It's possible that the dbuf is already dirty but not cached, * because there are some calls to dbuf_dirty() that don't * go through dmu_buf_will_dirty(). */ + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); if (dr != NULL) { - /* This dbuf is already dirty and cached. 
*/ - dbuf_redirty(dr); - mutex_exit(&db->db_mtx); - return; + if (db->db_level == 0 && + dr->dt.dl.dr_brtwrite) { + /* + * Block cloning: If we are dirtying a cloned + * level 0 block, we cannot simply redirty it, + * because this dr has no associated data. + * We will go through a full undirtying below, + * before dirtying it again. + */ + undirty = B_TRUE; + } else { + /* This dbuf is already dirty and cached. */ + dbuf_redirty(dr); + mutex_exit(&db->db_mtx); + return; + } } } mutex_exit(&db->db_mtx); @@ -2578,7 +2665,20 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) flags |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); + + /* + * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we + * want to make sure dbuf_read() will read the pending cloned block and + * not the uderlying block that is being replaced. dbuf_undirty() will + * do dbuf_unoverride(), so we will end up with cloned block content, + * without overridden BP. + */ (void) dbuf_read(db, NULL, flags); + if (undirty) { + mutex_enter(&db->db_mtx); + VERIFY(!dbuf_undirty(db, tx)); + mutex_exit(&db->db_mtx); + } (void) dbuf_dirty(db, tx); } @@ -2602,17 +2702,51 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) } void +dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + /* + * Block cloning: We are going to clone into this block, so undirty + * modifications done to this block so far in this txg. This includes + * writes and clones into this block. + */ + mutex_enter(&db->db_mtx); + DBUF_VERIFY(db); + VERIFY(!dbuf_undirty(db, tx)); + ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg)); + if (db->db_buf != NULL) { + arc_buf_destroy(db->db_buf, db); + db->db_buf = NULL; + dbuf_clear_data(db); + } + + db->db_state = DB_NOFILL; + DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone"); + + DBUF_VERIFY(db); + mutex_exit(&db->db_mtx); + + dbuf_noread(db); + (void) dbuf_dirty(db, tx); +} + +void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + mutex_enter(&db->db_mtx); db->db_state = DB_NOFILL; DTRACE_SET_STATE(db, "allocating NOFILL buffer"); - dmu_buf_will_fill(db_fake, tx); + mutex_exit(&db->db_mtx); + + dbuf_noread(db); + (void) dbuf_dirty(db, tx); } void -dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) +dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; @@ -2624,6 +2758,25 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); + mutex_enter(&db->db_mtx); + if (db->db_state == DB_NOFILL) { + /* + * Block cloning: We will be completely overwriting a block + * cloned in this transaction group, so let's undirty the + * pending clone and mark the block as uncached. This will be + * as if the clone was never done. But if the fill can fail + * we should have a way to return back to the cloned data. 
+ */ + if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) { + mutex_exit(&db->db_mtx); + dmu_buf_will_dirty(db_fake, tx); + return; + } + VERIFY(!dbuf_undirty(db, tx)); + db->db_state = DB_UNCACHED; + } + mutex_exit(&db->db_mtx); + dbuf_noread(db); (void) dbuf_dirty(db, tx); } @@ -2659,9 +2812,9 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, dr->dt.dl.dr_has_raw_params = B_TRUE; dr->dt.dl.dr_byteorder = byteorder; - bcopy(salt, dr->dt.dl.dr_salt, ZIO_DATA_SALT_LEN); - bcopy(iv, dr->dt.dl.dr_iv, ZIO_DATA_IV_LEN); - bcopy(mac, dr->dt.dl.dr_mac, ZIO_DATA_MAC_LEN); + memcpy(dr->dt.dl.dr_salt, salt, ZIO_DATA_SALT_LEN); + memcpy(dr->dt.dl.dr_iv, iv, ZIO_DATA_IV_LEN); + memcpy(dr->dt.dl.dr_mac, mac, ZIO_DATA_MAC_LEN); } static void @@ -2671,39 +2824,50 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx) dbuf_dirty_record_t *dr; dr = list_head(&db->db_dirty_records); + ASSERT3P(dr, !=, NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; dl->dr_overridden_by = *bp; dl->dr_override_state = DR_OVERRIDDEN; - dl->dr_overridden_by.blk_birth = dr->dr_txg; + BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg); } -/* ARGSUSED */ -void -dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx) +boolean_t +dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed) { + (void) tx; dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; - dbuf_states_t old_state; mutex_enter(&db->db_mtx); DBUF_VERIFY(db); - old_state = db->db_state; - db->db_state = DB_CACHED; - if (old_state == DB_FILL) { + if (db->db_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ - bzero(db->db.db_data, db->db.db_size); + memset(db->db.db_data, 0, db->db.db_size); db->db_freed_in_flight = FALSE; + db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "fill done handling freed in flight"); + failed = B_FALSE; + } else if (failed) { + VERIFY(!dbuf_undirty(db, tx)); + arc_buf_destroy(db->db_buf, db); + db->db_buf = NULL; + dbuf_clear_data(db); + DTRACE_SET_STATE(db, "fill failed"); } else { + db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "fill done"); } cv_broadcast(&db->db_changed); + } else { + db->db_state = DB_CACHED; + failed = B_FALSE; } mutex_exit(&db->db_mtx); + return (failed); } void @@ -2732,6 +2896,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, dmu_buf_will_not_fill(dbuf, tx); dr = list_head(&db->db_dirty_records); + ASSERT3P(dr, !=, NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; encode_embedded_bp_compressed(&dl->dr_overridden_by, @@ -2742,7 +2907,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); dl->dr_override_state = DR_OVERRIDDEN; - dl->dr_overridden_by.blk_birth = dr->dr_txg; + BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg); } void @@ -2793,7 +2958,8 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); - ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); + ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL); if (db->db_state == DB_CACHED && zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { @@ -2806,7 +2972,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) ASSERT(!arc_is_encrypted(buf)); mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); - bcopy(buf->b_data, 
db->db.db_data, db->db.db_size); + memcpy(db->db.db_data, buf->b_data, db->db.db_size); arc_buf_destroy(buf, db); return; } @@ -2830,6 +2996,15 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) arc_buf_destroy(db->db_buf, db); } db->db_buf = NULL; + } else if (db->db_state == DB_NOFILL) { + /* + * We will be completely replacing the cloned block. In case + * it was cloned in this transaction group, let's undirty the + * pending clone and mark the block as uncached. This will be + * as if the clone was never done. + */ + VERIFY(!dbuf_undirty(db, tx)); + db->db_state = DB_UNCACHED; } ASSERT(db->db_buf == NULL); dbuf_set_data(db, buf); @@ -2837,7 +3012,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) DTRACE_SET_STATE(db, "filling assigned arcbuf"); mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); - dmu_buf_fill_done(&db->db, tx); + dmu_buf_fill_done(&db->db, tx, B_FALSE); } void @@ -2873,6 +3048,8 @@ dbuf_destroy(dmu_buf_impl_t *db) db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); + + ASSERT0(dmu_buf_user_size(&db->db)); (void) zfs_refcount_remove_many( &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); @@ -2945,9 +3122,6 @@ dbuf_destroy(dmu_buf_impl_t *db) ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); ASSERT(!multilist_link_active(&db->db_cache_link)); - kmem_cache_free(dbuf_kmem_cache, db); - arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); - /* * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. @@ -2956,6 +3130,9 @@ dbuf_destroy(dmu_buf_impl_t *db) mutex_enter(&parent->db_mtx); dbuf_rele_and_unlock(parent, db, B_TRUE); } + + kmem_cache_free(dbuf_kmem_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); } /* @@ -3057,7 +3234,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, static dmu_buf_impl_t * dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, - dmu_buf_impl_t *parent, blkptr_t *blkptr) + dmu_buf_impl_t *parent, blkptr_t *blkptr, uint64_t hash) { objset_t *os = dn->dn_objset; dmu_buf_impl_t *db, *odb; @@ -3078,6 +3255,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; db->db_blkptr = blkptr; + db->db_hash = hash; db->db_user = NULL; db->db_user_immediate_evict = FALSE; @@ -3161,6 +3339,7 @@ dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid, err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2); if (err == 0) { + ASSERT3P(bp2, !=, NULL); *bp = *bp2; if (dbp != NULL) dbuf_rele(dbp, NULL); @@ -3189,8 +3368,10 @@ typedef struct dbuf_prefetch_arg { static void dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done) { - if (dpa->dpa_cb != NULL) - dpa->dpa_cb(dpa->dpa_arg, io_done); + if (dpa->dpa_cb != NULL) { + dpa->dpa_cb(dpa->dpa_arg, dpa->dpa_zb.zb_level, + dpa->dpa_zb.zb_blkid, io_done); + } kmem_free(dpa, sizeof (*dpa)); } @@ -3198,11 +3379,13 @@ static void dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *iobp, arc_buf_t *abuf, void *private) { + (void) zio, (void) zb, (void) iobp; dbuf_prefetch_arg_t *dpa = private; - dbuf_prefetch_fini(dpa, B_TRUE); if (abuf != NULL) arc_buf_destroy(abuf, private); + + dbuf_prefetch_fini(dpa, B_TRUE); } /* @@ -3246,6 +3429,7 @@ static void dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *iobp, arc_buf_t *abuf, void *private) { + (void) zb, (void) 
iobp; dbuf_prefetch_arg_t *dpa = private; ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); @@ -3253,7 +3437,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, if (abuf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); - return (dbuf_prefetch_fini(dpa, B_TRUE)); + dbuf_prefetch_fini(dpa, B_TRUE); + return; } ASSERT(zio == NULL || zio->io_error == 0); @@ -3286,7 +3471,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, dpa->dpa_curlevel, curblkid, FTAG); if (db == NULL) { arc_buf_destroy(abuf, private); - return (dbuf_prefetch_fini(dpa, B_TRUE)); + dbuf_prefetch_fini(dpa, B_TRUE); + return; } (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); @@ -3299,12 +3485,14 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, blkptr_t *bp = ((blkptr_t *)abuf->b_data) + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); - ASSERT(!BP_IS_REDACTED(bp) || + ASSERT(!BP_IS_REDACTED(bp) || (dpa->dpa_dnode && dsl_dataset_feature_is_active( dpa->dpa_dnode->dn_objset->os_dsl_dataset, - SPA_FEATURE_REDACTED_DATASETS)); + SPA_FEATURE_REDACTED_DATASETS))); if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) { + arc_buf_destroy(abuf, private); dbuf_prefetch_fini(dpa, B_TRUE); + return; } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); dbuf_issue_final_prefetch(dpa, bp); @@ -3322,7 +3510,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, - bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, + bp, dbuf_prefetch_indirect_done, dpa, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } @@ -3368,7 +3557,7 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, goto no_issue; dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, - level, blkid); + level, blkid, NULL); if (db != NULL) { mutex_exit(&db->db_mtx); /* @@ -3432,8 +3621,9 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, dpa->dpa_cb = cb; dpa->dpa_arg = arg; - /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (dnode_level_is_l2cacheable(&bp, dn, level)) + if (!DNODE_LEVEL_IS_CACHEABLE(dn, level)) + dpa->dpa_aflags |= ARC_FLAG_UNCACHED; + else if (dnode_level_is_l2cacheable(&bp, dn, level)) dpa->dpa_aflags |= ARC_FLAG_L2CACHE; /* @@ -3457,7 +3647,8 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, SET_BOOKMARK(&zb, ds != NULL ? 
ds->ds_object : DMU_META_OBJSET, dn->dn_object, curlevel, curblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, - &bp, dbuf_prefetch_indirect_done, dpa, prio, + &bp, dbuf_prefetch_indirect_done, dpa, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } @@ -3469,7 +3660,7 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, return (1); no_issue: if (cb != NULL) - cb(arg, B_FALSE); + cb(arg, level, blkid, B_FALSE); return (0); } @@ -3518,7 +3709,7 @@ dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db) } rw_enter(&db->db_rwlock, RW_WRITER); - bcopy(data->b_data, db->db.db_data, arc_buf_size(data)); + memcpy(db->db.db_data, data->b_data, arc_buf_size(data)); rw_exit(&db->db_rwlock); } @@ -3529,9 +3720,10 @@ dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db) int dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp) + const void *tag, dmu_buf_impl_t **dbp) { dmu_buf_impl_t *db, *parent = NULL; + uint64_t hv; /* If the pool has been created, verify the tx_sync_lock is not held */ spa_t *spa = dn->dn_objset->os_spa; @@ -3547,7 +3739,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, *dbp = NULL; /* dbuf_find() returns with db_mtx held */ - db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); + db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv); if (db == NULL) { blkptr_t *bp = NULL; @@ -3569,7 +3761,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, } if (err && err != ENOENT) return (err); - db = dbuf_create(dn, level, blkid, parent, bp); + db = dbuf_create(dn, level, blkid, parent, bp, hv); } if (fail_uncached && db->db_state != DB_CACHED) { @@ -3593,8 +3785,10 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, dn->dn_object != DMU_META_DNODE_OBJECT && db->db_state == DB_CACHED && db->db_data_pending) { dbuf_dirty_record_t *dr = db->db_data_pending; - if (dr->dt.dl.dr_data == db->db_buf) + if (dr->dt.dl.dr_data == db->db_buf) { + ASSERT3P(db->db_buf, !=, NULL); dbuf_hold_copy(dn, db); + } } if (multilist_link_active(&db->db_cache_link)) { @@ -3603,9 +3797,14 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); + + uint64_t size = db->db.db_size; + uint64_t usize = dmu_buf_user_size(&db->db); (void) zfs_refcount_remove_many( - &dbuf_caches[db->db_caching_status].size, - db->db.db_size, db); + &dbuf_caches[db->db_caching_status].size, size, db); + (void) zfs_refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, usize, + db->db_user); if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMPDOWN(metadata_cache_count); @@ -3613,7 +3812,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); + size + usize); } db->db_caching_status = DB_NO_CACHE; } @@ -3634,13 +3833,13 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, } dmu_buf_impl_t * -dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) +dbuf_hold(dnode_t *dn, uint64_t blkid, const void *tag) { return (dbuf_hold_level(dn, 0, blkid, tag)); } dmu_buf_impl_t * -dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) +dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, const void *tag) { dmu_buf_impl_t *db; int err = dbuf_hold_impl(dn, level, 
blkid, FALSE, FALSE, tag, &db); @@ -3653,7 +3852,8 @@ dbuf_create_bonus(dnode_t *dn) ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); - dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); + dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL, + dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID)); } int @@ -3681,7 +3881,7 @@ dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) #pragma weak dmu_buf_add_ref = dbuf_add_ref void -dbuf_add_ref(dmu_buf_impl_t *db, void *tag) +dbuf_add_ref(dmu_buf_impl_t *db, const void *tag) { int64_t holds = zfs_refcount_add(&db->db_holds, tag); VERIFY3S(holds, >, 1); @@ -3690,7 +3890,7 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag) #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref boolean_t dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, - void *tag) + const void *tag) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dmu_buf_impl_t *found_db; @@ -3699,7 +3899,7 @@ dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, if (blkid == DMU_BONUS_BLKID) found_db = dbuf_find_bonus(os, obj); else - found_db = dbuf_find(os, obj, 0, blkid); + found_db = dbuf_find(os, obj, 0, blkid, NULL); if (found_db != NULL) { if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { @@ -3719,14 +3919,14 @@ dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, * dnode's parent dbuf evicting its dnode handles. */ void -dbuf_rele(dmu_buf_impl_t *db, void *tag) +dbuf_rele(dmu_buf_impl_t *db, const void *tag) { mutex_enter(&db->db_mtx); dbuf_rele_and_unlock(db, tag, B_FALSE); } void -dmu_buf_rele(dmu_buf_t *db, void *tag) +dmu_buf_rele(dmu_buf_t *db, const void *tag) { dbuf_rele((dmu_buf_impl_t *)db, tag); } @@ -3745,7 +3945,7 @@ dmu_buf_rele(dmu_buf_t *db, void *tag) * */ void -dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) +dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting) { int64_t holds; uint64_t size; @@ -3819,59 +4019,41 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) * This dbuf has anonymous data associated with it. */ dbuf_destroy(db); - } else { - boolean_t do_arc_evict = B_FALSE; - blkptr_t bp; - spa_t *spa = dmu_objset_spa(db->db_objset); - - if (!DBUF_IS_CACHEABLE(db) && - db->db_blkptr != NULL && - !BP_IS_HOLE(db->db_blkptr) && - !BP_IS_EMBEDDED(db->db_blkptr)) { - do_arc_evict = B_TRUE; - bp = *db->db_blkptr; - } - - if (!DBUF_IS_CACHEABLE(db) || - db->db_pending_evict) { - dbuf_destroy(db); - } else if (!multilist_link_active(&db->db_cache_link)) { - ASSERT3U(db->db_caching_status, ==, - DB_NO_CACHE); - - dbuf_cached_state_t dcs = - dbuf_include_in_metadata_cache(db) ? 
- DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; - db->db_caching_status = dcs; - - multilist_insert(&dbuf_caches[dcs].cache, db); - uint64_t db_size = db->db.db_size; - size = zfs_refcount_add_many( - &dbuf_caches[dcs].size, db_size, db); - uint8_t db_level = db->db_level; - mutex_exit(&db->db_mtx); - - if (dcs == DB_DBUF_METADATA_CACHE) { - DBUF_STAT_BUMP(metadata_cache_count); - DBUF_STAT_MAX( - metadata_cache_size_bytes_max, - size); - } else { - DBUF_STAT_BUMP(cache_count); - DBUF_STAT_MAX(cache_size_bytes_max, - size); - DBUF_STAT_BUMP(cache_levels[db_level]); - DBUF_STAT_INCR( - cache_levels_bytes[db_level], - db_size); - } + } else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) || + db->db_pending_evict) { + dbuf_destroy(db); + } else if (!multilist_link_active(&db->db_cache_link)) { + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); + + dbuf_cached_state_t dcs = + dbuf_include_in_metadata_cache(db) ? + DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; + db->db_caching_status = dcs; + + multilist_insert(&dbuf_caches[dcs].cache, db); + uint64_t db_size = db->db.db_size; + uint64_t dbu_size = dmu_buf_user_size(&db->db); + (void) zfs_refcount_add_many( + &dbuf_caches[dcs].size, db_size, db); + size = zfs_refcount_add_many( + &dbuf_caches[dcs].size, dbu_size, db->db_user); + uint8_t db_level = db->db_level; + mutex_exit(&db->db_mtx); - if (dcs == DB_DBUF_CACHE && !evicting) - dbuf_evict_notify(size); + if (dcs == DB_DBUF_METADATA_CACHE) { + DBUF_STAT_BUMP(metadata_cache_count); + DBUF_STAT_MAX(metadata_cache_size_bytes_max, + size); + } else { + DBUF_STAT_BUMP(cache_count); + DBUF_STAT_MAX(cache_size_bytes_max, size); + DBUF_STAT_BUMP(cache_levels[db_level]); + DBUF_STAT_INCR(cache_levels_bytes[db_level], + db_size + dbu_size); } - if (do_arc_evict) - arc_freed(spa, &bp); + if (dcs == DB_DBUF_CACHE && !evicting) + dbuf_evict_notify(size); } } else { mutex_exit(&db->db_mtx); @@ -3948,8 +4130,37 @@ dmu_buf_get_user(dmu_buf_t *db_fake) return (db->db_user); } +uint64_t +dmu_buf_user_size(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + if (db->db_user == NULL) + return (0); + return (atomic_load_64(&db->db_user->dbu_size)); +} + +void +dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); + ASSERT3P(db->db_user, !=, NULL); + ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd); + atomic_add_64(&db->db_user->dbu_size, nadd); +} + void -dmu_buf_user_evict_wait() +dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); + ASSERT3P(db->db_user, !=, NULL); + ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub); + atomic_sub_64(&db->db_user->dbu_size, nsub); +} + +void +dmu_buf_user_evict_wait(void) { taskq_wait(dbu_evict_taskq); } @@ -3968,21 +4179,6 @@ dmu_buf_get_objset(dmu_buf_t *db) return (dbi->db_objset); } -dnode_t * -dmu_buf_dnode_enter(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_ENTER(dbi); - return (DB_DNODE(dbi)); -} - -void -dmu_buf_dnode_exit(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_EXIT(dbi); -} - static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { @@ -4042,7 +4238,7 @@ dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dnode_t *dn = dr->dr_dnode; ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); - 
bcopy(data, DN_BONUS(dn->dn_phys), DN_MAX_BONUS_LEN(dn->dn_phys)); + memcpy(DN_BONUS(dn->dn_phys), data, DN_MAX_BONUS_LEN(dn->dn_phys)); dbuf_sync_leaf_verify_bonus_dnode(dr); @@ -4244,22 +4440,6 @@ dbuf_lightweight_ready(zio_t *zio) } static void -dbuf_lightweight_physdone(zio_t *zio) -{ - dbuf_dirty_record_t *dr = zio->io_private; - dsl_pool_t *dp = spa_get_dsl(zio->io_spa); - ASSERT3U(dr->dr_txg, ==, zio->io_txg); - - /* - * The callback will be called io_phys_children times. Retire one - * portion of our dirty space each time we are called. Any rounding - * error will be cleaned up by dbuf_lightweight_done(). - */ - int delta = dr->dr_accounted / zio->io_phys_children; - dsl_pool_undirty_space(dp, delta, zio->io_txg); -} - -static void dbuf_lightweight_done(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; @@ -4277,16 +4457,8 @@ dbuf_lightweight_done(zio_t *zio) dsl_dataset_block_born(ds, zio->io_bp, tx); } - /* - * See comment in dbuf_write_done(). - */ - if (zio->io_phys_children == 0) { - dsl_pool_undirty_space(dmu_objset_pool(os), - dr->dr_accounted, zio->io_txg); - } else { - dsl_pool_undirty_space(dmu_objset_pool(os), - dr->dr_accounted % zio->io_phys_children, zio->io_txg); - } + dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted, + zio->io_txg); abd_free(dr->dt.dll.dr_abd); kmem_free(dr, sizeof (*dr)); @@ -4320,8 +4492,7 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd, dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd), &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL, - dbuf_lightweight_physdone, dbuf_lightweight_done, dr, - ZIO_PRIORITY_ASYNC_WRITE, + dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb); zio_nowait(dr->dr_zio); @@ -4356,6 +4527,15 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); + } else if (db->db_state == DB_READ) { + /* + * This buffer has a clone we need to write, and an in-flight + * read on the BP we're about to clone. Its safe to issue the + * write here because the read has already been issued and the + * contents won't change. 
+ */ + ASSERT(dr->dt.dl.dr_brtwrite && + dr->dt.dl.dr_override_state == DR_OVERRIDDEN); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } @@ -4412,7 +4592,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); cv_wait(&db->db_changed, &db->db_mtx); - ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } /* @@ -4422,11 +4601,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT) dbuf_prepare_encrypted_dnode_leaf(dr); - if (db->db_state != DB_NOFILL && + if (*datap != NULL && *datap == db->db_buf && dn->dn_object != DMU_META_DNODE_OBJECT && zfs_refcount_count(&db->db_holds) > 1 && - dr->dt.dl.dr_override_state != DR_OVERRIDDEN && - *datap == db->db_buf) { + dr->dt.dl.dr_override_state != DR_OVERRIDDEN) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), @@ -4462,7 +4640,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } else { *datap = arc_alloc_buf(os->os_spa, db, type, psize); } - bcopy(db->db.db_data, (*datap)->b_data, psize); + memcpy((*datap)->b_data, db->db.db_data, psize); } db->db_data_pending = dr; @@ -4478,6 +4656,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } } +/* + * Syncs out a range of dirty records for indirect or leaf dbufs. May be + * called recursively from dbuf_sync_indirect(). + */ void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) { @@ -4512,10 +4694,10 @@ dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) } } -/* ARGSUSED */ static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { + (void) buf; dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp = zio->io_bp; @@ -4534,7 +4716,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; - if (bp->blk_birth != 0) { + if (BP_GET_LOGICAL_BIRTH(bp) != 0) { ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && @@ -4571,6 +4753,20 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) i += DNODE_MIN_SIZE; if (dnp->dn_type != DMU_OT_NONE) { fill++; + for (int j = 0; j < dnp->dn_nblkptr; + j++) { + (void) zfs_blkptr_verify(spa, + &dnp->dn_blkptr[j], + BLK_CONFIG_SKIP, + BLK_VERIFY_HALT); + } + if (dnp->dn_flags & + DNODE_FLAG_SPILL_BLKPTR) { + (void) zfs_blkptr_verify(spa, + DN_SPILL_BLKPTR(dnp), + BLK_CONFIG_SKIP, + BLK_VERIFY_HALT); + } i += dnp->dn_extra_slots * DNODE_MIN_SIZE; } @@ -4588,6 +4784,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; + (void) zfs_blkptr_verify(spa, ibp, + BLK_CONFIG_SKIP, BLK_VERIFY_HALT); fill += BP_GET_FILL(ibp); } } @@ -4603,7 +4801,6 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) dmu_buf_unlock_parent(db, dblt, FTAG); } -/* ARGSUSED */ /* * This function gets called just prior to running through the compression * stage of the zio pipeline. If we're an indirect block comprised of only @@ -4614,6 +4811,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) static void dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { + (void) zio, (void) buf; dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp; @@ -4642,47 +4840,16 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) * zero out. 
*/ rw_enter(&db->db_rwlock, RW_WRITER); - bzero(db->db.db_data, db->db.db_size); + memset(db->db.db_data, 0, db->db.db_size); rw_exit(&db->db_rwlock); } DB_DNODE_EXIT(db); } -/* - * The SPA will call this callback several times for each zio - once - * for every physical child i/o (zio->io_phys_children times). This - * allows the DMU to monitor the progress of each logical i/o. For example, - * there may be 2 copies of an indirect block, or many fragments of a RAID-Z - * block. There may be a long delay before all copies/fragments are completed, - * so this callback allows us to retire dirty space gradually, as the physical - * i/os complete. - */ -/* ARGSUSED */ -static void -dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) -{ - dmu_buf_impl_t *db = arg; - objset_t *os = db->db_objset; - dsl_pool_t *dp = dmu_objset_pool(os); - dbuf_dirty_record_t *dr; - int delta = 0; - - dr = db->db_data_pending; - ASSERT3U(dr->dr_txg, ==, zio->io_txg); - - /* - * The callback will be called io_phys_children times. Retire one - * portion of our dirty space each time we are called. Any rounding - * error will be cleaned up by dbuf_write_done(). - */ - delta = dr->dr_accounted / zio->io_phys_children; - dsl_pool_undirty_space(dp, delta, zio->io_txg); -} - -/* ARGSUSED */ static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { + (void) buf; dmu_buf_impl_t *db = vdb; blkptr_t *bp_orig = &zio->io_bp_orig; blkptr_t *bp = db->db_blkptr; @@ -4726,9 +4893,9 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - if (db->db_state != DB_NOFILL) { - if (dr->dt.dl.dr_data != db->db_buf) - arc_buf_destroy(dr->dt.dl.dr_data, db); + if (dr->dt.dl.dr_data != NULL && + dr->dt.dl.dr_data != db->db_buf) { + arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { ASSERT(list_head(&dr->dt.di.dr_children) == NULL); @@ -4751,27 +4918,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) db->db_data_pending = NULL; dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); - /* - * If we didn't do a physical write in this ZIO and we - * still ended up here, it means that the space of the - * dbuf that we just released (and undirtied) above hasn't - * been marked as undirtied in the pool's accounting. - * - * Thus, we undirty that space in the pool's view of the - * world here. For physical writes this type of update - * happens in dbuf_write_physdone(). - * - * If we did a physical write, cleanup any rounding errors - * that came up due to writing multiple copies of a block - * on disk [see dbuf_write_physdone()]. 
- */ - if (zio->io_phys_children == 0) { - dsl_pool_undirty_space(dmu_objset_pool(os), - dr->dr_accounted, zio->io_txg); - } else { - dsl_pool_undirty_space(dmu_objset_pool(os), - dr->dr_accounted % zio->io_phys_children, zio->io_txg); - } + dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted, + zio->io_txg); kmem_free(dr, sizeof (dbuf_dirty_record_t)); } @@ -4853,7 +5001,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); drica.drica_os = dn->dn_objset; - drica.drica_blk_birth = bp->blk_birth; + drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp); drica.drica_tx = tx; if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, &drica)) { @@ -4868,7 +5016,8 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) if (dn->dn_objset != spa_meta_objset(spa)) { dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset); if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && - bp->blk_birth > ds->ds_dir->dd_origin_txg) { + BP_GET_LOGICAL_BIRTH(bp) > + ds->ds_dir->dd_origin_txg) { ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, @@ -4928,7 +5077,10 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) } -/* Issue I/O to commit a dirty buffer to disk. */ +/* + * Populate dr->dr_zio with a zio to commit a dirty buffer to disk. + * Caller is responsible for issuing the zio_[no]wait(dr->dr_zio). + */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { @@ -4946,21 +5098,18 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) os = dn->dn_objset; - if (db->db_state != DB_NOFILL) { - if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { - /* - * Private object buffers are released here rather - * than in dbuf_dirty() since they are only modified - * in the syncing context and we don't want the - * overhead of making multiple copies of the data. - */ - if (BP_IS_HOLE(db->db_blkptr)) { - arc_buf_thaw(data); - } else { - dbuf_release_bp(db); - } - dbuf_remap(dn, db, tx); - } + if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { + /* + * Private object buffers are released here rather than in + * dbuf_dirty() since they are only modified in the syncing + * context and we don't want the overhead of making multiple + * copies of the data. + */ + if (BP_IS_HOLE(db->db_blkptr)) + arc_buf_thaw(data); + else + dbuf_release_bp(db); + dbuf_remap(dn, db, tx); } if (parent != dn->dn_dbuf) { @@ -4987,7 +5136,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) } ASSERT(db->db_level == 0 || data == db->db_buf); - ASSERT3U(db->db_blkptr->blk_birth, <=, txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg); ASSERT(pio); SET_BOOKMARK(&zb, os->os_dsl_dataset ? @@ -4996,7 +5145,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) if (db->db_blkid == DMU_SPILL_BLKID) wp_flag = WP_SPILL; - wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; + wp_flag |= (data == NULL) ? 
WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); @@ -5019,20 +5168,21 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, &zp, - dbuf_write_override_ready, NULL, NULL, + dbuf_write_override_ready, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, - dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); + dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite, + dr->dt.dl.dr_brtwrite); mutex_exit(&db->db_mtx); - } else if (db->db_state == DB_NOFILL) { + } else if (data == NULL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, - dbuf_write_nofill_ready, NULL, NULL, + dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); @@ -5049,11 +5199,10 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) children_ready_cb = dbuf_write_children_ready; dr->dr_zio = arc_write(pio, os->os_spa, txg, - &dr->dr_bp_copy, data, dbuf_is_l2cacheable(db), - &zp, dbuf_write_ready, - children_ready_cb, dbuf_write_physdone, - dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_MUSTSUCCEED, &zb); + &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db), + dbuf_is_l2cacheable(db), &zp, dbuf_write_ready, + children_ready_cb, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } } @@ -5071,6 +5220,7 @@ EXPORT_SYMBOL(dbuf_dirty); EXPORT_SYMBOL(dmu_buf_set_crypt_params); EXPORT_SYMBOL(dmu_buf_will_dirty); EXPORT_SYMBOL(dmu_buf_is_dirty); +EXPORT_SYMBOL(dmu_buf_will_clone); EXPORT_SYMBOL(dmu_buf_will_not_fill); EXPORT_SYMBOL(dmu_buf_will_fill); EXPORT_SYMBOL(dmu_buf_fill_done); @@ -5093,25 +5243,23 @@ EXPORT_SYMBOL(dmu_buf_set_user_ie); EXPORT_SYMBOL(dmu_buf_get_user); EXPORT_SYMBOL(dmu_buf_get_blkptr); -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW, "Maximum size in bytes of the dbuf cache."); ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW, - "Percentage over dbuf_cache_max_bytes when dbufs must be evicted " - "directly."); + "Percentage over dbuf_cache_max_bytes for direct dbuf eviction."); ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW, - "Percentage below dbuf_cache_max_bytes when the evict thread stops " - "evicting dbufs."); + "Percentage below dbuf_cache_max_bytes when dbuf eviction stops."); + +ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW, + "Maximum size in bytes of dbuf metadata cache."); -ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, ULONG, ZMOD_RW, - "Maximum size in bytes of the dbuf metadata cache."); +ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW, + "Set size of dbuf cache to log2 fraction of arc size."); -ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, INT, ZMOD_RW, - "Set the size of the dbuf cache to a log2 fraction of arc size."); +ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW, + "Set size of dbuf metadata cache to log2 fraction of arc size."); -ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW, 
- "Set the size of the dbuf metadata cache to a log2 fraction of arc " - "size."); -/* END CSTYLED */ +ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD, + "Set size of dbuf cache mutex array as log2 shift."); diff --git a/sys/contrib/openzfs/module/zfs/dbuf_stats.c b/sys/contrib/openzfs/module/zfs/dbuf_stats.c index 12bb568a08cc..ccee8997e10e 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf_stats.c +++ b/sys/contrib/openzfs/module/zfs/dbuf_stats.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -46,14 +46,14 @@ static int dbuf_stats_hash_table_headers(char *buf, size_t size) { (void) snprintf(buf, size, - "%-96s | %-119s | %s\n" - "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-5s %-5s %-7s %3s | " + "%-105s | %-119s | %s\n" + "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-8s %-5s %-5s %-7s %3s | " "%-5s %-5s %-9s %-6s %-8s %-12s " "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-6s | " "%-6s %-6s %-8s %-8s %-6s %-6s %-6s %-8s %-8s\n", "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level", - "blkid", "offset", "dbsize", "meta", "state", "dbholds", "dbc", - "list", "atype", "flags", "count", "asize", "access", + "blkid", "offset", "dbsize", "usize", "meta", "state", "dbholds", + "dbc", "list", "atype", "flags", "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"); @@ -75,8 +75,8 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) __dmu_object_info_from_dnode(dn, &doi); nwritten = snprintf(buf, size, - "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-5d %-5d " - "%-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu " + "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-8llu " + "%-5d %-5d %-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu " "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-6lu | " "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-6lu %-8llu %-8llu\n", /* dmu_buf_impl_t */ @@ -87,6 +87,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) (longlong_t)db->db_blkid, (u_longlong_t)db->db.db_offset, (u_longlong_t)db->db.db_size, + (u_longlong_t)dmu_buf_user_size(&db->db), !!dbuf_is_metadata(db), db->db_state, (ulong_t)zfs_refcount_count(&db->db_holds), @@ -226,7 +227,5 @@ dbuf_stats_destroy(void) dbuf_stats_hash_table_destroy(); } -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, dbuf_state_index, INT, ZMOD_RW, "Calculate arc header index"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/ddt.c b/sys/contrib/openzfs/module/zfs/ddt.c index fe5a188f4da1..4c53cb0a2f9b 100644 --- a/sys/contrib/openzfs/module/zfs/ddt.c +++ b/sys/contrib/openzfs/module/zfs/ddt.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -22,6 +22,8 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 
+ * Copyright (c) 2022 by Pawel Jakub Dawidek + * Copyright (c) 2023, Klara Inc. */ #include <sys/zfs_context.h> @@ -29,15 +31,119 @@ #include <sys/spa_impl.h> #include <sys/zio.h> #include <sys/ddt.h> +#include <sys/ddt_impl.h> #include <sys/zap.h> #include <sys/dmu_tx.h> #include <sys/arc.h> #include <sys/dsl_pool.h> #include <sys/zio_checksum.h> -#include <sys/zio_compress.h> #include <sys/dsl_scan.h> #include <sys/abd.h> +/* + * # DDT: Deduplication tables + * + * The dedup subsystem provides block-level deduplication. When enabled, blocks + * to be written will have the dedup (D) bit set, which causes them to be + * tracked in a "dedup table", or DDT. If a block has been seen before (exists + * in the DDT), instead of being written, it will instead be made to reference + * the existing on-disk data, and a refcount bumped in the DDT instead. + * + * ## Dedup tables and entries + * + * Conceptually, a DDT is a dictionary or map. Each entry has a "key" + * (ddt_key_t) made up a block's checksum and certian properties, and a "value" + * (one or more ddt_phys_t) containing valid DVAs for the block's data, birth + * time and refcount. Together these are enough to track references to a + * specific block, to build a valid block pointer to reference that block (for + * freeing, scrubbing, etc), and to fill a new block pointer with the missing + * pieces to make it seem like it was written. + * + * There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[]. + * Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk + * object data formats, each with their own implementations) and "classes" + * (ddt_class_t, instance of a storage type object, for entries with a specific + * characteristic). An entry (key) will only ever exist on one of these objects + * at any given time, but may be moved from one to another if their type or + * class changes. + * + * The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block + * is to be written, before DVAs have been allocated, ddt_lookup() is called to + * see if the block has been seen before. If its not found, the write proceeds + * as normal, and after it succeeds, a new entry is created. If it is found, we + * fill the BP with the DVAs from the entry, increment the refcount and cause + * the write IO to return immediately. + * + * Each ddt_phys_t slot in the entry represents a separate dedup block for the + * same content/checksum. The slot is selected based on the zp_copies parameter + * the block is written with, that is, the number of DVAs in the block. The + * "ditto" slot (DDT_PHYS_DITTO) used to be used for now-removed "dedupditto" + * feature. These are no longer written, and will be freed if encountered on + * old pools. + * + * ## Lifetime of an entry + * + * A DDT can be enormous, and typically is not held in memory all at once. + * Instead, the changes to an entry are tracked in memory, and written down to + * disk at the end of each txg. + * + * A "live" in-memory entry (ddt_entry_t) is a node on the live tree + * (ddt_tree). At the start of a txg, ddt_tree is empty. When an entry is + * required for IO, ddt_lookup() is called. If an entry already exists on + * ddt_tree, it is returned. Otherwise, a new one is created, and the + * type/class objects for the DDT are searched for that key. If its found, its + * value is copied into the live entry. If not, an empty entry is created. 
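/*
 * [Illustrative sketch added by the editor, not part of this commit: a
 * minimal, hypothetical caller showing the lookup flow described in the
 * paragraph above. It assumes the ddt_select()/ddt_enter()/ddt_lookup()/
 * ddt_phys_select()/ddt_phys_addref()/ddt_exit() interfaces that appear
 * elsewhere in this diff; the function name is invented for illustration.]
 */
static void
example_ddt_bump_ref(spa_t *spa, const blkptr_t *bp)
{
	ddt_t *ddt = ddt_select(spa, bp);	/* per-checksum table */

	ddt_enter(ddt);
	/*
	 * Find or create the live entry for this block's key; the call may
	 * wait while another thread finishes loading the same entry.
	 */
	ddt_entry_t *dde = ddt_lookup(ddt, bp, B_TRUE);
	if (dde != NULL) {
		/* Pick the phys slot whose DVAs match this block pointer. */
		ddt_phys_t *ddp = ddt_phys_select(dde, bp);
		if (ddp != NULL)
			ddt_phys_addref(ddp);	/* take one more reference */
	}
	ddt_exit(ddt);
}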
+ * + * The live entry will be modified during the txg, usually by modifying the + * refcount, but sometimes by adding or updating DVAs. At the end of the txg + * (during spa_sync()), type and class are recalculated for entry (see + * ddt_sync_entry()), and the entry is written to the appropriate storage + * object and (if necessary), removed from an old one. ddt_tree is cleared and + * the next txg can start. + * + * ## Repair IO + * + * If a read on a dedup block fails, but there are other copies of the block in + * the other ddt_phys_t slots, reads will be issued for those instead + * (zio_ddt_read_start()). If one of those succeeds, the read is returned to + * the caller, and a copy is stashed on the entry's dde_repair_abd. + * + * During the end-of-txg sync, any entries with a dde_repair_abd get a + * "rewrite" write issued for the original block pointer, with the data read + * from the alternate block. If the block is actually damaged, this will invoke + * the pool's "self-healing" mechanism, and repair the block. + * + * ## Scanning (scrub/resilver) + * + * If dedup is active, the scrub machinery will walk the dedup table first, and + * scrub all blocks with refcnt > 1 first. After that it will move on to the + * regular top-down scrub, and exclude the refcnt > 1 blocks when it sees them. + * In this way, heavily deduplicated blocks are only scrubbed once. See the + * commentary on dsl_scan_ddt() for more details. + * + * Walking the DDT is done via ddt_walk(). The current position is stored in a + * ddt_bookmark_t, which represents a stable position in the storage object. + * This bookmark is stored by the scan machinery, and must reference the same + * position on the object even if the object changes, the pool is exported, or + * OpenZFS is upgraded. + * + * ## Interaction with block cloning + * + * If block cloning and dedup are both enabled on a pool, BRT will look for the + * dedup bit on an incoming block pointer. If set, it will call into the DDT + * (ddt_addref()) to add a reference to the block, instead of adding a + * reference to the BRT. See brt_pending_apply(). + */ + +/* + * These are the only checksums valid for dedup. 
They must match the list + * from dedup_table in zfs_prop.c + */ +#define DDT_CHECKSUM_VALID(c) \ + (c == ZIO_CHECKSUM_SHA256 || c == ZIO_CHECKSUM_SHA512 || \ + c == ZIO_CHECKSUM_SKEIN || c == ZIO_CHECKSUM_EDONR || \ + c == ZIO_CHECKSUM_BLAKE3) + static kmem_cache_t *ddt_cache; static kmem_cache_t *ddt_entry_cache; @@ -46,18 +152,18 @@ static kmem_cache_t *ddt_entry_cache; */ int zfs_dedup_prefetch = 0; -static const ddt_ops_t *ddt_ops[DDT_TYPES] = { +static const ddt_ops_t *const ddt_ops[DDT_TYPES] = { &ddt_zap_ops, }; -static const char *ddt_class_name[DDT_CLASSES] = { +static const char *const ddt_class_name[DDT_CLASSES] = { "ditto", "duplicate", "unique", }; static void -ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) { spa_t *spa = ddt->ddt_spa; @@ -69,20 +175,20 @@ ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt_object_name(ddt, type, class, name); - ASSERT(*objectp == 0); - VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); - ASSERT(*objectp != 0); + ASSERT3U(*objectp, ==, 0); + VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash)); + ASSERT3U(*objectp, !=, 0); - VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, - sizeof (uint64_t), 1, objectp, tx) == 0); + VERIFY0(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, objectp, tx)); - VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, + VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), - &ddt->ddt_histogram[type][class], tx) == 0); + &ddt->ddt_histogram[type][class], tx)); } static void -ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) { spa_t *spa = ddt->ddt_spa; @@ -93,19 +199,20 @@ ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt_object_name(ddt, type, class, name); - ASSERT(*objectp != 0); + ASSERT3U(*objectp, !=, 0); ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); - VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0); - VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); - VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); - VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); - bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t)); + VERIFY0(ddt_object_count(ddt, type, class, &count)); + VERIFY0(count); + VERIFY0(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx)); + VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx)); + memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t)); *objectp = 0; } static int -ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; dmu_object_info_t doi; @@ -145,7 +252,7 @@ ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) } static void -ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_sync(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) { ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; @@ -155,75 +262,95 @@ ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt_object_name(ddt, type, class, name); - VERIFY(zap_update(ddt->ddt_os, 
ddt->ddt_spa->spa_ddt_stat_object, name, + VERIFY0(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), - &ddt->ddt_histogram[type][class], tx) == 0); + &ddt->ddt_histogram[type][class], tx)); /* * Cache DDT statistics; this is the only time they'll change. */ - VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); - VERIFY(ddt_object_count(ddt, type, class, &count) == 0); + VERIFY0(ddt_object_info(ddt, type, class, &doi)); + VERIFY0(ddt_object_count(ddt, type, class, &count)); ddo->ddo_count = count; ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; } +static boolean_t +ddt_object_exists(ddt_t *ddt, ddt_type_t type, ddt_class_t class) +{ + return (!!ddt->ddt_object[type][class]); +} + static int -ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ddt_entry_t *dde) { if (!ddt_object_exists(ddt, type, class)) return (SET_ERROR(ENOENT)); return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, - ddt->ddt_object[type][class], dde)); + ddt->ddt_object[type][class], &dde->dde_key, + dde->dde_phys, sizeof (dde->dde_phys))); +} + +static int +ddt_object_contains(ddt_t *ddt, ddt_type_t type, ddt_class_t class, + const ddt_key_t *ddk) +{ + if (!ddt_object_exists(ddt, type, class)) + return (SET_ERROR(ENOENT)); + + return (ddt_ops[type]->ddt_op_contains(ddt->ddt_os, + ddt->ddt_object[type][class], ddk)); } static void -ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - ddt_entry_t *dde) +ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, ddt_class_t class, + const ddt_key_t *ddk) { if (!ddt_object_exists(ddt, type, class)) return; ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, - ddt->ddt_object[type][class], dde); + ddt->ddt_object[type][class], ddk); } -int -ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +static int +ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ddt_entry_t *dde, dmu_tx_t *tx) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, - ddt->ddt_object[type][class], dde, tx)); + ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys, + sizeof (dde->dde_phys), tx)); } static int -ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - ddt_entry_t *dde, dmu_tx_t *tx) +ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class, + const ddt_key_t *ddk, dmu_tx_t *tx) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, - ddt->ddt_object[type][class], dde, tx)); + ddt->ddt_object[type][class], ddk, tx)); } int -ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class, uint64_t *walk, ddt_entry_t *dde) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, - ddt->ddt_object[type][class], dde, walk)); + ddt->ddt_object[type][class], walk, &dde->dde_key, + dde->dde_phys, sizeof (dde->dde_phys))); } int -ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t class, uint64_t *count) { ASSERT(ddt_object_exists(ddt, type, class)); @@ -233,7 +360,7 @@ ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class, } int -ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 
+ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_object_info_t *doi) { if (!ddt_object_exists(ddt, type, class)) @@ -243,14 +370,8 @@ ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, doi)); } -boolean_t -ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) -{ - return (!!ddt->ddt_object[type][class]); -} - void -ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class, char *name) { (void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT, @@ -261,7 +382,7 @@ ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) { - ASSERT(txg != 0); + ASSERT3U(txg, !=, 0); for (int d = 0; d < SPA_DVAS_PER_BP; d++) bp->blk_dva[d] = ddp->ddp_dva[d]; @@ -312,17 +433,17 @@ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) { - ASSERT(ddp->ddp_phys_birth == 0); + ASSERT0(ddp->ddp_phys_birth); for (int d = 0; d < SPA_DVAS_PER_BP; d++) ddp->ddp_dva[d] = bp->blk_dva[d]; - ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); + ddp->ddp_phys_birth = BP_GET_BIRTH(bp); } void ddt_phys_clear(ddt_phys_t *ddp) { - bzero(ddp, sizeof (*ddp)); + memset(ddp, 0, sizeof (*ddp)); } void @@ -335,12 +456,12 @@ void ddt_phys_decref(ddt_phys_t *ddp) { if (ddp) { - ASSERT(ddp->ddp_refcnt > 0); + ASSERT3U(ddp->ddp_refcnt, >, 0); ddp->ddp_refcnt--; } } -void +static void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) { blkptr_t blk; @@ -364,7 +485,7 @@ ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && - BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) + BP_GET_BIRTH(bp) == ddp->ddp_phys_birth) return (ddp); } return (NULL); @@ -381,221 +502,10 @@ ddt_phys_total_refcnt(const ddt_entry_t *dde) return (refcnt); } -static void -ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) -{ - spa_t *spa = ddt->ddt_spa; - ddt_phys_t *ddp = dde->dde_phys; - ddt_key_t *ddk = &dde->dde_key; - uint64_t lsize = DDK_GET_LSIZE(ddk); - uint64_t psize = DDK_GET_PSIZE(ddk); - - bzero(dds, sizeof (*dds)); - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - uint64_t dsize = 0; - uint64_t refcnt = ddp->ddp_refcnt; - - if (ddp->ddp_phys_birth == 0) - continue; - - for (int d = 0; d < DDE_GET_NDVAS(dde); d++) - dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); - - dds->dds_blocks += 1; - dds->dds_lsize += lsize; - dds->dds_psize += psize; - dds->dds_dsize += dsize; - - dds->dds_ref_blocks += refcnt; - dds->dds_ref_lsize += lsize * refcnt; - dds->dds_ref_psize += psize * refcnt; - dds->dds_ref_dsize += dsize * refcnt; - } -} - -void -ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) -{ - const uint64_t *s = (const uint64_t *)src; - uint64_t *d = (uint64_t *)dst; - uint64_t *d_end = (uint64_t *)(dst + 1); - - ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ - - for (int i = 0; i < d_end - d; i++) - d[i] += (s[i] ^ neg) - neg; -} - -static void -ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) -{ - ddt_stat_t dds; - ddt_histogram_t *ddh; - int bucket; - - ddt_stat_generate(ddt, dde, &dds); - - bucket = highbit64(dds.dds_ref_blocks) - 1; - ASSERT(bucket >= 0); - - ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; - - ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); -} - -void -ddt_histogram_add(ddt_histogram_t *dst, 
const ddt_histogram_t *src) -{ - for (int h = 0; h < 64; h++) - ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); -} - -void -ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) -{ - bzero(dds, sizeof (*dds)); - - for (int h = 0; h < 64; h++) - ddt_stat_add(dds, &ddh->ddh_stat[h], 0); -} - -boolean_t -ddt_histogram_empty(const ddt_histogram_t *ddh) -{ - const uint64_t *s = (const uint64_t *)ddh; - const uint64_t *s_end = (const uint64_t *)(ddh + 1); - - while (s < s_end) - if (*s++ != 0) - return (B_FALSE); - - return (B_TRUE); -} - -void -ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) -{ - /* Sum the statistics we cached in ddt_object_sync(). */ - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { - ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; - class++) { - ddt_object_t *ddo = - &ddt->ddt_object_stats[type][class]; - ddo_total->ddo_count += ddo->ddo_count; - ddo_total->ddo_dspace += ddo->ddo_dspace; - ddo_total->ddo_mspace += ddo->ddo_mspace; - } - } - } - - /* ... and compute the averages. */ - if (ddo_total->ddo_count != 0) { - ddo_total->ddo_dspace /= ddo_total->ddo_count; - ddo_total->ddo_mspace /= ddo_total->ddo_count; - } -} - -void -ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) -{ - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { - ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES && ddt; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; - class++) { - ddt_histogram_add(ddh, - &ddt->ddt_histogram_cache[type][class]); - } - } - } -} - -void -ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) -{ - ddt_histogram_t *ddh_total; - - ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); - ddt_get_dedup_histogram(spa, ddh_total); - ddt_histogram_stat(dds_total, ddh_total); - kmem_free(ddh_total, sizeof (ddt_histogram_t)); -} - -uint64_t -ddt_get_dedup_dspace(spa_t *spa) -{ - ddt_stat_t dds_total; - - if (spa->spa_dedup_dspace != ~0ULL) - return (spa->spa_dedup_dspace); - - bzero(&dds_total, sizeof (ddt_stat_t)); - - /* Calculate and cache the stats */ - ddt_get_dedup_stats(spa, &dds_total); - spa->spa_dedup_dspace = dds_total.dds_ref_dsize - dds_total.dds_dsize; - return (spa->spa_dedup_dspace); -} - -uint64_t -ddt_get_pool_dedup_ratio(spa_t *spa) -{ - ddt_stat_t dds_total = { 0 }; - - ddt_get_dedup_stats(spa, &dds_total); - if (dds_total.dds_dsize == 0) - return (100); - - return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); -} - -size_t -ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) -{ - uchar_t *version = dst++; - int cpfunc = ZIO_COMPRESS_ZLE; - zio_compress_info_t *ci = &zio_compress_table[cpfunc]; - size_t c_len; - - ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ - - c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); - - if (c_len == s_len) { - cpfunc = ZIO_COMPRESS_OFF; - bcopy(src, dst, s_len); - } - - *version = cpfunc; - if (ZFS_HOST_BYTEORDER) - *version |= DDT_COMPRESS_BYTEORDER_MASK; - - return (c_len + 1); -} - -void -ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) -{ - uchar_t version = *src++; - int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; - zio_compress_info_t *ci = &zio_compress_table[cpfunc]; - - if (ci->ci_decompress != NULL) - (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); - else - bcopy(src, dst, d_len); - - if (((version & 
DDT_COMPRESS_BYTEORDER_MASK) != 0) != - (ZFS_HOST_BYTEORDER != 0)) - byteswap_uint64_array(dst, d_len); -} - ddt_t * ddt_select(spa_t *spa, const blkptr_t *bp) { + ASSERT(DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp))); return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); } @@ -633,7 +543,7 @@ ddt_alloc(const ddt_key_t *ddk) ddt_entry_t *dde; dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP); - bzero(dde, sizeof (ddt_entry_t)); + memset(dde, 0, sizeof (ddt_entry_t)); cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); dde->dde_key = *ddk; @@ -644,10 +554,10 @@ ddt_alloc(const ddt_key_t *ddk) static void ddt_free(ddt_entry_t *dde) { - ASSERT(!dde->dde_loading); + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); for (int p = 0; p < DDT_PHYS_TYPES; p++) - ASSERT(dde->dde_lead_zio[p] == NULL); + ASSERT3P(dde->dde_lead_zio[p], ==, NULL); if (dde->dde_repair_abd != NULL) abd_free(dde->dde_repair_abd); @@ -668,36 +578,48 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde) ddt_entry_t * ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) { - ddt_entry_t *dde, dde_search; - enum ddt_type type; - enum ddt_class class; + ddt_key_t search; + ddt_entry_t *dde; + ddt_type_t type; + ddt_class_t class; avl_index_t where; int error; ASSERT(MUTEX_HELD(&ddt->ddt_lock)); - ddt_key_fill(&dde_search.dde_key, bp); + ddt_key_fill(&search, bp); - dde = avl_find(&ddt->ddt_tree, &dde_search, &where); - if (dde == NULL) { - if (!add) - return (NULL); - dde = ddt_alloc(&dde_search.dde_key); - avl_insert(&ddt->ddt_tree, dde, where); - } + /* Find an existing live entry */ + dde = avl_find(&ddt->ddt_tree, &search, &where); + if (dde != NULL) { + /* Found it. If it's already loaded, we can just return it. */ + if (dde->dde_flags & DDE_FLAG_LOADED) + return (dde); - while (dde->dde_loading) - cv_wait(&dde->dde_cv, &ddt->ddt_lock); + /* Someone else is loading it, wait for it. */ + while (!(dde->dde_flags & DDE_FLAG_LOADED)) + cv_wait(&dde->dde_cv, &ddt->ddt_lock); - if (dde->dde_loaded) return (dde); + } - dde->dde_loading = B_TRUE; + /* Not found. */ + if (!add) + return (NULL); + /* Time to make a new entry. */ + dde = ddt_alloc(&search); + avl_insert(&ddt->ddt_tree, dde, where); + + /* + * ddt_tree is now stable, so unlock and let everyone else keep moving. + * Anyone landing on this entry will find it without DDE_FLAG_LOADED, + * and go to sleep waiting for it above. + */ ddt_exit(ddt); + /* Search all store objects for the entry. */ error = ENOENT; - for (type = 0; type < DDT_TYPES; type++) { for (class = 0; class < DDT_CLASSES; class++) { error = ddt_object_lookup(ddt, type, class, dde); @@ -712,17 +634,16 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) ddt_enter(ddt); - ASSERT(dde->dde_loaded == B_FALSE); - ASSERT(dde->dde_loading == B_TRUE); + ASSERT(!(dde->dde_flags & DDE_FLAG_LOADED)); dde->dde_type = type; /* will be DDT_TYPES if no entry found */ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ - dde->dde_loaded = B_TRUE; - dde->dde_loading = B_FALSE; if (error == 0) ddt_stat_update(ddt, dde, -1ULL); + /* Entry loaded, everyone can proceed now */ + dde->dde_flags |= DDE_FLAG_LOADED; cv_broadcast(&dde->dde_cv); return (dde); @@ -732,7 +653,7 @@ void ddt_prefetch(spa_t *spa, const blkptr_t *bp) { ddt_t *ddt; - ddt_entry_t dde; + ddt_key_t ddk; if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) return; @@ -743,17 +664,18 @@ ddt_prefetch(spa_t *spa, const blkptr_t *bp) * Thus no locking is required as the DDT can't disappear on us. 
*/ ddt = ddt_select(spa, bp); - ddt_key_fill(&dde.dde_key, bp); + ddt_key_fill(&ddk, bp); - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { - ddt_object_prefetch(ddt, type, class, &dde); + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + ddt_object_prefetch(ddt, type, class, &ddk); } } } /* - * Opaque struct used for ddt_key comparison + * Key comparison. Any struct wanting to make use of this function must have + * the key as the first element. */ #define DDT_KEY_CMP_LEN (sizeof (ddt_key_t) / sizeof (uint16_t)) @@ -762,12 +684,10 @@ typedef struct ddt_key_cmp { } ddt_key_cmp_t; int -ddt_entry_compare(const void *x1, const void *x2) +ddt_key_compare(const void *x1, const void *x2) { - const ddt_entry_t *dde1 = x1; - const ddt_entry_t *dde2 = x2; - const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)&dde1->dde_key; - const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)&dde2->dde_key; + const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)x1; + const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)x2; int32_t cmp = 0; for (int i = 0; i < DDT_KEY_CMP_LEN; i++) { @@ -785,12 +705,12 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) ddt_t *ddt; ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP); - bzero(ddt, sizeof (ddt_t)); + memset(ddt, 0, sizeof (ddt_t)); mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&ddt->ddt_tree, ddt_entry_compare, + avl_create(&ddt->ddt_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); - avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, + avl_create(&ddt->ddt_repair_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); ddt->ddt_checksum = c; ddt->ddt_spa = spa; @@ -802,8 +722,8 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) static void ddt_table_free(ddt_t *ddt) { - ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); - ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); + ASSERT0(avl_numnodes(&ddt->ddt_tree)); + ASSERT0(avl_numnodes(&ddt->ddt_repair_tree)); avl_destroy(&ddt->ddt_tree); avl_destroy(&ddt->ddt_repair_tree); mutex_destroy(&ddt->ddt_lock); @@ -815,8 +735,10 @@ ddt_create(spa_t *spa) { spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) - spa->spa_ddt[c] = ddt_table_alloc(spa, c); + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + if (DDT_CHECKSUM_VALID(c)) + spa->spa_ddt[c] = ddt_table_alloc(spa, c); + } } int @@ -834,9 +756,12 @@ ddt_load(spa_t *spa) return (error == ENOENT ? 0 : error); for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + if (!DDT_CHECKSUM_VALID(c)) + continue; + ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { error = ddt_object_load(ddt, type, class); if (error != 0 && error != ENOENT) @@ -847,7 +772,7 @@ ddt_load(spa_t *spa) /* * Seed the cached histograms. 
*/ - bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, + memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); spa->spa_dedup_dspace = ~0ULL; } @@ -867,10 +792,10 @@ ddt_unload(spa_t *spa) } boolean_t -ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) +ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp) { ddt_t *ddt; - ddt_entry_t *dde; + ddt_key_t ddk; if (!BP_GET_DEDUP(bp)) return (B_FALSE); @@ -879,20 +804,16 @@ ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) return (B_TRUE); ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; - dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP); - ddt_key_fill(&(dde->dde_key), bp); + ddt_key_fill(&ddk, bp); - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class <= max_class; class++) { - if (ddt_object_lookup(ddt, type, class, dde) == 0) { - kmem_cache_free(ddt_entry_cache, dde); + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class <= max_class; class++) { + if (ddt_object_contains(ddt, type, class, &ddk) == 0) return (B_TRUE); - } } } - kmem_cache_free(ddt_entry_cache, dde); return (B_FALSE); } @@ -906,8 +827,8 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) dde = ddt_alloc(&ddk); - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { /* * We can only do repair if there are multiple copies * of the block. For anything in the UNIQUE class, @@ -919,7 +840,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) } } - bzero(dde->dde_phys, sizeof (dde->dde_phys)); + memset(dde->dde_phys, 0, sizeof (dde->dde_phys)); return (dde); } @@ -964,7 +885,7 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { if (ddp->ddp_phys_birth == 0 || ddp->ddp_phys_birth != rddp->ddp_phys_birth || - bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) + memcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, @@ -1006,19 +927,18 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; ddt_phys_t *ddp = dde->dde_phys; ddt_key_t *ddk = &dde->dde_key; - enum ddt_type otype = dde->dde_type; - enum ddt_type ntype = DDT_TYPE_CURRENT; - enum ddt_class oclass = dde->dde_class; - enum ddt_class nclass; + ddt_type_t otype = dde->dde_type; + ddt_type_t ntype = DDT_TYPE_DEFAULT; + ddt_class_t oclass = dde->dde_class; + ddt_class_t nclass; uint64_t total_refcnt = 0; - ASSERT(dde->dde_loaded); - ASSERT(!dde->dde_loading); + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - ASSERT(dde->dde_lead_zio[p] == NULL); + ASSERT3P(dde->dde_lead_zio[p], ==, NULL); if (ddp->ddp_phys_birth == 0) { - ASSERT(ddp->ddp_refcnt == 0); + ASSERT0(ddp->ddp_refcnt); continue; } if (p == DDT_PHYS_DITTO) { @@ -1043,8 +963,9 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) if (otype != DDT_TYPES && (otype != ntype || oclass != nclass || total_refcnt == 0)) { - VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); - ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); + VERIFY0(ddt_object_remove(ddt, 
otype, oclass, ddk, tx)); + ASSERT3U( + ddt_object_contains(ddt, otype, oclass, ddk), ==, ENOENT); } if (total_refcnt != 0) { @@ -1053,7 +974,7 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) ddt_stat_update(ddt, dde, 0); if (!ddt_object_exists(ddt, ntype, nclass)) ddt_object_create(ddt, ntype, nclass, tx); - VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); + VERIFY0(ddt_object_update(ddt, ntype, nclass, dde, tx)); /* * If the class changes, the order that we scan this bp @@ -1079,7 +1000,7 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) if (avl_numnodes(&ddt->ddt_tree) == 0) return; - ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); + ASSERT3U(spa->spa_uberblock.ub_version, >=, SPA_VERSION_DEDUP); if (spa->spa_ddt_stat_object == 0) { spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, @@ -1092,23 +1013,23 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) ddt_free(dde); } - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { uint64_t add, count = 0; - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { if (ddt_object_exists(ddt, type, class)) { ddt_object_sync(ddt, type, class, tx); - VERIFY(ddt_object_count(ddt, type, class, - &add) == 0); + VERIFY0(ddt_object_count(ddt, type, class, + &add)); count += add; } } - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { if (count == 0 && ddt_object_exists(ddt, type, class)) ddt_object_destroy(ddt, type, class, tx); } } - bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, + memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); spa->spa_dedup_dspace = ~0ULL; } @@ -1120,7 +1041,7 @@ ddt_sync(spa_t *spa, uint64_t txg) dmu_tx_t *tx; zio_t *rio; - ASSERT(spa_syncing_txg(spa) == txg); + ASSERT3U(spa_syncing_txg(spa), ==, txg); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); @@ -1157,6 +1078,8 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) do { do { ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; + if (ddt == NULL) + continue; int error = ENOENT; if (ddt_object_exists(ddt, ddb->ddb_type, ddb->ddb_class)) { @@ -1180,7 +1103,68 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) return (SET_ERROR(ENOENT)); } -/* BEGIN CSTYLED */ +/* + * This function is used by Block Cloning (brt.c) to increase reference + * counter for the DDT entry if the block is already in DDT. + * + * Return false if the block, despite having the D bit set, is not present + * in the DDT. Currently this is not possible but might be in the future. + * See the comment below. + */ +boolean_t +ddt_addref(spa_t *spa, const blkptr_t *bp) +{ + ddt_t *ddt; + ddt_entry_t *dde; + boolean_t result; + + spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); + ddt = ddt_select(spa, bp); + ddt_enter(ddt); + + dde = ddt_lookup(ddt, bp, B_TRUE); + ASSERT3P(dde, !=, NULL); + + if (dde->dde_type < DDT_TYPES) { + ddt_phys_t *ddp; + + ASSERT3S(dde->dde_class, <, DDT_CLASSES); + + ddp = &dde->dde_phys[BP_GET_NDVAS(bp)]; + + /* + * This entry already existed (dde_type is real), so it must + * have refcnt >0 at the start of this txg. We are called from + * brt_pending_apply(), before frees are issued, so the refcnt + * can't be lowered yet. Therefore, it must be >0. 
We assert + * this because if the order of BRT and DDT interactions were + * ever to change and the refcnt was ever zero here, then + * likely further action is required to fill out the DDT entry, + * and this is a place that is likely to be missed in testing. + */ + ASSERT3U(ddp->ddp_refcnt, >, 0); + + ddt_phys_addref(ddp); + result = B_TRUE; + } else { + /* + * At the time of implementating this if the block has the + * DEDUP flag set it must exist in the DEDUP table, but + * there are many advocates that want ability to remove + * entries from DDT with refcnt=1. If this will happen, + * we may have a block with the DEDUP set, but which doesn't + * have a corresponding entry in the DDT. Be ready. + */ + ASSERT3S(dde->dde_class, ==, DDT_CLASSES); + ddt_remove(ddt, dde); + result = B_FALSE; + } + + ddt_exit(ddt); + spa_config_exit(spa, SCL_ZIO, FTAG); + + return (result); +} + ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, "Enable prefetching dedup-ed blks"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/ddt_stats.c b/sys/contrib/openzfs/module/zfs/ddt_stats.c new file mode 100644 index 000000000000..af5365a1d114 --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/ddt_stats.c @@ -0,0 +1,212 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2022 by Pawel Jakub Dawidek + * Copyright (c) 2023, Klara Inc. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/ddt.h> +#include <sys/ddt_impl.h> + +static void +ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) +{ + spa_t *spa = ddt->ddt_spa; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + uint64_t lsize = DDK_GET_LSIZE(ddk); + uint64_t psize = DDK_GET_PSIZE(ddk); + + memset(dds, 0, sizeof (*dds)); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + uint64_t dsize = 0; + uint64_t refcnt = ddp->ddp_refcnt; + + if (ddp->ddp_phys_birth == 0) + continue; + + int ndvas = DDK_GET_CRYPT(&dde->dde_key) ? 
+ SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP; + for (int d = 0; d < ndvas; d++) + dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); + + dds->dds_blocks += 1; + dds->dds_lsize += lsize; + dds->dds_psize += psize; + dds->dds_dsize += dsize; + + dds->dds_ref_blocks += refcnt; + dds->dds_ref_lsize += lsize * refcnt; + dds->dds_ref_psize += psize * refcnt; + dds->dds_ref_dsize += dsize * refcnt; + } +} + +void +ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) +{ + const uint64_t *s = (const uint64_t *)src; + uint64_t *d = (uint64_t *)dst; + uint64_t *d_end = (uint64_t *)(dst + 1); + + ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ + + for (int i = 0; i < d_end - d; i++) + d[i] += (s[i] ^ neg) - neg; +} + +void +ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) +{ + ddt_stat_t dds; + ddt_histogram_t *ddh; + int bucket; + + ddt_stat_generate(ddt, dde, &dds); + + bucket = highbit64(dds.dds_ref_blocks) - 1; + ASSERT3U(bucket, >=, 0); + + ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; + + ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); +} + +void +ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) +{ + for (int h = 0; h < 64; h++) + ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); +} + +void +ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) +{ + memset(dds, 0, sizeof (*dds)); + + for (int h = 0; h < 64; h++) + ddt_stat_add(dds, &ddh->ddh_stat[h], 0); +} + +boolean_t +ddt_histogram_empty(const ddt_histogram_t *ddh) +{ + const uint64_t *s = (const uint64_t *)ddh; + const uint64_t *s_end = (const uint64_t *)(ddh + 1); + + while (s < s_end) + if (*s++ != 0) + return (B_FALSE); + + return (B_TRUE); +} + +void +ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) +{ + /* Sum the statistics we cached in ddt_object_sync(). */ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (!ddt) + continue; + + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; + class++) { + ddt_object_t *ddo = + &ddt->ddt_object_stats[type][class]; + ddo_total->ddo_count += ddo->ddo_count; + ddo_total->ddo_dspace += ddo->ddo_dspace; + ddo_total->ddo_mspace += ddo->ddo_mspace; + } + } + } + + /* ... and compute the averages. 
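An illustrative note on ddt_stat_update() above: an entry is charged to histogram bucket highbit64(dds_ref_blocks) - 1, so bucket k collects entries referenced between 2^k and 2^(k+1)-1 times. A minimal standalone sketch of that bucketing, with a portable stand-in for the kernel's highbit64():

#include <stdint.h>
#include <stdio.h>

/* Stand-in for highbit64(): 1-based index of the highest set bit, 0 for 0. */
static int
highbit64(uint64_t v)
{
    int h = 0;

    while (v != 0) {
        h++;
        v >>= 1;
    }
    return (h);
}

int
main(void)
{
    /* ddt_stat_update() buckets a DDT entry by its reference count. */
    uint64_t refcnts[] = { 1, 2, 3, 8, 1000 };

    for (int i = 0; i < 5; i++) {
        int bucket = highbit64(refcnts[i]) - 1;
        printf("refcnt %4llu -> histogram bucket %d\n",
            (unsigned long long)refcnts[i], bucket);
    }
    return (0);
}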
*/ + if (ddo_total->ddo_count != 0) { + ddo_total->ddo_dspace /= ddo_total->ddo_count; + ddo_total->ddo_mspace /= ddo_total->ddo_count; + } +} + +void +ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (!ddt) + continue; + + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; + class++) { + ddt_histogram_add(ddh, + &ddt->ddt_histogram_cache[type][class]); + } + } + } +} + +void +ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) +{ + ddt_histogram_t *ddh_total; + + ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); + ddt_get_dedup_histogram(spa, ddh_total); + ddt_histogram_stat(dds_total, ddh_total); + kmem_free(ddh_total, sizeof (ddt_histogram_t)); +} + +uint64_t +ddt_get_dedup_dspace(spa_t *spa) +{ + ddt_stat_t dds_total; + + if (spa->spa_dedup_dspace != ~0ULL) + return (spa->spa_dedup_dspace); + + memset(&dds_total, 0, sizeof (ddt_stat_t)); + + /* Calculate and cache the stats */ + ddt_get_dedup_stats(spa, &dds_total); + spa->spa_dedup_dspace = dds_total.dds_ref_dsize - dds_total.dds_dsize; + return (spa->spa_dedup_dspace); +} + +uint64_t +ddt_get_pool_dedup_ratio(spa_t *spa) +{ + ddt_stat_t dds_total = { 0 }; + + ddt_get_dedup_stats(spa, &dds_total); + if (dds_total.dds_dsize == 0) + return (100); + + return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); +} diff --git a/sys/contrib/openzfs/module/zfs/ddt_zap.c b/sys/contrib/openzfs/module/zfs/ddt_zap.c index c5c9eda0b2d0..741554de3c60 100644 --- a/sys/contrib/openzfs/module/zfs/ddt_zap.c +++ b/sys/contrib/openzfs/module/zfs/ddt_zap.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
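Another aside on the same file: ddt_stat_add() treats both ddt_stat_t structures as flat arrays of uint64_t and relies on the identity (x ^ ~0) - ~0 == -x in two's complement, so a single loop either accumulates (neg == 0) or backs out (neg == -1ULL) a set of stats. A self-contained sketch of just that trick; the function and values below are invented for the demo and are not the ZFS code itself:

#include <stdint.h>
#include <stdio.h>

/* Add src into dst when neg == 0; subtract it when neg == -1ULL. */
static void
stat_add(uint64_t *dst, const uint64_t *src, size_t n, uint64_t neg)
{
    for (size_t i = 0; i < n; i++) {
        /* (x ^ ~0) - ~0 == ~x + 1 == -x, so this adds or subtracts. */
        dst[i] += (src[i] ^ neg) - neg;
    }
}

int
main(void)
{
    uint64_t total[2] = { 100, 7 };
    uint64_t delta[2] = { 40, 2 };

    stat_add(total, delta, 2, 0);       /* add: 140 9 */
    printf("%llu %llu\n", (unsigned long long)total[0],
        (unsigned long long)total[1]);

    stat_add(total, delta, 2, -1ULL);   /* subtract: back to 100 7 */
    printf("%llu %llu\n", (unsigned long long)total[0],
        (unsigned long long)total[1]);
    return (0);
}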
* @@ -28,11 +28,60 @@ #include <sys/spa.h> #include <sys/zio.h> #include <sys/ddt.h> +#include <sys/ddt_impl.h> #include <sys/zap.h> #include <sys/dmu_tx.h> +#include <sys/zio_compress.h> -int ddt_zap_leaf_blockshift = 12; -int ddt_zap_indirect_blockshift = 12; +static unsigned int ddt_zap_default_bs = 15; +static unsigned int ddt_zap_default_ibs = 15; + +#define DDT_ZAP_COMPRESS_BYTEORDER_MASK 0x80 +#define DDT_ZAP_COMPRESS_FUNCTION_MASK 0x7f + +#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t)) + +static size_t +ddt_zap_compress(const void *src, uchar_t *dst, size_t s_len, size_t d_len) +{ + uchar_t *version = dst++; + int cpfunc = ZIO_COMPRESS_ZLE; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + size_t c_len; + + ASSERT3U(d_len, >=, s_len + 1); /* no compression plus version byte */ + + c_len = ci->ci_compress((void *)src, dst, s_len, d_len - 1, + ci->ci_level); + + if (c_len == s_len) { + cpfunc = ZIO_COMPRESS_OFF; + memcpy(dst, src, s_len); + } + + *version = cpfunc; + if (ZFS_HOST_BYTEORDER) + *version |= DDT_ZAP_COMPRESS_BYTEORDER_MASK; + + return (c_len + 1); +} + +static void +ddt_zap_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) +{ + uchar_t version = *src++; + int cpfunc = version & DDT_ZAP_COMPRESS_FUNCTION_MASK; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + + if (ci->ci_decompress != NULL) + (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); + else + memcpy(dst, src, d_len); + + if (((version & DDT_ZAP_COMPRESS_BYTEORDER_MASK) != 0) != + (ZFS_HOST_BYTEORDER != 0)) + byteswap_uint64_array(dst, d_len); +} static int ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) @@ -43,10 +92,12 @@ ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) flags |= ZAP_FLAG_PRE_HASHED_KEY; *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP, - ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift, + ddt_zap_default_bs, ddt_zap_default_ibs, DMU_OT_NONE, 0, tx); + if (*objectp == 0) + return (SET_ERROR(ENOTSUP)); - return (*objectp == 0 ? 
SET_ERROR(ENOTSUP) : 0); + return (0); } static int @@ -56,63 +107,75 @@ ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) } static int -ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde) +ddt_zap_lookup(objset_t *os, uint64_t object, + const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize) { uchar_t *cbuf; uint64_t one, csize; int error; - cbuf = kmem_alloc(sizeof (dde->dde_phys) + 1, KM_SLEEP); - - error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key, + error = zap_length_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS, &one, &csize); if (error) - goto out; + return (error); - ASSERT(one == 1); - ASSERT(csize <= (sizeof (dde->dde_phys) + 1)); + ASSERT3U(one, ==, 1); + ASSERT3U(csize, <=, psize + 1); - error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key, + cbuf = kmem_alloc(csize, KM_SLEEP); + + error = zap_lookup_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS, 1, csize, cbuf); - if (error) - goto out; + if (error == 0) + ddt_zap_decompress(cbuf, phys, csize, psize); - ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys)); -out: - kmem_free(cbuf, sizeof (dde->dde_phys) + 1); + kmem_free(cbuf, csize); return (error); } +static int +ddt_zap_contains(objset_t *os, uint64_t object, const ddt_key_t *ddk) +{ + return (zap_length_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS, + NULL, NULL)); +} + static void -ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde) +ddt_zap_prefetch(objset_t *os, uint64_t object, const ddt_key_t *ddk) { - (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key, - DDT_KEY_WORDS); + (void) zap_prefetch_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS); } static int -ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) +ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk, + const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx) { - uchar_t cbuf[sizeof (dde->dde_phys) + 1]; - uint64_t csize; + const size_t cbuf_size = psize + 1; + + uchar_t *cbuf = kmem_alloc(cbuf_size, KM_SLEEP); + + uint64_t csize = ddt_zap_compress(phys, cbuf, psize, cbuf_size); - csize = ddt_compress(dde->dde_phys, cbuf, - sizeof (dde->dde_phys), sizeof (cbuf)); + int error = zap_update_uint64(os, object, (uint64_t *)ddk, + DDT_KEY_WORDS, 1, csize, cbuf, tx); - return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key, - DDT_KEY_WORDS, 1, csize, cbuf, tx)); + kmem_free(cbuf, cbuf_size); + + return (error); } static int -ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) +ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk, + dmu_tx_t *tx) { - return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key, + return (zap_remove_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS, tx)); } static int -ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) +ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk, + ddt_phys_t *phys, size_t psize) { zap_cursor_t zc; zap_attribute_t za; @@ -131,17 +194,23 @@ ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) zap_cursor_init_serialized(&zc, os, object, *walk); } if ((error = zap_cursor_retrieve(&zc, &za)) == 0) { - uchar_t cbuf[sizeof (dde->dde_phys) + 1]; uint64_t csize = za.za_num_integers; - ASSERT(za.za_integer_length == 1); + + ASSERT3U(za.za_integer_length, ==, 1); + ASSERT3U(csize, <=, psize + 1); + + uchar_t *cbuf = kmem_alloc(csize, KM_SLEEP); + error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name, 
DDT_KEY_WORDS, 1, csize, cbuf); - ASSERT(error == 0); + ASSERT0(error); if (error == 0) { - ddt_decompress(cbuf, dde->dde_phys, csize, - sizeof (dde->dde_phys)); - dde->dde_key = *(ddt_key_t *)za.za_name; + ddt_zap_decompress(cbuf, phys, csize, psize); + *ddk = *(ddt_key_t *)za.za_name; } + + kmem_free(cbuf, csize); + zap_cursor_advance(&zc); *walk = zap_cursor_serialize(&zc); } @@ -160,9 +229,17 @@ const ddt_ops_t ddt_zap_ops = { ddt_zap_create, ddt_zap_destroy, ddt_zap_lookup, + ddt_zap_contains, ddt_zap_prefetch, ddt_zap_update, ddt_zap_remove, ddt_zap_walk, ddt_zap_count, }; + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_bs, UINT, ZMOD_RW, + "DDT ZAP leaf blockshift"); +ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_ibs, UINT, ZMOD_RW, + "DDT ZAP indirect blockshift"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index eee3e70bbc95..8b440aafba43 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -28,6 +28,8 @@ * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. + * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ #include <sys/dmu.h> @@ -51,6 +53,7 @@ #include <sys/sa.h> #include <sys/zfeature.h> #include <sys/abd.h> +#include <sys/brt.h> #include <sys/trace_zfs.h> #include <sys/zfs_racct.h> #include <sys/zfs_rlock.h> @@ -62,7 +65,7 @@ /* * Enable/disable nopwrite feature. */ -int zfs_nopwrite_enabled = 1; +static int zfs_nopwrite_enabled = 1; /* * Tunable to control percentage of dirtied L1 blocks from frees allowed into @@ -70,19 +73,27 @@ int zfs_nopwrite_enabled = 1; * will wait until the next TXG. * A value of zero will disable this throttle. */ -unsigned long zfs_per_txg_dirty_frees_percent = 5; +static uint_t zfs_per_txg_dirty_frees_percent = 30; /* - * Enable/disable forcing txg sync when dirty in dmu_offset_next. + * Enable/disable forcing txg sync when dirty checking for holes with lseek(). + * By default this is enabled to ensure accurate hole reporting, it can result + * in a significant performance penalty for lseek(SEEK_HOLE) heavy workloads. + * Disabling this option will result in holes never being reported in dirty + * files which is always safe. */ -int zfs_dmu_offset_next_sync = 0; +static int zfs_dmu_offset_next_sync = 1; /* * Limit the amount we can prefetch with one call to this amount. This * helps to limit the amount of memory that can be used by prefetching. * Larger objects should be prefetched a bit at a time. 
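Stepping back to the ddt_zap_compress()/ddt_zap_decompress() pair in the ddt_zap.c hunk above: every stored DDT ZAP value is prefixed by one header byte whose low seven bits name the compression function and whose high bit records ZFS_HOST_BYTEORDER at write time, which is how the reader decides whether to byteswap. A standalone sketch of only that header convention; the mask values mirror the patch, while the enum and helper are invented for the demo:

#include <stdint.h>
#include <stdio.h>

#define COMPRESS_BYTEORDER_MASK  0x80  /* high bit: writer's byte order */
#define COMPRESS_FUNCTION_MASK   0x7f  /* low 7 bits: compression id */

/* Illustrative ids; the real ones come from zio_compress.h. */
enum { DEMO_COMPRESS_OFF = 0, DEMO_COMPRESS_ZLE = 1 };

static uint8_t
pack_header(int cpfunc, int host_byteorder)
{
    uint8_t version = (uint8_t)(cpfunc & COMPRESS_FUNCTION_MASK);

    if (host_byteorder)
        version |= COMPRESS_BYTEORDER_MASK;
    return (version);
}

int
main(void)
{
    uint8_t hdr = pack_header(DEMO_COMPRESS_ZLE, 1);

    printf("compression id = %d, byteorder flag = %s\n",
        hdr & COMPRESS_FUNCTION_MASK,
        (hdr & COMPRESS_BYTEORDER_MASK) ? "set" : "clear");
    return (0);
}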
*/ -int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; +#ifdef _ILP32 +uint_t dmu_prefetch_max = 8 * 1024 * 1024; +#else +uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; +#endif const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, @@ -141,7 +152,7 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj subobj" } }; -const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { +dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { byteswap_uint8_array, "uint8" }, { byteswap_uint16_array, "uint16" }, { byteswap_uint32_array, "uint32" }, @@ -154,9 +165,9 @@ const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { zfs_acl_byteswap, "acl" } }; -static int +int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, - void *tag, dmu_buf_t **dbp) + const void *tag, dmu_buf_t **dbp) { uint64_t blkid; dmu_buf_impl_t *db; @@ -174,9 +185,10 @@ dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, *dbp = &db->db; return (0); } + int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, - void *tag, dmu_buf_t **dbp) + const void *tag, dmu_buf_t **dbp) { dnode_t *dn; uint64_t blkid; @@ -203,7 +215,7 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, - void *tag, dmu_buf_t **dbp, int flags) + const void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; @@ -228,7 +240,7 @@ dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, - void *tag, dmu_buf_t **dbp, int flags) + const void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; @@ -338,7 +350,7 @@ dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) * has not yet been allocated a new bonus dbuf a will be allocated. * Returns ENOENT, EIO, or 0. */ -int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp, +int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp, uint32_t flags) { dmu_buf_impl_t *db; @@ -352,8 +364,10 @@ int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp, rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { - rw_exit(&dn->dn_struct_rwlock); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { + rw_exit(&dn->dn_struct_rwlock); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + } if (dn->dn_bonus == NULL) dbuf_create_bonus(dn); } @@ -385,7 +399,7 @@ int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp, } int -dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) +dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp) { dnode_t *dn; int error; @@ -410,7 +424,8 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) * dmu_spill_hold_existing() should be used. 
*/ int -dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) +dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, const void *tag, + dmu_buf_t **dbp) { dmu_buf_impl_t *db = NULL; int err; @@ -438,7 +453,7 @@ dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) } int -dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) +dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; @@ -467,7 +482,7 @@ dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) } int -dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag, +dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; @@ -494,7 +509,8 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag, */ int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) + boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp, + uint32_t flags) { dmu_buf_t **dbp; zstream_t *zs = NULL; @@ -504,7 +520,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, zio_t *zio = NULL; boolean_t missed = B_FALSE; - ASSERT(length <= DMU_MAX_ACCESS); + ASSERT(!read || length <= DMU_MAX_ACCESS); /* * Note: We directly notify the prefetch code of this read, so that @@ -514,11 +530,15 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH; + if ((flags & DMU_READ_NO_DECRYPT) != 0) + dbuf_flags |= DB_RF_NO_DECRYPT; + rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - - P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; + P2ALIGN_TYPED(offset, 1ULL << blkshift, uint64_t)) + >> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " @@ -538,21 +558,22 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); - if ((flags & DMU_READ_NO_PREFETCH) == 0 && - DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { + if ((flags & DMU_READ_NO_PREFETCH) == 0) { /* * Prepare the zfetch before initiating the demand reads, so * that if multiple threads block on same indirect block, we * base predictions on the original less racy request order. */ - zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, - read && DNODE_IS_CACHEABLE(dn), B_TRUE); + zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, read, + B_TRUE); } for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { - if (zs) - dmu_zfetch_run(zs, missed, B_TRUE); + if (zs) { + dmu_zfetch_run(&dn->dn_zfetch, zs, missed, + B_TRUE); + } rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); if (read) @@ -569,6 +590,14 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, * state will not yet be CACHED. 
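For reference, the nblks computation in dmu_buf_hold_array_by_dnode() above (now spelled with P2ALIGN_TYPED) counts how many power-of-two data blocks the byte range [offset, offset + length) touches: round the end up and the start down to block boundaries, then shift. A small self-contained example of that arithmetic, using simplified P2 macros and made-up numbers:

#include <stdint.h>
#include <stdio.h>

/* Simplified forms of the P2 macros for a power-of-two 'align'. */
#define P2ALIGN(x, align)   ((x) & -(align))
#define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1)

int
main(void)
{
    uint64_t offset = 130048;   /* not block aligned */
    uint64_t length = 262144;
    int blkshift = 17;          /* 128 KiB data blocks */
    uint64_t blksz = 1ULL << blkshift;

    uint64_t nblks = (P2ROUNDUP(offset + length, blksz) -
        P2ALIGN(offset, blksz)) >> blkshift;

    /* Bytes [130048, 392192) touch blocks 0, 1 and 2, so nblks == 3. */
    printf("nblks = %llu\n", (unsigned long long)nblks);
    return (0);
}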
*/ if (read) { + if (i == nblks - 1 && blkid + i < dn->dn_maxblkid && + offset + length < db->db.db_offset + + db->db.db_size) { + if (offset <= db->db.db_offset) + dbuf_flags |= DB_RF_PARTIAL_FIRST; + else + dbuf_flags |= DB_RF_PARTIAL_MORE; + } (void) dbuf_read(db, zio, dbuf_flags); if (db->db_state != DB_CACHED) missed = B_TRUE; @@ -580,7 +609,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, zfs_racct_write(length, nblks); if (zs) - dmu_zfetch_run(zs, missed, B_TRUE); + dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE); rw_exit(&dn->dn_struct_rwlock); if (read) { @@ -615,7 +644,8 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) + uint64_t length, int read, const void *tag, int *numbufsp, + dmu_buf_t ***dbpp) { dnode_t *dn; int err; @@ -634,7 +664,7 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, - uint64_t length, boolean_t read, void *tag, int *numbufsp, + uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; @@ -651,7 +681,7 @@ dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, } void -dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) +dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag) { int i; dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; @@ -668,72 +698,99 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) } /* - * Issue prefetch i/os for the given blocks. If level is greater than 0, the + * Issue prefetch I/Os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing - * the data starting at offset, and continuing to offset + len. + * the data starting at offset, and continuing to offset + len. If the range + * it too long, prefetch the first dmu_prefetch_max bytes as requested, while + * for the rest only a higher level, also fitting within dmu_prefetch_max. It + * should primarily help random reads, since for long sequential reads there is + * a speculative prefetcher. * * Note that if the indirect blocks above the blocks being prefetched are not - * in cache, they will be asynchronously read in. + * in cache, they will be asynchronously read in. Dnode read by dnode_hold() + * is currently synchronous. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; - uint64_t blkid; - int nblks, err; - - if (len == 0) { /* they're interested in the bonus buffer */ - dn = DMU_META_DNODE(os); - if (object == 0 || object >= DN_MAX_OBJECT) - return; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, level, - object * sizeof (dnode_phys_t)); - dbuf_prefetch(dn, level, blkid, pri, 0); - rw_exit(&dn->dn_struct_rwlock); + if (dmu_prefetch_max == 0 || len == 0) { + dmu_prefetch_dnode(os, object, pri); return; } - /* - * See comment before the definition of dmu_prefetch_max. - */ - len = MIN(len, dmu_prefetch_max); - - /* - * XXX - Note, if the dnode for the requested object is not - * already cached, we will do a *synchronous* read in the - * dnode_hold() call. The same is true for any indirects. 
- */ - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) + if (dnode_hold(os, object, FTAG, &dn) != 0) return; + dmu_prefetch_by_dnode(dn, level, offset, len, pri); + + dnode_rele(dn, FTAG); +} + +void +dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, + uint64_t len, zio_priority_t pri) +{ + int64_t level2 = level; + uint64_t start, end, start2, end2; + /* - * offset + len - 1 is the last byte we want to prefetch for, and offset - * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the - * last block we want to prefetch, and dbuf_whichblock(dn, level, - * offset) is the first. Then the number we need to prefetch is the - * last - first + 1. + * Depending on len we may do two prefetches: blocks [start, end) at + * level, and following blocks [start2, end2) at higher level2. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (level > 0 || dn->dn_datablkshift != 0) { - nblks = dbuf_whichblock(dn, level, offset + len - 1) - - dbuf_whichblock(dn, level, offset) + 1; + if (dn->dn_datablkshift != 0) { + /* + * The object has multiple blocks. Calculate the full range + * of blocks [start, end2) and then split it into two parts, + * so that the first [start, end) fits into dmu_prefetch_max. + */ + start = dbuf_whichblock(dn, level, offset); + end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1; + uint8_t ibs = dn->dn_indblkshift; + uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs; + uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs; + start2 = end = MIN(end2, start + limit); + + /* + * Find level2 where [start2, end2) fits into dmu_prefetch_max. + */ + uint8_t ibps = ibs - SPA_BLKPTRSHIFT; + limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs; + do { + level2++; + start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps; + end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps; + } while (end2 - start2 > limit); } else { - nblks = (offset < dn->dn_datablksz); + /* There is only one block. Prefetch it or nothing. */ + start = start2 = end2 = 0; + end = start + (level == 0 && offset < dn->dn_datablksz); } - if (nblks != 0) { - blkid = dbuf_whichblock(dn, level, offset); - for (int i = 0; i < nblks; i++) - dbuf_prefetch(dn, level, blkid + i, pri, 0); - } + for (uint64_t i = start; i < end; i++) + dbuf_prefetch(dn, level, i, pri, 0); + for (uint64_t i = start2; i < end2; i++) + dbuf_prefetch(dn, level2, i, pri, 0); rw_exit(&dn->dn_struct_rwlock); +} - dnode_rele(dn, FTAG); +/* + * Issue prefetch I/Os for the given object's dnode. + */ +void +dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri) +{ + if (object == 0 || object >= DN_MAX_OBJECT) + return; + + dnode_t *dn = DMU_META_DNODE(os); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + uint64_t blkid = dbuf_whichblock(dn, 0, object * sizeof (dnode_phys_t)); + dbuf_prefetch(dn, 0, blkid, pri, 0); + rw_exit(&dn->dn_struct_rwlock); } /* @@ -798,7 +855,7 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) } /* set start to the beginning of this L1 indirect */ - *start = P2ALIGN(*start, iblkrange); + *start = P2ALIGN_TYPED(*start, iblkrange, uint64_t); } if (*start < minimum) *start = minimum; @@ -812,13 +869,14 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) * otherwise return false. 
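The new dmu_prefetch_by_dnode() above caps direct prefetch at dmu_prefetch_max and covers the remainder of a long request with indirect blocks at a higher level, so an arbitrarily large range still issues a bounded number of prefetch I/Os. Below is a simplified standalone sketch of that split which climbs only one level (the real loop keeps raising level2 until the tail fits the budget); the geometry constants are invented for the example:

#include <stdint.h>
#include <stdio.h>

#define P2ROUNDUP(x, a) ((((x) - 1) | ((a) - 1)) + 1)

int
main(void)
{
    /* Hypothetical geometry: 128 KiB data blocks and indirect blocks. */
    unsigned dblkshift = 17;
    unsigned iblkshift = 17;
    unsigned blkptrshift = 7;              /* sizeof (blkptr_t) == 128 */
    uint64_t prefetch_max = 8ULL << 20;    /* 8 MiB budget */

    uint64_t offset = 0, len = 1ULL << 30; /* 1 GiB request */

    /* Level-0 block range [start, end2) covering the request. */
    uint64_t start = offset >> dblkshift;
    uint64_t end2 = ((offset + len - 1) >> dblkshift) + 1;

    /* Clamp the directly prefetched part to the budget. */
    uint64_t limit = P2ROUNDUP(prefetch_max, 1ULL << dblkshift) >> dblkshift;
    uint64_t end = (end2 < start + limit) ? end2 : start + limit;

    /* Express the tail as level-1 indirects, skipping any already covered. */
    unsigned ibps = iblkshift - blkptrshift;   /* L0 blocks per indirect */
    uint64_t start2 = P2ROUNDUP(end, 1ULL << ibps) >> ibps;
    uint64_t end2_l1 = P2ROUNDUP(end2, 1ULL << ibps) >> ibps;

    printf("level 0: blocks [%llu, %llu)\n",
        (unsigned long long)start, (unsigned long long)end);
    printf("level 1: blocks [%llu, %llu)\n",
        (unsigned long long)start2, (unsigned long long)end2_l1);
    return (0);
}

With these numbers the sketch prefetches 64 data blocks directly and 7 level-1 indirect blocks for the rest, instead of issuing 8192 individual block prefetches.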
* Used below in dmu_free_long_range_impl() to enable abort when unmounting */ -/*ARGSUSED*/ static boolean_t dmu_objset_zfs_unmounting(objset_t *os) { #ifdef _KERNEL if (dmu_objset_type(os) == DMU_OST_ZFS) return (zfs_get_vfs_flag_unmounted(os)); +#else + (void) os; #endif return (B_FALSE); } @@ -1007,7 +1065,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, if (dn->dn_maxblkid == 0) { uint64_t newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); - bzero((char *)buf + newsz, size - newsz); + memset((char *)buf + newsz, 0, size - newsz); size = newsz; } @@ -1087,14 +1145,14 @@ dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); + dmu_buf_will_fill(db, tx, B_FALSE); else dmu_buf_will_dirty(db, tx); (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); + dmu_buf_fill_done(db, tx, B_FALSE); offset += tocpy; size -= tocpy; @@ -1302,27 +1360,24 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) ASSERT(size > 0); - bufoff = zfs_uio_offset(uio) - db->db_offset; + offset_t off = zfs_uio_offset(uio); + bufoff = off - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); + dmu_buf_will_fill(db, tx, B_TRUE); else dmu_buf_will_dirty(db, tx); - /* - * XXX zfs_uiomove could block forever (eg.nfs-backed - * pages). There needs to be a uiolockdown() function - * to lock the pages in memory, so that zfs_uiomove won't - * block. - */ err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); + if (tocpy == db->db_size && dmu_buf_fill_done(db, tx, err)) { + /* The fill was reverted. Undo any uio progress. */ + zfs_uio_advance(uio, off - zfs_uio_offset(uio)); + } if (err) break; @@ -1424,7 +1479,7 @@ dmu_return_arcbuf(arc_buf_t *buf) */ int dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd, - const zio_prop_t *zp, enum zio_flag flags, dmu_tx_t *tx) + const zio_prop_t *zp, zio_flag_t flags, dmu_tx_t *tx) { dbuf_dirty_record_t *dr = dbuf_dirty_lightweight(dn, dbuf_whichblock(dn, 0, offset), tx); @@ -1454,9 +1509,9 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, FTAG); + rw_exit(&dn->dn_struct_rwlock); if (db == NULL) return (SET_ERROR(EIO)); - rw_exit(&dn->dn_struct_rwlock); /* * We can only assign if the offset is aligned and the arc buf is the @@ -1500,10 +1555,10 @@ typedef struct { dmu_tx_t *dsa_tx; } dmu_sync_arg_t; -/* ARGSUSED */ static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { + (void) buf; dmu_sync_arg_t *dsa = varg; dmu_buf_t *db = dsa->dsa_zgd->zgd_db; blkptr_t *bp = zio->io_bp; @@ -1528,10 +1583,10 @@ dmu_sync_late_arrival_ready(zio_t *zio) dmu_sync_ready(zio, NULL, zio->io_private); } -/* ARGSUSED */ static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { + (void) buf; dmu_sync_arg_t *dsa = varg; dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; @@ -1575,7 +1630,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) * it's an old style hole. 
*/ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && - dr->dt.dl.dr_overridden_by.blk_birth == 0) + BP_GET_LOGICAL_BIRTH(&dr->dt.dl.dr_overridden_by) == 0) BP_ZERO(&dr->dt.dl.dr_overridden_by); } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; @@ -1606,7 +1661,7 @@ dmu_sync_late_arrival_done(zio_t *zio) blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); - ASSERT(zio->io_bp->blk_birth == zio->io_txg); + ASSERT(BP_GET_LOGICAL_BIRTH(zio->io_bp) == zio->io_txg); ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); zio_free(zio->io_spa, zio->io_txg, zio->io_bp); } @@ -1626,10 +1681,22 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, { dmu_sync_arg_t *dsa; dmu_tx_t *tx; + int error; + + error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL, + DB_RF_CANFAIL | DB_RF_NOPREFETCH); + if (error != 0) + return (error); tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); - if (dmu_tx_assign(tx, TXG_WAIT) != 0) { + /* + * This transaction does not produce any dirty data or log blocks, so + * it should not be throttled. All other cases wait for TXG sync, by + * which time the log block we are writing will be obsolete, so we can + * skip waiting and just return error here instead. + */ + if (dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE) != 0) { dmu_tx_abort(tx); /* Make zl_get_data do txg_waited_synced() */ return (SET_ERROR(EIO)); @@ -1674,7 +1741,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, - dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, + dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); @@ -1838,9 +1905,9 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dsa->dsa_zgd = zgd; dsa->dsa_tx = NULL; - zio_nowait(arc_write(pio, os->os_spa, txg, - zgd->zgd_bp, dr->dt.dl.dr_data, dbuf_is_l2cacheable(db), - &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, + zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, + dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db), + &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); @@ -1935,7 +2002,7 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, * When the "redundant_metadata" property is set to "most", only indirect * blocks of this level and higher will have an additional ditto block. 
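The dmu_write_policy() change a little further down replaces the old compound conditional with an explicit switch over the four redundant_metadata settings (all, most, some, none). A condensed standalone sketch of that copies decision; is_critical stands in for DMU_OT_IS_CRITICAL(), ditto_level for zfs_redundant_metadata_most_ditto_level, and the spill-block case is left out:

#include <stdio.h>

/* Mirrors the four ZFS_REDUNDANT_METADATA_* property values. */
enum redundant_metadata { RM_ALL, RM_MOST, RM_SOME, RM_NONE };

static int
metadata_copies(int copies, enum redundant_metadata policy, int level,
    int is_metadata, int is_critical, int ditto_level)
{
    switch (policy) {
    case RM_ALL:
        copies++;
        break;
    case RM_MOST:
        if (level >= ditto_level || is_metadata)
            copies++;
        break;
    case RM_SOME:
        if (is_critical)
            copies++;
        break;
    case RM_NONE:
        break;
    }
    return (copies);
}

int
main(void)
{
    printf("most, level-1 indirect:  %d\n",
        metadata_copies(1, RM_MOST, 1, 0, 0, 2));  /* stays 1 */
    printf("most, level-2 indirect:  %d\n",
        metadata_copies(1, RM_MOST, 2, 0, 0, 2));  /* bumped to 2 */
    printf("some, critical metadata: %d\n",
        metadata_copies(1, RM_SOME, 0, 1, 1, 2));  /* bumped to 2 */
    return (0);
}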
*/ -int zfs_redundant_metadata_most_ditto_level = 2; +static const int zfs_redundant_metadata_most_ditto_level = 2; void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) @@ -1981,12 +2048,22 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) ZCHECKSUM_FLAG_EMBEDDED)) checksum = ZIO_CHECKSUM_FLETCHER_4; - if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || - (os->os_redundant_metadata == - ZFS_REDUNDANT_METADATA_MOST && - (level >= zfs_redundant_metadata_most_ditto_level || - DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)))) + switch (os->os_redundant_metadata) { + case ZFS_REDUNDANT_METADATA_ALL: copies++; + break; + case ZFS_REDUNDANT_METADATA_MOST: + if (level >= zfs_redundant_metadata_most_ditto_level || + DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)) + copies++; + break; + case ZFS_REDUNDANT_METADATA_SOME: + if (DMU_OT_IS_CRITICAL(type)) + copies++; + break; + case ZFS_REDUNDANT_METADATA_NONE: + break; + } } else if (wp & WP_NOFILL) { ASSERT(level == 0); @@ -2072,9 +2149,9 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) zp->zp_nopwrite = nopwrite; zp->zp_encrypt = encrypt; zp->zp_byteorder = ZFS_HOST_BYTEORDER; - bzero(zp->zp_salt, ZIO_DATA_SALT_LEN); - bzero(zp->zp_iv, ZIO_DATA_IV_LEN); - bzero(zp->zp_mac, ZIO_DATA_MAC_LEN); + memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN); + memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN); + memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN); zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ? os->os_zpl_special_smallblock : 0; @@ -2082,18 +2159,18 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) } /* - * This function is only called from zfs_holey_common() for zpl_llseek() - * in order to determine the location of holes. In order to accurately - * report holes all dirty data must be synced to disk. This causes extremely - * poor performance when seeking for holes in a dirty file. As a compromise, - * only provide hole data when the dnode is clean. When a dnode is dirty - * report the dnode as having no holes which is always a safe thing to do. + * Reports the location of data and holes in an object. In order to + * accurately report holes all dirty data must be synced to disk. This + * causes extremely poor performance when seeking for holes in a dirty file. + * As a compromise, only provide hole data when the dnode is clean. When + * a dnode is dirty report the dnode as having no holes by returning EBUSY + * which is always safe to do. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; - int err; + int restarted = 0, err; restart: err = dnode_hold(os, object, FTAG, &dn); @@ -2105,19 +2182,23 @@ restart: if (dnode_is_dirty(dn)) { /* * If the zfs_dmu_offset_next_sync module option is enabled - * then strict hole reporting has been requested. Dirty - * dnodes must be synced to disk to accurately report all - * holes. When disabled (the default) dirty dnodes are - * reported to not have any holes which is always safe. + * then hole reporting has been requested. Dirty dnodes + * must be synced to disk to accurately report holes. * - * When called by zfs_holey_common() the zp->z_rangelock - * is held to prevent zfs_write() and mmap writeback from - * re-dirtying the dnode after txg_wait_synced(). + * Provided a RL_READER rangelock spanning 0-UINT64_MAX is + * held by the caller only a single restart will be required. 
+ * We tolerate callers which do not hold the rangelock by + * returning EBUSY and not reporting holes after one restart. */ if (zfs_dmu_offset_next_sync) { rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); + + if (restarted) + return (SET_ERROR(EBUSY)); + txg_wait_synced(dmu_objset_pool(os), 0); + restarted = 1; goto restart; } @@ -2133,6 +2214,187 @@ restart: return (err); } +int +dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, + blkptr_t *bps, size_t *nbpsp) +{ + dmu_buf_t **dbp, *dbuf; + dmu_buf_impl_t *db; + blkptr_t *bp; + int error, numbufs; + + error = dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG, + &numbufs, &dbp); + if (error != 0) { + if (error == ESRCH) { + error = SET_ERROR(ENXIO); + } + return (error); + } + + ASSERT3U(numbufs, <=, *nbpsp); + + for (int i = 0; i < numbufs; i++) { + dbuf = dbp[i]; + db = (dmu_buf_impl_t *)dbuf; + + mutex_enter(&db->db_mtx); + + if (!list_is_empty(&db->db_dirty_records)) { + dbuf_dirty_record_t *dr; + + dr = list_head(&db->db_dirty_records); + if (dr->dt.dl.dr_brtwrite) { + /* + * This is very special case where we clone a + * block and in the same transaction group we + * read its BP (most likely to clone the clone). + */ + bp = &dr->dt.dl.dr_overridden_by; + } else { + /* + * The block was modified in the same + * transaction group. + */ + mutex_exit(&db->db_mtx); + error = SET_ERROR(EAGAIN); + goto out; + } + } else { + bp = db->db_blkptr; + } + + mutex_exit(&db->db_mtx); + + if (bp == NULL) { + /* + * The file size was increased, but the block was never + * written, otherwise we would either have the block + * pointer or the dirty record and would not get here. + * It is effectively a hole, so report it as such. + */ + BP_ZERO(&bps[i]); + continue; + } + /* + * Make sure we clone only data blocks. + */ + if (BP_IS_METADATA(bp) && !BP_IS_HOLE(bp)) { + error = SET_ERROR(EINVAL); + goto out; + } + + /* + * If the block was allocated in transaction group that is not + * yet synced, we could clone it, but we couldn't write this + * operation into ZIL, or it may be impossible to replay, since + * the block may appear not yet allocated at that point. + */ + if (BP_GET_BIRTH(bp) > spa_freeze_txg(os->os_spa)) { + error = SET_ERROR(EINVAL); + goto out; + } + if (BP_GET_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) { + error = SET_ERROR(EAGAIN); + goto out; + } + + bps[i] = *bp; + } + + *nbpsp = numbufs; +out: + dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (error); +} + +int +dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, + dmu_tx_t *tx, const blkptr_t *bps, size_t nbps) +{ + spa_t *spa; + dmu_buf_t **dbp, *dbuf; + dmu_buf_impl_t *db; + struct dirty_leaf *dl; + dbuf_dirty_record_t *dr; + const blkptr_t *bp; + int error = 0, i, numbufs; + + spa = os->os_spa; + + VERIFY0(dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG, + &numbufs, &dbp)); + ASSERT3U(nbps, ==, numbufs); + + /* + * Before we start cloning make sure that the dbufs sizes match new BPs + * sizes. If they don't, that's a no-go, as we are not able to shrink + * dbufs. 
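To summarize the eligibility tests spread across dmu_read_l0_bps() above and the size check that follows in dmu_brt_clone(): a block pointer can be cloned only if it is a hole, or a plain data block that is already on disk and whose logical size matches the destination dbuf. A hedged, condensed sketch of those decisions; the toy_bp type and helper are invented, and the real code works on blkptr_t and dbufs and also handles the dirty and frozen-pool cases:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for the block pointer fields the clone path inspects. */
struct toy_bp {
    int      is_hole;
    int      is_metadata;
    uint64_t lsize;
    uint64_t birth_txg;
};

static int
clone_check(const struct toy_bp *bp, uint64_t dbuf_size,
    uint64_t last_synced_txg)
{
    if (bp->is_hole)
        return (0);             /* holes clone trivially */
    if (bp->is_metadata)
        return (EINVAL);        /* only data blocks may be cloned */
    if (bp->birth_txg > last_synced_txg)
        return (EAGAIN);        /* block not on disk yet, retry later */
    if (bp->lsize != dbuf_size)
        return (EXDEV);         /* destination dbuf cannot be resized */
    return (0);
}

int
main(void)
{
    struct toy_bp ok = { 0, 0, 131072, 90 };
    struct toy_bp young = { 0, 0, 131072, 120 };

    printf("synced block:   %d\n", clone_check(&ok, 131072, 100));
    printf("unsynced block: %d\n", clone_check(&young, 131072, 100));
    return (0);
}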
+ */ + for (i = 0; i < numbufs; i++) { + dbuf = dbp[i]; + db = (dmu_buf_impl_t *)dbuf; + bp = &bps[i]; + + ASSERT0(db->db_level); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_SPILL_BLKID); + + if (!BP_IS_HOLE(bp) && BP_GET_LSIZE(bp) != dbuf->db_size) { + error = SET_ERROR(EXDEV); + goto out; + } + } + + for (i = 0; i < numbufs; i++) { + dbuf = dbp[i]; + db = (dmu_buf_impl_t *)dbuf; + bp = &bps[i]; + + ASSERT0(db->db_level); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_SPILL_BLKID); + ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp)); + + dmu_buf_will_clone(dbuf, tx); + + mutex_enter(&db->db_mtx); + + dr = list_head(&db->db_dirty_records); + VERIFY(dr != NULL); + ASSERT3U(dr->dr_txg, ==, tx->tx_txg); + dl = &dr->dt.dl; + dl->dr_overridden_by = *bp; + if (!BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) != 0) { + if (!BP_IS_EMBEDDED(bp)) { + BP_SET_BIRTH(&dl->dr_overridden_by, dr->dr_txg, + BP_GET_BIRTH(bp)); + } else { + BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, + dr->dr_txg); + } + } + dl->dr_brtwrite = B_TRUE; + dl->dr_override_state = DR_OVERRIDDEN; + + mutex_exit(&db->db_mtx); + + /* + * When data in embedded into BP there is no need to create + * BRT entry as there is no data block. Just copy the BP as + * it contains the data. + */ + if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { + brt_pending_add(spa, bp, tx); + } + } +out: + dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (error); +} + void __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { @@ -2272,10 +2534,10 @@ byteswap_uint16_array(void *vbuf, size_t size) buf[i] = BSWAP_16(buf[i]); } -/* ARGSUSED */ void byteswap_uint8_array(void *vbuf, size_t size) { + (void) vbuf, (void) size; } void @@ -2313,6 +2575,8 @@ EXPORT_SYMBOL(dmu_bonus_hold_by_dnode); EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); EXPORT_SYMBOL(dmu_buf_rele_array); EXPORT_SYMBOL(dmu_prefetch); +EXPORT_SYMBOL(dmu_prefetch_by_dnode); +EXPORT_SYMBOL(dmu_prefetch_dnode); EXPORT_SYMBOL(dmu_free_range); EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); @@ -2341,16 +2605,15 @@ EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf); EXPORT_SYMBOL(dmu_buf_hold); EXPORT_SYMBOL(dmu_ot); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW, "Enable NOP writes"); -ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, UINT, ZMOD_RW, "Percentage of dirtied blocks from frees in one TXG"); ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW, "Enable forcing txg sync to find holes"); -ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, INT, ZMOD_RW, +/* CSTYLED */ +ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW, "Limit one prefetch call to this size"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dmu_diff.c b/sys/contrib/openzfs/module/zfs/dmu_diff.c index a573a2e1bd41..0def0956beb8 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_diff.c +++ b/sys/contrib/openzfs/module/zfs/dmu_diff.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -108,15 +108,15 @@ report_dnode(dmu_diffarg_t *da, uint64_t object, dnode_phys_t *dnp) (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) -/* ARGSUSED */ static int diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { + (void) zilog; dmu_diffarg_t *da = arg; int err = 0; - if (issig(JUSTLOOKING) && issig(FORREAL)) + if (issig()) return (SET_ERROR(EINTR)); if (zb->zb_level == ZB_DNODE_LEVEL || diff --git a/sys/contrib/openzfs/module/zfs/dmu_object.c b/sys/contrib/openzfs/module/zfs/dmu_object.c index 12cdbd68b104..56986ea43446 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_object.c +++ b/sys/contrib/openzfs/module/zfs/dmu_object.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -41,12 +41,12 @@ * determined to be the lowest value that eliminates the measurable effect * of lock contention from this code path. */ -int dmu_object_alloc_chunk_shift = 7; +uint_t dmu_object_alloc_chunk_shift = 7; static uint64_t dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, - int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx) + int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) { uint64_t object; uint64_t L1_dnode_count = DNODES_PER_BLOCK << @@ -55,7 +55,7 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, int dn_slots = dnodesize >> DNODE_SHIFT; boolean_t restarted = B_FALSE; uint64_t *cpuobj = NULL; - int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; + uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; int error; cpuobj = &os->os_obj_next_percpu[CPU_SEQID_UNSTABLE % @@ -160,7 +160,7 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, * is not suitably aligned. */ os->os_obj_next_chunk = - P2ALIGN(object, dnodes_per_chunk) + + P2ALIGN_TYPED(object, dnodes_per_chunk, uint64_t) + dnodes_per_chunk; (void) atomic_swap_64(cpuobj, object); mutex_exit(&os->os_obj_lock); @@ -255,7 +255,7 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, uint64_t dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, - int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx) + int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) { return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift, bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx)); @@ -409,6 +409,8 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) * hand off to dnode_next_offset() for further scanning. 
*/ while (i <= last_obj) { + if (i == 0) + return (SET_ERROR(ESRCH)); error = dmu_object_info(os, i, &doi); if (error == ENOENT) { if (hole) { @@ -518,6 +520,6 @@ EXPORT_SYMBOL(dmu_object_zapify); EXPORT_SYMBOL(dmu_object_free_zapified); /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, UINT, ZMOD_RW, "CPU-specific allocator grabs 2^N objects at once"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c index b30a9d619034..8f4fefa4f4dd 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_objset.c +++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -32,6 +32,7 @@ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -78,16 +79,16 @@ krwlock_t os_lock; * datasets. * Default is 4 times the number of leaf vdevs. */ -int dmu_find_threads = 0; +static const int dmu_find_threads = 0; /* * Backfill lower metadnode objects after this many have been freed. * Backfilling negatively impacts object creation rates, so only do it * if there are enough holes to fill. */ -int dmu_rescan_dnode_threshold = 1 << DN_MAX_INDBLKSHIFT; +static const int dmu_rescan_dnode_threshold = 1 << DN_MAX_INDBLKSHIFT; -static char *upgrade_tag = "upgrade_tag"; +static const char *upgrade_tag = "upgrade_tag"; static void dmu_objset_find_dp_cb(void *arg); @@ -263,6 +264,19 @@ secondary_cache_changed_cb(void *arg, uint64_t newval) } static void +prefetch_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance should have been done by now. + */ + ASSERT(newval == ZFS_PREFETCH_ALL || newval == ZFS_PREFETCH_NONE || + newval == ZFS_PREFETCH_METADATA); + os->os_prefetch = newval; +} + +static void sync_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; @@ -287,7 +301,9 @@ redundant_metadata_changed_cb(void *arg, uint64_t newval) * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL || - newval == ZFS_REDUNDANT_METADATA_MOST); + newval == ZFS_REDUNDANT_METADATA_MOST || + newval == ZFS_REDUNDANT_METADATA_SOME || + newval == ZFS_REDUNDANT_METADATA_NONE); os->os_redundant_metadata = newval; } @@ -384,10 +400,10 @@ dnode_hash(const objset_t *os, uint64_t obj) ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); /* - * The low 6 bits of the pointer don't have much entropy, because - * the objset_t is larger than 2^6 bytes long. + * The lower 11 bits of the pointer don't have much entropy, because + * the objset_t is more than 1KB long and so likely aligned to 2KB. 
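The dnode_hash() change just below starts folding the objset_t pointer into the CRC at bit 11 instead of bit 6, for the reason given in the comment above: an object larger than 1 KiB is typically handed out 2 KiB-aligned, so its low 11 address bits are always zero and contribute nothing to the hash. A toy demonstration of that alignment argument; nothing here is ZFS-specific, and the 2 KiB alignment is forced with aligned_alloc for determinism:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
    void *objs[3];

    for (int i = 0; i < 3; i++) {
        objs[i] = aligned_alloc(2048, 2048);
        if (objs[i] == NULL)
            return (1);
    }

    for (int i = 0; i < 3; i++) {
        uintptr_t v = (uintptr_t)objs[i];

        /* Low 11 bits are identical (zero); useful entropy starts above. */
        printf("ptr %p  low 11 bits %#lx  (v >> 11) & 0xff = %#lx\n",
            objs[i], (unsigned long)(v & 0x7ff),
            (unsigned long)((v >> 11) & 0xff));
    }

    for (int i = 0; i < 3; i++)
        free(objs[i]);
    return (0);
}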
*/ - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 11)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF]; @@ -416,28 +432,28 @@ dnode_multilist_index_func(multilist_t *ml, void *obj) static inline boolean_t dmu_os_is_l2cacheable(objset_t *os) { - vdev_t *vd = NULL; - zfs_cache_type_t cache = os->os_secondary_cache; - blkptr_t *bp = os->os_rootbp; + if (os->os_secondary_cache == ZFS_CACHE_ALL || + os->os_secondary_cache == ZFS_CACHE_METADATA) { + if (l2arc_exclude_special == 0) + return (B_TRUE); - if (bp != NULL && !BP_IS_HOLE(bp)) { + blkptr_t *bp = os->os_rootbp; + if (bp == NULL || BP_IS_HOLE(bp)) + return (B_FALSE); uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); vdev_t *rvd = os->os_spa->spa_root_vdev; + vdev_t *vd = NULL; if (vdev < rvd->vdev_children) vd = rvd->vdev_child[vdev]; - if (cache == ZFS_CACHE_ALL || cache == ZFS_CACHE_METADATA) { - if (vd == NULL) - return (B_TRUE); + if (vd == NULL) + return (B_TRUE); - if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && - vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || - l2arc_exclude_special == 0) - return (B_TRUE); - } + if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && + vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) + return (B_TRUE); } - return (B_FALSE); } @@ -479,7 +495,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; int size; - enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; + zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); @@ -516,8 +532,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, if (arc_buf_size(os->os_phys_buf) < size) { arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf, ARC_BUFC_METADATA, size); - bzero(buf->b_data, size); - bcopy(os->os_phys_buf->b_data, buf->b_data, + memset(buf->b_data, 0, size); + memcpy(buf->b_data, os->os_phys_buf->b_data, arc_buf_size(os->os_phys_buf)); arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; @@ -531,7 +547,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf, ARC_BUFC_METADATA, size); os->os_phys = os->os_phys_buf->b_data; - bzero(os->os_phys, size); + memset(os->os_phys, 0, size); } /* * These properties will be filled in by the logic in zfs_get_zplprop() @@ -559,6 +575,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_PREFETCH), + prefetch_changed_cb, os); + } if (!ds->ds_is_snapshot) { if (err == 0) { err = dsl_prop_register(ds, @@ -632,6 +653,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; os->os_dnodesize = DNODE_MIN_SIZE; + os->os_prefetch = ZFS_PREFETCH_ALL; } if (ds == NULL || !ds->ds_is_snapshot) @@ -714,7 +736,7 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) * can be held at a time. 
*/ int -dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag, +dmu_objset_hold_flags(const char *name, boolean_t decrypt, const void *tag, objset_t **osp) { dsl_pool_t *dp; @@ -742,18 +764,18 @@ dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag, } int -dmu_objset_hold(const char *name, void *tag, objset_t **osp) +dmu_objset_hold(const char *name, const void *tag, objset_t **osp) { return (dmu_objset_hold_flags(name, B_FALSE, tag, osp)); } static int dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type, - boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) + boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp) { - int err; + (void) tag; - err = dmu_objset_from_ds(ds, osp); + int err = dmu_objset_from_ds(ds, osp); if (err != 0) { return (err); } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { @@ -789,7 +811,7 @@ dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type, */ int dmu_objset_own(const char *name, dmu_objset_type_t type, - boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) + boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp) { dsl_pool_t *dp; dsl_dataset_t *ds; @@ -834,7 +856,7 @@ dmu_objset_own(const char *name, dmu_objset_type_t type, int dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, - boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) + boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp) { dsl_dataset_t *ds; int err; @@ -855,7 +877,7 @@ dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, } void -dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag) +dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, const void *tag) { ds_hold_flags_t flags; dsl_pool_t *dp = dmu_objset_pool(os); @@ -866,7 +888,7 @@ dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag) } void -dmu_objset_rele(objset_t *os, void *tag) +dmu_objset_rele(objset_t *os, const void *tag) { dmu_objset_rele_flags(os, B_FALSE, tag); } @@ -884,7 +906,7 @@ dmu_objset_rele(objset_t *os, void *tag) */ void dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, - boolean_t decrypt, void *tag) + boolean_t decrypt, const void *tag) { dsl_pool_t *dp; char name[ZFS_MAX_DATASET_NAME_LEN]; @@ -904,7 +926,7 @@ dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, } void -dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag) +dmu_objset_disown(objset_t *os, boolean_t decrypt, const void *tag) { ds_hold_flags_t flags; @@ -1118,12 +1140,14 @@ dmu_objset_create_impl_dnstats(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, (!os->os_encrypted || !dmu_objset_is_receiving(os))) { os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; if (dmu_objset_userobjused_enabled(os)) { + ASSERT3P(ds, !=, NULL); ds->ds_feature_activation[ SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE; os->os_phys->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE; } if (dmu_objset_projectquota_enabled(os)) { + ASSERT3P(ds, !=, NULL); ds->ds_feature_activation[ SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE; os->os_phys->os_flags |= @@ -1157,7 +1181,6 @@ typedef struct dmu_objset_create_arg { dsl_crypto_params_t *doca_dcp; } dmu_objset_create_arg_t; -/*ARGSUSED*/ static int dmu_objset_create_check(void *arg, dmu_tx_t *tx) { @@ -1299,6 +1322,7 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx) ASSERT3P(ds->ds_key_mapping, !=, NULL); key_mapping_rele(spa, ds->ds_key_mapping, ds); 
dsl_dataset_sync_done(ds, tx); + dmu_buf_rele(ds->ds_dbuf, ds); } mutex_enter(&ds->ds_lock); @@ -1353,7 +1377,6 @@ typedef struct dmu_objset_clone_arg { proc_t *doca_proc; } dmu_objset_clone_arg_t; -/*ARGSUSED*/ static int dmu_objset_clone_check(void *arg, dmu_tx_t *tx) { @@ -1565,10 +1588,10 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx) } } -/* ARGSUSED */ static void dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) { + (void) abuf; blkptr_t *bp = zio->io_bp; objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; @@ -1596,10 +1619,10 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); } -/* ARGSUSED */ static void dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) { + (void) abuf; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; objset_t *os = arg; @@ -1616,28 +1639,92 @@ dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) kmem_free(bp, sizeof (*bp)); } +typedef struct sync_objset_arg { + zio_t *soa_zio; + objset_t *soa_os; + dmu_tx_t *soa_tx; + kmutex_t soa_mutex; + int soa_count; + taskq_ent_t soa_tq_ent; +} sync_objset_arg_t; + typedef struct sync_dnodes_arg { - multilist_t *sda_list; - int sda_sublist_idx; - multilist_t *sda_newlist; - dmu_tx_t *sda_tx; + multilist_t *sda_list; + int sda_sublist_idx; + multilist_t *sda_newlist; + sync_objset_arg_t *sda_soa; } sync_dnodes_arg_t; +static void sync_meta_dnode_task(void *arg); + static void sync_dnodes_task(void *arg) { sync_dnodes_arg_t *sda = arg; + sync_objset_arg_t *soa = sda->sda_soa; + objset_t *os = soa->soa_os; + uint_t allocator = spa_acq_allocator(os->os_spa); multilist_sublist_t *ms = - multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx); + multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx); - dmu_objset_sync_dnodes(ms, sda->sda_tx); + dmu_objset_sync_dnodes(ms, soa->soa_tx); multilist_sublist_unlock(ms); + spa_rel_allocator(os->os_spa, allocator); kmem_free(sda, sizeof (*sda)); + + mutex_enter(&soa->soa_mutex); + ASSERT(soa->soa_count != 0); + if (--soa->soa_count != 0) { + mutex_exit(&soa->soa_mutex); + return; + } + mutex_exit(&soa->soa_mutex); + + taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq, + sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent); } +/* + * Issue the zio_nowait() for all dirty record zios on the meta dnode, + * then trigger the callback for the zil_sync. This runs once for each + * objset, only after any/all sublists in the objset have been synced. + */ +static void +sync_meta_dnode_task(void *arg) +{ + sync_objset_arg_t *soa = arg; + objset_t *os = soa->soa_os; + dmu_tx_t *tx = soa->soa_tx; + int txgoff = tx->tx_txg & TXG_MASK; + dbuf_dirty_record_t *dr; + + ASSERT0(soa->soa_count); + + list_t *list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; + while ((dr = list_remove_head(list)) != NULL) { + ASSERT0(dr->dr_dbuf->db_level); + zio_nowait(dr->dr_zio); + } + + /* Enable dnode backfill if enough objects have been freed. */ + if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) { + os->os_rescan_dnodes = B_TRUE; + os->os_freed_dnodes = 0; + } + + /* + * Free intent log blocks up to this tx. 
+ */ + zil_sync(os->os_zil, tx); + os->os_phys->os_zil_header = os->os_zil_header; + zio_nowait(soa->soa_zio); + + mutex_destroy(&soa->soa_mutex); + kmem_free(soa, sizeof (*soa)); +} /* called from dsl */ void @@ -1647,8 +1734,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; - list_t *list; - dbuf_dirty_record_t *dr; int num_sublists; multilist_t *ml; blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP); @@ -1693,8 +1778,8 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) } zio = arc_write(pio, os->os_spa, tx->tx_txg, - blkptr_copy, os->os_phys_buf, dmu_os_is_l2cacheable(os), - &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, + blkptr_copy, os->os_phys_buf, B_FALSE, dmu_os_is_l2cacheable(os), + &zp, dmu_objset_write_ready, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* @@ -1735,40 +1820,49 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) offsetof(dnode_t, dn_dirty_link[txgoff])); } + /* + * zio_nowait(zio) is done after any/all sublist and meta dnode + * zios have been nowaited, and the zil_sync() has been performed. + * The soa is freed at the end of sync_meta_dnode_task. + */ + sync_objset_arg_t *soa = kmem_alloc(sizeof (*soa), KM_SLEEP); + soa->soa_zio = zio; + soa->soa_os = os; + soa->soa_tx = tx; + taskq_init_ent(&soa->soa_tq_ent); + mutex_init(&soa->soa_mutex, NULL, MUTEX_DEFAULT, NULL); + ml = &os->os_dirty_dnodes[txgoff]; - num_sublists = multilist_get_num_sublists(ml); + soa->soa_count = num_sublists = multilist_get_num_sublists(ml); + for (int i = 0; i < num_sublists; i++) { if (multilist_sublist_is_empty_idx(ml, i)) - continue; - sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP); - sda->sda_list = ml; - sda->sda_sublist_idx = i; - sda->sda_tx = tx; - (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, - sync_dnodes_task, sda, 0); - /* callback frees sda */ + soa->soa_count--; } - taskq_wait(dmu_objset_pool(os)->dp_sync_taskq); - list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; - while ((dr = list_head(list)) != NULL) { - ASSERT0(dr->dr_dbuf->db_level); - list_remove(list, dr); - zio_nowait(dr->dr_zio); - } - - /* Enable dnode backfill if enough objects have been freed. */ - if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) { - os->os_rescan_dnodes = B_TRUE; - os->os_freed_dnodes = 0; + if (soa->soa_count == 0) { + taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq, + sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent); + } else { + /* + * Sync sublists in parallel. The last to finish + * (i.e., when soa->soa_count reaches zero) must + * dispatch sync_meta_dnode_task. + */ + for (int i = 0; i < num_sublists; i++) { + if (multilist_sublist_is_empty_idx(ml, i)) + continue; + sync_dnodes_arg_t *sda = + kmem_alloc(sizeof (*sda), KM_SLEEP); + sda->sda_list = ml; + sda->sda_sublist_idx = i; + sda->sda_soa = soa; + (void) taskq_dispatch( + dmu_objset_pool(os)->dp_sync_taskq, + sync_dnodes_task, sda, 0); + /* sync_dnodes_task frees sda */ + } } - - /* - * Free intent log blocks up to this tx. 
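The hunk above replaces the old taskq_wait() barrier in dmu_objset_sync() with a reference-count handoff: each sync_dnodes_task() decrements soa_count under soa_mutex, and whichever worker brings the count to zero dispatches sync_meta_dnode_task(), which then issues the meta-dnode zios, runs zil_sync(), and finally nowaits the objset's own zio. A minimal sketch of that handoff, using the names from the diff (illustrative only, not part of the committed change):

static void
sync_worker_done(sync_objset_arg_t *soa)
{
	boolean_t last;

	mutex_enter(&soa->soa_mutex);
	ASSERT(soa->soa_count != 0);
	last = (--soa->soa_count == 0);
	mutex_exit(&soa->soa_mutex);

	if (last) {
		/* Only the thread that saw the count reach zero gets here. */
		taskq_dispatch_ent(dmu_objset_pool(soa->soa_os)->dp_sync_taskq,
		    sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent);
	}
}

The net effect is that no sync thread blocks waiting for the sublist workers; per-objset ordering is preserved purely by the dispatch chain.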
- */ - zil_sync(os->os_zil, tx); - os->os_phys->os_zil_header = os->os_zil_header; - zio_nowait(zio); } boolean_t @@ -1984,8 +2078,8 @@ userquota_updates_task(void *arg) dnode_t *dn; userquota_cache_t cache = { { 0 } }; - multilist_sublist_t *list = - multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); + multilist_sublist_t *list = multilist_sublist_lock_idx( + &os->os_synced_dnodes, uua->uua_sublist_idx); ASSERT(multilist_sublist_head(list) == NULL || dmu_objset_userused_enabled(os)); @@ -2067,8 +2161,8 @@ dnode_rele_task(void *arg) userquota_updates_arg_t *uua = arg; objset_t *os = uua->uua_os; - multilist_sublist_t *list = - multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); + multilist_sublist_t *list = multilist_sublist_lock_idx( + &os->os_synced_dnodes, uua->uua_sublist_idx); dnode_t *dn; while ((dn = multilist_sublist_head(list)) != NULL) { @@ -2343,7 +2437,7 @@ dmu_objset_space_upgrade(objset_t *os) if (err != 0) return (err); - if (issig(JUSTLOOKING) && issig(FORREAL)) + if (issig()) return (SET_ERROR(EINTR)); objerr = dmu_bonus_hold(os, obj, FTAG, &db); diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c index 0ec46bdb4f47..0119191d7920 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_recv.c +++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -27,8 +27,12 @@ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude + * Copyright (c) 2019 Datto Inc. + * Copyright (c) 2022 Axcient. 
*/ +#include <sys/arc.h> +#include <sys/spa_impl.h> #include <sys/dmu.h> #include <sys/dmu_impl.h> #include <sys/dmu_send.h> @@ -64,12 +68,19 @@ #endif #include <sys/zfs_file.h> -int zfs_recv_queue_length = SPA_MAXBLOCKSIZE; -int zfs_recv_queue_ff = 20; -int zfs_recv_write_batch_size = 1024 * 1024; +static uint_t zfs_recv_queue_length = SPA_MAXBLOCKSIZE; +static uint_t zfs_recv_queue_ff = 20; +static uint_t zfs_recv_write_batch_size = 1024 * 1024; +static int zfs_recv_best_effort_corrective = 0; -static char *dmu_recv_tag = "dmu_recv_tag"; -const char *recv_clone_name = "%recv"; +static const void *const dmu_recv_tag = "dmu_recv_tag"; +const char *const recv_clone_name = "%recv"; + +typedef enum { + ORNS_NO, + ORNS_YES, + ORNS_MAYBE +} or_need_sync_t; static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len, void *buf); @@ -102,6 +113,8 @@ struct receive_writer_arg { boolean_t done; int err; + const char *tofs; + boolean_t heal; boolean_t resumable; boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */ boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */ @@ -121,6 +134,10 @@ struct receive_writer_arg { uint8_t or_iv[ZIO_DATA_IV_LEN]; uint8_t or_mac[ZIO_DATA_MAC_LEN]; boolean_t or_byteorder; + zio_t *heal_pio; + + /* Keep track of DRR_FREEOBJECTS right after DRR_OBJECT_RANGE */ + or_need_sync_t or_need_sync; }; typedef struct dmu_recv_begin_arg { @@ -343,9 +360,10 @@ static int recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, uint64_t fromguid, uint64_t featureflags) { - uint64_t val; + uint64_t obj; uint64_t children; int error; + dsl_dataset_t *snap; dsl_pool_t *dp = ds->ds_dir->dd_pool; boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0; boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0; @@ -354,7 +372,7 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, /* Temporary clone name must not exist. */ error = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, - 8, 1, &val); + 8, 1, &obj); if (error != ENOENT) return (error == 0 ? SET_ERROR(EBUSY) : error); @@ -362,12 +380,16 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, if (dsl_dataset_has_resume_receive_state(ds)) return (SET_ERROR(EBUSY)); - /* New snapshot name must not exist. */ + /* New snapshot name must not exist if we're not healing it. */ error = zap_lookup(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, - drba->drba_cookie->drc_tosnap, 8, 1, &val); - if (error != ENOENT) + drba->drba_cookie->drc_tosnap, 8, 1, &obj); + if (drba->drba_cookie->drc_heal) { + if (error != 0) + return (error); + } else if (error != ENOENT) { return (error == 0 ? SET_ERROR(EEXIST) : error); + } /* Must not have children if receiving a ZVOL. */ error = zap_count(dp->dp_meta_objset, @@ -392,8 +414,40 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, if (error != 0) return (error); - if (fromguid != 0) { - dsl_dataset_t *snap; + if (drba->drba_cookie->drc_heal) { + /* Encryption is incompatible with embedded data. */ + if (encrypted && embed) + return (SET_ERROR(EINVAL)); + + /* Healing is not supported when in 'force' mode. */ + if (drba->drba_cookie->drc_force) + return (SET_ERROR(EINVAL)); + + /* Must have keys loaded if doing encrypted non-raw recv. 
*/ + if (encrypted && !raw) { + if (spa_keystore_lookup_key(dp->dp_spa, ds->ds_object, + NULL, NULL) != 0) + return (SET_ERROR(EACCES)); + } + + error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); + if (error != 0) + return (error); + + /* + * When not doing best effort corrective recv healing can only + * be done if the send stream is for the same snapshot as the + * one we are trying to heal. + */ + if (zfs_recv_best_effort_corrective == 0 && + drba->drba_cookie->drc_drrb->drr_toguid != + dsl_dataset_phys(snap)->ds_guid) { + dsl_dataset_rele(snap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + dsl_dataset_rele(snap, FTAG); + } else if (fromguid != 0) { + /* Sanity check the incremental recv */ uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; /* Can't perform a raw receive on top of a non-raw receive */ @@ -459,7 +513,7 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, dsl_dataset_rele(snap, FTAG); } else { - /* if full, then must be forced */ + /* If full and not healing then must be forced. */ if (!drba->drba_cookie->drc_force) return (SET_ERROR(EEXIST)); @@ -597,7 +651,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) if (!(flags & DRR_FLAG_SPILL_BLOCK)) return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING)); } else { - dsflags |= DS_HOLD_FLAG_DECRYPT; + /* + * We support unencrypted datasets below encrypted ones now, + * so add the DS_HOLD_FLAG_DECRYPT flag only if we are dealing + * with a dataset we may encrypt. + */ + if (drba->drba_dcp == NULL || + drba->drba_dcp->cp_crypt != ZIO_CRYPT_OFF) { + dsflags |= DS_HOLD_FLAG_DECRYPT; + } } error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); @@ -618,6 +680,10 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) char buf[ZFS_MAX_DATASET_NAME_LEN]; objset_t *os; + /* healing recv must be done "into" an existing snapshot */ + if (drba->drba_cookie->drc_heal == B_TRUE) + return (SET_ERROR(ENOTSUP)); + /* * If it's a non-clone incremental, we are missing the * target fs, so fail the recv. 
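For readers following the drc_heal branch added to recv_begin_check_existing_impl() above, the new checks amount to four preconditions before the snapshot lookup proceeds (an editorial summary of this hunk, not committed code):

/*
 * A healing receive into an existing snapshot requires that:
 *   - the stream does not combine encryption with embedded data (EINVAL);
 *   - the receive was not requested with 'force' (EINVAL);
 *   - for an encrypted, non-raw stream, the dataset's key is loaded in
 *     the spa keystore (EACCES otherwise);
 *   - unless zfs_recv_best_effort_corrective is set, the stream's
 *     drr_toguid matches the guid of the snapshot being healed (ENOTSUP).
 */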
@@ -799,7 +865,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); if (error == 0) { - /* create temporary clone */ + /* Create temporary clone unless we're doing corrective recv */ dsl_dataset_t *snap = NULL; if (drba->drba_cookie->drc_fromsnapobj != 0) { @@ -807,8 +873,15 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) drba->drba_cookie->drc_fromsnapobj, FTAG, &snap)); ASSERT3P(dcp, ==, NULL); } - dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, - snap, crflags, drba->drba_cred, dcp, tx); + if (drc->drc_heal) { + /* When healing we want to use the provided snapshot */ + VERIFY0(dsl_dataset_snap_lookup(ds, drc->drc_tosnap, + &dsobj)); + } else { + dsobj = dsl_dataset_create_sync(ds->ds_dir, + recv_clone_name, snap, crflags, drba->drba_cred, + dcp, tx); + } if (drba->drba_cookie->drc_fromsnapobj != 0) dsl_dataset_rele(snap, FTAG); dsl_dataset_rele_flags(ds, dsflags, FTAG); @@ -925,7 +998,8 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) */ rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG); if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) && - (featureflags & DMU_BACKUP_FEATURE_RAW) == 0) { + (featureflags & DMU_BACKUP_FEATURE_RAW) == 0 && + !drc->drc_heal) { (void) dmu_objset_create_impl(dp->dp_spa, newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); } @@ -981,13 +1055,24 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) dsflags |= DS_HOLD_FLAG_DECRYPT; } + boolean_t recvexist = B_TRUE; if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) { /* %recv does not exist; continue in tofs */ + recvexist = B_FALSE; error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); if (error != 0) return (error); } + /* + * Resume of full/newfs recv on existing dataset should be done with + * force flag + */ + if (recvexist && drrb->drr_fromguid == 0 && !drc->drc_force) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(ZFS_ERR_RESUME_EXISTS)); + } + /* check that ds is marked inconsistent */ if (!DS_IS_INCONSISTENT(ds)) { dsl_dataset_rele_flags(ds, dsflags, FTAG); @@ -1132,20 +1217,22 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) * succeeds; otherwise we will leak the holds on the datasets. 
*/ int -dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, - boolean_t force, boolean_t resumable, nvlist_t *localprops, - nvlist_t *hidden_args, char *origin, dmu_recv_cookie_t *drc, - zfs_file_t *fp, offset_t *voffp) +dmu_recv_begin(const char *tofs, const char *tosnap, + dmu_replay_record_t *drr_begin, boolean_t force, boolean_t heal, + boolean_t resumable, nvlist_t *localprops, nvlist_t *hidden_args, + const char *origin, dmu_recv_cookie_t *drc, zfs_file_t *fp, + offset_t *voffp) { dmu_recv_begin_arg_t drba = { 0 }; - int err; + int err = 0; - bzero(drc, sizeof (dmu_recv_cookie_t)); + memset(drc, 0, sizeof (dmu_recv_cookie_t)); drc->drc_drr_begin = drr_begin; drc->drc_drrb = &drr_begin->drr_u.drr_begin; drc->drc_tosnap = tosnap; drc->drc_tofs = tofs; drc->drc_force = force; + drc->drc_heal = heal; drc->drc_resumable = resumable; drc->drc_cred = CRED(); drc->drc_proc = curproc; @@ -1169,20 +1256,36 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen; - void *payload = NULL; - if (payloadlen != 0) - payload = kmem_alloc(payloadlen, KM_SLEEP); - err = receive_read_payload_and_next_header(drc, payloadlen, - payload); - if (err != 0) { - kmem_free(payload, payloadlen); - return (err); - } + /* + * Since OpenZFS 2.0.0, we have enforced a 64MB limit in userspace + * configurable via ZFS_SENDRECV_MAX_NVLIST. We enforce 256MB as a hard + * upper limit. Systems with less than 1GB of RAM will see a lower + * limit from `arc_all_memory() / 4`. + */ + if (payloadlen > (MIN((1U << 28), arc_all_memory() / 4))) + return (E2BIG); + + if (payloadlen != 0) { + void *payload = vmem_alloc(payloadlen, KM_SLEEP); + /* + * For compatibility with recursive send streams, we don't do + * this here if the stream could be part of a package. Instead, + * we'll do it in dmu_recv_stream. If we pull the next header + * too early, and it's the END record, we break the `recv_skip` + * logic. + */ + + err = receive_read_payload_and_next_header(drc, payloadlen, + payload); + if (err != 0) { + vmem_free(payload, payloadlen); + return (err); + } err = nvlist_unpack(payload, payloadlen, &drc->drc_begin_nvl, KM_SLEEP); - kmem_free(payload, payloadlen); + vmem_free(payload, payloadlen); if (err != 0) { kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); @@ -1203,7 +1306,6 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync, &drba, 5, ZFS_SPACE_CHECK_NORMAL); } else { - /* * For non-raw, non-incremental, non-resuming receives the * user can specify encryption parameters on the command line @@ -1236,6 +1338,186 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, return (err); } +/* + * Holds data need for corrective recv callback + */ +typedef struct cr_cb_data { + uint64_t size; + zbookmark_phys_t zb; + spa_t *spa; +} cr_cb_data_t; + +static void +corrective_read_done(zio_t *zio) +{ + cr_cb_data_t *data = zio->io_private; + /* Corruption corrected; update error log if needed */ + if (zio->io_error == 0) { + spa_remove_error(data->spa, &data->zb, + BP_GET_LOGICAL_BIRTH(zio->io_bp)); + } + kmem_free(data, sizeof (cr_cb_data_t)); + abd_free(zio->io_abd); +} + +/* + * zio_rewrite the data pointed to by bp with the data from the rrd's abd. 
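The begin-payload guard added to dmu_recv_begin() above caps the nvlist that can accompany the BEGIN record. Restated as a free-standing check (the helper name is invented for illustration; the arithmetic mirrors the diff): the payload is rejected with E2BIG once it exceeds the smaller of 256MB (1U << 28) and one quarter of arc_all_memory(), and the buffer itself now comes from vmem_alloc() to suit allocations of that size.

static int
recv_begin_payload_cap(uint32_t payloadlen)
{
	/* 256MB hard cap; systems with < 1GB of RAM see arc_all_memory()/4 */
	uint64_t cap = MIN(1ULL << 28, arc_all_memory() / 4);

	return (payloadlen > cap ? E2BIG : 0);
}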
+ */ +static int +do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw, + struct receive_record_arg *rrd, blkptr_t *bp) +{ + int err; + zio_t *io; + zbookmark_phys_t zb; + dnode_t *dn; + abd_t *abd = rrd->abd; + zio_cksum_t bp_cksum = bp->blk_cksum; + zio_flag_t flags = ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL; + + if (rwa->raw) + flags |= ZIO_FLAG_RAW; + + err = dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn); + if (err != 0) + return (err); + SET_BOOKMARK(&zb, dmu_objset_id(rwa->os), drrw->drr_object, 0, + dbuf_whichblock(dn, 0, drrw->drr_offset)); + dnode_rele(dn, FTAG); + + if (!rwa->raw && DRR_WRITE_COMPRESSED(drrw)) { + /* Decompress the stream data */ + abd_t *dabd = abd_alloc_linear( + drrw->drr_logical_size, B_FALSE); + err = zio_decompress_data(drrw->drr_compressiontype, + abd, abd_to_buf(dabd), abd_get_size(abd), + abd_get_size(dabd), NULL); + + if (err != 0) { + abd_free(dabd); + return (err); + } + /* Swap in the newly decompressed data into the abd */ + abd_free(abd); + abd = dabd; + } + + if (!rwa->raw && BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { + /* Recompress the data */ + abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp), + B_FALSE); + void *buf = abd_to_buf(cabd); + uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp), + abd, &buf, abd_get_size(abd), + rwa->os->os_complevel); + abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize); + /* Swap in newly compressed data into the abd */ + abd_free(abd); + abd = cabd; + flags |= ZIO_FLAG_RAW_COMPRESS; + } + + /* + * The stream is not encrypted but the data on-disk is. + * We need to re-encrypt the buf using the same + * encryption type, salt, iv, and mac that was used to encrypt + * the block previosly. + */ + if (!rwa->raw && BP_USES_CRYPT(bp)) { + dsl_dataset_t *ds; + dsl_crypto_key_t *dck = NULL; + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + boolean_t no_crypt = B_FALSE; + dsl_pool_t *dp = dmu_objset_pool(rwa->os); + abd_t *eabd = abd_alloc_linear(BP_GET_PSIZE(bp), B_FALSE); + + zio_crypt_decode_params_bp(bp, salt, iv); + zio_crypt_decode_mac_bp(bp, mac); + + dsl_pool_config_enter(dp, FTAG); + err = dsl_dataset_hold_flags(dp, rwa->tofs, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + if (err != 0) { + dsl_pool_config_exit(dp, FTAG); + abd_free(eabd); + return (SET_ERROR(EACCES)); + } + + /* Look up the key from the spa's keystore */ + err = spa_keystore_lookup_key(rwa->os->os_spa, + zb.zb_objset, FTAG, &dck); + if (err != 0) { + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, + FTAG); + dsl_pool_config_exit(dp, FTAG); + abd_free(eabd); + return (SET_ERROR(EACCES)); + } + + err = zio_do_crypt_abd(B_TRUE, &dck->dck_key, + BP_GET_TYPE(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, + mac, abd_get_size(abd), abd, eabd, &no_crypt); + + spa_keystore_dsl_key_rele(rwa->os->os_spa, dck, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + dsl_pool_config_exit(dp, FTAG); + + ASSERT0(no_crypt); + if (err != 0) { + abd_free(eabd); + return (err); + } + /* Swap in the newly encrypted data into the abd */ + abd_free(abd); + abd = eabd; + + /* + * We want to prevent zio_rewrite() from trying to + * encrypt the data again + */ + flags |= ZIO_FLAG_RAW_ENCRYPT; + } + rrd->abd = abd; + + io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_LOGICAL_BIRTH(bp), bp, + abd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags, + &zb); + + ASSERT(abd_get_size(abd) == BP_GET_LSIZE(bp) || + abd_get_size(abd) == BP_GET_PSIZE(bp)); + + /* compute new bp 
checksum value and make sure it matches the old one */ + zio_checksum_compute(io, BP_GET_CHECKSUM(bp), abd, abd_get_size(abd)); + if (!ZIO_CHECKSUM_EQUAL(bp_cksum, io->io_bp->blk_cksum)) { + zio_destroy(io); + if (zfs_recv_best_effort_corrective != 0) + return (0); + return (SET_ERROR(ECKSUM)); + } + + /* Correct the corruption in place */ + err = zio_wait(io); + if (err == 0) { + cr_cb_data_t *cb_data = + kmem_alloc(sizeof (cr_cb_data_t), KM_SLEEP); + cb_data->spa = rwa->os->os_spa; + cb_data->size = drrw->drr_logical_size; + cb_data->zb = zb; + /* Test if healing worked by re-reading the bp */ + err = zio_wait(zio_read(rwa->heal_pio, rwa->os->os_spa, bp, + abd_alloc_for_io(drrw->drr_logical_size, B_FALSE), + drrw->drr_logical_size, corrective_read_done, + cb_data, ZIO_PRIORITY_ASYNC_READ, flags, NULL)); + } + if (err != 0 && zfs_recv_best_effort_corrective != 0) + err = 0; + + return (err); +} + static int receive_read(dmu_recv_cookie_t *drc, int len, void *buf) { @@ -1249,11 +1531,11 @@ receive_read(dmu_recv_cookie_t *drc, int len, void *buf) (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) != 0); while (done < len) { - ssize_t resid; + ssize_t resid = len - done; zfs_file_t *fp = drc->drc_fp; int err = zfs_file_read(fp, (char *)buf + done, len - done, &resid); - if (resid == len - done) { + if (err == 0 && resid == len - done) { /* * Note: ECKSUM or ZFS_ERR_STREAM_TRUNCATED indicates * that the receive was interrupted and can @@ -1516,17 +1798,19 @@ receive_handle_existing_object(const struct receive_writer_arg *rwa, } /* - * The dmu does not currently support decreasing nlevels - * or changing the number of dnode slots on an object. For - * non-raw sends, this does not matter and the new object - * can just use the previous one's nlevels. For raw sends, - * however, the structure of the received dnode (including - * nlevels and dnode slots) must match that of the send - * side. Therefore, instead of using dmu_object_reclaim(), - * we must free the object completely and call - * dmu_object_claim_dnsize() instead. + * The dmu does not currently support decreasing nlevels or changing + * indirect block size if there is already one, same as changing the + * number of of dnode slots on an object. For non-raw sends this + * does not matter and the new object can just use the previous one's + * parameters. For raw sends, however, the structure of the received + * dnode (including indirects and dnode slots) must match that of the + * send side. Therefore, instead of using dmu_object_reclaim(), we + * must free the object completely and call dmu_object_claim_dnsize() + * instead. 
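do_corrective_recv() above is easier to follow as a sequence of normalization steps, collected here as an outline (an editorial summary of the hunk, not code from the commit):

/*
 * 1. Normalize the stream payload to the on-disk form of the damaged
 *    block: decompress it if the stream was compressed but the block is
 *    not, recompress it with the block's own algorithm if it is, and
 *    re-encrypt it with the salt/IV/MAC decoded from the bp when the
 *    stream is plaintext but the block is encrypted.
 * 2. Build a zio_rewrite() of the same bp and compute the checksum of
 *    the normalized payload; if it does not equal bp->blk_cksum the
 *    rewrite is destroyed and ECKSUM is returned (or 0 when
 *    zfs_recv_best_effort_corrective is set), so data that does not
 *    reproduce the original bytes can never overwrite the block.
 * 3. zio_wait() the rewrite, then re-read the bp under the writer's
 *    heal_pio; on success corrective_read_done() removes the entry from
 *    the spa error log.
 */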
*/ - if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) || + if ((rwa->raw && ((doi->doi_indirection > 1 && + indblksz != doi->doi_metadata_block_size) || + drro->drr_nlevels < doi->doi_indirection)) || dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) { err = dmu_free_long_object(rwa->os, drro->drr_object); if (err != 0) @@ -1634,6 +1918,8 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, if (err == 0) { err = receive_handle_existing_object(rwa, drro, &doi, data, &object_to_hold, &new_blksz); + if (err != 0) + return (err); } else if (err == EEXIST) { /* * The object requested is currently an interior slot of a @@ -1650,10 +1936,22 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, /* object was freed and we are about to allocate a new one */ object_to_hold = DMU_NEW_OBJECT; } else { + /* + * If the only record in this range so far was DRR_FREEOBJECTS + * with at least one actually freed object, it's possible that + * the block will now be converted to a hole. We need to wait + * for the txg to sync to prevent races. + */ + if (rwa->or_need_sync == ORNS_YES) + txg_wait_synced(dmu_objset_pool(rwa->os), 0); + /* object is free and we are about to allocate a new one */ object_to_hold = DMU_NEW_OBJECT; } + /* Only relevant for the first object in the range */ + rwa->or_need_sync = ORNS_NO; + /* * If this is a multi-slot dnode there is a chance that this * object will expand into a slot that is already used by @@ -1800,7 +2098,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); - bcopy(data, db->db_data, DRR_OBJECT_PAYLOAD_SIZE(drro)); + memcpy(db->db_data, data, DRR_OBJECT_PAYLOAD_SIZE(drro)); /* * Raw bonus buffers have their byteorder determined by the @@ -1815,12 +2113,21 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, dmu_buf_rele(db, FTAG); dnode_rele(dn, FTAG); } + + /* + * If the receive fails, we want the resume stream to start with the + * same record that we last successfully received. There is no way to + * request resume from the object record, but we can benefit from the + * fact that sender always sends object record before anything else, + * after which it will "resend" data at offset 0 and resume normally. 
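Together with the receive_freeobjects() and receive_object_range() hunks later in this file, the new or_need_sync field forms a small per-object-range state machine; its transitions, collected in one place (an editorial summary, not committed code):

/*
 * receive_object_range():  or_need_sync = ORNS_MAYBE;
 * receive_freeobjects():   if (or_need_sync == ORNS_MAYBE)
 *                              or_need_sync = ORNS_YES;   (an object was freed)
 * receive_object(), free-slot case:
 *                          if (or_need_sync == ORNS_YES)
 *                              txg_wait_synced(pool, 0);  (dnode block may be
 *                                                          turning into a hole)
 *                          or_need_sync = ORNS_NO;  (only the first object in
 *                                                    the range matters)
 */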
+ */ + save_resume_state(rwa, drro->drr_object, 0, tx); + dmu_tx_commit(tx); return (0); } -/* ARGSUSED */ noinline static int receive_freeobjects(struct receive_writer_arg *rwa, struct drr_freeobjects *drrfo) @@ -1848,6 +2155,9 @@ receive_freeobjects(struct receive_writer_arg *rwa, if (err != 0) return (err); + + if (rwa->or_need_sync == ORNS_MAYBE) + rwa->or_need_sync = ORNS_YES; } if (next_err != ESRCH) return (next_err); @@ -1931,10 +2241,10 @@ flush_write_batch_impl(struct receive_writer_arg *rwa) if (err == 0) abd_free(abd); } else { - zio_prop_t zp; + zio_prop_t zp = {0}; dmu_write_policy(rwa->os, dn, 0, 0, &zp); - enum zio_flag zio_flags = 0; + zio_flag_t zio_flags = 0; if (rwa->raw) { zp.zp_encrypt = B_TRUE; @@ -1942,11 +2252,11 @@ flush_write_batch_impl(struct receive_writer_arg *rwa) zp.zp_byteorder = ZFS_HOST_BYTEORDER ^ !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^ rwa->byteswap; - bcopy(drrw->drr_salt, zp.zp_salt, + memcpy(zp.zp_salt, drrw->drr_salt, ZIO_DATA_SALT_LEN); - bcopy(drrw->drr_iv, zp.zp_iv, + memcpy(zp.zp_iv, drrw->drr_iv, ZIO_DATA_IV_LEN); - bcopy(drrw->drr_mac, zp.zp_mac, + memcpy(zp.zp_mac, drrw->drr_mac, ZIO_DATA_MAC_LEN); if (DMU_OT_IS_ENCRYPTED(zp.zp_type)) { zp.zp_nopwrite = B_FALSE; @@ -2043,6 +2353,53 @@ receive_process_write_record(struct receive_writer_arg *rwa, !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); + if (rwa->heal) { + blkptr_t *bp; + dmu_buf_t *dbp; + int flags = DB_RF_CANFAIL; + + if (rwa->raw) + flags |= DB_RF_NO_DECRYPT; + + if (rwa->byteswap) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrw->drr_type); + dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(rrd->abd), + DRR_WRITE_PAYLOAD_SIZE(drrw)); + } + + err = dmu_buf_hold_noread(rwa->os, drrw->drr_object, + drrw->drr_offset, FTAG, &dbp); + if (err != 0) + return (err); + + /* Try to read the object to see if it needs healing */ + err = dbuf_read((dmu_buf_impl_t *)dbp, NULL, flags); + /* + * We only try to heal when dbuf_read() returns a ECKSUMs. + * Other errors (even EIO) get returned to caller. + * EIO indicates that the device is not present/accessible, + * so writing to it will likely fail. + * If the block is healthy, we don't want to overwrite it + * unnecessarily. + */ + if (err != ECKSUM) { + dmu_buf_rele(dbp, FTAG); + return (err); + } + /* Make sure the on-disk block and recv record sizes match */ + if (drrw->drr_logical_size != dbp->db_size) { + err = ENOTSUP; + dmu_buf_rele(dbp, FTAG); + return (err); + } + /* Get the block pointer for the corrupted block */ + bp = dmu_buf_get_blkptr(dbp); + err = do_corrective_recv(rwa, drrw, rrd, bp); + dmu_buf_rele(dbp, FTAG); + return (err); + } + /* * For resuming to work, records must be in increasing order * by (object, offset). @@ -2183,7 +2540,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, * size of the provided arc_buf_t. 
*/ if (db_spill->db_size != drrs->drr_length) { - dmu_buf_will_fill(db_spill, tx); + dmu_buf_will_fill(db_spill, tx, B_FALSE); VERIFY0(dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); } @@ -2211,7 +2568,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, } } - bcopy(abd_to_buf(abd), abuf->b_data, DRR_SPILL_PAYLOAD_SIZE(drrs)); + memcpy(abuf->b_data, abd_to_buf(abd), DRR_SPILL_PAYLOAD_SIZE(drrs)); abd_free(abd); dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx); @@ -2222,7 +2579,6 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, return (0); } -/* ARGSUSED */ noinline static int receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) { @@ -2285,11 +2641,13 @@ receive_object_range(struct receive_writer_arg *rwa, rwa->or_crypt_params_present = B_TRUE; rwa->or_firstobj = drror->drr_firstobj; rwa->or_numslots = drror->drr_numslots; - bcopy(drror->drr_salt, rwa->or_salt, ZIO_DATA_SALT_LEN); - bcopy(drror->drr_iv, rwa->or_iv, ZIO_DATA_IV_LEN); - bcopy(drror->drr_mac, rwa->or_mac, ZIO_DATA_MAC_LEN); + memcpy(rwa->or_salt, drror->drr_salt, ZIO_DATA_SALT_LEN); + memcpy(rwa->or_iv, drror->drr_iv, ZIO_DATA_IV_LEN); + memcpy(rwa->or_mac, drror->drr_mac, ZIO_DATA_MAC_LEN); rwa->or_byteorder = byteorder; + rwa->or_need_sync = ORNS_MAYBE; + return (0); } @@ -2297,7 +2655,6 @@ receive_object_range(struct receive_writer_arg *rwa, * Until we have the ability to redact large ranges of data efficiently, we * process these records as frees. */ -/* ARGSUSED */ noinline static int receive_redact(struct receive_writer_arg *rwa, struct drr_redact *drrr) { @@ -2337,7 +2694,8 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) rrw_exit(&ds->ds_bp_rwlock, FTAG); dsl_dataset_name(ds, name); dsl_dataset_disown(ds, dsflags, dmu_recv_tag); - (void) dsl_destroy_head(name); + if (!drc->drc_heal) + (void) dsl_destroy_head(name); } } @@ -2446,7 +2804,6 @@ receive_read_payload_and_next_header(dmu_recv_cookie_t *drc, int len, void *buf) * numbers in the ignore list. In practice, we receive up to 32 object records * before receiving write records, so the list can have up to 32 nodes in it. */ -/* ARGSUSED */ static void receive_read_prefetch(dmu_recv_cookie_t *drc, uint64_t object, uint64_t offset, uint64_t length) @@ -2699,7 +3056,19 @@ receive_process_record(struct receive_writer_arg *rwa, ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); rwa->bytes_read = rrd->bytes_read; - if (rrd->header.drr_type != DRR_WRITE) { + /* We can only heal write records; other ones get ignored */ + if (rwa->heal && rrd->header.drr_type != DRR_WRITE) { + if (rrd->abd != NULL) { + abd_free(rrd->abd); + rrd->abd = NULL; + } else if (rrd->payload != NULL) { + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + } + return (0); + } + + if (!rwa->heal && rrd->header.drr_type != DRR_WRITE) { err = flush_write_batch(rwa); if (err != 0) { if (rrd->abd != NULL) { @@ -2734,9 +3103,16 @@ receive_process_record(struct receive_writer_arg *rwa, case DRR_WRITE: { err = receive_process_write_record(rwa, rrd); - if (err != EAGAIN) { + if (rwa->heal) { + /* + * If healing - always free the abd after processing + */ + abd_free(rrd->abd); + rrd->abd = NULL; + } else if (err != EAGAIN) { /* - * On success, receive_process_write_record() returns + * On success, a non-healing + * receive_process_write_record() returns * EAGAIN to indicate that we do not want to free * the rrd or arc_buf. 
*/ @@ -2798,7 +3174,7 @@ receive_process_record(struct receive_writer_arg *rwa, * dmu_recv_stream's worker thread; pull records off the queue, and then call * receive_process_record When we're done, signal the main thread and exit. */ -static void +static __attribute__((noreturn)) void receive_writer_thread(void *arg) { struct receive_writer_arg *rwa = arg; @@ -2827,8 +3203,9 @@ receive_writer_thread(void *arg) * EAGAIN indicates that this record has been saved (on * raw->write_batch), and will be used again, so we don't * free it. + * When healing data we always need to free the record. */ - if (err != EAGAIN) { + if (err != EAGAIN || rwa->heal) { if (rwa->err == 0) rwa->err = err; kmem_free(rrd, sizeof (*rrd)); @@ -2836,10 +3213,13 @@ receive_writer_thread(void *arg) } kmem_free(rrd, sizeof (*rrd)); - int err = flush_write_batch(rwa); - if (rwa->err == 0) - rwa->err = err; - + if (rwa->heal) { + zio_wait(rwa->heal_pio); + } else { + int err = flush_write_batch(rwa); + if (rwa->err == 0) + rwa->err = err; + } mutex_enter(&rwa->mutex); rwa->done = B_TRUE; cv_signal(&rwa->cv); @@ -2923,17 +3303,19 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) if (err != 0) goto out; - /* - * If this is a new dataset we set the key immediately. - * Otherwise we don't want to change the key until we - * are sure the rest of the receive succeeded so we stash - * the keynvl away until then. - */ - err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa), - drc->drc_ds->ds_object, drc->drc_fromsnapobj, - drc->drc_drrb->drr_type, keynvl, drc->drc_newfs); - if (err != 0) - goto out; + if (!drc->drc_heal) { + /* + * If this is a new dataset we set the key immediately. + * Otherwise we don't want to change the key until we + * are sure the rest of the receive succeeded so we + * stash the keynvl away until then. + */ + err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa), + drc->drc_ds->ds_object, drc->drc_fromsnapobj, + drc->drc_drrb->drr_type, keynvl, drc->drc_newfs); + if (err != 0) + goto out; + } /* see comment in dmu_recv_end_sync() */ drc->drc_ivset_guid = 0; @@ -2951,6 +3333,17 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) } /* + * For compatibility with recursive send streams, we do this here, + * rather than in dmu_recv_begin. If we pull the next header too + * early, and it's the END record, we break the `recv_skip` logic. + */ + if (drc->drc_drr_begin->drr_payloadlen == 0) { + err = receive_read_payload_and_next_header(drc, 0, NULL); + if (err != 0) + goto out; + } + + /* * If we failed before this point we will clean up any new resume * state that was created. Now that we've gotten past the initial * checks we are ok to retain that resume state. @@ -2964,11 +3357,17 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL); rwa->os = drc->drc_os; rwa->byteswap = drc->drc_byteswap; + rwa->heal = drc->drc_heal; + rwa->tofs = drc->drc_tofs; rwa->resumable = drc->drc_resumable; rwa->raw = drc->drc_raw; rwa->spill = drc->drc_spill; rwa->full = (drc->drc_drr_begin->drr_u.drr_begin.drr_fromguid == 0); rwa->os->os_raw_receive = drc->drc_raw; + if (drc->drc_heal) { + rwa->heal_pio = zio_root(drc->drc_os->os_spa, NULL, NULL, + ZIO_FLAG_GODFATHER); + } list_create(&rwa->write_batch, sizeof (struct receive_record_arg), offsetof(struct receive_record_arg, node.bqn_node)); @@ -2990,7 +3389,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) * stream, then we free drc->drc_rrd and exit. 
*/ while (rwa->err == 0) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { + if (issig()) { err = SET_ERROR(EINTR); break; } @@ -3104,7 +3503,9 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx) ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); - if (!drc->drc_newfs) { + if (drc->drc_heal) { + error = 0; + } else if (!drc->drc_newfs) { dsl_dataset_t *origin_head; error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); @@ -3180,13 +3581,18 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) dmu_recv_cookie_t *drc = arg; dsl_pool_t *dp = dmu_tx_pool(tx); boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0; - uint64_t newsnapobj; + uint64_t newsnapobj = 0; spa_history_log_internal_ds(drc->drc_ds, "finish receiving", tx, "snap=%s", drc->drc_tosnap); drc->drc_ds->ds_objset->os_raw_receive = B_FALSE; - if (!drc->drc_newfs) { + if (drc->drc_heal) { + if (drc->drc_keynvl != NULL) { + nvlist_free(drc->drc_keynvl); + drc->drc_keynvl = NULL; + } + } else if (!drc->drc_newfs) { dsl_dataset_t *origin_head; VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, @@ -3300,7 +3706,7 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) * tunable is set, in which case we will leave the newly-generated * value. */ - if (drc->drc_raw && drc->drc_ivset_guid != 0) { + if (!drc->drc_heal && drc->drc_raw && drc->drc_ivset_guid != 0) { dmu_object_zapify(dp->dp_meta_objset, newsnapobj, DMU_OT_DSL_DATASET, tx); VERIFY0(zap_update(dp->dp_meta_objset, newsnapobj, @@ -3367,7 +3773,7 @@ dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) if (error != 0) { dmu_recv_cleanup_ds(drc); nvlist_free(drc->drc_keynvl); - } else { + } else if (!drc->drc_heal) { if (drc->drc_newfs) { zvol_create_minor(drc->drc_tofs); } @@ -3389,13 +3795,15 @@ dmu_objset_is_receiving(objset_t *os) os->os_dsl_dataset->ds_owner == dmu_recv_tag); } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_length, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_length, UINT, ZMOD_RW, "Maximum receive queue length"); -ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, UINT, ZMOD_RW, "Receive queue fill fraction"); -ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, UINT, ZMOD_RW, "Maximum amount of writes to batch into one transaction"); + +ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, best_effort_corrective, INT, ZMOD_RW, + "Ignore errors during corrective receive"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dmu_redact.c b/sys/contrib/openzfs/module/zfs/dmu_redact.c index fdbdf7d6e868..1feba0ba83de 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_redact.c +++ b/sys/contrib/openzfs/module/zfs/dmu_redact.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -40,13 +40,14 @@ * This controls the number of entries in the buffer the redaction_list_update * synctask uses to buffer writes to the redaction list. */ -int redact_sync_bufsize = 1024; +static const int redact_sync_bufsize = 1024; /* * Controls how often to update the redaction list when creating a redaction * list. 
*/ -uint64_t redaction_list_update_interval_ns = 1000 * 1000 * 1000ULL; /* NS */ +static const uint64_t redaction_list_update_interval_ns = + 1000 * 1000 * 1000ULL; /* 1s */ /* * This tunable controls the length of the queues that zfs redact worker threads @@ -56,7 +57,7 @@ uint64_t redaction_list_update_interval_ns = 1000 * 1000 * 1000ULL; /* NS */ * available IO resources, or the queues are consuming too much memory, this * variable may need to be decreased. */ -int zfs_redact_queue_length = 1024 * 1024; +static const int zfs_redact_queue_length = 1024 * 1024; /* * These tunables control the fill fraction of the queues by zfs redact. The @@ -65,7 +66,7 @@ int zfs_redact_queue_length = 1024 * 1024; * should be tuned down. If the queues empty before the signalled thread can * catch up, then these should be tuned up. */ -uint64_t zfs_redact_queue_ff = 20; +static const uint64_t zfs_redact_queue_ff = 20; struct redact_record { bqueue_node_t ln; @@ -141,7 +142,7 @@ record_merge_enqueue(bqueue_t *q, struct redact_record **build, { if (new->eos_marker) { if (*build != NULL) - bqueue_enqueue(q, *build, sizeof (*build)); + bqueue_enqueue(q, *build, sizeof (**build)); bqueue_enqueue_flush(q, new, sizeof (*new)); return; } @@ -249,11 +250,11 @@ zfs_get_deleteq(objset_t *os) * Third, if there is a deleted object, we need to create a redaction record for * all of the blocks in that object. */ -/*ARGSUSED*/ static int redact_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) { + (void) spa, (void) zilog; struct redact_thread_arg *rta = arg; struct redact_record *record; @@ -350,7 +351,7 @@ redact_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, return (0); } -static void +static __attribute__((noreturn)) void redact_traverse_thread(void *arg) { struct redact_thread_arg *rt_arg = arg; @@ -745,10 +746,8 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads, bqueue_enqueue(q, record, sizeof (*record)); return (0); } - if (num_threads > 0) { - redact_nodes = kmem_zalloc(num_threads * - sizeof (*redact_nodes), KM_SLEEP); - } + redact_nodes = vmem_zalloc(num_threads * + sizeof (*redact_nodes), KM_SLEEP); avl_create(&start_tree, redact_node_compare_start, sizeof (struct redact_node), @@ -821,9 +820,9 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads, avl_destroy(&start_tree); avl_destroy(&end_tree); - kmem_free(redact_nodes, num_threads * sizeof (*redact_nodes)); + vmem_free(redact_nodes, num_threads * sizeof (*redact_nodes)); if (current_record != NULL) - bqueue_enqueue(q, current_record, sizeof (current_record)); + bqueue_enqueue(q, current_record, sizeof (*current_record)); return (err); } @@ -836,7 +835,7 @@ struct redact_merge_thread_arg { int error_code; }; -static void +static __attribute__((noreturn)) void redact_merge_thread(void *arg) { struct redact_merge_thread_arg *rmta = arg; @@ -854,7 +853,7 @@ redact_merge_thread(void *arg) * object number. 
*/ static int -hold_next_object(objset_t *os, struct redact_record *rec, void *tag, +hold_next_object(objset_t *os, struct redact_record *rec, const void *tag, uint64_t *object, dnode_t **dn) { int err = 0; @@ -913,7 +912,7 @@ perform_redaction(objset_t *os, redaction_list_t *rl, object = prev_obj; } while (err == 0 && object <= rec->end_object) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { + if (issig()) { err = EINTR; break; } @@ -1031,7 +1030,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, numsnaps = fnvlist_num_pairs(redactnvl); if (numsnaps > 0) - args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP); + args = vmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP); nvpair_t *pair = NULL; for (int i = 0; i < numsnaps; i++) { @@ -1080,7 +1079,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, kmem_free(newredactbook, sizeof (char) * ZFS_MAX_DATASET_NAME_LEN); if (args != NULL) - kmem_free(args, numsnaps * sizeof (*args)); + vmem_free(args, numsnaps * sizeof (*args)); return (SET_ERROR(ENAMETOOLONG)); } err = dsl_bookmark_lookup(dp, newredactbook, NULL, &bookmark); @@ -1120,7 +1119,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, } else { uint64_t *guids = NULL; if (numsnaps > 0) { - guids = kmem_zalloc(numsnaps * sizeof (uint64_t), + guids = vmem_zalloc(numsnaps * sizeof (uint64_t), KM_SLEEP); } for (int i = 0; i < numsnaps; i++) { @@ -1132,10 +1131,9 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, dp = NULL; err = dsl_bookmark_create_redacted(newredactbook, snapname, numsnaps, guids, FTAG, &new_rl); - kmem_free(guids, numsnaps * sizeof (uint64_t)); - if (err != 0) { + vmem_free(guids, numsnaps * sizeof (uint64_t)); + if (err != 0) goto out; - } } for (int i = 0; i < numsnaps; i++) { @@ -1189,7 +1187,7 @@ out: } if (args != NULL) - kmem_free(args, numsnaps * sizeof (*args)); + vmem_free(args, numsnaps * sizeof (*args)); if (dp != NULL) dsl_pool_rele(dp, FTAG); if (ds != NULL) { diff --git a/sys/contrib/openzfs/module/zfs/dmu_send.c b/sys/contrib/openzfs/module/zfs/dmu_send.c index 0658e13c2d25..cb2b62fed313 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_send.c +++ b/sys/contrib/openzfs/module/zfs/dmu_send.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -67,7 +67,7 @@ #endif /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ -int zfs_send_corrupt_data = B_FALSE; +static int zfs_send_corrupt_data = B_FALSE; /* * This tunable controls the amount of data (measured in bytes) that will be * prefetched by zfs send. If the main thread is blocking on reads that haven't @@ -75,7 +75,7 @@ int zfs_send_corrupt_data = B_FALSE; * thread is issuing new reads because the prefetches have fallen out of the * cache, this may need to be decreased. */ -int zfs_send_queue_length = SPA_MAXBLOCKSIZE; +static uint_t zfs_send_queue_length = SPA_MAXBLOCKSIZE; /* * This tunable controls the length of the queues that zfs send worker threads * use to communicate. If the send_main_thread is blocking on these queues, @@ -83,7 +83,7 @@ int zfs_send_queue_length = SPA_MAXBLOCKSIZE; * at the start of a send as these threads consume all the available IO * resources, this variable may need to be decreased. 
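The send-side tunables converted above only need file scope because ZFS_MODULE_PARAM(), at the end of this file, is what exposes them as module parameters; the int-to-uint_t change matches the UINT parameter class used there. A representative pairing, with both declarations taken from this diff:

static uint_t zfs_send_queue_length = SPA_MAXBLOCKSIZE;

ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_length, UINT, ZMOD_RW,
	"Maximum send queue length");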
*/ -int zfs_send_no_prefetch_queue_length = 1024 * 1024; +static uint_t zfs_send_no_prefetch_queue_length = 1024 * 1024; /* * These tunables control the fill fraction of the queues by zfs send. The fill * fraction controls the frequency with which threads have to be cv_signaled. @@ -91,19 +91,19 @@ int zfs_send_no_prefetch_queue_length = 1024 * 1024; * down. If the queues empty before the signalled thread can catch up, then * these should be tuned up. */ -int zfs_send_queue_ff = 20; -int zfs_send_no_prefetch_queue_ff = 20; +static uint_t zfs_send_queue_ff = 20; +static uint_t zfs_send_no_prefetch_queue_ff = 20; /* * Use this to override the recordsize calculation for fast zfs send estimates. */ -int zfs_override_estimate_recordsize = 0; +static uint_t zfs_override_estimate_recordsize = 0; /* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */ -int zfs_send_set_freerecords_bit = B_TRUE; +static const boolean_t zfs_send_set_freerecords_bit = B_TRUE; /* Set this tunable to FALSE is disable sending unmodified spill blocks. */ -int zfs_send_unmodified_spill_blocks = B_TRUE; +static int zfs_send_unmodified_spill_blocks = B_TRUE; static inline boolean_t overflow_multiply(uint64_t a, uint64_t b, uint64_t *c) @@ -165,6 +165,7 @@ struct send_range { kmutex_t lock; kcondvar_t cv; boolean_t io_outstanding; + boolean_t io_compressed; int io_err; } data; struct srh { @@ -378,7 +379,7 @@ dump_free(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, } } /* create a FREE record and make it pending */ - bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_FREE; drrf->drr_object = object; drrf->drr_offset = offset; @@ -437,7 +438,7 @@ dump_redact(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, } } /* create a REDACT record and make it pending */ - bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_REDACT; drrr->drr_object = object; drrr->drr_offset = offset; @@ -450,7 +451,8 @@ dump_redact(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, static int dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object, - uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data) + uint64_t offset, int lsize, int psize, const blkptr_t *bp, + boolean_t io_compressed, void *data) { uint64_t payload_size; boolean_t raw = (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW); @@ -478,7 +480,7 @@ dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object, dscp->dsc_pending_op = PENDING_NONE; } /* write a WRITE record */ - bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_WRITE; drrw->drr_object = object; drrw->drr_type = type; @@ -487,7 +489,11 @@ dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object, drrw->drr_logical_size = lsize; /* only set the compression fields if the buf is compressed or raw */ - if (raw || lsize != psize) { + boolean_t compressed = + (bp != NULL ? 
BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && + io_compressed : lsize != psize); + if (raw || compressed) { + ASSERT(bp != NULL); ASSERT(raw || dscp->dsc_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); ASSERT(!BP_IS_EMBEDDED(bp)); @@ -566,7 +572,7 @@ dump_write_embedded(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, ASSERT(BP_IS_EMBEDDED(bp)); - bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_WRITE_EMBEDDED; drrw->drr_object = object; drrw->drr_offset = offset; @@ -579,7 +585,13 @@ dump_write_embedded(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, decode_embedded_bp_compressed(bp, buf); - if (dump_record(dscp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) + uint32_t psize = drrw->drr_psize; + uint32_t rsize = P2ROUNDUP(psize, 8); + + if (psize != rsize) + memset(buf + psize, 0, rsize - psize); + + if (dump_record(dscp, buf, rsize) != 0) return (SET_ERROR(EINTR)); return (0); } @@ -599,7 +611,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, } /* write a SPILL record */ - bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_SPILL; drrs->drr_object = object; drrs->drr_length = blksz; @@ -607,7 +619,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, /* See comment in dump_dnode() for full details */ if (zfs_send_unmodified_spill_blocks && - (bp->blk_birth <= dscp->dsc_fromtxg)) { + (BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) { drrs->drr_flags |= DRR_SPILL_UNMODIFIED; } @@ -681,7 +693,7 @@ dump_freeobjects(dmu_send_cookie_t *dscp, uint64_t firstobj, uint64_t numobjs) } /* write a FREEOBJECTS record */ - bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_FREEOBJECTS; drrfo->drr_firstobj = firstobj; drrfo->drr_numobjs = numobjs; @@ -722,7 +734,7 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, } /* write an OBJECT record */ - bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_OBJECT; drro->drr_object = object; drro->drr_type = dnp->dn_type; @@ -758,6 +770,8 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, * to send it. 
*/ if (bonuslen != 0) { + if (drro->drr_bonuslen > DN_MAX_BONUS_LEN(dnp)) + return (SET_ERROR(EINVAL)); drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp); bonuslen = drro->drr_raw_bonuslen; } @@ -790,11 +804,11 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, */ if (zfs_send_unmodified_spill_blocks && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && - (DN_SPILL_BLKPTR(dnp)->blk_birth <= dscp->dsc_fromtxg)) { + (BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) { struct send_range record; blkptr_t *bp = DN_SPILL_BLKPTR(dnp); - bzero(&record, sizeof (struct send_range)); + memset(&record, 0, sizeof (struct send_range)); record.type = DATA; record.object = object; record.eos_marker = B_FALSE; @@ -834,7 +848,7 @@ dump_object_range(dmu_send_cookie_t *dscp, const blkptr_t *bp, dscp->dsc_pending_op = PENDING_NONE; } - bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_OBJECT_RANGE; drror->drr_firstobj = firstobj; drror->drr_numslots = numslots; @@ -927,7 +941,7 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); if (BP_GET_TYPE(bp) == DMU_OT_SA) { arc_flags_t aflags = ARC_FLAG_WAIT; - enum zio_flag zioflags = ZIO_FLAG_CANFAIL; + zio_flag_t zioflags = ZIO_FLAG_CANFAIL; if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) { ASSERT(BP_IS_PROTECTED(bp)); @@ -1014,7 +1028,8 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) int n = MIN(srdp->datablksz, SPA_OLD_MAXBLOCKSIZE); err = dmu_dump_write(dscp, srdp->obj_type, - range->object, offset, n, n, NULL, data); + range->object, offset, n, n, NULL, B_FALSE, + data); offset += n; /* * When doing dry run, data==NULL is used as a @@ -1028,7 +1043,8 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) } else { err = dmu_dump_write(dscp, srdp->obj_type, range->object, offset, - srdp->datablksz, srdp->datasz, bp, data); + srdp->datablksz, srdp->datasz, bp, + srdp->io_compressed, data); } return (err); } @@ -1081,6 +1097,7 @@ range_alloc(enum type type, uint64_t object, uint64_t start_blkid, cv_init(&range->sru.data.cv, NULL, CV_DEFAULT, NULL); range->sru.data.io_outstanding = 0; range->sru.data.io_err = 0; + range->sru.data.io_compressed = B_FALSE; } return (range); } @@ -1089,11 +1106,11 @@ range_alloc(enum type type, uint64_t object, uint64_t start_blkid, * This is the callback function to traverse_dataset that acts as a worker * thread for dmu_send_impl. 
*/ -/*ARGSUSED*/ static int send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) { + (void) zilog; struct send_thread_arg *sta = arg; struct send_range *record; @@ -1106,9 +1123,7 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, */ if (sta->os->os_encrypted && !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) { - spa_log_error(spa, zb); - zfs_panic_recover("unencrypted block in encrypted " - "object set %llu", dmu_objset_id(sta->os)); + spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); return (SET_ERROR(EIO)); } @@ -1126,7 +1141,7 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, record->sru.object.bp = *bp; size_t size = sizeof (*dnp) * (dnp->dn_extra_slots + 1); record->sru.object.dnp = kmem_alloc(size, KM_SLEEP); - bcopy(dnp, record->sru.object.dnp, size); + memcpy(record->sru.object.dnp, dnp, size); bqueue_enqueue(&sta->q, record, sizeof (*record)); return (0); } @@ -1224,7 +1239,7 @@ redact_list_cb(redact_block_phys_t *rb, void *arg) * error code of the thread in case something goes wrong, and pushes the End of * Stream record when the traverse_dataset call has finished. */ -static void +static __attribute__((noreturn)) void send_traverse_thread(void *arg) { struct send_thread_arg *st_arg = arg; @@ -1314,7 +1329,7 @@ get_next_range(bqueue_t *bq, struct send_range *prev) return (next); } -static void +static __attribute__((noreturn)) void redact_list_thread(void *arg) { struct redact_list_thread_arg *rlt_arg = arg; @@ -1509,7 +1524,7 @@ find_next_range(struct send_range **ranges, bqueue_t **qs, uint64_t *out_mask) * data from the redact_list_thread and use that to determine which blocks * should be redacted. */ -static void +static __attribute__((noreturn)) void send_merge_thread(void *arg) { struct send_merge_thread_arg *smt_arg = arg; @@ -1576,8 +1591,6 @@ send_merge_thread(void *arg) } range_free(front_ranges[i]); } - if (range == NULL) - range = kmem_zalloc(sizeof (*range), KM_SLEEP); range->eos_marker = B_TRUE; bqueue_enqueue_flush(&smt_arg->q, range, 1); spl_fstrans_unmark(cookie); @@ -1644,12 +1657,15 @@ issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range) !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) && !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp)); - enum zio_flag zioflags = ZIO_FLAG_CANFAIL; + zio_flag_t zioflags = ZIO_FLAG_CANFAIL; - if (srta->featureflags & DMU_BACKUP_FEATURE_RAW) + if (srta->featureflags & DMU_BACKUP_FEATURE_RAW) { zioflags |= ZIO_FLAG_RAW; - else if (request_compressed) + srdp->io_compressed = B_TRUE; + } else if (request_compressed) { zioflags |= ZIO_FLAG_RAW_COMPRESS; + srdp->io_compressed = B_TRUE; + } srdp->datasz = (zioflags & ZIO_FLAG_RAW_COMPRESS) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp); @@ -1701,8 +1717,10 @@ enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn, struct send_range *range = range_alloc(range_type, dn->dn_object, blkid, blkid + count, B_FALSE); - if (blkid == DMU_SPILL_BLKID) + if (blkid == DMU_SPILL_BLKID) { + ASSERT3P(bp, !=, NULL); ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA); + } switch (range_type) { case HOLE: @@ -1731,7 +1749,7 @@ enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn, * some indirect blocks can be discarded because they're not holes. Second, * it issues prefetches for the data we need to send. 
*/ -static void +static __attribute__((noreturn)) void send_reader_thread(void *arg) { struct send_reader_thread_arg *srta = arg; @@ -1823,8 +1841,7 @@ send_reader_thread(void *arg) continue; } uint64_t file_max = - (dn->dn_maxblkid < range->end_blkid ? - dn->dn_maxblkid : range->end_blkid); + MIN(dn->dn_maxblkid, range->end_blkid); /* * The object exists, so we need to try to find the * blkptr for each block in the range we're processing. @@ -1900,7 +1917,7 @@ send_reader_thread(void *arg) struct dmu_send_params { /* Pool args */ - void *tag; // Tag that dp was held with, will be used to release dp. + const void *tag; // Tag dp was held with, will be used to release dp. dsl_pool_t *dp; /* To snapshot args */ const char *tosnap; @@ -1936,7 +1953,7 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os, { dsl_dataset_t *to_ds = dspp->to_ds; dsl_pool_t *dp = dspp->dp; -#ifdef _KERNEL + if (dmu_objset_type(os) == DMU_OST_ZFS) { uint64_t version; if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) @@ -1945,7 +1962,6 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os, if (version >= ZPL_VERSION_SA) *featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } -#endif /* raw sends imply large_block_ok */ if ((dspp->rawok || dspp->large_block_ok) && @@ -2144,6 +2160,7 @@ setup_resume_points(struct dmu_send_params *dspp, struct send_merge_thread_arg *smt_arg, boolean_t resuming, objset_t *os, redaction_list_t *redact_rl, nvlist_t *nvl) { + (void) smt_arg; dsl_dataset_t *to_ds = dspp->to_ds; int err = 0; @@ -2348,7 +2365,7 @@ dmu_send_impl(struct dmu_send_params *dspp) dsl_dataset_t *to_ds = dspp->to_ds; zfs_bookmark_phys_t *ancestor_zb = &dspp->ancestor_zb; dsl_pool_t *dp = dspp->dp; - void *tag = dspp->tag; + const void *tag = dspp->tag; err = dmu_objset_from_ds(to_ds, &os); if (err != 0) { @@ -2497,8 +2514,7 @@ dmu_send_impl(struct dmu_send_params *dspp) } if (featureflags & DMU_BACKUP_FEATURE_RAW) { - uint64_t ivset_guid = (ancestor_zb != NULL) ? - ancestor_zb->zbm_ivset_guid : 0; + uint64_t ivset_guid = ancestor_zb->zbm_ivset_guid; nvlist_t *keynvl = NULL; ASSERT(os->os_encrypted); @@ -2536,7 +2552,7 @@ dmu_send_impl(struct dmu_send_params *dspp) while (err == 0 && !range->eos_marker) { err = do_dump(&dsc, range); range = get_next_range(&srt_arg->q, range); - if (issig(JUSTLOOKING) && issig(FORREAL)) + if (issig()) err = SET_ERROR(EINTR); } @@ -2583,7 +2599,7 @@ dmu_send_impl(struct dmu_send_params *dspp) * the receive side that the stream is incomplete. 
*/ if (!dspp->savedok) { - bzero(drr, sizeof (dmu_replay_record_t)); + memset(drr, 0, sizeof (dmu_replay_record_t)); drr->drr_type = DRR_END; drr->drr_u.drr_end.drr_checksum = dsc.dsc_zc; drr->drr_u.drr_end.drr_toguid = dsc.dsc_toguid; @@ -2684,7 +2700,7 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, uint64_t size = dspp.numfromredactsnaps * sizeof (uint64_t); dspp.fromredactsnaps = kmem_zalloc(size, KM_SLEEP); - bcopy(fromredact, dspp.fromredactsnaps, size); + memcpy(dspp.fromredactsnaps, fromredact, size); } boolean_t is_before = @@ -2702,6 +2718,10 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; err = dmu_send_impl(&dspp); } + if (dspp.fromredactsnaps) + kmem_free(dspp.fromredactsnaps, + dspp.numfromredactsnaps * sizeof (uint64_t)); + dsl_dataset_rele(dspp.to_ds, FTAG); return (err); } @@ -2770,6 +2790,7 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, } if (err == 0) { + owned = B_TRUE; err = zap_lookup(dspp.dp->dp_meta_objset, dspp.to_ds->ds_object, DS_FIELD_RESUME_TOGUID, 8, 1, @@ -2783,21 +2804,24 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, sizeof (dspp.saved_toname), dspp.saved_toname); } - if (err != 0) + /* Only disown if there was an error in the lookups */ + if (owned && (err != 0)) dsl_dataset_disown(dspp.to_ds, dsflags, FTAG); kmem_strfree(name); } else { err = dsl_dataset_own(dspp.dp, tosnap, dsflags, FTAG, &dspp.to_ds); + if (err == 0) + owned = B_TRUE; } - owned = B_TRUE; } else { err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG, &dspp.to_ds); } if (err != 0) { + /* Note: dsl dataset is not owned at this point */ dsl_pool_rele(dspp.dp, FTAG); return (err); } @@ -2869,7 +2893,7 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, sizeof (uint64_t); dspp.fromredactsnaps = kmem_zalloc(size, KM_SLEEP); - bcopy(fromredact, dspp.fromredactsnaps, + memcpy(dspp.fromredactsnaps, fromredact, size); } if (!dsl_dataset_is_before(dspp.to_ds, fromds, @@ -2910,6 +2934,10 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, /* dmu_send_impl will call dsl_pool_rele for us. 
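 * (On this path the pool reference taken earlier in this function is handed
 * off to dmu_send_impl(); the else branch below has to drop it itself with
 * dsl_pool_rele() and also frees dspp.fromredactsnaps so it does not leak
 * when no send is attempted.)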
*/ err = dmu_send_impl(&dspp); } else { + if (dspp.fromredactsnaps) + kmem_free(dspp.fromredactsnaps, + dspp.numfromredactsnaps * + sizeof (uint64_t)); dsl_pool_rele(dspp.dp, FTAG); } } else { @@ -3002,7 +3030,7 @@ dmu_send_estimate_fast(dsl_dataset_t *origds, dsl_dataset_t *fromds, dsl_dataset_name(origds, dsname); (void) strcat(dsname, "/"); - (void) strcat(dsname, recv_clone_name); + (void) strlcat(dsname, recv_clone_name, sizeof (dsname)); err = dsl_dataset_hold(origds->ds_dir->dd_pool, dsname, FTAG, &ds); @@ -3072,25 +3100,23 @@ out: return (err); } -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_send, zfs_send_, corrupt_data, INT, ZMOD_RW, "Allow sending corrupt data"); -ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_length, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_length, UINT, ZMOD_RW, "Maximum send queue length"); ZFS_MODULE_PARAM(zfs_send, zfs_send_, unmodified_spill_blocks, INT, ZMOD_RW, "Send unmodified spill blocks"); -ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_length, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_length, UINT, ZMOD_RW, "Maximum send queue length for non-prefetch queues"); -ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_ff, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_ff, UINT, ZMOD_RW, "Send queue fill fraction"); -ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_ff, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_ff, UINT, ZMOD_RW, "Send queue fill fraction for non-prefetch queues"); -ZFS_MODULE_PARAM(zfs_send, zfs_, override_estimate_recordsize, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_send, zfs_, override_estimate_recordsize, UINT, ZMOD_RW, "Override block size estimate with fixed size"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dmu_traverse.c b/sys/contrib/openzfs/module/zfs/dmu_traverse.c index 862c0bf404ad..15cc2885e805 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_traverse.c +++ b/sys/contrib/openzfs/module/zfs/dmu_traverse.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -39,9 +39,9 @@ #include <sys/callb.h> #include <sys/zfeature.h> -int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */ -int32_t send_holes_without_birth_time = 1; -int32_t zfs_traverse_indirect_prefetch_limit = 32; +static int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */ +static int32_t send_holes_without_birth_time = 1; +static uint_t zfs_traverse_indirect_prefetch_limit = 32; typedef struct prefetch_data { kmutex_t pd_mtx; @@ -83,7 +83,8 @@ traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, if (BP_IS_HOLE(bp)) return (0); - if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa)) + if (claim_txg == 0 && + BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(td->td_spa)) return (-1); SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, @@ -108,9 +109,10 @@ traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, if (BP_IS_HOLE(bp)) return (0); - if (claim_txg == 0 || bp->blk_birth < claim_txg) + if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg) return (0); + ASSERT3U(BP_GET_LSIZE(bp), !=, 0); SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); @@ -153,10 +155,10 @@ typedef enum resume_skip { * Otherwise returns RESUME_SKIP_NONE. */ static resume_skip_t -resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, +resume_skip_check(const traverse_data_t *td, const dnode_phys_t *dnp, const zbookmark_phys_t *zb) { - if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) { + if (td->td_resume != NULL) { /* * If we already visited this bp & everything below, * don't bother doing it again. @@ -164,12 +166,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, if (zbookmark_subtree_completed(dnp, zb, td->td_resume)) return (RESUME_SKIP_ALL); - /* - * If we found the block we're trying to resume from, zero - * the bookmark out to indicate that we have resumed. - */ - if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { - bzero(td->td_resume, sizeof (*zb)); + if (memcmp(zb, td->td_resume, sizeof (*zb)) == 0) { if (td->td_flags & TRAVERSE_POST) return (RESUME_SKIP_CHILDREN); } @@ -181,22 +178,22 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, * Returns B_TRUE, if prefetch read is issued, otherwise B_FALSE. */ static boolean_t -traverse_prefetch_metadata(traverse_data_t *td, +traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb) { - arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) return (B_FALSE); /* - * If we are in the process of resuming, don't prefetch, because - * some children will not be needed (and in fact may have already - * been freed). + * If this bp is before the resume point, it may have already been + * freed. 
*/ - if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) + if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE) return (B_FALSE); - if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) + if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) return (B_FALSE); if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) return (B_FALSE); @@ -239,7 +236,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, ASSERT(0); } - if (bp->blk_birth == 0) { + if (BP_GET_LOGICAL_BIRTH(bp) == 0) { /* * Since this block has a birth time of 0 it must be one of * two things: a hole created before the @@ -267,7 +264,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, zb->zb_object == DMU_META_DNODE_OBJECT) && td->td_hole_birth_enabled_txg <= td->td_min_txg) return (0); - } else if (bp->blk_birth <= td->td_min_txg) { + } else if (BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) { return (0); } @@ -342,7 +339,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + pidx); - if (traverse_prefetch_metadata(td, + if (traverse_prefetch_metadata(td, dnp, &((blkptr_t *)buf->b_data)[pidx], czb) == B_TRUE) { prefetched++; @@ -504,12 +501,12 @@ prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); - traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb); + traverse_prefetch_metadata(td, dnp, &dnp->dn_blkptr[j], &czb); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); - traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb); + traverse_prefetch_metadata(td, dnp, DN_SPILL_BLKPTR(dnp), &czb); } } @@ -560,11 +557,11 @@ traverse_dnode(traverse_data_t *td, const blkptr_t *bp, const dnode_phys_t *dnp, return (err); } -/* ARGSUSED */ static int traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { + (void) zilog, (void) dnp; prefetch_data_t *pfd = arg; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | @@ -670,7 +667,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, /* See comment on ZIL traversal in dsl_scan_visitds. 
*/ if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) { - enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; + zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; uint32_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; @@ -809,11 +806,10 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, EXPORT_SYMBOL(traverse_dataset); EXPORT_SYMBOL(traverse_pool); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, pd_bytes_max, INT, ZMOD_RW, "Max number of bytes to prefetch"); -ZFS_MODULE_PARAM(zfs, zfs_, traverse_indirect_prefetch_limit, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, traverse_indirect_prefetch_limit, UINT, ZMOD_RW, "Traverse prefetch number of blocks pointed by indirect block"); #if defined(_KERNEL) @@ -822,6 +818,6 @@ MODULE_PARM_DESC(ignore_hole_birth, "Alias for send_holes_without_birth_time"); #endif +/* CSTYLED */ ZFS_MODULE_PARAM(zfs, , send_holes_without_birth_time, INT, ZMOD_RW, "Ignore hole_birth txg for zfs send"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dmu_tx.c b/sys/contrib/openzfs/module/zfs/dmu_tx.c index 5fa516866668..8451b5082e86 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_tx.c +++ b/sys/contrib/openzfs/module/zfs/dmu_tx.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -53,8 +53,8 @@ dmu_tx_stats_t dmu_tx_stats = { { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, - { "dmu_tx_wrlog_over_max", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 }, + { "dmu_tx_wrlog_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_quota", KSTAT_DATA_UINT64 }, }; @@ -210,16 +210,22 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); - db = dbuf_hold_level(dn, level, blkid, FTAG); + err = dbuf_hold_impl(dn, level, blkid, TRUE, FALSE, FTAG, &db); rw_exit(&dn->dn_struct_rwlock); - if (db == NULL) - return (SET_ERROR(EIO)); - err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); + if (err == ENOENT) + return (0); + if (err != 0) + return (err); + /* + * PARTIAL_FIRST allows caching for uncacheable blocks. It will + * be cleared after dmu_buf_will_dirty() call dbuf_read() again. + */ + err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH | + (level == 0 ? DB_RF_PARTIAL_FIRST : 0)); dbuf_rele(db, FTAG); return (err); } -/* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { @@ -291,6 +297,53 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) } static void +dmu_tx_count_append(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ + dnode_t *dn = txh->txh_dnode; + int err = 0; + + if (len == 0) + return; + + (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); + + if (dn == NULL) + return; + + /* + * For i/o error checking, read the blocks that will be needed + * to perform the append; first level-0 block (if not aligned, i.e. + * if they are partial-block writes), no additional blocks are read. 
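 *
 * A minimal caller sketch for the append hold added below
 * (dmu_tx_hold_append()), assuming the usual dmu_tx_create() /
 * dmu_tx_assign() / dmu_tx_commit() life cycle; the exact write path is
 * caller-specific:
 *
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_append(tx, object, off, len);
 *	err = dmu_tx_assign(tx, TXG_WAIT);
 *	if (err != 0) {
 *		dmu_tx_abort(tx);
 *		return (err);
 *	}
 *	dmu_write(os, object, off, len, buf, tx);
 *	dmu_tx_commit(tx);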
+ */ + if (dn->dn_maxblkid == 0) { + if (off < dn->dn_datablksz && + (off > 0 || len < dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + } else { + zio_t *zio = zio_root(dn->dn_objset->os_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + + /* first level-0 block */ + uint64_t start = off >> dn->dn_datablkshift; + if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { + err = dmu_tx_check_ioerr(zio, dn, 0, start); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + + err = zio_wait(zio); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } +} + +static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { (void) zfs_refcount_add_many(&txh->txh_space_towrite, @@ -331,6 +384,42 @@ dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) } /* + * Should be used when appending to an object and the exact offset is unknown. + * The write must occur at or beyond the specified offset. Only the L0 block + * at provided offset will be prefetched. + */ +void +dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT3U(len, <=, DMU_MAX_ACCESS); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_APPEND, off, DMU_OBJECT_END); + if (txh != NULL) { + dmu_tx_count_append(txh, off, len); + dmu_tx_count_dnode(txh); + } +} + +void +dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT3U(len, <=, DMU_MAX_ACCESS); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_APPEND, off, DMU_OBJECT_END); + if (txh != NULL) { + dmu_tx_count_append(txh, off, len); + dmu_tx_count_dnode(txh); + } +} + +/* * This function marks the transaction as being a "net free". The end * result is that refquotas will be disabled for this transaction, and * this transaction will be able to use half of the pool space overhead @@ -345,7 +434,7 @@ dmu_tx_mark_netfree(dmu_tx_t *tx) } static void -dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dmu_tx_t *tx = txh->txh_tx; dnode_t *dn = txh->txh_dnode; @@ -353,15 +442,11 @@ dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) ASSERT(tx->tx_txg == 0); - dmu_tx_count_dnode(txh); - if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off; - dmu_tx_count_dnode(txh); - /* * For i/o error checking, we read the first and last level-0 * blocks if they are not aligned, and all the level-1 blocks. @@ -441,8 +526,10 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_FREE, off, len); - if (txh != NULL) - (void) dmu_tx_hold_free_impl(txh, off, len); + if (txh != NULL) { + dmu_tx_count_dnode(txh); + dmu_tx_count_free(txh, off, len); + } } void @@ -451,8 +538,35 @@ dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) dmu_tx_hold_t *txh; txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len); - if (txh != NULL) - (void) dmu_tx_hold_free_impl(txh, off, len); + if (txh != NULL) { + dmu_tx_count_dnode(txh); + dmu_tx_count_free(txh, off, len); + } +} + +static void +dmu_tx_count_clone(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ + + /* + * Reuse dmu_tx_count_free(), it does exactly what we need for clone. 
+ */ + dmu_tx_count_free(txh, off, len); +} + +void +dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT(len == 0 || UINT64_MAX - off >= len - 1); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_CLONE, off, len); + if (txh != NULL) { + dmu_tx_count_dnode(txh); + dmu_tx_count_clone(txh, off, len); + } } static void @@ -461,6 +575,7 @@ dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) dmu_tx_t *tx = txh->txh_tx; dnode_t *dn = txh->txh_dnode; int err; + extern int zap_micro_max_size; ASSERT(tx->tx_txg == 0); @@ -476,7 +591,7 @@ dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) * - 2 grown ptrtbl blocks */ (void) zfs_refcount_add_many(&txh->txh_space_towrite, - MZAP_MAX_BLKSZ, FTAG); + zap_micro_max_size, FTAG); if (dn == NULL) return; @@ -638,6 +753,26 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) if (blkid == 0) match_offset = TRUE; break; + case THT_APPEND: + if (blkid >= beginblk && (blkid <= endblk || + txh->txh_arg2 == DMU_OBJECT_END)) + match_offset = TRUE; + + /* + * THT_WRITE used for bonus and spill blocks. + */ + ASSERT(blkid != DMU_BONUS_BLKID && + blkid != DMU_SPILL_BLKID); + + /* + * They might have to increase nlevels, + * thus dirtying the new TLIBs. Or the + * might have to change the block size, + * thus dirying the new lvl=0 blk=0. + */ + if (blkid == 0) + match_offset = TRUE; + break; case THT_FREE: /* * We will dirty all the level 1 blocks in @@ -662,6 +797,10 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) case THT_NEWOBJECT: match_object = TRUE; break; + case THT_CLONE: + if (blkid >= beginblk && blkid <= endblk) + match_offset = TRUE; + break; default: cmn_err(CE_PANIC, "bad txh_type %d", txh->txh_type); @@ -683,8 +822,7 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) * If we can't do 10 iops, something is wrong. Let us go ahead * and hit zfs_dirty_data_max. */ -hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */ -int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ +static const hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */ /* * We delay transactions when we've determined that the backend storage @@ -781,34 +919,49 @@ static void dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) { dsl_pool_t *dp = tx->tx_pool; - uint64_t delay_min_bytes = + uint64_t delay_min_bytes, wrlog; + hrtime_t wakeup, tx_time = 0, now; + + /* Calculate minimum transaction time for the dirty data amount. */ + delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - hrtime_t wakeup, min_tx_time, now; + if (dirty > delay_min_bytes) { + /* + * The caller has already waited until we are under the max. + * We make them pass us the amount of dirty data so we don't + * have to handle the case of it being >= the max, which + * could cause a divide-by-zero if it's == the max. + */ + ASSERT3U(dirty, <, zfs_dirty_data_max); - if (dirty <= delay_min_bytes) - return; + tx_time = zfs_delay_scale * (dirty - delay_min_bytes) / + (zfs_dirty_data_max - dirty); + } - /* - * The caller has already waited until we are under the max. - * We make them pass us the amount of dirty data so we don't - * have to handle the case of it being >= the max, which could - * cause a divide-by-zero if it's == the max. - */ - ASSERT3U(dirty, <, zfs_dirty_data_max); + /* Calculate minimum transaction time for the TX_WRITE log size. 
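 * The same curve as the dirty-data term above is applied: no delay below
 * the threshold, a scaled delay between the threshold and the limit, and
 * the full zfs_delay_max_ns once the limit is reached.  Worked example
 * with assumed (non-default) numbers: zfs_wrlog_data_max = 8GB and
 * zfs_delay_min_dirty_percent = 60 give a threshold of ~4.8GB; at
 * wrlog = 6GB the term is zfs_delay_scale * (6 - 4.8) / (8 - 6), i.e.
 * 0.6 * zfs_delay_scale, about 300us with the default 500000ns scale.
 * The larger of the dirty-data and wrlog terms is used, capped at
 * zfs_delay_max_ns.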
*/ + wrlog = aggsum_upper_bound(&dp->dp_wrlog_total); + delay_min_bytes = + zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; + if (wrlog >= zfs_wrlog_data_max) { + tx_time = zfs_delay_max_ns; + } else if (wrlog > delay_min_bytes) { + tx_time = MAX(zfs_delay_scale * (wrlog - delay_min_bytes) / + (zfs_wrlog_data_max - wrlog), tx_time); + } + + if (tx_time == 0) + return; + tx_time = MIN(tx_time, zfs_delay_max_ns); now = gethrtime(); - min_tx_time = zfs_delay_scale * - (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); - min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); - if (now > tx->tx_start + min_tx_time) + if (now > tx->tx_start + tx_time) return; DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, - uint64_t, min_tx_time); + uint64_t, tx_time); mutex_enter(&dp->dp_lock); - wakeup = MAX(tx->tx_start + min_tx_time, - dp->dp_last_wakeup + min_tx_time); + wakeup = MAX(tx->tx_start + tx_time, dp->dp_last_wakeup + tx_time); dp->dp_last_wakeup = wakeup; mutex_exit(&dp->dp_lock); @@ -886,8 +1039,9 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) } if (!tx->tx_dirty_delayed && - dsl_pool_wrlog_over_max(tx->tx_pool)) { - DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max); + dsl_pool_need_wrlog_delay(tx->tx_pool)) { + tx->tx_wait_dirty = B_TRUE; + DMU_TX_STAT_BUMP(dmu_tx_wrlog_delay); return (SET_ERROR(ERESTART)); } @@ -1244,8 +1398,7 @@ dmu_tx_do_callbacks(list_t *cb_list, int error) { dmu_tx_callback_t *dcb; - while ((dcb = list_tail(cb_list)) != NULL) { - list_remove(cb_list, dcb); + while ((dcb = list_remove_tail(cb_list)) != NULL) { dcb->dcb_func(dcb->dcb_data, error); kmem_free(dcb, sizeof (dmu_tx_callback_t)); } @@ -1405,6 +1558,8 @@ dmu_tx_fini(void) EXPORT_SYMBOL(dmu_tx_create); EXPORT_SYMBOL(dmu_tx_hold_write); EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode); +EXPORT_SYMBOL(dmu_tx_hold_append); +EXPORT_SYMBOL(dmu_tx_hold_append_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_free); EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_zap); diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c index 043344a1375f..ed50f1889b59 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c +++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -28,6 +28,7 @@ */ #include <sys/zfs_context.h> +#include <sys/arc_impl.h> #include <sys/dnode.h> #include <sys/dmu_objset.h> #include <sys/dmu_zfetch.h> @@ -43,38 +44,63 @@ * so it can't hurt performance. 
*/ -int zfs_prefetch_disable = B_FALSE; +static int zfs_prefetch_disable = B_FALSE; /* max # of streams per zfetch */ -unsigned int zfetch_max_streams = 8; +static unsigned int zfetch_max_streams = 8; /* min time before stream reclaim */ -unsigned int zfetch_min_sec_reap = 2; +static unsigned int zfetch_min_sec_reap = 1; +/* max time before stream delete */ +static unsigned int zfetch_max_sec_reap = 2; +#ifdef _ILP32 +/* min bytes to prefetch per stream (default 2MB) */ +static unsigned int zfetch_min_distance = 2 * 1024 * 1024; /* max bytes to prefetch per stream (default 8MB) */ unsigned int zfetch_max_distance = 8 * 1024 * 1024; +#else +/* min bytes to prefetch per stream (default 4MB) */ +static unsigned int zfetch_min_distance = 4 * 1024 * 1024; +/* max bytes to prefetch per stream (default 64MB) */ +unsigned int zfetch_max_distance = 64 * 1024 * 1024; +#endif /* max bytes to prefetch indirects for per stream (default 64MB) */ unsigned int zfetch_max_idistance = 64 * 1024 * 1024; -/* max number of bytes in an array_read in which we allow prefetching (1MB) */ -unsigned long zfetch_array_rd_sz = 1024 * 1024; +/* max request reorder distance within a stream (default 16MB) */ +unsigned int zfetch_max_reorder = 16 * 1024 * 1024; +/* Max log2 fraction of holes in a stream */ +unsigned int zfetch_hole_shift = 2; typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; + kstat_named_t zfetchstat_future; + kstat_named_t zfetchstat_stride; + kstat_named_t zfetchstat_past; kstat_named_t zfetchstat_misses; kstat_named_t zfetchstat_max_streams; kstat_named_t zfetchstat_io_issued; + kstat_named_t zfetchstat_io_active; } zfetch_stats_t; static zfetch_stats_t zfetch_stats = { { "hits", KSTAT_DATA_UINT64 }, + { "future", KSTAT_DATA_UINT64 }, + { "stride", KSTAT_DATA_UINT64 }, + { "past", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "max_streams", KSTAT_DATA_UINT64 }, - { "io_issued", KSTAT_DATA_UINT64 }, + { "io_issued", KSTAT_DATA_UINT64 }, + { "io_active", KSTAT_DATA_UINT64 }, }; struct { wmsum_t zfetchstat_hits; + wmsum_t zfetchstat_future; + wmsum_t zfetchstat_stride; + wmsum_t zfetchstat_past; wmsum_t zfetchstat_misses; wmsum_t zfetchstat_max_streams; wmsum_t zfetchstat_io_issued; + aggsum_t zfetchstat_io_active; } zfetch_sums; #define ZFETCHSTAT_BUMP(stat) \ @@ -83,7 +109,7 @@ struct { wmsum_add(&zfetch_sums.stat, val) -kstat_t *zfetch_ksp; +static kstat_t *zfetch_ksp; static int zfetch_kstats_update(kstat_t *ksp, int rw) @@ -94,12 +120,20 @@ zfetch_kstats_update(kstat_t *ksp, int rw) return (EACCES); zs->zfetchstat_hits.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_hits); + zs->zfetchstat_future.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_future); + zs->zfetchstat_stride.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_stride); + zs->zfetchstat_past.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_past); zs->zfetchstat_misses.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_misses); zs->zfetchstat_max_streams.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_max_streams); zs->zfetchstat_io_issued.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_io_issued); + zs->zfetchstat_io_active.value.ui64 = + aggsum_value(&zfetch_sums.zfetchstat_io_active); return (0); } @@ -107,9 +141,13 @@ void zfetch_init(void) { wmsum_init(&zfetch_sums.zfetchstat_hits, 0); + wmsum_init(&zfetch_sums.zfetchstat_future, 0); + wmsum_init(&zfetch_sums.zfetchstat_stride, 0); + wmsum_init(&zfetch_sums.zfetchstat_past, 0); wmsum_init(&zfetch_sums.zfetchstat_misses, 0); 
wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0); wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0); + aggsum_init(&zfetch_sums.zfetchstat_io_active, 0); zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), @@ -131,9 +169,14 @@ zfetch_fini(void) } wmsum_fini(&zfetch_sums.zfetchstat_hits); + wmsum_fini(&zfetch_sums.zfetchstat_future); + wmsum_fini(&zfetch_sums.zfetchstat_stride); + wmsum_fini(&zfetch_sums.zfetchstat_past); wmsum_fini(&zfetch_sums.zfetchstat_misses); wmsum_fini(&zfetch_sums.zfetchstat_max_streams); wmsum_fini(&zfetch_sums.zfetchstat_io_issued); + ASSERT0(aggsum_value(&zfetch_sums.zfetchstat_io_active)); + aggsum_fini(&zfetch_sums.zfetchstat_io_active); } /* @@ -195,75 +238,219 @@ dmu_zfetch_fini(zfetch_t *zf) } /* - * If there aren't too many streams already, create a new stream. + * If there aren't too many active streams already, create one more. + * In process delete/reuse all streams without hits for zfetch_max_sec_reap. + * If needed, reuse oldest stream without hits for zfetch_min_sec_reap or ever. * The "blkid" argument is the next block that we expect this stream to access. - * While we're here, clean up old streams (which haven't been - * accessed for at least zfetch_min_sec_reap seconds). */ static void dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) { - zstream_t *zs_next; - hrtime_t now = gethrtime(); + zstream_t *zs, *zs_next, *zs_old = NULL; + uint_t now = gethrestime_sec(), t; ASSERT(MUTEX_HELD(&zf->zf_lock)); /* - * Clean up old streams. + * Delete too old streams, reusing the first found one. */ - for (zstream_t *zs = list_head(&zf->zf_stream); - zs != NULL; zs = zs_next) { + t = now - zfetch_max_sec_reap; + for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) { zs_next = list_next(&zf->zf_stream, zs); /* * Skip if still active. 1 -- zf_stream reference. */ + if ((int)(zs->zs_atime - t) >= 0) + continue; if (zfs_refcount_count(&zs->zs_refs) != 1) continue; - if (((now - zs->zs_atime) / NANOSEC) > - zfetch_min_sec_reap) + if (zs_old) dmu_zfetch_stream_remove(zf, zs); + else + zs_old = zs; + } + if (zs_old) { + zs = zs_old; + list_remove(&zf->zf_stream, zs); + goto reuse; } /* * The maximum number of streams is normally zfetch_max_streams, * but for small files we lower it such that it's at least possible * for all the streams to be non-overlapping. - * - * If we are already at the maximum number of streams for this file, - * even after removing old streams, then don't create this stream. 
*/ uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, - zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / + (zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) / zfetch_max_distance)); if (zf->zf_numstreams >= max_streams) { + t = now - zfetch_min_sec_reap; + for (zs = list_head(&zf->zf_stream); zs != NULL; + zs = list_next(&zf->zf_stream, zs)) { + if ((int)(zs->zs_atime - t) >= 0) + continue; + if (zfs_refcount_count(&zs->zs_refs) != 1) + continue; + if (zs_old == NULL || + (int)(zs_old->zs_atime - zs->zs_atime) >= 0) + zs_old = zs; + } + if (zs_old) { + zs = zs_old; + list_remove(&zf->zf_stream, zs); + goto reuse; + } ZFETCHSTAT_BUMP(zfetchstat_max_streams); return; } - zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); - zs->zs_blkid = blkid; - zs->zs_pf_blkid1 = blkid; - zs->zs_pf_blkid = blkid; - zs->zs_ipf_blkid1 = blkid; - zs->zs_ipf_blkid = blkid; - zs->zs_atime = now; - zs->zs_fetch = zf; - zs->zs_missed = B_FALSE; + zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); zfs_refcount_create(&zs->zs_callers); zfs_refcount_create(&zs->zs_refs); /* One reference for zf_stream. */ zfs_refcount_add(&zs->zs_refs, NULL); zf->zf_numstreams++; + +reuse: list_insert_head(&zf->zf_stream, zs); + zs->zs_blkid = blkid; + /* Allow immediate stream reuse until first hit. */ + zs->zs_atime = now - zfetch_min_sec_reap; + memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges)); + zs->zs_pf_dist = 0; + zs->zs_ipf_dist = 0; + zs->zs_pf_start = blkid; + zs->zs_pf_end = blkid; + zs->zs_ipf_start = blkid; + zs->zs_ipf_end = blkid; + zs->zs_missed = B_FALSE; + zs->zs_more = B_FALSE; } static void -dmu_zfetch_stream_done(void *arg, boolean_t io_issued) +dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued) { zstream_t *zs = arg; + if (io_issued && level == 0 && blkid < zs->zs_blkid) + zs->zs_more = B_TRUE; if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) dmu_zfetch_stream_fini(zs); + aggsum_add(&zfetch_sums.zfetchstat_io_active, -1); +} + +/* + * Process stream hit access for nblks blocks starting at zs_blkid. Return + * number of blocks to proceed for after aggregation with future ranges. + */ +static uint64_t +dmu_zfetch_hit(zstream_t *zs, uint64_t nblks) +{ + uint_t i, j; + + /* Optimize sequential accesses (no future ranges). */ + if (zs->zs_ranges[0].start == 0) + goto done; + + /* Look for intersections with further ranges. */ + for (i = 0; i < ZFETCH_RANGES; i++) { + zsrange_t *r = &zs->zs_ranges[i]; + if (r->start == 0 || r->start > nblks) + break; + if (r->end >= nblks) { + nblks = r->end; + i++; + break; + } + } + + /* Delete all found intersecting ranges, updates remaining. */ + for (j = 0; i < ZFETCH_RANGES; i++, j++) { + if (zs->zs_ranges[i].start == 0) + break; + ASSERT3U(zs->zs_ranges[i].start, >, nblks); + ASSERT3U(zs->zs_ranges[i].end, >, nblks); + zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks; + zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks; + } + if (j < ZFETCH_RANGES) { + zs->zs_ranges[j].start = 0; + zs->zs_ranges[j].end = 0; + } + +done: + zs->zs_blkid += nblks; + return (nblks); +} + +/* + * Process future stream access for nblks blocks starting at blkid. Return + * number of blocks to proceed for if future ranges reach fill threshold. + */ +static uint64_t +dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks) +{ + ASSERT3U(blkid, >, zs->zs_blkid); + blkid -= zs->zs_blkid; + ASSERT3U(blkid + nblks, <=, UINT16_MAX); + + /* Search for first and last intersection or insert point. 
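 * For example, with pending ranges [2,4) and [7,9) (block numbers relative
 * to zs_blkid), a new access covering relative blocks [4,7) touches both
 * ranges, so they collapse into the single range [2,9).  An access beyond
 * all stored ranges is inserted as a new entry, and if all ZFETCH_RANGES
 * slots are already in use the access is simply dropped.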
*/ + uint_t f = ZFETCH_RANGES, l = 0, i; + for (i = 0; i < ZFETCH_RANGES; i++) { + zsrange_t *r = &zs->zs_ranges[i]; + if (r->start == 0 || r->start > blkid + nblks) + break; + if (r->end < blkid) + continue; + if (f > i) + f = i; + if (l < i) + l = i; + } + if (f <= l) { + /* Got some intersecting range, expand it if needed. */ + if (zs->zs_ranges[f].start > blkid) + zs->zs_ranges[f].start = blkid; + zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks); + if (f < l) { + /* Got more than one intersection, remove others. */ + for (f++, l++; l < ZFETCH_RANGES; f++, l++) { + zs->zs_ranges[f].start = zs->zs_ranges[l].start; + zs->zs_ranges[f].end = zs->zs_ranges[l].end; + } + zs->zs_ranges[f].start = 0; + zs->zs_ranges[f].end = 0; + } + } else if (i < ZFETCH_RANGES) { + /* Got no intersecting ranges, insert new one. */ + for (l = ZFETCH_RANGES - 1; l > i; l--) { + zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start; + zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end; + } + zs->zs_ranges[i].start = blkid; + zs->zs_ranges[i].end = blkid + nblks; + } else { + /* No space left to insert. Drop the range. */ + return (0); + } + + /* Check if with the new access addition we reached fill threshold. */ + if (zfetch_hole_shift >= 16) + return (0); + uint_t hole = 0; + for (i = f = l = 0; i < ZFETCH_RANGES; i++) { + zsrange_t *r = &zs->zs_ranges[i]; + if (r->start == 0) + break; + hole += r->start - f; + f = r->end; + if (hole <= r->end >> zfetch_hole_shift) + l = r->end; + } + if (l > 0) + return (dmu_zfetch_hit(zs, l)); + + return (0); } /* @@ -283,15 +470,15 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, boolean_t have_lock) { zstream_t *zs; - int64_t pf_start, ipf_start; - int64_t pf_ahead_blks, max_blks; - int max_dist_blks, pf_nblks, ipf_nblks; - uint64_t end_of_access_blkid, maxblkid; - end_of_access_blkid = blkid + nblks; spa_t *spa = zf->zf_dnode->dn_objset->os_spa; + zfs_prefetch_type_t os_prefetch = zf->zf_dnode->dn_objset->os_prefetch; - if (zfs_prefetch_disable) + if (zfs_prefetch_disable || os_prefetch == ZFS_PREFETCH_NONE) return (NULL); + + if (os_prefetch == ZFS_PREFETCH_METADATA) + fetch_data = B_FALSE; + /* * If we haven't yet loaded the indirect vdevs' mappings, we * can only read from blocks that we carefully ensure are on @@ -316,7 +503,7 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, * A fast path for small files for which no prefetch will * happen. */ - maxblkid = zf->zf_dnode->dn_maxblkid; + uint64_t maxblkid = zf->zf_dnode->dn_maxblkid; if (maxblkid < 2) { if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); @@ -325,115 +512,147 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, mutex_enter(&zf->zf_lock); /* - * Find matching prefetch stream. Depending on whether the accesses + * Find perfect prefetch stream. Depending on whether the accesses * are block-aligned, first block of the new access may either follow * the last block of the previous access, or be equal to it. */ + unsigned int dbs = zf->zf_dnode->dn_datablkshift; + uint64_t end_blkid = blkid + nblks; for (zs = list_head(&zf->zf_stream); zs != NULL; zs = list_next(&zf->zf_stream, zs)) { if (blkid == zs->zs_blkid) { - break; + goto hit; } else if (blkid + 1 == zs->zs_blkid) { blkid++; nblks--; - break; + goto hit; } } /* - * If the file is ending, remove the matching stream if found. - * If not found then it is too late to create a new one now. + * Find close enough prefetch stream. 
Access crossing stream position + * is a hit in its new part. Access ahead of stream position considered + * a hit for metadata prefetch, since we do not care about fill percent, + * or stored for future otherwise. Access behind stream position is + * silently ignored, since we already skipped it reaching fill percent. */ - if (end_of_access_blkid >= maxblkid) { - if (zs != NULL) - dmu_zfetch_stream_remove(zf, zs); - mutex_exit(&zf->zf_lock); - if (!have_lock) - rw_exit(&zf->zf_dnode->dn_struct_rwlock); - return (NULL); + uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX); + uint_t t = gethrestime_sec() - zfetch_max_sec_reap; + for (zs = list_head(&zf->zf_stream); zs != NULL; + zs = list_next(&zf->zf_stream, zs)) { + if (blkid > zs->zs_blkid) { + if (end_blkid <= zs->zs_blkid + max_reorder) { + if (!fetch_data) { + nblks = dmu_zfetch_hit(zs, + end_blkid - zs->zs_blkid); + ZFETCHSTAT_BUMP(zfetchstat_stride); + goto future; + } + nblks = dmu_zfetch_future(zs, blkid, nblks); + if (nblks > 0) + ZFETCHSTAT_BUMP(zfetchstat_stride); + else + ZFETCHSTAT_BUMP(zfetchstat_future); + goto future; + } + } else if (end_blkid >= zs->zs_blkid) { + nblks -= zs->zs_blkid - blkid; + blkid += zs->zs_blkid - blkid; + goto hit; + } else if (end_blkid + max_reorder > zs->zs_blkid && + (int)(zs->zs_atime - t) >= 0) { + ZFETCHSTAT_BUMP(zfetchstat_past); + zs->zs_atime = gethrestime_sec(); + goto out; + } } - /* Exit if we already prefetched this block before. */ - if (nblks == 0) { - mutex_exit(&zf->zf_lock); - if (!have_lock) - rw_exit(&zf->zf_dnode->dn_struct_rwlock); - return (NULL); - } + /* + * This access is not part of any existing stream. Create a new + * stream for it unless we are at the end of file. + */ + if (end_blkid < maxblkid) + dmu_zfetch_stream_create(zf, end_blkid); + mutex_exit(&zf->zf_lock); + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); + ZFETCHSTAT_BUMP(zfetchstat_misses); + return (NULL); - if (zs == NULL) { - /* - * This access is not part of any existing stream. Create - * a new stream for it. - */ - dmu_zfetch_stream_create(zf, end_of_access_blkid); +hit: + nblks = dmu_zfetch_hit(zs, nblks); + ZFETCHSTAT_BUMP(zfetchstat_hits); + +future: + zs->zs_atime = gethrestime_sec(); + + /* Exit if we already prefetched for this position before. */ + if (nblks == 0) + goto out; + + /* If the file is ending, remove the stream. */ + end_blkid = zs->zs_blkid; + if (end_blkid >= maxblkid) { + dmu_zfetch_stream_remove(zf, zs); +out: mutex_exit(&zf->zf_lock); if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); - ZFETCHSTAT_BUMP(zfetchstat_misses); return (NULL); } /* * This access was to a block that we issued a prefetch for on - * behalf of this stream. Issue further prefetches for this stream. + * behalf of this stream. Calculate further prefetch distances. * - * Normally, we start prefetching where we stopped - * prefetching last (zs_pf_blkid). But when we get our first - * hit on this stream, zs_pf_blkid == zs_blkid, we don't - * want to prefetch the block we just accessed. In this case, - * start just after the block we just accessed. - */ - pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid); - if (zs->zs_pf_blkid1 < end_of_access_blkid) - zs->zs_pf_blkid1 = end_of_access_blkid; - if (zs->zs_ipf_blkid1 < end_of_access_blkid) - zs->zs_ipf_blkid1 = end_of_access_blkid; - - /* - * Double our amount of prefetched data, but don't let the - * prefetch get further ahead than zfetch_max_distance. + * Start prefetch from the demand access size (nblks). 
Double the + * distance every access up to zfetch_min_distance. After that only + * if needed increase the distance by 1/8 up to zfetch_max_distance. + * + * Don't double the distance beyond single block if we have more + * than ~6% of ARC held by active prefetches. It should help with + * getting out of RAM on some badly mispredicted read patterns. */ + unsigned int nbytes = nblks << dbs; + unsigned int pf_nblks; if (fetch_data) { - max_dist_blks = - zfetch_max_distance >> zf->zf_dnode->dn_datablkshift; - /* - * Previously, we were (zs_pf_blkid - blkid) ahead. We - * want to now be double that, so read that amount again, - * plus the amount we are catching up by (i.e. the amount - * read just now). - */ - pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks; - max_blks = max_dist_blks - (pf_start - end_of_access_blkid); - pf_nblks = MIN(pf_ahead_blks, max_blks); + if (unlikely(zs->zs_pf_dist < nbytes)) + zs->zs_pf_dist = nbytes; + else if (zs->zs_pf_dist < zfetch_min_distance && + (zs->zs_pf_dist < (1 << dbs) || + aggsum_compare(&zfetch_sums.zfetchstat_io_active, + arc_c_max >> (4 + dbs)) < 0)) + zs->zs_pf_dist *= 2; + else if (zs->zs_more) + zs->zs_pf_dist += zs->zs_pf_dist / 8; + zs->zs_more = B_FALSE; + if (zs->zs_pf_dist > zfetch_max_distance) + zs->zs_pf_dist = zfetch_max_distance; + pf_nblks = zs->zs_pf_dist >> dbs; } else { pf_nblks = 0; } - - zs->zs_pf_blkid = pf_start + pf_nblks; + if (zs->zs_pf_start < end_blkid) + zs->zs_pf_start = end_blkid; + if (zs->zs_pf_end < end_blkid + pf_nblks) + zs->zs_pf_end = end_blkid + pf_nblks; /* - * Do the same for indirects, starting from where we stopped last, - * or where we will stop reading data blocks (and the indirects - * that point to them). + * Do the same for indirects, starting where we will stop reading + * data blocks (and the indirects that point to them). */ - ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid); - max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift; - /* - * We want to double our distance ahead of the data prefetch - * (or reader, if we are not prefetching data). Previously, we - * were (zs_ipf_blkid - blkid) ahead. To double that, we read - * that amount again, plus the amount we are catching up by - * (i.e. the amount read now + the amount of data prefetched now). - */ - pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks; - max_blks = max_dist_blks - (ipf_start - zs->zs_pf_blkid); - ipf_nblks = MIN(pf_ahead_blks, max_blks); - zs->zs_ipf_blkid = ipf_start + ipf_nblks; - - zs->zs_blkid = end_of_access_blkid; - /* Protect the stream from reclamation. */ - zs->zs_atime = gethrtime(); + if (unlikely(zs->zs_ipf_dist < nbytes)) + zs->zs_ipf_dist = nbytes; + else + zs->zs_ipf_dist *= 2; + if (zs->zs_ipf_dist > zfetch_max_idistance) + zs->zs_ipf_dist = zfetch_max_idistance; + pf_nblks = zs->zs_ipf_dist >> dbs; + if (zs->zs_ipf_start < zs->zs_pf_end) + zs->zs_ipf_start = zs->zs_pf_end; + if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks) + zs->zs_ipf_end = zs->zs_pf_end + pf_nblks; + zfs_refcount_add(&zs->zs_refs, NULL); /* Count concurrent callers. 
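 * The zs_refs reference taken just above keeps the stream alive until
 * dmu_zfetch_run(), which either converts it into one reference per issued
 * prefetch I/O (adding issued - 1 more) or drops it when another thread has
 * already covered the range; each dmu_zfetch_done() callback then releases
 * one reference.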
*/ zfs_refcount_add(&zs->zs_callers, NULL); @@ -441,15 +660,13 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); - - ZFETCHSTAT_BUMP(zfetchstat_hits); return (zs); } void -dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) +dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed, + boolean_t have_lock) { - zfetch_t *zf = zs->zs_fetch; int64_t pf_start, pf_end, ipf_start, ipf_end; int epbs, issued; @@ -470,13 +687,13 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) mutex_enter(&zf->zf_lock); if (zs->zs_missed) { - pf_start = zs->zs_pf_blkid1; - pf_end = zs->zs_pf_blkid1 = zs->zs_pf_blkid; + pf_start = zs->zs_pf_start; + pf_end = zs->zs_pf_start = zs->zs_pf_end; } else { pf_start = pf_end = 0; } - ipf_start = MAX(zs->zs_pf_blkid1, zs->zs_ipf_blkid1); - ipf_end = zs->zs_ipf_blkid1 = zs->zs_ipf_blkid; + ipf_start = zs->zs_ipf_start; + ipf_end = zs->zs_ipf_start = zs->zs_ipf_end; mutex_exit(&zf->zf_lock); ASSERT3S(pf_start, <=, pf_end); ASSERT3S(ipf_start, <=, ipf_end); @@ -488,14 +705,14 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) issued = pf_end - pf_start + ipf_end - ipf_start; if (issued > 1) { /* More references on top of taken in dmu_zfetch_prepare(). */ - for (int i = 0; i < issued - 1; i++) - zfs_refcount_add(&zs->zs_refs, NULL); + zfs_refcount_add_few(&zs->zs_refs, issued - 1, NULL); } else if (issued == 0) { /* Some other thread has done our work, so drop the ref. */ if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) dmu_zfetch_stream_fini(zs); return; } + aggsum_add(&zfetch_sums.zfetchstat_io_active, issued); if (!have_lock) rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); @@ -503,13 +720,11 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) issued = 0; for (int64_t blk = pf_start; blk < pf_end; blk++) { issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, - dmu_zfetch_stream_done, zs); + ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs); } for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) { issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, - dmu_zfetch_stream_done, zs); + ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs); } if (!have_lock) @@ -527,10 +742,9 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock); if (zs) - dmu_zfetch_run(zs, missed, have_lock); + dmu_zfetch_run(zf, zs, missed, have_lock); } -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW, "Disable all ZFS prefetching"); @@ -540,12 +754,20 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW, "Min time before stream reclaim"); +ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_sec_reap, UINT, ZMOD_RW, + "Max time before stream delete"); + +ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_distance, UINT, ZMOD_RW, + "Min bytes to prefetch per stream"); + ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW, "Max bytes to prefetch per stream"); ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW, "Max bytes to prefetch indirects for per stream"); -ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, ULONG, ZMOD_RW, - "Number of bytes in a array_read"); -/* END CSTYLED */ 
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW, + "Max request reorder distance within a stream"); + +ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW, + "Max log2 fraction of holes in a stream"); diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c index db1a5d71df3c..a703fd414f87 100644 --- a/sys/contrib/openzfs/module/zfs/dnode.c +++ b/sys/contrib/openzfs/module/zfs/dnode.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -71,6 +71,8 @@ dnode_stats_t dnode_stats = { { "dnode_move_active", KSTAT_DATA_UINT64 }, }; +dnode_sums_t dnode_sums; + static kstat_t *dnode_ksp; static kmem_cache_t *dnode_cache; @@ -97,6 +99,14 @@ dbuf_compare(const void *x1, const void *x2) if (likely(cmp)) return (cmp); + if (d1->db_state == DB_MARKER) { + ASSERT3S(d2->db_state, !=, DB_MARKER); + return (TREE_PCMP(d1->db_parent, d2)); + } else if (d2->db_state == DB_MARKER) { + ASSERT3S(d1->db_state, !=, DB_MARKER); + return (TREE_PCMP(d1, d2->db_parent)); + } + if (d1->db_state == DB_SEARCH) { ASSERT3S(d2->db_state, !=, DB_SEARCH); return (-1); @@ -108,12 +118,11 @@ dbuf_compare(const void *x1, const void *x2) return (TREE_PCMP(d1, d2)); } -/* ARGSUSED */ static int dnode_cons(void *arg, void *unused, int kmflag) { + (void) unused, (void) kmflag; dnode_t *dn = arg; - int i; rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL); mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); @@ -129,17 +138,17 @@ dnode_cons(void *arg, void *unused, int kmflag) zfs_refcount_create(&dn->dn_tx_holds); list_link_init(&dn->dn_link); - bzero(&dn->dn_next_type[0], sizeof (dn->dn_next_type)); - bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr)); - bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels)); - bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift)); - bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype)); - bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); - bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); - bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); - bzero(&dn->dn_next_maxblkid[0], sizeof (dn->dn_next_maxblkid)); + memset(dn->dn_next_type, 0, sizeof (dn->dn_next_type)); + memset(dn->dn_next_nblkptr, 0, sizeof (dn->dn_next_nblkptr)); + memset(dn->dn_next_nlevels, 0, sizeof (dn->dn_next_nlevels)); + memset(dn->dn_next_indblkshift, 0, sizeof (dn->dn_next_indblkshift)); + memset(dn->dn_next_bonustype, 0, sizeof (dn->dn_next_bonustype)); + memset(dn->dn_rm_spillblk, 0, sizeof (dn->dn_rm_spillblk)); + memset(dn->dn_next_bonuslen, 0, sizeof (dn->dn_next_bonuslen)); + memset(dn->dn_next_blksz, 0, sizeof (dn->dn_next_blksz)); + memset(dn->dn_next_maxblkid, 0, sizeof (dn->dn_next_maxblkid)); - for (i = 0; i < TXG_SIZE; i++) { + for (int i = 0; i < TXG_SIZE; i++) { multilist_link_init(&dn->dn_dirty_link[i]); dn->dn_free_ranges[i] = NULL; list_create(&dn->dn_dirty_records[i], @@ -174,11 +183,10 @@ dnode_cons(void *arg, void *unused, int kmflag) return (0); } -/* ARGSUSED */ static void dnode_dest(void *arg, void *unused) { - int i; + (void) unused; dnode_t *dn = arg; rw_destroy(&dn->dn_struct_rwlock); @@ -190,7 +198,7 @@ dnode_dest(void *arg, void *unused) 
zfs_refcount_destroy(&dn->dn_tx_holds); ASSERT(!list_link_active(&dn->dn_link)); - for (i = 0; i < TXG_SIZE; i++) { + for (int i = 0; i < TXG_SIZE; i++) { ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); list_destroy(&dn->dn_dirty_records[i]); @@ -227,6 +235,72 @@ dnode_dest(void *arg, void *unused) avl_destroy(&dn->dn_dbufs); } +static int +dnode_kstats_update(kstat_t *ksp, int rw) +{ + dnode_stats_t *ds = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + ds->dnode_hold_dbuf_hold.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_dbuf_hold); + ds->dnode_hold_dbuf_read.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_dbuf_read); + ds->dnode_hold_alloc_hits.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_alloc_hits); + ds->dnode_hold_alloc_misses.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_alloc_misses); + ds->dnode_hold_alloc_interior.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_alloc_interior); + ds->dnode_hold_alloc_lock_retry.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_alloc_lock_retry); + ds->dnode_hold_alloc_lock_misses.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_alloc_lock_misses); + ds->dnode_hold_alloc_type_none.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_alloc_type_none); + ds->dnode_hold_free_hits.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_free_hits); + ds->dnode_hold_free_misses.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_free_misses); + ds->dnode_hold_free_lock_misses.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_free_lock_misses); + ds->dnode_hold_free_lock_retry.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_free_lock_retry); + ds->dnode_hold_free_refcount.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_free_refcount); + ds->dnode_hold_free_overflow.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_free_overflow); + ds->dnode_free_interior_lock_retry.value.ui64 = + wmsum_value(&dnode_sums.dnode_free_interior_lock_retry); + ds->dnode_allocate.value.ui64 = + wmsum_value(&dnode_sums.dnode_allocate); + ds->dnode_reallocate.value.ui64 = + wmsum_value(&dnode_sums.dnode_reallocate); + ds->dnode_buf_evict.value.ui64 = + wmsum_value(&dnode_sums.dnode_buf_evict); + ds->dnode_alloc_next_chunk.value.ui64 = + wmsum_value(&dnode_sums.dnode_alloc_next_chunk); + ds->dnode_alloc_race.value.ui64 = + wmsum_value(&dnode_sums.dnode_alloc_race); + ds->dnode_alloc_next_block.value.ui64 = + wmsum_value(&dnode_sums.dnode_alloc_next_block); + ds->dnode_move_invalid.value.ui64 = + wmsum_value(&dnode_sums.dnode_move_invalid); + ds->dnode_move_recheck1.value.ui64 = + wmsum_value(&dnode_sums.dnode_move_recheck1); + ds->dnode_move_recheck2.value.ui64 = + wmsum_value(&dnode_sums.dnode_move_recheck2); + ds->dnode_move_special.value.ui64 = + wmsum_value(&dnode_sums.dnode_move_special); + ds->dnode_move_handle.value.ui64 = + wmsum_value(&dnode_sums.dnode_move_handle); + ds->dnode_move_rwlock.value.ui64 = + wmsum_value(&dnode_sums.dnode_move_rwlock); + ds->dnode_move_active.value.ui64 = + wmsum_value(&dnode_sums.dnode_move_active); + return (0); +} + void dnode_init(void) { @@ -235,11 +309,41 @@ dnode_init(void) 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); kmem_cache_set_move(dnode_cache, dnode_move); + wmsum_init(&dnode_sums.dnode_hold_dbuf_hold, 0); + wmsum_init(&dnode_sums.dnode_hold_dbuf_read, 0); + wmsum_init(&dnode_sums.dnode_hold_alloc_hits, 0); + wmsum_init(&dnode_sums.dnode_hold_alloc_misses, 0); + wmsum_init(&dnode_sums.dnode_hold_alloc_interior, 0); + 
wmsum_init(&dnode_sums.dnode_hold_alloc_lock_retry, 0); + wmsum_init(&dnode_sums.dnode_hold_alloc_lock_misses, 0); + wmsum_init(&dnode_sums.dnode_hold_alloc_type_none, 0); + wmsum_init(&dnode_sums.dnode_hold_free_hits, 0); + wmsum_init(&dnode_sums.dnode_hold_free_misses, 0); + wmsum_init(&dnode_sums.dnode_hold_free_lock_misses, 0); + wmsum_init(&dnode_sums.dnode_hold_free_lock_retry, 0); + wmsum_init(&dnode_sums.dnode_hold_free_refcount, 0); + wmsum_init(&dnode_sums.dnode_hold_free_overflow, 0); + wmsum_init(&dnode_sums.dnode_free_interior_lock_retry, 0); + wmsum_init(&dnode_sums.dnode_allocate, 0); + wmsum_init(&dnode_sums.dnode_reallocate, 0); + wmsum_init(&dnode_sums.dnode_buf_evict, 0); + wmsum_init(&dnode_sums.dnode_alloc_next_chunk, 0); + wmsum_init(&dnode_sums.dnode_alloc_race, 0); + wmsum_init(&dnode_sums.dnode_alloc_next_block, 0); + wmsum_init(&dnode_sums.dnode_move_invalid, 0); + wmsum_init(&dnode_sums.dnode_move_recheck1, 0); + wmsum_init(&dnode_sums.dnode_move_recheck2, 0); + wmsum_init(&dnode_sums.dnode_move_special, 0); + wmsum_init(&dnode_sums.dnode_move_handle, 0); + wmsum_init(&dnode_sums.dnode_move_rwlock, 0); + wmsum_init(&dnode_sums.dnode_move_active, 0); + dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc", KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (dnode_ksp != NULL) { dnode_ksp->ks_data = &dnode_stats; + dnode_ksp->ks_update = dnode_kstats_update; kstat_install(dnode_ksp); } } @@ -252,6 +356,35 @@ dnode_fini(void) dnode_ksp = NULL; } + wmsum_fini(&dnode_sums.dnode_hold_dbuf_hold); + wmsum_fini(&dnode_sums.dnode_hold_dbuf_read); + wmsum_fini(&dnode_sums.dnode_hold_alloc_hits); + wmsum_fini(&dnode_sums.dnode_hold_alloc_misses); + wmsum_fini(&dnode_sums.dnode_hold_alloc_interior); + wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_retry); + wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_misses); + wmsum_fini(&dnode_sums.dnode_hold_alloc_type_none); + wmsum_fini(&dnode_sums.dnode_hold_free_hits); + wmsum_fini(&dnode_sums.dnode_hold_free_misses); + wmsum_fini(&dnode_sums.dnode_hold_free_lock_misses); + wmsum_fini(&dnode_sums.dnode_hold_free_lock_retry); + wmsum_fini(&dnode_sums.dnode_hold_free_refcount); + wmsum_fini(&dnode_sums.dnode_hold_free_overflow); + wmsum_fini(&dnode_sums.dnode_free_interior_lock_retry); + wmsum_fini(&dnode_sums.dnode_allocate); + wmsum_fini(&dnode_sums.dnode_reallocate); + wmsum_fini(&dnode_sums.dnode_buf_evict); + wmsum_fini(&dnode_sums.dnode_alloc_next_chunk); + wmsum_fini(&dnode_sums.dnode_alloc_race); + wmsum_fini(&dnode_sums.dnode_alloc_next_block); + wmsum_fini(&dnode_sums.dnode_move_invalid); + wmsum_fini(&dnode_sums.dnode_move_recheck1); + wmsum_fini(&dnode_sums.dnode_move_recheck2); + wmsum_fini(&dnode_sums.dnode_move_special); + wmsum_fini(&dnode_sums.dnode_move_handle); + wmsum_fini(&dnode_sums.dnode_move_rwlock); + wmsum_fini(&dnode_sums.dnode_move_active); + kmem_cache_destroy(dnode_cache); dnode_cache = NULL; } @@ -319,7 +452,7 @@ dnode_byteswap(dnode_phys_t *dnp) int i; if (dnp->dn_type == DMU_OT_NONE) { - bzero(dnp, sizeof (dnode_phys_t)); + memset(dnp, 0, sizeof (dnode_phys_t)); return; } @@ -344,20 +477,11 @@ dnode_byteswap(dnode_phys_t *dnp) * dnode dnode is smaller than a regular dnode. */ if (dnp->dn_bonuslen != 0) { - /* - * Note that the bonus length calculated here may be - * longer than the actual bonus buffer. This is because - * we always put the bonus buffer after the last block - * pointer (instead of packing it against the end of the - * dnode buffer). 
- */ - int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); - int slots = dnp->dn_extra_slots + 1; - size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off; dmu_object_byteswap_t byteswap; ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype)); byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype); - dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len); + dmu_ot_byteswap[byteswap].ob_func(DN_BONUS(dnp), + DN_MAX_BONUS_LEN(dnp)); } /* Swap SPILL block if we have one */ @@ -397,7 +521,7 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) /* clear any data after the end of the new size */ size_t diff = dn->dn_bonuslen - newsize; char *data_end = ((char *)dn->dn_bonus->db.db_data) + newsize; - bzero(data_end, diff); + memset(data_end, 0, diff); } dn->dn_bonuslen = newsize; @@ -598,12 +722,13 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, DNODE_STAT_BUMP(dnode_allocate); ASSERT(dn->dn_type == DMU_OT_NONE); - ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); + ASSERT0(memcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t))); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); ASSERT(ot != DMU_OT_NONE); ASSERT(DMU_OT_IS_VALID(ot)); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype == DMU_OT_SA && bonuslen == 0) || + (bonustype == DMU_OTN_UINT64_METADATA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots)); @@ -751,8 +876,6 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, static void dnode_move_impl(dnode_t *odn, dnode_t *ndn) { - int i; - ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); @@ -776,29 +899,29 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ndn->dn_datablksz = odn->dn_datablksz; ndn->dn_maxblkid = odn->dn_maxblkid; ndn->dn_num_slots = odn->dn_num_slots; - bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0], + memcpy(ndn->dn_next_type, odn->dn_next_type, sizeof (odn->dn_next_type)); - bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], + memcpy(ndn->dn_next_nblkptr, odn->dn_next_nblkptr, sizeof (odn->dn_next_nblkptr)); - bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], + memcpy(ndn->dn_next_nlevels, odn->dn_next_nlevels, sizeof (odn->dn_next_nlevels)); - bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0], + memcpy(ndn->dn_next_indblkshift, odn->dn_next_indblkshift, sizeof (odn->dn_next_indblkshift)); - bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0], + memcpy(ndn->dn_next_bonustype, odn->dn_next_bonustype, sizeof (odn->dn_next_bonustype)); - bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0], + memcpy(ndn->dn_rm_spillblk, odn->dn_rm_spillblk, sizeof (odn->dn_rm_spillblk)); - bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0], + memcpy(ndn->dn_next_bonuslen, odn->dn_next_bonuslen, sizeof (odn->dn_next_bonuslen)); - bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], + memcpy(ndn->dn_next_blksz, odn->dn_next_blksz, sizeof (odn->dn_next_blksz)); - bcopy(&odn->dn_next_maxblkid[0], &ndn->dn_next_maxblkid[0], + memcpy(ndn->dn_next_maxblkid, odn->dn_next_maxblkid, sizeof (odn->dn_next_maxblkid)); - for (i = 0; i < TXG_SIZE; i++) { + for (int i = 0; i < TXG_SIZE; i++) { list_move_tail(&ndn->dn_dirty_records[i], &odn->dn_dirty_records[i]); } - bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0], + memcpy(ndn->dn_free_ranges, odn->dn_free_ranges, sizeof (odn->dn_free_ranges)); 
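This hunk (and many that follow) mechanically replaces the legacy BSD routines bcopy/bzero/bcmp with the ISO C memcpy/memset/memcmp. The only trap in the conversion is that bcopy and memcpy take source and destination in opposite order. A tiny standalone illustration of the equivalences being relied on throughout the diff:

	#include <assert.h>
	#include <string.h>

	int
	main(void)
	{
		char src[8] = "dnode", dst[8] = {0}, zero[8] = {0};

		/* bcopy(src, dst, n) becomes memcpy(dst, src, n): args swap */
		memcpy(dst, src, sizeof (src));
		/* bcmp(a, b, n) == 0 becomes memcmp(a, b, n) == 0 */
		assert(memcmp(dst, src, sizeof (src)) == 0);
		/* bzero(p, n) becomes memset(p, 0, n) */
		memset(dst, 0, sizeof (dst));
		assert(memcmp(dst, zero, sizeof (dst)) == 0);
		return (0);
	}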
ndn->dn_allocated_txg = odn->dn_allocated_txg; ndn->dn_free_txg = odn->dn_free_txg; @@ -852,7 +975,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) /* * Satisfy the destructor. */ - for (i = 0; i < TXG_SIZE; i++) { + for (int i = 0; i < TXG_SIZE; i++) { list_create(&odn->dn_dirty_records[i], sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); @@ -889,7 +1012,6 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) odn->dn_moved = (uint8_t)-1; } -/*ARGSUSED*/ static kmem_cbrc_t dnode_move(void *buf, void *newbuf, size_t size, void *arg) { @@ -1123,9 +1245,11 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots) return (B_TRUE); } -static void +static uint_t dnode_reclaim_slots(dnode_children_t *children, int idx, int slots) { + uint_t reclaimed = 0; + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); for (int i = idx; i < idx + slots; i++) { @@ -1137,8 +1261,11 @@ dnode_reclaim_slots(dnode_children_t *children, int idx, int slots) ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE); dnode_destroy(dnh->dnh_dnode); dnh->dnh_dnode = DN_SLOT_FREE; + reclaimed++; } } + + return (reclaimed); } void @@ -1156,7 +1283,7 @@ dnode_free_interior_slots(dnode_t *dn) while (!dnode_slots_tryenter(children, idx, slots)) { DNODE_STAT_BUMP(dnode_free_interior_lock_retry); - cond_resched(); + kpreempt(KPREEMPT_SYNC); } dnode_set_slots(children, idx, slots, DN_SLOT_FREE); @@ -1273,7 +1400,7 @@ dnode_buf_evict_async(void *dbu) */ int dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, - void *tag, dnode_t **dnp) + const void *tag, dnode_t **dnp) { int epb, idx, err; int drop_struct_lock = FALSE; @@ -1437,7 +1564,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, dnode_slots_rele(dnc, idx, slots); while (!dnode_slots_tryenter(dnc, idx, slots)) { DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry); - cond_resched(); + kpreempt(KPREEMPT_SYNC); } /* @@ -1451,6 +1578,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, } else { dn = dnode_create(os, dn_block + idx, db, object, dnh); + dmu_buf_add_user_size(&db->db, + sizeof (dnode_t)); } } @@ -1492,7 +1621,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, dnode_slots_rele(dnc, idx, slots); while (!dnode_slots_tryenter(dnc, idx, slots)) { DNODE_STAT_BUMP(dnode_hold_free_lock_retry); - cond_resched(); + kpreempt(KPREEMPT_SYNC); } if (!dnode_check_slots_free(dnc, idx, slots)) { @@ -1508,8 +1637,13 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, * to be freed. Single slot dnodes can be safely * re-purposed as a performance optimization. */ - if (slots > 1) - dnode_reclaim_slots(dnc, idx + 1, slots - 1); + if (slots > 1) { + uint_t reclaimed = + dnode_reclaim_slots(dnc, idx + 1, slots - 1); + if (reclaimed > 0) + dmu_buf_sub_user_size(&db->db, + reclaimed * sizeof (dnode_t)); + } dnh = &dnc->dnc_children[idx]; if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { @@ -1517,6 +1651,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, } else { dn = dnode_create(os, dn_block + idx, db, object, dnh); + dmu_buf_add_user_size(&db->db, sizeof (dnode_t)); } mutex_enter(&dn->dn_mtx); @@ -1567,7 +1702,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, * Return held dnode if the object is allocated, NULL if not. 
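dnode_reclaim_slots() now reports how many embedded dnode_t structures it destroyed so the caller can shrink the dnode block's "user size" accounting by the same amount, and the matching dmu_buf_add_user_size() call is made whenever a dnode_t is instantiated against the block. A condensed restatement of that bookkeeping, merging two separate call sites from the hunks above purely for illustration (kernel context, error handling omitted):

	/*
	 * Instantiating a dnode_t against this dnode block effectively
	 * grows the buffer, so charge the bytes to the dbuf's user size.
	 */
	dn = dnode_create(os, dn_block + idx, db, object, dnh);
	dmu_buf_add_user_size(&db->db, sizeof (dnode_t));

	/*
	 * When interior slots are reclaimed for a multi-slot allocation,
	 * credit back exactly as many dnode_t's as were destroyed.
	 */
	if (slots > 1) {
		uint_t reclaimed =
		    dnode_reclaim_slots(dnc, idx + 1, slots - 1);
		if (reclaimed > 0)
			dmu_buf_sub_user_size(&db->db,
			    reclaimed * sizeof (dnode_t));
	}

Keeping the user size in step with the number of live dnode_t's is what lets ARC accounting reflect the real memory footprint of a cached dnode block.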
*/ int -dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) +dnode_hold(objset_t *os, uint64_t object, const void *tag, dnode_t **dnp) { return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag, dnp)); @@ -1579,7 +1714,7 @@ dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) * new reference. */ boolean_t -dnode_add_ref(dnode_t *dn, void *tag) +dnode_add_ref(dnode_t *dn, const void *tag) { mutex_enter(&dn->dn_mtx); if (zfs_refcount_is_zero(&dn->dn_holds)) { @@ -1592,14 +1727,14 @@ dnode_add_ref(dnode_t *dn, void *tag) } void -dnode_rele(dnode_t *dn, void *tag) +dnode_rele(dnode_t *dn, const void *tag) { mutex_enter(&dn->dn_mtx); dnode_rele_and_unlock(dn, tag, B_FALSE); } void -dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting) +dnode_rele_and_unlock(dnode_t *dn, const void *tag, boolean_t evicting) { uint64_t refs; /* Get while the hold prevents the dnode from moving. */ @@ -1621,7 +1756,9 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting) * other direct or indirect hold on the dnode must first drop the dnode * handle. */ +#ifdef ZFS_DEBUG ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); +#endif /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ if (refs == 0 && db != NULL) { @@ -1649,7 +1786,14 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots) } /* - * Checks if the dnode contains any uncommitted dirty records. + * Checks if the dnode itself is dirty, or is carrying any uncommitted records. + * It is important to check both conditions, as some operations (eg appending + * to a file) can dirty both as a single logical unit, but they are not synced + * out atomically, so checking one and not the other can result in an object + * appearing to be clean mid-way through a commit. + * + * Do not change this lightly! If you get it wrong, dmu_offset_next() can + * detect a hole where there is really data, leading to silent corruption. 
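The new comment warns that this check feeds dmu_offset_next(), which in turn backs lseek(SEEK_HOLE)/lseek(SEEK_DATA). A small userspace probe of that interface (file name hypothetical) shows what is at stake: a copy tool that trusts SEEK_DATA/SEEK_HOLE will silently skip any region the filesystem wrongly reports as a hole.

	#define	_GNU_SOURCE	/* SEEK_DATA/SEEK_HOLE on Linux */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		int fd = open("testfile", O_RDONLY);
		if (fd == -1) {
			perror("open");
			return (1);
		}
		off_t end = lseek(fd, 0, SEEK_END);
		/* first data at or after offset 0; -1/ENXIO if none */
		off_t data = lseek(fd, 0, SEEK_DATA);
		off_t hole = (data >= 0) ? lseek(fd, data, SEEK_HOLE) : -1;
		printf("size=%lld first_data=%lld next_hole=%lld\n",
		    (long long)end, (long long)data, (long long)hole);
		close(fd);
		return (0);
	}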
*/ boolean_t dnode_is_dirty(dnode_t *dn) @@ -1657,7 +1801,8 @@ dnode_is_dirty(dnode_t *dn) mutex_enter(&dn->dn_mtx); for (int i = 0; i < TXG_SIZE; i++) { - if (multilist_link_active(&dn->dn_dirty_link[i])) { + if (multilist_link_active(&dn->dn_dirty_link[i]) || + !list_is_empty(&dn->dn_dirty_records[i])) { mutex_exit(&dn->dn_mtx); return (B_TRUE); } @@ -1767,7 +1912,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) if (ibs == dn->dn_indblkshift) ibs = 0; - if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) + if (size == dn->dn_datablksz && ibs == 0) return (0); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); @@ -1790,24 +1935,25 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) if (ibs && dn->dn_nlevels != 1) goto fail; - /* resize the old block */ - err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); - if (err == 0) { - dbuf_new_size(db, size, tx); - } else if (err != ENOENT) { - goto fail; - } - - dnode_setdblksz(dn, size); dnode_setdirty(dn, tx); - dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; + if (size != dn->dn_datablksz) { + /* resize the old block */ + err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); + if (err == 0) { + dbuf_new_size(db, size, tx); + } else if (err != ENOENT) { + goto fail; + } + + dnode_setdblksz(dn, size); + dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size; + if (db) + dbuf_rele(db, FTAG); + } if (ibs) { dn->dn_indblkshift = ibs; - dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; + dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; } - /* release after we have fixed the blocksize in the dnode */ - if (db) - dbuf_rele(db, FTAG); rw_exit(&dn->dn_struct_rwlock); return (0); @@ -2032,7 +2178,7 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, } void -dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, void *tag) +dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, const void *tag) { /* * Don't set dirtyctx to SYNC if we're just modifying this as we @@ -2082,7 +2228,7 @@ dnode_partial_zero(dnode_t *dn, uint64_t off, uint64_t blkoff, uint64_t len, dmu_buf_will_dirty(&db->db, tx); data = db->db.db_data; - bzero(data + blkoff, len); + memset(data + blkoff, 0, len); } dbuf_rele(db, FTAG); } @@ -2292,19 +2438,11 @@ dnode_spill_freed(dnode_t *dn) uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid) { - void *dp = spa_get_dsl(dn->dn_objset->os_spa); int i; if (blkid == DMU_BONUS_BLKID) return (FALSE); - /* - * If we're in the process of opening the pool, dp will not be - * set yet, but there shouldn't be anything dirty. 
- */ - if (dp == NULL) - return (FALSE); - if (dn->dn_free_txg) return (TRUE); @@ -2419,7 +2557,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, } if (db != NULL && txg != 0 && (db->db_blkptr == NULL || - db->db_blkptr->blk_birth <= txg || + BP_GET_LOGICAL_BIRTH(db->db_blkptr) <= txg || BP_IS_HOLE(db->db_blkptr))) { /* * This can only happen when we are searching up the tree @@ -2467,7 +2605,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, i >= 0 && i < epb; i += inc) { if (BP_GET_FILL(&bp[i]) >= minfill && BP_GET_FILL(&bp[i]) <= maxfill && - (hole || bp[i].blk_birth > txg)) + (hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg)) break; if (inc > 0 || *offset > 0) *offset += inc; @@ -2481,8 +2619,9 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, if (inc < 0) { /* traversing backwards; position offset at the end */ - ASSERT3U(*offset, <=, start); - *offset = MIN(*offset + (1ULL << span) - 1, start); + if (span < 8 * sizeof (*offset)) + *offset = MIN(*offset + (1ULL << span) - 1, + start); } else if (*offset < start) { *offset = start; } @@ -2589,3 +2728,8 @@ EXPORT_SYMBOL(dnode_free_range); EXPORT_SYMBOL(dnode_evict_dbufs); EXPORT_SYMBOL(dnode_evict_bonus); #endif + +ZFS_MODULE_PARAM(zfs, zfs_, default_bs, INT, ZMOD_RW, + "Default dnode block shift"); +ZFS_MODULE_PARAM(zfs, zfs_, default_ibs, INT, ZMOD_RW, + "Default dnode indirect block shift"); diff --git a/sys/contrib/openzfs/module/zfs/dnode_sync.c b/sys/contrib/openzfs/module/zfs/dnode_sync.c index dd37e3af7ed5..f67dad002319 100644 --- a/sys/contrib/openzfs/module/zfs/dnode_sync.c +++ b/sys/contrib/openzfs/module/zfs/dnode_sync.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
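The last dnode.c hunk above guards the backwards-traversal adjustment with span < 8 * sizeof (*offset): shifting a 64-bit value by 64 or more bits is undefined behaviour in C, so the old unconditional 1ULL << span was only safe for spans below the type width. A minimal standalone illustration of the guarded form (helper name invented):

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Clamp 'offset' to the end of the region covered by 'span' bits,
	 * but never shift by the full width of the type (undefined in C).
	 */
	static uint64_t
	clamp_to_span_end(uint64_t offset, uint64_t start, unsigned span)
	{
		if (span < 8 * sizeof (offset)) {
			uint64_t end = offset + (1ULL << span) - 1;
			offset = (end < start) ? end : start;
		}
		return (offset);
	}

	int
	main(void)
	{
		/* prints 7: normal case, span of 3 bits covers 8 offsets */
		printf("%llu\n",
		    (unsigned long long)clamp_to_span_end(0, 100, 3));
		/* prints 0: span == 64, adjustment skipped instead of UB */
		printf("%llu\n",
		    (unsigned long long)clamp_to_span_end(0, 100, 64));
		return (0);
	}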
* @@ -70,8 +70,8 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) dmu_buf_impl_t *children[DN_MAX_NBLKPTR]; ASSERT3U(nblkptr, <=, DN_MAX_NBLKPTR); for (i = 0; i < nblkptr; i++) { - children[i] = - dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i); + children[i] = dbuf_find(dn->dn_objset, dn->dn_object, + old_toplvl, i, NULL); } /* transfer dnode's block pointers to new indirect block */ @@ -82,7 +82,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) ASSERT(db->db.db_data); ASSERT(arc_released(db->db_buf)); ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); - bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, + memcpy(db->db.db_data, dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); arc_buf_freeze(db->db_buf); @@ -119,7 +119,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) mutex_exit(&child->db_mtx); } - bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); + memset(dn->dn_phys->dn_blkptr, 0, sizeof (blkptr_t) * nblkptr); rw_exit(&db->db_rwlock); if (dn->dn_dbuf != NULL) @@ -158,7 +158,7 @@ free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) dmu_object_type_t type = BP_GET_TYPE(bp); uint64_t lvl = BP_GET_LEVEL(bp); - bzero(bp, sizeof (blkptr_t)); + memset(bp, 0, sizeof (blkptr_t)); if (spa_feature_is_active(dn->dn_objset->os_spa, SPA_FEATURE_HOLE_BIRTH)) { @@ -175,19 +175,21 @@ free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) static void free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) { - int off, num; - int i, err, epbs; + uint64_t off, num, i, j; + unsigned int epbs; + int err; uint64_t txg = tx->tx_txg; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - off = start - (db->db_blkid * 1<<epbs); + off = start - (db->db_blkid << epbs); num = end - start + 1; - ASSERT3U(off, >=, 0); - ASSERT3U(num, >=, 0); + ASSERT3U(dn->dn_phys->dn_indblkshift, >=, SPA_BLKPTRSHIFT); + ASSERT3U(end + 1, >=, start); + ASSERT3U(start, >=, (db->db_blkid << epbs)); ASSERT3U(db->db_level, >, 0); ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT); @@ -197,7 +199,6 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) uint64_t *buf; dmu_buf_impl_t *child; dbuf_dirty_record_t *dr; - int j; ASSERT(db->db_level == 1); @@ -217,8 +218,11 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " - "child=%p i=%d off=%d num=%d\n", - (void *)child, i, off, num); + "child=%p i=%llu off=%llu " + "num=%llu\n", + (void *)child, (u_longlong_t)i, + (u_longlong_t)off, + (u_longlong_t)num); } } } @@ -234,8 +238,11 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " - "child=%p i=%d off=%d num=%d\n", - (void *)child, i, off, num); + "child=%p i=%llu off=%llu " + "num=%llu\n", + (void *)child, (u_longlong_t)i, + (u_longlong_t)off, + (u_longlong_t)num); } } } @@ -347,7 +354,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, rw_enter(&db->db_rwlock, RW_WRITER); for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) ASSERT(BP_IS_HOLE(bp)); - bzero(db->db.db_data, db->db.db_size); + memset(db->db.db_data, 0, db->db.db_size); free_blocks(dn, db->db_blkptr, 1, tx); rw_exit(&db->db_rwlock); } @@ -475,7 +482,14 @@ dnode_evict_dbufs(dnode_t 
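free_verify() now prints its uint64_t values with %llu plus an explicit (u_longlong_t) cast. uint64_t is "long" on some 64-bit ABIs and "long long" on others, so the cast is what makes a single format string portable across them. A tiny standalone version of the idiom (u_longlong_t is the illumos/ZFS typedef for unsigned long long):

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t off = 12345, num = 67890;

		/* kernel style: cast so %llu always matches the argument */
		printf("off=%llu num=%llu\n",
		    (unsigned long long)off, (unsigned long long)num);

		/* userspace alternative: PRIu64 from <inttypes.h> */
		printf("off=%" PRIu64 " num=%" PRIu64 "\n", off, num);
		return (0);
	}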
*dn) zfs_refcount_is_zero(&db->db_holds)) { db_marker->db_level = db->db_level; db_marker->db_blkid = db->db_blkid; - db_marker->db_state = DB_SEARCH; + /* + * Insert a MARKER node with the same level and blkid. + * And to resolve any ties in dbuf_compare() use the + * pointer of the dbuf that we are evicting. Pass the + * address in db_parent. + */ + db_marker->db_state = DB_MARKER; + db_marker->db_parent = (void *)((uintptr_t)db - 1); avl_insert_here(&dn->dn_dbufs, db_marker, db, AVL_BEFORE); @@ -597,7 +611,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) ASSERT(dn->dn_free_txg > 0); if (dn->dn_allocated_txg != dn->dn_free_txg) dmu_buf_will_dirty(&dn->dn_dbuf->db, tx); - bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots); + memset(dn->dn_phys, 0, sizeof (dnode_phys_t) * dn->dn_num_slots); dnode_free_interior_slots(dn); mutex_enter(&dn->dn_mtx); @@ -620,6 +634,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) /* * Write out the dnode's dirty buffers. + * Does not wait for zio completions. */ void dnode_sync(dnode_t *dn, dmu_tx_t *tx) @@ -634,7 +649,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); ASSERT(dnp->dn_type != DMU_OT_NONE || - bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0); + memcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0); DNODE_VERIFY(dn); ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); @@ -655,8 +670,13 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) DNODE_FLAG_USEROBJUSED_ACCOUNTED; mutex_exit(&dn->dn_mtx); dmu_objset_userquota_get_ids(dn, B_FALSE, tx); - } else { - /* Once we account for it, we should always account for it */ + } else if (!(os->os_encrypted && dmu_objset_is_receiving(os))) { + /* + * Once we account for it, we should always account for it, + * except for the case of a raw receive. We will not be able + * to account for it until the receiving dataset has been + * mounted. + */ ASSERT(!(dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED)); ASSERT(!(dn->dn_phys->dn_flags & @@ -822,7 +842,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) ASSERT(dn->dn_allocated_txg == tx->tx_txg); if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) { /* zero the new blkptrs we are gaining */ - bzero(dnp->dn_blkptr + dnp->dn_nblkptr, + memset(dnp->dn_blkptr + dnp->dn_nblkptr, 0, sizeof (blkptr_t) * (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr)); #ifdef ZFS_DEBUG @@ -849,6 +869,8 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); } + ASSERT3U(dnp->dn_bonuslen, <=, DN_MAX_BONUS_LEN(dnp)); + /* * Although we have dropped our reference to the dnode, it * can't be evicted until its written, and we haven't yet diff --git a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c index bead7da2237f..5fd8bc2a2682 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c +++ b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c @@ -34,10 +34,11 @@ #include <sys/dsl_bookmark.h> #include <zfs_namecheck.h> #include <sys/dmu_send.h> +#include <sys/dbuf.h> static int dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname, - dsl_dataset_t **dsp, void *tag, char **shortnamep) + dsl_dataset_t **dsp, const void *tag, char **shortnamep) { char buf[ZFS_MAX_DATASET_NAME_LEN]; char *hashp; @@ -82,7 +83,7 @@ dsl_bookmark_lookup_impl(dsl_dataset_t *ds, const char *shortname, * Zero out the bookmark in case the one stored on disk * is in an older, shorter format. 
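dnode_evict_dbufs() keeps its place in the dn_dbufs AVL tree with a stack-allocated marker dbuf; the hunk above switches the marker to a dedicated DB_MARKER state and uses db_parent to break comparison ties against the dbuf being evicted. A generic, hedged sketch of the marker-walk pattern it relies on (node_t, needs_work() and do_expensive_work() are placeholders; the real code drops dn_dbufs_mtx while evicting):

	node_t marker;	/* local; never visible as a real tree node */

	mutex_enter(&tree_lock);
	for (node_t *n = avl_first(&tree); n != NULL; ) {
		if (needs_work(n)) {
			/* remember our position, then drop the lock */
			avl_insert_here(&tree, &marker, n, AVL_BEFORE);
			mutex_exit(&tree_lock);

			do_expensive_work(n);	/* may remove or free n */

			mutex_enter(&tree_lock);
			n = AVL_NEXT(&tree, &marker);
			avl_remove(&tree, &marker);
		} else {
			n = AVL_NEXT(&tree, n);
		}
	}
	mutex_exit(&tree_lock);

Because the marker sits in the tree while the lock is dropped, the walk can resume correctly even if the node it was parked next to is removed in the meantime.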
*/ - bzero(bmark_phys, sizeof (*bmark_phys)); + memset(bmark_phys, 0, sizeof (*bmark_phys)); err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t), sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, NULL, 0, @@ -160,15 +161,14 @@ dsl_bookmark_create_nvl_validate_pair(const char *bmark, const char *source) int dsl_bookmark_create_nvl_validate(nvlist_t *bmarks) { - char *first; - size_t first_len; + const char *first = NULL; + size_t first_len = 0; - first = NULL; for (nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL); pair != NULL; pair = nvlist_next_nvpair(bmarks, pair)) { - char *bmark = nvpair_name(pair); - char *source; + const char *bmark = nvpair_name(pair); + const char *source; /* list structure: values must be snapshots XOR bookmarks */ if (nvpair_value_string(pair, &source) != 0) @@ -178,7 +178,7 @@ dsl_bookmark_create_nvl_validate(nvlist_t *bmarks) /* same pool check */ if (first == NULL) { - char *cp = strpbrk(bmark, "/#"); + const char *cp = strpbrk(bmark, "/#"); if (cp == NULL) return (-1); first = bmark; @@ -230,7 +230,6 @@ dsl_bookmark_create_check_impl(dsl_pool_t *dp, switch (error) { case ESRCH: /* happy path: new bmark doesn't exist, proceed after switch */ - error = 0; break; case 0: error = SET_ERROR(EEXIST); @@ -307,11 +306,11 @@ dsl_bookmark_create_check(void *arg, dmu_tx_t *tx) for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { - char *new = nvpair_name(pair); + const char *new = nvpair_name(pair); int error = schema_err; if (error == 0) { - char *source = fnvpair_value_string(pair); + const char *source = fnvpair_value_string(pair); error = dsl_bookmark_create_check_impl(dp, new, source); if (error != 0) error = SET_ERROR(error); @@ -347,6 +346,8 @@ dsl_bookmark_set_phys(zfs_bookmark_phys_t *zbm, dsl_dataset_t *snap) spa_t *spa = dsl_dataset_get_spa(snap); objset_t *mos = spa_get_dsl(spa)->dp_meta_objset; dsl_dataset_phys_t *dsp = dsl_dataset_phys(snap); + + memset(zbm, 0, sizeof (zfs_bookmark_phys_t)); zbm->zbm_guid = dsp->ds_guid; zbm->zbm_creation_txg = dsp->ds_creation_txg; zbm->zbm_creation_time = dsp->ds_creation_time; @@ -380,10 +381,6 @@ dsl_bookmark_set_phys(zfs_bookmark_phys_t *zbm, dsl_dataset_t *snap) &zbm->zbm_compressed_freed_before_next_snap, &zbm->zbm_uncompressed_freed_before_next_snap); dsl_dataset_rele(nextds, FTAG); - } else { - bzero(&zbm->zbm_flags, - sizeof (zfs_bookmark_phys_t) - - offsetof(zfs_bookmark_phys_t, zbm_flags)); } } @@ -426,8 +423,8 @@ dsl_bookmark_node_add(dsl_dataset_t *hds, dsl_bookmark_node_t *dbn, spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2, tx); } - __attribute__((unused)) zfs_bookmark_phys_t zero_phys = { 0 }; - ASSERT0(bcmp(((char *)&dbn->dbn_phys) + bookmark_phys_size, + zfs_bookmark_phys_t zero_phys = { 0 }; + ASSERT0(memcmp(((char *)&dbn->dbn_phys) + bookmark_phys_size, &zero_phys, sizeof (zfs_bookmark_phys_t) - bookmark_phys_size)); VERIFY0(zap_add(mos, hds->ds_bookmarks_obj, dbn->dbn_name, @@ -441,8 +438,8 @@ dsl_bookmark_node_add(dsl_dataset_t *hds, dsl_bookmark_node_t *dbn, */ static void dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot, - dmu_tx_t *tx, uint64_t num_redact_snaps, uint64_t *redact_snaps, void *tag, - redaction_list_t **redaction_list) + dmu_tx_t *tx, uint64_t num_redact_snaps, uint64_t *redact_snaps, + const void *tag, redaction_list_t **redaction_list) { dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; @@ -463,26 +460,43 @@ 
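Several of these hunks only change char * to const char * for names and string values pulled out of nvlists, matching the const-ified libnvpair interfaces. The iteration idiom itself, in a self-contained userspace form (the bookmark and snapshot names are made up; link against libnvpair):

	#include <libnvpair.h>
	#include <stdio.h>

	int
	main(void)
	{
		nvlist_t *bmarks = fnvlist_alloc();

		fnvlist_add_string(bmarks, "pool/fs#mark1", "pool/fs@snap1");
		fnvlist_add_string(bmarks, "pool/fs#mark2", "pool/fs@snap2");

		for (nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL);
		    pair != NULL; pair = nvlist_next_nvpair(bmarks, pair)) {
			const char *bmark = nvpair_name(pair);
			const char *source = fnvpair_value_string(pair);
			printf("%s -> %s\n", bmark, source);
		}

		fnvlist_free(bmarks);
		return (0);
	}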
dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot, SPA_FEATURE_REDACTED_DATASETS, &dsnumsnaps, &dsredactsnaps); if (redaction_list != NULL || bookmark_redacted) { redaction_list_t *local_rl; + boolean_t spill = B_FALSE; if (bookmark_redacted) { redact_snaps = dsredactsnaps; num_redact_snaps = dsnumsnaps; } + int bonuslen = sizeof (redaction_list_phys_t) + + num_redact_snaps * sizeof (uint64_t); + if (bonuslen > dmu_bonus_max()) + spill = B_TRUE; dbn->dbn_phys.zbm_redaction_obj = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, - DMU_OTN_UINT64_METADATA, sizeof (redaction_list_phys_t) + - num_redact_snaps * sizeof (uint64_t), tx); + DMU_OTN_UINT64_METADATA, spill ? 0 : bonuslen, tx); spa_feature_incr(dp->dp_spa, SPA_FEATURE_REDACTION_BOOKMARKS, tx); + if (spill) { + spa_feature_incr(dp->dp_spa, + SPA_FEATURE_REDACTION_LIST_SPILL, tx); + } VERIFY0(dsl_redaction_list_hold_obj(dp, dbn->dbn_phys.zbm_redaction_obj, tag, &local_rl)); dsl_redaction_list_long_hold(dp, local_rl, tag); - ASSERT3U((local_rl)->rl_dbuf->db_size, >=, - sizeof (redaction_list_phys_t) + num_redact_snaps * - sizeof (uint64_t)); - dmu_buf_will_dirty(local_rl->rl_dbuf, tx); - bcopy(redact_snaps, local_rl->rl_phys->rlp_snaps, + if (!spill) { + ASSERT3U(local_rl->rl_bonus->db_size, >=, bonuslen); + dmu_buf_will_dirty(local_rl->rl_bonus, tx); + } else { + dmu_buf_t *db; + VERIFY0(dmu_spill_hold_by_bonus(local_rl->rl_bonus, + DB_RF_MUST_SUCCEED, FTAG, &db)); + dmu_buf_will_fill(db, tx, B_FALSE); + VERIFY0(dbuf_spill_set_blksz(db, P2ROUNDUP(bonuslen, + SPA_MINBLOCKSIZE), tx)); + local_rl->rl_phys = db->db_data; + local_rl->rl_dbuf = db; + } + memcpy(local_rl->rl_phys->rlp_snaps, redact_snaps, sizeof (uint64_t) * num_redact_snaps); local_rl->rl_phys->rlp_num_snaps = num_redact_snaps; if (bookmark_redacted) { @@ -593,8 +607,8 @@ dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx) for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { - char *new = nvpair_name(pair); - char *source = fnvpair_value_string(pair); + const char *new = nvpair_name(pair); + const char *source = fnvpair_value_string(pair); if (strchr(source, '@') != NULL) { dsl_bookmark_create_sync_impl_snap(new, source, tx, @@ -640,11 +654,15 @@ dsl_bookmark_create_redacted_check(void *arg, dmu_tx_t *tx) SPA_FEATURE_REDACTION_BOOKMARKS)) return (SET_ERROR(ENOTSUP)); /* - * If the list of redact snaps will not fit in the bonus buffer with - * the furthest reached object and offset, fail. + * If the list of redact snaps will not fit in the bonus buffer (or + * spill block, with the REDACTION_LIST_SPILL feature) with the + * furthest reached object and offset, fail. */ - if (dbcra->dbcra_numsnaps > (dmu_bonus_max() - - sizeof (redaction_list_phys_t)) / sizeof (uint64_t)) + uint64_t snaplimit = ((spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_REDACTION_LIST_SPILL) ? 
spa_maxblocksize(dp->dp_spa) : + dmu_bonus_max()) - + sizeof (redaction_list_phys_t)) / sizeof (uint64_t); + if (dbcra->dbcra_numsnaps > snaplimit) return (SET_ERROR(E2BIG)); if (dsl_bookmark_create_nvl_validate_pair( @@ -667,7 +685,8 @@ dsl_bookmark_create_redacted_sync(void *arg, dmu_tx_t *tx) int dsl_bookmark_create_redacted(const char *bookmark, const char *snapshot, - uint64_t numsnaps, uint64_t *snapguids, void *tag, redaction_list_t **rl) + uint64_t numsnaps, uint64_t *snapguids, const void *tag, + redaction_list_t **rl) { dsl_bookmark_create_redacted_arg_t dbcra; @@ -1043,6 +1062,14 @@ dsl_bookmark_destroy_sync_impl(dsl_dataset_t *ds, const char *name, } if (dbn->dbn_phys.zbm_redaction_obj != 0) { + dnode_t *rl; + VERIFY0(dnode_hold(mos, + dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl)); + if (rl->dn_have_spill) { + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_REDACTION_LIST_SPILL, tx); + } + dnode_rele(rl, FTAG); VERIFY0(dmu_object_free(mos, dbn->dbn_phys.zbm_redaction_obj, tx)); spa_feature_decr(dmu_objset_spa(mos), @@ -1191,19 +1218,19 @@ dsl_redaction_list_long_held(redaction_list_t *rl) } void -dsl_redaction_list_long_hold(dsl_pool_t *dp, redaction_list_t *rl, void *tag) +dsl_redaction_list_long_hold(dsl_pool_t *dp, redaction_list_t *rl, + const void *tag) { ASSERT(dsl_pool_config_held(dp)); (void) zfs_refcount_add(&rl->rl_longholds, tag); } void -dsl_redaction_list_long_rele(redaction_list_t *rl, void *tag) +dsl_redaction_list_long_rele(redaction_list_t *rl, const void *tag) { (void) zfs_refcount_remove(&rl->rl_longholds, tag); } -/* ARGSUSED */ static void redaction_list_evict_sync(void *rlu) { @@ -1214,17 +1241,19 @@ redaction_list_evict_sync(void *rlu) } void -dsl_redaction_list_rele(redaction_list_t *rl, void *tag) +dsl_redaction_list_rele(redaction_list_t *rl, const void *tag) { - dmu_buf_rele(rl->rl_dbuf, tag); + if (rl->rl_bonus != rl->rl_dbuf) + dmu_buf_rele(rl->rl_dbuf, tag); + dmu_buf_rele(rl->rl_bonus, tag); } int -dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, void *tag, +dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, const void *tag, redaction_list_t **rlp) { objset_t *mos = dp->dp_meta_objset; - dmu_buf_t *dbuf; + dmu_buf_t *dbuf, *spill_dbuf; redaction_list_t *rl; int err; @@ -1239,13 +1268,18 @@ dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, void *tag, redaction_list_t *winner = NULL; rl = kmem_zalloc(sizeof (redaction_list_t), KM_SLEEP); - rl->rl_dbuf = dbuf; + rl->rl_bonus = dbuf; + if (dmu_spill_hold_existing(dbuf, tag, &spill_dbuf) == 0) { + rl->rl_dbuf = spill_dbuf; + } else { + rl->rl_dbuf = dbuf; + } rl->rl_object = rlobj; - rl->rl_phys = dbuf->db_data; + rl->rl_phys = rl->rl_dbuf->db_data; rl->rl_mos = dp->dp_meta_objset; zfs_refcount_create(&rl->rl_longholds); dmu_buf_init_user(&rl->rl_dbu, redaction_list_evict_sync, NULL, - &rl->rl_dbuf); + &rl->rl_bonus); if ((winner = dmu_buf_set_user_ie(dbuf, &rl->rl_dbu)) != NULL) { kmem_free(rl, sizeof (*rl)); rl = winner; @@ -1295,7 +1329,7 @@ dsl_bookmark_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) * The empty-string name can't be in the AVL, and it compares * before any entries with this TXG. 
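The redaction-list changes let dsl_bookmark spill the snapshot-guid array into a spill block (bumping the new SPA_FEATURE_REDACTION_LIST_SPILL feature) once it no longer fits in the dnode's bonus buffer. A condensed, annotated restatement of the creation-side decision taken from the hunks above (kernel context; error handling and the surrounding feature bookkeeping are omitted):

	int bonuslen = sizeof (redaction_list_phys_t) +
	    num_redact_snaps * sizeof (uint64_t);
	boolean_t spill = (bonuslen > dmu_bonus_max());

	/* A spilled list is created with an empty (length 0) bonus buffer. */
	uint64_t obj = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
	    SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
	    spill ? 0 : bonuslen, tx);

	if (spill) {
		dmu_buf_t *db;
		/* attach a spill block rounded up to a 512-byte multiple */
		VERIFY0(dmu_spill_hold_by_bonus(local_rl->rl_bonus,
		    DB_RF_MUST_SUCCEED, FTAG, &db));
		dmu_buf_will_fill(db, tx, B_FALSE);
		VERIFY0(dbuf_spill_set_blksz(db,
		    P2ROUNDUP(bonuslen, SPA_MINBLOCKSIZE), tx));
	}

The corresponding hold path, also shown above, tries dmu_spill_hold_existing() first and falls back to reading the list out of the bonus buffer when no spill block exists.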
*/ - search.dbn_name = ""; + search.dbn_name = (char *)""; VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL); dsl_bookmark_node_t *dbn = avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER); @@ -1422,7 +1456,7 @@ dsl_bookmark_next_changed(dsl_dataset_t *head, dsl_dataset_t *origin, * The empty-string name can't be in the AVL, and it compares * before any entries with this TXG. */ - search.dbn_name = ""; + search.dbn_name = (char *)""; VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL); dsl_bookmark_node_t *dbn = avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER); @@ -1470,10 +1504,11 @@ dsl_bookmark_next_changed(dsl_dataset_t *head, dsl_dataset_t *origin, * Adjust the FBN of any bookmarks that reference this block, whose "next" * is the head dataset. */ -/* ARGSUSED */ void dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) { + (void) tx; + /* * Iterate over bookmarks whose "next" is the head dataset. */ @@ -1485,7 +1520,8 @@ dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) * If the block was live (referenced) at the time of this * bookmark, add its space to the bookmark's FBN. */ - if (bp->blk_birth <= dbn->dbn_phys.zbm_creation_txg && + if (BP_GET_LOGICAL_BIRTH(bp) <= + dbn->dbn_phys.zbm_creation_txg && (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) { mutex_enter(&dbn->dbn_lock); dbn->dbn_phys.zbm_referenced_freed_before_next_snap += diff --git a/sys/contrib/openzfs/module/zfs/dsl_crypt.c b/sys/contrib/openzfs/module/zfs/dsl_crypt.c index 26d4c2fe7e33..8e1055d9bcb1 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_crypt.c +++ b/sys/contrib/openzfs/module/zfs/dsl_crypt.c @@ -80,13 +80,13 @@ int zfs_disable_ivset_guid_check = 0; static void -dsl_wrapping_key_hold(dsl_wrapping_key_t *wkey, void *tag) +dsl_wrapping_key_hold(dsl_wrapping_key_t *wkey, const void *tag) { (void) zfs_refcount_add(&wkey->wk_refcnt, tag); } static void -dsl_wrapping_key_rele(dsl_wrapping_key_t *wkey, void *tag) +dsl_wrapping_key_rele(dsl_wrapping_key_t *wkey, const void *tag) { (void) zfs_refcount_remove(&wkey->wk_refcnt, tag); } @@ -97,7 +97,7 @@ dsl_wrapping_key_free(dsl_wrapping_key_t *wkey) ASSERT0(zfs_refcount_count(&wkey->wk_refcnt)); if (wkey->wk_key.ck_data) { - bzero(wkey->wk_key.ck_data, + memset(wkey->wk_key.ck_data, 0, CRYPTO_BITS2BYTES(wkey->wk_key.ck_length)); kmem_free(wkey->wk_key.ck_data, CRYPTO_BITS2BYTES(wkey->wk_key.ck_length)); @@ -119,9 +119,8 @@ dsl_wrapping_key_create(uint8_t *wkeydata, zfs_keyformat_t keyformat, /* allocate and initialize the underlying crypto key */ wkey->wk_key.ck_data = kmem_alloc(WRAPPING_KEY_LEN, KM_SLEEP); - wkey->wk_key.ck_format = CRYPTO_KEY_RAW; wkey->wk_key.ck_length = CRYPTO_BYTES2BITS(WRAPPING_KEY_LEN); - bcopy(wkeydata, wkey->wk_key.ck_data, WRAPPING_KEY_LEN); + memcpy(wkey->wk_key.ck_data, wkeydata, WRAPPING_KEY_LEN); /* initialize the rest of the struct */ zfs_refcount_create(&wkey->wk_refcnt); @@ -144,7 +143,7 @@ dsl_crypto_params_create_nvlist(dcp_cmd_t cmd, nvlist_t *props, dsl_wrapping_key_t *wkey = NULL; uint8_t *wkeydata = NULL; uint_t wkeydata_len = 0; - char *keylocation = NULL; + const char *keylocation = NULL; dcp = kmem_zalloc(sizeof (dsl_crypto_params_t), KM_SLEEP); dcp->cp_cmd = cmd; @@ -267,6 +266,40 @@ spa_crypto_key_compare(const void *a, const void *b) return (0); } +/* + * this compares a crypto key based on zk_guid. See comment on + * spa_crypto_key_compare for more information. 
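dsl_wrapping_key_free() now scrubs the raw key bytes with memset() before handing the buffer back to the allocator, so stale wrapping-key material does not linger in freed kernel memory. The idiom in isolation (kernel context, same fields and helpers as the diff):

	/* Scrub key material before freeing; ck_length is stored in bits. */
	if (wkey->wk_key.ck_data != NULL) {
		memset(wkey->wk_key.ck_data, 0,
		    CRYPTO_BITS2BYTES(wkey->wk_key.ck_length));
		kmem_free(wkey->wk_key.ck_data,
		    CRYPTO_BITS2BYTES(wkey->wk_key.ck_length));
	}

Userspace code doing the same thing would normally reach for explicit_bzero() or memset_s(), since an ordinary memset immediately before free() is a candidate for dead-store elimination; in-kernel, this plain memset matches the surrounding style.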
+ */ +boolean_t +dmu_objset_crypto_key_equal(objset_t *osa, objset_t *osb) +{ + dsl_crypto_key_t *dcka = NULL; + dsl_crypto_key_t *dckb = NULL; + uint64_t obja, objb; + boolean_t equal; + spa_t *spa; + + spa = dmu_objset_spa(osa); + if (spa != dmu_objset_spa(osb)) + return (B_FALSE); + obja = dmu_objset_ds(osa)->ds_object; + objb = dmu_objset_ds(osb)->ds_object; + + if (spa_keystore_lookup_key(spa, obja, FTAG, &dcka) != 0) + return (B_FALSE); + if (spa_keystore_lookup_key(spa, objb, FTAG, &dckb) != 0) { + spa_keystore_dsl_key_rele(spa, dcka, FTAG); + return (B_FALSE); + } + + equal = (dcka->dck_key.zk_guid == dckb->dck_key.zk_guid); + + spa_keystore_dsl_key_rele(spa, dcka, FTAG); + spa_keystore_dsl_key_rele(spa, dckb, FTAG); + + return (equal); +} + static int spa_key_mapping_compare(const void *a, const void *b) { @@ -369,7 +402,7 @@ dsl_dir_incompatible_encryption_version(dsl_dir_t *dd) static int spa_keystore_wkey_hold_ddobj_impl(spa_t *spa, uint64_t ddobj, - void *tag, dsl_wrapping_key_t **wkey_out) + const void *tag, dsl_wrapping_key_t **wkey_out) { int ret; dsl_wrapping_key_t search_wkey; @@ -399,7 +432,7 @@ error: } static int -spa_keystore_wkey_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag, +spa_keystore_wkey_hold_dd(spa_t *spa, dsl_dir_t *dd, const void *tag, dsl_wrapping_key_t **wkey_out) { int ret; @@ -515,7 +548,7 @@ dsl_crypto_key_free(dsl_crypto_key_t *dck) } static void -dsl_crypto_key_rele(dsl_crypto_key_t *dck, void *tag) +dsl_crypto_key_rele(dsl_crypto_key_t *dck, const void *tag) { if (zfs_refcount_remove(&dck->dck_holds, tag) == 0) dsl_crypto_key_free(dck); @@ -523,7 +556,7 @@ dsl_crypto_key_rele(dsl_crypto_key_t *dck, void *tag) static int dsl_crypto_key_open(objset_t *mos, dsl_wrapping_key_t *wkey, - uint64_t dckobj, void *tag, dsl_crypto_key_t **dck_out) + uint64_t dckobj, const void *tag, dsl_crypto_key_t **dck_out) { int ret; uint64_t crypt = 0, guid = 0, version = 0; @@ -542,6 +575,12 @@ dsl_crypto_key_open(objset_t *mos, dsl_wrapping_key_t *wkey, if (ret != 0) goto error; + /* handle a future crypto suite that we don't support */ + if (crypt >= ZIO_CRYPT_FUNCTIONS) { + ret = (SET_ERROR(ZFS_ERR_CRYPTO_NOTSUP)); + goto error; + } + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1, &guid); if (ret != 0) goto error; @@ -592,7 +631,7 @@ dsl_crypto_key_open(objset_t *mos, dsl_wrapping_key_t *wkey, error: if (dck != NULL) { - bzero(dck, sizeof (dsl_crypto_key_t)); + memset(dck, 0, sizeof (dsl_crypto_key_t)); kmem_free(dck, sizeof (dsl_crypto_key_t)); } @@ -601,7 +640,7 @@ error: } static int -spa_keystore_dsl_key_hold_impl(spa_t *spa, uint64_t dckobj, void *tag, +spa_keystore_dsl_key_hold_impl(spa_t *spa, uint64_t dckobj, const void *tag, dsl_crypto_key_t **dck_out) { int ret; @@ -632,7 +671,7 @@ error: } static int -spa_keystore_dsl_key_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag, +spa_keystore_dsl_key_hold_dd(spa_t *spa, dsl_dir_t *dd, const void *tag, dsl_crypto_key_t **dck_out) { int ret; @@ -690,7 +729,7 @@ spa_keystore_dsl_key_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag, } void -spa_keystore_dsl_key_rele(spa_t *spa, dsl_crypto_key_t *dck, void *tag) +spa_keystore_dsl_key_rele(spa_t *spa, dsl_crypto_key_t *dck, const void *tag) { rw_enter(&spa->spa_keystore.sk_dk_lock, RW_WRITER); @@ -937,7 +976,7 @@ error: } void -key_mapping_add_ref(dsl_key_mapping_t *km, void *tag) +key_mapping_add_ref(dsl_key_mapping_t *km, const void *tag) { ASSERT3U(zfs_refcount_count(&km->km_refcnt), >=, 1); zfs_refcount_add(&km->km_refcnt, tag); @@ -954,7 +993,7 @@ 
key_mapping_add_ref(dsl_key_mapping_t *km, void *tag) * mapping after unmounting a dataset. */ void -key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, void *tag) +key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, const void *tag) { ASSERT3U(zfs_refcount_count(&km->km_refcnt), >=, 1); @@ -985,7 +1024,7 @@ key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, void *tag) } int -spa_keystore_create_mapping(spa_t *spa, dsl_dataset_t *ds, void *tag, +spa_keystore_create_mapping(spa_t *spa, dsl_dataset_t *ds, const void *tag, dsl_key_mapping_t **km_out) { int ret; @@ -1044,7 +1083,7 @@ spa_keystore_create_mapping(spa_t *spa, dsl_dataset_t *ds, void *tag, } int -spa_keystore_remove_mapping(spa_t *spa, uint64_t dsobj, void *tag) +spa_keystore_remove_mapping(spa_t *spa, uint64_t dsobj, const void *tag) { int ret; dsl_key_mapping_t search_km; @@ -1082,7 +1121,7 @@ error_unlock: * without getting a reference to it. */ int -spa_keystore_lookup_key(spa_t *spa, uint64_t dsobj, void *tag, +spa_keystore_lookup_key(spa_t *spa, uint64_t dsobj, const void *tag, dsl_crypto_key_t **dck_out) { int ret; @@ -1138,7 +1177,7 @@ dmu_objset_check_wkey_loaded(dsl_dir_t *dd) return (0); } -static zfs_keystatus_t +zfs_keystatus_t dsl_dataset_get_keystatus(dsl_dir_t *dd) { /* check if this dd has a has a dsl key */ @@ -1507,7 +1546,7 @@ spa_keystore_change_key_sync(void *arg, dmu_tx_t *tx) dsl_crypto_params_t *dcp = skcka->skcka_cp; dsl_wrapping_key_t *wkey = NULL, *found_wkey; dsl_wrapping_key_t wkey_search; - char *keylocation = dcp->cp_keylocation; + const char *keylocation = dcp->cp_keylocation; uint64_t rddobj, new_rddobj; /* create and initialize the wrapping key */ @@ -2007,14 +2046,6 @@ dsl_crypto_recv_raw_objset_check(dsl_dataset_t *ds, dsl_dataset_t *fromds, if (ret != 0) return (ret); - /* - * Useraccounting is not portable and must be done with the keys loaded. - * Therefore, whenever we do any kind of receive the useraccounting - * must not be present. - */ - ASSERT0(os->os_flags & OBJSET_FLAG_USERACCOUNTING_COMPLETE); - ASSERT0(os->os_flags & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE); - mdn = DMU_META_DNODE(os); /* @@ -2104,8 +2135,9 @@ dsl_crypto_recv_raw_objset_sync(dsl_dataset_t *ds, dmu_objset_type_t ostype, * written out raw next time. */ arc_release(os->os_phys_buf, &os->os_phys_buf); - bcopy(portable_mac, os->os_phys->os_portable_mac, ZIO_OBJSET_MAC_LEN); - bzero(os->os_phys->os_local_mac, ZIO_OBJSET_MAC_LEN); + memcpy(os->os_phys->os_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); + memset(os->os_phys->os_local_mac, 0, ZIO_OBJSET_MAC_LEN); + os->os_flags &= ~OBJSET_FLAG_USERACCOUNTING_COMPLETE; os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; /* set metadnode compression and checksum */ @@ -2127,9 +2159,6 @@ dsl_crypto_recv_raw_objset_sync(dsl_dataset_t *ds, dmu_objset_type_t ostype, zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); dsl_dataset_sync(ds, zio, tx); VERIFY0(zio_wait(zio)); - - /* dsl_dataset_sync_done will drop this reference. */ - dmu_buf_add_ref(ds->ds_dbuf, ds); dsl_dataset_sync_done(ds, tx); } } @@ -2152,10 +2181,16 @@ dsl_crypto_recv_raw_key_check(dsl_dataset_t *ds, nvlist_t *nvl, dmu_tx_t *tx) * wrapping key. */ ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE, &intval); - if (ret != 0 || intval >= ZIO_CRYPT_FUNCTIONS || - intval <= ZIO_CRYPT_OFF) + if (ret != 0 || intval <= ZIO_CRYPT_OFF) return (SET_ERROR(EINVAL)); + /* + * Flag a future crypto suite that we don't support differently, so + * we can return a more useful error to the user. 
+ */ + if (intval >= ZIO_CRYPT_FUNCTIONS) + return (SET_ERROR(ZFS_ERR_CRYPTO_NOTSUP)); + ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_GUID, &intval); if (ret != 0) return (SET_ERROR(EINVAL)); @@ -2237,7 +2272,7 @@ dsl_crypto_recv_raw_key_sync(dsl_dataset_t *ds, nvlist_t *nvl, dmu_tx_t *tx) uint8_t *keydata, *hmac_keydata, *iv, *mac; uint64_t crypt, key_guid, keyformat, iters, salt; uint64_t version = ZIO_CRYPT_KEY_CURRENT_VERSION; - char *keylocation = "prompt"; + const char *keylocation = "prompt"; /* lookup the values we need to create the DSL Crypto Key */ crypt = fnvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE); @@ -2555,7 +2590,7 @@ dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey, DSL_CRYPTO_KEY_VERSION, sizeof (uint64_t), 1, &version, tx)); zio_crypt_key_destroy(&dck.dck_key); - bzero(&dck.dck_key, sizeof (zio_crypt_key_t)); + memset(&dck.dck_key, 0, sizeof (zio_crypt_key_t)); return (dck.dck_obj); } @@ -2679,6 +2714,7 @@ spa_do_crypt_objset_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, objset_phys_t *osp = buf; uint8_t portable_mac[ZIO_OBJSET_MAC_LEN]; uint8_t local_mac[ZIO_OBJSET_MAC_LEN]; + const uint8_t zeroed_mac[ZIO_OBJSET_MAC_LEN] = {0}; /* look up the key from the spa's keystore */ ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck); @@ -2695,16 +2731,30 @@ spa_do_crypt_objset_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, /* if we are generating encode the HMACs in the objset_phys_t */ if (generate) { - bcopy(portable_mac, osp->os_portable_mac, ZIO_OBJSET_MAC_LEN); - bcopy(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN); + memcpy(osp->os_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); + memcpy(osp->os_local_mac, local_mac, ZIO_OBJSET_MAC_LEN); abd_return_buf_copy(abd, buf, datalen); return (0); } - if (bcmp(portable_mac, osp->os_portable_mac, ZIO_OBJSET_MAC_LEN) != 0 || - bcmp(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN) != 0) { - abd_return_buf(abd, buf, datalen); - return (SET_ERROR(ECKSUM)); + if (memcmp(portable_mac, osp->os_portable_mac, + ZIO_OBJSET_MAC_LEN) != 0 || + memcmp(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN) != 0) { + /* + * If the MAC is zeroed out, we failed to decrypt it. + * This should only arise, at least on Linux, + * if we hit edge case handling for useraccounting, since we + * shouldn't get here without bailing out on error earlier + * otherwise. + * + * So if we're in that case, we can just fall through and + * special-casing noticing that it's zero will handle it + * elsewhere, since we can just regenerate it. + */ + if (memcmp(local_mac, zeroed_mac, ZIO_OBJSET_MAC_LEN) != 0) { + abd_return_buf(abd, buf, datalen); + return (SET_ERROR(ECKSUM)); + } } abd_return_buf(abd, buf, datalen); @@ -2746,11 +2796,11 @@ spa_do_crypt_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, abd_t *abd, * Otherwise verify that the MAC matched what we expected. 
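The MAC-verification hunk above adds a special case: if the stored local MAC is all zeroes, the mismatch is tolerated, because (per the new comment) a zeroed local MAC is the edge case left behind by user-accounting handling and can simply be regenerated later. The "is this buffer all zero" test is done by comparing against a stack scratch buffer with memcmp; standalone, with MAC_LEN standing in for ZIO_OBJSET_MAC_LEN:

	#include <stdio.h>
	#include <string.h>

	#define	MAC_LEN	32	/* stands in for ZIO_OBJSET_MAC_LEN */

	static int
	mac_is_zeroed(const unsigned char mac[MAC_LEN])
	{
		const unsigned char zeroed_mac[MAC_LEN] = {0};

		return (memcmp(mac, zeroed_mac, MAC_LEN) == 0);
	}

	int
	main(void)
	{
		unsigned char a[MAC_LEN] = {0};
		unsigned char b[MAC_LEN] = {0};

		b[5] = 0xab;
		printf("a zeroed: %d, b zeroed: %d\n",
		    mac_is_zeroed(a), mac_is_zeroed(b));
		return (0);
	}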
*/ if (generate) { - bcopy(digestbuf, mac, ZIO_DATA_MAC_LEN); + memcpy(mac, digestbuf, ZIO_DATA_MAC_LEN); return (0); } - if (bcmp(digestbuf, mac, ZIO_DATA_MAC_LEN) != 0) + if (memcmp(digestbuf, mac, ZIO_DATA_MAC_LEN) != 0) return (SET_ERROR(ECKSUM)); return (0); @@ -2849,9 +2899,9 @@ spa_do_crypt_abd(boolean_t encrypt, spa_t *spa, const zbookmark_phys_t *zb, error: if (encrypt) { /* zero out any state we might have changed while encrypting */ - bzero(salt, ZIO_DATA_SALT_LEN); - bzero(iv, ZIO_DATA_IV_LEN); - bzero(mac, ZIO_DATA_MAC_LEN); + memset(salt, 0, ZIO_DATA_SALT_LEN); + memset(iv, 0, ZIO_DATA_IV_LEN); + memset(mac, 0, ZIO_DATA_MAC_LEN); abd_return_buf(pabd, plainbuf, datalen); abd_return_buf_copy(cabd, cipherbuf, datalen); } else { diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c index f99964511aa6..b4de0e7ff073 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -73,13 +73,22 @@ * The SPA supports block sizes up to 16MB. However, very large blocks * can have an impact on i/o latency (e.g. tying up a spinning disk for * ~300ms), and also potentially on the memory allocator. Therefore, - * we do not allow the recordsize to be set larger than zfs_max_recordsize - * (default 1MB). Larger blocks can be created by changing this tunable, - * and pools with larger blocks can always be imported and used, regardless - * of this setting. + * we did not allow the recordsize to be set larger than zfs_max_recordsize + * (former default: 1MB). Larger blocks could be created by changing this + * tunable, and pools with larger blocks could always be imported and used, + * regardless of this setting. + * + * We do, however, still limit it by default to 1M on x86_32, because Linux's + * 3/1 memory split doesn't leave much room for 16M chunks. 
*/ -int zfs_max_recordsize = 1 * 1024 * 1024; -int zfs_allow_redacted_dataset_mount = 0; +#ifdef _ILP32 +uint_t zfs_max_recordsize = 1 * 1024 * 1024; +#else +uint_t zfs_max_recordsize = 16 * 1024 * 1024; +#endif +static int zfs_allow_redacted_dataset_mount = 0; + +int zfs_snapshot_history_enabled = 1; #define SWITCH64(x, y) \ { \ @@ -90,8 +99,6 @@ int zfs_allow_redacted_dataset_mount = 0; #define DS_REF_MAX (1ULL << 62) -extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds); - static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx); static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, @@ -99,7 +106,7 @@ static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, static void unload_zfeature(dsl_dataset_t *ds, spa_feature_t f); -extern int spa_asize_inflation; +extern uint_t spa_asize_inflation; static zil_header_t zero_zil; @@ -149,7 +156,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) return; } - ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >, + dsl_dataset_phys(ds)->ds_prev_snap_txg); dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_lock); delta = parent_delta(ds, used); @@ -183,7 +191,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) * they do not need to be freed. */ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && - bp->blk_birth > ds->ds_dir->dd_origin_txg && + BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg && !(BP_IS_EMBEDDED(bp))) { ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, @@ -229,7 +237,7 @@ dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset, mutex_exit(&ds->ds_remap_deadlist_lock); BP_ZERO(&fakebp); - fakebp.blk_birth = birth; + BP_SET_LOGICAL_BIRTH(&fakebp, birth); DVA_SET_VDEV(dva, vdev); DVA_SET_OFFSET(dva, offset); DVA_SET_ASIZE(dva, size); @@ -252,7 +260,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, return (0); ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(bp->blk_birth <= tx->tx_txg); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) <= tx->tx_txg); if (ds == NULL) { dsl_free(tx->tx_pool, tx->tx_txg, bp); @@ -270,7 +278,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, * they do not need to be freed. 
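From here on, direct reads and writes of bp->blk_birth are replaced by the BP_GET_LOGICAL_BIRTH()/BP_SET_LOGICAL_BIRTH() accessors, which hide how the logical birth TXG is packed into the block pointer. A hedged usage-level sketch (the two helper functions are invented for illustration; only the accessor calls come from the diff):

	/*
	 * Was the block written after the given snapshot TXG?  Callers no
	 * longer read a raw blk_birth field; the accessor owns the layout.
	 */
	static boolean_t
	demo_block_born_after(const blkptr_t *bp, uint64_t snap_txg)
	{
		return (BP_GET_LOGICAL_BIRTH(bp) > snap_txg);
	}

	/* Synthesizing a fake bp for remap accounting uses the setter. */
	static void
	demo_fake_bp(blkptr_t *bp, uint64_t birth_txg)
	{
		BP_ZERO(bp);
		BP_SET_LOGICAL_BIRTH(bp, birth_txg);
	}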
*/ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && - bp->blk_birth > ds->ds_dir->dd_origin_txg && + BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg && !(BP_IS_EMBEDDED(bp))) { ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, @@ -278,7 +286,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, bplist_append(&ds->ds_dir->dd_pending_frees, bp); } - if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { + if (BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) { int64_t delta; dprintf_bp(bp, "freeing ds=%llu", (u_longlong_t)ds->ds_object); @@ -310,16 +318,16 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, ASSERT3U(ds->ds_prev->ds_object, ==, dsl_dataset_phys(ds)->ds_prev_snap_obj); ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0); - /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ + /* if (logical birth > prev prev snap txg) prev unique += bs */ if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == - ds->ds_object && bp->blk_birth > + ds->ds_object && BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); mutex_enter(&ds->ds_prev->ds_lock); dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used; mutex_exit(&ds->ds_prev->ds_lock); } - if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { + if (BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg) { dsl_dir_transfer_space(ds->ds_dir, used, DD_USED_HEAD, DD_USED_SNAP, tx); } @@ -524,7 +532,7 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, matchtype_t mt = 0; int err; - dsl_dir_snap_cmtime_update(ds->ds_dir); + dsl_dir_snap_cmtime_update(ds->ds_dir, tx); if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_NORMALIZE; @@ -541,7 +549,7 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, } boolean_t -dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag) +dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, const void *tag) { dmu_buf_t *dbuf = ds->ds_dbuf; boolean_t result = B_FALSE; @@ -559,7 +567,7 @@ dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag) } int -dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, +dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, const void *tag, dsl_dataset_t **dsp) { objset_t *mos = dp->dp_meta_objset; @@ -633,6 +641,8 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev); } + if (err != 0) + goto after_dsl_bookmark_fini; err = dsl_bookmark_init_ds(ds); } else { if (zfs_flags & ZFS_DEBUG_SNAPNAMES) @@ -681,11 +691,11 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu); if (err != 0 || winner != NULL) { - bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); if (dsl_deadlist_is_open(&ds->ds_remap_deadlist)) dsl_deadlist_close(&ds->ds_remap_deadlist); dsl_bookmark_fini_ds(ds); +after_dsl_bookmark_fini: if (ds->ds_prev) dsl_dataset_rele(ds->ds_prev, ds); dsl_dir_rele(ds->ds_dir, ds); @@ -696,6 +706,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, list_destroy(&ds->ds_prop_cbs); list_destroy(&ds->ds_sendstreams); + bplist_destroy(&ds->ds_pending_deadlist); mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_sendstream_lock); @@ -748,7 +759,7 @@ 
dsl_dataset_create_key_mapping(dsl_dataset_t *ds) int dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj, - ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) + ds_hold_flags_t flags, const void *tag, dsl_dataset_t **dsp) { int err; @@ -769,7 +780,7 @@ dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj, int dsl_dataset_hold_flags(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, - void *tag, dsl_dataset_t **dsp) + const void *tag, dsl_dataset_t **dsp) { dsl_dir_t *dd; const char *snapname; @@ -822,7 +833,7 @@ dsl_dataset_hold_flags(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, } int -dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag, +dsl_dataset_hold(dsl_pool_t *dp, const char *name, const void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_hold_flags(dp, name, 0, tag, dsp)); @@ -830,7 +841,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag, static int dsl_dataset_own_obj_impl(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, - void *tag, boolean_t override, dsl_dataset_t **dsp) + const void *tag, boolean_t override, dsl_dataset_t **dsp) { int err = dsl_dataset_hold_obj_flags(dp, dsobj, flags, tag, dsp); if (err != 0) @@ -846,21 +857,21 @@ dsl_dataset_own_obj_impl(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, int dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, - void *tag, dsl_dataset_t **dsp) + const void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_FALSE, dsp)); } int dsl_dataset_own_obj_force(dsl_pool_t *dp, uint64_t dsobj, - ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) + ds_hold_flags_t flags, const void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_TRUE, dsp)); } static int dsl_dataset_own_impl(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, - void *tag, boolean_t override, dsl_dataset_t **dsp) + const void *tag, boolean_t override, dsl_dataset_t **dsp) { int err = dsl_dataset_hold_flags(dp, name, flags, tag, dsp); if (err != 0) @@ -874,14 +885,14 @@ dsl_dataset_own_impl(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, int dsl_dataset_own_force(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, - void *tag, dsl_dataset_t **dsp) + const void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_own_impl(dp, name, flags, tag, B_TRUE, dsp)); } int dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, - void *tag, dsl_dataset_t **dsp) + const void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_own_impl(dp, name, flags, tag, B_FALSE, dsp)); } @@ -896,14 +907,14 @@ dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, * and accessed. 
*/ void -dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag) +dsl_dataset_long_hold(dsl_dataset_t *ds, const void *tag) { ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); (void) zfs_refcount_add(&ds->ds_longholds, tag); } void -dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag) +dsl_dataset_long_rele(dsl_dataset_t *ds, const void *tag) { (void) zfs_refcount_remove(&ds->ds_longholds, tag); } @@ -960,7 +971,7 @@ dsl_dataset_namelen(dsl_dataset_t *ds) } void -dsl_dataset_rele(dsl_dataset_t *ds, void *tag) +dsl_dataset_rele(dsl_dataset_t *ds, const void *tag) { dmu_buf_rele(ds->ds_dbuf, tag); } @@ -978,7 +989,8 @@ dsl_dataset_remove_key_mapping(dsl_dataset_t *ds) } void -dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag) +dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, + const void *tag) { if (flags & DS_HOLD_FLAG_DECRYPT) dsl_dataset_remove_key_mapping(ds); @@ -987,7 +999,7 @@ dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag) } void -dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag) +dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, const void *tag) { ASSERT3P(ds->ds_owner, ==, tag); ASSERT(ds->ds_dbuf != NULL); @@ -1000,7 +1012,7 @@ dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag) } boolean_t -dsl_dataset_tryown(dsl_dataset_t *ds, void *tag, boolean_t override) +dsl_dataset_tryown(dsl_dataset_t *ds, const void *tag, boolean_t override) { boolean_t gotit = FALSE; @@ -1150,7 +1162,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; - bzero(dsphys, sizeof (dsl_dataset_phys_t)); + memset(dsphys, 0, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = dd->dd_object; dsphys->ds_flags = flags; dsphys->ds_fsid_guid = unique_create(); @@ -1250,20 +1262,17 @@ dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx) objset_t *os; VERIFY0(dmu_objset_from_ds(ds, &os)); - if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) { + if (memcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) { dsl_pool_t *dp = ds->ds_dir->dd_pool; zio_t *zio; - bzero(&os->os_zil_header, sizeof (os->os_zil_header)); + memset(&os->os_zil_header, 0, sizeof (os->os_zil_header)); if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); dsl_dataset_sync(ds, zio, tx); VERIFY0(zio_wait(zio)); - - /* dsl_dataset_sync_done will drop this reference. 
*/ - dmu_buf_add_ref(ds->ds_dbuf, ds); dsl_dataset_sync_done(ds, tx); } } @@ -1612,7 +1621,7 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) for (pair = nvlist_next_nvpair(cnt_track, NULL); pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) { int error = 0; - char *name; + const char *name; uint64_t cnt = 0; dsl_dataset_t *ds; @@ -1644,7 +1653,7 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { int error = 0; dsl_dataset_t *ds; - char *name, *atp = NULL; + const char *name, *atp = NULL; char dsname[ZFS_MAX_DATASET_NAME_LEN]; name = nvpair_name(pair); @@ -1687,7 +1696,6 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dsl_dataset_phys_t *dsphys; uint64_t dsobj, crtxg; objset_t *mos = dp->dp_meta_objset; - static zil_header_t zero_zil __maybe_unused; objset_t *os __maybe_unused; ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); @@ -1698,7 +1706,7 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, */ ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP || dmu_objset_from_ds(ds, &os) != 0 || - bcmp(&os->os_phys->os_zil_header, &zero_zil, + memcmp(&os->os_phys->os_zil_header, &zero_zil, sizeof (zero_zil)) == 0); /* Should not snapshot a dirty dataset. */ @@ -1720,7 +1728,7 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; - bzero(dsphys, sizeof (dsl_dataset_phys_t)); + memset(dsphys, 0, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = ds->ds_dir->dd_object; dsphys->ds_fsid_guid = unique_create(); (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, @@ -1854,9 +1862,10 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dsl_scan_ds_snapshotted(ds, tx); - dsl_dir_snap_cmtime_update(ds->ds_dir); + dsl_dir_snap_cmtime_update(ds->ds_dir, tx); - spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, " "); + if (zfs_snapshot_history_enabled) + spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, " "); } void @@ -1869,7 +1878,7 @@ dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx) for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { dsl_dataset_t *ds; - char *name, *atp; + const char *name, *atp; char dsname[ZFS_MAX_DATASET_NAME_LEN]; name = nvpair_name(pair); @@ -1898,7 +1907,7 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) boolean_t needsuspend; int error; spa_t *spa; - char *firstname; + const char *firstname; nvlist_t *suspended = NULL; pair = nvlist_next_nvpair(snaps, NULL); @@ -1917,8 +1926,8 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; - char *snapname = nvpair_name(pair); - char *atp; + const char *snapname = nvpair_name(pair); + const char *atp; void *cookie; atp = strchr(snapname, '@'); @@ -2061,8 +2070,9 @@ dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, return (error); } +/* Nonblocking dataset sync. 
Assumes dataset:objset is always 1:1 */ void -dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) +dsl_dataset_sync(dsl_dataset_t *ds, zio_t *rio, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(ds->ds_objset != NULL); @@ -2090,17 +2100,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0; } - dmu_objset_sync(ds->ds_objset, zio, tx); - - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (zfeature_active(f, ds->ds_feature_activation[f])) { - if (zfeature_active(f, ds->ds_feature[f])) - continue; - dsl_dataset_activate_feature(ds->ds_object, f, - ds->ds_feature_activation[f], tx); - ds->ds_feature[f] = ds->ds_feature_activation[f]; - } - } + dmu_objset_sync(ds->ds_objset, rio, tx); } /* @@ -2116,8 +2116,6 @@ dsl_livelist_should_disable(dsl_dataset_t *ds) used = dsl_dir_get_usedds(ds->ds_dir); referenced = dsl_get_referenced(ds); - ASSERT3U(referenced, >=, 0); - ASSERT3U(used, >=, 0); if (referenced == 0) return (B_FALSE); percent_shared = (100 * (referenced - used)) / referenced; @@ -2272,9 +2270,18 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) else ASSERT0(os->os_next_write_raw[tx->tx_txg & TXG_MASK]); - ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx))); + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (zfeature_active(f, + ds->ds_feature_activation[f])) { + if (zfeature_active(f, ds->ds_feature[f])) + continue; + dsl_dataset_activate_feature(ds->ds_object, f, + ds->ds_feature_activation[f], tx); + ds->ds_feature[f] = ds->ds_feature_activation[f]; + } + } - dmu_buf_rele(ds->ds_dbuf, ds); + ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx))); } int @@ -2331,161 +2338,147 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) nvlist_free(propval); } -/* - * Returns a string that represents the receive resume stats token. It should - * be freed with strfree(). 
- */ -char * -get_receive_resume_stats_impl(dsl_dataset_t *ds) +static char * +get_receive_resume_token_impl(dsl_dataset_t *ds) { + if (!dsl_dataset_has_resume_receive_state(ds)) + return (NULL); + dsl_pool_t *dp = ds->ds_dir->dd_pool; + char *str; + void *packed; + uint8_t *compressed; + uint64_t val; + nvlist_t *token_nv = fnvlist_alloc(); + size_t packed_size, compressed_size; + + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "fromguid", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "object", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "offset", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "bytes", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "toguid", val); + } + char buf[MAXNAMELEN]; + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) { + fnvlist_add_string(token_nv, "toname", buf); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_LARGEBLOCK) == 0) { + fnvlist_add_boolean(token_nv, "largeblockok"); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_EMBEDOK) == 0) { + fnvlist_add_boolean(token_nv, "embedok"); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_COMPRESSOK) == 0) { + fnvlist_add_boolean(token_nv, "compressok"); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_RAWOK) == 0) { + fnvlist_add_boolean(token_nv, "rawok"); + } + if (dsl_dataset_feature_is_active(ds, + SPA_FEATURE_REDACTED_DATASETS)) { + uint64_t num_redact_snaps = 0; + uint64_t *redact_snaps = NULL; + VERIFY3B(dsl_dataset_get_uint64_array_feature(ds, + SPA_FEATURE_REDACTED_DATASETS, &num_redact_snaps, + &redact_snaps), ==, B_TRUE); + fnvlist_add_uint64_array(token_nv, "redact_snaps", + redact_snaps, num_redact_snaps); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS) == 0) { + uint64_t num_redact_snaps = 0, int_size = 0; + uint64_t *redact_snaps = NULL; + VERIFY0(zap_length(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, &int_size, + &num_redact_snaps)); + ASSERT3U(int_size, ==, sizeof (uint64_t)); - if (dsl_dataset_has_resume_receive_state(ds)) { - char *str; - void *packed; - uint8_t *compressed; - uint64_t val; - nvlist_t *token_nv = fnvlist_alloc(); - size_t packed_size, compressed_size; - - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "fromguid", val); - } - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "object", val); - } - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "offset", val); - } - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "bytes", val); - } - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - 
DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "toguid", val); - } - char buf[MAXNAMELEN]; - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) { - fnvlist_add_string(token_nv, "toname", buf); - } - if (zap_contains(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_LARGEBLOCK) == 0) { - fnvlist_add_boolean(token_nv, "largeblockok"); - } - if (zap_contains(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_EMBEDOK) == 0) { - fnvlist_add_boolean(token_nv, "embedok"); - } - if (zap_contains(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_COMPRESSOK) == 0) { - fnvlist_add_boolean(token_nv, "compressok"); - } - if (zap_contains(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_RAWOK) == 0) { - fnvlist_add_boolean(token_nv, "rawok"); - } - if (dsl_dataset_feature_is_active(ds, - SPA_FEATURE_REDACTED_DATASETS)) { - uint64_t num_redact_snaps; - uint64_t *redact_snaps; - VERIFY(dsl_dataset_get_uint64_array_feature(ds, - SPA_FEATURE_REDACTED_DATASETS, &num_redact_snaps, - &redact_snaps)); - fnvlist_add_uint64_array(token_nv, "redact_snaps", - redact_snaps, num_redact_snaps); - } - if (zap_contains(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS) == 0) { - uint64_t num_redact_snaps, int_size; - uint64_t *redact_snaps; - VERIFY0(zap_length(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, &int_size, - &num_redact_snaps)); - ASSERT3U(int_size, ==, sizeof (uint64_t)); - - redact_snaps = kmem_alloc(int_size * num_redact_snaps, - KM_SLEEP); - VERIFY0(zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, int_size, - num_redact_snaps, redact_snaps)); - fnvlist_add_uint64_array(token_nv, "book_redact_snaps", - redact_snaps, num_redact_snaps); - kmem_free(redact_snaps, int_size * num_redact_snaps); - } - packed = fnvlist_pack(token_nv, &packed_size); - fnvlist_free(token_nv); - compressed = kmem_alloc(packed_size, KM_SLEEP); - - compressed_size = gzip_compress(packed, compressed, - packed_size, packed_size, 6); - - zio_cksum_t cksum; - fletcher_4_native_varsize(compressed, compressed_size, &cksum); - - size_t alloc_size = compressed_size * 2 + 1; - str = kmem_alloc(alloc_size, KM_SLEEP); - for (int i = 0; i < compressed_size; i++) { - size_t offset = i * 2; - (void) snprintf(str + offset, alloc_size - offset, - "%02x", compressed[i]); - } - str[compressed_size * 2] = '\0'; - char *propval = kmem_asprintf("%u-%llx-%llx-%s", - ZFS_SEND_RESUME_TOKEN_VERSION, - (longlong_t)cksum.zc_word[0], - (longlong_t)packed_size, str); - kmem_free(packed, packed_size); - kmem_free(str, alloc_size); - kmem_free(compressed, packed_size); - return (propval); - } - return (kmem_strdup("")); + redact_snaps = kmem_alloc(int_size * num_redact_snaps, + KM_SLEEP); + VERIFY0(zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, int_size, + num_redact_snaps, redact_snaps)); + fnvlist_add_uint64_array(token_nv, "book_redact_snaps", + redact_snaps, num_redact_snaps); + kmem_free(redact_snaps, int_size * num_redact_snaps); + } + packed = fnvlist_pack(token_nv, &packed_size); + fnvlist_free(token_nv); + compressed = kmem_alloc(packed_size, KM_SLEEP); + + compressed_size = gzip_compress(packed, compressed, + packed_size, packed_size, 6); + + zio_cksum_t cksum; + fletcher_4_native_varsize(compressed, compressed_size, &cksum); + + size_t alloc_size = compressed_size * 2 + 1; + str = kmem_alloc(alloc_size, KM_SLEEP); + for 
(int i = 0; i < compressed_size; i++) { + size_t offset = i * 2; + (void) snprintf(str + offset, alloc_size - offset, + "%02x", compressed[i]); + } + str[compressed_size * 2] = '\0'; + char *propval = kmem_asprintf("%u-%llx-%llx-%s", + ZFS_SEND_RESUME_TOKEN_VERSION, + (longlong_t)cksum.zc_word[0], + (longlong_t)packed_size, str); + kmem_free(packed, packed_size); + kmem_free(str, alloc_size); + kmem_free(compressed, packed_size); + return (propval); } /* - * Returns a string that represents the receive resume stats token of the - * dataset's child. It should be freed with strfree(). + * Returns a string that represents the receive resume state token. It should + * be freed with strfree(). NULL is returned if no resume state is present. */ char * -get_child_receive_stats(dsl_dataset_t *ds) +get_receive_resume_token(dsl_dataset_t *ds) { - char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + /* + * A failed "newfs" (e.g. full) resumable receive leaves + * the stats set on this dataset. Check here for the prop. + */ + char *token = get_receive_resume_token_impl(ds); + if (token != NULL) + return (token); + /* + * A failed incremental resumable receive leaves the + * stats set on our child named "%recv". Check the child + * for the prop. + */ + /* 6 extra bytes for /%recv */ + char name[ZFS_MAX_DATASET_NAME_LEN + 6]; dsl_dataset_t *recv_ds; - dsl_dataset_name(ds, recvname); - if (strlcat(recvname, "/", sizeof (recvname)) < - sizeof (recvname) && - strlcat(recvname, recv_clone_name, sizeof (recvname)) < - sizeof (recvname) && - dsl_dataset_hold(ds->ds_dir->dd_pool, recvname, FTAG, - &recv_ds) == 0) { - char *propval = get_receive_resume_stats_impl(recv_ds); + dsl_dataset_name(ds, name); + if (strlcat(name, "/", sizeof (name)) < sizeof (name) && + strlcat(name, recv_clone_name, sizeof (name)) < sizeof (name) && + dsl_dataset_hold(ds->ds_dir->dd_pool, name, FTAG, &recv_ds) == 0) { + token = get_receive_resume_token_impl(recv_ds); dsl_dataset_rele(recv_ds, FTAG); - return (propval); } - return (kmem_strdup("")); -} - -static void -get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv) -{ - char *propval = get_receive_resume_stats_impl(ds); - if (strcmp(propval, "") != 0) { - dsl_prop_nvlist_add_string(nv, - ZFS_PROP_RECEIVE_RESUME_TOKEN, propval); - } else { - char *childval = get_child_receive_stats(ds); - if (strcmp(childval, "") != 0) { - dsl_prop_nvlist_add_string(nv, - ZFS_PROP_RECEIVE_RESUME_TOKEN, childval); - } - kmem_strfree(childval); - } - kmem_strfree(propval); + return (token); } uint64_t @@ -2744,6 +2737,8 @@ dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value, relpath[0] != '\0')) mnt = value + 1; + mnt = kmem_strdup(mnt); + if (relpath[0] == '\0') { (void) snprintf(value, ZAP_MAXVALUELEN, "%s%s", root, mnt); @@ -2753,6 +2748,7 @@ dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value, relpath); } kmem_free(buf, ZAP_MAXVALUELEN); + kmem_strfree(mnt); } return (0); @@ -2761,7 +2757,7 @@ dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value, void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { - dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool; ASSERT(dsl_pool_config_held(dp)); @@ -2812,6 +2808,8 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_get_userrefs(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, dsl_get_defer_destroy(ds)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOTS_CHANGED, + dsl_dir_snap_cmtime(ds->ds_dir).tv_sec); dsl_dataset_crypt_stats(ds, nv); if 
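For reference, the token built above is a single string of the form <version>-<cksum>-<size>-<payload>: the decimal ZFS_SEND_RESUME_TOKEN_VERSION, then the first 64-bit word of a fletcher-4 checksum of the compressed payload and the packed nvlist size (both in hex), and finally the gzip-compressed packed nvlist rendered as lowercase hex. A hypothetical consumer could split the header fields like this before inflating the payload (sketch only; the decompression and nvlist unpack steps are omitted):

        unsigned int version;
        unsigned long long cksum_word0, packed_size;
        const char *hexpayload;

        if (sscanf(token, "%u-%llx-%llx-", &version, &cksum_word0,
            &packed_size) != 3)
                return (SET_ERROR(EINVAL));
        hexpayload = strrchr(token, '-') + 1;   /* hex of the gzip'd nvlist */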
(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { @@ -2823,28 +2821,11 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) } if (!dsl_dataset_is_snapshot(ds)) { - /* - * A failed "newfs" (e.g. full) resumable receive leaves - * the stats set on this dataset. Check here for the prop. - */ - get_receive_resume_stats(ds, nv); - - /* - * A failed incremental resumable receive leaves the - * stats set on our child named "%recv". Check the child - * for the prop. - */ - /* 6 extra bytes for /%recv */ - char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; - dsl_dataset_t *recv_ds; - dsl_dataset_name(ds, recvname); - if (strlcat(recvname, "/", sizeof (recvname)) < - sizeof (recvname) && - strlcat(recvname, recv_clone_name, sizeof (recvname)) < - sizeof (recvname) && - dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) { - get_receive_resume_stats(recv_ds, nv); - dsl_dataset_rele(recv_ds, FTAG); + char *token = get_receive_resume_token(ds); + if (token != NULL) { + dsl_prop_nvlist_add_string(nv, + ZFS_PROP_RECEIVE_RESUME_TOKEN, token); + kmem_strfree(token); } } } @@ -2915,7 +2896,7 @@ dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) if (snap == NULL) return (B_FALSE); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - birth = dsl_dataset_get_blkptr(ds)->blk_birth; + birth = BP_GET_LOGICAL_BIRTH(dsl_dataset_get_blkptr(ds)); rrw_exit(&ds->ds_bp_rwlock, FTAG); if (birth > dsl_dataset_phys(snap)->ds_creation_txg) { objset_t *os, *os_snap; @@ -2928,26 +2909,18 @@ dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) return (B_TRUE); if (dmu_objset_from_ds(snap, &os_snap) != 0) return (B_TRUE); - return (bcmp(&os->os_phys->os_meta_dnode, + return (memcmp(&os->os_phys->os_meta_dnode, &os_snap->os_phys->os_meta_dnode, sizeof (os->os_phys->os_meta_dnode)) != 0); } return (B_FALSE); } -typedef struct dsl_dataset_rename_snapshot_arg { - const char *ddrsa_fsname; - const char *ddrsa_oldsnapname; - const char *ddrsa_newsnapname; - boolean_t ddrsa_recursive; - dmu_tx_t *ddrsa_tx; -} dsl_dataset_rename_snapshot_arg_t; - -/* ARGSUSED */ static int dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { + (void) dp; dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; int error; uint64_t val; @@ -2973,7 +2946,7 @@ dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp, return (error); } -static int +int dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx) { dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; @@ -3035,7 +3008,7 @@ dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, return (0); } -static void +void dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; @@ -3299,8 +3272,8 @@ struct promotenode { static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, - void *tag); -static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag); + const void *tag); +static void promote_rele(dsl_dataset_promote_arg_t *ddpa, const void *tag); int dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) @@ -3309,7 +3282,6 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *hds; struct promotenode *snap; - dsl_dataset_t *origin_ds, *origin_head; int err; uint64_t unused; uint64_t ss_mv_cnt; @@ -3329,12 +3301,11 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) } snap = list_head(&ddpa->shared_snaps); - origin_head = snap->ds; if (snap == NULL) { err = 
SET_ERROR(ENOENT); goto out; } - origin_ds = snap->ds; + dsl_dataset_t *const origin_ds = snap->ds; /* * Encrypted clones share a DSL Crypto Key with their origin's dsl dir. @@ -3430,10 +3401,10 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) * Check that bookmarks that are being transferred don't have * name conflicts. */ - for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks); + for (dsl_bookmark_node_t *dbn = avl_first(&origin_ds->ds_bookmarks); dbn != NULL && dbn->dbn_phys.zbm_creation_txg <= dsl_dataset_phys(origin_ds)->ds_creation_txg; - dbn = AVL_NEXT(&origin_head->ds_bookmarks, dbn)) { + dbn = AVL_NEXT(&origin_ds->ds_bookmarks, dbn)) { if (strlen(dbn->dbn_name) >= max_snap_len) { err = SET_ERROR(ENAMETOOLONG); goto out; @@ -3447,7 +3418,8 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) conflicting_snaps = B_TRUE; } else if (err == ESRCH) { err = 0; - } else if (err != 0) { + } + if (err != 0) { goto out; } } @@ -3741,6 +3713,15 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) dsl_dir_rele(odd, FTAG); promote_rele(ddpa, FTAG); + + /* + * Transfer common error blocks from old head to new head. + */ + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) { + uint64_t old_head = origin_head->ds_object; + uint64_t new_head = hds->ds_object; + spa_swap_errlog(dp->dp_spa, new_head, old_head, tx); + } } /* @@ -3751,7 +3732,7 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) */ static int snaplist_make(dsl_pool_t *dp, - uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag) + uint64_t first_obj, uint64_t last_obj, list_t *l, const void *tag) { uint64_t obj = last_obj; @@ -3796,15 +3777,14 @@ snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) } static void -snaplist_destroy(list_t *l, void *tag) +snaplist_destroy(list_t *l, const void *tag) { struct promotenode *snap; if (l == NULL || !list_link_active(&l->list_head)) return; - while ((snap = list_tail(l)) != NULL) { - list_remove(l, snap); + while ((snap = list_remove_tail(l)) != NULL) { dsl_dataset_rele(snap->ds, tag); kmem_free(snap, sizeof (*snap)); } @@ -3812,7 +3792,7 @@ snaplist_destroy(list_t *l, void *tag) } static int -promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag) +promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, const void *tag) { int error; dsl_dir_t *dd; @@ -3862,7 +3842,7 @@ out: } static void -promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag) +promote_rele(dsl_dataset_promote_arg_t *ddpa, const void *tag) { snaplist_destroy(&ddpa->shared_snaps, tag); snaplist_destroy(&ddpa->clone_snaps, tag); @@ -4305,7 +4285,6 @@ typedef struct dsl_dataset_set_qr_arg { } dsl_dataset_set_qr_arg_t; -/* ARGSUSED */ static int dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx) { @@ -4512,7 +4491,6 @@ typedef struct dsl_dataset_set_compression_arg { uint64_t ddsca_value; } dsl_dataset_set_compression_arg_t; -/* ARGSUSED */ static int dsl_dataset_set_compression_check(void *arg, dmu_tx_t *tx) { @@ -4540,6 +4518,7 @@ dsl_dataset_set_compression_sync(void *arg, dmu_tx_t *tx) uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value); spa_feature_t f = zio_compress_to_feature(compval); + ASSERT3S(f, !=, SPA_FEATURE_NONE); ASSERT3S(spa_feature_table[f].fi_type, ==, ZFEATURE_TYPE_BOOLEAN); VERIFY0(dsl_dataset_hold(dp, ddsca->ddsca_name, FTAG, &ds)); @@ -4951,7 +4930,7 @@ dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps, if (num_redact_snaps > 0) { ftuaa->array = kmem_alloc(num_redact_snaps * sizeof (uint64_t), 
KM_SLEEP); - bcopy(redact_snaps, ftuaa->array, num_redact_snaps * + memcpy(ftuaa->array, redact_snaps, num_redact_snaps * sizeof (uint64_t)); } dsl_dataset_activate_feature(dsobj, SPA_FEATURE_REDACTED_DATASETS, @@ -4959,19 +4938,45 @@ dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps, ds->ds_feature[SPA_FEATURE_REDACTED_DATASETS] = ftuaa; } -/* BEGIN CSTYLED */ -#if defined(_LP64) -#define RECORDSIZE_PERM ZMOD_RW -#else -/* Limited to 1M on 32-bit platforms due to lack of virtual address space */ -#define RECORDSIZE_PERM ZMOD_RD -#endif -ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, RECORDSIZE_PERM, +/* + * Find and return (in *oldest_dsobj) the oldest snapshot of the dsobj + * dataset whose birth time is >= min_txg. + */ +int +dsl_dataset_oldest_snapshot(spa_t *spa, uint64_t head_ds, uint64_t min_txg, + uint64_t *oldest_dsobj) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + + int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds); + if (error != 0) + return (error); + + uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + + while (prev_obj != 0 && min_txg < prev_obj_txg) { + dsl_dataset_rele(ds, FTAG); + if ((error = dsl_dataset_hold_obj(dp, prev_obj, + FTAG, &ds)) != 0) + return (error); + prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + } + *oldest_dsobj = ds->ds_object; + dsl_dataset_rele(ds, FTAG); + return (0); +} + +ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, UINT, ZMOD_RW, "Max allowed record size"); ZFS_MODULE_PARAM(zfs, zfs_, allow_redacted_dataset_mount, INT, ZMOD_RW, "Allow mounting of redacted datasets"); -/* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs, zfs_, snapshot_history_enabled, INT, ZMOD_RW, + "Include snapshot events in pool history/events"); EXPORT_SYMBOL(dsl_dataset_hold); EXPORT_SYMBOL(dsl_dataset_hold_flags); diff --git a/sys/contrib/openzfs/module/zfs/dsl_deadlist.c b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c index a77e381520db..eff1f7de7731 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_deadlist.c +++ b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -92,7 +92,7 @@ * will be loaded into memory and shouldn't take up an inordinate amount of * space. We settled on ~500000 entries, corresponding to roughly 128M. */ -unsigned long zfs_livelist_max_entries = 500000; +uint64_t zfs_livelist_max_entries = 500000; /* * We can approximate how much of a performance gain a livelist will give us @@ -173,8 +173,8 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl) * in parallel. Then open them all in a second pass. */ dle->dle_bpobj.bpo_object = za.za_first_integer; - dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object, + ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_tree, dle); } @@ -235,8 +235,8 @@ dsl_deadlist_load_cache(dsl_deadlist_t *dl) * in parallel. Then open them all in a second pass. 
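The new dsl_dataset_oldest_snapshot() above walks the ds_prev_snap_obj chain from the head dataset toward older snapshots and returns, in *oldest_dsobj, the oldest snapshot whose birth txg is still >= min_txg. A hypothetical caller (variable names illustrative) might look like:

        uint64_t oldest_obj;
        int err;

        err = dsl_dataset_oldest_snapshot(spa, head_ds, min_txg, &oldest_obj);
        if (err == 0) {
                zfs_dbgmsg("oldest snapshot of %llu born at or after txg %llu "
                    "is obj %llu", (u_longlong_t)head_ds,
                    (u_longlong_t)min_txg, (u_longlong_t)oldest_obj);
        }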
*/ dlce->dlce_bpobj = za.za_first_integer; - dmu_prefetch(dl->dl_os, dlce->dlce_bpobj, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj, + ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_cache, dlce); } VERIFY3U(error, ==, ENOENT); @@ -438,6 +438,18 @@ dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, } } +/* + * Prefetch metadata required for dle_enqueue_subobj(). + */ +static void +dle_prefetch_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, + uint64_t obj) +{ + if (dle->dle_bpobj.bpo_object != + dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) + bpobj_prefetch_subobj(&dle->dle_bpobj, obj); +} + void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) @@ -462,7 +474,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp); dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp); - dle_tofind.dle_mintxg = bp->blk_birth; + dle_tofind.dle_mintxg = BP_GET_LOGICAL_BIRTH(bp); dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); @@ -471,7 +483,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, if (dle == NULL) { zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu", - bp, (longlong_t)bp->blk_birth); + bp, (longlong_t)BP_GET_LOGICAL_BIRTH(bp)); dle = avl_first(&dl->dl_tree); } @@ -542,6 +554,7 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); ASSERT3P(dle, !=, NULL); dle_prev = AVL_PREV(&dl->dl_tree, dle); + ASSERT3P(dle_prev, !=, NULL); dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); @@ -809,6 +822,27 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, dle_enqueue_subobj(dl, dle, obj, tx); } +/* + * Prefetch metadata required for dsl_deadlist_insert_bpobj(). + */ +static void +dsl_deadlist_prefetch_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + + ASSERT(MUTEX_HELD(&dl->dl_lock)); + + dsl_deadlist_load_tree(dl); + + dle_tofind.dle_mintxg = birth; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + if (dle == NULL) + dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); + dle_prefetch_subobj(dl, dle, obj); +} + static int dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) @@ -825,12 +859,12 @@ dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) { - zap_cursor_t zc; - zap_attribute_t za; + zap_cursor_t zc, pzc; + zap_attribute_t *za, *pza; dmu_buf_t *bonus; dsl_deadlist_phys_t *dlp; dmu_object_info_t doi; - int error; + int error, perror, i; VERIFY0(dmu_object_info(dl->dl_os, obj, &doi)); if (doi.doi_type == DMU_OT_BPOBJ) { @@ -841,23 +875,46 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) return; } + za = kmem_alloc(sizeof (*za), KM_SLEEP); + pza = kmem_alloc(sizeof (*pza), KM_SLEEP); + mutex_enter(&dl->dl_lock); + /* + * Prefetch up to 128 deadlists first and then more as we progress. + * The limit is a balance between ARC use and diminishing returns. 
+ */ + for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0; + (perror = zap_cursor_retrieve(&pzc, pza)) == 0 && i < 128; + zap_cursor_advance(&pzc), i++) { + dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer, + zfs_strtonum(pza->za_name, NULL)); + } for (zap_cursor_init(&zc, dl->dl_os, obj); - (error = zap_cursor_retrieve(&zc, &za)) == 0; + (error = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { - uint64_t mintxg = zfs_strtonum(za.za_name, NULL); - dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); - VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx)); + dsl_deadlist_insert_bpobj(dl, za->za_first_integer, + zfs_strtonum(za->za_name, NULL), tx); + VERIFY0(zap_remove(dl->dl_os, obj, za->za_name, tx)); + if (perror == 0) { + dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer, + zfs_strtonum(pza->za_name, NULL)); + zap_cursor_advance(&pzc); + perror = zap_cursor_retrieve(&pzc, pza); + } } VERIFY3U(error, ==, ENOENT); zap_cursor_fini(&zc); + zap_cursor_fini(&pzc); VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); dlp = bonus->db_data; dmu_buf_will_dirty(bonus, tx); - bzero(dlp, sizeof (*dlp)); + memset(dlp, 0, sizeof (*dlp)); dmu_buf_rele(bonus, FTAG); mutex_exit(&dl->dl_lock); + + kmem_free(za, sizeof (*za)); + kmem_free(pza, sizeof (*pza)); } /* @@ -868,8 +925,9 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; - dsl_deadlist_entry_t *dle; + dsl_deadlist_entry_t *dle, *pdle; avl_index_t where; + int i; ASSERT(!dl->dl_oldfmt); @@ -881,11 +939,23 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); + /* + * Prefetch up to 128 deadlists first and then more as we progress. + * The limit is a balance between ARC use and diminishing returns. + */ + for (pdle = dle, i = 0; pdle && i < 128; i++) { + bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object); + pdle = AVL_NEXT(&dl->dl_tree, pdle); + } while (dle) { uint64_t used, comp, uncomp; dsl_deadlist_entry_t *dle_next; bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); + if (pdle) { + bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object); + pdle = AVL_NEXT(&dl->dl_tree, pdle); + } VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); @@ -930,8 +1000,6 @@ livelist_compare(const void *larg, const void *rarg) /* if vdevs are equal, sort by offsets. */ uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); - if (l_dva0_offset == r_dva0_offset) - ASSERT3U(l->blk_birth, ==, r->blk_birth); return (TREE_CMP(l_dva0_offset, r_dva0_offset)); } @@ -946,9 +1014,9 @@ struct livelist_iter_arg { * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a * corresponding FREE are stored in the supplied bplist. * - * Note that multiple FREE and ALLOC entries for the same blkptr may - * be encountered when dedup is involved. For this reason we keep a - * refcount for all the FREE entries of each blkptr and ensure that + * Note that multiple FREE and ALLOC entries for the same blkptr may be + * encountered when dedup or block cloning is involved. For this reason we + * keep a refcount for all the FREE entries of each blkptr and ensure that * each of those FREE entries has a corresponding ALLOC preceding it. 
*/ static int @@ -967,6 +1035,12 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, livelist_entry_t node; node.le_bp = *bp; livelist_entry_t *found = avl_find(avl, &node, NULL); + if (found) { + ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp)); + ASSERT3U(BP_GET_CHECKSUM(bp), ==, + BP_GET_CHECKSUM(&found->le_bp)); + ASSERT3U(BP_GET_BIRTH(bp), ==, BP_GET_BIRTH(&found->le_bp)); + } if (bp_freed) { if (found == NULL) { /* first free entry for this blkptr */ @@ -976,10 +1050,10 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, e->le_refcnt = 1; avl_add(avl, e); } else { - /* dedup block free */ - ASSERT(BP_GET_DEDUP(bp)); - ASSERT3U(BP_GET_CHECKSUM(bp), ==, - BP_GET_CHECKSUM(&found->le_bp)); + /* + * Deduped or cloned block free. We could assert D bit + * for dedup, but there is no such one for cloning. + */ ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt); found->le_refcnt++; } @@ -995,14 +1069,6 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, /* all tracked free pairs have been matched */ avl_remove(avl, found); kmem_free(found, sizeof (livelist_entry_t)); - } else { - /* - * This is definitely a deduped blkptr so - * let's validate it. - */ - ASSERT(BP_GET_DEDUP(bp)); - ASSERT3U(BP_GET_CHECKSUM(bp), ==, - BP_GET_CHECKSUM(&found->le_bp)); } } } @@ -1028,16 +1094,19 @@ dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t, .t = t }; int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size); + VERIFY(err != 0 || avl_numnodes(&avl) == 0); - VERIFY0(avl_numnodes(&avl)); + void *cookie = NULL; + livelist_entry_t *le = NULL; + while ((le = avl_destroy_nodes(&avl, &cookie)) != NULL) { + kmem_free(le, sizeof (livelist_entry_t)); + } avl_destroy(&avl); return (err); } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, U64, ZMOD_RW, "Size to start the next sub-livelist in a livelist"); ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, min_percent_shared, INT, ZMOD_RW, "Threshold at which livelist is disabled"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dsl_deleg.c b/sys/contrib/openzfs/module/zfs/dsl_deleg.c index cf8a3c9bbdfb..645ad8e5b8dc 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_deleg.c +++ b/sys/contrib/openzfs/module/zfs/dsl_deleg.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/dsl_destroy.c b/sys/contrib/openzfs/module/zfs/dsl_destroy.c index a2748197f29d..d4a6e5b6e9fd 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_destroy.c +++ b/sys/contrib/openzfs/module/zfs/dsl_destroy.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -49,6 +49,8 @@ #include <sys/zthr.h> #include <sys/spa_impl.h> +extern int zfs_snapshot_history_enabled; + int dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) { @@ -130,10 +132,11 @@ process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) ASSERT(!BP_IS_HOLE(bp)); - if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { + if (BP_GET_LOGICAL_BIRTH(bp) <= + dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx); if (poa->ds_prev && !poa->after_branch_point && - bp->blk_birth > + BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) { dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes += bp_get_dsize_sync(dp->dp_spa, bp); @@ -200,7 +203,7 @@ rck_alloc(dsl_dataset_t *clone) static void dsl_dir_remove_clones_key_impl(dsl_dir_t *dd, uint64_t mintxg, dmu_tx_t *tx, - list_t *stack, void *tag) + list_t *stack, const void *tag) { objset_t *mos = dd->dd_pool->dp_meta_objset; @@ -311,7 +314,8 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=, + tx->tx_txg); rrw_exit(&ds->ds_bp_rwlock, FTAG); ASSERT(zfs_refcount_is_zero(&ds->ds_longholds)); @@ -321,14 +325,19 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY; - spa_history_log_internal_ds(ds, "defer_destroy", tx, " "); + if (zfs_snapshot_history_enabled) { + spa_history_log_internal_ds(ds, "defer_destroy", tx, + " "); + } return; } ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); - /* We need to log before removing it from the namespace. */ - spa_history_log_internal_ds(ds, "destroy", tx, " "); + if (zfs_snapshot_history_enabled) { + /* We need to log before removing it from the namespace. 
*/ + spa_history_log_internal_ds(ds, "destroy", tx, " "); + } dsl_scan_ds_destroyed(ds, tx); @@ -651,7 +660,7 @@ dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer, zfs_lua_max_memlimit, fnvlist_lookup_nvpair(wrapper, ZCP_ARG_ARGLIST), result); if (error != 0) { - char *errorstr = NULL; + const char *errorstr = NULL; (void) nvlist_lookup_string(result, ZCP_RET_ERROR, &errorstr); if (errorstr != NULL) { zfs_dbgmsg("%s", errorstr); @@ -699,11 +708,11 @@ struct killarg { dmu_tx_t *tx; }; -/* ARGSUSED */ static int kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { + (void) spa, (void) dnp; struct killarg *ka = arg; dmu_tx_t *tx = ka->tx; @@ -720,7 +729,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); } else { ASSERT(zilog == NULL); - ASSERT3U(bp->blk_birth, >, + ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >, dsl_dataset_phys(ka->ds)->ds_prev_snap_txg); (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); } @@ -1010,7 +1019,8 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) ASSERT(ds->ds_prev == NULL || dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=, + tx->tx_txg); rrw_exit(&ds->ds_bp_rwlock, FTAG); ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); @@ -1118,6 +1128,16 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) != NULL) { if (dbn->dbn_phys.zbm_redaction_obj != 0) { + dnode_t *rl; + VERIFY0(dnode_hold(mos, + dbn->dbn_phys.zbm_redaction_obj, FTAG, + &rl)); + if (rl->dn_have_spill) { + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_REDACTION_LIST_SPILL, + tx); + } + dnode_rele(rl, FTAG); VERIFY0(dmu_object_free(mos, dbn->dbn_phys.zbm_redaction_obj, tx)); spa_feature_decr(dmu_objset_spa(mos), @@ -1153,6 +1173,9 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx); dsl_dataset_rele(prev, FTAG); } + /* Delete errlog. */ + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) + spa_delete_dataset_errlog(dp->dp_spa, ds->ds_object, tx); } void @@ -1246,10 +1269,10 @@ dsl_destroy_head(const char *name) * inconsistent datasets, even if we encounter an error trying to * process one of them. */ -/* ARGSUSED */ int dsl_destroy_inconsistent(const char *dsname, void *arg) { + (void) arg; objset_t *os; if (dmu_objset_hold(dsname, FTAG, &os) == 0) { diff --git a/sys/contrib/openzfs/module/zfs/dsl_dir.c b/sys/contrib/openzfs/module/zfs/dsl_dir.c index 84caace4dbab..baf970121a61 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dir.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dir.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -26,6 +26,7 @@ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. 
*/ #include <sys/dmu.h> @@ -54,6 +55,15 @@ #include "zfs_prop.h" /* + * This controls if we verify the ZVOL quota or not. + * Currently, quotas are not implemented for ZVOLs. + * The quota size is the size of the ZVOL. + * The size of the volume already implies the ZVOL size quota. + * The quota mechanism can introduce a significant performance drop. + */ +static int zvol_enforce_quotas = B_TRUE; + +/* * Filesystem and Snapshot Limits * ------------------------------ * @@ -121,8 +131,6 @@ * dsl_dir_init_fs_ss_count(). */ -extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); - static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); typedef struct ddulrt_arg { @@ -162,7 +170,7 @@ dsl_dir_evict_async(void *dbu) int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, - const char *tail, void *tag, dsl_dir_t **ddp) + const char *tail, const void *tag, dsl_dir_t **ddp) { dmu_buf_t *dbuf; dsl_dir_t *dd; @@ -209,8 +217,6 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, } } - dsl_dir_snap_cmtime_update(dd); - if (dsl_dir_phys(dd)->dd_parent_obj) { err = dsl_dir_hold_obj(dp, dsl_dir_phys(dd)->dd_parent_obj, NULL, dd, @@ -272,6 +278,16 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, } } + if (dsl_dir_is_zapified(dd)) { + inode_timespec_t t = {0}; + (void) zap_lookup(dp->dp_meta_objset, ddobj, + DD_FIELD_SNAPSHOTS_CHANGED, + sizeof (uint64_t), + sizeof (inode_timespec_t) / sizeof (uint64_t), + &t); + dd->dd_snap_cmtime = t; + } + dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, &dd->dd_dbuf); winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); @@ -322,7 +338,7 @@ errout: } void -dsl_dir_rele(dsl_dir_t *dd, void *tag) +dsl_dir_rele(dsl_dir_t *dd, const void *tag) { dprintf_dd(dd, "%s\n", ""); spa_close(dd->dd_pool->dp_spa, tag); @@ -337,7 +353,7 @@ dsl_dir_rele(dsl_dir_t *dd, void *tag) * the spa. */ void -dsl_dir_async_rele(dsl_dir_t *dd, void *tag) +dsl_dir_async_rele(dsl_dir_t *dd, const void *tag) { dprintf_dd(dd, "%s\n", ""); spa_async_close(dd->dd_pool->dp_spa, tag); @@ -422,8 +438,7 @@ getcomponent(const char *path, char *component, const char **nextp) } else if (p[0] == '/') { if (p - path >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); - (void) strncpy(component, path, p - path); - component[p - path] = '\0'; + (void) strlcpy(component, path, p - path + 1); p++; } else if (p[0] == '@') { /* @@ -434,8 +449,7 @@ getcomponent(const char *path, char *component, const char **nextp) return (SET_ERROR(EINVAL)); if (p - path >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); - (void) strncpy(component, path, p - path); - component[p - path] = '\0'; + (void) strlcpy(component, path, p - path + 1); } else { panic("invalid p=%p", (void *)p); } @@ -451,7 +465,7 @@ getcomponent(const char *path, char *component, const char **nextp) * (*tail)[0] == '@' means that the last component is a snapshot. 
*/ int -dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, +dsl_dir_hold(dsl_pool_t *dp, const char *name, const void *tag, dsl_dir_t **ddp, const char **tailp) { char *buf; @@ -764,6 +778,8 @@ dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, */ if (secpolicy_zfs_proc(cr, proc) == 0) return (ENFORCE_NEVER); +#else + (void) proc; #endif if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0) @@ -801,7 +817,7 @@ dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, { objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t limit, count; - char *count_prop; + const char *count_prop; enforce_res_t enforce; int err = 0; @@ -809,6 +825,18 @@ dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT); + if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { + /* + * We don't enforce the limit for temporary snapshots. This is + * indicated by a NULL cred_t argument. + */ + if (cr == NULL) + return (0); + + count_prop = DD_FIELD_SNAPSHOT_COUNT; + } else { + count_prop = DD_FIELD_FILESYSTEM_COUNT; + } /* * If we're allowed to change the limit, don't enforce the limit * e.g. this can happen if a snapshot is taken by an administrative @@ -828,19 +856,6 @@ dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, if (delta == 0) return (0); - if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { - /* - * We don't enforce the limit for temporary snapshots. This is - * indicated by a NULL cred_t argument. - */ - if (cr == NULL) - return (0); - - count_prop = DD_FIELD_SNAPSHOT_COUNT; - } else { - count_prop = DD_FIELD_FILESYSTEM_COUNT; - } - /* * If an ancestor has been provided, stop checking the limit once we * hit that dir. We need this during rename so that we don't overcount @@ -1172,10 +1187,9 @@ dsl_dir_space_towrite(dsl_dir_t *dd) ASSERT(MUTEX_HELD(&dd->dd_lock)); - for (int i = 0; i < TXG_SIZE; i++) { + for (int i = 0; i < TXG_SIZE; i++) space += dd->dd_space_towrite[i & TXG_MASK]; - ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0); - } + return (space); } @@ -1262,6 +1276,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, uint64_t quota; struct tempreserve *tr; int retval; + uint64_t ext_quota; uint64_t ref_rsrv; top_of_function: @@ -1305,7 +1320,9 @@ top_of_function: * If this transaction will result in a net free of space, * we want to let it through. */ - if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0) + if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0 || + (tx->tx_objset && dmu_objset_type(tx->tx_objset) == DMU_OST_ZVOL && + zvol_enforce_quotas == B_FALSE)) quota = UINT64_MAX; else quota = dsl_dir_phys(dd)->dd_quota; @@ -1320,7 +1337,6 @@ top_of_function: * we're very close to full, this will allow a steady trickle of * removes to get through. */ - uint64_t deferred = 0; if (dd->dd_parent == NULL) { uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool, (netfree) ? @@ -1335,21 +1351,31 @@ top_of_function: /* * If they are requesting more space, and our current estimate * is over quota, they get to try again unless the actual - * on-disk is over quota and there are no pending changes (which - * may free up space for us). + * on-disk is over quota and there are no pending changes + * or deferred frees (which may free up space for us). 
*/ - if (used_on_disk + est_inflight >= quota) { - if (est_inflight > 0 || used_on_disk < quota || - (retval == ENOSPC && used_on_disk < quota + deferred)) - retval = ERESTART; + ext_quota = quota >> 5; + if (quota == UINT64_MAX) + ext_quota = 0; + + if (used_on_disk >= quota) { + if (retval == ENOSPC && (used_on_disk - quota) < + dsl_pool_deferred_space(dd->dd_pool)) { + retval = SET_ERROR(ERESTART); + } + /* Quota exceeded */ + mutex_exit(&dd->dd_lock); + DMU_TX_STAT_BUMP(dmu_tx_quota); + return (retval); + } else if (used_on_disk + est_inflight >= quota + ext_quota) { dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " - "quota=%lluK tr=%lluK err=%d\n", + "quota=%lluK tr=%lluK\n", (u_longlong_t)used_on_disk>>10, (u_longlong_t)est_inflight>>10, - (u_longlong_t)quota>>10, (u_longlong_t)asize>>10, retval); + (u_longlong_t)quota>>10, (u_longlong_t)asize>>10); mutex_exit(&dd->dd_lock); DMU_TX_STAT_BUMP(dmu_tx_quota); - return (SET_ERROR(retval)); + return (SET_ERROR(ERESTART)); } /* We need to up our estimated delta before dropping dd_lock */ @@ -1377,10 +1403,9 @@ top_of_function: ignorequota = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); first = B_FALSE; goto top_of_function; - - } else { - return (0); } + + return (0); } /* @@ -1459,7 +1484,7 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) if (tr_cookie == NULL) return; - while ((tr = list_head(tr_list)) != NULL) { + while ((tr = list_remove_head(tr_list)) != NULL) { if (tr->tr_ds) { mutex_enter(&tr->tr_ds->dd_lock); ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, @@ -1469,7 +1494,6 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) } else { arc_tempreserve_clear(tr->tr_size); } - list_remove(tr_list, tr); kmem_free(tr, sizeof (struct tempreserve)); } @@ -1896,10 +1920,10 @@ typedef struct dsl_valid_rename_arg { int nest_delta; } dsl_valid_rename_arg_t; -/* ARGSUSED */ static int dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { + (void) dp; dsl_valid_rename_arg_t *dvra = arg; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; @@ -2094,6 +2118,8 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, &mynewname)); + ASSERT3P(mynewname, !=, NULL); + /* Log this before we change the name. */ spa_history_log_internal_dd(dd, "rename", tx, "-> %s", ddra->ddra_newname); @@ -2236,13 +2262,25 @@ dsl_dir_snap_cmtime(dsl_dir_t *dd) } void -dsl_dir_snap_cmtime_update(dsl_dir_t *dd) +dsl_dir_snap_cmtime_update(dsl_dir_t *dd, dmu_tx_t *tx) { + dsl_pool_t *dp = dmu_tx_pool(tx); inode_timespec_t t; - gethrestime(&t); + mutex_enter(&dd->dd_lock); dd->dd_snap_cmtime = t; + if (spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_EXTENSIBLE_DATASET)) { + objset_t *mos = dd->dd_pool->dp_meta_objset; + uint64_t ddobj = dd->dd_object; + dsl_dir_zapify(dd, tx); + VERIFY0(zap_update(mos, ddobj, + DD_FIELD_SNAPSHOTS_CHANGED, + sizeof (uint64_t), + sizeof (inode_timespec_t) / sizeof (uint64_t), + &t, tx)); + } mutex_exit(&dd->dd_lock); } @@ -2396,6 +2434,7 @@ dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds, * The delete queue is ZPL specific, and libzpool doesn't have * it. It doesn't make sense to wait for it. 
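To put the new soft headroom in dsl_dir_tempreserve_impl() in concrete terms: ext_quota is quota >> 5, roughly 3 percent. With a hypothetical 128 GiB dd_quota that is 4 GiB of slack, so the estimate-based check only returns ERESTART once used_on_disk + est_inflight reaches 132 GiB, while actually reaching 128 GiB on disk returns the quota error immediately, unless the overshoot is smaller than the pool's deferred free space, in which case ERESTART lets the caller retry after the pending frees complete.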
*/ + (void) ds; *in_progress = B_FALSE; break; #endif @@ -2448,3 +2487,7 @@ dsl_dir_cancel_waiters(dsl_dir_t *dd) EXPORT_SYMBOL(dsl_dir_set_quota); EXPORT_SYMBOL(dsl_dir_set_reservation); #endif + +/* CSTYLED */ +ZFS_MODULE_PARAM(zfs, , zvol_enforce_quotas, INT, ZMOD_RW, + "Enable strict ZVOL quota enforcment"); diff --git a/sys/contrib/openzfs/module/zfs/dsl_pool.c b/sys/contrib/openzfs/module/zfs/dsl_pool.c index 1350f1329564..342ec5c15c79 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_pool.c +++ b/sys/contrib/openzfs/module/zfs/dsl_pool.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -99,32 +99,31 @@ * capped at zfs_dirty_data_max_max. It can also be overridden with a module * parameter. */ -unsigned long zfs_dirty_data_max = 0; -unsigned long zfs_dirty_data_max_max = 0; -int zfs_dirty_data_max_percent = 10; -int zfs_dirty_data_max_max_percent = 25; +uint64_t zfs_dirty_data_max = 0; +uint64_t zfs_dirty_data_max_max = 0; +uint_t zfs_dirty_data_max_percent = 10; +uint_t zfs_dirty_data_max_max_percent = 25; /* - * zfs_wrlog_data_max, the upper limit of TX_WRITE log data. - * Once it is reached, write operation is blocked, - * until log data is cleared out after txg sync. + * The upper limit of TX_WRITE log data. Write operations are throttled + * when approaching the limit until log data is cleared out after txg sync. * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. */ -unsigned long zfs_wrlog_data_max = 0; +uint64_t zfs_wrlog_data_max = 0; /* * If there's at least this much dirty data (as a percentage of * zfs_dirty_data_max), push out a txg. This should be less than * zfs_vdev_async_write_active_min_dirty_percent. */ -int zfs_dirty_data_sync_percent = 20; +static uint_t zfs_dirty_data_sync_percent = 20; /* * Once there is this amount of dirty data, the dmu_tx_delay() will kick in * and delay each transaction. * This value should be >= zfs_vdev_async_write_active_max_dirty_percent. */ -int zfs_delay_min_dirty_percent = 60; +uint_t zfs_delay_min_dirty_percent = 60; /* * This controls how quickly the delay approaches infinity. @@ -139,12 +138,7 @@ int zfs_delay_min_dirty_percent = 60; * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the * multiply in dmu_tx_delay(). */ -unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000; - -/* - * This determines the number of threads used by the dp_sync_taskq. - */ -int zfs_sync_taskq_batch_pct = 75; +uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000; /* * These tunables determine the behavior of how zil_itxg_clean() is @@ -172,9 +166,9 @@ int zfs_sync_taskq_batch_pct = 75; * Additionally, the number of threads used by the taskq can be * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable. 
*/ -int zfs_zil_clean_taskq_nthr_pct = 100; -int zfs_zil_clean_taskq_minalloc = 1024; -int zfs_zil_clean_taskq_maxalloc = 1024 * 1024; +static int zfs_zil_clean_taskq_nthr_pct = 100; +static int zfs_zil_clean_taskq_minalloc = 1024; +static int zfs_zil_clean_taskq_maxalloc = 1024 * 1024; int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) @@ -215,9 +209,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) txg_list_create(&dp->dp_early_sync_tasks, spa, offsetof(dsl_sync_task_t, dst_node)); - dp->dp_sync_taskq = taskq_create("dp_sync_taskq", - zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX, - TASKQ_THREADS_CPU_PCT); + dp->dp_sync_taskq = spa_sync_tq_create(spa, "dp_sync_taskq"); dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq", zfs_zil_clean_taskq_nthr_pct, minclsyspri, @@ -332,7 +324,6 @@ dsl_pool_open(dsl_pool_t *dp) /* * We might not have created the remap bpobj yet. */ - err = 0; } else { goto out; } @@ -411,7 +402,7 @@ dsl_pool_close(dsl_pool_t *dp) txg_list_destroy(&dp->dp_dirty_dirs); taskq_destroy(dp->dp_zil_clean_taskq); - taskq_destroy(dp->dp_sync_taskq); + spa_sync_tq_destroy(dp->dp_spa); /* * We can't set retry to TRUE since we're explicitly specifying @@ -439,10 +430,8 @@ dsl_pool_close(dsl_pool_t *dp) taskq_destroy(dp->dp_unlinked_drain_taskq); taskq_destroy(dp->dp_zrele_taskq); - if (dp->dp_blkstats != NULL) { - mutex_destroy(&dp->dp_blkstats->zab_lock); + if (dp->dp_blkstats != NULL) vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); - } kmem_free(dp, sizeof (dsl_pool_t)); } @@ -476,8 +465,8 @@ dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) } dsl_pool_t * -dsl_pool_create(spa_t *spa, nvlist_t *zplprops, dsl_crypto_params_t *dcp, - uint64_t txg) +dsl_pool_create(spa_t *spa, nvlist_t *zplprops __attribute__((unused)), + dsl_crypto_params_t *dcp, uint64_t txg) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); @@ -623,15 +612,18 @@ dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg) /* Choose a value slightly bigger than min dirty sync bytes */ uint64_t sync_min = - zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100; + zfs_wrlog_data_max * (zfs_dirty_data_sync_percent + 10) / 200; if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0) txg_kick(dp, txg); } boolean_t -dsl_pool_wrlog_over_max(dsl_pool_t *dp) +dsl_pool_need_wrlog_delay(dsl_pool_t *dp) { - return (aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max) > 0); + uint64_t delay_min_bytes = + zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; + + return (aggsum_compare(&dp->dp_wrlog_total, delay_min_bytes) > 0); } static void @@ -641,6 +633,9 @@ dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg) delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta); aggsum_add(&dp->dp_wrlog_total, delta); + /* Compact per-CPU sums after the big change. 
*/ + (void) aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); + (void) aggsum_value(&dp->dp_wrlog_total); } #ifdef ZFS_DEBUG @@ -664,12 +659,15 @@ dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) return (B_TRUE); } +#else +#define dsl_early_sync_task_verify(dp, txg) \ + ((void) sizeof (dp), (void) sizeof (txg), B_TRUE) #endif void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) { - zio_t *zio; + zio_t *rio; /* root zio for all dirty dataset syncs */ dmu_tx_t *tx; dsl_dir_t *dd; dsl_dataset_t *ds; @@ -699,9 +697,10 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) } /* - * Write out all dirty blocks of dirty datasets. + * Write out all dirty blocks of dirty datasets. Note, this could + * create a very large (+10k) zio tree. */ - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { /* * We must not sync any non-MOS datasets twice, because @@ -710,9 +709,9 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) */ ASSERT(!list_link_active(&ds->ds_synced_link)); list_insert_tail(&synced_datasets, ds); - dsl_dataset_sync(ds, zio, tx); + dsl_dataset_sync(ds, rio, tx); } - VERIFY0(zio_wait(zio)); + VERIFY0(zio_wait(rio)); /* * Update the long range free counter after @@ -743,13 +742,13 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) * user accounting information (and we won't get confused * about which blocks are part of the snapshot). */ - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { objset_t *os = ds->ds_objset; ASSERT(list_link_active(&ds->ds_synced_link)); dmu_buf_rele(ds->ds_dbuf, ds); - dsl_dataset_sync(ds, zio, tx); + dsl_dataset_sync(ds, rio, tx); /* * Release any key mappings created by calls to @@ -762,7 +761,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds); } } - VERIFY0(zio_wait(zio)); + VERIFY0(zio_wait(rio)); /* * Now that the datasets have been completely synced, we can @@ -783,6 +782,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) } dsl_dataset_sync_done(ds, tx); + dmu_buf_rele(ds->ds_dbuf, ds); } while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) { @@ -947,24 +947,30 @@ dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy) return (quota); } +uint64_t +dsl_pool_deferred_space(dsl_pool_t *dp) +{ + return (metaslab_class_get_deferred(spa_normal_class(dp->dp_spa))); +} + boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp) { uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - mutex_enter(&dp->dp_lock); - uint64_t dirty = dp->dp_dirty_total; - mutex_exit(&dp->dp_lock); - - return (dirty > delay_min_bytes); + /* + * We are not taking the dp_lock here and few other places, since torn + * reads are unlikely: on 64-bit systems due to register size and on + * 32-bit due to memory constraints. Pool-wide locks in hot path may + * be too expensive, while we do not need a precise result here. 
+ */ + return (dp->dp_dirty_total > delay_min_bytes); } static boolean_t dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg) { - ASSERT(MUTEX_HELD(&dp->dp_lock)); - uint64_t dirty_min_bytes = zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; @@ -1007,7 +1013,6 @@ dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) mutex_exit(&dp->dp_lock); } -/* ARGSUSED */ static int upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { @@ -1042,7 +1047,7 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) * will be wrong. */ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth); + ASSERT0(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(prev)->ds_bp)); rrw_exit(&ds->ds_bp_rwlock, FTAG); /* The origin doesn't get attached to itself */ @@ -1098,7 +1103,6 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); } -/* ARGSUSED */ static int upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { @@ -1377,7 +1381,7 @@ dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, */ int -dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp) +dsl_pool_hold(const char *name, const void *tag, dsl_pool_t **dp) { spa_t *spa; int error; @@ -1391,14 +1395,14 @@ dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp) } void -dsl_pool_rele(dsl_pool_t *dp, void *tag) +dsl_pool_rele(dsl_pool_t *dp, const void *tag) { dsl_pool_config_exit(dp, tag); spa_close(dp->dp_spa, tag); } void -dsl_pool_config_enter(dsl_pool_t *dp, void *tag) +dsl_pool_config_enter(dsl_pool_t *dp, const void *tag) { /* * We use a "reentrant" reader-writer lock, but not reentrantly. @@ -1417,14 +1421,14 @@ dsl_pool_config_enter(dsl_pool_t *dp, void *tag) } void -dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag) +dsl_pool_config_enter_prio(dsl_pool_t *dp, const void *tag) { ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); rrw_enter_read_prio(&dp->dp_config_rwlock, tag); } void -dsl_pool_config_exit(dsl_pool_t *dp, void *tag) +dsl_pool_config_exit(dsl_pool_t *dp, const void *tag) { rrw_exit(&dp->dp_config_rwlock, tag); } @@ -1444,37 +1448,33 @@ dsl_pool_config_held_writer(dsl_pool_t *dp) EXPORT_SYMBOL(dsl_pool_config_enter); EXPORT_SYMBOL(dsl_pool_config_exit); -/* BEGIN CSTYLED */ /* zfs_dirty_data_max_percent only applied at module load in arc_init(). */ -ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_percent, INT, ZMOD_RD, +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_percent, UINT, ZMOD_RD, "Max percent of RAM allowed to be dirty"); /* zfs_dirty_data_max_max_percent only applied at module load in arc_init(). */ -ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max_percent, INT, ZMOD_RD, +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max_percent, UINT, ZMOD_RD, "zfs_dirty_data_max upper bound as % of RAM"); -ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, UINT, ZMOD_RW, "Transaction delay threshold"); -ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, U64, ZMOD_RW, "Determines the dirty space limit"); -ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, U64, ZMOD_RW, "The size limit of write-transaction zil log data"); /* zfs_dirty_data_max_max only applied at module load in arc_init(). 
*/ -ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD, +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, U64, ZMOD_RD, "zfs_dirty_data_max upper bound in bytes"); -ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, UINT, ZMOD_RW, "Dirty data txg sync threshold as a percentage of zfs_dirty_data_max"); -ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, U64, ZMOD_RW, "How quickly delay approaches infinity"); -ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW, - "Max percent of CPUs that are used to sync dirty data"); - ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW, "Max percent of CPUs that are used per dp_sync_taskq"); @@ -1483,4 +1483,3 @@ ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_minalloc, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_maxalloc, INT, ZMOD_RW, "Max number of taskq entries that are cached"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dsl_prop.c b/sys/contrib/openzfs/module/zfs/dsl_prop.c index dfa04d7681be..99f931cd8632 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_prop.c +++ b/sys/contrib/openzfs/module/zfs/dsl_prop.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -23,6 +23,7 @@ * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright 2019 Joyent, Inc. + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. 
*/ #include <sys/zfs_context.h> @@ -41,6 +42,7 @@ #define ZPROP_INHERIT_SUFFIX "$inherit" #define ZPROP_RECVD_SUFFIX "$recvd" +#define ZPROP_IUV_SUFFIX "$iuv" static int dodefault(zfs_prop_t prop, int intsz, int numints, void *buf) @@ -57,7 +59,7 @@ dodefault(zfs_prop_t prop, int intsz, int numints, void *buf) if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { if (intsz != 1) return (SET_ERROR(EOVERFLOW)); - (void) strncpy(buf, zfs_prop_default_string(prop), + (void) strlcpy(buf, zfs_prop_default_string(prop), numints); } else { if (intsz != 8 || numints < 1) @@ -69,6 +71,17 @@ dodefault(zfs_prop_t prop, int intsz, int numints, void *buf) return (0); } +static int +dsl_prop_known_index(zfs_prop_t prop, uint64_t value) +{ + const char *str = NULL; + if (prop != ZPROP_CONT && prop != ZPROP_INVAL && + zfs_prop_get_type(prop) == PROP_TYPE_INDEX) + return (!zfs_prop_index_to_string(prop, value, &str)); + + return (-1); +} + int dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot) @@ -81,6 +94,7 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, boolean_t inheriting = B_FALSE; char *inheritstr; char *recvdstr; + char *iuvstr; ASSERT(dsl_pool_config_held(dd->dd_pool)); @@ -88,9 +102,10 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, setpoint[0] = '\0'; prop = zfs_name_to_prop(propname); - inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); + inheritable = (prop == ZPROP_USERPROP || zfs_prop_inheritable(prop)); inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); + iuvstr = kmem_asprintf("%s%s", propname, ZPROP_IUV_SUFFIX); /* * Note: dd may become NULL, therefore we shouldn't dereference it @@ -105,6 +120,18 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, inheriting = B_TRUE; } + /* Check for a iuv value. */ + err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, + iuvstr, intsz, numints, buf); + if (err == 0 && dsl_prop_known_index(prop, + *(uint64_t *)buf) != 1) + err = ENOENT; + if (err != ENOENT) { + if (setpoint != NULL && err == 0) + dsl_dir_name(dd, setpoint); + break; + } + /* Check for a local value. */ err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, propname, intsz, numints, buf); @@ -155,6 +182,7 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, kmem_strfree(inheritstr); kmem_strfree(recvdstr); + kmem_strfree(iuvstr); return (err); } @@ -168,7 +196,7 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, uint64_t zapobj; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); - inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); + inheritable = (prop == ZPROP_USERPROP || zfs_prop_inheritable(prop)); zapobj = dsl_dataset_phys(ds)->ds_props_obj; if (zapobj != 0) { @@ -504,10 +532,10 @@ dsl_prop_hascb(dsl_dataset_t *ds) return (!list_is_empty(&ds->ds_prop_cbs)); } -/* ARGSUSED */ static int dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { + (void) arg; dsl_dir_t *dd = ds->ds_dir; dsl_prop_record_t *pr; dsl_prop_cb_record_t *cbr; @@ -647,6 +675,45 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, dsl_dir_rele(dd, FTAG); } + +/* + * For newer values in zfs index type properties, we add a new key + * propname$iuv (iuv = Ignore Unknown Values) to the properties zap object + * to store the new property value and store the default value in the + * existing prop key. 
This way, the propname$iuv key is ignored by older zfs + * versions and the default property value from the existing prop key is + * used. + */ +static void +dsl_prop_set_iuv(objset_t *mos, uint64_t zapobj, const char *propname, + int intsz, int numints, const void *value, dmu_tx_t *tx) +{ + char *iuvstr = kmem_asprintf("%s%s", propname, ZPROP_IUV_SUFFIX); + boolean_t iuv = B_FALSE; + zfs_prop_t prop = zfs_name_to_prop(propname); + + switch (prop) { + case ZFS_PROP_REDUNDANT_METADATA: + if (*(uint64_t *)value == ZFS_REDUNDANT_METADATA_SOME || + *(uint64_t *)value == ZFS_REDUNDANT_METADATA_NONE) + iuv = B_TRUE; + break; + default: + break; + } + + if (iuv) { + VERIFY0(zap_update(mos, zapobj, iuvstr, intsz, numints, + value, tx)); + uint64_t val = zfs_prop_default_numeric(prop); + VERIFY0(zap_update(mos, zapobj, propname, intsz, numints, + &val, tx)); + } else { + zap_remove(mos, zapobj, iuvstr, tx); + } + kmem_strfree(iuvstr); +} + void dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, zprop_source_t source, int intsz, int numints, const void *value, @@ -659,6 +726,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, const char *valstr = NULL; char *inheritstr; char *recvdstr; + char *iuvstr; char *tbuf = NULL; int err; uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa); @@ -692,6 +760,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); + iuvstr = kmem_asprintf("%s%s", propname, ZPROP_IUV_SUFFIX); switch ((int)source) { case ZPROP_SRC_NONE: @@ -709,11 +778,14 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, /* * remove propname$inherit * set propname -> value + * set propname$iuv -> new property value */ err = zap_remove(mos, zapobj, inheritstr, tx); ASSERT(err == 0 || err == ENOENT); VERIFY0(zap_update(mos, zapobj, propname, intsz, numints, value, tx)); + (void) dsl_prop_set_iuv(mos, zapobj, propname, intsz, + numints, value, tx); break; case ZPROP_SRC_INHERITED: /* @@ -723,6 +795,8 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, */ err = zap_remove(mos, zapobj, propname, tx); ASSERT(err == 0 || err == ENOENT); + err = zap_remove(mos, zapobj, iuvstr, tx); + ASSERT(err == 0 || err == ENOENT); if (version >= SPA_VERSION_RECVD_PROPS && dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) { dummy = 0; @@ -749,7 +823,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, ASSERT(err == 0 || err == ENOENT); err = zap_remove(mos, zapobj, inheritstr, tx); ASSERT(err == 0 || err == ENOENT); - fallthrough; + zfs_fallthrough; case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): /* * remove propname$recvd @@ -763,6 +837,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, kmem_strfree(inheritstr); kmem_strfree(recvdstr); + kmem_strfree(iuvstr); /* * If we are left with an empty snap zap we can destroy it. @@ -881,7 +956,7 @@ dsl_props_set_check(void *arg, dmu_tx_t *tx) return (SET_ERROR(ENAMETOOLONG)); } if (nvpair_type(elem) == DATA_TYPE_STRING) { - char *valstr = fnvpair_value_string(elem); + const char *valstr = fnvpair_value_string(elem); if (strlen(valstr) >= (version < SPA_VERSION_STMF_PROP ? ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { @@ -1012,6 +1087,14 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, propname = za.za_name; source = setpoint; + + /* Skip if iuv entries are present.
*/ + valstr = kmem_asprintf("%s%s", propname, + ZPROP_IUV_SUFFIX); + err = zap_contains(mos, propobj, valstr); + kmem_strfree(valstr); + if (err == 0) + continue; } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) { /* Skip explicitly inherited entries. */ continue; @@ -1019,8 +1102,8 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, if (flags & DSL_PROP_GET_LOCAL) continue; - (void) strncpy(buf, za.za_name, (suffix - za.za_name)); - buf[suffix - za.za_name] = '\0'; + (void) strlcpy(buf, za.za_name, + MIN(sizeof (buf), suffix - za.za_name + 1)); propname = buf; if (!(flags & DSL_PROP_GET_RECEIVED)) { @@ -1044,6 +1127,16 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, source = ((flags & DSL_PROP_GET_INHERITING) ? setpoint : ZPROP_SOURCE_VAL_RECVD); + } else if (strcmp(suffix, ZPROP_IUV_SUFFIX) == 0) { + (void) strlcpy(buf, za.za_name, + MIN(sizeof (buf), suffix - za.za_name + 1)); + propname = buf; + source = setpoint; + prop = zfs_name_to_prop(propname); + + if (dsl_prop_known_index(prop, + za.za_first_integer) != 1) + continue; } else { /* * For backward compatibility, skip suffixes we don't @@ -1055,12 +1148,12 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, prop = zfs_name_to_prop(propname); /* Skip non-inheritable properties. */ - if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL && - !zfs_prop_inheritable(prop)) + if ((flags & DSL_PROP_GET_INHERITING) && + prop != ZPROP_USERPROP && !zfs_prop_inheritable(prop)) continue; /* Skip properties not valid for this type. */ - if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL && + if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_USERPROP && !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT, B_FALSE)) continue; diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c index d25c067dfbc1..085cfd3c5691 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_scan.c +++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -37,6 +37,7 @@ #include <sys/dmu_tx.h> #include <sys/dmu_objset.h> #include <sys/arc.h> +#include <sys/arc_impl.h> #include <sys/zap.h> #include <sys/zio.h> #include <sys/zfs_context.h> @@ -46,12 +47,14 @@ #include <sys/vdev_impl.h> #include <sys/zil_impl.h> #include <sys/zio_checksum.h> +#include <sys/brt.h> #include <sys/ddt.h> #include <sys/sa.h> #include <sys/sa_impl.h> #include <sys/zfeature.h> #include <sys/abd.h> #include <sys/range_tree.h> +#include <sys/dbuf.h> #ifdef _KERNEL #include <sys/zfs_vfsops.h> #endif @@ -126,9 +129,20 @@ static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); -static uint64_t dsl_scan_count_data_disks(vdev_t *vd); +static uint64_t dsl_scan_count_data_disks(spa_t *spa); +static void read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb); -extern int zfs_vdev_async_write_active_min_dirty_percent; +extern uint_t zfs_vdev_async_write_active_min_dirty_percent; +static int zfs_scan_blkstats = 0; + +/* + * 'zpool status' uses bytes processed per pass to report throughput and + * estimate time remaining. We define a pass to start when the scanning + * phase completes for a sequential resilver. Optionally, this value + * may be used to reset the pass statistics every N txgs to provide an + * estimated completion time based on currently observed performance. + */ +static uint_t zfs_scan_report_txgs = 0; /* * By default zfs will check to ensure it is not over the hard memory @@ -136,7 +150,7 @@ extern int zfs_vdev_async_write_active_min_dirty_percent; * this value can be set to 1 to enable checking before scanning each * block. */ -int zfs_scan_strict_mem_lim = B_FALSE; +static int zfs_scan_strict_mem_lim = B_FALSE; /* * Maximum number of parallelly executed bytes per leaf vdev. We attempt @@ -146,41 +160,57 @@ int zfs_scan_strict_mem_lim = B_FALSE; * overload the drives with I/O, since that is protected by * zfs_vdev_scrub_max_active. */ -unsigned long zfs_scan_vdev_limit = 4 << 20; +static uint64_t zfs_scan_vdev_limit = 16 << 20; + +static uint_t zfs_scan_issue_strategy = 0; -int zfs_scan_issue_strategy = 0; -int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */ -unsigned long zfs_scan_max_ext_gap = 2 << 20; /* in bytes */ +/* don't queue & sort zios, go direct */ +static int zfs_scan_legacy = B_FALSE; +static uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */ /* * fill_weight is non-tunable at runtime, so we copy it at module init from * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would * break queue sorting. 
*/ -int zfs_scan_fill_weight = 3; +static uint_t zfs_scan_fill_weight = 3; static uint64_t fill_weight; /* See dsl_scan_should_clear() for details on the memory limit tunables */ -uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */ -uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */ -int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */ -int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */ - -int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ -int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */ -int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ -int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ -int zfs_scan_checkpoint_intval = 7200; /* in seconds */ +static const uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */ +static const uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */ + + +/* fraction of physmem */ +static uint_t zfs_scan_mem_lim_fact = 20; + +/* fraction of mem lim above */ +static uint_t zfs_scan_mem_lim_soft_fact = 20; + +/* minimum milliseconds to scrub per txg */ +static uint_t zfs_scrub_min_time_ms = 1000; + +/* minimum milliseconds to obsolete per txg */ +static uint_t zfs_obsolete_min_time_ms = 500; + +/* minimum milliseconds to free per txg */ +static uint_t zfs_free_min_time_ms = 1000; + +/* minimum milliseconds to resilver per txg */ +static uint_t zfs_resilver_min_time_ms = 3000; + +static uint_t zfs_scan_checkpoint_intval = 7200; /* in seconds */ int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */ -int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ -int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ -enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; +static int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ +static int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ +static const ddt_class_t zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; /* max number of blocks to free in a single TXG */ -unsigned long zfs_async_block_max_blocks = ULONG_MAX; +static uint64_t zfs_async_block_max_blocks = UINT64_MAX; /* max number of dedup blocks to free in a single TXG */ -unsigned long zfs_max_async_dedup_frees = 100000; +static uint64_t zfs_max_async_dedup_frees = 100000; -int zfs_resilver_disable_defer = 0; /* set to disable resilver deferring */ +/* set to disable resilver deferring */ +static int zfs_resilver_disable_defer = B_FALSE; /* * We wait a few txgs after importing a pool to begin scanning so that @@ -201,7 +231,10 @@ int zfs_resilver_disable_defer = 0; /* set to disable resilver deferring */ /* * Enable/disable the processing of the free_bpobj object. */ -int zfs_free_bpobj_enabled = 1; +static int zfs_free_bpobj_enabled = 1; + +/* Error blocks to be scrubbed in one txg. */ +static uint_t zfs_scrub_error_blocks_per_txg = 1 << 12; /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { @@ -219,9 +252,9 @@ typedef struct { /* * This controls what conditions are placed on dsl_scan_sync_state(): - * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0 - * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0. - * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise + * SYNC_OPTIONAL) write out scn_phys iff scn_queues_pending == 0 + * SYNC_MANDATORY) write out scn_phys always. scn_queues_pending must be 0. 
+ * SYNC_CACHED) if scn_queues_pending == 0, write out scn_phys. Otherwise * write out the scn_phys_cached version. * See dsl_scan_sync_state for details. */ @@ -264,7 +297,7 @@ typedef struct scan_io { * event of an error. This array must go at the end of the * struct to allow this for the variable number of elements. */ - dva_t sio_dva[0]; + dva_t sio_dva[]; } scan_io_t; #define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x) @@ -279,12 +312,14 @@ typedef struct scan_io { struct dsl_scan_io_queue { dsl_scan_t *q_scn; /* associated dsl_scan_t */ vdev_t *q_vd; /* top-level vdev that this queue represents */ + zio_t *q_zio; /* scn_zio_root child for waiting on IO */ /* trees used for sorting I/Os and extents of I/Os */ range_tree_t *q_exts_by_addr; - zfs_btree_t q_exts_by_size; + zfs_btree_t q_exts_by_size; avl_tree_t q_sios_by_addr; uint64_t q_sio_memused; + uint64_t q_last_ext_addr; /* members for zio rate limiting */ uint64_t q_maxinflight_bytes; @@ -392,25 +427,25 @@ dsl_scan_resilvering(dsl_pool_t *dp) static inline void sio2bp(const scan_io_t *sio, blkptr_t *bp) { - bzero(bp, sizeof (*bp)); + memset(bp, 0, sizeof (*bp)); bp->blk_prop = sio->sio_blk_prop; - bp->blk_phys_birth = sio->sio_phys_birth; - bp->blk_birth = sio->sio_birth; + BP_SET_PHYSICAL_BIRTH(bp, sio->sio_phys_birth); + BP_SET_LOGICAL_BIRTH(bp, sio->sio_birth); bp->blk_fill = 1; /* we always only work with data pointers */ bp->blk_cksum = sio->sio_cksum; ASSERT3U(sio->sio_nr_dvas, >, 0); ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); - bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t)); + memcpy(bp->blk_dva, sio->sio_dva, sio->sio_nr_dvas * sizeof (dva_t)); } static inline void bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) { sio->sio_blk_prop = bp->blk_prop; - sio->sio_phys_birth = bp->blk_phys_birth; - sio->sio_birth = bp->blk_birth; + sio->sio_phys_birth = BP_GET_PHYSICAL_BIRTH(bp); + sio->sio_birth = BP_GET_LOGICAL_BIRTH(bp); sio->sio_cksum = bp->blk_cksum; sio->sio_nr_dvas = BP_GET_NDVAS(bp); @@ -447,14 +482,16 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) /* * Calculate the max number of in-flight bytes for pool-wide - * scanning operations (minimum 1MB). Limits for the issuing - * phase are done per top-level vdev and are handled separately. + * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max). + * Limits for the issuing phase are done per top-level vdev and + * are handled separately. */ - scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit * - dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20); + scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20, + zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa))); avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), offsetof(scan_ds_t, sds_node)); + mutex_init(&scn->scn_queue_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, sizeof (scan_prefetch_issue_ctx_t), offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); @@ -481,8 +518,16 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) &scn->scn_phys.scn_queue_obj); } else { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRORSCRUB, sizeof (uint64_t), + ERRORSCRUB_PHYS_NUMINTS, &scn->errorscrub_phys); + + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys); + /* * Detect if the pool contains the signature of #2094. 
If it * does properly update the scn->scn_phys structure and notify @@ -507,7 +552,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) return (EOVERFLOW); } - bcopy(zaptmp, &scn->scn_phys, + memcpy(&scn->scn_phys, zaptmp, SCAN_PHYS_NUMINTS * sizeof (uint64_t)); scn->scn_phys.scn_flags = overflow; @@ -529,7 +574,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) * counter to how far we've scanned. We know we're consistent * up to here. */ - scn->scn_issued_before_pass = scn->scn_phys.scn_examined; + scn->scn_issued_before_pass = scn->scn_phys.scn_examined - + scn->scn_phys.scn_skipped; if (dsl_scan_is_running(scn) && spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { @@ -566,7 +612,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) } } - bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); + memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys)); /* reload the queue into the in-core state */ if (scn->scn_phys.scn_queue_obj != 0) { @@ -585,6 +631,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) } spa_scan_stat_init(spa); + vdev_scan_stat_init(spa->spa_root_vdev); + return (0); } @@ -599,6 +647,7 @@ dsl_scan_fini(dsl_pool_t *dp) scan_ds_queue_clear(scn); avl_destroy(&scn->scn_queue); + mutex_destroy(&scn->scn_queue_lock); scan_ds_prefetch_queue_clear(scn); avl_destroy(&scn->scn_prefetch_queue); @@ -631,18 +680,96 @@ dsl_scan_scrubbing(const dsl_pool_t *dp) } boolean_t +dsl_errorscrubbing(const dsl_pool_t *dp) +{ + dsl_errorscrub_phys_t *errorscrub_phys = &dp->dp_scan->errorscrub_phys; + + return (errorscrub_phys->dep_state == DSS_ERRORSCRUBBING && + errorscrub_phys->dep_func == POOL_SCAN_ERRORSCRUB); +} + +boolean_t +dsl_errorscrub_is_paused(const dsl_scan_t *scn) +{ + return (dsl_errorscrubbing(scn->scn_dp) && + scn->errorscrub_phys.dep_paused_flags); +} + +boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn) { return (dsl_scan_scrubbing(scn->scn_dp) && scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED); } +static void +dsl_errorscrub_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) +{ + scn->errorscrub_phys.dep_cursor = + zap_cursor_serialize(&scn->errorscrub_cursor); + + VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRORSCRUB, sizeof (uint64_t), ERRORSCRUB_PHYS_NUMINTS, + &scn->errorscrub_phys, tx)); +} + +static void +dsl_errorscrub_setup_sync(void *arg, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + pool_scan_func_t *funcp = arg; + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + + ASSERT(!dsl_scan_is_running(scn)); + ASSERT(!dsl_errorscrubbing(scn->scn_dp)); + ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); + + memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys)); + scn->errorscrub_phys.dep_func = *funcp; + scn->errorscrub_phys.dep_state = DSS_ERRORSCRUBBING; + scn->errorscrub_phys.dep_start_time = gethrestime_sec(); + scn->errorscrub_phys.dep_to_examine = spa_get_last_errlog_size(spa); + scn->errorscrub_phys.dep_examined = 0; + scn->errorscrub_phys.dep_errors = 0; + scn->errorscrub_phys.dep_cursor = 0; + zap_cursor_init_serialized(&scn->errorscrub_cursor, + spa->spa_meta_objset, spa->spa_errlog_last, + scn->errorscrub_phys.dep_cursor); + + vdev_config_dirty(spa->spa_root_vdev); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_START); + + dsl_errorscrub_sync_state(scn, tx); + + spa_history_log_internal(spa, "error scrub setup", tx, + "func=%u mintxg=%u maxtxg=%llu", + *funcp, 0, (u_longlong_t)tx->tx_txg); +} + +static int +dsl_errorscrub_setup_check(void *arg, 
dmu_tx_t *tx) +{ + (void) arg; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + + if (dsl_scan_is_running(scn) || (dsl_errorscrubbing(scn->scn_dp))) { + return (SET_ERROR(EBUSY)); + } + + if (spa_get_last_errlog_size(scn->scn_dp->dp_spa) == 0) { + return (ECANCELED); + } + return (0); +} + /* * Writes out a persistent dsl_scan_phys_t record to the pool directory. * Because we can be running in the block sorting algorithm, we do not always * want to write out the record, only when it is "safe" to do so. This safety * condition is achieved by making sure that the sorting queues are empty - * (scn_bytes_pending == 0). When this condition is not true, the sync'd state + * (scn_queues_pending == 0). When this condition is not true, the sync'd state * is inconsistent with how much actual scanning progress has been made. The * kind of sync to be performed is specified by the sync_type argument. If the * sync is optional, we only sync if the queues are empty. If the sync is @@ -665,8 +792,8 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) int i; spa_t *spa = scn->scn_dp->dp_spa; - ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0); - if (scn->scn_bytes_pending == 0) { + ASSERT(sync_type != SYNC_MANDATORY || scn->scn_queues_pending == 0); + if (scn->scn_queues_pending == 0) { for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue; @@ -688,7 +815,7 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys, tx)); - bcopy(&scn->scn_phys, &scn->scn_phys_cached, + memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys)); if (scn->scn_checkpointing) @@ -705,14 +832,15 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) } } -/* ARGSUSED */ int dsl_scan_setup_check(void *arg, dmu_tx_t *tx) { + (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; - if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd)) + if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd) || + dsl_errorscrubbing(scn->scn_dp)) return (SET_ERROR(EBUSY)); return (0); @@ -721,6 +849,7 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx) void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { + (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; pool_scan_func_t *funcp = arg; dmu_object_type_t ot = 0; @@ -729,7 +858,15 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) ASSERT(!dsl_scan_is_running(scn)); ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); - bzero(&scn->scn_phys, sizeof (scn->scn_phys)); + memset(&scn->scn_phys, 0, sizeof (scn->scn_phys)); + + /* + * If we are starting a fresh scrub, we erase the error scrub + * information from disk. 
+ */ + memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys)); + dsl_errorscrub_sync_state(scn, tx); + scn->scn_phys.scn_func = *funcp; scn->scn_phys.scn_state = DSS_SCANNING; scn->scn_phys.scn_min_txg = 0; @@ -744,6 +881,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) scn->scn_last_checkpoint = 0; scn->scn_checkpointing = B_FALSE; spa_scan_stat_init(spa); + vdev_scan_stat_init(spa->spa_root_vdev); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; @@ -791,13 +929,19 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) /* back to the generic stuff */ - if (dp->dp_blkstats == NULL) { - dp->dp_blkstats = - vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); - mutex_init(&dp->dp_blkstats->zab_lock, NULL, - MUTEX_DEFAULT, NULL); + if (zfs_scan_blkstats) { + if (dp->dp_blkstats == NULL) { + dp->dp_blkstats = + vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); + } + memset(&dp->dp_blkstats->zab_type, 0, + sizeof (dp->dp_blkstats->zab_type)); + } else { + if (dp->dp_blkstats) { + vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); + dp->dp_blkstats = NULL; + } } - bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type)); if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) ot = DMU_OT_ZAP_OTHER; @@ -805,7 +949,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); - bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); + memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys)); dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); @@ -816,8 +960,9 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) } /* - * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. - * Can also be called to resume a paused scrub. + * Called by ZFS_IOC_POOL_SCRUB and ZFS_IOC_POOL_SCAN ioctl to start a scrub, + * error scrub or resilver. Can also be called to resume a paused scrub or + * error scrub. */ int dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) @@ -843,6 +988,26 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) return (0); } + if (func == POOL_SCAN_ERRORSCRUB) { + if (dsl_errorscrub_is_paused(dp->dp_scan)) { + /* + * got error scrub start cmd, resume paused error scrub. 
+ */ + int err = dsl_scrub_set_pause_resume(scn->scn_dp, + POOL_SCRUB_NORMAL); + if (err == 0) { + spa_event_notify(spa, NULL, NULL, + ESC_ZFS_ERRORSCRUB_RESUME); + return (ECANCELED); + } + return (SET_ERROR(err)); + } + + return (dsl_sync_task(spa_name(dp->dp_spa), + dsl_errorscrub_setup_check, dsl_errorscrub_setup_sync, + &func, 0, ZFS_SPACE_CHECK_RESERVED)); + } + if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { /* got scrub start cmd, resume paused scrub */ int err = dsl_scrub_set_pause_resume(scn->scn_dp, @@ -851,7 +1016,6 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); return (SET_ERROR(ECANCELED)); } - return (SET_ERROR(err)); } @@ -859,7 +1023,33 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } -/* ARGSUSED */ +static void +dsl_errorscrub_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + + if (complete) { + spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_FINISH); + spa_history_log_internal(spa, "error scrub done", tx, + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); + } else { + spa_history_log_internal(spa, "error scrub canceled", tx, + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); + } + + scn->errorscrub_phys.dep_state = complete ? DSS_FINISHED : DSS_CANCELED; + spa->spa_scrub_active = B_FALSE; + spa_errlog_rotate(spa); + scn->errorscrub_phys.dep_end_time = gethrestime_sec(); + zap_cursor_fini(&scn->errorscrub_cursor); + + if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB) + spa->spa_errata = 0; + + ASSERT(!dsl_errorscrubbing(scn->scn_dp)); +} + static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) { @@ -920,13 +1110,13 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) if (dsl_scan_restarting(scn, tx)) spa_history_log_internal(spa, "scan aborted, restarting", tx, - "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); else if (!complete) spa_history_log_internal(spa, "scan cancelled", tx, - "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); else spa_history_log_internal(spa, "scan done", tx, - "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { spa->spa_scrub_active = B_FALSE; @@ -989,7 +1179,7 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) { spa_history_log_internal(spa, "starting deferred resilver", tx, "errors=%llu", - (u_longlong_t)spa_get_errlog_size(spa)); + (u_longlong_t)spa_approx_errlog_size(spa)); spa_async_request(spa, SPA_ASYNC_RESILVER); } @@ -1006,10 +1196,96 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) ASSERT(!dsl_scan_is_running(scn)); } -/* ARGSUSED */ +static int +dsl_errorscrub_pause_resume_check(void *arg, dmu_tx_t *tx) +{ + pool_scrub_cmd_t *cmd = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_scan_t *scn = dp->dp_scan; + + if (*cmd == POOL_SCRUB_PAUSE) { + /* + * can't pause an error scrub when there is no in-progress + * error scrub.
+ */ + if (!dsl_errorscrubbing(dp)) + return (SET_ERROR(ENOENT)); + + /* can't pause a paused error scrub */ + if (dsl_errorscrub_is_paused(scn)) + return (SET_ERROR(EBUSY)); + } else if (*cmd != POOL_SCRUB_NORMAL) { + return (SET_ERROR(ENOTSUP)); + } + + return (0); +} + +static void +dsl_errorscrub_pause_resume_sync(void *arg, dmu_tx_t *tx) +{ + pool_scrub_cmd_t *cmd = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + + if (*cmd == POOL_SCRUB_PAUSE) { + spa->spa_scan_pass_errorscrub_pause = gethrestime_sec(); + scn->errorscrub_phys.dep_paused_flags = B_TRUE; + dsl_errorscrub_sync_state(scn, tx); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_PAUSED); + } else { + ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); + if (dsl_errorscrub_is_paused(scn)) { + /* + * We need to keep track of how much time we spend + * paused per pass so that we can adjust the error scrub + * rate shown in the output of 'zpool status'. + */ + spa->spa_scan_pass_errorscrub_spent_paused += + gethrestime_sec() - + spa->spa_scan_pass_errorscrub_pause; + + spa->spa_scan_pass_errorscrub_pause = 0; + scn->errorscrub_phys.dep_paused_flags = B_FALSE; + + zap_cursor_init_serialized( + &scn->errorscrub_cursor, + spa->spa_meta_objset, spa->spa_errlog_last, + scn->errorscrub_phys.dep_cursor); + + dsl_errorscrub_sync_state(scn, tx); + } + } +} + +static int +dsl_errorscrub_cancel_check(void *arg, dmu_tx_t *tx) +{ + (void) arg; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + /* can't cancel an error scrub when there is none in progress */ + if (!dsl_errorscrubbing(scn->scn_dp)) + return (SET_ERROR(ENOENT)); + return (0); +} + +static void +dsl_errorscrub_cancel_sync(void *arg, dmu_tx_t *tx) +{ + (void) arg; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + + dsl_errorscrub_done(scn, B_FALSE, tx); + dsl_errorscrub_sync_state(scn, tx); + spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, + ESC_ZFS_ERRORSCRUB_ABORT); +} + static int dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) { + (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if (!dsl_scan_is_running(scn)) @@ -1017,10 +1293,10 @@ dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) return (0); } -/* ARGSUSED */ static void dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) { + (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; dsl_scan_done(scn, B_FALSE, tx); @@ -1031,6 +1307,11 @@ dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) int dsl_scan_cancel(dsl_pool_t *dp) { + if (dsl_errorscrubbing(dp)) { + return (dsl_sync_task(spa_name(dp->dp_spa), + dsl_errorscrub_cancel_check, dsl_errorscrub_cancel_sync, + NULL, 3, ZFS_SPACE_CHECK_RESERVED)); + } return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); } @@ -1097,6 +1378,12 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) int dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) { + if (dsl_errorscrubbing(dp)) { + return (dsl_sync_task(spa_name(dp->dp_spa), + dsl_errorscrub_pause_resume_check, + dsl_errorscrub_pause_resume_sync, &cmd, 3, + ZFS_SPACE_CHECK_RESERVED)); + } return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3, ZFS_SPACE_CHECK_RESERVED)); @@ -1204,7 +1491,7 @@ scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx) dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ?
DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER; - ASSERT0(scn->scn_bytes_pending); + ASSERT0(scn->scn_queues_pending); ASSERT(scn->scn_phys.scn_queue_obj != 0); VERIFY0(dmu_object_free(dp->dp_meta_objset, @@ -1275,9 +1562,13 @@ dsl_scan_should_clear(dsl_scan_t *scn) mutex_enter(&tvd->vdev_scan_io_queue_lock); queue = tvd->vdev_scan_io_queue; if (queue != NULL) { - /* # extents in exts_by_size = # in exts_by_addr */ + /* + * # of extents in exts_by_addr = # in exts_by_size. + * B-tree efficiency is ~75%, but can be as low as 50%. + */ mused += zfs_btree_numnodes(&queue->q_exts_by_size) * - sizeof (range_seg_gap_t) + queue->q_sio_memused; + ((sizeof (range_seg_gap_t) + sizeof (uint64_t)) * + 3 / 2) + queue->q_sio_memused; } mutex_exit(&tvd->vdev_scan_io_queue_lock); } @@ -1285,7 +1576,7 @@ dsl_scan_should_clear(dsl_scan_t *scn) dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused); if (mused == 0) - ASSERT0(scn->scn_bytes_pending); + ASSERT0(scn->scn_queues_pending); /* * If we are above our hard limit, we need to clear out memory. @@ -1335,12 +1626,13 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; uint64_t sync_time_ns = curr_time_ns - scn->scn_dp->dp_spa->spa_sync_starttime; - int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; - int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? + uint64_t dirty_min_bytes = zfs_dirty_data_max * + zfs_vdev_async_write_active_min_dirty_percent / 100; + uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; if ((NSEC2MSEC(scan_time_ns) > mintime && - (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || + (scn->scn_dp->dp_dirty_total >= dirty_min_bytes || txg_sync_waiting(scn->scn_dp) || NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || spa_shutting_down(scn->scn_dp->dp_spa) || @@ -1378,16 +1670,52 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) return (B_FALSE); } +static boolean_t +dsl_error_scrub_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) +{ + /* + * We suspend if: + * - we have scrubbed for at least the minimum time (default 1 sec + * for error scrub), someone is explicitly waiting for this txg + * to complete, or we have used up all of the time in the txg + * timeout (default 5 sec). + * or + * - the spa is shutting down because this pool is being exported + * or the machine is rebooting. 
+ */ + uint64_t curr_time_ns = gethrtime(); + uint64_t error_scrub_time_ns = curr_time_ns - scn->scn_sync_start_time; + uint64_t sync_time_ns = curr_time_ns - + scn->scn_dp->dp_spa->spa_sync_starttime; + int mintime = zfs_scrub_min_time_ms; + + if ((NSEC2MSEC(error_scrub_time_ns) > mintime && + (txg_sync_waiting(scn->scn_dp) || + NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || + spa_shutting_down(scn->scn_dp->dp_spa)) { + if (zb) { + dprintf("error scrub suspending at bookmark " + "%llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + } + return (B_TRUE); + } + return (B_FALSE); +} + typedef struct zil_scan_arg { dsl_pool_t *zsa_dp; zil_header_t *zsa_zh; } zil_scan_arg_t; -/* ARGSUSED */ static int dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) { + (void) zilog; zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; @@ -1395,7 +1723,8 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, zbookmark_phys_t zb; ASSERT(!BP_IS_REDACTED(bp)); - if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + if (BP_IS_HOLE(bp) || + BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) return (0); /* @@ -1404,7 +1733,8 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, * (on-disk) even if it hasn't been claimed (even though for * scrub there's nothing to do to it). */ - if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa)) + if (claim_txg == 0 && + BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa)) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], @@ -1414,11 +1744,11 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, return (0); } -/* ARGSUSED */ static int dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, uint64_t claim_txg) { + (void) zilog; if (lrc->lrc_txtype == TX_WRITE) { zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; @@ -1430,7 +1760,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, ASSERT(!BP_IS_REDACTED(bp)); if (BP_IS_HOLE(bp) || - bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) return (0); /* @@ -1438,9 +1768,10 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, * already txg sync'ed (but this log block contains * other records that are not synced) */ - if (claim_txg == 0 || bp->blk_birth < claim_txg) + if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg) return (0); + ASSERT3U(BP_GET_LSIZE(bp), !=, 0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); @@ -1491,7 +1822,7 @@ scan_prefetch_queue_compare(const void *a, const void *b) } static void -scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag) +scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, const void *tag) { if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) { zfs_refcount_destroy(&spc->spc_refcnt); @@ -1500,7 +1831,7 @@ scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag) } static scan_prefetch_ctx_t * -scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag) +scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, const void *tag) { scan_prefetch_ctx_t *spc; @@ -1522,7 +1853,7 @@ scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag) } static void -scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag) 
+scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, const void *tag) { zfs_refcount_add(&spc->spc_refcnt, tag); } @@ -1576,7 +1907,8 @@ dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp)) return; - if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg || + if (BP_IS_HOLE(bp) || + BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) return; @@ -1643,6 +1975,7 @@ static void dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *private) { + (void) zio; scan_prefetch_ctx_t *spc = private; dsl_scan_t *scn = spc->spc_scn; spa_t *spa = scn->scn_dp->dp_spa; @@ -1687,6 +2020,11 @@ dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, zb->zb_objset, DMU_META_DNODE_OBJECT); if (OBJSET_BUF_HAS_USERUSED(buf)) { + if (OBJSET_BUF_HAS_PROJECTUSED(buf)) { + dsl_scan_prefetch_dnode(scn, + &osp->os_projectused_dnode, zb->zb_objset, + DMU_PROJECTUSED_OBJECT); + } dsl_scan_prefetch_dnode(scn, &osp->os_groupused_dnode, zb->zb_objset, DMU_GROUPUSED_OBJECT); @@ -1702,7 +2040,6 @@ out: scan_prefetch_ctx_rele(spc, scn); } -/* ARGSUSED */ static void dsl_scan_prefetch_thread(void *arg) { @@ -1748,10 +2085,16 @@ dsl_scan_prefetch_thread(void *arg) zio_flags |= ZIO_FLAG_RAW; } + /* We don't need data L1 buffer since we do not prefetch L0. */ + blkptr_t *bp = &spic->spic_bp; + if (BP_GET_LEVEL(bp) == 1 && BP_GET_TYPE(bp) != DMU_OT_DNODE && + BP_GET_TYPE(bp) != DMU_OT_OBJSET) + flags |= ARC_FLAG_NO_BUF; + /* issue the prefetch asynchronously */ - (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, - &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc, - ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb); + (void) arc_read(scn->scn_zio_root, spa, bp, + dsl_scan_prefetch_cb, spic->spic_spc, ZIO_PRIORITY_SCRUB, + zio_flags, &flags, &spic->spic_zb); kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); } @@ -1788,24 +2131,23 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, /* * If we found the block we're trying to resume from, or - * we went past it to a different object, zero it out to - * indicate that it's OK to start checking for suspending - * again. + * we went past it, zero it out to indicate that it's OK + * to start checking for suspending again. 
*/ - if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || - zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { + if (zbookmark_subtree_tbd(dnp, zb, + &scn->scn_phys.scn_bookmark)) { dprintf("resuming at %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); - bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); + memset(&scn->scn_phys.scn_bookmark, 0, sizeof (*zb)); } } return (B_FALSE); } -static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, +static void dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx); inline __attribute__((always_inline)) static void dsl_scan_visitdnode( @@ -1822,11 +2164,25 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, const zbookmark_phys_t *zb, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; int err; ASSERT(!BP_IS_REDACTED(bp)); + /* + * There is an unlikely case of encountering dnodes with contradicting + * dn_bonuslen and DNODE_FLAG_SPILL_BLKPTR flag before in files created + * or modified before commit 4254acb was merged. As it is not possible + * to know which of the two is correct, report an error. + */ + if (dnp != NULL && + dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) { + scn->scn_phys.scn_errors++; + spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); + return (SET_ERROR(EINVAL)); + } + if (BP_GET_LEVEL(bp) > 0) { arc_flags_t flags = ARC_FLAG_WAIT; int i; @@ -1834,7 +2190,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; - err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, + err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; @@ -1862,7 +2218,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, zio_flags |= ZIO_FLAG_RAW; } - err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, + err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; @@ -1881,7 +2237,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, objset_phys_t *osp; arc_buf_t *buf; - err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, + err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; @@ -1912,6 +2268,15 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, DMU_USERUSED_OBJECT, tx); } arc_buf_destroy(buf, &buf); + } else if (!zfs_blkptr_verify(spa, bp, + BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { + /* + * Sanity check the block pointer contents, this is handled + * by arc_read() for the cases above. + */ + scn->scn_phys.scn_errors++; + spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); + return (SET_ERROR(EINVAL)); } return (0); @@ -1947,12 +2312,11 @@ dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, * first 5; we want them to be useful. 
*/ static void -dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, +dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; - blkptr_t *bp_toread = NULL; if (dsl_scan_check_suspend(scn, zb)) return; @@ -1962,19 +2326,6 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, scn->scn_visited_this_txg++; - /* - * This debugging is commented out to conserve stack space. This - * function is called recursively and the debugging adds several - * bytes to the stack for each call. It can be commented back in - * if required to debug an issue in dsl_scan_visitbp(). - * - * dprintf_bp(bp, - * "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p", - * ds, ds ? ds->ds_object : 0, - * zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, - * bp); - */ - if (BP_IS_HOLE(bp)) { scn->scn_holes_this_txg++; return; @@ -1986,16 +2337,28 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, return; } - if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { + /* + * Check if this block contradicts any filesystem flags. + */ + spa_feature_t f = SPA_FEATURE_LARGE_BLOCKS; + if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) + ASSERT(dsl_dataset_feature_is_active(ds, f)); + + f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); + if (f != SPA_FEATURE_NONE) + ASSERT(dsl_dataset_feature_is_active(ds, f)); + + f = zio_compress_to_feature(BP_GET_COMPRESS(bp)); + if (f != SPA_FEATURE_NONE) + ASSERT(dsl_dataset_feature_is_active(ds, f)); + + if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) { scn->scn_lt_min_this_txg++; return; } - bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); - *bp_toread = *bp; - - if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0) - goto out; + if (dsl_scan_recurse(scn, ds, ostype, dnp, bp, zb, tx) != 0) + return; /* * If dsl_scan_ddt() has already visited this block, it will have @@ -2005,7 +2368,7 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, if (ddt_class_contains(dp->dp_spa, scn->scn_phys.scn_ddt_class_max, bp)) { scn->scn_ddt_contained_this_txg++; - goto out; + return; } /* @@ -2015,15 +2378,12 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, * Don't scan it now unless we need to because something * under it was modified. 
*/ - if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { + if (BP_GET_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { scn->scn_gt_max_this_txg++; - goto out; + return; } scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); - -out: - kmem_free(bp_toread, sizeof (blkptr_t)); } static void @@ -2340,7 +2700,6 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) dsl_scan_sync_state(scn, tx, SYNC_CACHED); } -/* ARGSUSED */ static int enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { @@ -2366,8 +2725,10 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) return (err); ds = prev; } + mutex_enter(&scn->scn_queue_lock); scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); + mutex_exit(&scn->scn_queue_lock); dsl_dataset_rele(ds, FTAG); return (0); } @@ -2525,10 +2886,10 @@ out: dsl_dataset_rele(ds, FTAG); } -/* ARGSUSED */ static int enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { + (void) arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; @@ -2558,22 +2919,23 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) ds = prev; } + mutex_enter(&scn->scn_queue_lock); scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); + mutex_exit(&scn->scn_queue_lock); dsl_dataset_rele(ds, FTAG); return (0); } -/* ARGSUSED */ void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, ddt_entry_t *dde, dmu_tx_t *tx) { + (void) tx; const ddt_key_t *ddk = &dde->dde_key; ddt_phys_t *ddp = dde->dde_phys; blkptr_t bp; zbookmark_phys_t zb = { 0 }; - int p; if (!dsl_scan_is_running(scn)) return; @@ -2592,7 +2954,7 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, if (scn->scn_done_txg != 0) return; - for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) continue; @@ -2609,7 +2971,7 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, * If there are N references to a deduped block, we don't want to scrub it * N times -- ideally, we should scrub it exactly once. * - * We leverage the fact that the dde's replication class (enum ddt_class) + * We leverage the fact that the dde's replication class (ddt_class_t) * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. * @@ -2640,12 +3002,10 @@ static void dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) { ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; - ddt_entry_t dde; + ddt_entry_t dde = {{{{0}}}}; int error; uint64_t n = 0; - bzero(&dde, sizeof (ddt_entry_t)); - while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { ddt_t *ddt; @@ -2708,7 +3068,6 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_visit_rootbp(scn, NULL, &dp->dp_meta_rootbp, tx); - spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); if (scn->scn_suspending) return; @@ -2738,7 +3097,7 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) * In case we suspended right at the end of the ds, zero the * bookmark so we don't think that we're still trying to resume. */ - bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t)); + memset(&scn->scn_phys.scn_bookmark, 0, sizeof (zbookmark_phys_t)); /* * Keep pulling things out of the dataset avl queue. 
Updates to the @@ -2777,8 +3136,9 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) } static uint64_t -dsl_scan_count_data_disks(vdev_t *rvd) +dsl_scan_count_data_disks(spa_t *spa) { + vdev_t *rvd = spa->spa_root_vdev; uint64_t i, leaves = 0; for (i = 0; i < rvd->vdev_children; i++) { @@ -2820,12 +3180,13 @@ scan_io_queue_check_suspend(dsl_scan_t *scn) uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; uint64_t sync_time_ns = curr_time_ns - scn->scn_dp->dp_spa->spa_sync_starttime; - int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; - int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? + uint64_t dirty_min_bytes = zfs_dirty_data_max * + zfs_vdev_async_write_active_min_dirty_percent / 100; + uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; return ((NSEC2MSEC(scan_time_ns) > mintime && - (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || + (scn->scn_dp->dp_dirty_total >= dirty_min_bytes || txg_sync_waiting(scn->scn_dp) || NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || spa_shutting_down(scn->scn_dp->dp_spa)); @@ -2844,7 +3205,6 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) { dsl_scan_t *scn = queue->q_scn; scan_io_t *sio; - int64_t bytes_issued = 0; boolean_t suspended = B_FALSE; while ((sio = list_head(io_list)) != NULL) { @@ -2856,16 +3216,12 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) } sio2bp(sio, &bp); - bytes_issued += SIO_GET_ASIZE(sio); scan_exec_io(scn->scn_dp, &bp, sio->sio_flags, &sio->sio_zb, queue); (void) list_remove_head(io_list); scan_io_queues_update_zio_stats(queue, &bp); sio_free(sio); } - - atomic_add_64(&scn->scn_bytes_pending, -bytes_issued); - return (suspended); } @@ -2910,6 +3266,8 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); avl_remove(&queue->q_sios_by_addr, sio); + if (avl_is_empty(&queue->q_sios_by_addr)) + atomic_add_64(&queue->q_scn->scn_queues_pending, -1); queue->q_sio_memused -= SIO_GET_MUSED(sio); bytes_issued += SIO_GET_ASIZE(sio); @@ -2931,12 +3289,13 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) range_tree_resize_segment(queue->q_exts_by_addr, rs, SIO_GET_OFFSET(sio), rs_get_end(rs, queue->q_exts_by_addr) - SIO_GET_OFFSET(sio)); - + queue->q_last_ext_addr = SIO_GET_OFFSET(sio); return (B_TRUE); } else { uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr); uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr); range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart); + queue->q_last_ext_addr = -1; return (B_FALSE); } } @@ -2961,31 +3320,8 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); ASSERT(scn->scn_is_sorted); - /* handle tunable overrides */ - if (scn->scn_checkpointing || scn->scn_clearing) { - if (zfs_scan_issue_strategy == 1) { - return (range_tree_first(rt)); - } else if (zfs_scan_issue_strategy == 2) { - /* - * We need to get the original entry in the by_addr - * tree so we can modify it. 
- */ - range_seg_t *size_rs = - zfs_btree_first(&queue->q_exts_by_size, NULL); - if (size_rs == NULL) - return (NULL); - uint64_t start = rs_get_start(size_rs, rt); - uint64_t size = rs_get_end(size_rs, rt) - start; - range_seg_t *addr_rs = range_tree_find(rt, start, - size); - ASSERT3P(addr_rs, !=, NULL); - ASSERT3U(rs_get_start(size_rs, rt), ==, - rs_get_start(addr_rs, rt)); - ASSERT3U(rs_get_end(size_rs, rt), ==, - rs_get_end(addr_rs, rt)); - return (addr_rs); - } - } + if (!scn->scn_checkpointing && !scn->scn_clearing) + return (NULL); /* * During normal clearing, we want to issue our largest segments @@ -2996,28 +3332,42 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) * so the way we are sorted now is as good as it will ever get. * In this case, we instead switch to issuing extents in LBA order. */ - if (scn->scn_checkpointing) { + if ((zfs_scan_issue_strategy < 1 && scn->scn_checkpointing) || + zfs_scan_issue_strategy == 1) return (range_tree_first(rt)); - } else if (scn->scn_clearing) { - /* - * We need to get the original entry in the by_addr - * tree so we can modify it. - */ - range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size, - NULL); - if (size_rs == NULL) - return (NULL); - uint64_t start = rs_get_start(size_rs, rt); - uint64_t size = rs_get_end(size_rs, rt) - start; - range_seg_t *addr_rs = range_tree_find(rt, start, size); - ASSERT3P(addr_rs, !=, NULL); - ASSERT3U(rs_get_start(size_rs, rt), ==, rs_get_start(addr_rs, - rt)); - ASSERT3U(rs_get_end(size_rs, rt), ==, rs_get_end(addr_rs, rt)); - return (addr_rs); - } else { - return (NULL); + + /* + * Try to continue previous extent if it is not completed yet. After + * shrink in scan_io_queue_gather() it may no longer be the best, but + * otherwise we leave shorter remnant every txg. + */ + uint64_t start; + uint64_t size = 1ULL << rt->rt_shift; + range_seg_t *addr_rs; + if (queue->q_last_ext_addr != -1) { + start = queue->q_last_ext_addr; + addr_rs = range_tree_find(rt, start, size); + if (addr_rs != NULL) + return (addr_rs); } + + /* + * Nothing to continue, so find new best extent. + */ + uint64_t *v = zfs_btree_first(&queue->q_exts_by_size, NULL); + if (v == NULL) + return (NULL); + queue->q_last_ext_addr = start = *v << rt->rt_shift; + + /* + * We need to get the original entry in the by_addr tree so we can + * modify it. + */ + addr_rs = range_tree_find(rt, start, size); + ASSERT3P(addr_rs, !=, NULL); + ASSERT3U(rs_get_start(addr_rs, rt), ==, start); + ASSERT3U(rs_get_end(addr_rs, rt), >, start); + return (addr_rs); } static void @@ -3026,15 +3376,19 @@ scan_io_queues_run_one(void *arg) dsl_scan_io_queue_t *queue = arg; kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; boolean_t suspended = B_FALSE; - range_seg_t *rs = NULL; - scan_io_t *sio = NULL; + range_seg_t *rs; + scan_io_t *sio; + zio_t *zio; list_t sio_list; ASSERT(queue->q_scn->scn_is_sorted); list_create(&sio_list, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_list_node)); + zio = zio_null(queue->q_scn->scn_zio_root, queue->q_scn->scn_dp->dp_spa, + NULL, NULL, NULL, ZIO_FLAG_CANFAIL); mutex_enter(q_lock); + queue->q_zio = zio; /* Calculate maximum in-flight bytes for this vdev. 
*/ queue->q_maxinflight_bytes = MAX(1, zfs_scan_vdev_limit * @@ -3049,12 +3403,12 @@ scan_io_queues_run_one(void *arg) /* loop until we run out of time or sios */ while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) { uint64_t seg_start = 0, seg_end = 0; - boolean_t more_left = B_TRUE; + boolean_t more_left; ASSERT(list_is_empty(&sio_list)); /* loop while we still have sios left to process in this rs */ - while (more_left) { + do { scan_io_t *first_sio, *last_sio; /* @@ -3083,7 +3437,7 @@ scan_io_queues_run_one(void *arg) if (suspended) break; - } + } while (more_left); /* update statistics for debugging purposes */ scan_io_queues_update_seg_stats(queue, seg_start, seg_end); @@ -3096,12 +3450,12 @@ scan_io_queues_run_one(void *arg) * If we were suspended in the middle of processing, * requeue any unfinished sios and exit. */ - while ((sio = list_head(&sio_list)) != NULL) { - list_remove(&sio_list, sio); + while ((sio = list_remove_head(&sio_list)) != NULL) scan_io_queue_insert_impl(queue, sio); - } + queue->q_zio = NULL; mutex_exit(q_lock); + zio_nowait(zio); list_destroy(&sio_list); } @@ -3122,7 +3476,7 @@ scan_io_queues_run(dsl_scan_t *scn) ASSERT(scn->scn_is_sorted); ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); - if (scn->scn_bytes_pending == 0) + if (scn->scn_queues_pending == 0) return; if (scn->scn_taskq == NULL) { @@ -3290,6 +3644,19 @@ dsl_scan_active(dsl_scan_t *scn) return ((used != 0) || (clones_left)); } +boolean_t +dsl_errorscrub_active(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + if (spa->spa_load_state != SPA_LOAD_NONE) + return (B_FALSE); + if (spa_shutting_down(spa)) + return (B_FALSE); + if (dsl_errorscrubbing(scn->scn_dp)) + return (B_TRUE); + return (B_FALSE); +} + static boolean_t dsl_scan_check_deferred(vdev_t *vd) { @@ -3439,11 +3806,12 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) scn->scn_dedup_frees_this_txg = 0; /* - * Write out changes to the DDT that may be required as a - * result of the blocks freed. This ensures that the DDT - * is clean when a scrub/resilver runs. + * Write out changes to the DDT and the BRT that may be required + * as a result of the blocks freed. This ensures that the DDT + * and the BRT are clean when a scrub/resilver runs. */ ddt_sync(spa, tx->tx_txg); + brt_sync(spa, tx->tx_txg); } if (err != 0) return (err); @@ -3505,6 +3873,387 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) return (0); } +static void +name_to_bookmark(char *buf, zbookmark_phys_t *zb) +{ + zb->zb_objset = zfs_strtonum(buf, &buf); + ASSERT(*buf == ':'); + zb->zb_object = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zb->zb_level = (int)zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zb->zb_blkid = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == '\0'); +} + +static void +name_to_object(char *buf, uint64_t *obj) +{ + *obj = zfs_strtonum(buf, &buf); + ASSERT(*buf == '\0'); +} + +static void +read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb) +{ + dsl_pool_t *dp = scn->scn_dp; + dsl_dataset_t *ds; + objset_t *os; + if (dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds) != 0) + return; + + if (dmu_objset_from_ds(ds, &os) != 0) { + dsl_dataset_rele(ds, FTAG); + return; + } + + /* + * If the key is not loaded dbuf_dnode_findbp() will error out with + * EACCES. However in that case dnode_hold() will eventually call + * dbuf_read()->zio_wait() which may call spa_log_error(). This will + * lead to a deadlock due to us holding the mutex spa_errlist_lock. 
+ * Avoid this by checking here if the keys are loaded, if not return. + * If the keys are not loaded the head_errlog feature is meaningless + * as we cannot figure out the birth txg of the block pointer. + */ + if (dsl_dataset_get_keystatus(ds->ds_dir) == + ZFS_KEYSTATUS_UNAVAILABLE) { + dsl_dataset_rele(ds, FTAG); + return; + } + + dnode_t *dn; + blkptr_t bp; + + if (dnode_hold(os, zb.zb_object, FTAG, &dn) != 0) { + dsl_dataset_rele(ds, FTAG); + return; + } + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + int error = dbuf_dnode_findbp(dn, zb.zb_level, zb.zb_blkid, &bp, NULL, + NULL); + + if (error) { + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); + return; + } + + if (!error && BP_IS_HOLE(&bp)) { + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); + return; + } + + int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | + ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB; + + /* If it's an intent log block, failure is expected. */ + if (zb.zb_level == ZB_ZIL_LEVEL) + zio_flags |= ZIO_FLAG_SPECULATIVE; + + ASSERT(!BP_IS_EMBEDDED(&bp)); + scan_exec_io(dp, &bp, zio_flags, &zb, NULL); + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); +} + +/* + * We keep track of the scrubbed error blocks in "count". This will be used + * when deciding whether we exceeded zfs_scrub_error_blocks_per_txg. This + * function is modelled after check_filesystem(). + */ +static int +scrub_filesystem(spa_t *spa, uint64_t fs, zbookmark_err_phys_t *zep, + int *count) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_scan_t *scn = dp->dp_scan; + + int error = dsl_dataset_hold_obj(dp, fs, FTAG, &ds); + if (error != 0) + return (error); + + uint64_t latest_txg; + uint64_t txg_to_consider = spa->spa_syncing_txg; + boolean_t check_snapshot = B_TRUE; + + error = find_birth_txg(ds, zep, &latest_txg); + + /* + * If find_birth_txg() errors out, then err on the side of caution and + * proceed. In worst case scenario scrub all objects. If zep->zb_birth + * is 0 (e.g. in case of encryption with unloaded keys) also proceed to + * scrub all objects. + */ + if (error == 0 && zep->zb_birth == latest_txg) { + /* Block neither free nor re written. */ + zbookmark_phys_t zb; + zep_to_zb(fs, zep, &zb); + scn->scn_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + /* We have already acquired the config lock for spa */ + read_by_block_level(scn, zb); + + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + scn->errorscrub_phys.dep_examined++; + scn->errorscrub_phys.dep_to_examine--; + (*count)++; + if ((*count) == zfs_scrub_error_blocks_per_txg || + dsl_error_scrub_check_suspend(scn, &zb)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EFAULT)); + } + + check_snapshot = B_FALSE; + } else if (error == 0) { + txg_to_consider = latest_txg; + } + + /* + * Retrieve the number of snapshots if the dataset is not a snapshot. + */ + uint64_t snap_count = 0; + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { + + error = zap_count(spa->spa_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); + + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + } + + if (snap_count == 0) { + /* Filesystem without snapshots. 
*/ + dsl_dataset_rele(ds, FTAG); + return (0); + } + + uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + + dsl_dataset_rele(ds, FTAG); + + /* Check only snapshots created from this file system. */ + while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && + snap_obj_txg <= txg_to_consider) { + + error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds); + if (error != 0) + return (error); + + if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != fs) { + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + dsl_dataset_rele(ds, FTAG); + continue; + } + + boolean_t affected = B_TRUE; + if (check_snapshot) { + uint64_t blk_txg; + error = find_birth_txg(ds, zep, &blk_txg); + + /* + * Scrub the snapshot also when zb_birth == 0 or when + * find_birth_txg() returns an error. + */ + affected = (error == 0 && zep->zb_birth == blk_txg) || + (error != 0) || (zep->zb_birth == 0); + } + + /* Scrub snapshots. */ + if (affected) { + zbookmark_phys_t zb; + zep_to_zb(snap_obj, zep, &zb); + scn->scn_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + /* We have already acquired the config lock for spa */ + read_by_block_level(scn, zb); + + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + scn->errorscrub_phys.dep_examined++; + scn->errorscrub_phys.dep_to_examine--; + (*count)++; + if ((*count) == zfs_scrub_error_blocks_per_txg || + dsl_error_scrub_check_suspend(scn, &zb)) { + dsl_dataset_rele(ds, FTAG); + return (EFAULT); + } + } + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + dsl_dataset_rele(ds, FTAG); + } + return (0); +} + +void +dsl_errorscrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) +{ + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + + /* + * Only process scans in sync pass 1. + */ + + if (spa_sync_pass(spa) > 1) + return; + + /* + * If the spa is shutting down, then stop scanning. This will + * ensure that the scan does not dirty any new data during the + * shutdown phase. + */ + if (spa_shutting_down(spa)) + return; + + if (!dsl_errorscrub_active(scn) || dsl_errorscrub_is_paused(scn)) { + return; + } + + if (dsl_scan_resilvering(scn->scn_dp)) { + /* cancel the error scrub if resilver started */ + dsl_scan_cancel(scn->scn_dp); + return; + } + + spa->spa_scrub_active = B_TRUE; + scn->scn_sync_start_time = gethrtime(); + + /* + * zfs_scan_suspend_progress can be set to disable scrub progress. + * See more detailed comment in dsl_scan_sync(). 
+ */ + if (zfs_scan_suspend_progress) { + uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time; + int mintime = zfs_scrub_min_time_ms; + + while (zfs_scan_suspend_progress && + !txg_sync_waiting(scn->scn_dp) && + !spa_shutting_down(scn->scn_dp->dp_spa) && + NSEC2MSEC(scan_time_ns) < mintime) { + delay(hz); + scan_time_ns = gethrtime() - scn->scn_sync_start_time; + } + return; + } + + int i = 0; + zap_attribute_t *za; + zbookmark_phys_t *zb; + boolean_t limit_exceeded = B_FALSE; + + za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + zb = kmem_zalloc(sizeof (zbookmark_phys_t), KM_SLEEP); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0; + zap_cursor_advance(&scn->errorscrub_cursor)) { + name_to_bookmark(za->za_name, zb); + + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); + dsl_pool_config_enter(dp, FTAG); + read_by_block_level(scn, *zb); + dsl_pool_config_exit(dp, FTAG); + + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + scn->errorscrub_phys.dep_examined += 1; + scn->errorscrub_phys.dep_to_examine -= 1; + i++; + if (i == zfs_scrub_error_blocks_per_txg || + dsl_error_scrub_check_suspend(scn, zb)) { + limit_exceeded = B_TRUE; + break; + } + } + + if (!limit_exceeded) + dsl_errorscrub_done(scn, B_TRUE, tx); + + dsl_errorscrub_sync_state(scn, tx); + kmem_free(za, sizeof (*za)); + kmem_free(zb, sizeof (*zb)); + return; + } + + int error = 0; + for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0; + zap_cursor_advance(&scn->errorscrub_cursor)) { + + zap_cursor_t *head_ds_cursor; + zap_attribute_t *head_ds_attr; + zbookmark_err_phys_t head_ds_block; + + head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + head_ds_attr = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + + uint64_t head_ds_err_obj = za->za_first_integer; + uint64_t head_ds; + name_to_object(za->za_name, &head_ds); + boolean_t config_held = B_FALSE; + uint64_t top_affected_fs; + + for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset, + head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor, + head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) { + + name_to_errphys(head_ds_attr->za_name, &head_ds_block); + + /* + * In case we are called from spa_sync the pool + * config is already held. + */ + if (!dsl_pool_config_held(dp)) { + dsl_pool_config_enter(dp, FTAG); + config_held = B_TRUE; + } + + error = find_top_affected_fs(spa, + head_ds, &head_ds_block, &top_affected_fs); + if (error) + break; + + error = scrub_filesystem(spa, top_affected_fs, + &head_ds_block, &i); + + if (error == SET_ERROR(EFAULT)) { + limit_exceeded = B_TRUE; + break; + } + } + + zap_cursor_fini(head_ds_cursor); + kmem_free(head_ds_cursor, sizeof (*head_ds_cursor)); + kmem_free(head_ds_attr, sizeof (*head_ds_attr)); + + if (config_held) + dsl_pool_config_exit(dp, FTAG); + } + + kmem_free(za, sizeof (*za)); + kmem_free(zb, sizeof (*zb)); + if (!limit_exceeded) + dsl_errorscrub_done(scn, B_TRUE, tx); + + dsl_errorscrub_sync_state(scn, tx); +} + /* * This is the primary entry point for scans that is called from syncing * context. Scans must happen entirely during syncing context so that we @@ -3608,8 +4357,9 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) */ if (zfs_scan_suspend_progress) { uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time; - int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? 
- zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; + uint_t mintime = (scn->scn_phys.scn_func == + POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms : + zfs_scrub_min_time_ms; while (zfs_scan_suspend_progress && !txg_sync_waiting(scn->scn_dp) && @@ -3622,6 +4372,16 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) } /* + * Disabled by default, set zfs_scan_report_txgs to report + * average performance over the last zfs_scan_report_txgs TXGs. + */ + if (zfs_scan_report_txgs != 0 && + tx->tx_txg % zfs_scan_report_txgs == 0) { + scn->scn_issued_before_pass += spa->spa_scan_pass_issued; + spa_scan_stat_init(spa); + } + + /* * It is possible to switch from unsorted to sorted at any time, * but afterwards the scan will remain sorted unless reloaded from * a checkpoint after a reboot. @@ -3680,12 +4440,13 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) taskqid_t prefetch_tqid; /* - * Recalculate the max number of in-flight bytes for pool-wide - * scanning operations (minimum 1MB). Limits for the issuing - * phase are done per top-level vdev and are handled separately. + * Calculate the max number of in-flight bytes for pool-wide + * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max). + * Limits for the issuing phase are done per top-level vdev and + * are handled separately. */ - scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit * - dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20); + scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20, + zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa))); if (scnp->scn_ddt_bookmark.ddb_class <= scnp->scn_ddt_class_max) { @@ -3749,12 +4510,15 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (scn->scn_is_sorted) { scn->scn_checkpointing = B_TRUE; scn->scn_clearing = B_TRUE; + scn->scn_issued_before_pass += + spa->spa_scan_pass_issued; + spa_scan_stat_init(spa); } zfs_dbgmsg("scan complete for %s txg %llu", spa->spa_name, (longlong_t)tx->tx_txg); } - } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) { + } else if (scn->scn_is_sorted && scn->scn_queues_pending != 0) { ASSERT(scn->scn_clearing); /* need to issue scrubbing IOs from per-vdev queues */ @@ -3784,7 +4548,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) spa->spa_name); ASSERT3U(scn->scn_done_txg, !=, 0); ASSERT0(spa->spa_scrub_inflight); - ASSERT0(scn->scn_bytes_pending); + ASSERT0(scn->scn_queues_pending); dsl_scan_done(scn, B_TRUE, tx); sync_type = SYNC_MANDATORY; } @@ -3793,10 +4557,8 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) } static void -count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) +count_block_issued(spa_t *spa, const blkptr_t *bp, boolean_t all) { - int i; - /* * Don't count embedded bp's, since we already did the work of * scanning these when we scanned the containing block. @@ -3811,18 +4573,22 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) * zio code will only try the first one unless there is an issue. * Therefore, we should only count the first DVA for these IOs. */ - if (scn->scn_is_sorted) { - atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued, - DVA_GET_ASIZE(&bp->blk_dva[0])); - } else { - spa_t *spa = scn->scn_dp->dp_spa; + atomic_add_64(&spa->spa_scan_pass_issued, + all ? 
BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0])); +} - for (i = 0; i < BP_GET_NDVAS(bp); i++) { - atomic_add_64(&spa->spa_scan_pass_issued, - DVA_GET_ASIZE(&bp->blk_dva[i])); - } - } +static void +count_block_skipped(dsl_scan_t *scn, const blkptr_t *bp, boolean_t all) +{ + if (BP_IS_EMBEDDED(bp)) + return; + atomic_add_64(&scn->scn_phys.scn_skipped, + all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0])); +} +static void +count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) +{ /* * If we resume after a reboot, zab will be NULL; don't record * incomplete stats in that case. @@ -3830,9 +4596,7 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) if (zab == NULL) return; - mutex_enter(&zab->zab_lock); - - for (i = 0; i < 4; i++) { + for (int i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; @@ -3867,28 +4631,27 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) break; } } - - mutex_exit(&zab->zab_lock); } static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) { avl_index_t idx; - int64_t asize = SIO_GET_ASIZE(sio); dsl_scan_t *scn = queue->q_scn; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + if (unlikely(avl_is_empty(&queue->q_sios_by_addr))) + atomic_add_64(&scn->scn_queues_pending, 1); if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) { /* block is already scheduled for reading */ - atomic_add_64(&scn->scn_bytes_pending, -asize); sio_free(sio); return; } avl_insert(&queue->q_sios_by_addr, sio, idx); queue->q_sio_memused += SIO_GET_MUSED(sio); - range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize); + range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), + SIO_GET_ASIZE(sio)); } /* @@ -3901,7 +4664,6 @@ static void scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, int zio_flags, const zbookmark_phys_t *zb) { - dsl_scan_t *scn = queue->q_scn; scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp)); ASSERT0(BP_IS_GANG(bp)); @@ -3911,13 +4673,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, sio->sio_flags = zio_flags; sio->sio_zb = *zb; - /* - * Increment the bytes pending counter now so that we can't - * get an integer underflow in case the worker processes the - * zio before we get to incrementing this counter. - */ - atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio)); - + queue->q_last_ext_addr = -1; scan_io_queue_insert_impl(queue, sio); } @@ -3967,15 +4723,15 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; - uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); + uint64_t phys_birth = BP_GET_BIRTH(bp); size_t psize = BP_GET_PSIZE(bp); boolean_t needs_io = B_FALSE; int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; - + count_block(dp->dp_blkstats, bp); if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) { - count_block(scn, dp->dp_blkstats, bp); + count_block_skipped(scn, bp, B_TRUE); return (0); } @@ -4003,8 +4759,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, * Keep track of how much data we've examined so that * zpool(8) status can make useful progress reports. 
*/ - scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva); - spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva); + uint64_t asize = DVA_GET_ASIZE(dva); + scn->scn_phys.scn_examined += asize; + spa->spa_scan_pass_exam += asize; /* if it's a resilver, this may not be in the target range */ if (!needs_io) @@ -4015,7 +4772,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, if (needs_io && !zfs_no_scrub_io) { dsl_scan_enqueue(dp, bp, zio_flags, zb); } else { - count_block(scn, dp->dp_blkstats, bp); + count_block_skipped(scn, bp, B_TRUE); } /* do not relocate this block */ @@ -4047,7 +4804,14 @@ dsl_scan_scrub_done(zio_t *zio) if (zio->io_error && (zio->io_error != ECKSUM || !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { - atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors); + if (dsl_errorscrubbing(spa->spa_dsl_pool) && + !dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) { + atomic_inc_64(&spa->spa_dsl_pool->dp_scan + ->errorscrub_phys.dep_errors); + } else { + atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys + .scn_errors); + } } } @@ -4066,6 +4830,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, dsl_scan_t *scn = dp->dp_scan; size_t size = BP_GET_PSIZE(bp); abd_t *data = abd_alloc_for_io(size, B_FALSE); + zio_t *pio; if (queue == NULL) { ASSERT3U(scn->scn_maxinflight_bytes, >, 0); @@ -4074,6 +4839,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight += BP_GET_PSIZE(bp); mutex_exit(&spa->spa_scrub_lock); + pio = scn->scn_zio_root; } else { kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; @@ -4082,12 +4848,14 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes) cv_wait(&queue->q_zio_cv, q_lock); queue->q_inflight_bytes += BP_GET_PSIZE(bp); + pio = queue->q_zio; mutex_exit(q_lock); } - count_block(scn, dp->dp_blkstats, bp); - zio_nowait(zio_read(scn->scn_zio_root, spa, bp, data, size, - dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb)); + ASSERT(pio != NULL); + count_block_issued(spa, bp, queue == NULL); + zio_nowait(zio_read(pio, spa, bp, data, size, dsl_scan_scrub_done, + queue, ZIO_PRIORITY_SCRUB, zio_flags, zb)); } /* @@ -4121,33 +4889,93 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, * extents that are more completely filled (in a 3:2 ratio) vs just larger. * Note that as an optimization, we replace multiplication and division by * 100 with bitshifting by 7 (which effectively multiplies and divides by 128). + * + * Since we do not care if one extent is only few percent better than another, + * compress the score into 6 bits via binary logarithm AKA highbit64() and + * put into otherwise unused due to ashift high bits of offset. This allows + * to reduce q_exts_by_size B-tree elements to only 64 bits and compare them + * with single operation. Plus it makes scrubs more sequential and reduces + * chances that minor extent change move it within the B-tree. 
*/ +__attribute__((always_inline)) inline static int ext_size_compare(const void *x, const void *y) { - const range_seg_gap_t *rsa = x, *rsb = y; + const uint64_t *a = x, *b = y; - uint64_t sa = rsa->rs_end - rsa->rs_start; - uint64_t sb = rsb->rs_end - rsb->rs_start; - uint64_t score_a, score_b; + return (TREE_CMP(*a, *b)); +} - score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) * - fill_weight * rsa->rs_fill) >> 7); - score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) * - fill_weight * rsb->rs_fill) >> 7); +ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t, + ext_size_compare) - if (score_a > score_b) - return (-1); - if (score_a == score_b) { - if (rsa->rs_start < rsb->rs_start) - return (-1); - if (rsa->rs_start == rsb->rs_start) - return (0); - return (1); - } - return (1); +static void +ext_size_create(range_tree_t *rt, void *arg) +{ + (void) rt; + zfs_btree_t *size_tree = arg; + + zfs_btree_create(size_tree, ext_size_compare, ext_size_find_in_buf, + sizeof (uint64_t)); } +static void +ext_size_destroy(range_tree_t *rt, void *arg) +{ + (void) rt; + zfs_btree_t *size_tree = arg; + ASSERT0(zfs_btree_numnodes(size_tree)); + + zfs_btree_destroy(size_tree); +} + +static uint64_t +ext_size_value(range_tree_t *rt, range_seg_gap_t *rsg) +{ + (void) rt; + uint64_t size = rsg->rs_end - rsg->rs_start; + uint64_t score = rsg->rs_fill + ((((rsg->rs_fill << 7) / size) * + fill_weight * rsg->rs_fill) >> 7); + ASSERT3U(rt->rt_shift, >=, 8); + return (((uint64_t)(64 - highbit64(score)) << 56) | rsg->rs_start); +} + +static void +ext_size_add(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + zfs_btree_t *size_tree = arg; + ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP); + uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs); + zfs_btree_add(size_tree, &v); +} + +static void +ext_size_remove(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + zfs_btree_t *size_tree = arg; + ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP); + uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs); + zfs_btree_remove(size_tree, &v); +} + +static void +ext_size_vacate(range_tree_t *rt, void *arg) +{ + zfs_btree_t *size_tree = arg; + zfs_btree_clear(size_tree); + zfs_btree_destroy(size_tree); + + ext_size_create(rt, arg); +} + +static const range_tree_ops_t ext_size_ops = { + .rtop_create = ext_size_create, + .rtop_destroy = ext_size_destroy, + .rtop_add = ext_size_add, + .rtop_remove = ext_size_remove, + .rtop_vacate = ext_size_vacate +}; + /* * Comparator for the q_sios_by_addr tree. Sorting is simply performed * based on LBA-order (from lowest to highest). 
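Aside on the comparator change above (not part of the diff): the new q_exts_by_size ordering works by packing each extent's fill-weighted score, reduced to 6 bits via highbit64(), into the top byte of a 64-bit key, with the start offset (kept in ashift-sized units, so its top byte is guaranteed free) in the low bits; a single unsigned compare then orders extents best-first and breaks ties by LBA. The following is a minimal standalone sketch of that packing scheme only; the helper names, the weight value of 3, and the loop-based highbit64() are assumptions for illustration and do not appear in the patch.

#include <stdint.h>
#include <stdio.h>

/* 1-based index of the highest set bit; 0 for v == 0 */
static int
highbit64(uint64_t v)
{
	int h = 0;

	while (v != 0) {
		v >>= 1;
		h++;
	}
	return (h);
}

/*
 * Pack a fill-weighted score (as a 6-bit "distance" from the top bit) and
 * the extent start into one 64-bit key.  Smaller keys mean better extents,
 * so an ascending B-tree walk yields the best extent first.
 */
static uint64_t
pack_ext_key(uint64_t start, uint64_t size, uint64_t fill, uint64_t weight)
{
	uint64_t score = fill + ((((fill << 7) / size) * weight * fill) >> 7);

	return (((uint64_t)(64 - highbit64(score)) << 56) | start);
}

int
main(void)
{
	/* a densely filled 1 MiB extent vs. a sparsely filled 4 MiB extent */
	uint64_t dense = pack_ext_key(1ULL << 20, 1ULL << 20, 1ULL << 20, 3);
	uint64_t sparse = pack_ext_key(8ULL << 20, 4ULL << 20, 1ULL << 18, 3);

	printf("dense  key = 0x%016llx\n", (unsigned long long)dense);
	printf("sparse key = 0x%016llx\n", (unsigned long long)sparse);
	printf("dense issued first: %s\n", dense < sparse ? "yes" : "no");
	return (0);
}

With these example numbers the densely filled extent gets the smaller key and is therefore fetched first, which matches the intent described in the comment above: minor score changes move an extent very little within the B-tree, and equally scored extents are issued in LBA order.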
@@ -4170,9 +4998,10 @@ scan_io_queue_create(vdev_t *vd) q->q_scn = scn; q->q_vd = vd; q->q_sio_memused = 0; + q->q_last_ext_addr = -1; cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); - q->q_exts_by_addr = range_tree_create_impl(&rt_btree_ops, RANGE_SEG_GAP, - &q->q_exts_by_size, 0, 0, ext_size_compare, zfs_scan_max_ext_gap); + q->q_exts_by_addr = range_tree_create_gap(&ext_size_ops, RANGE_SEG_GAP, + &q->q_exts_by_size, 0, vd->vdev_ashift, zfs_scan_max_ext_gap); avl_create(&q->q_sios_by_addr, sio_addr_compare, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); @@ -4190,21 +5019,20 @@ dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue) dsl_scan_t *scn = queue->q_scn; scan_io_t *sio; void *cookie = NULL; - int64_t bytes_dequeued = 0; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + if (!avl_is_empty(&queue->q_sios_by_addr)) + atomic_add_64(&scn->scn_queues_pending, -1); while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) != NULL) { ASSERT(range_tree_contains(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio))); - bytes_dequeued += SIO_GET_ASIZE(sio); queue->q_sio_memused -= SIO_GET_MUSED(sio); sio_free(sio); } ASSERT0(queue->q_sio_memused); - atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued); range_tree_vacate(queue->q_exts_by_addr, NULL, queue); range_tree_destroy(queue->q_exts_by_addr); avl_destroy(&queue->q_sios_by_addr); @@ -4300,28 +5128,22 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) sio_free(srch_sio); if (sio != NULL) { - int64_t asize = SIO_GET_ASIZE(sio); blkptr_t tmpbp; /* Got it while it was cold in the queue */ ASSERT3U(start, ==, SIO_GET_OFFSET(sio)); - ASSERT3U(size, ==, asize); + ASSERT3U(size, ==, SIO_GET_ASIZE(sio)); avl_remove(&queue->q_sios_by_addr, sio); + if (avl_is_empty(&queue->q_sios_by_addr)) + atomic_add_64(&scn->scn_queues_pending, -1); queue->q_sio_memused -= SIO_GET_MUSED(sio); ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); range_tree_remove_fill(queue->q_exts_by_addr, start, size); - /* - * We only update scn_bytes_pending in the cold path, - * otherwise it will already have been accounted for as - * part of the zio's execution. 
- */ - atomic_add_64(&scn->scn_bytes_pending, -asize); - - /* count the block as though we issued it */ + /* count the block as though we skipped it */ sio2bp(sio, &tmpbp); - count_block(scn, dp->dp_blkstats, &tmpbp); + count_block_skipped(scn, &tmpbp, B_FALSE); sio_free(sio); } @@ -4379,20 +5201,19 @@ dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd) spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, U64, ZMOD_RW, "Max bytes in flight per leaf vdev for scrubs and resilvers"); -ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, UINT, ZMOD_RW, "Min millisecs to scrub per txg"); -ZFS_MODULE_PARAM(zfs, zfs_, obsolete_min_time_ms, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, obsolete_min_time_ms, UINT, ZMOD_RW, "Min millisecs to obsolete per txg"); -ZFS_MODULE_PARAM(zfs, zfs_, free_min_time_ms, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, free_min_time_ms, UINT, ZMOD_RW, "Min millisecs to free per txg"); -ZFS_MODULE_PARAM(zfs, zfs_, resilver_min_time_ms, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, resilver_min_time_ms, UINT, ZMOD_RW, "Min millisecs to resilver per txg"); ZFS_MODULE_PARAM(zfs, zfs_, scan_suspend_progress, INT, ZMOD_RW, @@ -4404,40 +5225,48 @@ ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_io, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_prefetch, INT, ZMOD_RW, "Set to disable scrub prefetching"); -ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, U64, ZMOD_RW, "Max number of blocks freed in one txg"); -ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, U64, ZMOD_RW, "Max number of dedup blocks freed in one txg"); ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW, "Enable processing of the free_bpobj"); -ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_fact, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, scan_blkstats, INT, ZMOD_RW, + "Enable block statistics calculation during scrub"); + +ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_fact, UINT, ZMOD_RW, "Fraction of RAM for scan hard limit"); -ZFS_MODULE_PARAM(zfs, zfs_, scan_issue_strategy, INT, ZMOD_RW, - "IO issuing strategy during scrubbing. " - "0 = default, 1 = LBA, 2 = size"); +ZFS_MODULE_PARAM(zfs, zfs_, scan_issue_strategy, UINT, ZMOD_RW, + "IO issuing strategy during scrubbing. 
0 = default, 1 = LBA, 2 = size"); ZFS_MODULE_PARAM(zfs, zfs_, scan_legacy, INT, ZMOD_RW, "Scrub using legacy non-sequential method"); -ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, UINT, ZMOD_RW, "Scan progress on-disk checkpointing interval"); -ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, U64, ZMOD_RW, "Max gap in bytes between sequential scrub / resilver I/Os"); -ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, UINT, ZMOD_RW, "Fraction of hard limit used as soft limit"); ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW, "Tunable to attempt to reduce lock contention"); -ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, UINT, ZMOD_RW, "Tunable to adjust bias towards more filled segments during scans"); +ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW, + "Tunable to report resilver performance over the last N txgs"); + ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW, "Process all resilvers immediately"); + +ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_per_txg, UINT, ZMOD_RW, + "Error blocks to be scrubbed in one txg"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dsl_synctask.c b/sys/contrib/openzfs/module/zfs/dsl_synctask.c index 148e8fff2437..409e12884d91 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_synctask.c +++ b/sys/contrib/openzfs/module/zfs/dsl_synctask.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -32,10 +32,10 @@ #define DST_AVG_BLKSHIFT 14 -/* ARGSUSED */ static int dsl_null_checkfunc(void *arg, dmu_tx_t *tx) { + (void) arg, (void) tx; return (0); } diff --git a/sys/contrib/openzfs/module/zfs/dsl_userhold.c b/sys/contrib/openzfs/module/zfs/dsl_userhold.c index 75d153194a00..75953f70f926 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_userhold.c +++ b/sys/contrib/openzfs/module/zfs/dsl_userhold.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -115,7 +115,7 @@ dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx) pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) { dsl_dataset_t *ds; int error = 0; - char *htag, *name; + const char *htag, *name; /* must be a snapshot */ name = nvpair_name(pair); @@ -346,7 +346,7 @@ dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist) return (ret); } -typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, void *tag, +typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, const void *tag, dsl_dataset_t **dsp); typedef struct dsl_dataset_user_release_arg { @@ -359,7 +359,7 @@ typedef struct dsl_dataset_user_release_arg { /* Place a dataset hold on the snapshot identified by passed dsobj string */ static int -dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, void *tag, +dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, const void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_hold_obj(dp, zfs_strtonum(dsobj, NULL), tag, dsp)); @@ -572,7 +572,7 @@ dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist, { dsl_dataset_user_release_arg_t ddura; nvpair_t *pair; - char *pool; + const char *pool; int error; pair = nvlist_next_nvpair(holds, NULL); diff --git a/sys/contrib/openzfs/module/zfs/edonr_zfs.c b/sys/contrib/openzfs/module/zfs/edonr_zfs.c index aa00e1c9417e..db21c9cf197c 100644 --- a/sys/contrib/openzfs/module/zfs/edonr_zfs.c +++ b/sys/contrib/openzfs/module/zfs/edonr_zfs.c @@ -45,7 +45,6 @@ edonr_incremental(void *buf, size_t size, void *arg) /* * Native zio_checksum interface for the Edon-R hash function. */ -/*ARGSUSED*/ void abd_checksum_edonr_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) @@ -54,10 +53,10 @@ abd_checksum_edonr_native(abd_t *abd, uint64_t size, EdonRState ctx; ASSERT(ctx_template != NULL); - bcopy(ctx_template, &ctx, sizeof (ctx)); + memcpy(&ctx, ctx_template, sizeof (ctx)); (void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx); EdonRFinal(&ctx, digest); - bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word)); + memcpy(zcp->zc_word, digest, sizeof (zcp->zc_word)); } /* @@ -89,18 +88,17 @@ abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt) * size by double-hashing it (the new salt block will be composed of * H(salt) || H(H(salt))). */ - CTASSERT(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8)); - EdonRHash(EDONR_MODE, salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8, - salt_block); - EdonRHash(EDONR_MODE, salt_block, EDONR_MODE, salt_block + - EDONR_MODE / 8); + _Static_assert(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8), + "Edon-R block size mismatch"); + EdonRHash(salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8, salt_block); + EdonRHash(salt_block, EDONR_MODE, salt_block + EDONR_MODE / 8); /* * Feed the new salt block into the hash function - this will serve * as our MAC key. 
*/ ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); - EdonRInit(ctx, EDONR_MODE); + EdonRInit(ctx); EdonRUpdate(ctx, salt_block, sizeof (salt_block) * 8); return (ctx); } @@ -108,8 +106,8 @@ abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt) void abd_checksum_edonr_tmpl_free(void *ctx_template) { - EdonRState *ctx = ctx_template; + EdonRState *ctx = ctx_template; - bzero(ctx, sizeof (*ctx)); + memset(ctx, 0, sizeof (*ctx)); kmem_free(ctx, sizeof (*ctx)); } diff --git a/sys/contrib/openzfs/module/zfs/fm.c b/sys/contrib/openzfs/module/zfs/fm.c index b8a1c7c8a5ca..77d87b694a43 100644 --- a/sys/contrib/openzfs/module/zfs/fm.c +++ b/sys/contrib/openzfs/module/zfs/fm.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -68,9 +68,9 @@ #include <sys/condvar.h> #include <sys/zfs_ioctl.h> -int zfs_zevent_len_max = 512; +static uint_t zfs_zevent_len_max = 512; -static int zevent_len_cur = 0; +static uint_t zevent_len_cur = 0; static int zevent_waiters = 0; static int zevent_flags = 0; @@ -148,8 +148,7 @@ zfs_zevent_drain(zevent_t *ev) list_remove(&zevent_list, ev); /* Remove references to this event in all private file data */ - while ((ze = list_head(&ev->ev_ze_list)) != NULL) { - list_remove(&ev->ev_ze_list, ze); + while ((ze = list_remove_head(&ev->ev_ze_list)) != NULL) { ze->ze_zevent = NULL; ze->ze_dropped++; } @@ -158,7 +157,7 @@ zfs_zevent_drain(zevent_t *ev) } void -zfs_zevent_drain_all(int *count) +zfs_zevent_drain_all(uint_t *count) { zevent_t *ev; @@ -380,8 +379,7 @@ zfs_zevent_wait(zfs_zevent_t *ze) break; } - error = cv_wait_sig(&zevent_cv, &zevent_lock); - if (signal_pending(current)) { + if (cv_wait_sig(&zevent_cv, &zevent_lock) == 0) { error = SET_ERROR(EINTR); break; } else if (!list_is_empty(&zevent_list)) { @@ -483,21 +481,21 @@ zfs_zevent_destroy(zfs_zevent_t *ze) /* * Wrappers for FM nvlist allocators */ -/* ARGSUSED */ static void * i_fm_alloc(nv_alloc_t *nva, size_t size) { - return (kmem_zalloc(size, KM_SLEEP)); + (void) nva; + return (kmem_alloc(size, KM_SLEEP)); } -/* ARGSUSED */ static void i_fm_free(nv_alloc_t *nva, void *buf, size_t size) { + (void) nva; kmem_free(buf, size); } -const nv_alloc_ops_t fm_mem_alloc_ops = { +static const nv_alloc_ops_t fm_mem_alloc_ops = { .nv_ao_init = NULL, .nv_ao_fini = NULL, .nv_ao_alloc = i_fm_alloc, @@ -702,7 +700,7 @@ i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap) case DATA_TYPE_STRING_ARRAY: nelem = va_arg(ap, int); ret = nvlist_add_string_array(payload, name, - va_arg(ap, char **), nelem); + va_arg(ap, const char **), nelem); break; case DATA_TYPE_NVLIST: ret = nvlist_add_nvlist(payload, name, @@ -711,7 +709,7 @@ i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap) case DATA_TYPE_NVLIST_ARRAY: nelem = va_arg(ap, int); ret = nvlist_add_nvlist_array(payload, name, - va_arg(ap, nvlist_t **), nelem); + va_arg(ap, const nvlist_t **), nelem); break; default: ret = EINVAL; @@ -867,8 +865,10 @@ fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth, } va_end(ap); - if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0) + if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, + (const nvlist_t **)pairs, npairs) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + } 
for (i = 0; i < npairs; i++) fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN); @@ -891,7 +891,7 @@ fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth, uint_t n; int i, j; va_list ap; - char *hcname, *hcid; + const char *hcname, *hcid; if (!fm_fmri_hc_set_common(fmri, version, auth)) return; @@ -953,6 +953,7 @@ fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth, } atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); + va_end(ap); return; } } @@ -961,8 +962,8 @@ fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth, /* * Create the fmri hc list */ - if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, - npairs + n) != 0) { + if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, + (const nvlist_t **)pairs, npairs + n) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } @@ -1128,7 +1129,7 @@ fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth, if (serial != NULL) { if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID, - (char **)&serial, 1) != 0) { + (const char **)&serial, 1) != 0) { atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); } @@ -1340,7 +1341,7 @@ fm_init(void) void fm_fini(void) { - int count; + uint_t count; zfs_ereport_fini(); @@ -1352,7 +1353,7 @@ fm_fini(void) zevent_flags |= ZEVENT_SHUTDOWN; while (zevent_waiters > 0) { mutex_exit(&zevent_lock); - schedule(); + kpreempt(KPREEMPT_SYNC); mutex_enter(&zevent_lock); } mutex_exit(&zevent_lock); @@ -1368,5 +1369,5 @@ fm_fini(void) } #endif /* _KERNEL */ -ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, len_max, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, len_max, UINT, ZMOD_RW, "Max event queue length"); diff --git a/sys/contrib/openzfs/module/zfs/gzip.c b/sys/contrib/openzfs/module/zfs/gzip.c index e2c6e59969d6..f3b19446352a 100644 --- a/sys/contrib/openzfs/module/zfs/gzip.c +++ b/sys/contrib/openzfs/module/zfs/gzip.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -28,7 +28,6 @@ #include <sys/debug.h> #include <sys/types.h> -#include <sys/strings.h> #include <sys/qat.h> #include <sys/zio_compress.h> @@ -66,7 +65,7 @@ gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) if (d_len != s_len) return (s_len); - bcopy(s_start, d_start, s_len); + memcpy(d_start, s_start, s_len); return (s_len); } /* if hardware compression fails, do it again with software */ @@ -76,17 +75,17 @@ gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) if (d_len != s_len) return (s_len); - bcopy(s_start, d_start, s_len); + memcpy(d_start, s_start, s_len); return (s_len); } return ((size_t)dstlen); } -/*ARGSUSED*/ int gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { + (void) n; zlen_t dstlen = d_len; ASSERT(d_len >= s_len); diff --git a/sys/contrib/openzfs/module/zfs/hkdf.c b/sys/contrib/openzfs/module/zfs/hkdf.c index 14265472df7d..580544c8ac1a 100644 --- a/sys/contrib/openzfs/module/zfs/hkdf.c +++ b/sys/contrib/openzfs/module/zfs/hkdf.c @@ -36,7 +36,6 @@ hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material, mech.cm_param_len = 0; /* initialize the salt as a crypto key */ - key.ck_format = CRYPTO_KEY_RAW; key.ck_length = CRYPTO_BYTES2BITS(salt_len); key.ck_data = salt; @@ -53,7 +52,7 @@ hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material, output_cd.cd_raw.iov_base = (char *)out_buf; output_cd.cd_raw.iov_len = output_cd.cd_length; - ret = crypto_mac(&mech, &input_cd, &key, NULL, &output_cd, NULL); + ret = crypto_mac(&mech, &input_cd, &key, NULL, &output_cd); if (ret != CRYPTO_SUCCESS) return (SET_ERROR(EIO)); @@ -83,7 +82,6 @@ hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len, mech.cm_param_len = 0; /* initialize the salt as a crypto key */ - key.ck_format = CRYPTO_KEY_RAW; key.ck_length = CRYPTO_BYTES2BITS(SHA512_DIGEST_LENGTH); key.ck_data = extract_key; @@ -110,19 +108,19 @@ hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len, T_cd.cd_length = T_len; T_cd.cd_raw.iov_len = T_cd.cd_length; - ret = crypto_mac_init(&mech, &key, NULL, &ctx, NULL); + ret = crypto_mac_init(&mech, &key, NULL, &ctx); if (ret != CRYPTO_SUCCESS) return (SET_ERROR(EIO)); - ret = crypto_mac_update(ctx, &T_cd, NULL); + ret = crypto_mac_update(ctx, &T_cd); if (ret != CRYPTO_SUCCESS) return (SET_ERROR(EIO)); - ret = crypto_mac_update(ctx, &info_cd, NULL); + ret = crypto_mac_update(ctx, &info_cd); if (ret != CRYPTO_SUCCESS) return (SET_ERROR(EIO)); - ret = crypto_mac_update(ctx, &c_cd, NULL); + ret = crypto_mac_update(ctx, &c_cd); if (ret != CRYPTO_SUCCESS) return (SET_ERROR(EIO)); @@ -130,11 +128,11 @@ hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len, T_cd.cd_length = T_len; T_cd.cd_raw.iov_len = T_cd.cd_length; - ret = crypto_mac_final(ctx, &T_cd, NULL); + ret = crypto_mac_final(ctx, &T_cd); if (ret != CRYPTO_SUCCESS) return (SET_ERROR(EIO)); - bcopy(T, out_buf + pos, + memcpy(out_buf + pos, T, (i != N) ? SHA512_DIGEST_LENGTH : (out_len - pos)); pos += SHA512_DIGEST_LENGTH; } diff --git a/sys/contrib/openzfs/module/zfs/lz4.c b/sys/contrib/openzfs/module/zfs/lz4.c index 9da9d9e00635..75a31bf17ea4 100644 --- a/sys/contrib/openzfs/module/zfs/lz4.c +++ b/sys/contrib/openzfs/module/zfs/lz4.c @@ -1,165 +1,50 @@ /* - * LZ4 - Fast LZ compression algorithm - * Header File - * Copyright (C) 2011-2013, Yann Collet. 
- * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-present, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ + +/* + * This file contains unmodified code from lz4 1.9.3's decompressor, plus + * associated macros and constants. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * It also contains a couple of defines from the old lz4.c to make things + * fit together smoothly. 
* - * You can contact the author at : - * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html - * - LZ4 source repository : http://code.google.com/p/lz4/ */ #include <sys/zfs_context.h> -#include <sys/zio_compress.h> -static int real_LZ4_compress(const char *source, char *dest, int isize, - int osize); -static int LZ4_uncompress_unknownOutputSize(const char *source, char *dest, +int LZ4_uncompress_unknownOutputSize(const char *source, char *dest, int isize, int maxOutputSize); -static int LZ4_compressCtx(void *ctx, const char *source, char *dest, - int isize, int osize); -static int LZ4_compress64kCtx(void *ctx, const char *source, char *dest, - int isize, int osize); - -static void *lz4_alloc(int flags); -static void lz4_free(void *ctx); - -/*ARGSUSED*/ -size_t -lz4_compress_zfs(void *s_start, void *d_start, size_t s_len, - size_t d_len, int n) -{ - uint32_t bufsiz; - char *dest = d_start; - - ASSERT(d_len >= sizeof (bufsiz)); - - bufsiz = real_LZ4_compress(s_start, &dest[sizeof (bufsiz)], s_len, - d_len - sizeof (bufsiz)); - - /* Signal an error if the compression routine returned zero. */ - if (bufsiz == 0) - return (s_len); - - /* - * The exact compressed size is needed by the decompression routine, - * so it is stored at the start of the buffer. Note that this may be - * less than the compressed block size, which is rounded up to a - * multiple of 1<<ashift. - */ - *(uint32_t *)dest = BE_32(bufsiz); - - return (bufsiz + sizeof (bufsiz)); -} - -/*ARGSUSED*/ -int -lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, - size_t d_len, int n) -{ - const char *src = s_start; - uint32_t bufsiz = BE_IN32(src); - - /* invalid compressed buffer size encoded at start */ - if (bufsiz + sizeof (bufsiz) > s_len) - return (1); - - /* - * Returns 0 on success (decompression function returned non-negative) - * and non-zero on failure (decompression function returned negative). - */ - return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)], - d_start, bufsiz, d_len) < 0); -} - -/* - * LZ4 API Description: - * - * Simple Functions: - * real_LZ4_compress() : - * isize : is the input size. Max supported value is ~1.9GB - * return : the number of bytes written in buffer dest - * or 0 if the compression fails (if LZ4_COMPRESSMIN is set). - * note : destination buffer must be already allocated. - * destination buffer must be sized to handle worst cases - * situations (input data not compressible) worst case size - * evaluation is provided by function LZ4_compressBound(). - * - * real_LZ4_uncompress() : - * osize : is the output size, therefore the original size - * return : the number of bytes read in the source buffer. - * If the source stream is malformed, the function will stop - * decoding and return a negative result, indicating the byte - * position of the faulty instruction. This function never - * writes beyond dest + osize, and is therefore protected - * against malicious data packets. - * note : destination buffer must be already allocated - * note : real_LZ4_uncompress() is not used in ZFS so its code - * is not present here. - * - * Advanced Functions - * - * LZ4_compressBound() : - * Provides the maximum size that LZ4 may output in a "worst case" - * scenario (input data not compressible) primarily useful for memory - * allocation of output buffer. - * - * isize : is the input size. 
Max supported value is ~1.9GB - * return : maximum output size in a "worst case" scenario - * note : this function is limited by "int" range (2^31-1) - * - * LZ4_uncompress_unknownOutputSize() : - * isize : is the input size, therefore the compressed size - * maxOutputSize : is the size of the destination buffer (which must be - * already allocated) - * return : the number of bytes decoded in the destination buffer - * (necessarily <= maxOutputSize). If the source stream is - * malformed, the function will stop decoding and return a - * negative result, indicating the byte position of the faulty - * instruction. This function never writes beyond dest + - * maxOutputSize, and is therefore protected against malicious - * data packets. - * note : Destination buffer must be already allocated. - * This version is slightly slower than real_LZ4_uncompress() - * - * LZ4_compressCtx() : - * This function explicitly handles the CTX memory structure. - * - * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated - * by the caller (either on the stack or using kmem_cache_alloc). Passing - * NULL isn't valid. - * - * LZ4_compress64kCtx() : - * Same as LZ4_compressCtx(), but specific to small inputs (<64KB). - * isize *Must* be <64KB, otherwise the output will be corrupted. - * - * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated - * by the caller (either on the stack or using kmem_cache_alloc). Passing - * NULL isn't valid. - */ /* * Tuning parameters @@ -186,26 +71,6 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, #define NOTCOMPRESSIBLE_CONFIRMATION 6 /* - * BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE: This will provide a boost to - * performance for big endian cpu, but the resulting compressed stream - * will be incompatible with little-endian CPU. You can set this option - * to 1 in situations where data will stay within closed environment. - * This option is useless on Little_Endian CPU (such as x86). - */ -/* #define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 */ - -/* - * CPU Feature Detection - */ - -/* 32 or 64 bits ? */ -#if defined(_LP64) -#define LZ4_ARCH64 1 -#else -#define LZ4_ARCH64 0 -#endif - -/* * Little Endian or Big Endian? * Note: overwrite the below #define if you know your architecture endianness. */ @@ -219,25 +84,44 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, #undef LZ4_BIG_ENDIAN #endif -/* - * Unaligned memory access is automatically enabled for "common" CPU, - * such as x86. For others CPU, the compiler will be more cautious, and - * insert extra code to ensure aligned access is respected. If you know - * your target CPU supports unaligned memory access, you may want to - * force this option manually to improve performance +/*-************************************ +* CPU Feature Detection +**************************************/ +/* LZ4_FORCE_MEMORY_ACCESS + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method is portable but violate C standard. 
+ * It can generate buggy code on targets which assembly generation depends on alignment. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. + * Prefer these methods in priority order (0 > 1 > 2) */ -#if defined(__ARM_FEATURE_UNALIGNED) -#define LZ4_FORCE_UNALIGNED_ACCESS 1 +#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally */ +# if defined(__GNUC__) && \ + ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) \ + || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define LZ4_FORCE_MEMORY_ACCESS 2 +# elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || defined(__GNUC__) +# define LZ4_FORCE_MEMORY_ACCESS 1 +# endif #endif /* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support hardware bit count + */ +/* * Illumos : we can't use GCC's __builtin_ctz family of builtins in the * kernel * Linux : we can use GCC's __builtin_ctz family of builtins in the * kernel */ #undef LZ4_FORCE_SW_BITCOUNT -#if defined(__sparc) +#if defined(__sunos__) #define LZ4_FORCE_SW_BITCOUNT #endif @@ -257,10 +141,50 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, #define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) -#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) -#define expect(expr, value) (__builtin_expect((expr), (value))) +#ifndef LZ4_FORCE_INLINE +# ifdef _MSC_VER /* Visual Studio */ +# define LZ4_FORCE_INLINE static __forceinline +# else +# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# ifdef __GNUC__ +# define LZ4_FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define LZ4_FORCE_INLINE static inline +# endif +# else +# define LZ4_FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +# endif /* _MSC_VER */ +#endif /* LZ4_FORCE_INLINE */ + +/* LZ4_FORCE_O2 and LZ4_FORCE_INLINE + * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8, + * together with a simple 8-byte copy loop as a fall-back path. + * However, this optimization hurts the decompression speed by >30%, + * because the execution does not go to the optimized loop + * for typical compressible data, and all of the preamble checks + * before going to the fall-back path become useless overhead. + * This optimization happens only with the -O3 flag, and -O2 generates + * a simple 8-byte copy loop. + * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8 + * functions are annotated with __attribute__((optimize("O2"))), + * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute + * of LZ4_wildCopy8 does not affect the compression speed. 
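The LZ4_FORCE_MEMORY_ACCESS discussion above is easiest to see with a 16-bit load. A minimal illustration of the three methods (the real LZ4_read16 definitions appear further down in this file); uint16_t and the packed-union name are just for this sketch:

#include <stdint.h>
#include <string.h>

/* Method 0: portable; the fixed-size memcpy is normally inlined away. */
static uint16_t
read16_m0(const void *p)
{
	uint16_t v;
	memcpy(&v, p, sizeof (v));
	return (v);
}

/* Method 1: packed type; relies on a compiler extension (gcc/icc). */
typedef union { uint16_t u16; } __attribute__((packed)) unalign16;
static uint16_t
read16_m1(const void *p)
{
	return (((const unalign16 *)p)->u16);
}

/*
 * Method 2: direct dereference; violates C alignment rules and can fault
 * or generate wrong code on strict-alignment targets.
 */
static uint16_t
read16_m2(const void *p)
{
	return (*(const uint16_t *)p);
}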
+ */ +#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__) +# define LZ4_FORCE_O2 __attribute__((optimize("O2"))) +# undef LZ4_FORCE_INLINE +# define LZ4_FORCE_INLINE static __inline __attribute__((optimize("O2"),always_inline)) #else -#define expect(expr, value) (expr) +# define LZ4_FORCE_O2 +#endif + +#ifndef expect +#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif #endif #ifndef likely @@ -271,814 +195,793 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, #define unlikely(expr) expect((expr) != 0, 0) #endif -#define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | \ - (((x) & 0xffu) << 8))) +#ifndef _KERNEL +#include <stdlib.h> /* malloc, calloc, free */ +#include <string.h> /* memset, memcpy */ +#endif +#define ALLOC(s) malloc(s) +#define ALLOC_AND_ZERO(s) calloc(1,s) +#define FREEMEM(p) free(p) + +#define MEM_INIT(p,v,s) memset((p),(v),(s)) + + +/*-************************************ +* Common Constants +**************************************/ +#define MINMATCH 4 -/* Basic types */ -#define BYTE uint8_t -#define U16 uint16_t -#define U32 uint32_t -#define S32 int32_t -#define U64 uint64_t +#define WILDCOPYLENGTH 8 +#define LASTLITERALS 5 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MFLIMIT 12 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MATCH_SAFEGUARD_DISTANCE ((2*WILDCOPYLENGTH) - MINMATCH) /* ensure it's possible to write 2 x wildcopyLength without overflowing output buffer */ +#define FASTLOOP_SAFE_DISTANCE 64 -#ifndef LZ4_FORCE_UNALIGNED_ACCESS -#pragma pack(1) +#define KB *(1 <<10) +#define MB *(1 <<20) +#define GB *(1U<<30) + +#ifndef LZ4_DISTANCE_MAX /* history window size; can be user-defined at compile time */ +# define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ #endif -typedef struct _U16_S { - U16 v; -} U16_S; -typedef struct _U32_S { - U32 v; -} U32_S; -typedef struct _U64_S { - U64 v; -} U64_S; - -#ifndef LZ4_FORCE_UNALIGNED_ACCESS -#pragma pack() +#define LZ4_DISTANCE_ABSOLUTE_MAX 65535 +#if (LZ4_DISTANCE_MAX > LZ4_DISTANCE_ABSOLUTE_MAX) /* max supported by LZ4 format */ +# error "LZ4_DISTANCE_MAX is too big : must be <= 65535" #endif -#define A64(x) (((U64_S *)(x))->v) -#define A32(x) (((U32_S *)(x))->v) -#define A16(x) (((U16_S *)(x))->v) +#define ML_BITS 4 +#define ML_MASK ((1U<<ML_BITS)-1) +#define RUN_BITS (8-ML_BITS) +#define RUN_MASK ((1U<<RUN_BITS)-1) -/* - * Constants - */ -#define MINMATCH 4 +#define DEBUGLOG(l, ...) {} /* disabled */ -#define HASH_LOG COMPRESSIONLEVEL -#define HASHTABLESIZE (1 << HASH_LOG) -#define HASH_MASK (HASHTABLESIZE - 1) +#ifndef assert +#define assert ASSERT +#endif -#define SKIPSTRENGTH (NOTCOMPRESSIBLE_CONFIRMATION > 2 ? 
\ - NOTCOMPRESSIBLE_CONFIRMATION : 2) +/*-************************************ +* Types +**************************************/ +#ifndef _KERNEL +#include <limits.h> +#endif +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +#ifndef _KERNEL +#include <stdint.h> +#endif + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; + typedef uintptr_t uptrval; +#else +# if UINT_MAX != 4294967295UL +# error "LZ4 code (when not C++ or C99) assumes that sizeof(int) == 4" +# endif + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; + typedef size_t uptrval; /* generally true, except OpenVMS-64 */ +#endif -#define COPYLENGTH 8 -#define LASTLITERALS 5 -#define MFLIMIT (COPYLENGTH + MINMATCH) -#define MINLENGTH (MFLIMIT + 1) +#if defined(__x86_64__) + typedef U64 reg_t; /* 64-bits in x32 mode */ +#else + typedef size_t reg_t; /* 32-bits in x32 mode */ +#endif -#define MAXD_LOG 16 -#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) +typedef enum { + notLimited = 0, + limitedOutput = 1, + fillOutput = 2 +} limitedOutput_directive; -#define ML_BITS 4 -#define ML_MASK ((1U<<ML_BITS)-1) -#define RUN_BITS (8-ML_BITS) -#define RUN_MASK ((1U<<RUN_BITS)-1) +/*-************************************ +* Reading and writing into memory +**************************************/ -/* - * Architecture-specific macros +/** + * LZ4 relies on memcpy with a constant size being inlined. In freestanding + * environments, the compiler can't assume the implementation of memcpy() is + * standard compliant, so it can't apply its specialized memcpy() inlining + * logic. When possible, use __builtin_memcpy() to tell the compiler to analyze + * memcpy() as if it were standard compliant, so it can inline it in freestanding + * environments. This is needed when decompressing the Linux Kernel, for example. 
*/ -#if LZ4_ARCH64 -#define STEPSIZE 8 -#define UARCH U64 -#define AARCH A64 -#define LZ4_COPYSTEP(s, d) A64(d) = A64(s); d += 8; s += 8; -#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d) -#define LZ4_SECURECOPY(s, d, e) if (d < e) LZ4_WILDCOPY(s, d, e) -#define HTYPE U32 -#define INITBASE(base) const BYTE* const base = ip -#else /* !LZ4_ARCH64 */ -#define STEPSIZE 4 -#define UARCH U32 -#define AARCH A32 -#define LZ4_COPYSTEP(s, d) A32(d) = A32(s); d += 4; s += 4; -#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d); LZ4_COPYSTEP(s, d); -#define LZ4_SECURECOPY LZ4_WILDCOPY -#define HTYPE const BYTE * -#define INITBASE(base) const int base = 0 -#endif /* !LZ4_ARCH64 */ - -#if (defined(LZ4_BIG_ENDIAN) && !defined(BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE)) -#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ - { U16 v = A16(p); v = lz4_bswap16(v); d = (s) - v; } -#define LZ4_WRITE_LITTLEENDIAN_16(p, i) \ - { U16 v = (U16)(i); v = lz4_bswap16(v); A16(p) = v; p += 2; } +#if defined(__GNUC__) && (__GNUC__ >= 4) +#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) #else -#define LZ4_READ_LITTLEENDIAN_16(d, s, p) { d = (s) - A16(p); } -#define LZ4_WRITE_LITTLEENDIAN_16(p, v) { A16(p) = v; p += 2; } +#define LZ4_memcpy(dst, src, size) memcpy(dst, src, size) #endif +static unsigned LZ4_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} -/* Local structures */ -struct refTables { - HTYPE hashTable[HASHTABLESIZE]; -}; +#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==2) +/* lie to the compiler about data alignment; use with caution */ -/* Macros */ -#define LZ4_HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH * 8) - \ - HASH_LOG)) -#define LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p)) -#define LZ4_WILDCOPY(s, d, e) do { LZ4_COPYPACKET(s, d) } while (d < e); -#define LZ4_BLINDCOPY(s, d, l) { BYTE* e = (d) + l; LZ4_WILDCOPY(s, d, e); \ - d = e; } +static U16 LZ4_read16(const void* memPtr) { return *(const U16*) memPtr; } +static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } -/* Private functions */ -#if LZ4_ARCH64 +#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==1) -static inline int -LZ4_NbCommonBytes(register U64 val) -{ -#if defined(LZ4_BIG_ENDIAN) -#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \ - !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_clzll(val) >> 3); -#else - int r; - if (!(val >> 32)) { - r = 4; - } else { - r = 0; - val >>= 32; - } - if (!(val >> 16)) { - r += 2; - val >>= 8; - } else { - val >>= 24; - } - r += (!val); - return (r); -#endif -#else -#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \ - !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_ctzll(val) >> 3); -#else - static const int DeBruijnBytePos[64] = - { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, - 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, - 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, - 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 - }; - return DeBruijnBytePos[((U64) ((val & -val) * 0x0218A392CDABBD3F)) >> - 58]; -#endif -#endif -} +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U16 u16; U32 u32; reg_t uArch; } __attribute__((packed)) unalign; -#else +static U16 LZ4_read16(const void* ptr) { return 
((const unalign*)ptr)->u16; } + +static void LZ4_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; } + +#else /* safe and portable access using memcpy() */ -static inline int -LZ4_NbCommonBytes(register U32 val) +static U16 LZ4_read16(const void* memPtr) { -#if defined(LZ4_BIG_ENDIAN) -#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \ - !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_clz(val) >> 3); -#else - int r; - if (!(val >> 16)) { - r = 2; - val >>= 8; - } else { - r = 0; - val >>= 24; - } - r += (!val); - return (r); -#endif -#else -#if defined(__GNUC__) && (GCC_VERSION >= 304) && \ - !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_ctz(val) >> 3); -#else - static const int DeBruijnBytePos[32] = { - 0, 0, 3, 0, 3, 1, 3, 0, - 3, 2, 2, 1, 3, 2, 0, 1, - 3, 3, 1, 2, 2, 2, 2, 0, - 3, 1, 2, 0, 1, 0, 1, 1 - }; - return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> - 27]; -#endif -#endif + U16 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; } -#endif +static void LZ4_write32(void* memPtr, U32 value) +{ + LZ4_memcpy(memPtr, &value, sizeof(value)); +} -/* Compression functions */ +#endif /* LZ4_FORCE_MEMORY_ACCESS */ -/*ARGSUSED*/ -static int -LZ4_compressCtx(void *ctx, const char *source, char *dest, int isize, - int osize) +static U16 LZ4_readLE16(const void* memPtr) { - struct refTables *srt = (struct refTables *)ctx; - HTYPE *HashTable = (HTYPE *) (srt->hashTable); - - const BYTE *ip = (BYTE *) source; - INITBASE(base); - const BYTE *anchor = ip; - const BYTE *const iend = ip + isize; - const BYTE *const oend = (BYTE *) dest + osize; - const BYTE *const mflimit = iend - MFLIMIT; -#define matchlimit (iend - LASTLITERALS) - - BYTE *op = (BYTE *) dest; - - int len, length; - const int skipStrength = SKIPSTRENGTH; - U32 forwardH; - - - /* Init */ - if (isize < MINLENGTH) - goto _last_literals; - - /* First Byte */ - HashTable[LZ4_HASH_VALUE(ip)] = ip - base; - ip++; - forwardH = LZ4_HASH_VALUE(ip); - - /* Main Loop */ - for (;;) { - int findMatchAttempts = (1U << skipStrength) + 3; - const BYTE *forwardIp = ip; - const BYTE *ref; - BYTE *token; - - /* Find a match */ - do { - U32 h = forwardH; - int step = findMatchAttempts++ >> skipStrength; - ip = forwardIp; - forwardIp = ip + step; - - if (unlikely(forwardIp > mflimit)) { - goto _last_literals; - } - - forwardH = LZ4_HASH_VALUE(forwardIp); - ref = base + HashTable[h]; - HashTable[h] = ip - base; - - } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); - - /* Catch up */ - while ((ip > anchor) && (ref > (BYTE *) source) && - unlikely(ip[-1] == ref[-1])) { - ip--; - ref--; - } - - /* Encode Literal length */ - length = ip - anchor; - token = op++; - - /* Check output limit */ - if (unlikely(op + length + (2 + 1 + LASTLITERALS) + - (length >> 8) > oend)) - return (0); - - if (length >= (int)RUN_MASK) { - *token = (RUN_MASK << ML_BITS); - len = length - RUN_MASK; - for (; len > 254; len -= 255) - *op++ = 255; - *op++ = (BYTE)len; - } else - *token = (length << ML_BITS); - - /* Copy Literals */ - LZ4_BLINDCOPY(anchor, op, length); - - _next_match: - /* Encode Offset */ - LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref); - - /* Start Counting */ - ip += MINMATCH; - ref += MINMATCH; /* MinMatch verified */ - anchor = ip; - while (likely(ip < matchlimit - (STEPSIZE - 1))) { - UARCH diff = AARCH(ref) ^ AARCH(ip); - if (!diff) { - ip += STEPSIZE; - ref += STEPSIZE; - continue; - } - ip += LZ4_NbCommonBytes(diff); - goto _endCount; - } -#if LZ4_ARCH64 - if ((ip < (matchlimit 
- 3)) && (A32(ref) == A32(ip))) { - ip += 4; - ref += 4; - } -#endif - if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) { - ip += 2; - ref += 2; - } - if ((ip < matchlimit) && (*ref == *ip)) - ip++; - _endCount: - - /* Encode MatchLength */ - len = (ip - anchor); - /* Check output limit */ - if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)) - return (0); - if (len >= (int)ML_MASK) { - *token += ML_MASK; - len -= ML_MASK; - for (; len > 509; len -= 510) { - *op++ = 255; - *op++ = 255; - } - if (len > 254) { - len -= 255; - *op++ = 255; - } - *op++ = (BYTE)len; - } else - *token += len; - - /* Test end of chunk */ - if (ip > mflimit) { - anchor = ip; - break; - } - /* Fill table */ - HashTable[LZ4_HASH_VALUE(ip - 2)] = ip - 2 - base; - - /* Test next position */ - ref = base + HashTable[LZ4_HASH_VALUE(ip)]; - HashTable[LZ4_HASH_VALUE(ip)] = ip - base; - if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { - token = op++; - *token = 0; - goto _next_match; - } - /* Prepare next loop */ - anchor = ip++; - forwardH = LZ4_HASH_VALUE(ip); - } - - _last_literals: - /* Encode Last Literals */ - { - int lastRun = iend - anchor; - if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) > - oend) - return (0); - if (lastRun >= (int)RUN_MASK) { - *op++ = (RUN_MASK << ML_BITS); - lastRun -= RUN_MASK; - for (; lastRun > 254; lastRun -= 255) { - *op++ = 255; - } - *op++ = (BYTE)lastRun; - } else - *op++ = (lastRun << ML_BITS); - (void) memcpy(op, anchor, iend - anchor); - op += iend - anchor; - } - - /* End */ - return (int)(((char *)op) - dest); + if (LZ4_isLittleEndian()) { + return LZ4_read16(memPtr); + } else { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] + (p[1]<<8)); + } } +/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */ +LZ4_FORCE_INLINE +void LZ4_wildCopy8(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + do { LZ4_memcpy(d,s,8); d+=8; s+=8; } while (d<e); +} -/* Note : this function is valid only if isize < LZ4_64KLIMIT */ -#define LZ4_64KLIMIT ((1 << 16) + (MFLIMIT - 1)) -#define HASHLOG64K (HASH_LOG + 1) -#define HASH64KTABLESIZE (1U << HASHLOG64K) -#define LZ4_HASH64K_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8) - \ - HASHLOG64K)) -#define LZ4_HASH64K_VALUE(p) LZ4_HASH64K_FUNCTION(A32(p)) +static const unsigned inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4}; +static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3}; + + +#ifndef LZ4_FAST_DEC_LOOP +# if defined __i386__ || defined _M_IX86 || defined __x86_64__ || defined _M_X64 +# define LZ4_FAST_DEC_LOOP 1 +# elif defined(__aarch64__) && !defined(__clang__) + /* On aarch64, we disable this optimization for clang because on certain + * mobile chipsets, performance is reduced with clang. 
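inc32table[] and dec64table[] just above exist because a match whose offset is smaller than 8 overlaps its own output, so a plain 8-byte copy would read bytes that have not been written yet. Worked through for a hypothetical offset of 2 (a repeating two-byte pattern), following the steps of LZ4_memcpy_using_offset_base() below:

/*
 * Hypothetical state: op points just past "..ab", match = op - 2.
 *
 *   op[0..3] copied byte-by-byte from match        ->  a b a b
 *   match += inc32table[2]  (== 2, so match == op)
 *   LZ4_memcpy(op + 4, match, 4)                   ->  a b a b a b a b
 *   match -= dec64table[2]  (== 0, no change)
 *   op += 8; from here the effective offset is 8, so LZ4_wildCopy8()
 *   can continue with ordinary 8-byte copies.
 */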
For information + * refer to https://github.com/lz4/lz4/pull/707 */ +# define LZ4_FAST_DEC_LOOP 1 +# else +# define LZ4_FAST_DEC_LOOP 0 +# endif +#endif + +#if LZ4_FAST_DEC_LOOP -/*ARGSUSED*/ -static int -LZ4_compress64kCtx(void *ctx, const char *source, char *dest, int isize, - int osize) +LZ4_FORCE_INLINE void +LZ4_memcpy_using_offset_base(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset) { - struct refTables *srt = (struct refTables *)ctx; - U16 *HashTable = (U16 *) (srt->hashTable); - - const BYTE *ip = (BYTE *) source; - const BYTE *anchor = ip; - const BYTE *const base = ip; - const BYTE *const iend = ip + isize; - const BYTE *const oend = (BYTE *) dest + osize; - const BYTE *const mflimit = iend - MFLIMIT; -#define matchlimit (iend - LASTLITERALS) - - BYTE *op = (BYTE *) dest; - - int len, length; - const int skipStrength = SKIPSTRENGTH; - U32 forwardH; - - /* Init */ - if (isize < MINLENGTH) - goto _last_literals; - - /* First Byte */ - ip++; - forwardH = LZ4_HASH64K_VALUE(ip); - - /* Main Loop */ - for (;;) { - int findMatchAttempts = (1U << skipStrength) + 3; - const BYTE *forwardIp = ip; - const BYTE *ref; - BYTE *token; - - /* Find a match */ - do { - U32 h = forwardH; - int step = findMatchAttempts++ >> skipStrength; - ip = forwardIp; - forwardIp = ip + step; - - if (forwardIp > mflimit) { - goto _last_literals; - } - - forwardH = LZ4_HASH64K_VALUE(forwardIp); - ref = base + HashTable[h]; - HashTable[h] = ip - base; - - } while (A32(ref) != A32(ip)); - - /* Catch up */ - while ((ip > anchor) && (ref > (BYTE *) source) && - (ip[-1] == ref[-1])) { - ip--; - ref--; - } - - /* Encode Literal length */ - length = ip - anchor; - token = op++; - - /* Check output limit */ - if (unlikely(op + length + (2 + 1 + LASTLITERALS) + - (length >> 8) > oend)) - return (0); - - if (length >= (int)RUN_MASK) { - *token = (RUN_MASK << ML_BITS); - len = length - RUN_MASK; - for (; len > 254; len -= 255) - *op++ = 255; - *op++ = (BYTE)len; - } else - *token = (length << ML_BITS); - - /* Copy Literals */ - LZ4_BLINDCOPY(anchor, op, length); - - _next_match: - /* Encode Offset */ - LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref); - - /* Start Counting */ - ip += MINMATCH; - ref += MINMATCH; /* MinMatch verified */ - anchor = ip; - while (ip < matchlimit - (STEPSIZE - 1)) { - UARCH diff = AARCH(ref) ^ AARCH(ip); - if (!diff) { - ip += STEPSIZE; - ref += STEPSIZE; - continue; - } - ip += LZ4_NbCommonBytes(diff); - goto _endCount; - } -#if LZ4_ARCH64 - if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) { - ip += 4; - ref += 4; - } -#endif - if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) { - ip += 2; - ref += 2; - } - if ((ip < matchlimit) && (*ref == *ip)) - ip++; - _endCount: - - /* Encode MatchLength */ - len = (ip - anchor); - /* Check output limit */ - if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)) - return (0); - if (len >= (int)ML_MASK) { - *token += ML_MASK; - len -= ML_MASK; - for (; len > 509; len -= 510) { - *op++ = 255; - *op++ = 255; - } - if (len > 254) { - len -= 255; - *op++ = 255; - } - *op++ = (BYTE)len; - } else - *token += len; - - /* Test end of chunk */ - if (ip > mflimit) { - anchor = ip; - break; - } - /* Fill table */ - HashTable[LZ4_HASH64K_VALUE(ip - 2)] = ip - 2 - base; - - /* Test next position */ - ref = base + HashTable[LZ4_HASH64K_VALUE(ip)]; - HashTable[LZ4_HASH64K_VALUE(ip)] = ip - base; - if (A32(ref) == A32(ip)) { - token = op++; - *token = 0; - goto _next_match; - } - /* Prepare next loop */ - anchor = ip++; - forwardH = 
LZ4_HASH64K_VALUE(ip); - } - - _last_literals: - /* Encode Last Literals */ - { - int lastRun = iend - anchor; - if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) > - oend) - return (0); - if (lastRun >= (int)RUN_MASK) { - *op++ = (RUN_MASK << ML_BITS); - lastRun -= RUN_MASK; - for (; lastRun > 254; lastRun -= 255) - *op++ = 255; - *op++ = (BYTE)lastRun; - } else - *op++ = (lastRun << ML_BITS); - (void) memcpy(op, anchor, iend - anchor); - op += iend - anchor; - } - - /* End */ - return (int)(((char *)op) - dest); + assert(srcPtr + offset == dstPtr); + if (offset < 8) { + LZ4_write32(dstPtr, 0); /* silence an msan warning when offset==0 */ + dstPtr[0] = srcPtr[0]; + dstPtr[1] = srcPtr[1]; + dstPtr[2] = srcPtr[2]; + dstPtr[3] = srcPtr[3]; + srcPtr += inc32table[offset]; + LZ4_memcpy(dstPtr+4, srcPtr, 4); + srcPtr -= dec64table[offset]; + dstPtr += 8; + } else { + LZ4_memcpy(dstPtr, srcPtr, 8); + dstPtr += 8; + srcPtr += 8; + } + + LZ4_wildCopy8(dstPtr, srcPtr, dstEnd); } -static int -real_LZ4_compress(const char *source, char *dest, int isize, int osize) +/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd + * this version copies two times 16 bytes (instead of one time 32 bytes) + * because it must be compatible with offsets >= 16. */ +LZ4_FORCE_INLINE void +LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd) { - void *ctx; - int result; - - ctx = lz4_alloc(KM_SLEEP); - - /* - * out of kernel memory, gently fall through - this will disable - * compression in zio_compress_data - */ - if (ctx == NULL) - return (0); + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; - memset(ctx, 0, sizeof (struct refTables)); - - if (isize < LZ4_64KLIMIT) - result = LZ4_compress64kCtx(ctx, source, dest, isize, osize); - else - result = LZ4_compressCtx(ctx, source, dest, isize, osize); + do { LZ4_memcpy(d,s,16); LZ4_memcpy(d+16,s+16,16); d+=32; s+=32; } while (d<e); +} - lz4_free(ctx); - return (result); +/* LZ4_memcpy_using_offset() presumes : + * - dstEnd >= dstPtr + MINMATCH + * - there is at least 8 bytes available to write after dstEnd */ +LZ4_FORCE_INLINE void +LZ4_memcpy_using_offset(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset) +{ + BYTE v[8]; + + assert(dstEnd >= dstPtr + MINMATCH); + + switch(offset) { + case 1: + MEM_INIT(v, *srcPtr, 8); + break; + case 2: + LZ4_memcpy(v, srcPtr, 2); + LZ4_memcpy(&v[2], srcPtr, 2); + LZ4_memcpy(&v[4], v, 4); + break; + case 4: + LZ4_memcpy(v, srcPtr, 4); + LZ4_memcpy(&v[4], srcPtr, 4); + break; + default: + LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset); + return; + } + + LZ4_memcpy(dstPtr, v, 8); + dstPtr += 8; + while (dstPtr < dstEnd) { + LZ4_memcpy(dstPtr, v, 8); + dstPtr += 8; + } } +#endif -/* Decompression functions */ -/* - * Note: The decoding functions real_LZ4_uncompress() and - * LZ4_uncompress_unknownOutputSize() are safe against "buffer overflow" - * attack type. They will never write nor read outside of the provided - * output buffers. LZ4_uncompress_unknownOutputSize() also insures that - * it will never read outside of the input buffer. A corrupted input - * will produce an error result, a negative int, indicating the position - * of the error within input stream. 
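A point worth noting about LZ4_wildCopy8()/LZ4_wildCopy32() above: they always copy in whole 8- or 32-byte steps and, per their own comments, can overwrite up to 8 (respectively 32) bytes beyond dstEnd, which is why the decoder only uses them when the MFLIMIT / FASTLOOP_SAFE_DISTANCE checks guarantee that much slack in the output buffer. For example:

/* Requesting a 5-byte copy still performs one full 8-byte step: */
uint8_t srcbuf[16] = { 0 }, dstbuf[16];
LZ4_wildCopy8(dstbuf, srcbuf, dstbuf + 5);	/* writes dstbuf[0..7] */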
+/*-************************************ +* Local Structures and types +**************************************/ +typedef enum { clearedTable = 0, byPtr, byU32, byU16 } tableType_t; + +/** + * This enum distinguishes several different modes of accessing previous + * content in the stream. * - * Note[2]: real_LZ4_uncompress(), referred to above, is not used in ZFS so - * its code is not present here. + * - noDict : There is no preceding content. + * - withPrefix64k : Table entries up to ctx->dictSize before the current blob + * blob being compressed are valid and refer to the preceding + * content (of length ctx->dictSize), which is available + * contiguously preceding in memory the content currently + * being compressed. + * - usingExtDict : Like withPrefix64k, but the preceding content is somewhere + * else in memory, starting at ctx->dictionary with length + * ctx->dictSize. + * - usingDictCtx : Like usingExtDict, but everything concerning the preceding + * content is in a separate context, pointed to by + * ctx->dictCtx. ctx->dictionary, ctx->dictSize, and table + * entries in the current context that refer to positions + * preceding the beginning of the current compression are + * ignored. Instead, ctx->dictCtx->dictionary and ctx->dictCtx + * ->dictSize describe the location and size of the preceding + * content, and matches are found by looking in the ctx + * ->dictCtx->hashTable. */ +typedef enum { noDict = 0, withPrefix64k, usingExtDict, usingDictCtx } dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; -static const int dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; -#if LZ4_ARCH64 -static const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; -#endif +/*-******************************* + * Decompression functions + ********************************/ -static int -LZ4_uncompress_unknownOutputSize(const char *source, char *dest, int isize, - int maxOutputSize) -{ - /* Local Variables */ - const BYTE *restrict ip = (const BYTE *) source; - const BYTE *const iend = ip + isize; - const BYTE *ref; - - BYTE *op = (BYTE *) dest; - BYTE *const oend = op + maxOutputSize; - BYTE *cpy; - - /* Main Loop */ - while (ip < iend) { - unsigned token; - size_t length; - - /* get runlength */ - token = *ip++; - if ((length = (token >> ML_BITS)) == RUN_MASK) { - int s = 255; - while ((ip < iend) && (s == 255)) { - s = *ip++; - if (unlikely(length > (size_t)(length + s))) - goto _output_error; - length += s; - } - } - /* copy literals */ - cpy = op + length; - /* CORNER-CASE: cpy might overflow. */ - if (cpy < op) - goto _output_error; /* cpy was overflowed, bail! 
*/ - if ((cpy > oend - COPYLENGTH) || - (ip + length > iend - COPYLENGTH)) { - if (cpy > oend) - /* Error: writes beyond output buffer */ - goto _output_error; - if (ip + length != iend) - /* - * Error: LZ4 format requires to consume all - * input at this stage - */ - goto _output_error; - (void) memcpy(op, ip, length); - op += length; - /* Necessarily EOF, due to parsing restrictions */ - break; - } - LZ4_WILDCOPY(ip, op, cpy); - ip -= (op - cpy); - op = cpy; - - /* get offset */ - LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); - ip += 2; - if (ref < (BYTE * const) dest) - /* - * Error: offset creates reference outside of - * destination buffer - */ - goto _output_error; - - /* get matchlength */ - if ((length = (token & ML_MASK)) == ML_MASK) { - while (ip < iend) { - int s = *ip++; - if (unlikely(length > (size_t)(length + s))) - goto _output_error; - length += s; - if (s == 255) - continue; - break; - } - } - /* copy repeated sequence */ - if (unlikely(op - ref < STEPSIZE)) { -#if LZ4_ARCH64 - int dec64 = dec64table[op - ref]; -#else - const int dec64 = 0; -#endif - op[0] = ref[0]; - op[1] = ref[1]; - op[2] = ref[2]; - op[3] = ref[3]; - op += 4; - ref += 4; - ref -= dec32table[op - ref]; - A32(op) = A32(ref); - op += STEPSIZE - 4; - ref -= dec64; - } else { - LZ4_COPYSTEP(ref, op); - } - cpy = op + length - (STEPSIZE - 4); - if (cpy > oend - COPYLENGTH) { - if (cpy > oend) - /* - * Error: request to write outside of - * destination buffer - */ - goto _output_error; -#if LZ4_ARCH64 - if ((ref + COPYLENGTH) > oend) -#else - if ((ref + COPYLENGTH) > oend || - (op + COPYLENGTH) > oend) -#endif - goto _output_error; - LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); - while (op < cpy) - *op++ = *ref++; - op = cpy; - if (op == oend) - /* - * Check EOF (should never happen, since - * last 5 bytes are supposed to be literals) - */ - goto _output_error; - continue; - } - LZ4_SECURECOPY(ref, op, cpy); - op = cpy; /* correction */ - } - - /* end of decoding */ - return (int)(((char *)op) - dest); - - /* write overflow error detected */ - _output_error: - return (-1); -} +typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; +typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; -#ifdef __FreeBSD__ -/* - * FreeBSD has 4, 8 and 16 KB malloc zones which can be used here. - * Should struct refTables get resized this may need to be revisited, hence - * compiler-time asserts. 
- */ -_Static_assert(sizeof(struct refTables) <= 16384, - "refTables too big for malloc"); -_Static_assert((sizeof(struct refTables) % 4096) == 0, - "refTables not a multiple of page size"); -#else -#define ZFS_LZ4_USE_CACHE -#endif +typedef enum { loop_error = -2, initial_error = -1, ok = 0 } variable_length_error; -#ifdef ZFS_LZ4_USE_CACHE -static kmem_cache_t *lz4_cache; - -void -lz4_init(void) +LZ4_FORCE_INLINE unsigned +read_variable_length(const BYTE**ip, const BYTE* lencheck, + int loop_check, int initial_check, + variable_length_error* error) { - lz4_cache = kmem_cache_create("lz4_cache", - sizeof (struct refTables), 0, NULL, NULL, NULL, NULL, NULL, 0); + U32 length = 0; + U32 s; + if (initial_check && unlikely((*ip) >= lencheck)) { /* overflow detection */ + *error = initial_error; + return length; + } + do { + s = **ip; + (*ip)++; + length += s; + if (loop_check && unlikely((*ip) >= lencheck)) { /* overflow detection */ + *error = loop_error; + return length; + } + } while (s==255); + + return length; } -void -lz4_fini(void) -{ - if (lz4_cache) { - kmem_cache_destroy(lz4_cache); - lz4_cache = NULL; - } -} +#define LZ4_STATIC_ASSERT(c) ASSERT(c) -static void * -lz4_alloc(int flags) -{ - ASSERT(lz4_cache != NULL); - return (kmem_cache_alloc(lz4_cache, flags)); -} -static void -lz4_free(void *ctx) -{ - kmem_cache_free(lz4_cache, ctx); -} -#else -void -lz4_init(void) +/*! LZ4_decompress_generic() : + * This generic decompression function covers all use cases. + * It shall be instantiated several times, using different sets of directives. + * Note that it is important for performance that this function really get inlined, + * in order to remove useless branches during compilation optimization. + */ +LZ4_FORCE_INLINE int +LZ4_decompress_generic( + const char* const src, + char* const dst, + int srcSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is `dstCapacity` */ + + endCondition_directive endOnInput, /* endOnOutputSize, endOnInputSize */ + earlyEnd_directive partialDecoding, /* full, partial */ + dict_directive dict, /* noDict, withPrefix64k, usingExtDict */ + const BYTE* const lowPrefix, /* always <= dst, == dst when no prefix */ + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note : = 0 if noDict */ + ) { -} + if ((src == NULL) || (outputSize < 0)) { return -1; } + + { const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + + BYTE* op = (BYTE*) dst; + BYTE* const oend = op + outputSize; + BYTE* cpy; + + const BYTE* const dictEnd = (dictStart == NULL) ? NULL : dictStart + dictSize; + + const int safeDecode = (endOnInput==endOnInputSize); + const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); + + + /* Set up the "end" pointers for the shortcut. */ + const BYTE* const shortiend = iend - (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/; + const BYTE* const shortoend = oend - (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/; + + const BYTE* match; + size_t offset; + unsigned token; + size_t length; + + + DEBUGLOG(5, "LZ4_decompress_generic (srcSize:%i, dstSize:%i)", srcSize, outputSize); + + /* Special cases */ + assert(lowPrefix <= op); + if ((endOnInput) && (unlikely(outputSize==0))) { + /* Empty output buffer */ + if (partialDecoding) return 0; + return ((srcSize==1) && (*ip==0)) ? 0 : -1; + } + if ((!endOnInput) && (unlikely(outputSize==0))) { return (*ip==0 ? 
1 : -1); } + if ((endOnInput) && unlikely(srcSize==0)) { return -1; } + + /* Currently the fast loop shows a regression on qualcomm arm chips. */ +#if LZ4_FAST_DEC_LOOP + if ((oend - op) < FASTLOOP_SAFE_DISTANCE) { + DEBUGLOG(6, "skip fast decode loop"); + goto safe_decode; + } + + /* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */ + while (1) { + /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */ + assert(oend - op >= FASTLOOP_SAFE_DISTANCE); + if (endOnInput) { assert(ip < iend); } + token = *ip++; + length = token >> ML_BITS; /* literal length */ + + assert(!endOnInput || ip <= iend); /* ip < iend before the increment */ + + /* decode literal length */ + if (length == RUN_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend-RUN_MASK, (int)endOnInput, (int)endOnInput, &error); + if (error == initial_error) { goto _output_error; } + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ + if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ + + /* copy literals */ + cpy = op+length; + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if (endOnInput) { /* LZ4_decompress_safe() */ + if ((cpy>oend-32) || (ip+length>iend-32)) { goto safe_literal_copy; } + LZ4_wildCopy32(op, ip, cpy); + } else { /* LZ4_decompress_fast() */ + if (cpy>oend-8) { goto safe_literal_copy; } + LZ4_wildCopy8(op, ip, cpy); /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time : + * it doesn't know input length, and only relies on end-of-block properties */ + } + ip += length; op = cpy; + } else { + cpy = op+length; + if (endOnInput) { /* LZ4_decompress_safe() */ + DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length); + /* We don't need to check oend, since we check it once for each loop below */ + if (ip > iend-(16 + 1/*max lit + offset + nextToken*/)) { goto safe_literal_copy; } + /* Literals can only be 14, but hope compilers optimize if we copy by a register size */ + LZ4_memcpy(op, ip, 16); + } else { /* LZ4_decompress_fast() */ + /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time : + * it doesn't know input length, and relies on end-of-block properties */ + LZ4_memcpy(op, ip, 8); + if (length > 8) { LZ4_memcpy(op+8, ip+8, 8); } + } + ip += length; op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + assert(match <= op); + + /* get matchlength */ + length = token & ML_MASK; + + if (length == ML_MASK) { + variable_length_error error = ok; + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) { goto _output_error; } /* Error : offset outside buffers */ + length += read_variable_length(&ip, iend - LASTLITERALS + 1, (int)endOnInput, 0, &error); + if (error != ok) { goto _output_error; } + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) { goto _output_error; } /* overflow detection */ + length += MINMATCH; + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + } else { + length += MINMATCH; + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + + /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */ + if ((dict == withPrefix64k) || (match >= lowPrefix)) { + if (offset >= 8) { + assert(match >= lowPrefix); + assert(match <= op); + assert(op + 18 <= oend); + + LZ4_memcpy(op, match, 8); + LZ4_memcpy(op+8, match+8, 8); + LZ4_memcpy(op+16, 
match+16, 2); + op += length; + continue; + } } } + + if (checkOffset && (unlikely(match + dictSize < lowPrefix))) { goto _output_error; } /* Error : offset outside buffers */ + /* match starting within external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + if (unlikely(op+length > oend-LASTLITERALS)) { + if (partialDecoding) { + DEBUGLOG(7, "partialDecoding: dictionary match, close to dstEnd"); + length = MIN(length, (size_t)(oend-op)); + } else { + goto _output_error; /* end-of-block condition violated */ + } } + + if (length <= (size_t)(lowPrefix-match)) { + /* match fits entirely within external dictionary : just copy */ + memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match stretches into both external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) { *op++ = *copyFrom++; } + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + + /* copy match within block */ + cpy = op + length; + + assert((op <= oend) && (oend-op >= 32)); + if (unlikely(offset<16)) { + LZ4_memcpy_using_offset(op, match, cpy, offset); + } else { + LZ4_wildCopy32(op, match, cpy); + } + + op = cpy; /* wildcopy correction */ + } + safe_decode: +#endif -void -lz4_fini(void) -{ + /* Main Loop : decode remaining sequences where output < FASTLOOP_SAFE_DISTANCE */ + while (1) { + token = *ip++; + length = token >> ML_BITS; /* literal length */ + + assert(!endOnInput || ip <= iend); /* ip < iend before the increment */ + + /* A two-stage shortcut for the most common case: + * 1) If the literal length is 0..14, and there is enough space, + * enter the shortcut and copy 16 bytes on behalf of the literals + * (in the fast mode, only 8 bytes can be safely copied this way). + * 2) Further if the match length is 4..18, copy 18 bytes in a similar + * manner; but we ensure that there's enough space in the output for + * those 18 bytes earlier, upon entering the shortcut (in other words, + * there is a combined check for both stages). + */ + if ( (endOnInput ? length != RUN_MASK : length <= 8) + /* strictly "less than" on input, to re-enter the loop with at least one byte */ + && likely((endOnInput ? ip < shortiend : 1) & (op <= shortoend)) ) { + /* Copy the literals */ + LZ4_memcpy(op, ip, endOnInput ? 16 : 8); + op += length; ip += length; + + /* The second stage: prepare for match copying, decode full info. + * If it doesn't work out, the info won't be wasted. */ + length = token & ML_MASK; /* match length */ + offset = LZ4_readLE16(ip); ip += 2; + match = op - offset; + assert(match <= op); /* check overflow */ + + /* Do not deal with overlapping matches. */ + if ( (length != ML_MASK) + && (offset >= 8) + && (dict==withPrefix64k || match >= lowPrefix) ) { + /* Copy the match. */ + LZ4_memcpy(op + 0, match + 0, 8); + LZ4_memcpy(op + 8, match + 8, 8); + LZ4_memcpy(op +16, match +16, 2); + op += length + MINMATCH; + /* Both stages worked, load the next token. */ + continue; + } + + /* The second stage didn't work out, but the info is ready. + * Propel it right to the point of match copying. 
*/ + goto _copy_match; + } + + /* decode literal length */ + if (length == RUN_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend-RUN_MASK, (int)endOnInput, (int)endOnInput, &error); + if (error == initial_error) { goto _output_error; } + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ + if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ + } + + /* copy literals */ + cpy = op+length; +#if LZ4_FAST_DEC_LOOP + safe_literal_copy: +#endif + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if ( ((endOnInput) && ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS))) ) + || ((!endOnInput) && (cpy>oend-WILDCOPYLENGTH)) ) + { + /* We've either hit the input parsing restriction or the output parsing restriction. + * In the normal scenario, decoding a full block, it must be the last sequence, + * otherwise it's an error (invalid input or dimensions). + * In partialDecoding scenario, it's necessary to ensure there is no buffer overflow. + */ + if (partialDecoding) { + /* Since we are partial decoding we may be in this block because of the output parsing + * restriction, which is not valid since the output buffer is allowed to be undersized. + */ + assert(endOnInput); + DEBUGLOG(7, "partialDecoding: copying literals, close to input or output end") + DEBUGLOG(7, "partialDecoding: literal length = %u", (unsigned)length); + DEBUGLOG(7, "partialDecoding: remaining space in dstBuffer : %i", (int)(oend - op)); + DEBUGLOG(7, "partialDecoding: remaining space in srcBuffer : %i", (int)(iend - ip)); + /* Finishing in the middle of a literals segment, + * due to lack of input. + */ + if (ip+length > iend) { + length = (size_t)(iend-ip); + cpy = op + length; + } + /* Finishing in the middle of a literals segment, + * due to lack of output space. + */ + if (cpy > oend) { + cpy = oend; + assert(op<=oend); + length = (size_t)(oend-op); + } + } else { + /* We must be on the last sequence because of the parsing limitations so check + * that we exactly regenerate the original size (must be exact when !endOnInput). + */ + if ((!endOnInput) && (cpy != oend)) { goto _output_error; } + /* We must be on the last sequence (or invalid) because of the parsing limitations + * so check that we exactly consume the input and don't overrun the output buffer. + */ + if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) { + DEBUGLOG(6, "should have been last run of literals") + DEBUGLOG(6, "ip(%p) + length(%i) = %p != iend (%p)", ip, (int)length, ip+length, iend); + DEBUGLOG(6, "or cpy(%p) > oend(%p)", cpy, oend); + goto _output_error; + } + } + memmove(op, ip, length); /* supports overlapping memory regions; only matters for in-place decompression scenarios */ + ip += length; + op += length; + /* Necessarily EOF when !partialDecoding. + * When partialDecoding, it is EOF if we've either + * filled the output buffer or + * can't proceed with reading an offset for following match. 
+ */ + if (!partialDecoding || (cpy == oend) || (ip >= (iend-2))) { + break; + } + } else { + LZ4_wildCopy8(op, ip, cpy); /* may overwrite up to WILDCOPYLENGTH beyond cpy */ + ip += length; op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + + /* get matchlength */ + length = token & ML_MASK; + + _copy_match: + if (length == ML_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend - LASTLITERALS + 1, (int)endOnInput, 0, &error); + if (error != ok) goto _output_error; + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */ + } + length += MINMATCH; + +#if LZ4_FAST_DEC_LOOP + safe_match_copy: +#endif + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */ + /* match starting within external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + if (unlikely(op+length > oend-LASTLITERALS)) { + if (partialDecoding) length = MIN(length, (size_t)(oend-op)); + else goto _output_error; /* doesn't respect parsing restriction */ + } + + if (length <= (size_t)(lowPrefix-match)) { + /* match fits entirely within external dictionary : just copy */ + memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match stretches into both external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) *op++ = *copyFrom++; + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + assert(match >= lowPrefix); + + /* copy match within block */ + cpy = op + length; + + /* partialDecoding : may end anywhere within the block */ + assert(op<=oend); + if (partialDecoding && (cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { + size_t const mlen = MIN(length, (size_t)(oend-op)); + const BYTE* const matchEnd = match + mlen; + BYTE* const copyEnd = op + mlen; + if (matchEnd > op) { /* overlap copy */ + while (op < copyEnd) { *op++ = *match++; } + } else { + LZ4_memcpy(op, match, mlen); + } + op = copyEnd; + if (op == oend) { break; } + continue; + } + + if (unlikely(offset<8)) { + LZ4_write32(op, 0); /* silence msan warning when offset==0 */ + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += inc32table[offset]; + LZ4_memcpy(op+4, match, 4); + match -= dec64table[offset]; + } else { + LZ4_memcpy(op, match, 8); + match += 8; + } + op += 8; + + if (unlikely(cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { + BYTE* const oCopyLimit = oend - (WILDCOPYLENGTH-1); + if (cpy > oend-LASTLITERALS) { goto _output_error; } /* Error : last LASTLITERALS bytes must be literals (uncompressed) */ + if (op < oCopyLimit) { + LZ4_wildCopy8(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } + while (op < cpy) { *op++ = *match++; } + } else { + LZ4_memcpy(op, match, 8); + if (length > 16) { LZ4_wildCopy8(op+8, match+8, cpy); } + } + op = cpy; /* wildcopy correction */ + } + + /* end of decoding */ + if (endOnInput) { + DEBUGLOG(5, "decoded %i bytes", (int) (((char*)op)-dst)); + return (int) (((char*)op)-dst); /* Nb of output bytes decoded */ + } else { + return (int) (((const char*)ip)-src); /* Nb of input bytes read */ + } + + /* 
Overflow error detected */ + _output_error: + return (int) (-(((const char*)ip)-src))-1; + } } -static void * -lz4_alloc(int flags) -{ - return (kmem_alloc(sizeof (struct refTables), flags)); -} +/* + * LZ4_uncompress_unknownOutputSize() : + * isize : is the input size, therefore the compressed size + * maxOutputSize : is the size of the destination buffer (which must be + * already allocated) + * return : the number of bytes decoded in the destination buffer + * (necessarily <= maxOutputSize). If the source stream is + * malformed, the function will stop decoding and return a + * negative result, indicating the byte position of the faulty + * instruction. This function never writes beyond dest + + * maxOutputSize, and is therefore protected against malicious + * data packets. + * note : Destination buffer must be already allocated. + * This version is slightly slower than real_LZ4_uncompress() + * + */ -static void -lz4_free(void *ctx) +/* + * Note: In upstream code, LZ4_uncompress_unknownOutputSize is now a legacy + * wrapper for LZ4_decompress_safe which is a wrapper for + * LZ4_decompress_generic; this wrapper flattens that, rather than + * rewriting the callers. + */ +int LZ4_uncompress_unknownOutputSize(const char* source, char* dest, int compressedSize, int maxDecompressedSize) { - kmem_free(ctx, sizeof (struct refTables)); + return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, + endOnInputSize, decode_full_block, noDict, + (BYTE*)dest, NULL, 0); } -#endif diff --git a/sys/contrib/openzfs/module/zfs/lz4_zfs.c b/sys/contrib/openzfs/module/zfs/lz4_zfs.c new file mode 100644 index 000000000000..820556effb8b --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/lz4_zfs.c @@ -0,0 +1,935 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Header File + * Copyright (C) 2011-2013, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + */ + +/* + * N.B. 
- This file seems to be based on LZ4 r85, dated Dec 10, 2012 + */ + +#include <sys/zfs_context.h> +#include <sys/zio_compress.h> + +static int real_LZ4_compress(const char *source, char *dest, int isize, + int osize); +static int LZ4_compressCtx(void *ctx, const char *source, char *dest, + int isize, int osize); +static int LZ4_compress64kCtx(void *ctx, const char *source, char *dest, + int isize, int osize); + +/* See lz4.c */ +int LZ4_uncompress_unknownOutputSize(const char *source, char *dest, + int isize, int maxOutputSize); + +static void *lz4_alloc(int flags); +static void lz4_free(void *ctx); + +size_t +lz4_compress_zfs(void *s_start, void *d_start, size_t s_len, + size_t d_len, int n) +{ + (void) n; + uint32_t bufsiz; + char *dest = d_start; + + ASSERT(d_len >= sizeof (bufsiz)); + + bufsiz = real_LZ4_compress(s_start, &dest[sizeof (bufsiz)], s_len, + d_len - sizeof (bufsiz)); + + /* Signal an error if the compression routine returned zero. */ + if (bufsiz == 0) + return (s_len); + + /* + * The exact compressed size is needed by the decompression routine, + * so it is stored at the start of the buffer. Note that this may be + * less than the compressed block size, which is rounded up to a + * multiple of 1<<ashift. + */ + *(uint32_t *)dest = BE_32(bufsiz); + + return (bufsiz + sizeof (bufsiz)); +} + +int +lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, + size_t d_len, int n) +{ + (void) n; + const char *src = s_start; + uint32_t bufsiz = BE_IN32(src); + + /* invalid compressed buffer size encoded at start */ + if (bufsiz + sizeof (bufsiz) > s_len) + return (1); + + /* + * Returns 0 on success (decompression function returned non-negative) + * and non-zero on failure (decompression function returned negative). + */ + return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)], + d_start, bufsiz, d_len) < 0); +} + +/* + * LZ4 API Description: + * + * Simple Functions: + * real_LZ4_compress() : + * isize : is the input size. Max supported value is ~1.9GB + * return : the number of bytes written in buffer dest + * or 0 if the compression fails (if LZ4_COMPRESSMIN is set). + * note : destination buffer must be already allocated. + * destination buffer must be sized to handle worst cases + * situations (input data not compressible) worst case size + * evaluation is provided by function LZ4_compressBound(). + * + * real_LZ4_uncompress() : + * osize : is the output size, therefore the original size + * return : the number of bytes read in the source buffer. + * If the source stream is malformed, the function will stop + * decoding and return a negative result, indicating the byte + * position of the faulty instruction. This function never + * writes beyond dest + osize, and is therefore protected + * against malicious data packets. + * note : destination buffer must be already allocated + * note : real_LZ4_uncompress() is not used in ZFS so its code + * is not present here. + * + * Advanced Functions + * + * LZ4_compressBound() : + * Provides the maximum size that LZ4 may output in a "worst case" + * scenario (input data not compressible) primarily useful for memory + * allocation of output buffer. + * + * isize : is the input size. 
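As an aside on the framing used by lz4_compress_zfs()/lz4_decompress_zfs() above: the compressed buffer starts with a 4-byte big-endian length of the LZ4 payload, and the decompressor rejects any header whose claimed size would run past the source buffer. A minimal userspace sketch of that convention follows; the helper names are invented for illustration and are not part of the ZFS code.

#include <stdint.h>
#include <stddef.h>

/* Store a 32-bit payload length big-endian, as lz4_compress_zfs() does. */
static void
sketch_put_be32(uint8_t *dst, uint32_t v)
{
	dst[0] = (uint8_t)(v >> 24);
	dst[1] = (uint8_t)(v >> 16);
	dst[2] = (uint8_t)(v >> 8);
	dst[3] = (uint8_t)(v);
}

static uint32_t
sketch_get_be32(const uint8_t *src)
{
	return ((uint32_t)src[0] << 24 | (uint32_t)src[1] << 16 |
	    (uint32_t)src[2] << 8 | (uint32_t)src[3]);
}

/* Mirrors the sanity check at the top of lz4_decompress_zfs(). */
static int
sketch_frame_payload_len(const uint8_t *src, size_t s_len, uint32_t *payload)
{
	if (s_len < sizeof (uint32_t))
		return (-1);
	*payload = sketch_get_be32(src);
	if (*payload + sizeof (uint32_t) > s_len)
		return (-1);	/* header claims more data than the buffer holds */
	return (0);
}

The payload that follows the header is then handed to LZ4_uncompress_unknownOutputSize() with the stored length as the compressed size.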
Max supported value is ~1.9GB + * return : maximum output size in a "worst case" scenario + * note : this function is limited by "int" range (2^31-1) + * + * LZ4_uncompress_unknownOutputSize() : + * isize : is the input size, therefore the compressed size + * maxOutputSize : is the size of the destination buffer (which must be + * already allocated) + * return : the number of bytes decoded in the destination buffer + * (necessarily <= maxOutputSize). If the source stream is + * malformed, the function will stop decoding and return a + * negative result, indicating the byte position of the faulty + * instruction. This function never writes beyond dest + + * maxOutputSize, and is therefore protected against malicious + * data packets. + * note : Destination buffer must be already allocated. + * This version is slightly slower than real_LZ4_uncompress() + * + * LZ4_compressCtx() : + * This function explicitly handles the CTX memory structure. + * + * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated + * by the caller (either on the stack or using kmem_cache_alloc). Passing + * NULL isn't valid. + * + * LZ4_compress64kCtx() : + * Same as LZ4_compressCtx(), but specific to small inputs (<64KB). + * isize *Must* be <64KB, otherwise the output will be corrupted. + * + * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated + * by the caller (either on the stack or using kmem_cache_alloc). Passing + * NULL isn't valid. + */ + +/* + * Tuning parameters + */ + +/* + * COMPRESSIONLEVEL: Increasing this value improves compression ratio + * Lowering this value reduces memory usage. Reduced memory usage + * typically improves speed, due to cache effect (ex: L1 32KB for Intel, + * L1 64KB for AMD). Memory usage formula : N->2^(N+2) Bytes + * (examples : 12 -> 16KB ; 17 -> 512KB) + */ +#define COMPRESSIONLEVEL 12 + +/* + * NOTCOMPRESSIBLE_CONFIRMATION: Decreasing this value will make the + * algorithm skip faster data segments considered "incompressible". + * This may decrease compression ratio dramatically, but will be + * faster on incompressible data. Increasing this value will make + * the algorithm search more before declaring a segment "incompressible". + * This could improve compression a bit, but will be slower on + * incompressible data. The default value (6) is recommended. + */ +#define NOTCOMPRESSIBLE_CONFIRMATION 6 + +/* + * BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE: This will provide a boost to + * performance for big endian cpu, but the resulting compressed stream + * will be incompatible with little-endian CPU. You can set this option + * to 1 in situations where data will stay within closed environment. + * This option is useless on Little_Endian CPU (such as x86). + */ +/* #define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 */ + +/* + * CPU Feature Detection + */ + +/* 32 or 64 bits ? */ +#if defined(_LP64) +#define LZ4_ARCH64 1 +#else +#define LZ4_ARCH64 0 +#endif + +/* + * Little Endian or Big Endian? + * Note: overwrite the below #define if you know your architecture endianness. + */ +#if defined(_ZFS_BIG_ENDIAN) +#define LZ4_BIG_ENDIAN 1 +#else +/* + * Little Endian assumed. PDP Endian and other very rare endian format + * are unsupported. + */ +#undef LZ4_BIG_ENDIAN +#endif + +/* + * Unaligned memory access is automatically enabled for "common" CPU, + * such as x86. For others CPU, the compiler will be more cautious, and + * insert extra code to ensure aligned access is respected. 
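A quick check of the memory-usage formula quoted with COMPRESSIONLEVEL above (N -> 2^(N+2) bytes): the compressor's hash table has 1 << HASH_LOG entries (HASH_LOG is defined as COMPRESSIONLEVEL a little further down), so with 4-byte entries the defaults work out as in this compile-time sketch. The 4-byte entry size is an assumption matching the LP64 HTYPE; the 64 KB variant uses 2-byte entries instead.

#include <stdint.h>

#define SKETCH_HASH_LOG		12	/* the COMPRESSIONLEVEL default */

/* 2^12 entries * 4 bytes = 2^(12+2) bytes = 16 KB */
_Static_assert((1u << SKETCH_HASH_LOG) * sizeof (uint32_t) == 16 * 1024,
	"HASH_LOG 12 -> 16KB table");
/* 2^17 entries * 4 bytes = 512 KB */
_Static_assert((1u << 17) * sizeof (uint32_t) == 512 * 1024,
	"HASH_LOG 17 -> 512KB table");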
If you know + * your target CPU supports unaligned memory access, you may want to + * force this option manually to improve performance + */ +#if defined(__ARM_FEATURE_UNALIGNED) +#define LZ4_FORCE_UNALIGNED_ACCESS 1 +#endif + +/* + * Illumos : we can't use GCC's __builtin_ctz family of builtins in the + * kernel + * Linux : we can use GCC's __builtin_ctz family of builtins in the + * kernel + */ +#undef LZ4_FORCE_SW_BITCOUNT +#if defined(__sparc) +#define LZ4_FORCE_SW_BITCOUNT +#endif + +/* + * Compiler Options + */ +/* Disable restrict */ +#define restrict + +/* + * Linux : GCC_VERSION is defined as of 3.9-rc1, so undefine it. + * torvalds/linux@3f3f8d2f48acfd8ed3b8e6b7377935da57b27b16 + */ +#ifdef GCC_VERSION +#undef GCC_VERSION +#endif + +#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) +#define expect(expr, value) (__builtin_expect((expr), (value))) +#else +#define expect(expr, value) (expr) +#endif + +#ifndef likely +#define likely(expr) expect((expr) != 0, 1) +#endif + +#ifndef unlikely +#define unlikely(expr) expect((expr) != 0, 0) +#endif + +#define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | \ + (((x) & 0xffu) << 8))) + +/* Basic types */ +#define BYTE uint8_t +#define U16 uint16_t +#define U32 uint32_t +#define S32 int32_t +#define U64 uint64_t + +#ifndef LZ4_FORCE_UNALIGNED_ACCESS +#pragma pack(1) +#endif + +typedef struct _U16_S { + U16 v; +} U16_S; +typedef struct _U32_S { + U32 v; +} U32_S; +typedef struct _U64_S { + U64 v; +} U64_S; + +#ifndef LZ4_FORCE_UNALIGNED_ACCESS +#pragma pack() +#endif + +#define A64(x) (((U64_S *)(x))->v) +#define A32(x) (((U32_S *)(x))->v) +#define A16(x) (((U16_S *)(x))->v) + +/* + * Constants + */ +#define MINMATCH 4 + +#define HASH_LOG COMPRESSIONLEVEL +#define HASHTABLESIZE (1 << HASH_LOG) +#define HASH_MASK (HASHTABLESIZE - 1) + +#define SKIPSTRENGTH (NOTCOMPRESSIBLE_CONFIRMATION > 2 ? 
\ + NOTCOMPRESSIBLE_CONFIRMATION : 2) + +#define COPYLENGTH 8 +#define LASTLITERALS 5 +#define MFLIMIT (COPYLENGTH + MINMATCH) +#define MINLENGTH (MFLIMIT + 1) + +#define MAXD_LOG 16 +#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) + +#define ML_BITS 4 +#define ML_MASK ((1U<<ML_BITS)-1) +#define RUN_BITS (8-ML_BITS) +#define RUN_MASK ((1U<<RUN_BITS)-1) + + +/* + * Architecture-specific macros + */ +#if LZ4_ARCH64 +#define STEPSIZE 8 +#define UARCH U64 +#define AARCH A64 +#define LZ4_COPYSTEP(s, d) A64(d) = A64(s); d += 8; s += 8; +#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d) +#define LZ4_SECURECOPY(s, d, e) if (d < e) LZ4_WILDCOPY(s, d, e) +#define HTYPE U32 +#define INITBASE(base) const BYTE* const base = ip +#else /* !LZ4_ARCH64 */ +#define STEPSIZE 4 +#define UARCH U32 +#define AARCH A32 +#define LZ4_COPYSTEP(s, d) A32(d) = A32(s); d += 4; s += 4; +#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d); LZ4_COPYSTEP(s, d); +#define LZ4_SECURECOPY LZ4_WILDCOPY +#define HTYPE const BYTE * +#define INITBASE(base) const int base = 0 +#endif /* !LZ4_ARCH64 */ + +#if (defined(LZ4_BIG_ENDIAN) && !defined(BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE)) +#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ + { U16 v = A16(p); v = lz4_bswap16(v); d = (s) - v; } +#define LZ4_WRITE_LITTLEENDIAN_16(p, i) \ + { U16 v = (U16)(i); v = lz4_bswap16(v); A16(p) = v; p += 2; } +#else +#define LZ4_READ_LITTLEENDIAN_16(d, s, p) { d = (s) - A16(p); } +#define LZ4_WRITE_LITTLEENDIAN_16(p, v) { A16(p) = v; p += 2; } +#endif + + +/* Local structures */ +struct refTables { + HTYPE hashTable[HASHTABLESIZE]; +}; + + +/* Macros */ +#define LZ4_HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH * 8) - \ + HASH_LOG)) +#define LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p)) +#define LZ4_WILDCOPY(s, d, e) do { LZ4_COPYPACKET(s, d) } while (d < e); +#define LZ4_BLINDCOPY(s, d, l) { BYTE* e = (d) + l; LZ4_WILDCOPY(s, d, e); \ + d = e; } + + +/* Private functions */ +#if LZ4_ARCH64 + +static inline int +LZ4_NbCommonBytes(register U64 val) +{ +#if defined(LZ4_BIG_ENDIAN) +#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll(val) >> 3); +#else + int r; + if (!(val >> 32)) { + r = 4; + } else { + r = 0; + val >>= 32; + } + if (!(val >> 16)) { + r += 2; + val >>= 8; + } else { + val >>= 24; + } + r += (!val); + return (r); +#endif +#else +#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll(val) >> 3); +#else + static const int DeBruijnBytePos[64] = + { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, + 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, + 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, + 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 + }; + return DeBruijnBytePos[((U64) ((val & -val) * 0x0218A392CDABBD3F)) >> + 58]; +#endif +#endif +} + +#else + +static inline int +LZ4_NbCommonBytes(register U32 val) +{ +#if defined(LZ4_BIG_ENDIAN) +#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz(val) >> 3); +#else + int r; + if (!(val >> 16)) { + r = 2; + val >>= 8; + } else { + r = 0; + val >>= 24; + } + r += (!val); + return (r); +#endif +#else +#if defined(__GNUC__) && (GCC_VERSION >= 304) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz(val) >> 3); +#else + static const int DeBruijnBytePos[32] = { + 0, 0, 3, 0, 3, 1, 3, 0, + 3, 2, 2, 1, 3, 2, 0, 1, + 3, 3, 1, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 1, 0, 
1, 1 + }; + return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> + 27]; +#endif +#endif +} + +#endif + +/* Compression functions */ + +static int +LZ4_compressCtx(void *ctx, const char *source, char *dest, int isize, + int osize) +{ + struct refTables *srt = (struct refTables *)ctx; + HTYPE *HashTable = (HTYPE *) (srt->hashTable); + + const BYTE *ip = (BYTE *) source; + INITBASE(base); + const BYTE *anchor = ip; + const BYTE *const iend = ip + isize; + const BYTE *const oend = (BYTE *) dest + osize; + const BYTE *const mflimit = iend - MFLIMIT; +#define matchlimit (iend - LASTLITERALS) + + BYTE *op = (BYTE *) dest; + + int len, length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + /* First Byte */ + HashTable[LZ4_HASH_VALUE(ip)] = ip - base; + ip++; + forwardH = LZ4_HASH_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findMatchAttempts = (1U << skipStrength) + 3; + const BYTE *forwardIp = ip; + const BYTE *ref; + BYTE *token; + + /* Find a match */ + do { + U32 h = forwardH; + int step = findMatchAttempts++ >> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if (unlikely(forwardIp > mflimit)) { + goto _last_literals; + } + + forwardH = LZ4_HASH_VALUE(forwardIp); + ref = base + HashTable[h]; + HashTable[h] = ip - base; + + } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); + + /* Catch up */ + while ((ip > anchor) && (ref > (BYTE *) source) && + unlikely(ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = ip - anchor; + token = op++; + + /* Check output limit */ + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend)) + return (0); + + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254; len -= 255) + *op++ = 255; + *op++ = (BYTE)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); + + _next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref); + + /* Start Counting */ + ip += MINMATCH; + ref += MINMATCH; /* MinMatch verified */ + anchor = ip; + while (likely(ip < matchlimit - (STEPSIZE - 1))) { + UARCH diff = AARCH(ref) ^ AARCH(ip); + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NbCommonBytes(diff); + goto _endCount; + } +#if LZ4_ARCH64 + if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } +#endif + if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < matchlimit) && (*ref == *ip)) + ip++; + _endCount: + + /* Encode MatchLength */ + len = (ip - anchor); + /* Check output limit */ + if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)) + return (0); + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509; len -= 510) { + *op++ = 255; + *op++ = 255; + } + if (len > 254) { + len -= 255; + *op++ = 255; + } + *op++ = (BYTE)len; + } else + *token += len; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + /* Fill table */ + HashTable[LZ4_HASH_VALUE(ip - 2)] = ip - 2 - base; + + /* Test next position */ + ref = base + HashTable[LZ4_HASH_VALUE(ip)]; + HashTable[LZ4_HASH_VALUE(ip)] = ip - base; + if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { + token = op++; + *token = 0; + goto _next_match; + } + /* Prepare next loop */ + anchor = ip++; + forwardH = LZ4_HASH_VALUE(ip); + } + + _last_literals: + /* Encode Last 
Literals */ + { + int lastRun = iend - anchor; + if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) > + oend) + return (0); + if (lastRun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastRun -= RUN_MASK; + for (; lastRun > 254; lastRun -= 255) { + *op++ = 255; + } + *op++ = (BYTE)lastRun; + } else + *op++ = (lastRun << ML_BITS); + (void) memcpy(op, anchor, iend - anchor); + op += iend - anchor; + } + + /* End */ + return (int)(((char *)op) - dest); +} + + + +/* Note : this function is valid only if isize < LZ4_64KLIMIT */ +#define LZ4_64KLIMIT ((1 << 16) + (MFLIMIT - 1)) +#define HASHLOG64K (HASH_LOG + 1) +#define HASH64KTABLESIZE (1U << HASHLOG64K) +#define LZ4_HASH64K_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8) - \ + HASHLOG64K)) +#define LZ4_HASH64K_VALUE(p) LZ4_HASH64K_FUNCTION(A32(p)) + +static int +LZ4_compress64kCtx(void *ctx, const char *source, char *dest, int isize, + int osize) +{ + struct refTables *srt = (struct refTables *)ctx; + U16 *HashTable = (U16 *) (srt->hashTable); + + const BYTE *ip = (BYTE *) source; + const BYTE *anchor = ip; + const BYTE *const base = ip; + const BYTE *const iend = ip + isize; + const BYTE *const oend = (BYTE *) dest + osize; + const BYTE *const mflimit = iend - MFLIMIT; +#define matchlimit (iend - LASTLITERALS) + + BYTE *op = (BYTE *) dest; + + int len, length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + /* First Byte */ + ip++; + forwardH = LZ4_HASH64K_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findMatchAttempts = (1U << skipStrength) + 3; + const BYTE *forwardIp = ip; + const BYTE *ref; + BYTE *token; + + /* Find a match */ + do { + U32 h = forwardH; + int step = findMatchAttempts++ >> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if (forwardIp > mflimit) { + goto _last_literals; + } + + forwardH = LZ4_HASH64K_VALUE(forwardIp); + ref = base + HashTable[h]; + HashTable[h] = ip - base; + + } while (A32(ref) != A32(ip)); + + /* Catch up */ + while ((ip > anchor) && (ref > (BYTE *) source) && + (ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = ip - anchor; + token = op++; + + /* Check output limit */ + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend)) + return (0); + + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254; len -= 255) + *op++ = 255; + *op++ = (BYTE)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); + + _next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref); + + /* Start Counting */ + ip += MINMATCH; + ref += MINMATCH; /* MinMatch verified */ + anchor = ip; + while (ip < matchlimit - (STEPSIZE - 1)) { + UARCH diff = AARCH(ref) ^ AARCH(ip); + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NbCommonBytes(diff); + goto _endCount; + } +#if LZ4_ARCH64 + if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } +#endif + if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < matchlimit) && (*ref == *ip)) + ip++; + _endCount: + + /* Encode MatchLength */ + len = (ip - anchor); + /* Check output limit */ + if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)) + return (0); + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509; len -= 510) { + *op++ = 255; + *op++ = 255; + } + if (len > 254) { 
+ len -= 255; + *op++ = 255; + } + *op++ = (BYTE)len; + } else + *token += len; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + /* Fill table */ + HashTable[LZ4_HASH64K_VALUE(ip - 2)] = ip - 2 - base; + + /* Test next position */ + ref = base + HashTable[LZ4_HASH64K_VALUE(ip)]; + HashTable[LZ4_HASH64K_VALUE(ip)] = ip - base; + if (A32(ref) == A32(ip)) { + token = op++; + *token = 0; + goto _next_match; + } + /* Prepare next loop */ + anchor = ip++; + forwardH = LZ4_HASH64K_VALUE(ip); + } + + _last_literals: + /* Encode Last Literals */ + { + int lastRun = iend - anchor; + if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) > + oend) + return (0); + if (lastRun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastRun -= RUN_MASK; + for (; lastRun > 254; lastRun -= 255) + *op++ = 255; + *op++ = (BYTE)lastRun; + } else + *op++ = (lastRun << ML_BITS); + (void) memcpy(op, anchor, iend - anchor); + op += iend - anchor; + } + + /* End */ + return (int)(((char *)op) - dest); +} + +static int +real_LZ4_compress(const char *source, char *dest, int isize, int osize) +{ + void *ctx; + int result; + + ctx = lz4_alloc(KM_SLEEP); + + /* + * out of kernel memory, gently fall through - this will disable + * compression in zio_compress_data + */ + if (ctx == NULL) + return (0); + + memset(ctx, 0, sizeof (struct refTables)); + + if (isize < LZ4_64KLIMIT) + result = LZ4_compress64kCtx(ctx, source, dest, isize, osize); + else + result = LZ4_compressCtx(ctx, source, dest, isize, osize); + + lz4_free(ctx); + return (result); +} + +#ifdef __FreeBSD__ +/* + * FreeBSD has 4, 8 and 16 KB malloc zones which can be used here. + * Should struct refTables get resized this may need to be revisited, hence + * compiler-time asserts. + */ +_Static_assert(sizeof(struct refTables) <= 16384, + "refTables too big for malloc"); +_Static_assert((sizeof(struct refTables) % 4096) == 0, + "refTables not a multiple of page size"); +#else +#define ZFS_LZ4_USE_CACHE +#endif + +#ifdef ZFS_LZ4_USE_CACHE +static kmem_cache_t *lz4_cache; +#endif + +#ifdef ZFS_LZ4_USE_CACHE +void +lz4_init(void) +{ + lz4_cache = kmem_cache_create("lz4_cache", + sizeof (struct refTables), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +lz4_fini(void) +{ + if (lz4_cache) { + kmem_cache_destroy(lz4_cache); + lz4_cache = NULL; + } +} + +static void * +lz4_alloc(int flags) +{ + ASSERT(lz4_cache != NULL); + return (kmem_cache_alloc(lz4_cache, flags)); +} + +static void +lz4_free(void *ctx) +{ + kmem_cache_free(lz4_cache, ctx); +} +#else +void +lz4_init(void) +{ +} + +void +lz4_fini(void) +{ +} + +static void * +lz4_alloc(int flags) +{ + return (kmem_alloc(sizeof (struct refTables), flags)); +} + +static void +lz4_free(void *ctx) +{ + kmem_free(ctx, sizeof (struct refTables)); +} +#endif diff --git a/sys/contrib/openzfs/module/zfs/lzjb.c b/sys/contrib/openzfs/module/zfs/lzjb.c index a478e64c5141..a24f17e0fe74 100644 --- a/sys/contrib/openzfs/module/zfs/lzjb.c +++ b/sys/contrib/openzfs/module/zfs/lzjb.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -45,10 +45,10 @@ #define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1) #define LEMPEL_SIZE 1024 -/*ARGSUSED*/ size_t lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { + (void) n; uchar_t *src = s_start; uchar_t *dst = d_start; uchar_t *cpy; @@ -100,10 +100,10 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) return (dst - (uchar_t *)d_start); } -/*ARGSUSED*/ int lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { + (void) s_len, (void) n; uchar_t *src = s_start; uchar_t *dst = d_start; uchar_t *d_end = (uchar_t *)d_start + d_len; diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c index d1fee70f004b..7170b5eefcea 100644 --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -40,23 +40,26 @@ #include <sys/zap.h> #include <sys/btree.h> -#define WITH_DF_BLOCK_ALLOCATOR - #define GANG_ALLOCATION(flags) \ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) /* * Metaslab granularity, in bytes. This is roughly similar to what would be * referred to as the "stripe size" in traditional RAID arrays. In normal - * operation, we will try to write this amount of data to a top-level vdev - * before moving on to the next one. + * operation, we will try to write this amount of data to each disk before + * moving on to the next top-level vdev. */ -unsigned long metaslab_aliquot = 512 << 10; +static uint64_t metaslab_aliquot = 1024 * 1024; /* * For testing, make some blocks above a certain size be gang blocks. */ -unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; +uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; + +/* + * Of blocks of size >= metaslab_force_ganging, actually gang them this often. + */ +uint_t metaslab_force_ganging_pct = 3; /* * In pools where the log space map feature is not enabled we touch @@ -81,7 +84,7 @@ int zfs_metaslab_sm_blksz_with_log = (1 << 17); * space map representation must be before we compact it on-disk. * Values should be greater than or equal to 100. */ -int zfs_condense_pct = 200; +uint_t zfs_condense_pct = 200; /* * Condensing a metaslab is not guaranteed to actually reduce the amount of @@ -96,7 +99,7 @@ int zfs_condense_pct = 200; * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold * blocks. */ -int zfs_metaslab_condense_block_threshold = 4; +static const int zfs_metaslab_condense_block_threshold = 4; /* * The zfs_mg_noalloc_threshold defines which metaslab groups should @@ -111,7 +114,7 @@ int zfs_metaslab_condense_block_threshold = 4; * eligible to allocate on any metaslab group. The default value of 0 means * no metaslab group will be excluded based on this criterion. */ -int zfs_mg_noalloc_threshold = 0; +static uint_t zfs_mg_noalloc_threshold = 0; /* * Metaslab groups are considered eligible for allocations if their @@ -135,7 +138,7 @@ int zfs_mg_noalloc_threshold = 0; * enough to avoid hitting the speed bump on pools that are being pushed * to the edge. 
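The metaslab_force_ganging_pct tunable added earlier in this hunk turns forced ganging into a probabilistic test: blocks at or above metaslab_force_ganging are ganged only about that percentage of the time, as the later change to metaslab_alloc_dva() shows. Below is a self-contained sketch of the decision; sketch_random_in_range() is a userspace stand-in for the kernel's random_in_range(), and the threshold is passed in rather than hard-coded.

#include <stdint.h>
#include <stdlib.h>

#define SKETCH_FORCE_GANGING_PCT	3u	/* matches the new default */

/* Stand-in for random_in_range(): returns a value in [0, range). */
static uint32_t
sketch_random_in_range(uint32_t range)
{
	return ((uint32_t)rand() % range);
}

/* Gang roughly SKETCH_FORCE_GANGING_PCT percent of qualifying allocations. */
static int
sketch_should_force_gang(uint64_t psize, uint64_t force_ganging_threshold)
{
	if (psize < force_ganging_threshold || SKETCH_FORCE_GANGING_PCT == 0)
		return (0);
	return (sketch_random_in_range(100) <
	    (SKETCH_FORCE_GANGING_PCT < 100 ? SKETCH_FORCE_GANGING_PCT : 100));
}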
*/ -int zfs_mg_fragmentation_threshold = 95; +static uint_t zfs_mg_fragmentation_threshold = 95; /* * Allow metaslabs to keep their active state as long as their fragmentation @@ -143,17 +146,17 @@ int zfs_mg_fragmentation_threshold = 95; * active metaslab that exceeds this threshold will no longer keep its active * status allowing better metaslabs to be selected. */ -int zfs_metaslab_fragmentation_threshold = 70; +static uint_t zfs_metaslab_fragmentation_threshold = 70; /* * When set will load all metaslabs when pool is first opened. */ -int metaslab_debug_load = 0; +int metaslab_debug_load = B_FALSE; /* * When set will prevent metaslabs from being unloaded. */ -int metaslab_debug_unload = 0; +static int metaslab_debug_unload = B_FALSE; /* * Minimum size which forces the dynamic allocator to change @@ -169,7 +172,7 @@ uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; * Once the space map's free space drops below this level we dynamically * switch to using best-fit allocations. */ -int metaslab_df_free_pct = 4; +uint_t metaslab_df_free_pct = 4; /* * Maximum distance to search forward from the last offset. Without this @@ -184,14 +187,14 @@ int metaslab_df_free_pct = 4; * With the default setting of 16MB this is 16*1024 (with ashift=9) or * 2048 (with ashift=12). */ -int metaslab_df_max_search = 16 * 1024 * 1024; +static uint_t metaslab_df_max_search = 16 * 1024 * 1024; /* * Forces the metaslab_block_picker function to search for at least this many * segments forwards until giving up on finding a segment that the allocation * will fit into. */ -uint32_t metaslab_min_search_count = 100; +static const uint32_t metaslab_min_search_count = 100; /* * If we are not searching forward (due to metaslab_df_max_search, @@ -200,12 +203,7 @@ uint32_t metaslab_min_search_count = 100; * segment. If it is not set, we will use a segment of exactly the requested * size (or larger). */ -int metaslab_df_use_largest_segment = B_FALSE; - -/* - * Percentage of all cpus that can be used by the metaslab taskq. - */ -int metaslab_load_pct = 50; +static int metaslab_df_use_largest_segment = B_FALSE; /* * These tunables control how long a metaslab will remain loaded after the @@ -215,56 +213,56 @@ int metaslab_load_pct = 50; * unloaded sooner. These settings are intended to be generous -- to keep * metaslabs loaded for a long time, reducing the rate of metaslab loading. */ -int metaslab_unload_delay = 32; -int metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */ +static uint_t metaslab_unload_delay = 32; +static uint_t metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */ /* * Max number of metaslabs per group to preload. */ -int metaslab_preload_limit = 10; +uint_t metaslab_preload_limit = 10; /* * Enable/disable preloading of metaslab. */ -int metaslab_preload_enabled = B_TRUE; +static int metaslab_preload_enabled = B_TRUE; /* * Enable/disable fragmentation weighting on metaslabs. */ -int metaslab_fragmentation_factor_enabled = B_TRUE; +static int metaslab_fragmentation_factor_enabled = B_TRUE; /* * Enable/disable lba weighting (i.e. outer tracks are given preference). */ -int metaslab_lba_weighting_enabled = B_TRUE; +static int metaslab_lba_weighting_enabled = B_TRUE; /* * Enable/disable metaslab group biasing. */ -int metaslab_bias_enabled = B_TRUE; +static int metaslab_bias_enabled = B_TRUE; /* * Enable/disable remapping of indirect DVAs to their concrete vdevs. 
*/ -boolean_t zfs_remap_blkptr_enable = B_TRUE; +static const boolean_t zfs_remap_blkptr_enable = B_TRUE; /* * Enable/disable segment-based metaslab selection. */ -int zfs_metaslab_segment_weight_enabled = B_TRUE; +static int zfs_metaslab_segment_weight_enabled = B_TRUE; /* * When using segment-based metaslab selection, we will continue * allocating from the active metaslab until we have exhausted * zfs_metaslab_switch_threshold of its buckets. */ -int zfs_metaslab_switch_threshold = 2; +static int zfs_metaslab_switch_threshold = 2; /* * Internal switch to enable/disable the metaslab allocation tracing * facility. */ -boolean_t metaslab_trace_enabled = B_FALSE; +static const boolean_t metaslab_trace_enabled = B_FALSE; /* * Maximum entries that the metaslab allocation tracing facility will keep @@ -274,32 +272,32 @@ boolean_t metaslab_trace_enabled = B_FALSE; * to every exceed this value. In debug mode, the system will panic if this * limit is ever reached allowing for further investigation. */ -uint64_t metaslab_trace_max_entries = 5000; +static const uint64_t metaslab_trace_max_entries = 5000; /* * Maximum number of metaslabs per group that can be disabled * simultaneously. */ -int max_disabled_ms = 3; +static const int max_disabled_ms = 3; /* * Time (in seconds) to respect ms_max_size when the metaslab is not loaded. * To avoid 64-bit overflow, don't set above UINT32_MAX. */ -unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */ +static uint64_t zfs_metaslab_max_size_cache_sec = 1 * 60 * 60; /* 1 hour */ /* * Maximum percentage of memory to use on storing loaded metaslabs. If loading * a metaslab would take it over this percentage, the oldest selected metaslab * is automatically unloaded. */ -int zfs_metaslab_mem_limit = 25; +static uint_t zfs_metaslab_mem_limit = 25; /* * Force the per-metaslab range trees to use 64-bit integers to store * segments. Used for debugging purposes. */ -boolean_t zfs_metaslab_force_large_segs = B_FALSE; +static const boolean_t zfs_metaslab_force_large_segs = B_FALSE; /* * By default we only store segments over a certain size in the size-sorted @@ -308,7 +306,7 @@ boolean_t zfs_metaslab_force_large_segs = B_FALSE; * improves load and unload times at the cost of causing us to use slightly * larger segments than we would otherwise in some cases. */ -uint32_t metaslab_by_size_min_shift = 14; +static const uint32_t metaslab_by_size_min_shift = 14; /* * If not set, we will first try normal allocation. If that fails then @@ -321,7 +319,7 @@ uint32_t metaslab_by_size_min_shift = 14; * allocation. If that fails we will do a "try hard" gang allocation. If * that fails then we will have a multi-layer gang block. */ -int zfs_metaslab_try_hard_before_gang = B_FALSE; +static int zfs_metaslab_try_hard_before_gang = B_FALSE; /* * When not trying hard, we only consider the best zfs_metaslab_find_max_tries @@ -337,7 +335,7 @@ int zfs_metaslab_try_hard_before_gang = B_FALSE; * subsequent metaslab has ms_max_size >60KB (but fewer segments in this * bucket, and therefore a lower weight). 
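To make the zfs_metaslab_find_max_tries cutoff just described concrete, here is a deliberately simplified model: walk a weight-sorted candidate list, stop after max_tries candidates unless try_hard is set, and take the first entry whose cached maximum free-segment size fits the request. The struct and function names are invented for the sketch; the real selection is done around find_valid_metaslab() further down in this file.

#include <stdint.h>

typedef struct sketch_ms {
	uint64_t	ms_max_size;	/* largest known free segment */
} sketch_ms_t;

/* Returns an index into the candidate array, or -1 if nothing fit. */
static int
sketch_pick_metaslab(const sketch_ms_t *cand, int ncand, uint64_t asize,
    int try_hard, unsigned int max_tries)
{
	unsigned int tries = 0;

	for (int i = 0; i < ncand; i++, tries++) {
		if (!try_hard && tries > max_tries)
			break;		/* give up; caller may retry harder */
		if (cand[i].ms_max_size >= asize)
			return (i);
	}
	return (-1);
}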
*/ -int zfs_metaslab_find_max_tries = 100; +static uint_t zfs_metaslab_find_max_tries = 100; static uint64_t metaslab_weight(metaslab_t *, boolean_t); static void metaslab_set_fragmentation(metaslab_t *, boolean_t); @@ -370,7 +368,7 @@ static metaslab_stats_t metaslab_stats = { atomic_inc_64(&metaslab_stats.stat.value.ui64); -kstat_t *metaslab_ksp; +static kstat_t *metaslab_ksp; void metaslab_stat_init(void) @@ -406,7 +404,7 @@ metaslab_stat_fini(void) * ========================================================================== */ metaslab_class_t * -metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) +metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops) { metaslab_class_t *mc; @@ -629,8 +627,8 @@ metaslab_class_expandable_space(metaslab_class_t *mc) * metaslabs. We report the expandable space in terms * of the metaslab size since that's the unit of expansion. */ - space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize, - 1ULL << tvd->vdev_ms_shift); + space += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize, + 1ULL << tvd->vdev_ms_shift, uint64_t); } spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (space); @@ -640,8 +638,9 @@ void metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) { multilist_t *ml = &mc->mc_metaslab_txg_list; + hrtime_t now = gethrtime(); for (int i = 0; i < multilist_get_num_sublists(ml); i++) { - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL) { @@ -658,13 +657,15 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) i--; break; } - mls = multilist_sublist_lock(ml, i); + mls = multilist_sublist_lock_idx(ml, i); metaslab_t *next_msp = multilist_sublist_next(mls, msp); multilist_sublist_unlock(mls); if (txg > msp->ms_selected_txg + metaslab_unload_delay && - gethrtime() > msp->ms_selected_time + - (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) { + now > msp->ms_selected_time + + MSEC2NSEC(metaslab_unload_delay_ms) && + (msp->ms_allocator == -1 || + !metaslab_preload_enabled)) { metaslab_evict(msp, txg); } else { /* @@ -851,9 +852,6 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth); } - mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, - maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC); - return (mg); } @@ -869,7 +867,6 @@ metaslab_group_destroy(metaslab_group_t *mg) */ ASSERT(mg->mg_activation_count <= 0); - taskq_destroy(mg->mg_taskq); avl_destroy(&mg->mg_metaslab_tree); mutex_destroy(&mg->mg_lock); mutex_destroy(&mg->mg_ms_disabled_lock); @@ -899,7 +896,8 @@ metaslab_group_activate(metaslab_group_t *mg) if (++mg->mg_activation_count <= 0) return; - mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); + mg->mg_aliquot = metaslab_aliquot * MAX(1, + vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd)); metaslab_group_alloc_update(mg); if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) { @@ -959,7 +957,7 @@ metaslab_group_passivate(metaslab_group_t *mg) * allocations from taking place and any changes to the vdev tree. 
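One consequence of the aliquot changes in this hunk: metaslab_aliquot is now expressed per data disk, so metaslab_group_activate() scales it by the number of non-parity disks in the top-level vdev rather than by the child count. A small worked sketch, with SKETCH_* names standing in for the real tunables:

#include <stdint.h>

#define SKETCH_METASLAB_ALIQUOT	(1024u * 1024u)		/* new 1 MiB default */
#define SKETCH_MAX(a, b)	((a) > (b) ? (a) : (b))

/* Bytes aimed at one top-level vdev before rotoring to the next. */
static uint64_t
sketch_group_aliquot(uint64_t ndisks, uint64_t nparity)
{
	return (SKETCH_METASLAB_ALIQUOT *
	    SKETCH_MAX((uint64_t)1, ndisks - nparity));
}

For example, a 10-disk raidz2 top-level vdev has 8 data disks, so the group aliquot is 8 MiB, roughly 1 MiB of data per disk per pass.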
*/ spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); - taskq_wait_outstanding(mg->mg_taskq, 0); + taskq_wait_outstanding(spa->spa_metaslab_taskq, 0); spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); for (int i = 0; i < mg->mg_allocators; i++) { @@ -1222,7 +1220,7 @@ metaslab_group_fragmentation(metaslab_group_t *mg) */ static boolean_t metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, - uint64_t psize, int allocator, int d) + int flags, uint64_t psize, int allocator, int d) { spa_t *spa = mg->mg_vd->vdev_spa; metaslab_class_t *mc = mg->mg_class; @@ -1267,6 +1265,15 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, return (B_FALSE); /* + * Some allocations (e.g., those coming from device removal + * where the * allocations are not even counted in the + * metaslab * allocation queues) are allowed to bypass + * the throttle. + */ + if (flags & METASLAB_DONT_THROTTLE) + return (B_TRUE); + + /* * Relax allocation throttling for ditto blocks. Due to * random imbalances in allocation it tends to push copies * to one vdev, that looks a bit better at the moment. @@ -1277,7 +1284,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, /* * If this metaslab group is below its qmax or it's - * the only allocatable metasable group, then attempt + * the only allocatable metaslab group, then attempt * to allocate from it. */ if (qdepth < qmax || mc->mc_alloc_groups == 1) @@ -1332,6 +1339,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, * Comparison function for the private size-ordered tree using 32-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. */ +__attribute__((always_inline)) inline static int metaslab_rangesize32_compare(const void *x1, const void *x2) { @@ -1342,16 +1350,15 @@ metaslab_rangesize32_compare(const void *x1, const void *x2) uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); - if (likely(cmp)) - return (cmp); - return (TREE_CMP(r1->rs_start, r2->rs_start)); + return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } /* * Comparison function for the private size-ordered tree using 64-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. */ +__attribute__((always_inline)) inline static int metaslab_rangesize64_compare(const void *x1, const void *x2) { @@ -1362,11 +1369,10 @@ metaslab_rangesize64_compare(const void *x1, const void *x2) uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); - if (likely(cmp)) - return (cmp); - return (TREE_CMP(r1->rs_start, r2->rs_start)); + return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } + typedef struct metaslab_rt_arg { zfs_btree_t *mra_bt; uint32_t mra_floor_shift; @@ -1402,11 +1408,17 @@ metaslab_size_tree_full_load(range_tree_t *rt) range_tree_walk(rt, metaslab_size_sorted_add, &arg); } + +ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf, + range_seg32_t, metaslab_rangesize32_compare) + +ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf, + range_seg64_t, metaslab_rangesize64_compare) + /* * Create any block allocator specific components. The current allocators * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 
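The comparator hunks above replace the early-return tie-breaker with the branchless form cmp + !cmp * TREE_CMP(start1, start2), which works because the three-way compare yields exactly -1, 0 or 1. A standalone illustration of the equivalence; sketch_cmp3() plays the role of TREE_CMP() and the types are simplified.

#include <stdint.h>

/* Three-way compare returning -1, 0 or 1. */
static int
sketch_cmp3(uint64_t a, uint64_t b)
{
	return ((a > b) - (a < b));
}

typedef struct sketch_seg {
	uint64_t start;
	uint64_t end;
} sketch_seg_t;

/* Sort by size first, then by start offset, without a conditional branch. */
static int
sketch_rangesize_compare(const sketch_seg_t *s1, const sketch_seg_t *s2)
{
	int cmp = sketch_cmp3(s1->end - s1->start, s2->end - s2->start);

	/* !cmp is 1 only on a size tie, so exactly one term contributes. */
	return (cmp + !cmp * sketch_cmp3(s1->start, s2->start));
}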
*/ -/* ARGSUSED */ static void metaslab_rt_create(range_tree_t *rt, void *arg) { @@ -1415,26 +1427,29 @@ metaslab_rt_create(range_tree_t *rt, void *arg) size_t size; int (*compare) (const void *, const void *); + bt_find_in_buf_f bt_find; switch (rt->rt_type) { case RANGE_SEG32: size = sizeof (range_seg32_t); compare = metaslab_rangesize32_compare; + bt_find = metaslab_rt_find_rangesize32_in_buf; break; case RANGE_SEG64: size = sizeof (range_seg64_t); compare = metaslab_rangesize64_compare; + bt_find = metaslab_rt_find_rangesize64_in_buf; break; default: panic("Invalid range seg type %d", rt->rt_type); } - zfs_btree_create(size_tree, compare, size); + zfs_btree_create(size_tree, compare, bt_find, size); mrap->mra_floor_shift = metaslab_by_size_min_shift; } -/* ARGSUSED */ static void metaslab_rt_destroy(range_tree_t *rt, void *arg) { + (void) rt; metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; @@ -1442,7 +1457,6 @@ metaslab_rt_destroy(range_tree_t *rt, void *arg) kmem_free(mrap, sizeof (*mrap)); } -/* ARGSUSED */ static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) { @@ -1450,27 +1464,25 @@ metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) zfs_btree_t *size_tree = mrap->mra_bt; if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < - (1 << mrap->mra_floor_shift)) + (1ULL << mrap->mra_floor_shift)) return; zfs_btree_add(size_tree, rs); } -/* ARGSUSED */ static void metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; - if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 << + if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1ULL << mrap->mra_floor_shift)) return; zfs_btree_remove(size_tree, rs); } -/* ARGSUSED */ static void metaslab_rt_vacate(range_tree_t *rt, void *arg) { @@ -1482,7 +1494,7 @@ metaslab_rt_vacate(range_tree_t *rt, void *arg) metaslab_rt_create(rt, arg); } -static range_tree_ops_t metaslab_rt_ops = { +static const range_tree_ops_t metaslab_rt_ops = { .rtop_create = metaslab_rt_create, .rtop_destroy = metaslab_rt_destroy, .rtop_add = metaslab_rt_add, @@ -1602,9 +1614,6 @@ metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, return (rs); } -#if defined(WITH_DF_BLOCK_ALLOCATOR) || \ - defined(WITH_CF_BLOCK_ALLOCATOR) - /* * This is a helper function that can be used by the allocator to find a * suitable block to allocate. 
This will search the specified B-tree looking @@ -1639,9 +1648,74 @@ metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size, *cursor = 0; return (-1ULL); } -#endif /* WITH_DF/CF_BLOCK_ALLOCATOR */ -#if defined(WITH_DF_BLOCK_ALLOCATOR) +static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size); +static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size); +static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size); +metaslab_ops_t *metaslab_allocator(spa_t *spa); + +static metaslab_ops_t metaslab_allocators[] = { + { "dynamic", metaslab_df_alloc }, + { "cursor", metaslab_cf_alloc }, + { "new-dynamic", metaslab_ndf_alloc }, +}; + +static int +spa_find_allocator_byname(const char *val) +{ + int a = ARRAY_SIZE(metaslab_allocators) - 1; + if (strcmp("new-dynamic", val) == 0) + return (-1); /* remove when ndf is working */ + for (; a >= 0; a--) { + if (strcmp(val, metaslab_allocators[a].msop_name) == 0) + return (a); + } + return (-1); +} + +void +spa_set_allocator(spa_t *spa, const char *allocator) +{ + int a = spa_find_allocator_byname(allocator); + if (a < 0) a = 0; + spa->spa_active_allocator = a; + zfs_dbgmsg("spa allocator: %s\n", metaslab_allocators[a].msop_name); +} + +int +spa_get_allocator(spa_t *spa) +{ + return (spa->spa_active_allocator); +} + +#if defined(_KERNEL) +int +param_set_active_allocator_common(const char *val) +{ + char *p; + + if (val == NULL) + return (SET_ERROR(EINVAL)); + + if ((p = strchr(val, '\n')) != NULL) + *p = '\0'; + + int a = spa_find_allocator_byname(val); + if (a < 0) + return (SET_ERROR(EINVAL)); + + zfs_active_allocator = metaslab_allocators[a].msop_name; + return (0); +} +#endif + +metaslab_ops_t * +metaslab_allocator(spa_t *spa) +{ + int allocator = spa_get_allocator(spa); + return (&metaslab_allocators[allocator]); +} + /* * ========================================================================== * Dynamic Fit (df) block allocator @@ -1675,7 +1749,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; range_tree_t *rt = msp->ms_allocatable; - int free_pct = range_tree_space(rt) * 100 / msp->ms_size; + uint_t free_pct = range_tree_space(rt) * 100 / msp->ms_size; uint64_t offset; ASSERT(MUTEX_HELD(&msp->ms_lock)); @@ -1716,14 +1790,6 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) return (offset); } -static metaslab_ops_t metaslab_df_ops = { - metaslab_df_alloc -}; - -metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; -#endif /* WITH_DF_BLOCK_ALLOCATOR */ - -#if defined(WITH_CF_BLOCK_ALLOCATOR) /* * ========================================================================== * Cursor fit block allocator - @@ -1766,14 +1832,6 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size) return (offset); } -static metaslab_ops_t metaslab_cf_ops = { - metaslab_cf_alloc -}; - -metaslab_ops_t *zfs_metaslab_ops = &metaslab_cf_ops; -#endif /* WITH_CF_BLOCK_ALLOCATOR */ - -#if defined(WITH_NDF_BLOCK_ALLOCATOR) /* * ========================================================================== * New dynamic fit allocator - @@ -1830,14 +1888,6 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) return (-1ULL); } -static metaslab_ops_t metaslab_ndf_ops = { - metaslab_ndf_alloc -}; - -metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; -#endif /* WITH_NDF_BLOCK_ALLOCATOR */ - - /* * ========================================================================== * Metaslabs @@ -1962,9 +2012,9 @@ metaslab_aux_histograms_clear(metaslab_t *msp) */ 
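The allocator table introduced above replaces the old WITH_*_BLOCK_ALLOCATOR compile-time selection with a runtime lookup by name. A minimal standalone version of the pattern follows; the entries mirror the names in metaslab_allocators[] and the fall-back-to-first-entry policy of spa_set_allocator(), but everything here is illustrative rather than the actual kernel interface.

#include <string.h>

typedef struct sketch_alloc_ops {
	const char	*name;	/* the allocation callback would live here too */
} sketch_alloc_ops_t;

static const sketch_alloc_ops_t sketch_allocators[] = {
	{ "dynamic" },
	{ "cursor" },
	{ "new-dynamic" },
};

#define SKETCH_NALLOC \
	(sizeof (sketch_allocators) / sizeof (sketch_allocators[0]))

/* Return the table index for a name, or -1 if it is unknown. */
static int
sketch_find_allocator_byname(const char *val)
{
	for (int a = (int)SKETCH_NALLOC - 1; a >= 0; a--) {
		if (strcmp(val, sketch_allocators[a].name) == 0)
			return (a);
	}
	return (-1);
}

/* Unknown names fall back to index 0 ("dynamic"). */
static int
sketch_set_allocator(const char *name)
{
	int a = sketch_find_allocator_byname(name);
	return (a < 0 ? 0 : a);
}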
ASSERT(msp->ms_loaded); - bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); + memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); for (int t = 0; t < TXG_DEFER_SIZE; t++) - bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t])); + memset(msp->ms_deferhist[t], 0, sizeof (msp->ms_deferhist[t])); } static void @@ -2054,13 +2104,13 @@ metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) */ uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; if (defer_allowed) { - bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index], + memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist, sizeof (msp->ms_synchist)); } else { - bzero(msp->ms_deferhist[hist_index], + memset(msp->ms_deferhist[hist_index], 0, sizeof (msp->ms_deferhist[hist_index])); } - bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); + memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); } /* @@ -2178,19 +2228,19 @@ metaslab_potentially_evict(metaslab_class_t *mc) uint64_t allmem = arc_all_memory(); uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); uint64_t size = spl_kmem_cache_entry_size(zfs_btree_leaf_cache); - int tries = 0; + uint_t tries = 0; for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size && tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2; tries++) { unsigned int idx = multilist_get_random_index( &mc->mc_metaslab_txg_list); multilist_sublist_t *mls = - multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx); + multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 < inuse * size) { - VERIFY3P(mls, ==, multilist_sublist_lock( + VERIFY3P(mls, ==, multilist_sublist_lock_idx( &mc->mc_metaslab_txg_list, idx)); ASSERT3U(idx, ==, metaslab_idx_func(&mc->mc_metaslab_txg_list, msp)); @@ -2240,6 +2290,8 @@ metaslab_potentially_evict(metaslab_class_t *mc) inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); } } +#else + (void) mc, (void) zfs_metaslab_mem_limit; #endif } @@ -2758,7 +2810,8 @@ metaslab_fini_flush_data(metaslab_t *msp) mutex_exit(&spa->spa_flushed_ms_lock); spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); - spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp)); + spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp), + metaslab_unflushed_dirty(msp)); } uint64_t @@ -2857,7 +2910,7 @@ metaslab_fini(metaslab_t *msp) * of the table. Since the fragmentation value is never stored on disk, it * is possible to change these calculations in the future. */ -int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { +static const int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 100, /* 512B */ 100, /* 1K */ 98, /* 2K */ @@ -3192,6 +3245,15 @@ static boolean_t metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) { /* + * This case will usually but not always get caught by the checks below; + * metaslabs can be loaded by various means, including the trim and + * initialize code. Once that happens, without this check they are + * allocatable even before they finish their first txg sync. + */ + if (unlikely(msp->ms_new)) + return (B_FALSE); + + /* * If the metaslab is loaded, ms_max_size is definitive and we can use * the fast check. 
If it's not, the ms_max_size is a lower bound (once * set), and we should use the fast check as long as we're not in @@ -3503,10 +3565,8 @@ metaslab_group_preload(metaslab_group_t *mg) avl_tree_t *t = &mg->mg_metaslab_tree; int m = 0; - if (spa_shutting_down(spa) || !metaslab_preload_enabled) { - taskq_wait_outstanding(mg->mg_taskq, 0); + if (spa_shutting_down(spa) || !metaslab_preload_enabled) return; - } mutex_enter(&mg->mg_lock); @@ -3526,8 +3586,9 @@ metaslab_group_preload(metaslab_group_t *mg) continue; } - VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, - msp, TQ_SLEEP) != TASKQID_INVALID); + VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload, + msp, TQ_SLEEP | (m <= mg->mg_allocators ? TQ_FRONT : 0)) + != TASKQID_INVALID); } mutex_exit(&mg->mg_lock); } @@ -3558,7 +3619,7 @@ metaslab_should_condense(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; vdev_t *vd = msp->ms_group->mg_vd; - uint64_t vdev_blocksize = 1 << vd->vdev_ashift; + uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); @@ -3736,50 +3797,45 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) metaslab_flush_update(msp, tx); } -/* - * Called when the metaslab has been flushed (its own spacemap now reflects - * all the contents of the pool-wide spacemap log). Updates the metaslab's - * metadata and any pool-wide related log space map data (e.g. summary, - * obsolete logs, etc..) to reflect that. - */ static void -metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) +metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx) { - metaslab_group_t *mg = msp->ms_group; - spa_t *spa = mg->mg_vd->vdev_spa; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - ASSERT3U(spa_sync_pass(spa), ==, 1); + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + ASSERT(spa_syncing_log_sm(spa) != NULL); + ASSERT(msp->ms_sm != NULL); ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); - /* - * Just because a metaslab got flushed, that doesn't mean that - * it will pass through metaslab_sync_done(). Thus, make sure to - * update ms_synced_length here in case it doesn't. - */ - msp->ms_synced_length = space_map_length(msp->ms_sm); + mutex_enter(&spa->spa_flushed_ms_lock); + metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + metaslab_set_unflushed_dirty(msp, B_TRUE); + avl_add(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); - /* - * We may end up here from metaslab_condense() without the - * feature being active. In that case this is a no-op. 
- */ - if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) - return; + spa_log_sm_increment_current_mscount(spa); + spa_log_summary_add_flushed_metaslab(spa, B_TRUE); +} +void +metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(spa_syncing_log_sm(spa) != NULL); ASSERT(msp->ms_sm != NULL); ASSERT(metaslab_unflushed_txg(msp) != 0); ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); /* update metaslab's position in our flushing tree */ uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); + boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp); mutex_enter(&spa->spa_flushed_ms_lock); avl_remove(&spa->spa_metaslabs_by_flushed, msp); metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + metaslab_set_unflushed_dirty(msp, dirty); avl_add(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); @@ -3787,17 +3843,47 @@ metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); spa_log_sm_increment_current_mscount(spa); + /* update log space map summary */ + spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg, + ms_prev_flushed_dirty); + spa_log_summary_add_flushed_metaslab(spa, dirty); + /* cleanup obsolete logs if any */ - uint64_t log_blocks_before = spa_log_sm_nblocks(spa); spa_cleanup_old_sm_logs(spa, tx); - uint64_t log_blocks_after = spa_log_sm_nblocks(spa); - VERIFY3U(log_blocks_after, <=, log_blocks_before); +} - /* update log space map summary */ - uint64_t blocks_gone = log_blocks_before - log_blocks_after; - spa_log_summary_add_flushed_metaslab(spa); - spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg); - spa_log_summary_decrement_blkcount(spa, blocks_gone); +/* + * Called when the metaslab has been flushed (its own spacemap now reflects + * all the contents of the pool-wide spacemap log). Updates the metaslab's + * metadata and any pool-wide related log space map data (e.g. summary, + * obsolete logs, etc..) to reflect that. + */ +static void +metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) +{ + metaslab_group_t *mg = msp->ms_group; + spa_t *spa = mg->mg_vd->vdev_spa; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + ASSERT3U(spa_sync_pass(spa), ==, 1); + + /* + * Just because a metaslab got flushed, that doesn't mean that + * it will pass through metaslab_sync_done(). Thus, make sure to + * update ms_synced_length here in case it doesn't. + */ + msp->ms_synced_length = space_map_length(msp->ms_sm); + + /* + * We may end up here from metaslab_condense() without the + * feature being active. In that case this is a no-op. 
+ */ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) || + metaslab_unflushed_txg(msp) == 0) + return; + + metaslab_unflushed_bump(msp, tx, B_FALSE); } boolean_t @@ -4013,23 +4099,6 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) ASSERT0(metaslab_allocated_space(msp)); } - if (metaslab_unflushed_txg(msp) == 0 && - spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { - ASSERT(spa_syncing_log_sm(spa) != NULL); - - metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); - spa_log_sm_increment_current_mscount(spa); - spa_log_summary_add_flushed_metaslab(spa); - - ASSERT(msp->ms_sm != NULL); - mutex_enter(&spa->spa_flushed_ms_lock); - avl_add(&spa->spa_metaslabs_by_flushed, msp); - mutex_exit(&spa->spa_flushed_ms_lock); - - ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); - ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); - } - if (!range_tree_is_empty(msp->ms_checkpointing) && vd->vdev_checkpoint_sm == NULL) { ASSERT(spa_has_checkpoint(spa)); @@ -4077,6 +4146,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) space_map_t *log_sm = spa_syncing_log_sm(spa); if (log_sm != NULL) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); + if (metaslab_unflushed_txg(msp) == 0) + metaslab_unflushed_add(msp, tx); + else if (!metaslab_unflushed_dirty(msp)) + metaslab_unflushed_bump(msp, tx, B_TRUE); space_map_write(log_sm, alloctree, SM_ALLOC, vd->vdev_id, tx); @@ -4272,7 +4345,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - metaslab_class_get_alloc(spa_normal_class(spa)); - if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { + if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing || + vd->vdev_rz_expanding) { defer_allowed = B_FALSE; } @@ -4502,8 +4576,8 @@ metaslab_trace_fini(zio_alloc_list_t *zal) */ static void -metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, - int allocator) +metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, const void *tag, + int flags, int allocator) { if (!(flags & METASLAB_ASYNC_ALLOC) || (flags & METASLAB_DONT_THROTTLE)) @@ -4536,8 +4610,8 @@ metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) } void -metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, - int allocator, boolean_t io_complete) +metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, const void *tag, + int flags, int allocator, boolean_t io_complete) { if (!(flags & METASLAB_ASYNC_ALLOC) || (flags & METASLAB_DONT_THROTTLE)) @@ -4554,7 +4628,7 @@ metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, } void -metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, +metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, const void *tag, int allocator) { #ifdef ZFS_DEBUG @@ -4580,6 +4654,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) ASSERT(MUTEX_HELD(&msp->ms_lock)); VERIFY(!msp->ms_condensing); VERIFY0(msp->ms_disabled); + VERIFY0(msp->ms_new); start = mc->mc_ops->msop_alloc(msp, size); if (start != -1ULL) { @@ -4634,7 +4709,7 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, if (msp == NULL) msp = avl_nearest(t, idx, AVL_AFTER); - int tries = 0; + uint_t tries = 0; for (; msp != NULL; msp = AVL_NEXT(t, msp)) { int i; @@ -4651,10 +4726,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, } /* - * If the selected metaslab is condensing or disabled, - * skip it. 
+ * If the selected metaslab is condensing or disabled, or + * hasn't gone through a metaslab_sync_done(), then skip it. */ - if (msp->ms_condensing || msp->ms_disabled > 0) + if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new) continue; *was_active = msp->ms_allocator != -1; @@ -4721,7 +4796,6 @@ metaslab_active_mask_verify(metaslab_t *msp) } } -/* ARGSUSED */ static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, @@ -5034,7 +5108,6 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, int allocator, boolean_t try_hard) { uint64_t offset; - ASSERT(mg->mg_initialized); offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, dva, d, allocator, try_hard); @@ -5073,7 +5146,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, zio_alloc_list_t *zal, int allocator) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; - metaslab_group_t *mg, *fast_mg, *rotor; + metaslab_group_t *mg, *rotor; vdev_t *vd; boolean_t try_hard = B_FALSE; @@ -5086,7 +5159,9 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * damage can result in extremely long reconstruction times. This * will also test spilling from special to normal. */ - if (psize >= metaslab_force_ganging && (random_in_range(100) < 3)) { + if (psize >= metaslab_force_ganging && + metaslab_force_ganging_pct > 0 && + (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) { metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, allocator); return (SET_ERROR(ENOSPC)); @@ -5126,8 +5201,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, if (vd != NULL && vd->vdev_mg != NULL) { mg = vdev_get_mg(vd, mc); - if (flags & METASLAB_HINTBP_AVOID && - mg->mg_next != NULL) + if (flags & METASLAB_HINTBP_AVOID) mg = mg->mg_next; } else { mg = mca->mca_rotor; @@ -5135,15 +5209,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; - } else if (flags & METASLAB_FASTWRITE) { - mg = fast_mg = mca->mca_rotor; - - do { - if (fast_mg->mg_vd->vdev_pending_fastwrite < - mg->mg_vd->vdev_pending_fastwrite) - mg = fast_mg; - } while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor); - } else { ASSERT(mca->mca_rotor != NULL); mg = mca->mca_rotor; @@ -5184,7 +5249,7 @@ top: */ if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { allocatable = metaslab_group_allocatable(mg, rotor, - psize, allocator, d); + flags, psize, allocator, d); } if (!allocatable) { @@ -5193,15 +5258,12 @@ top: goto next; } - ASSERT(mg->mg_initialized); - /* - * Avoid writing single-copy data to a failing, + * Avoid writing single-copy data to an unhealthy, * non-redundant vdev, unless we've already tried all * other vdevs. 
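Aside, not part of the patch: the metaslab_alloc_dva() hunk above replaces the hard-coded 3% gang-forcing roll with the new metaslab_force_ganging_pct tunable, and skips the roll entirely when the percentage is 0. A minimal user-space sketch of the gate follows; random_in_range() is stubbed with rand(), and the two tunable values are examples rather than the shipped defaults.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint64_t metaslab_force_ganging = 16ULL * 1024 * 1024;	/* example */
static unsigned int metaslab_force_ganging_pct = 3;		/* example */

static unsigned int
random_in_range(unsigned int n)
{
	return ((unsigned int)rand() % n);	/* stand-in for the kernel RNG */
}

static int
force_gang(uint64_t psize)
{
	unsigned int pct = metaslab_force_ganging_pct;

	if (psize < metaslab_force_ganging || pct == 0)
		return (0);
	return (random_in_range(100) < (pct < 100 ? pct : 100));
}

int
main(void)
{
	int forced = 0;

	for (int i = 0; i < 100000; i++)
		forced += force_gang(32ULL * 1024 * 1024);
	/* With pct = 3, roughly 3% of qualifying allocations are forced. */
	printf("forced %d of 100000 allocations to gang\n", forced);
	return (0);
}

Setting metaslab_force_ganging_pct to 0 disables forced ganging without having to raise metaslab_force_ganging itself.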
*/ - if ((vd->vdev_stat.vs_write_errors > 0 || - vd->vdev_state < VDEV_STATE_HEALTHY) && + if (vd->vdev_state < VDEV_STATE_HEALTHY && d == 0 && !try_hard && vd->vdev_children == 0) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_VDEV_ERROR, allocator); @@ -5210,7 +5272,7 @@ top: ASSERT(mg->mg_class == mc); - uint64_t asize = vdev_psize_to_asize(vd, psize); + uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); /* @@ -5269,7 +5331,7 @@ top: mg->mg_bias = 0; } - if ((flags & METASLAB_FASTWRITE) || + if ((flags & METASLAB_ZIL) || atomic_add_64_nv(&mca->mca_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { mca->mca_rotor = mg->mg_next; @@ -5282,11 +5344,6 @@ top: ((flags & METASLAB_GANG_HEADER) ? 1 : 0)); DVA_SET_ASIZE(&dva[d], asize); - if (flags & METASLAB_FASTWRITE) { - atomic_add_64(&vd->vdev_pending_fastwrite, - psize); - } - return (0); } next: @@ -5305,7 +5362,7 @@ next: goto top; } - bzero(&dva[d], sizeof (dva_t)); + memset(&dva[d], 0, sizeof (dva_t)); metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); return (SET_ERROR(ENOSPC)); @@ -5347,11 +5404,11 @@ metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, mutex_exit(&msp->ms_lock); } -/* ARGSUSED */ void metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { + (void) inner_offset; boolean_t *checkpoint = arg; ASSERT3P(checkpoint, !=, NULL); @@ -5441,8 +5498,9 @@ remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, DVA_GET_VDEV(&bp->blk_dva[0])); vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; - bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, + uint64_t physical_birth = vdev_indirect_births_physbirth(vib, DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); + BP_SET_PHYSICAL_BIRTH(bp, physical_birth); DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); DVA_SET_OFFSET(&bp->blk_dva[0], offset); @@ -5629,8 +5687,7 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, * We reserve the slots individually so that we can unreserve * them individually when an I/O completes. */ - for (int d = 0; d < slots; d++) - zfs_refcount_add(&mca->mca_alloc_slots, zio); + zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio); zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; return (B_TRUE); } @@ -5644,8 +5701,7 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; ASSERT(mc->mc_alloc_throttle_enabled); - for (int d = 0; d < slots; d++) - zfs_refcount_remove(&mca->mca_alloc_slots, zio); + zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio); } static int @@ -5717,11 +5773,11 @@ typedef struct metaslab_claim_cb_arg_t { int mcca_error; } metaslab_claim_cb_arg_t; -/* ARGSUSED */ static void metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { + (void) inner_offset; metaslab_claim_cb_arg_t *mcca_arg = arg; if (mcca_arg->mcca_error == 0) { @@ -5793,8 +5849,8 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, dva_t *hintdva = (hintbp != NULL) ? 
hintbp->blk_dva : NULL; int error = 0; - ASSERT(bp->blk_birth == 0); - ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); + ASSERT0(BP_GET_LOGICAL_BIRTH(bp)); + ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); @@ -5818,7 +5874,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, metaslab_group_alloc_decrement(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator, B_FALSE); - bzero(&dva[d], sizeof (dva_t)); + memset(&dva[d], 0, sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); return (error); @@ -5848,7 +5904,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) int ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); + ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa)); /* * If we have a checkpoint for the pool we need to make sure that @@ -5866,7 +5922,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) * normally as they will be referenced by the checkpointed uberblock. */ boolean_t checkpoint = B_FALSE; - if (bp->blk_birth <= spa->spa_checkpoint_txg && + if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg && spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { /* * At this point, if the block is part of the checkpoint @@ -5924,60 +5980,12 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) return (error); } -void -metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) -{ - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - uint64_t psize = BP_GET_PSIZE(bp); - int d; - vdev_t *vd; - - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(psize > 0); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - - for (d = 0; d < ndvas; d++) { - if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) - continue; - atomic_add_64(&vd->vdev_pending_fastwrite, psize); - } - - spa_config_exit(spa, SCL_VDEV, FTAG); -} - -void -metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) -{ - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - uint64_t psize = BP_GET_PSIZE(bp); - int d; - vdev_t *vd; - - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(psize > 0); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - - for (d = 0; d < ndvas; d++) { - if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) - continue; - ASSERT3U(vd->vdev_pending_fastwrite, >=, psize); - atomic_sub_64(&vd->vdev_pending_fastwrite, psize); - } - - spa_config_exit(spa, SCL_VDEV, FTAG); -} - -/* ARGSUSED */ static void metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { + (void) inner, (void) arg; + if (vd->vdev_ops == &vdev_indirect_ops) return; @@ -6139,6 +6147,12 @@ metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload) mutex_exit(&mg->mg_ms_disabled_lock); } +void +metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty) +{ + ms->ms_unflushed_dirty = dirty; +} + static void metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) { @@ -6175,22 +6189,23 @@ metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) void metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) { - spa_t *spa = ms->ms_group->mg_vd->vdev_spa; - - if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) - return; - ms->ms_unflushed_txg = txg; metaslab_update_ondisk_flush_data(ms, tx); } +boolean_t +metaslab_unflushed_dirty(metaslab_t *ms) +{ + return (ms->ms_unflushed_dirty); +} + uint64_t 
metaslab_unflushed_txg(metaslab_t *ms) { return (ms->ms_unflushed_txg); } -ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, U64, ZMOD_RW, "Allocation granularity (a.k.a. stripe size)"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW, @@ -6202,29 +6217,33 @@ ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW, "Preload potential metaslabs during reassessment"); -ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW, + "Max number of metaslabs per group to preload"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW, "Delay in txgs after metaslab was last used before unloading"); -ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, UINT, ZMOD_RW, "Delay in milliseconds after metaslab was last used before unloading"); /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, UINT, ZMOD_RW, "Percentage of metaslab group size that should be free to make it " "eligible for allocation"); -ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, UINT, ZMOD_RW, "Percentage of metaslab group size that should be considered eligible " "for allocations unless all metaslab groups within the metaslab class " "have also crossed this threshold"); -ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, INT, - ZMOD_RW, "Fragmentation for metaslab to allow allocation"); - -ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT, + ZMOD_RW, "Use the fragmentation metric to prefer less fragmented metaslabs"); /* END CSTYLED */ +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, UINT, + ZMOD_RW, "Fragmentation for metaslab to allow allocation"); + ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW, "Prefer metaslabs with lower LBAs"); @@ -6237,23 +6256,32 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT, ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW, "Segment-based metaslab selection maximum buckets before switching"); -ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, ULONG, ZMOD_RW, - "Blocks larger than this size are forced to be gang blocks"); +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW, + "Blocks larger than this size are sometimes forced to be gang blocks"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW, + "Percentage of large blocks that will be forced to be gang blocks"); -ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW, "Max distance (bytes) to search forward before using size tree"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW, "When looking in size tree, use largest segment instead of exact fit"); -ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, ULONG, +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64, ZMOD_RW, "How long to 
trust the cached max chunk size of a metaslab"); -ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, UINT, ZMOD_RW, "Percentage of memory that can be used to store metaslab range trees"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT, ZMOD_RW, "Try hard to allocate before ganging"); -ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW, "Normally only consider this many of the best metaslabs in each vdev"); + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator, + param_set_active_allocator, param_get_charp, ZMOD_RW, + "SPA active allocator"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c index f67a4eb22a2d..71122542758d 100644 --- a/sys/contrib/openzfs/module/zfs/mmp.c +++ b/sys/contrib/openzfs/module/zfs/mmp.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -156,7 +156,7 @@ * vary with the I/O load and this observed value is the ub_mmp_delay which is * stored in the uberblock. The minimum allowed value is 100 ms. */ -ulong_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL; +uint64_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL; /* * Used to control the duration of the activity test on import. Smaller values @@ -186,8 +186,8 @@ uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS; */ uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS; -char *mmp_tag = "mmp_write_uberblock"; -static void mmp_thread(void *arg); +static const void *const mmp_tag = "mmp_write_uberblock"; +static __attribute__((noreturn)) void mmp_thread(void *arg); void mmp_init(spa_t *spa) @@ -224,7 +224,6 @@ mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr) *mpp = NULL; cv_broadcast(&mmp->mmp_thread_cv); CALLB_CPR_EXIT(cpr); /* drops &mmp->mmp_thread_lock */ - thread_exit(); } void @@ -304,8 +303,10 @@ mmp_next_leaf(spa_t *spa) do { leaf = list_next(&spa->spa_leaf_list, leaf); - if (leaf == NULL) + if (leaf == NULL) { leaf = list_head(&spa->spa_leaf_list); + ASSERT3P(leaf, !=, NULL); + } /* * We skip unwritable, offline, detached, and dRAID spare @@ -444,7 +445,7 @@ mmp_write_uberblock(spa_t *spa) uint64_t offset; hrtime_t lock_acquire_time = gethrtime(); - spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER); + spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER); lock_acquire_time = gethrtime() - lock_acquire_time; if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10)) zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns " @@ -516,8 +517,9 @@ mmp_write_uberblock(spa_t *spa) zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags); abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); - abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); + abd_zero_off(ub_abd, sizeof (uberblock_t), + VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t)); mmp->mmp_seq++; mmp->mmp_kstat_id++; @@ -537,7 +539,7 @@ mmp_write_uberblock(spa_t *spa) zio_nowait(zio); } -static void +static __attribute__((noreturn)) void 
mmp_thread(void *arg) { spa_t *spa = (spa_t *)arg; @@ -549,11 +551,11 @@ mmp_thread(void *arg) uint32_t mmp_fail_intervals = MMP_FAIL_INTVS_OK( zfs_multihost_fail_intervals); hrtime_t mmp_fail_ns = mmp_fail_intervals * mmp_interval; - boolean_t last_spa_suspended = suspended; - boolean_t last_spa_multihost = multihost; - uint64_t last_mmp_interval = mmp_interval; - uint32_t last_mmp_fail_intervals = mmp_fail_intervals; - hrtime_t last_mmp_fail_ns = mmp_fail_ns; + boolean_t last_spa_suspended; + boolean_t last_spa_multihost; + uint64_t last_mmp_interval; + uint32_t last_mmp_fail_intervals; + hrtime_t last_mmp_fail_ns; callb_cpr_t cpr; int skip_wait = 0; @@ -662,12 +664,13 @@ mmp_thread(void *arg) (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) { zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu " "mmp_last_write %llu mmp_interval %llu " - "mmp_fail_intervals %llu mmp_fail_ns %llu", + "mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)mmp->mmp_last_write, (u_longlong_t)mmp_interval, (u_longlong_t)mmp_fail_intervals, - (u_longlong_t)mmp_fail_ns); + (u_longlong_t)mmp_fail_ns, + (u_longlong_t)spa->spa_uberblock.ub_txg); cmn_err(CE_WARN, "MMP writes to pool '%s' have not " "succeeded in over %llu ms; suspending pool. " "Hrtime %llu", @@ -698,6 +701,8 @@ mmp_thread(void *arg) mmp->mmp_zio_root = NULL; mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr); + + thread_exit(); } /* @@ -733,7 +738,7 @@ mmp_signal_all_threads(void) /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_multihost, zfs_multihost_, interval, - param_set_multihost_interval, param_get_ulong, ZMOD_RW, + param_set_multihost_interval, spl_param_get_u64, ZMOD_RW, "Milliseconds between mmp writes to each leaf"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/multilist.c b/sys/contrib/openzfs/module/zfs/multilist.c index 8bbc9b376ae0..3d3ef86e6839 100644 --- a/sys/contrib/openzfs/module/zfs/multilist.c +++ b/sys/contrib/openzfs/module/zfs/multilist.c @@ -24,7 +24,7 @@ * This overrides the number of sublists in each multilist_t, which defaults * to the number of CPUs in the system (see multilist_create()). 
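Aside, not part of the patch: in the mmp_thread() changes above, the pool is suspended once no multihost write has succeeded for mmp_fail_ns = fail_intervals * interval nanoseconds. The arithmetic is easy to check in isolation; the sketch below uses example settings and omits the clamping macros (MMP_FAIL_INTVS_OK and friends), so the numbers are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define	MSEC2NSEC(m)	((uint64_t)(m) * 1000000ULL)

int
main(void)
{
	uint64_t zfs_multihost_interval = 1000;		/* ms between MMP writes */
	uint32_t zfs_multihost_fail_intervals = 10;	/* example value */

	uint64_t mmp_interval = MSEC2NSEC(zfs_multihost_interval);
	uint64_t mmp_fail_ns = zfs_multihost_fail_intervals * mmp_interval;

	uint64_t since_last_write = MSEC2NSEC(12000);	/* 12 s without a write */

	if (since_last_write > mmp_fail_ns)
		printf("suspend: %llu ns since last MMP write exceeds %llu ns\n",
		    (unsigned long long)since_last_write,
		    (unsigned long long)mmp_fail_ns);
	return (0);
}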
*/ -int zfs_multilist_num_sublists = 0; +uint_t zfs_multilist_num_sublists = 0; /* * Given the object contained on the list, return a pointer to the @@ -36,6 +36,8 @@ multilist_d2l(multilist_t *ml, void *obj) { return ((multilist_node_t *)((char *)obj + ml->ml_offset)); } +#else +#define multilist_d2l(ml, obj) ((void) sizeof (ml), (void) sizeof (obj), NULL) #endif /* @@ -67,7 +69,7 @@ multilist_d2l(multilist_t *ml, void *obj) */ static void multilist_create_impl(multilist_t *ml, size_t size, size_t offset, - unsigned int num, multilist_sublist_index_func_t *index_func) + uint_t num, multilist_sublist_index_func_t *index_func) { ASSERT3U(size, >, 0); ASSERT3U(size, >=, offset + sizeof (multilist_node_t)); @@ -102,7 +104,7 @@ void multilist_create(multilist_t *ml, size_t size, size_t offset, multilist_sublist_index_func_t *index_func) { - int num_sublists; + uint_t num_sublists; if (zfs_multilist_num_sublists > 0) { num_sublists = zfs_multilist_num_sublists; @@ -275,9 +277,15 @@ multilist_get_random_index(multilist_t *ml) return (random_in_range(ml->ml_num_sublists)); } +void +multilist_sublist_lock(multilist_sublist_t *mls) +{ + mutex_enter(&mls->mls_lock); +} + /* Lock and return the sublist specified at the given index */ multilist_sublist_t * -multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) +multilist_sublist_lock_idx(multilist_t *ml, unsigned int sublist_idx) { multilist_sublist_t *mls; @@ -292,7 +300,7 @@ multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) multilist_sublist_t * multilist_sublist_lock_obj(multilist_t *ml, void *obj) { - return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj))); + return (multilist_sublist_lock_idx(ml, ml->ml_index_func(ml, obj))); } void @@ -325,6 +333,22 @@ multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj) list_insert_tail(&mls->mls_list, obj); } +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_after(multilist_sublist_t *mls, void *prev, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_after(&mls->mls_list, prev, obj); +} + +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_before(multilist_sublist_t *mls, void *next, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_before(&mls->mls_list, next, obj); +} + /* * Move the object one element forward in the list. * @@ -423,7 +447,5 @@ multilist_link_active(multilist_node_t *link) return (list_link_active(link)); } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, multilist_num_sublists, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, multilist_num_sublists, UINT, ZMOD_RW, "Number of sublists used in each multilist"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/pathname.c b/sys/contrib/openzfs/module/zfs/pathname.c index 84ab7b7e1111..51460d119106 100644 --- a/sys/contrib/openzfs/module/zfs/pathname.c +++ b/sys/contrib/openzfs/module/zfs/pathname.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c index 595918e5a742..5174e2c46633 100644 --- a/sys/contrib/openzfs/module/zfs/range_tree.c +++ b/sys/contrib/openzfs/module/zfs/range_tree.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -78,7 +78,7 @@ static inline void rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt) { - ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + ASSERT3U(rt->rt_type, <, RANGE_SEG_NUM_TYPES); size_t size = 0; switch (rt->rt_type) { case RANGE_SEG32: @@ -91,9 +91,9 @@ rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt) size = sizeof (range_seg_gap_t); break; default: - VERIFY(0); + __builtin_unreachable(); } - bcopy(src, dest, size); + memcpy(dest, src, size); } void @@ -151,6 +151,7 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) rt->rt_histogram[idx]--; } +__attribute__((always_inline)) inline static int range_tree_seg32_compare(const void *x1, const void *x2) { @@ -163,6 +164,7 @@ range_tree_seg32_compare(const void *x1, const void *x2) return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } +__attribute__((always_inline)) inline static int range_tree_seg64_compare(const void *x1, const void *x2) { @@ -175,6 +177,7 @@ range_tree_seg64_compare(const void *x1, const void *x2) return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } +__attribute__((always_inline)) inline static int range_tree_seg_gap_compare(const void *x1, const void *x2) { @@ -187,11 +190,18 @@ range_tree_seg_gap_compare(const void *x1, const void *x2) return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } +ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg32_find_in_buf, range_seg32_t, + range_tree_seg32_compare) + +ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg64_find_in_buf, range_seg64_t, + range_tree_seg64_compare) + +ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg_gap_find_in_buf, range_seg_gap_t, + range_tree_seg_gap_compare) + range_tree_t * -range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg, - uint64_t start, uint64_t shift, - int (*zfs_btree_compare) (const void *, const void *), - uint64_t gap) +range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type, + void *arg, uint64_t start, uint64_t shift, uint64_t gap) { range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); @@ -199,23 +209,27 @@ range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg, ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES); size_t size; int (*compare) (const void *, const void *); + bt_find_in_buf_f bt_find; switch (type) { case RANGE_SEG32: size = sizeof (range_seg32_t); compare = range_tree_seg32_compare; + bt_find = range_tree_seg32_find_in_buf; break; case RANGE_SEG64: size = sizeof (range_seg64_t); compare = range_tree_seg64_compare; + bt_find = range_tree_seg64_find_in_buf; break; case RANGE_SEG_GAP: size = sizeof (range_seg_gap_t); compare = range_tree_seg_gap_compare; + bt_find = range_tree_seg_gap_find_in_buf; break; default: panic("Invalid range seg type %d", type); } - zfs_btree_create(&rt->rt_root, compare, size); + zfs_btree_create(&rt->rt_root, compare, bt_find, size); rt->rt_ops = ops; rt->rt_gap = gap; @@ 
-223,7 +237,6 @@ range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg, rt->rt_type = type; rt->rt_start = start; rt->rt_shift = shift; - rt->rt_btree_compare = zfs_btree_compare; if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL) rt->rt_ops->rtop_create(rt, rt->rt_arg); @@ -232,10 +245,10 @@ range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg, } range_tree_t * -range_tree_create(range_tree_ops_t *ops, range_seg_type_t type, +range_tree_create(const range_tree_ops_t *ops, range_seg_type_t type, void *arg, uint64_t start, uint64_t shift) { - return (range_tree_create_impl(ops, type, arg, start, shift, NULL, 0)); + return (range_tree_create_gap(ops, type, arg, start, shift, 0)); } void @@ -372,6 +385,7 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) * invalid as soon as we do any mutating btree operations. */ rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after); + ASSERT3P(rs_after, !=, NULL); rs_set_start_raw(rs_after, rt, before_start); rs_set_fill(rs_after, rt, after_fill + before_fill + fill); rs = rs_after; @@ -701,7 +715,7 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) zfs_btree_clear(&rt->rt_root); } - bzero(rt->rt_histogram, sizeof (rt->rt_histogram)); + memset(rt->rt_histogram, 0, sizeof (rt->rt_histogram)); rt->rt_space = 0; } @@ -741,76 +755,6 @@ range_tree_is_empty(range_tree_t *rt) return (range_tree_space(rt) == 0); } -/* ARGSUSED */ -void -rt_btree_create(range_tree_t *rt, void *arg) -{ - zfs_btree_t *size_tree = arg; - - size_t size; - switch (rt->rt_type) { - case RANGE_SEG32: - size = sizeof (range_seg32_t); - break; - case RANGE_SEG64: - size = sizeof (range_seg64_t); - break; - case RANGE_SEG_GAP: - size = sizeof (range_seg_gap_t); - break; - default: - panic("Invalid range seg type %d", rt->rt_type); - } - zfs_btree_create(size_tree, rt->rt_btree_compare, size); -} - -/* ARGSUSED */ -void -rt_btree_destroy(range_tree_t *rt, void *arg) -{ - zfs_btree_t *size_tree = arg; - ASSERT0(zfs_btree_numnodes(size_tree)); - - zfs_btree_destroy(size_tree); -} - -/* ARGSUSED */ -void -rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg) -{ - zfs_btree_t *size_tree = arg; - - zfs_btree_add(size_tree, rs); -} - -/* ARGSUSED */ -void -rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg) -{ - zfs_btree_t *size_tree = arg; - - zfs_btree_remove(size_tree, rs); -} - -/* ARGSUSED */ -void -rt_btree_vacate(range_tree_t *rt, void *arg) -{ - zfs_btree_t *size_tree = arg; - zfs_btree_clear(size_tree); - zfs_btree_destroy(size_tree); - - rt_btree_create(rt, arg); -} - -range_tree_ops_t rt_btree_ops = { - .rtop_create = rt_btree_create, - .rtop_destroy = rt_btree_destroy, - .rtop_add = rt_btree_add, - .rtop_remove = rt_btree_remove, - .rtop_vacate = rt_btree_vacate -}; - /* * Remove any overlapping ranges between the given segment [start, end) * from removefrom. Add non-overlapping leftovers to addto. diff --git a/sys/contrib/openzfs/module/zfs/refcount.c b/sys/contrib/openzfs/module/zfs/refcount.c index 35a379dded69..718bbb34a8d5 100644 --- a/sys/contrib/openzfs/module/zfs/refcount.c +++ b/sys/contrib/openzfs/module/zfs/refcount.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. 
* See the License for the specific language governing permissions * and limitations under the License. * @@ -26,43 +26,50 @@ #include <sys/zfs_context.h> #include <sys/zfs_refcount.h> +#ifdef ZFS_DEBUG /* * Reference count tracking is disabled by default. It's memory requirements * are reasonable, however as implemented it consumes a significant amount of * cpu time. Until its performance is improved it should be manually enabled. */ -int reference_tracking_enable = FALSE; -int reference_history = 3; /* tunable */ +int reference_tracking_enable = B_FALSE; +static uint_t reference_history = 3; /* tunable */ -#ifdef ZFS_DEBUG static kmem_cache_t *reference_cache; -static kmem_cache_t *reference_history_cache; void zfs_refcount_init(void) { reference_cache = kmem_cache_create("reference_cache", sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - - reference_history_cache = kmem_cache_create("reference_history_cache", - sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0); } void zfs_refcount_fini(void) { kmem_cache_destroy(reference_cache); - kmem_cache_destroy(reference_history_cache); +} + +static int +zfs_refcount_compare(const void *x1, const void *x2) +{ + const reference_t *r1 = (const reference_t *)x1; + const reference_t *r2 = (const reference_t *)x2; + + int cmp1 = TREE_CMP(r1->ref_holder, r2->ref_holder); + int cmp2 = TREE_CMP(r1->ref_number, r2->ref_number); + int cmp = cmp1 ? cmp1 : cmp2; + return ((cmp || r1->ref_search) ? cmp : TREE_PCMP(r1, r2)); } void zfs_refcount_create(zfs_refcount_t *rc) { mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); - list_create(&rc->rc_list, sizeof (reference_t), - offsetof(reference_t, ref_link)); + avl_create(&rc->rc_tree, zfs_refcount_compare, sizeof (reference_t), + offsetof(reference_t, ref_link.a)); list_create(&rc->rc_removed, sizeof (reference_t), - offsetof(reference_t, ref_link)); + offsetof(reference_t, ref_link.l)); rc->rc_count = 0; rc->rc_removed_count = 0; rc->rc_tracked = reference_tracking_enable; @@ -86,19 +93,15 @@ void zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number) { reference_t *ref; + void *cookie = NULL; ASSERT3U(rc->rc_count, ==, number); - while ((ref = list_head(&rc->rc_list))) { - list_remove(&rc->rc_list, ref); + while ((ref = avl_destroy_nodes(&rc->rc_tree, &cookie)) != NULL) kmem_cache_free(reference_cache, ref); - } - list_destroy(&rc->rc_list); + avl_destroy(&rc->rc_tree); - while ((ref = list_head(&rc->rc_removed))) { - list_remove(&rc->rc_removed, ref); - kmem_cache_free(reference_history_cache, ref->ref_removed); + while ((ref = list_remove_head(&rc->rc_removed))) kmem_cache_free(reference_cache, ref); - } list_destroy(&rc->rc_removed); mutex_destroy(&rc->rc_mtx); } @@ -124,10 +127,10 @@ zfs_refcount_count(zfs_refcount_t *rc) int64_t zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder) { - reference_t *ref = NULL; + reference_t *ref; int64_t count; - if (!rc->rc_tracked) { + if (likely(!rc->rc_tracked)) { count = atomic_add_64_nv(&(rc)->rc_count, number); ASSERT3U(count, >=, number); return (count); @@ -136,9 +139,9 @@ zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder) ref = kmem_cache_alloc(reference_cache, KM_SLEEP); ref->ref_holder = holder; ref->ref_number = number; + ref->ref_search = B_FALSE; mutex_enter(&rc->rc_mtx); - ASSERT3U(rc->rc_count, >=, 0); - list_insert_head(&rc->rc_list, ref); + avl_add(&rc->rc_tree, ref); rc->rc_count += number; count = rc->rc_count; mutex_exit(&rc->rc_mtx); @@ -152,51 +155,55 @@ 
zfs_refcount_add(zfs_refcount_t *rc, const void *holder) return (zfs_refcount_add_many(rc, 1, holder)); } +void +zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder) +{ + if (likely(!rc->rc_tracked)) + (void) zfs_refcount_add_many(rc, number, holder); + else for (; number > 0; number--) + (void) zfs_refcount_add(rc, holder); +} + int64_t zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, const void *holder) { - reference_t *ref; + reference_t *ref, s; int64_t count; - if (!rc->rc_tracked) { + if (likely(!rc->rc_tracked)) { count = atomic_add_64_nv(&(rc)->rc_count, -number); ASSERT3S(count, >=, 0); return (count); } + s.ref_holder = holder; + s.ref_number = number; + s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); ASSERT3U(rc->rc_count, >=, number); - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder && ref->ref_number == number) { - list_remove(&rc->rc_list, ref); - if (reference_history > 0) { - ref->ref_removed = - kmem_cache_alloc(reference_history_cache, - KM_SLEEP); - list_insert_head(&rc->rc_removed, ref); - rc->rc_removed_count++; - if (rc->rc_removed_count > reference_history) { - ref = list_tail(&rc->rc_removed); - list_remove(&rc->rc_removed, ref); - kmem_cache_free(reference_history_cache, - ref->ref_removed); - kmem_cache_free(reference_cache, ref); - rc->rc_removed_count--; - } - } else { - kmem_cache_free(reference_cache, ref); - } - rc->rc_count -= number; - count = rc->rc_count; - mutex_exit(&rc->rc_mtx); - return (count); + ref = avl_find(&rc->rc_tree, &s, NULL); + if (unlikely(ref == NULL)) { + panic("No such hold %p on refcount %llx", holder, + (u_longlong_t)(uintptr_t)rc); + return (-1); + } + avl_remove(&rc->rc_tree, ref); + if (reference_history > 0) { + list_insert_head(&rc->rc_removed, ref); + if (rc->rc_removed_count >= reference_history) { + ref = list_remove_tail(&rc->rc_removed); + kmem_cache_free(reference_cache, ref); + } else { + rc->rc_removed_count++; } + } else { + kmem_cache_free(reference_cache, ref); } - panic("No such hold %p on refcount %llx", holder, - (u_longlong_t)(uintptr_t)rc); - return (-1); + rc->rc_count -= number; + count = rc->rc_count; + mutex_exit(&rc->rc_mtx); + return (count); } int64_t @@ -206,33 +213,49 @@ zfs_refcount_remove(zfs_refcount_t *rc, const void *holder) } void +zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder) +{ + if (likely(!rc->rc_tracked)) + (void) zfs_refcount_remove_many(rc, number, holder); + else for (; number > 0; number--) + (void) zfs_refcount_remove(rc, holder); +} + +void zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src) { - int64_t count, removed_count; - list_t list, removed; + avl_tree_t tree; + list_t removed; + reference_t *ref; + void *cookie = NULL; + uint64_t count; + uint_t removed_count; - list_create(&list, sizeof (reference_t), - offsetof(reference_t, ref_link)); + avl_create(&tree, zfs_refcount_compare, sizeof (reference_t), + offsetof(reference_t, ref_link.a)); list_create(&removed, sizeof (reference_t), - offsetof(reference_t, ref_link)); + offsetof(reference_t, ref_link.l)); mutex_enter(&src->rc_mtx); count = src->rc_count; removed_count = src->rc_removed_count; src->rc_count = 0; src->rc_removed_count = 0; - list_move_tail(&list, &src->rc_list); + avl_swap(&tree, &src->rc_tree); list_move_tail(&removed, &src->rc_removed); mutex_exit(&src->rc_mtx); mutex_enter(&dst->rc_mtx); dst->rc_count += count; dst->rc_removed_count += removed_count; - 
list_move_tail(&dst->rc_list, &list); + if (avl_is_empty(&dst->rc_tree)) + avl_swap(&dst->rc_tree, &tree); + else while ((ref = avl_destroy_nodes(&tree, &cookie)) != NULL) + avl_add(&dst->rc_tree, ref); list_move_tail(&dst->rc_removed, &removed); mutex_exit(&dst->rc_mtx); - list_destroy(&list); + avl_destroy(&tree); list_destroy(&removed); } @@ -240,23 +263,19 @@ void zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number, const void *current_holder, const void *new_holder) { - reference_t *ref; - boolean_t found = B_FALSE; + reference_t *ref, s; - if (!rc->rc_tracked) + if (likely(!rc->rc_tracked)) return; + s.ref_holder = current_holder; + s.ref_number = number; + s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == current_holder && - ref->ref_number == number) { - ref->ref_holder = new_holder; - found = B_TRUE; - break; - } - } - ASSERT(found); + ref = avl_find(&rc->rc_tree, &s, NULL); + ASSERT(ref); + ref->ref_holder = new_holder; + avl_update(&rc->rc_tree, ref); mutex_exit(&rc->rc_mtx); } @@ -276,21 +295,23 @@ zfs_refcount_transfer_ownership(zfs_refcount_t *rc, const void *current_holder, boolean_t zfs_refcount_held(zfs_refcount_t *rc, const void *holder) { - reference_t *ref; + reference_t *ref, s; + avl_index_t idx; + boolean_t res; - if (!rc->rc_tracked) + if (likely(!rc->rc_tracked)) return (zfs_refcount_count(rc) > 0); + s.ref_holder = holder; + s.ref_number = 0; + s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder) { - mutex_exit(&rc->rc_mtx); - return (B_TRUE); - } - } + ref = avl_find(&rc->rc_tree, &s, &idx); + if (likely(ref == NULL)) + ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER); + res = ref && ref->ref_holder == holder; mutex_exit(&rc->rc_mtx); - return (B_FALSE); + return (res); } /* @@ -301,21 +322,23 @@ zfs_refcount_held(zfs_refcount_t *rc, const void *holder) boolean_t zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder) { - reference_t *ref; + reference_t *ref, s; + avl_index_t idx; + boolean_t res; - if (!rc->rc_tracked) + if (likely(!rc->rc_tracked)) return (B_TRUE); mutex_enter(&rc->rc_mtx); - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder) { - mutex_exit(&rc->rc_mtx); - return (B_FALSE); - } - } + s.ref_holder = holder; + s.ref_number = 0; + s.ref_search = B_TRUE; + ref = avl_find(&rc->rc_tree, &s, &idx); + if (likely(ref == NULL)) + ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER); + res = ref == NULL || ref->ref_holder != holder; mutex_exit(&rc->rc_mtx); - return (B_TRUE); + return (res); } EXPORT_SYMBOL(zfs_refcount_create); @@ -327,10 +350,10 @@ EXPORT_SYMBOL(zfs_refcount_remove); EXPORT_SYMBOL(zfs_refcount_held); /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, ,reference_tracking_enable, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, , reference_tracking_enable, INT, ZMOD_RW, "Track reference holders to refcount_t objects"); -ZFS_MODULE_PARAM(zfs, ,reference_history, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, , reference_history, UINT, ZMOD_RW, "Maximum reference holders being tracked"); /* END CSTYLED */ #endif /* ZFS_DEBUG */ diff --git a/sys/contrib/openzfs/module/zfs/rrwlock.c b/sys/contrib/openzfs/module/zfs/rrwlock.c index d23fc3ad1067..a8c438bb6ebd 100644 --- a/sys/contrib/openzfs/module/zfs/rrwlock.c +++ b/sys/contrib/openzfs/module/zfs/rrwlock.c @@ -6,7 +6,7 @@ * 
You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -77,7 +77,7 @@ uint_t rrw_tsd_key; typedef struct rrw_node { struct rrw_node *rn_next; rrwlock_t *rn_rrl; - void *rn_tag; + const void *rn_tag; } rrw_node_t; static rrw_node_t * @@ -99,7 +99,7 @@ rrn_find(rrwlock_t *rrl) * Add a node to the head of the singly linked list. */ static void -rrn_add(rrwlock_t *rrl, void *tag) +rrn_add(rrwlock_t *rrl, const void *tag) { rrw_node_t *rn; @@ -115,7 +115,7 @@ rrn_add(rrwlock_t *rrl, void *tag) * thread's list and return TRUE; otherwise return FALSE. */ static boolean_t -rrn_find_and_remove(rrwlock_t *rrl, void *tag) +rrn_find_and_remove(rrwlock_t *rrl, const void *tag) { rrw_node_t *rn; rrw_node_t *prev = NULL; @@ -160,7 +160,7 @@ rrw_destroy(rrwlock_t *rrl) } static void -rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag) +rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, const void *tag) { mutex_enter(&rrl->rr_lock); #if !defined(ZFS_DEBUG) && defined(_KERNEL) @@ -192,7 +192,7 @@ rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag) } void -rrw_enter_read(rrwlock_t *rrl, void *tag) +rrw_enter_read(rrwlock_t *rrl, const void *tag) { rrw_enter_read_impl(rrl, B_FALSE, tag); } @@ -204,7 +204,7 @@ rrw_enter_read(rrwlock_t *rrl, void *tag) * the pending writer does not work, so we have to give an explicit hint here. */ void -rrw_enter_read_prio(rrwlock_t *rrl, void *tag) +rrw_enter_read_prio(rrwlock_t *rrl, const void *tag) { rrw_enter_read_impl(rrl, B_TRUE, tag); } @@ -228,7 +228,7 @@ rrw_enter_write(rrwlock_t *rrl) } void -rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag) +rrw_enter(rrwlock_t *rrl, krw_t rw, const void *tag) { if (rw == RW_READER) rrw_enter_read(rrl, tag); @@ -237,7 +237,7 @@ rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag) } void -rrw_exit(rrwlock_t *rrl, void *tag) +rrw_exit(rrwlock_t *rrl, const void *tag) { mutex_enter(&rrl->rr_lock); #if !defined(ZFS_DEBUG) && defined(_KERNEL) @@ -339,7 +339,7 @@ rrm_destroy(rrmlock_t *rrl) } void -rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag) +rrm_enter(rrmlock_t *rrl, krw_t rw, const void *tag) { if (rw == RW_READER) rrm_enter_read(rrl, tag); @@ -358,7 +358,7 @@ rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag) #define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS) void -rrm_enter_read(rrmlock_t *rrl, void *tag) +rrm_enter_read(rrmlock_t *rrl, const void *tag) { rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag); } @@ -373,7 +373,7 @@ rrm_enter_write(rrmlock_t *rrl) } void -rrm_exit(rrmlock_t *rrl, void *tag) +rrm_exit(rrmlock_t *rrl, const void *tag) { int i; diff --git a/sys/contrib/openzfs/module/zfs/sa.c b/sys/contrib/openzfs/module/zfs/sa.c index 2604a7513ecf..0ae4c331dd36 100644 --- a/sys/contrib/openzfs/module/zfs/sa.c +++ b/sys/contrib/openzfs/module/zfs/sa.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -23,6 +23,7 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
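Aside, not part of the patch: the refcount.c hunks above convert reference tracking from a linked list to an AVL tree keyed on (holder, number). The comparator breaks ties between real nodes by node address so duplicate holds can coexist, while a stack-allocated key with ref_search set compares equal to any matching node. The self-contained approximation below mirrors zfs_refcount_compare(), with TREE_CMP/TREE_PCMP replaced by a local macro.

#include <stdint.h>
#include <stdio.h>

typedef struct reference {
	const void *ref_holder;
	uint64_t ref_number;
	int ref_search;		/* set only on lookup keys, never on tree nodes */
} reference_t;

#define	CMP(a, b)	(((a) > (b)) - ((a) < (b)))

static int
reference_compare(const reference_t *r1, const reference_t *r2)
{
	int cmp = CMP((uintptr_t)r1->ref_holder, (uintptr_t)r2->ref_holder);

	if (cmp == 0)
		cmp = CMP(r1->ref_number, r2->ref_number);
	if (cmp != 0 || r1->ref_search)
		return (cmp);
	/* Identical (holder, number): order by address so duplicates coexist. */
	return (CMP((uintptr_t)r1, (uintptr_t)r2));
}

int
main(void)
{
	int holder;			/* any stable address serves as a tag */
	reference_t a = { &holder, 1, 0 };
	reference_t b = { &holder, 1, 0 };
	reference_t key = { &holder, 1, 1 };

	/* Two identical holds still order deterministically against each other. */
	printf("a vs b:   %d\n", reference_compare(&a, &b));
	/* A search key matches either of them. */
	printf("key vs a: %d\n", reference_compare(&key, &a));
	printf("key vs b: %d\n", reference_compare(&key, &b));
	return (0);
}

This is the property avl_find() relies on in zfs_refcount_remove_many() and zfs_refcount_held() above: the search key lands on any hold with the right holder and count, while real nodes never collide with each other on insert.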
* Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2023 RackTop Systems, Inc. */ #include <sys/zfs_context.h> @@ -141,7 +142,7 @@ static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, sa_data_op_t action, sa_data_locator_t *locator, void *datastart, uint16_t buflen, dmu_tx_t *tx); -arc_byteswap_func_t sa_bswap_table[] = { +static arc_byteswap_func_t sa_bswap_table[] = { byteswap_uint64_array, byteswap_uint32_array, byteswap_uint16_array, @@ -160,7 +161,7 @@ do { \ *(uint64_t *)((uintptr_t)t + 8) = \ *(uint64_t *)((uintptr_t)s + 8); \ } else { \ - bcopy(s, t, l); \ + memcpy(t, s, l); \ } \ } else { \ sa_copy_data(f, s, t, l); \ @@ -178,7 +179,7 @@ do { \ * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will * use this static table. */ -sa_attr_reg_t sa_legacy_attrs[] = { +static const sa_attr_reg_t sa_legacy_attrs[] = { {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, @@ -200,32 +201,32 @@ sa_attr_reg_t sa_legacy_attrs[] = { /* * This is only used for objects of type DMU_OT_ZNODE */ -sa_attr_type_t sa_legacy_zpl_layout[] = { +static const sa_attr_type_t sa_legacy_zpl_layout[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; /* * Special dummy layout used for buffers with no attributes. */ -sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; +static const sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; -static int sa_legacy_attr_count = ARRAY_SIZE(sa_legacy_attrs); +static const size_t sa_legacy_attr_count = ARRAY_SIZE(sa_legacy_attrs); static kmem_cache_t *sa_cache = NULL; -/*ARGSUSED*/ static int sa_cache_constructor(void *buf, void *unused, int kmflag) { + (void) unused, (void) kmflag; sa_handle_t *hdl = buf; mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } -/*ARGSUSED*/ static void sa_cache_destructor(void *buf, void *unused) { + (void) unused; sa_handle_t *hdl = buf; mutex_destroy(&hdl->sa_lock); } @@ -285,12 +286,11 @@ sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count) #define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF]) static uint64_t -sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count) +sa_layout_info_hash(const sa_attr_type_t *attrs, int attr_count) { - int i; uint64_t crc = -1ULL; - for (i = 0; i != attr_count; i++) + for (int i = 0; i != attr_count; i++) crc ^= SA_ATTR_HASH(attrs[i]); return (crc); @@ -370,7 +370,7 @@ sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, if (bulk[i].sa_data) { SA_COPY_DATA(bulk[i].sa_data_func, bulk[i].sa_addr, bulk[i].sa_data, - bulk[i].sa_size); + MIN(bulk[i].sa_size, bulk[i].sa_length)); } continue; @@ -402,7 +402,7 @@ sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, } static sa_lot_t * -sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, +sa_add_layout_entry(objset_t *os, const sa_attr_type_t *attrs, int attr_count, uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx) { sa_os_t *sa = os->os_sa; @@ -415,7 +415,7 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, tb->lot_attr_count = attr_count; tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, KM_SLEEP); - bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); + memcpy(tb->lot_attrs, attrs, sizeof (sa_attr_type_t) * attr_count); tb->lot_num = lot_num; tb->lot_hash = hash; 
tb->lot_instance = 0; @@ -512,7 +512,7 @@ static void sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) { if (func == NULL) { - bcopy(datastart, target, buflen); + memcpy(target, datastart, buflen); } else { boolean_t start; int bytes; @@ -524,7 +524,7 @@ sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) bytes = 0; while (bytes < buflen) { func(&dataptr, &length, buflen, start, datastart); - bcopy(dataptr, saptr, length); + memcpy(saptr, dataptr, length); saptr = (void *)((caddr_t)saptr + length); bytes += length; start = B_FALSE; @@ -831,7 +831,7 @@ sa_free_attr_table(sa_os_t *sa) } static int -sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) +sa_attr_table_setup(objset_t *os, const sa_attr_reg_t *reg_attrs, int count) { sa_os_t *sa = os->os_sa; uint64_t sa_attr_count = 0; @@ -992,8 +992,8 @@ bail: } int -sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, - sa_attr_type_t **user_table) +sa_setup(objset_t *os, uint64_t sa_obj, const sa_attr_reg_t *reg_attrs, + int count, sa_attr_type_t **user_table) { zap_cursor_t zc; zap_attribute_t za; @@ -1069,8 +1069,8 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, za.za_num_integers); break; } - VERIFY(ddi_strtoull(za.za_name, NULL, 10, - (unsigned long long *)&lot_num) == 0); + VERIFY0(ddi_strtoull(za.za_name, NULL, 10, + (unsigned long long *)&lot_num)); (void) sa_add_layout_entry(os, lot_attrs, za.za_num_integers, lot_num, @@ -1202,6 +1202,7 @@ sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, uint8_t idx_len; reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length; + IMPLY(reg_length == 0, IS_SA_BONUSTYPE(type)); if (reg_length) { attr_length = reg_length; idx_len = 0; @@ -1218,11 +1219,11 @@ sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, } } -/*ARGSUSED*/ static void sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr, uint16_t length, int length_idx, boolean_t variable_length, void *userp) { + (void) hdr, (void) length_idx, (void) variable_length; sa_handle_t *hdl = userp; sa_os_t *sa = hdl->sa_os->os_sa; @@ -1309,10 +1310,10 @@ sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) return (0); } -/*ARGSUSED*/ static void sa_evict_sync(void *dbu) { + (void) dbu; panic("evicting sa dbuf\n"); } @@ -1450,13 +1451,13 @@ sa_handle_get(objset_t *objset, uint64_t objid, void *userp, } int -sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db) +sa_buf_hold(objset_t *objset, uint64_t obj_num, const void *tag, dmu_buf_t **db) { return (dmu_bonus_hold(objset, obj_num, tag, db)); } void -sa_buf_rele(dmu_buf_t *db, void *tag) +sa_buf_rele(dmu_buf_t *db, const void *tag) { dmu_buf_rele(db, tag); } @@ -1665,8 +1666,9 @@ sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid) &xattr, 8); if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { - bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - scanstamp, AV_SCANSTAMP_SZ); + memcpy(scanstamp, + (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + AV_SCANSTAMP_SZ); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), NULL, scanstamp, AV_SCANSTAMP_SZ); zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; @@ -1874,7 +1876,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, if (dn->dn_bonuslen != 0) { bonus_data_size = hdl->sa_bonus->db_size; old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); - bcopy(hdl->sa_bonus->db_data, old_data[0], + memcpy(old_data[0], hdl->sa_bonus->db_data, 
hdl->sa_bonus->db_size); bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count; } else { @@ -1887,7 +1889,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, if ((error = sa_get_spill(hdl)) == 0) { spill_data_size = hdl->sa_spill->db_size; old_data[1] = vmem_alloc(spill_data_size, KM_SLEEP); - bcopy(hdl->sa_spill->db_data, old_data[1], + memcpy(old_data[1], hdl->sa_spill->db_data, hdl->sa_spill->db_size); spill_attr_count = hdl->sa_spill_tab->sa_layout->lot_attr_count; @@ -1917,7 +1919,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, count = bonus_attr_count; hdr = SA_GET_HDR(hdl, SA_BONUS); idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS); - for (; k != 2; k++) { + for (; ; k++) { /* * Iterate over each attribute in layout. Fetch the * size of variable-length attributes needing rewrite diff --git a/sys/contrib/openzfs/module/zfs/sha256.c b/sys/contrib/openzfs/module/zfs/sha2_zfs.c index d297768eada5..872b1e53ee66 100644 --- a/sys/contrib/openzfs/module/zfs/sha256.c +++ b/sys/contrib/openzfs/module/zfs/sha2_zfs.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -18,16 +18,14 @@ * * CDDL HEADER END */ + /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. */ + #include <sys/zfs_context.h> -#include <sys/zio.h> #include <sys/zio_checksum.h> #include <sys/sha2.h> #include <sys/abd.h> @@ -41,11 +39,11 @@ sha_incremental(void *buf, size_t size, void *arg) return (0); } -/*ARGSUSED*/ void -abd_checksum_SHA256(abd_t *abd, uint64_t size, +abd_checksum_sha256(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { + (void) ctx_template; int ret; SHA2_CTX ctx; zio_cksum_t tmp; @@ -78,11 +76,11 @@ bswap: zcp->zc_word[3] = BE_64(tmp.zc_word[3]); } -/*ARGSUSED*/ void -abd_checksum_SHA512_native(abd_t *abd, uint64_t size, +abd_checksum_sha512_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { + (void) ctx_template; SHA2_CTX ctx; SHA2Init(SHA512_256, &ctx); @@ -90,14 +88,13 @@ abd_checksum_SHA512_native(abd_t *abd, uint64_t size, SHA2Final(zcp, &ctx); } -/*ARGSUSED*/ void -abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size, +abd_checksum_sha512_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { zio_cksum_t tmp; - abd_checksum_SHA512_native(abd, size, ctx_template, &tmp); + abd_checksum_sha512_native(abd, size, ctx_template, &tmp); zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); diff --git a/sys/contrib/openzfs/module/zfs/skein_zfs.c b/sys/contrib/openzfs/module/zfs/skein_zfs.c index 11b9940e027e..4b2aca888eee 100644 --- a/sys/contrib/openzfs/module/zfs/skein_zfs.c +++ b/sys/contrib/openzfs/module/zfs/skein_zfs.c @@ -41,18 +41,17 @@ skein_incremental(void *buf, size_t size, void *arg) * function requires the presence of a ctx_template that should be allocated * using abd_checksum_skein_tmpl_init. 
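Aside, not part of the patch: a recurring mechanical change through these files (metaslab.c, range_tree.c, sa.c, skein_zfs.c) is replacing the legacy bcopy()/bzero() primitives with the ISO C memcpy()/memset(). The only trap is bcopy()'s reversed argument order, shown in the trivial sketch below.

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char src[8] = "openzfs";
	char dst[8];

	/* bcopy(src, dst, len) becomes memcpy(dst, src, len). */
	memcpy(dst, src, sizeof (dst));

	/* bzero(buf, len) becomes memset(buf, 0, len). */
	memset(src, 0, sizeof (src));

	printf("%s\n", dst);
	return (0);
}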
*/ -/*ARGSUSED*/ void abd_checksum_skein_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { - Skein_512_Ctxt_t ctx; + Skein_512_Ctxt_t ctx; ASSERT(ctx_template != NULL); - bcopy(ctx_template, &ctx, sizeof (ctx)); + memcpy(&ctx, ctx_template, sizeof (ctx)); (void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx); (void) Skein_512_Final(&ctx, (uint8_t *)zcp); - bzero(&ctx, sizeof (ctx)); + memset(&ctx, 0, sizeof (ctx)); } /* @@ -80,9 +79,8 @@ abd_checksum_skein_byteswap(abd_t *abd, uint64_t size, void * abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) { - Skein_512_Ctxt_t *ctx; + Skein_512_Ctxt_t *ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); - ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); (void) Skein_512_InitExt(ctx, sizeof (zio_cksum_t) * 8, 0, salt->zcs_bytes, sizeof (salt->zcs_bytes)); return (ctx); @@ -95,8 +93,8 @@ abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) void abd_checksum_skein_tmpl_free(void *ctx_template) { - Skein_512_Ctxt_t *ctx = ctx_template; + Skein_512_Ctxt_t *ctx = ctx_template; - bzero(ctx, sizeof (*ctx)); + memset(ctx, 0, sizeof (*ctx)); kmem_free(ctx, sizeof (*ctx)); } diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index 7546e3e414f1..638572996c3a 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -33,6 +33,8 @@ * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. + * Copyright (c) 2024, Klara Inc. */ /* @@ -52,6 +54,7 @@ #include <sys/dmu_tx.h> #include <sys/zap.h> #include <sys/zil.h> +#include <sys/brt.h> #include <sys/ddt.h> #include <sys/vdev_impl.h> #include <sys/vdev_removal.h> @@ -61,6 +64,7 @@ #include <sys/vdev_rebuild.h> #include <sys/vdev_trim.h> #include <sys/vdev_disk.h> +#include <sys/vdev_raidz.h> #include <sys/vdev_draid.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> @@ -81,7 +85,6 @@ #include <sys/arc.h> #include <sys/callb.h> #include <sys/systeminfo.h> -#include <sys/spa_boot.h> #include <sys/zfs_ioctl.h> #include <sys/dsl_scan.h> #include <sys/zfeature.h> @@ -98,6 +101,27 @@ #include "zfs_prop.h" #include "zfs_comutil.h" +#include <cityhash.h> + +/* + * spa_thread() existed on Illumos as a parent thread for the various worker + * threads that actually run the pool, as a way to both reference the entire + * pool work as a single object, and to share properties like scheduling + * options. It has not yet been adapted to Linux or FreeBSD. This define is + * used to mark related parts of the code to make things easier for the reader, + * and to compile this code out. 
It can be removed when someone implements it, + * moves it to some Illumos-specific place, or removes it entirely. + */ +#undef HAVE_SPA_THREAD + +/* + * The "System Duty Cycle" scheduling class is an Illumos feature to help + * prevent CPU-intensive kernel threads from affecting latency on interactive + * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is + * gated behind a define. On Illumos SDC depends on spa_thread(), but + * spa_thread() also has other uses, so this is a separate define. + */ +#undef HAVE_SYSDC /* * The interval, in seconds, at which failed configuration cache file writes @@ -107,16 +131,16 @@ int zfs_ccw_retry_interval = 300; typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ - ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */ + ZTI_MODE_SYNC, /* sync thread assigned */ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } -#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } +#define ZTI_SYNC { ZTI_MODE_SYNC, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) @@ -137,42 +161,60 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { * initializing a pool, we use this table to create an appropriately sized * taskq. Some operations are low volume and therefore have a small, static * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE - * macros. Other operations process a large amount of data; the ZTI_BATCH + * macros. Other operations process a large amount of data; the ZTI_SCALE * macro causes us to create a taskq oriented for throughput. Some operations * are so high frequency and short-lived that the taskq itself can become a * point of lock contention. The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the - * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH, - * but with number of taskqs also scaling with number of CPUs. + * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs + * that scales with the number of CPUs. * * The different taskq priorities are to handle the different contexts (issue - * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that - * need to be handled with minimum delay. + * and interrupt) and then to reserve threads for high priority I/Os that + * need to be handled with minimum delay. Illumos taskq has unfair TQ_FRONT + * implementation, so separate high priority threads are used there. 
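Before the default table below, it may help to see what the ZTI_* helpers above expand to; the (mode, value, count) triples come straight from the macro definitions, and the ZTI_P numbers are illustrative only:

/*
 * ZTI_N(8)    -> { ZTI_MODE_FIXED, 8, 1 }  one taskq, eight threads
 * ZTI_P(4, 2) -> { ZTI_MODE_FIXED, 4, 2 }  two taskqs, four threads each
 * ZTI_SCALE   -> { ZTI_MODE_SCALE, 0, 1 }  taskq count scales with CPUs
 * ZTI_SYNC    -> { ZTI_MODE_SYNC,  0, 1 }  sized from CPUs and allocators
 *                                          when the pool is activated
 * ZTI_NULL    -> { ZTI_MODE_NULL,  0, 0 }  no taskq; a high-priority slot
 *                                          falls back to the regular one
 */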
*/ -const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { +static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ - { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ +#ifdef illumos + { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ +#else + { ZTI_SYNC, ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* WRITE */ +#endif { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FLUSH */ { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ }; static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); -static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); +static int spa_load_impl(spa_t *spa, spa_import_type_t type, + const char **ereport); static void spa_vdev_resilver_done(spa_t *spa); -uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ -uint_t zio_taskq_batch_tpq; /* threads per taskq */ -boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ -uint_t zio_taskq_basedc = 80; /* base duty cycle */ +/* + * Percentage of all CPUs that can be used by the metaslab preload taskq. + */ +static uint_t metaslab_preload_pct = 50; + +static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ +static uint_t zio_taskq_batch_tpq; /* threads per taskq */ + +#ifdef HAVE_SYSDC +static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ +static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ +#endif + +#ifdef HAVE_SPA_THREAD +static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ +#endif -boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ +static uint_t zio_taskq_write_tpq = 16; /* * Report any spa_load_verify errors found, but do not fail spa_load. @@ -195,7 +237,7 @@ boolean_t spa_mode_readable_spacemaps = B_FALSE; /* * For debugging purposes: print out vdev tree during pool import. */ -int spa_load_print_vdev_tree = B_FALSE; +static int spa_load_print_vdev_tree = B_FALSE; /* * A non-zero value for zfs_max_missing_tvds means that we allow importing @@ -218,7 +260,7 @@ int spa_load_print_vdev_tree = B_FALSE; * there are also risks of performing an inadvertent rewind as we might be * missing all the vdevs with the latest uberblocks. */ -unsigned long zfs_max_missing_tvds = 0; +uint64_t zfs_max_missing_tvds = 0; /* * The parameters below are similar to zfs_max_missing_tvds but are only @@ -244,28 +286,28 @@ uint64_t zfs_max_missing_tvds_scan = 0; /* * Debugging aid that pauses spa_sync() towards the end. */ -boolean_t zfs_pause_spa_sync = B_FALSE; +static const boolean_t zfs_pause_spa_sync = B_FALSE; /* * Variables to indicate the livelist condense zthr func should wait at certain * points for the livelist to be removed - used to test condense/destroy races */ -int zfs_livelist_condense_zthr_pause = 0; -int zfs_livelist_condense_sync_pause = 0; +static int zfs_livelist_condense_zthr_pause = 0; +static int zfs_livelist_condense_sync_pause = 0; /* * Variables to track whether or not condense cancellation has been * triggered in testing. 
*/ -int zfs_livelist_condense_sync_cancel = 0; -int zfs_livelist_condense_zthr_cancel = 0; +static int zfs_livelist_condense_sync_cancel = 0; +static int zfs_livelist_condense_zthr_cancel = 0; /* * Variable to track whether or not extra ALLOC blkptrs were added to a * livelist entry while it was being condensed (caused by the way we track * remapped blkptrs in dbuf_remap_impl) */ -int zfs_livelist_condense_new_alloc = 0; +static int zfs_livelist_condense_new_alloc = 0; /* * ========================================================================== @@ -277,7 +319,7 @@ int zfs_livelist_condense_new_alloc = 0; * Add a (source=src, propname=propval) list to an nvlist. */ static void -spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, +spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval, uint64_t intval, zprop_source_t src) { const char *propname = zpool_prop_to_name(prop); @@ -296,6 +338,22 @@ spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, } /* + * Add a user property (source=src, propname=propval) to an nvlist. + */ +static void +spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval, + zprop_source_t src) +{ + nvlist_t *propval; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); + VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); + nvlist_free(propval); +} + +/* * Get property values from the spa configuration. */ static void @@ -341,6 +399,12 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, ddt_get_pool_dedup_ratio(spa), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL, + brt_get_used(spa), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL, + brt_get_saved(spa), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL, + brt_get_ratio(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); @@ -464,7 +528,8 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) zprop_source_t src = ZPROP_SRC_DEFAULT; zpool_prop_t prop; - if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) + if ((prop = zpool_name_to_prop(za.za_name)) == + ZPOOL_PROP_INVAL && !zfs_prop_user(za.za_name)) continue; switch (za.za_integer_length) { @@ -507,7 +572,13 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) kmem_free(strval, za.za_num_integers); break; } - spa_prop_add_list(*nvp, prop, strval, 0, src); + if (prop != ZPOOL_PROP_INVAL) { + spa_prop_add_list(*nvp, prop, strval, 0, src); + } else { + src = ZPROP_SRC_LOCAL; + spa_prop_add_user(*nvp, za.za_name, strval, + src); + } kmem_free(strval, za.za_num_integers); break; @@ -543,42 +614,53 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { uint64_t intval; - char *strval, *slash, *check, *fname; + const char *strval, *slash, *check, *fname; const char *propname = nvpair_name(elem); zpool_prop_t prop = zpool_name_to_prop(propname); switch (prop) { case ZPOOL_PROP_INVAL: - if (!zpool_prop_feature(propname)) { - error = SET_ERROR(EINVAL); - break; - } - /* * Sanitize the input. 
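spa_prop_add_user() earlier in this hunk gives user pool properties the familiar nested {source, value} shape; a userland-style sketch of building the same structure with libnvpair (the property name and value are invented, error checking omitted):

#include <libnvpair.h>
#include <sys/fs/zfs.h>	/* ZPROP_SOURCE, ZPROP_VALUE */

static void
example_add_user_prop(nvlist_t *props)
{
	nvlist_t *propval;

	/* Same layout spa_prop_get() hands back for "com.example:owner". */
	(void) nvlist_alloc(&propval, NV_UNIQUE_NAME, 0);
	(void) nvlist_add_uint64(propval, ZPROP_SOURCE, ZPROP_SRC_LOCAL);
	(void) nvlist_add_string(propval, ZPROP_VALUE, "alice");
	(void) nvlist_add_nvlist(props, "com.example:owner", propval);
	nvlist_free(propval);
}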
*/ - if (nvpair_type(elem) != DATA_TYPE_UINT64) { - error = SET_ERROR(EINVAL); - break; - } + if (zfs_prop_user(propname)) { + if (strlen(propname) >= ZAP_MAXNAMELEN) { + error = SET_ERROR(ENAMETOOLONG); + break; + } - if (nvpair_value_uint64(elem, &intval) != 0) { - error = SET_ERROR(EINVAL); - break; - } + if (strlen(fnvpair_value_string(elem)) >= + ZAP_MAXVALUELEN) { + error = SET_ERROR(E2BIG); + break; + } + } else if (zpool_prop_feature(propname)) { + if (nvpair_type(elem) != DATA_TYPE_UINT64) { + error = SET_ERROR(EINVAL); + break; + } - if (intval != 0) { - error = SET_ERROR(EINVAL); - break; - } + if (nvpair_value_uint64(elem, &intval) != 0) { + error = SET_ERROR(EINVAL); + break; + } + + if (intval != 0) { + error = SET_ERROR(EINVAL); + break; + } + + fname = strchr(propname, '@') + 1; + if (zfeature_lookup_name(fname, NULL) != 0) { + error = SET_ERROR(EINVAL); + break; + } - fname = strchr(propname, '@') + 1; - if (zfeature_lookup_name(fname, NULL) != 0) { + has_feature = B_TRUE; + } else { error = SET_ERROR(EINVAL); break; } - - has_feature = B_TRUE; break; case ZPOOL_PROP_VERSION: @@ -745,7 +827,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) void spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) { - char *cachefile; + const char *cachefile; spa_config_dirent_t *dp; if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), @@ -785,8 +867,14 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp) prop == ZPOOL_PROP_READONLY) continue; + if (prop == ZPOOL_PROP_INVAL && + zfs_prop_user(nvpair_name(elem))) { + need_sync = B_TRUE; + break; + } + if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { - uint64_t ver; + uint64_t ver = 0; if (prop == ZPOOL_PROP_VERSION) { VERIFY(nvpair_value_uint64(elem, &ver) == 0); @@ -840,7 +928,6 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) } } -/*ARGSUSED*/ static int spa_change_guid_check(void *arg, dmu_tx_t *tx) { @@ -910,7 +997,16 @@ spa_change_guid(spa_t *spa) spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); if (error == 0) { - spa_write_cachefile(spa, B_FALSE, B_TRUE); + /* + * Clear the kobj flag from all the vdevs to allow + * vdev_cache_process_kobj_evt() to post events to all the + * vdevs since GUID is updated. + */ + vdev_clear_kobj_evt(spa->spa_root_vdev); + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) + vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]); + + spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } @@ -948,8 +1044,8 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) { ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); - bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); - bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); + memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t)); + memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), @@ -968,17 +1064,33 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t cpus, flags = TASKQ_DYNAMIC; - boolean_t batch = B_FALSE; switch (mode) { case ZTI_MODE_FIXED: ASSERT3U(value, >, 0); break; - case ZTI_MODE_BATCH: - batch = B_TRUE; + case ZTI_MODE_SYNC: + + /* + * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs, + * not to exceed the number of spa allocators, and align to it. 
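The sizing rule just described is easier to follow with concrete numbers. This throwaway userland sketch repeats the same integer math as the kernel statements that follow, with assumed inputs (128 online CPUs, the defaults of 80% and 16 CPUs per write taskq, and 4 allocators):

#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	unsigned boot_ncpus = 128;		/* assumed */
	unsigned zio_taskq_batch_pct = 80;	/* default */
	unsigned zio_taskq_write_tpq = 16;	/* default */
	unsigned spa_alloc_count = 4;		/* assumed */

	unsigned cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
	unsigned count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq));
	count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
	count = MIN(count, spa_alloc_count);
	while (spa_alloc_count % count != 0 &&
	    spa_alloc_count < count * 2)
		count--;
	unsigned value = MIN((zio_taskq_batch_pct + count / 2) / count, 100);

	/* Prints "4 taskqs at 20% of CPUs each". */
	(void) printf("%u taskqs at %u%% of CPUs each\n", count, value);
	return (0);
}

Keeping the taskq count a divisor of the allocator count pays off later in spa_taskq_dispatch(), which routes write-issue zios by io_allocator modulo the taskq count, so every allocator lands on a fixed taskq.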
+ */ + cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); + count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq)); + count = MAX(count, (zio_taskq_batch_pct + 99) / 100); + count = MIN(count, spa->spa_alloc_count); + while (spa->spa_alloc_count % count != 0 && + spa->spa_alloc_count < count * 2) + count--; + + /* + * zio_taskq_batch_pct is unbounded and may exceed 100%, but no + * single taskq may have more threads than 100% of online cpus. + */ + value = (zio_taskq_batch_pct + count / 2) / count; + value = MIN(value, 100); flags |= TASKQ_THREADS_CPU_PCT; - value = MIN(zio_taskq_batch_pct, 100); break; case ZTI_MODE_SCALE: @@ -1025,7 +1137,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) default: panic("unrecognized mode for %s_%s taskq (%u:%u) in " - "spa_activate()", + "spa_taskqs_init()", zio_type_name[t], zio_taskq_types[q], mode, value); break; } @@ -1045,13 +1157,13 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) (void) snprintf(name, sizeof (name), "%s_%s", zio_type_name[t], zio_taskq_types[q]); +#ifdef HAVE_SYSDC if (zio_taskq_sysdc && spa->spa_proc != &p0) { - if (batch) - flags |= TASKQ_DC_BATCH; - + (void) zio_taskq_basedc; tq = taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags); } else { +#endif pri_t pri = maxclsyspri; /* * The write issue taskq can be extremely CPU @@ -1077,7 +1189,9 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) } tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); +#ifdef HAVE_SYSDC } +#endif tqs->stqs_taskq[i] = tq; } @@ -1102,54 +1216,309 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) tqs->stqs_taskq = NULL; } +#ifdef _KERNEL /* - * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. - * Note that a type may have multiple discrete taskqs to avoid lock contention - * on the taskq itself. In that case we choose which taskq at random by using - * the low bits of gethrtime(). + * The READ and WRITE rows of zio_taskqs are configurable at module load time + * by setting zio_taskq_read or zio_taskq_write. + * + * Example (the defaults for READ and WRITE) + * zio_taskq_read='fixed,1,8 null scale null' + * zio_taskq_write='sync null scale null' + * + * Each sets the entire row at a time. + * + * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number + * of threads per taskq. + * + * 'null' can only be set on the high-priority queues (queue selection for + * high-priority queues will fall back to the regular queue if the high-pri + * is NULL. */ -void -spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) +static const char *const modes[ZTI_NMODES] = { + "fixed", "scale", "sync", "null" +}; + +/* Parse the incoming config string. Modifies cfg */ +static int +spa_taskq_param_set(zio_type_t t, char *cfg) { - spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - taskq_t *tq; + int err = 0; - ASSERT3P(tqs->stqs_taskq, !=, NULL); - ASSERT3U(tqs->stqs_count, !=, 0); + zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}}; - if (tqs->stqs_count == 1) { - tq = tqs->stqs_taskq[0]; - } else { - tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; + char *next = cfg, *tok, *c; + + /* + * Parse out each element from the string and fill `row`. The entire + * row has to be set at once, so any errors are flagged by just + * breaking out of this loop early. 
+ */ + uint_t q; + for (q = 0; q < ZIO_TASKQ_TYPES; q++) { + /* `next` is the start of the config */ + if (next == NULL) + break; + + /* Eat up leading space */ + while (isspace(*next)) + next++; + if (*next == '\0') + break; + + /* Mode ends at space or end of string */ + tok = next; + next = strchr(tok, ' '); + if (next != NULL) *next++ = '\0'; + + /* Parameters start after a comma */ + c = strchr(tok, ','); + if (c != NULL) *c++ = '\0'; + + /* Match mode string */ + uint_t mode; + for (mode = 0; mode < ZTI_NMODES; mode++) + if (strcmp(tok, modes[mode]) == 0) + break; + if (mode == ZTI_NMODES) + break; + + /* Invalid canary */ + row[q].zti_mode = ZTI_NMODES; + + /* Per-mode setup */ + switch (mode) { + + /* + * FIXED is parameterised: number of queues, and number of + * threads per queue. + */ + case ZTI_MODE_FIXED: { + /* No parameters? */ + if (c == NULL || *c == '\0') + break; + + /* Find next parameter */ + tok = c; + c = strchr(tok, ','); + if (c == NULL) + break; + + /* Take digits and convert */ + unsigned long long nq; + if (!(isdigit(*tok))) + break; + err = ddi_strtoull(tok, &tok, 10, &nq); + /* Must succeed and also end at the next param sep */ + if (err != 0 || tok != c) + break; + + /* Move past the comma */ + tok++; + /* Need another number */ + if (!(isdigit(*tok))) + break; + /* Remember start to make sure we moved */ + c = tok; + + /* Take digits */ + unsigned long long ntpq; + err = ddi_strtoull(tok, &tok, 10, &ntpq); + /* Must succeed, and moved forward */ + if (err != 0 || tok == c || *tok != '\0') + break; + + /* + * sanity; zero queues/threads make no sense, and + * 16K is almost certainly more than anyone will ever + * need and avoids silly numbers like UINT32_MAX + */ + if (nq == 0 || nq >= 16384 || + ntpq == 0 || ntpq >= 16384) + break; + + const zio_taskq_info_t zti = ZTI_P(ntpq, nq); + row[q] = zti; + break; + } + + case ZTI_MODE_SCALE: { + const zio_taskq_info_t zti = ZTI_SCALE; + row[q] = zti; + break; + } + + case ZTI_MODE_SYNC: { + const zio_taskq_info_t zti = ZTI_SYNC; + row[q] = zti; + break; + } + + case ZTI_MODE_NULL: { + /* + * Can only null the high-priority queues; the general- + * purpose ones have to exist. + */ + if (q != ZIO_TASKQ_ISSUE_HIGH && + q != ZIO_TASKQ_INTERRUPT_HIGH) + break; + + const zio_taskq_info_t zti = ZTI_NULL; + row[q] = zti; + break; + } + + default: + break; + } + + /* Ensure we set a mode */ + if (row[q].zti_mode == ZTI_NMODES) + break; } - taskq_dispatch_ent(tq, func, arg, flags, ent); + /* Didn't get a full row, fail */ + if (q < ZIO_TASKQ_TYPES) + return (SET_ERROR(EINVAL)); + + /* Eat trailing space */ + if (next != NULL) + while (isspace(*next)) + next++; + + /* If there's anything left over then fail */ + if (next != NULL && *next != '\0') + return (SET_ERROR(EINVAL)); + + /* Success! 
Copy it into the real config */ + for (q = 0; q < ZIO_TASKQ_TYPES; q++) + zio_taskqs[t][q] = row[q]; + + return (0); +} + +static int +spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline) +{ + int pos = 0; + + /* Build paramater string from live config */ + const char *sep = ""; + for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) { + const zio_taskq_info_t *zti = &zio_taskqs[t][q]; + if (zti->zti_mode == ZTI_MODE_FIXED) + pos += sprintf(&buf[pos], "%s%s,%u,%u", sep, + modes[zti->zti_mode], zti->zti_count, + zti->zti_value); + else + pos += sprintf(&buf[pos], "%s%s", sep, + modes[zti->zti_mode]); + sep = " "; + } + + if (add_newline) + buf[pos++] = '\n'; + buf[pos] = '\0'; + + return (pos); +} + +#ifdef __linux__ +static int +spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) +{ + char *cfg = kmem_strdup(val); + int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg); + kmem_free(cfg, strlen(val)+1); + return (-err); +} +static int +spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) +{ + return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE)); } +static int +spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) +{ + char *cfg = kmem_strdup(val); + int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg); + kmem_free(cfg, strlen(val)+1); + return (-err); +} +static int +spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) +{ + return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE)); +} +#else /* - * Same as spa_taskq_dispatch_ent() but block on the task until completion. + * On FreeBSD load-time parameters can be set up before malloc() is available, + * so we have to do all the parsing work on the stack. + */ +#define SPA_TASKQ_PARAM_MAX (128) + +static int +spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) +{ + char buf[SPA_TASKQ_PARAM_MAX]; + int err; + + (void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE); + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err || req->newptr == NULL) + return (err); + return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); +} + +static int +spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) +{ + char buf[SPA_TASKQ_PARAM_MAX]; + int err; + + (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE); + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err || req->newptr == NULL) + return (err); + return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); +} +#endif +#endif /* _KERNEL */ + +/* + * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. + * Note that a type may have multiple discrete taskqs to avoid lock contention + * on the taskq itself. */ void -spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags) +spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, zio_t *zio, boolean_t cutinline) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; - taskqid_t id; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); + /* + * NB: We are assuming that the zio can only be dispatched + * to a single taskq at a time. It would be a grievous error + * to dispatch the zio to another taskq at the same time. 
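To make the accepted grammar concrete, here are a few strings spa_taskq_param_set() above would take (or reject) and the rows they would produce; the columns are ISSUE, ISSUE_HIGH, INTR, INTR_HIGH, and the settings themselves are only examples:

/*
 * "fixed,1,8 null scale null"   -> { ZTI_N(8),     ZTI_NULL, ZTI_SCALE, ZTI_NULL }
 * "fixed,2,32 null scale null"  -> { ZTI_P(32, 2), ZTI_NULL, ZTI_SCALE, ZTI_NULL }
 * "sync null scale null"        -> { ZTI_SYNC,     ZTI_NULL, ZTI_SCALE, ZTI_NULL }
 *
 * "null" in the ISSUE or INTR column, a malformed "fixed" parameter, or a
 * string with fewer than four entries returns EINVAL and leaves the live
 * zio_taskqs row untouched.
 */

On Linux these rows would normally be set through the zio_taskq_read/zio_taskq_write module options named above, and on FreeBSD through the corresponding sysctl handlers; the exact knob names outside this file are not shown in this diff.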
+ */ + ASSERT(zio); + ASSERT(taskq_empty_ent(&zio->io_tqent)); + if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; + } else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && + ZIO_HAS_ALLOCATOR(zio)) { + tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } - id = taskq_dispatch(tq, func, arg, flags); - if (id) - taskq_wait_id(tq, id); + taskq_dispatch_ent(tq, func, zio, cutinline ? TQ_FRONT : 0, + &zio->io_tqent); } static void @@ -1162,11 +1531,6 @@ spa_create_zio_taskqs(spa_t *spa) } } -/* - * Disabled until spa_thread() can be adapted for Linux. - */ -#undef HAVE_SPA_THREAD - #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) static void spa_thread(void *arg) @@ -1207,9 +1571,11 @@ spa_thread(void *arg) pool_unlock(); } +#ifdef HAVE_SYSDC if (zio_taskq_sysdc) { sysdc_thread_enter(curthread, 100, 0); } +#endif spa->spa_proc = curproc; spa->spa_did = curthread->t_did; @@ -1238,24 +1604,26 @@ spa_thread(void *arg) } #endif +extern metaslab_ops_t *metaslab_allocator(spa_t *spa); + /* * Activate an uninitialized pool. */ static void spa_activate(spa_t *spa, spa_mode_t mode) { + metaslab_ops_t *msp = metaslab_allocator(spa); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; spa->spa_mode = mode; spa->spa_read_spacemaps = spa_mode_readable_spacemaps; - spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); - spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); - spa->spa_embedded_log_class = - metaslab_class_create(spa, zfs_metaslab_ops); - spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops); - spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops); + spa->spa_normal_class = metaslab_class_create(spa, msp); + spa->spa_log_class = metaslab_class_create(spa, msp); + spa->spa_embedded_log_class = metaslab_class_create(spa, msp); + spa->spa_special_class = metaslab_class_create(spa, msp); + spa->spa_dedup_class = metaslab_class_create(spa, msp); /* Try to create a covering process */ mutex_enter(&spa->spa_proc_lock); @@ -1313,6 +1681,11 @@ spa_activate(spa_t *spa, spa_mode_t mode) avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); + avl_create(&spa->spa_errlist_healed, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); + + spa_activate_os(spa); spa_keystore_init(&spa->spa_keystore); @@ -1335,6 +1708,13 @@ spa_activate(spa_t *spa, spa_mode_t mode) 1, INT_MAX, 0); /* + * The taskq to preload metaslabs. + */ + spa->spa_metaslab_taskq = taskq_create("z_metaslab", + metaslab_preload_pct, maxclsyspri, 1, INT_MAX, + TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); + + /* * Taskq dedicated to prefetcher threads: this is used to prevent the * pool traverse code from monopolizing the global (and limited) * system_taskq by inappropriately scheduling long running tasks on it. 
@@ -1369,6 +1749,11 @@ spa_deactivate(spa_t *spa) spa->spa_zvol_taskq = NULL; } + if (spa->spa_metaslab_taskq) { + taskq_destroy(spa->spa_metaslab_taskq); + spa->spa_metaslab_taskq = NULL; + } + if (spa->spa_prefetch_taskq) { taskq_destroy(spa->spa_prefetch_taskq); spa->spa_prefetch_taskq = NULL; @@ -1421,6 +1806,7 @@ spa_deactivate(spa_t *spa) spa_errlog_drain(spa); avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); + avl_destroy(&spa->spa_errlist_healed); spa_keystore_fini(&spa->spa_keystore); @@ -1450,6 +1836,9 @@ spa_deactivate(spa_t *spa) thread_join(spa->spa_did); spa->spa_did = 0; } + + spa_deactivate_os(spa); + } /* @@ -1542,16 +1931,16 @@ spa_unload_log_sm_metadata(spa_t *spa) { void *cookie = NULL; spa_log_sm_t *sls; + log_summary_entry_t *e; + while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, &cookie)) != NULL) { VERIFY0(sls->sls_mscount); kmem_free(sls, sizeof (spa_log_sm_t)); } - for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); - e != NULL; e = list_head(&spa->spa_log_summary)) { + while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) { VERIFY0(e->lse_mscount); - list_remove(&spa->spa_log_summary, e); kmem_free(e, sizeof (log_summary_entry_t)); } @@ -1579,6 +1968,10 @@ spa_destroy_aux_threads(spa_t *spa) zthr_destroy(spa->spa_livelist_condense_zthr); spa->spa_livelist_condense_zthr = NULL; } + if (spa->spa_raidz_expand_zthr != NULL) { + zthr_destroy(spa->spa_raidz_expand_zthr); + spa->spa_raidz_expand_zthr = NULL; + } } /* @@ -1587,7 +1980,8 @@ spa_destroy_aux_threads(spa_t *spa) static void spa_unload(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); spa_import_progress_remove(spa_guid(spa)); @@ -1596,25 +1990,33 @@ spa_unload(spa_t *spa) spa_wake_waiters(spa); /* - * If the log space map feature is enabled and the pool is getting - * exported (but not destroyed), we want to spend some time flushing - * as many metaslabs as we can in an attempt to destroy log space - * maps and save import time. + * If we have set the spa_final_txg, we have already performed the + * tasks below in spa_export_common(). We should not redo it here since + * we delay the final TXGs beyond what spa_final_txg is set at. */ - if (spa_should_flush_logs_on_unload(spa)) - spa_unload_log_sm_flush_all(spa); + if (spa->spa_final_txg == UINT64_MAX) { + /* + * If the log space map feature is enabled and the pool is + * getting exported (but not destroyed), we want to spend some + * time flushing as many metaslabs as we can in an attempt to + * destroy log space maps and save import time. + */ + if (spa_should_flush_logs_on_unload(spa)) + spa_unload_log_sm_flush_all(spa); - /* - * Stop async tasks. - */ - spa_async_suspend(spa); + /* + * Stop async tasks. 
+ */ + spa_async_suspend(spa); - if (spa->spa_root_vdev) { - vdev_t *root_vdev = spa->spa_root_vdev; - vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE); - vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); - vdev_autotrim_stop_all(spa); - vdev_rebuild_stop_all(spa); + if (spa->spa_root_vdev) { + vdev_t *root_vdev = spa->spa_root_vdev; + vdev_initialize_stop_all(root_vdev, + VDEV_INITIALIZE_ACTIVE); + vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); + vdev_autotrim_stop_all(spa); + vdev_rebuild_stop_all(spa); + } } /* @@ -1629,13 +2031,7 @@ spa_unload(spa_t *spa) * This ensures that there is no async metaslab prefetching * while we attempt to unload the spa. */ - if (spa->spa_root_vdev != NULL) { - for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { - vdev_t *vc = spa->spa_root_vdev->vdev_child[c]; - if (vc->vdev_mg != NULL) - taskq_wait(vc->vdev_mg->mg_taskq); - } - } + taskq_wait(spa->spa_metaslab_taskq); if (spa->spa_mmp.mmp_thread) mmp_thread_stop(spa); @@ -1680,6 +2076,7 @@ spa_unload(spa_t *spa) } ddt_unload(spa); + brt_unload(spa); spa_unload_log_sm_metadata(spa); /* @@ -1687,9 +2084,9 @@ spa_unload(spa_t *spa) */ spa_l2cache_drop(spa); - for (int i = 0; i < spa->spa_spares.sav_count; i++) - vdev_free(spa->spa_spares.sav_vdevs[i]); if (spa->spa_spares.sav_vdevs) { + for (int i = 0; i < spa->spa_spares.sav_count; i++) + vdev_free(spa->spa_spares.sav_vdevs[i]); kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); spa->spa_spares.sav_vdevs = NULL; @@ -1700,11 +2097,11 @@ spa_unload(spa_t *spa) } spa->spa_spares.sav_count = 0; - for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { - vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); - vdev_free(spa->spa_l2cache.sav_vdevs[i]); - } if (spa->spa_l2cache.sav_vdevs) { + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); + vdev_free(spa->spa_l2cache.sav_vdevs[i]); + } kmem_free(spa->spa_l2cache.sav_vdevs, spa->spa_l2cache.sav_count * sizeof (void *)); spa->spa_l2cache.sav_vdevs = NULL; @@ -1728,6 +2125,8 @@ spa_unload(spa_t *spa) spa->spa_compatibility = NULL; } + spa->spa_raidz_expand = NULL; + spa_config_exit(spa, SCL_ALL, spa); } @@ -1762,20 +2161,21 @@ spa_load_spares(spa_t *spa) /* * First, close and free any existing spare vdevs. 
*/ - for (i = 0; i < spa->spa_spares.sav_count; i++) { - vd = spa->spa_spares.sav_vdevs[i]; + if (spa->spa_spares.sav_vdevs) { + for (i = 0; i < spa->spa_spares.sav_count; i++) { + vd = spa->spa_spares.sav_vdevs[i]; - /* Undo the call to spa_activate() below */ - if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, - B_FALSE)) != NULL && tvd->vdev_isspare) - spa_spare_remove(tvd); - vdev_close(vd); - vdev_free(vd); - } + /* Undo the call to spa_activate() below */ + if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, + B_FALSE)) != NULL && tvd->vdev_isspare) + spa_spare_remove(tvd); + vdev_close(vd); + vdev_free(vd); + } - if (spa->spa_spares.sav_vdevs) kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); + } if (spa->spa_spares.sav_config == NULL) nspares = 0; @@ -1851,7 +2251,8 @@ spa_load_spares(spa_t *spa) spares[i] = vdev_config_generate(spa, spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); fnvlist_add_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count); + ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, + spa->spa_spares.sav_count); for (i = 0; i < spa->spa_spares.sav_count; i++) nvlist_free(spares[i]); kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); @@ -1978,30 +2379,31 @@ spa_load_l2cache(spa_t *spa) for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); - fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, - sav->sav_count); + fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, + (const nvlist_t * const *)l2cache, sav->sav_count); out: /* * Purge vdevs that were dropped */ - for (i = 0; i < oldnvdevs; i++) { - uint64_t pool; + if (oldvdevs) { + for (i = 0; i < oldnvdevs; i++) { + uint64_t pool; - vd = oldvdevs[i]; - if (vd != NULL) { - ASSERT(vd->vdev_isl2cache); + vd = oldvdevs[i]; + if (vd != NULL) { + ASSERT(vd->vdev_isl2cache); - if (spa_l2cache_exists(vd->vdev_guid, &pool) && - pool != 0ULL && l2arc_vdev_present(vd)) - l2arc_remove_vdev(vd); - vdev_clear_stats(vd); - vdev_free(vd); + if (spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL && l2arc_vdev_present(vd)) + l2arc_remove_vdev(vd); + vdev_clear_stats(vd); + vdev_free(vd); + } } - } - if (oldvdevs) kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); + } for (i = 0; i < sav->sav_count; i++) nvlist_free(l2cache[i]); @@ -2107,8 +2509,8 @@ spa_check_for_missing_logs(spa_t *spa) } if (idx > 0) { - fnvlist_add_nvlist_array(nv, - ZPOOL_CONFIG_CHILDREN, child, idx); + fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + (const nvlist_t * const *)child, idx); fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_MISSING_DEVICES, nv); @@ -2243,12 +2645,13 @@ spa_claim_notify(zio_t *zio) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ - if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) - spa->spa_claim_max_txg = zio->io_bp->blk_birth; + if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp)) + spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp); mutex_exit(&spa->spa_props_lock); } typedef struct spa_load_error { + boolean_t sle_verify_data; uint64_t sle_meta_count; uint64_t sle_data_count; } spa_load_error_t; @@ -2281,18 +2684,19 @@ spa_load_verify_done(zio_t *zio) * Maximum number of inflight bytes is the log2 fraction of the arc size. * By default, we set it to 1/16th of the arc. 
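A quick arithmetic check of the bound described above, with an assumed ARC target (the 8 GiB figure is made up):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t arc_target = 8ULL << 30;	/* assume arc_target_bytes() == 8 GiB */
	unsigned shift = 4;			/* default spa_load_verify_shift */

	/* 8 GiB >> 4 = 512 MiB of verification reads may be in flight. */
	(void) printf("%llu bytes in flight\n",
	    (unsigned long long)(arc_target >> shift));
	return (0);
}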
*/ -int spa_load_verify_shift = 4; -int spa_load_verify_metadata = B_TRUE; -int spa_load_verify_data = B_TRUE; +static uint_t spa_load_verify_shift = 4; +static int spa_load_verify_metadata = B_TRUE; +static int spa_load_verify_data = B_TRUE; -/*ARGSUSED*/ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { - if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || - BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) - return (0); + zio_t *rio = arg; + spa_load_error_t *sle = rio->io_private; + + (void) zilog, (void) dnp; + /* * Note: normally this routine will not be called if * spa_load_verify_metadata is not set. However, it may be useful @@ -2300,12 +2704,28 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, */ if (!spa_load_verify_metadata) return (0); - if (!BP_IS_METADATA(bp) && !spa_load_verify_data) + + /* + * Sanity check the block pointer in order to detect obvious damage + * before using the contents in subsequent checks or in zio_read(). + * When damaged consider it to be a metadata error since we cannot + * trust the BP_GET_TYPE and BP_GET_LEVEL values. + */ + if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { + atomic_inc_64(&sle->sle_meta_count); + return (0); + } + + if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || + BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) + return (0); + + if (!BP_IS_METADATA(bp) && + (!spa_load_verify_data || !sle->sle_verify_data)) return (0); uint64_t maxinflight_bytes = arc_target_bytes() >> spa_load_verify_shift; - zio_t *rio = arg; size_t size = BP_GET_PSIZE(bp); mutex_enter(&spa->spa_scrub_lock); @@ -2321,10 +2741,11 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, return (0); } -/* ARGSUSED */ static int verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { + (void) dp, (void) arg; + if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); @@ -2342,7 +2763,8 @@ spa_load_verify(spa_t *spa) zpool_get_load_policy(spa->spa_config, &policy); - if (policy.zlp_rewind & ZPOOL_NEVER_REWIND) + if (policy.zlp_rewind & ZPOOL_NEVER_REWIND || + policy.zlp_maxmeta == UINT64_MAX) return (0); dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); @@ -2353,6 +2775,13 @@ spa_load_verify(spa_t *spa) if (error != 0) return (error); + /* + * Verify data only if we are rewinding or error limit was set. + * Otherwise nothing except dbgmsg care about it to waste time. + */ + sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) || + (policy.zlp_maxdata < UINT64_MAX); + rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); @@ -2397,6 +2826,8 @@ spa_load_verify(spa_t *spa) fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, loss); fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count); + fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; @@ -2454,10 +2885,10 @@ spa_livelist_delete_check(spa_t *spa) return (spa->spa_livelists_to_delete != 0); } -/* ARGSUSED */ static boolean_t spa_livelist_delete_cb_check(void *arg, zthr_t *z) { + (void) z; spa_t *spa = arg; return (spa_livelist_delete_check(spa)); } @@ -2549,7 +2980,6 @@ livelist_delete_sync(void *arg, dmu_tx_t *tx) * be freed. Then, call a synctask which performs the actual frees and updates * the pool-wide livelist data. 
*/ -/* ARGSUSED */ static void spa_livelist_delete_cb(void *arg, zthr_t *z) { @@ -2795,7 +3225,6 @@ spa_livelist_condense_cb(void *arg, zthr_t *t) zfs_livelist_condense_zthr_cancel++; } -/* ARGSUSED */ /* * Check that there is something to condense but that a condense is not * already in progress and that condensing has not been cancelled. @@ -2803,6 +3232,7 @@ spa_livelist_condense_cb(void *arg, zthr_t *t) static boolean_t spa_livelist_condense_cb_check(void *arg, zthr_t *z) { + (void) z; spa_t *spa = arg; if ((spa->spa_to_condense.ds != NULL) && (spa->spa_to_condense.syncing == B_FALSE) && @@ -2833,8 +3263,7 @@ spa_spawn_aux_threads(spa_t *spa) { ASSERT(spa_writeable(spa)); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - + spa_start_raidz_expansion_thread(spa); spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); spa_start_livelist_condensing_thread(spa); @@ -2931,12 +3360,13 @@ spa_try_repair(spa_t *spa, nvlist_t *config) static int spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) { - char *ereport = FM_EREPORT_ZFS_POOL; + const char *ereport = FM_EREPORT_ZFS_POOL; int error; spa->spa_load_state = state; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); + spa_import_progress_set_notes(spa, "spa_load()"); gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, type, &ereport); @@ -2978,6 +3408,12 @@ vdev_count_verify_zaps(vdev_t *vd) spa_t *spa = vd->vdev_spa; uint64_t total = 0; + if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) && + vd->vdev_root_zap != 0) { + total++; + ASSERT0(zap_lookup_int(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, vd->vdev_root_zap)); + } if (vd->vdev_top_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, @@ -2995,6 +3431,8 @@ vdev_count_verify_zaps(vdev_t *vd) return (total); } +#else +#define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0) #endif /* @@ -3146,18 +3584,23 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub) } /* - * Perform the import activity check. If the user canceled the import or - * we detected activity then fail. + * Remote host activity check. + * + * error results: + * 0 - no activity detected + * EREMOTEIO - remote activity detected + * EINTR - user canceled the operation */ static int -spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) +spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config, + boolean_t importing) { uint64_t txg = ub->ub_txg; uint64_t timestamp = ub->ub_timestamp; uint64_t mmp_config = ub->ub_mmp_config; uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? 
MMP_SEQ(ub) : 0; uint64_t import_delay; - hrtime_t import_expire; + hrtime_t import_expire, now; nvlist_t *mmp_label = NULL; vdev_t *rvd = spa->spa_root_vdev; kcondvar_t cv; @@ -3195,9 +3638,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) import_expire = gethrtime() + import_delay; - while (gethrtime() < import_expire) { - (void) spa_import_progress_set_mmp_check(spa_guid(spa), - NSEC2SEC(import_expire - gethrtime())); + if (importing) { + spa_import_progress_set_notes(spa, "Checking MMP activity, " + "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); + } + + int iterations = 0; + while ((now = gethrtime()) < import_expire) { + if (importing && iterations++ % 30 == 0) { + spa_import_progress_set_notes(spa, "Checking MMP " + "activity, %llu ms remaining", + (u_longlong_t)NSEC2MSEC(import_expire - now)); + } + + if (importing) { + (void) spa_import_progress_set_mmp_check(spa_guid(spa), + NSEC2SEC(import_expire - gethrtime())); + } vdev_uberblock_load(rvd, ub, &mmp_label); @@ -3246,7 +3703,7 @@ out: * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool */ if (error == EREMOTEIO) { - char *hostname = "<unknown>"; + const char *hostname = "<unknown>"; uint64_t hostid = 0; if (mmp_label) { @@ -3279,11 +3736,66 @@ out: return (error); } +/* + * Called from zfs_ioc_clear for a pool that was suspended + * after failing mmp write checks. + */ +boolean_t +spa_mmp_remote_host_activity(spa_t *spa) +{ + ASSERT(spa_multihost(spa) && spa_suspended(spa)); + + nvlist_t *best_label; + uberblock_t best_ub; + + /* + * Locate the best uberblock on disk + */ + vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label); + if (best_label) { + /* + * confirm that the best hostid matches our hostid + */ + if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) && + spa_get_hostid(spa) != + fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) { + nvlist_free(best_label); + return (B_TRUE); + } + nvlist_free(best_label); + } else { + return (B_TRUE); + } + + if (!MMP_VALID(&best_ub) || + !MMP_FAIL_INT_VALID(&best_ub) || + MMP_FAIL_INT(&best_ub) == 0) { + return (B_TRUE); + } + + if (best_ub.ub_txg != spa->spa_uberblock.ub_txg || + best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) { + zfs_dbgmsg("txg mismatch detected during pool clear " + "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu", + (u_longlong_t)spa->spa_uberblock.ub_txg, + (u_longlong_t)best_ub.ub_txg, + (u_longlong_t)spa->spa_uberblock.ub_timestamp, + (u_longlong_t)best_ub.ub_timestamp); + return (B_TRUE); + } + + /* + * Perform an activity check looking for any remote writer + */ + return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config, + B_FALSE) != 0); +} + static int spa_verify_host(spa_t *spa, nvlist_t *mos_config) { uint64_t hostid; - char *hostname; + const char *hostname; uint64_t myhostid = 0; if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, @@ -3318,8 +3830,8 @@ spa_ld_parse_config(spa_t *spa, spa_import_type_t type) int parse; vdev_t *rvd; uint64_t pool_guid; - char *comment; - char *compatibility; + const char *comment; + const char *compatibility; /* * Versioning wasn't explicitly added to the label until later, so if @@ -3581,6 +4093,12 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) } spa_load_note(spa, "using uberblock with txg=%llu", (u_longlong_t)ub->ub_txg); + if (ub->ub_raidz_reflow_info != 0) { + spa_load_note(spa, "uberblock raidz_reflow_info: " + "state=%u offset=%llu", + (int)RRSS_GET_STATE(ub), + (u_longlong_t)RRSS_GET_OFFSET(ub)); + } /* @@ 
-3599,7 +4117,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } - int error = spa_activity_check(spa, ub, spa->spa_config); + int error = + spa_activity_check(spa, ub, spa->spa_config, B_TRUE); if (error) { nvlist_free(label); return (error); @@ -3806,6 +4325,24 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, spa_config_exit(spa, SCL_ALL, FTAG); /* + * If 'zpool import' used a cached config, then the on-disk hostid and + * hostname may be different to the cached config in ways that should + * prevent import. Userspace can't discover this without a scan, but + * we know, so we add these values to LOAD_INFO so the caller can know + * the difference. + * + * Note that we have to do this before the config is regenerated, + * because the new config will have the hostid and hostname for this + * host, in readiness for import. + */ + if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID)) + fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID, + fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID)); + if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME)) + fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME, + fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME)); + + /* * We will use spa_config if we decide to reload the spa or if spa_load * fails and we rewind. We must thus regenerate the config using the * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to @@ -4163,6 +4700,7 @@ spa_ld_get_props(spa_t *spa) spa->spa_avz_action = AVZ_ACTION_INITIALIZE; ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } else if (error != 0) { + nvlist_free(mos_config); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { /* @@ -4323,7 +4861,7 @@ spa_ld_load_vdev_metadata(spa_t *spa) error = spa_ld_log_spacemaps(spa); if (error != 0) { - spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]", + spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } @@ -4354,7 +4892,22 @@ spa_ld_load_dedup_tables(spa_t *spa) } static int -spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) +spa_ld_load_brt(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + error = brt_load(spa); + if (error != 0) { + spa_load_failed(spa, "brt_load failed [error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + return (0); +} + +static int +spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport) { vdev_t *rvd = spa->spa_root_vdev; @@ -4481,7 +5034,8 @@ spa_ld_read_checkpoint_txg(spa_t *spa) int error = 0; ASSERT0(spa->spa_checkpoint_txg); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), @@ -4721,13 +5275,14 @@ spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, * config stored in the MOS. 
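The hostid/hostname entries added to spa_load_info above are aimed at the import caller; a userland-style sketch of a consumer of the returned ZPOOL_CONFIG_LOAD_INFO nvlist (the helper is hypothetical, error paths omitted):

#include <stdio.h>
#include <libnvpair.h>
#include <sys/fs/zfs.h>

static void
report_last_writer(nvlist_t *load_info)
{
	uint64_t hostid = 0;
	const char *hostname = "<unknown>";

	if (nvlist_exists(load_info, ZPOOL_CONFIG_HOSTID))
		hostid = fnvlist_lookup_uint64(load_info, ZPOOL_CONFIG_HOSTID);
	if (nvlist_exists(load_info, ZPOOL_CONFIG_HOSTNAME))
		hostname = fnvlist_lookup_string(load_info,
		    ZPOOL_CONFIG_HOSTNAME);

	(void) printf("pool was last written by %s (hostid %llx)\n",
	    hostname, (unsigned long long)hostid);
}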
*/ static int -spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) +spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) { int error = 0; boolean_t missing_feat_write = B_FALSE; boolean_t checkpoint_rewind = (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); boolean_t update_config_cache = B_FALSE; + hrtime_t load_start = gethrtime(); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); @@ -4773,11 +5328,18 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) } /* + * Drop the namespace lock for the rest of the function. + */ + spa->spa_load_thread = curthread; + mutex_exit(&spa_namespace_lock); + + /* * Retrieve the checkpoint txg if the pool has a checkpoint. */ + spa_import_progress_set_notes(spa, "Loading checkpoint txg"); error = spa_ld_read_checkpoint_txg(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve the mapping of indirect vdevs. Those vdevs were removed @@ -4787,60 +5349,73 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * initiated. Otherwise we could be reading from indirect vdevs before * we have loaded their mappings. */ + spa_import_progress_set_notes(spa, "Loading indirect vdev metadata"); error = spa_ld_open_indirect_vdev_metadata(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve the full list of active features from the MOS and check if * they are all supported. */ + spa_import_progress_set_notes(spa, "Checking feature flags"); error = spa_ld_check_features(spa, &missing_feat_write); if (error != 0) - return (error); + goto fail; /* * Load several special directories from the MOS needed by the dsl_pool * layer. */ + spa_import_progress_set_notes(spa, "Loading special MOS directories"); error = spa_ld_load_special_directories(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve pool properties from the MOS. */ + spa_import_progress_set_notes(spa, "Loading properties"); error = spa_ld_get_props(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve the list of auxiliary devices - cache devices and spares - * and open them. */ + spa_import_progress_set_notes(spa, "Loading AUX vdevs"); error = spa_ld_open_aux_vdevs(spa, type); if (error != 0) - return (error); + goto fail; /* * Load the metadata for all vdevs. Also check if unopenable devices * should be autoreplaced. */ + spa_import_progress_set_notes(spa, "Loading vdev metadata"); error = spa_ld_load_vdev_metadata(spa); if (error != 0) - return (error); + goto fail; + spa_import_progress_set_notes(spa, "Loading dedup tables"); error = spa_ld_load_dedup_tables(spa); if (error != 0) - return (error); + goto fail; + + spa_import_progress_set_notes(spa, "Loading BRT"); + error = spa_ld_load_brt(spa); + if (error != 0) + goto fail; /* * Verify the logs now to make sure we don't have any unexpected errors * when we claim log blocks later. */ + spa_import_progress_set_notes(spa, "Verifying Log Devices"); error = spa_ld_verify_logs(spa, type, ereport); if (error != 0) - return (error); + goto fail; if (missing_feat_write) { ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); @@ -4850,8 +5425,9 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * read-only mode but not read-write mode. We now have enough * information and can return to userland. 
*/ - return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, - ENOTSUP)); + error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, + ENOTSUP); + goto fail; } /* @@ -4859,15 +5435,17 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * state. When performing an extreme rewind, we verify the whole pool, * which can take a very long time. */ + spa_import_progress_set_notes(spa, "Verifying pool data"); error = spa_ld_verify_pool_data(spa); if (error != 0) - return (error); + goto fail; /* * Calculate the deflated space for the pool. This must be done before * we write anything to the pool because we'd need to update the space * accounting using the deflated sizes. */ + spa_import_progress_set_notes(spa, "Calculating deflated space"); spa_update_dspace(spa); /* @@ -4875,6 +5453,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * pool. If we are importing the pool in read-write mode, a few * additional steps must be performed to finish the import. */ + spa_import_progress_set_notes(spa, "Starting import"); if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { uint64_t config_cache_txg = spa->spa_config_txg; @@ -4882,6 +5461,13 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); /* + * Before we do any zio_write's, complete the raidz expansion + * scratch space copying, if necessary. + */ + if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID) + vdev_raidz_reflow_copy_scratch(spa); + + /* * In case of a checkpoint rewind, log the original txg * of the checkpointed uberblock. */ @@ -4891,6 +5477,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); } + spa_import_progress_set_notes(spa, "Claiming ZIL blocks"); /* * Traverse the ZIL and claim all blocks. */ @@ -4910,6 +5497,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * will have been set for us by ZIL traversal operations * performed above. */ + spa_import_progress_set_notes(spa, "Syncing ZIL claims"); txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* @@ -4917,6 +5505,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * next sync, we would update the config stored in vdev labels * and the cachefile (by default /etc/zfs/zpool.cache). */ + spa_import_progress_set_notes(spa, "Updating configs"); spa_ld_check_for_config_update(spa, config_cache_txg, update_config_cache); @@ -4925,6 +5514,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * Then check all DTLs to see if anything needs resilvering. * The resilver will be deferred if a rebuild was started. */ + spa_import_progress_set_notes(spa, "Starting resilvers"); if (vdev_rebuild_active(spa->spa_root_vdev)) { vdev_rebuild_restart(spa); } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && @@ -4938,6 +5528,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) */ spa_history_log_version(spa, "open", NULL); + spa_import_progress_set_notes(spa, + "Restarting device removals"); spa_restart_removal(spa); spa_spawn_aux_threads(spa); @@ -4950,27 +5542,40 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * auxiliary threads above (from which the livelist * deletion zthr is part of). 
*/ + spa_import_progress_set_notes(spa, + "Cleaning up inconsistent objsets"); (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); /* * Clean up any stale temporary dataset userrefs. */ + spa_import_progress_set_notes(spa, + "Cleaning up temporary userrefs"); dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_import_progress_set_notes(spa, "Restarting initialize"); vdev_initialize_restart(spa->spa_root_vdev); + spa_import_progress_set_notes(spa, "Restarting TRIM"); vdev_trim_restart(spa->spa_root_vdev); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); + spa_import_progress_set_notes(spa, "Finished importing"); } + zio_handle_import_delay(spa, gethrtime() - load_start); spa_import_progress_remove(spa_guid(spa)); spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_load_note(spa, "LOADED"); +fail: + mutex_enter(&spa_namespace_lock); + spa->spa_load_thread = NULL; + cv_broadcast(&spa_namespace_cv); + + return (error); - return (0); } static int @@ -5112,8 +5717,8 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, * ambiguous state. */ static int -spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, - nvlist_t **config) +spa_open_common(const char *pool, spa_t **spapp, const void *tag, + nvlist_t *nvpolicy, nvlist_t **config) { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; @@ -5170,7 +5775,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, */ spa_unload(spa); spa_deactivate(spa); - spa_write_cachefile(spa, B_TRUE, B_TRUE); + spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); @@ -5208,7 +5813,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, * If we've recovered the pool, pass back any information we * gathered while doing the load. 
*/ - if (state == SPA_LOAD_RECOVER) { + if (state == SPA_LOAD_RECOVER && config != NULL) { fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); } @@ -5229,14 +5834,14 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, } int -spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, - nvlist_t **config) +spa_open_rewind(const char *name, spa_t **spapp, const void *tag, + nvlist_t *policy, nvlist_t **config) { return (spa_open_common(name, spapp, tag, policy, config)); } int -spa_open(const char *name, spa_t **spapp, void *tag) +spa_open(const char *name, spa_t **spapp, const void *tag) { return (spa_open_common(name, spapp, tag, NULL, NULL)); } @@ -5292,8 +5897,8 @@ spa_add_spares(spa_t *spa, nvlist_t *config) VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares)); if (nspares != 0) { - fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, spares, - nspares); + fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + (const nvlist_t * const *)spares, nspares); VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares)); @@ -5305,13 +5910,15 @@ spa_add_spares(spa_t *spa, nvlist_t *config) for (i = 0; i < nspares; i++) { guid = fnvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID); + VERIFY0(nvlist_lookup_uint64_array(spares[i], + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { - VERIFY0(nvlist_lookup_uint64_array(spares[i], - ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, - &vsc)); vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; + } else { + vs->vs_state = + spa->spa_spares.sav_vdevs[i]->vdev_state; } } } @@ -5340,8 +5947,8 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); if (nl2cache != 0) { - fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, l2cache, - nl2cache); + fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + (const nvlist_t * const *)l2cache, nl2cache); VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); @@ -5484,7 +6091,7 @@ spa_get_stats(const char *name, nvlist_t **config, fnvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, - spa_get_errlog_size(spa)); + spa_approx_errlog_size(spa)); if (spa_suspended(spa)) { fnvlist_add_uint64(*config, @@ -5648,8 +6255,8 @@ spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, fnvlist_remove(sav->sav_config, config); - fnvlist_add_nvlist_array(sav->sav_config, config, newdevs, - ndevs + oldndevs); + fnvlist_add_nvlist_array(sav->sav_config, config, + (const nvlist_t * const *)newdevs, ndevs + oldndevs); for (i = 0; i < oldndevs + ndevs; i++) nvlist_free(newdevs[i]); kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); @@ -5658,7 +6265,8 @@ spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, * Generate a new dev list. 
*/ sav->sav_config = fnvlist_alloc(); - fnvlist_add_nvlist_array(sav->sav_config, config, devs, ndevs); + fnvlist_add_nvlist_array(sav->sav_config, config, + (const nvlist_t * const *)devs, ndevs); } } @@ -5708,7 +6316,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, dsl_crypto_params_t *dcp) { spa_t *spa; - char *altroot = NULL; + const char *altroot = NULL; vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; @@ -5721,12 +6329,13 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, boolean_t has_encryption; boolean_t has_allocclass; spa_feature_t feat; - char *feat_name; - char *poolname; + const char *feat_name; + const char *poolname; nvlist_t *nvl; if (props == NULL || - nvlist_lookup_string(props, "tname", &poolname) != 0) + nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0) poolname = (char *)pool; /* @@ -5869,7 +6478,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, &spares, &nspares) == 0) { spa->spa_spares.sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares); + ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, + nspares); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); @@ -5881,9 +6491,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { - spa->spa_l2cache.sav_config = fnvlist_alloc(); + VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config, + NV_UNIQUE_NAME, KM_SLEEP)); fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache); + ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, + nl2cache); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); @@ -5898,6 +6510,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, * Create DDTs (dedup tables). */ ddt_create(spa); + /* + * Create BRT table and BRT table object. 
+ */ + brt_create(spa); spa_update_dspace(spa); @@ -5990,7 +6606,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_spawn_aux_threads(spa); - spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); /* * Don't count references from objsets that are already closed @@ -6000,6 +6616,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); spa->spa_load_state = SPA_LOAD_NONE; + spa_import_os(spa); + mutex_exit(&spa_namespace_lock); return (0); @@ -6012,7 +6630,7 @@ int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; - char *altroot = NULL; + const char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_load_policy_t policy; spa_mode_t mode = spa_mode_global; @@ -6051,7 +6669,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) if (props != NULL) spa_configfile_set(spa, props, B_FALSE); - spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); zfs_dbgmsg("spa_import: verbatim import of %s", pool); mutex_exit(&spa_namespace_lock); @@ -6131,7 +6749,8 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) else spa->spa_spares.sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares); + ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, + nspares); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); @@ -6145,7 +6764,8 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) else spa->spa_l2cache.sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache); + ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, + nl2cache); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); @@ -6181,6 +6801,8 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) zvol_create_minors_recursive(pool); + spa_import_os(spa); + return (0); } @@ -6188,7 +6810,7 @@ nvlist_t * spa_tryimport(nvlist_t *tryconfig) { nvlist_t *config = NULL; - char *poolname, *cachefile; + const char *poolname, *cachefile; spa_t *spa; uint64_t state; int error; @@ -6203,9 +6825,14 @@ spa_tryimport(nvlist_t *tryconfig) /* * Create and initialize the spa structure. */ + char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP); + (void) snprintf(name, MAXPATHLEN, "%s-%llx-%s", + TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname); + mutex_enter(&spa_namespace_lock); - spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); + spa = spa_add(name, tryconfig, NULL); spa_activate(spa, SPA_MODE_READ); + kmem_free(name, MAXPATHLEN); /* * Rewind pool if a max txg was provided. @@ -6228,6 +6855,16 @@ spa_tryimport(nvlist_t *tryconfig) spa->spa_config_source = SPA_CONFIG_SRC_SCAN; } + /* + * spa_import() relies on a pool config fetched by spa_try_import() + * for spare/cache devices. Import flags are not passed to + * spa_tryimport(), which makes it return early due to a missing log + * device and missing retrieving the cache device and spare eventually. + * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch + * the correct configuration regardless of the missing log device. 
+ */ + spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG; + error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); /* @@ -6308,8 +6945,9 @@ static int spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { - int error; + int error = 0; spa_t *spa; + hrtime_t export_start = gethrtime(); if (oldconfig) *oldconfig = NULL; @@ -6331,8 +6969,8 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, spa->spa_is_exporting = B_TRUE; /* - * Put a hold on the pool, drop the namespace lock, stop async tasks, - * reacquire the namespace lock, and see if we can export. + * Put a hold on the pool, drop the namespace lock, stop async tasks + * and see if we can export. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); @@ -6342,10 +6980,14 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, taskq_wait(spa->spa_zvol_taskq); } mutex_enter(&spa_namespace_lock); + spa->spa_export_thread = curthread; spa_close(spa, FTAG); - if (spa->spa_state == POOL_STATE_UNINITIALIZED) + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { + mutex_exit(&spa_namespace_lock); goto export_spa; + } + /* * The pool will be in core if it's openable, in which case we can * modify its state. Objsets may be open only because they're dirty, @@ -6366,7 +7008,16 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, goto fail; } + mutex_exit(&spa_namespace_lock); + /* + * At this point we no longer hold the spa_namespace_lock and + * there were no references on the spa. Future spa_lookups will + * notice the spa->spa_export_thread and wait until we signal + * that we are finshed. + */ + if (spa->spa_sync_on) { + vdev_t *rvd = spa->spa_root_vdev; /* * A pool cannot be exported if it has an active shared spare. * This is to prevent other pools stealing the active spare @@ -6376,6 +7027,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { error = SET_ERROR(EXDEV); + mutex_enter(&spa_namespace_lock); goto fail; } @@ -6386,13 +7038,10 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, * dirty data resulting from the initialization is * committed to disk before we unload the pool. */ - if (spa->spa_root_vdev != NULL) { - vdev_t *rvd = spa->spa_root_vdev; - vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); - vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); - vdev_autotrim_stop_all(spa); - vdev_rebuild_stop_all(spa); - } + vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); + vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); + vdev_autotrim_stop_all(spa); + vdev_rebuild_stop_all(spa); /* * We want this to be reflected on every label, @@ -6402,14 +7051,34 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; + vdev_config_dirty(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + } + + /* + * If the log space map feature is enabled and the pool is + * getting exported (but not destroyed), we want to spend some + * time flushing as many metaslabs as we can in an attempt to + * destroy log space maps and save import time. This has to be + * done before we set the spa_final_txg, otherwise + * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs. 
+ * spa_should_flush_logs_on_unload() should be called after + * spa_state has been set to the new_state. + */ + if (spa_should_flush_logs_on_unload(spa)) + spa_unload_log_sm_flush_all(spa); + + if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_final_txg = spa_last_synced_txg(spa) + TXG_DEFER_SIZE + 1; - vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); } } export_spa: + spa_export_os(spa); + if (new_state == POOL_STATE_DESTROYED) spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); else if (new_state == POOL_STATE_EXPORTED) @@ -6423,9 +7092,16 @@ export_spa: if (oldconfig && spa->spa_config) *oldconfig = fnvlist_dup(spa->spa_config); + if (new_state == POOL_STATE_EXPORTED) + zio_handle_export_delay(spa, gethrtime() - export_start); + + /* + * Take the namespace lock for the actual spa_t removal + */ + mutex_enter(&spa_namespace_lock); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) - spa_write_cachefile(spa, B_TRUE, B_TRUE); + spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); spa_remove(spa); } else { /* @@ -6434,14 +7110,25 @@ export_spa: * we make sure to reset the exporting flag. */ spa->spa_is_exporting = B_FALSE; + spa->spa_export_thread = NULL; } + /* + * Wake up any waiters in spa_lookup() + */ + cv_broadcast(&spa_namespace_cv); mutex_exit(&spa_namespace_lock); return (0); fail: spa->spa_is_exporting = B_FALSE; + spa->spa_export_thread = NULL; + spa_async_resume(spa); + /* + * Wake up any waiters in spa_lookup() + */ + cv_broadcast(&spa_namespace_cv); mutex_exit(&spa_namespace_lock); return (error); } @@ -6501,7 +7188,7 @@ spa_draid_feature_incr(void *arg, dmu_tx_t *tx) * Add a device to a storage pool. */ int -spa_vdev_add(spa_t *spa, nvlist_t *nvroot) +spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) { uint64_t txg, ndraid = 0; int error; @@ -6592,6 +7279,16 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } } + if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) { + for (int c = 0; c < vd->vdev_children; c++) { + tvd = vd->vdev_child[c]; + if (tvd->vdev_ashift != spa->spa_max_ashift) { + return (spa_vdev_exit(spa, vd, txg, + ZFS_ERR_ASHIFT_MISMATCH)); + } + } + } + for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); @@ -6651,9 +7348,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } /* - * Attach a device to a mirror. The arguments are the path to any device - * in the mirror, and the nvroot for the new device. If the path specifies - * a device that is not mirrored, we automatically insert the mirror vdev. + * Attach a device to a vdev specified by its guid. The vdev type can be + * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a + * single device). When the vdev is a single device, a mirror vdev will be + * automatically inserted. 
* * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own @@ -6676,7 +7374,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; - int newvd_isspare; + int newvd_isspare = B_FALSE; int error; ASSERT(spa_writeable(spa)); @@ -6696,28 +7394,49 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - if (dsl_scan_resilvering(spa_get_dsl(spa))) + if (dsl_scan_resilvering(spa_get_dsl(spa)) || + dsl_scan_resilver_scheduled(spa_get_dsl(spa))) { return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_RESILVER_IN_PROGRESS)); + } } else { if (vdev_rebuild_active(rvd)) return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_REBUILD_IN_PROGRESS)); } - if (spa->spa_vdev_removal != NULL) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + if (spa->spa_vdev_removal != NULL) { + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_DEVRM_IN_PROGRESS)); + } if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - if (!oldvd->vdev_ops->vdev_op_leaf) + boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops; + + if (raidz) { + if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION)) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + /* + * Can't expand a raidz while prior expand is in progress. + */ + if (spa->spa_raidz_expand != NULL) { + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); + } + } else if (!oldvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + } - pvd = oldvd->vdev_parent; + if (raidz) + pvd = oldvd; + else + pvd = oldvd->vdev_parent; - if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, - VDEV_ALLOC_ATTACH)) != 0) + if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, + VDEV_ALLOC_ATTACH) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); if (newrootvd->vdev_children != 1) @@ -6732,10 +7451,12 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, return (spa_vdev_exit(spa, newrootvd, txg, error)); /* - * Spares can't replace logs + * log, dedup and special vdevs should not be replaced by spares. */ - if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) + if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE || + oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } /* * A dRAID spare can only replace a child of its parent dRAID vdev. @@ -6764,11 +7485,13 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, if (!replacing) { /* - * For attach, the only allowable parent is a mirror or the root - * vdev. + * For attach, the only allowable parent is a mirror or + * the root vdev. A raidz vdev can be attached to, but + * you cannot attach to a raidz child. */ if (pvd->vdev_ops != &vdev_mirror_ops && - pvd->vdev_ops != &vdev_root_ops) + pvd->vdev_ops != &vdev_root_ops && + !raidz) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); pvops = &vdev_mirror_ops; @@ -6807,7 +7530,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, /* * Make sure the new device is big enough. */ - if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) + vdev_t *min_vdev = raidz ? 
oldvd->vdev_child[0] : oldvd; + if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* @@ -6818,31 +7542,74 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* + * RAIDZ-expansion-specific checks. + */ + if (raidz) { + if (vdev_raidz_attach_check(newvd) != 0) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + /* + * Fail early if a child is not healthy or being replaced + */ + for (int i = 0; i < oldvd->vdev_children; i++) { + if (vdev_is_dead(oldvd->vdev_child[i]) || + !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) { + return (spa_vdev_exit(spa, newrootvd, txg, + ENXIO)); + } + /* Also fail if reserved boot area is in-use */ + if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i]) + != 0) { + return (spa_vdev_exit(spa, newrootvd, txg, + EADDRINUSE)); + } + } + } + + if (raidz) { + /* + * Note: oldvdpath is freed by spa_strfree(), but + * kmem_asprintf() is freed by kmem_strfree(), so we have to + * move it to a spa_strdup-ed string. + */ + char *tmp = kmem_asprintf("raidz%u-%u", + (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); + oldvdpath = spa_strdup(tmp); + kmem_strfree(tmp); + } else { + oldvdpath = spa_strdup(oldvd->vdev_path); + } + newvdpath = spa_strdup(newvd->vdev_path); + + /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. */ - if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { + if (strcmp(oldvdpath, newvdpath) == 0) { spa_strfree(oldvd->vdev_path); - oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, + oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5, KM_SLEEP); - (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, - "%s/%s", newvd->vdev_path, "old"); + (void) sprintf(oldvd->vdev_path, "%s/old", + newvdpath); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } + spa_strfree(oldvdpath); + oldvdpath = spa_strdup(oldvd->vdev_path); } /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ - if (pvd->vdev_ops != pvops) + if (!raidz && pvd->vdev_ops != pvops) { pvd = vdev_add_parent(oldvd, pvops); + ASSERT(pvd->vdev_ops == pvops); + ASSERT(oldvd->vdev_parent == pvd); + } ASSERT(pvd->vdev_top->vdev_parent == rvd); - ASSERT(pvd->vdev_ops == pvops); - ASSERT(oldvd->vdev_parent == pvd); /* * Extract the new device from its root and add it to pvd. @@ -6870,41 +7637,66 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, - TXG_INITIAL, dtl_max_txg - TXG_INITIAL); + if (raidz) { + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); - if (newvd->vdev_isspare) { - spa_spare_activate(newvd); - spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); - } + vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE); + vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE); + vdev_autotrim_stop_wait(tvd); - oldvdpath = spa_strdup(oldvd->vdev_path); - newvdpath = spa_strdup(newvd->vdev_path); - newvd_isspare = newvd->vdev_isspare; + dtl_max_txg = spa_vdev_config_enter(spa); - /* - * Mark newvd's DTL dirty in this txg. 
- */ - vdev_dirty(tvd, VDD_DTL, newvd, txg); + tvd->vdev_rz_expanding = B_TRUE; - /* - * Schedule the resilver or rebuild to restart in the future. We do - * this to ensure that dmu_sync-ed blocks have been stitched into the - * respective datasets. - */ - if (rebuild) { - newvd->vdev_rebuild_txg = txg; + vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg); + vdev_config_dirty(tvd); - vdev_rebuild(tvd); + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, + dtl_max_txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, + newvd, tx); + dmu_tx_commit(tx); } else { - newvd->vdev_resilver_txg = txg; + vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, + dtl_max_txg - TXG_INITIAL); + + if (newvd->vdev_isspare) { + spa_spare_activate(newvd); + spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); + } + + newvd_isspare = newvd->vdev_isspare; - if (dsl_scan_resilvering(spa_get_dsl(spa)) && - spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { - vdev_defer_resilver(newvd); + /* + * Mark newvd's DTL dirty in this txg. + */ + vdev_dirty(tvd, VDD_DTL, newvd, txg); + + /* + * Schedule the resilver or rebuild to restart in the future. + * We do this to ensure that dmu_sync-ed blocks have been + * stitched into the respective datasets. + */ + if (rebuild) { + newvd->vdev_rebuild_txg = txg; + + vdev_rebuild(tvd); } else { - dsl_scan_restart_resilver(spa->spa_dsl_pool, - dtl_max_txg); + newvd->vdev_resilver_txg = txg; + + if (dsl_scan_resilvering(spa_get_dsl(spa)) && + spa_feature_is_enabled(spa, + SPA_FEATURE_RESILVER_DEFER)) { + vdev_defer_resilver(newvd); + } else { + dsl_scan_restart_resilver(spa->spa_dsl_pool, + dtl_max_txg); + } } } @@ -6934,7 +7726,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, * Detach a device from a mirror or replacing vdev. * * If 'replace_done' is specified, only detach if the parent - * is a replacing vdev. + * is a replacing or a spare vdev. */ int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) @@ -7073,7 +7865,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) * it may be that the unwritability of the disk is the reason * it's being detached! */ - error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); + (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Remove vd from its parent and compact the parent's children. 
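One behavioral detail in the attach path above is worth spelling out: when the target of spa_vdev_attach() is a raidz vdev (RAIDZ expansion), the new device is sized against a single existing raidz child rather than against the whole top-level vdev, since each child only ever holds one column of the stripe. The following is a minimal stand-alone sketch of that size check, using simplified stand-in types rather than the in-tree vdev_t (a hypothetical illustration, not the actual OpenZFS code):

#include <stdio.h>
#include <stdint.h>

/* Hypothetical, simplified stand-ins for the real vdev structures. */
typedef struct toy_vdev {
	uint64_t asize;			/* usable size of this vdev */
	int nchildren;
	struct toy_vdev *child[8];
	int is_raidz;
} toy_vdev_t;

/*
 * Pick the vdev whose size the new device must meet: for a raidz
 * target we compare against one child (each child stores a single
 * column of the stripe); otherwise we compare against the device
 * being replaced or mirrored itself.
 */
static const toy_vdev_t *
min_size_reference(const toy_vdev_t *oldvd)
{
	return (oldvd->is_raidz ? oldvd->child[0] : oldvd);
}

int
main(void)
{
	toy_vdev_t c0 = { .asize = 1000 }, c1 = { .asize = 1000 };
	toy_vdev_t raidz = { .asize = 2000, .nchildren = 2,
	    .child = { &c0, &c1 }, .is_raidz = 1 };
	uint64_t newvd_asize = 1200;

	/* 1200 >= 1000: a larger single disk can join a raidz of 1000-sized children. */
	printf("attach %s\n",
	    newvd_asize >= min_size_reference(&raidz)->asize ?
	    "allowed" : "rejected (EOVERFLOW)");
	return (0);
}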
@@ -7229,7 +8021,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, */ if (cmd_type == POOL_INITIALIZE_START && (vd->vdev_initialize_thread != NULL || - vd->vdev_top->vdev_removing)) { + vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_INITIALIZE_CANCEL && @@ -7241,6 +8033,10 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(ESRCH)); + } else if (cmd_type == POOL_INITIALIZE_UNINIT && + vd->vdev_initialize_thread != NULL) { + mutex_exit(&vd->vdev_initialize_lock); + return (SET_ERROR(EBUSY)); } switch (cmd_type) { @@ -7253,6 +8049,9 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, case POOL_INITIALIZE_SUSPEND: vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); break; + case POOL_INITIALIZE_UNINIT: + vdev_uninitialize(vd); + break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } @@ -7344,7 +8143,8 @@ spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, * which has completed but the thread is not exited. */ if (cmd_type == POOL_TRIM_START && - (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) { + (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_TRIM_CANCEL && @@ -7432,7 +8232,7 @@ spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, * Split a set of devices from their mirrors, and create a new pool from them. */ int -spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, +spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp) { int error = 0; @@ -7441,7 +8241,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, uint_t c, children, lastlog; nvlist_t **child, *nvl, *tmp; dmu_tx_t *tx; - char *altroot = NULL; + const char *altroot = NULL; vdev_t *rvd, **vml = NULL; /* vdev modify list */ boolean_t activate_slog; @@ -7986,6 +8786,7 @@ spa_scan_stop(spa_t *spa) ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); + return (dsl_scan_cancel(spa->spa_dsl_pool)); } @@ -8011,6 +8812,10 @@ spa_scan(spa_t *spa, pool_scan_func_t func) return (0); } + if (func == POOL_SCAN_ERRORSCRUB && + !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) + return (SET_ERROR(ENOTSUP)); + return (dsl_scan(spa->spa_dsl_pool, func)); } @@ -8049,15 +8854,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd) } static void -spa_async_probe(spa_t *spa, vdev_t *vd) +spa_async_fault_vdev(spa_t *spa, vdev_t *vd) { - if (vd->vdev_probe_wanted) { - vd->vdev_probe_wanted = B_FALSE; - vdev_reopen(vd); /* vdev_open() does the actual probe */ + if (vd->vdev_fault_wanted) { + vd->vdev_fault_wanted = B_FALSE; + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); } for (int c = 0; c < vd->vdev_children; c++) - spa_async_probe(spa, vd->vdev_child[c]); + spa_async_fault_vdev(spa, vd->vdev_child[c]); } static void @@ -8077,7 +8883,7 @@ spa_async_autoexpand(spa_t *spa, vdev_t *vd) spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); } -static void +static __attribute__((noreturn)) void spa_async_thread(void *arg) { spa_t *spa = (spa_t *)arg; @@ 
-8145,11 +8951,11 @@ spa_async_thread(void *arg) } /* - * See if any devices need to be probed. + * See if any devices need to be marked faulted. */ - if (tasks & SPA_ASYNC_PROBE) { + if (tasks & SPA_ASYNC_FAULT_VDEV) { spa_vdev_state_enter(spa, SCL_NONE); - spa_async_probe(spa, spa->spa_root_vdev); + spa_async_fault_vdev(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } @@ -8157,7 +8963,8 @@ spa_async_thread(void *arg) * If any devices are done replacing, detach them. */ if (tasks & SPA_ASYNC_RESILVER_DONE || - tasks & SPA_ASYNC_REBUILD_DONE) { + tasks & SPA_ASYNC_REBUILD_DONE || + tasks & SPA_ASYNC_DETACH_SPARE) { spa_vdev_resilver_done(spa); } @@ -8241,6 +9048,10 @@ spa_async_suspend(spa_t *spa) if (condense_thread != NULL) zthr_cancel(condense_thread); + zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; + if (raidz_expand_thread != NULL) + zthr_cancel(raidz_expand_thread); + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); @@ -8267,6 +9078,10 @@ spa_async_resume(spa_t *spa) if (condense_thread != NULL) zthr_resume(condense_thread); + zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; + if (raidz_expand_thread != NULL) + zthr_resume(raidz_expand_thread); + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); @@ -8433,7 +9248,7 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0); - bzero(packed + nvsize, bufsize - nvsize); + memset(packed + nvsize, 0, bufsize - nvsize); dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); @@ -8472,13 +9287,15 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, nvroot = fnvlist_alloc(); if (sav->sav_count == 0) { - fnvlist_add_nvlist_array(nvroot, config, NULL, 0); + fnvlist_add_nvlist_array(nvroot, config, + (const nvlist_t * const *)NULL, 0); } else { list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_FALSE, VDEV_CONFIG_L2CACHE); - fnvlist_add_nvlist_array(nvroot, config, list, sav->sav_count); + fnvlist_add_nvlist_array(nvroot, config, + (const nvlist_t * const *)list, sav->sav_count); for (i = 0; i < sav->sav_count; i++) nvlist_free(list[i]); kmem_free(list, sav->sav_count * sizeof (void *)); @@ -8499,6 +9316,11 @@ spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; + if (vd->vdev_root_zap != 0 && + spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) { + VERIFY0(zap_add_int(spa->spa_meta_objset, avz, + vd->vdev_root_zap, tx)); + } if (vd->vdev_top_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_top_zap, tx)); @@ -8659,27 +9481,14 @@ spa_sync_props(void *arg, dmu_tx_t *tx) while ((elem = nvlist_next_nvpair(nvp, elem))) { uint64_t intval; - char *strval, *fname; + const char *strval, *fname; zpool_prop_t prop; const char *propname; + const char *elemname = nvpair_name(elem); zprop_type_t proptype; spa_feature_t fid; - switch (prop = zpool_name_to_prop(nvpair_name(elem))) { - case ZPOOL_PROP_INVAL: - /* - * We checked this earlier in spa_prop_validate(). 
- */ - ASSERT(zpool_prop_feature(nvpair_name(elem))); - - fname = strchr(nvpair_name(elem), '@') + 1; - VERIFY0(zfeature_lookup_name(fname, &fid)); - - spa_feature_enable(spa, fid, tx); - spa_history_log_internal(spa, "set", tx, - "%s=enabled", nvpair_name(elem)); - break; - + switch (prop = zpool_name_to_prop(elemname)) { case ZPOOL_PROP_VERSION: intval = fnvpair_value_uint64(elem); /* @@ -8722,7 +9531,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } spa_history_log_internal(spa, "set", tx, - "%s=%s", nvpair_name(elem), strval); + "%s=%s", elemname, strval); break; case ZPOOL_PROP_COMPATIBILITY: strval = fnvpair_value_string(elem); @@ -8741,6 +9550,20 @@ spa_sync_props(void *arg, dmu_tx_t *tx) "%s=%s", nvpair_name(elem), strval); break; + case ZPOOL_PROP_INVAL: + if (zpool_prop_feature(elemname)) { + fname = strchr(elemname, '@') + 1; + VERIFY0(zfeature_lookup_name(fname, &fid)); + + spa_feature_enable(spa, fid, tx); + spa_history_log_internal(spa, "set", tx, + "%s=enabled", elemname); + break; + } else if (!zfs_prop_user(elemname)) { + ASSERT(zpool_prop_feature(elemname)); + break; + } + zfs_fallthrough; default: /* * Set pool property values in the poolprops mos object. @@ -8753,8 +9576,13 @@ spa_sync_props(void *arg, dmu_tx_t *tx) } /* normalize the property name */ - propname = zpool_prop_to_name(prop); - proptype = zpool_prop_get_type(prop); + if (prop == ZPOOL_PROP_INVAL) { + propname = elemname; + proptype = PROP_TYPE_STRING; + } else { + propname = zpool_prop_to_name(prop); + proptype = zpool_prop_get_type(prop); + } if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); @@ -8763,7 +9591,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx) spa->spa_pool_props_object, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "set", tx, - "%s=%s", nvpair_name(elem), strval); + "%s=%s", elemname, strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); @@ -8776,38 +9604,38 @@ spa_sync_props(void *arg, dmu_tx_t *tx) spa->spa_pool_props_object, propname, 8, 1, &intval, tx)); spa_history_log_internal(spa, "set", tx, - "%s=%lld", nvpair_name(elem), + "%s=%lld", elemname, (longlong_t)intval); - } else { - ASSERT(0); /* not allowed */ - } - switch (prop) { - case ZPOOL_PROP_DELEGATION: - spa->spa_delegation = intval; - break; - case ZPOOL_PROP_BOOTFS: - spa->spa_bootfs = intval; - break; - case ZPOOL_PROP_FAILUREMODE: - spa->spa_failmode = intval; - break; - case ZPOOL_PROP_AUTOTRIM: - spa->spa_autotrim = intval; - spa_async_request(spa, - SPA_ASYNC_AUTOTRIM_RESTART); - break; - case ZPOOL_PROP_AUTOEXPAND: - spa->spa_autoexpand = intval; - if (tx->tx_txg != TXG_INITIAL) + switch (prop) { + case ZPOOL_PROP_DELEGATION: + spa->spa_delegation = intval; + break; + case ZPOOL_PROP_BOOTFS: + spa->spa_bootfs = intval; + break; + case ZPOOL_PROP_FAILUREMODE: + spa->spa_failmode = intval; + break; + case ZPOOL_PROP_AUTOTRIM: + spa->spa_autotrim = intval; spa_async_request(spa, - SPA_ASYNC_AUTOEXPAND); - break; - case ZPOOL_PROP_MULTIHOST: - spa->spa_multihost = intval; - break; - default: - break; + SPA_ASYNC_AUTOTRIM_RESTART); + break; + case ZPOOL_PROP_AUTOEXPAND: + spa->spa_autoexpand = intval; + if (tx->tx_txg != TXG_INITIAL) + spa_async_request(spa, + SPA_ASYNC_AUTOEXPAND); + break; + case ZPOOL_PROP_MULTIHOST: + spa->spa_multihost = intval; + break; + default: + break; + } + } else { + ASSERT(0); /* not allowed */ } } @@ -9045,8 +9873,10 @@ 
spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) &spa->spa_deferred_bpobj, tx); } + brt_sync(spa, txg); ddt_sync(spa, txg); dsl_scan_sync(dp, tx); + dsl_errorscrub_sync(dp, tx); svr_sync(spa, tx); spa_sync_upgrades(spa, tx); @@ -9057,6 +9887,27 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) != NULL) vdev_sync(vd, txg); + if (pass == 1) { + /* + * dsl_pool_sync() -> dp_sync_tasks may have dirtied + * the config. If that happens, this txg should not + * be a no-op. So we must sync the config to the MOS + * before checking for no-op. + * + * Note that when the config is dirty, it will + * be written to the MOS (i.e. the MOS will be + * dirtied) every time we call spa_sync_config_object() + * in this txg. Therefore we can't call this after + * dsl_pool_sync() every pass, because it would + * prevent us from converging, since we'd dirty + * the MOS every pass. + * + * Sync tasks can only be processed in pass 1, so + * there's no need to do this in later passes. + */ + spa_sync_config_object(spa, tx); + } + /* * Note: We need to check if the MOS is dirty because we could * have marked the MOS dirty without updating the uberblock @@ -9067,7 +9918,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) * don't want to rely on that here). */ if (pass == 1 && - spa->spa_uberblock.ub_rootbp.blk_birth < txg && + BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg && !dmu_objset_is_dirty(mos, txg)) { /* * Nothing changed on the first pass, therefore this @@ -9170,6 +10021,13 @@ spa_sync(spa_t *spa, uint64_t txg) ZIO_FLAG_CANFAIL); /* + * Now that there can be no more cloning in this transaction group, + * but we are still before issuing frees, we can process pending BRT + * updates. + */ + brt_pending_apply(spa, txg); + + /* * Lock out configuration changes. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -9188,7 +10046,13 @@ spa_sync(spa_t *spa, uint64_t txg) * into config changes that go out with this transaction group. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - while (list_head(&spa->spa_state_dirty_list) != NULL) { + while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { + /* Avoid holding the write lock unless actually necessary */ + if (vd->vdev_aux == NULL) { + vdev_state_clean(vd); + vdev_config_dirty(vd); + continue; + } /* * We need the write lock here because, for aux vdevs, * calling vdev_config_dirty() modifies sav_config. @@ -9304,11 +10168,17 @@ spa_sync(spa_t *spa, uint64_t txg) metaslab_class_evict_old(spa->spa_normal_class, txg); metaslab_class_evict_old(spa->spa_log_class, txg); + /* spa_embedded_log_class has only one metaslab per vdev. */ + metaslab_class_evict_old(spa->spa_special_class, txg); + metaslab_class_evict_old(spa->spa_dedup_class, txg); spa_sync_close_syncing_log_sm(spa); spa_update_dspace(spa); + if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) + vdev_autotrim_kick(spa); + /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). @@ -9362,6 +10232,132 @@ spa_sync_allpools(void) mutex_exit(&spa_namespace_lock); } +taskq_t * +spa_sync_tq_create(spa_t *spa, const char *name) +{ + kthread_t **kthreads; + + ASSERT(spa->spa_sync_tq == NULL); + ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus); + + /* + * - do not allow more allocators than cpus. + * - there may be more cpus than allocators. + * - do not allow more sync taskq threads than allocators or cpus. 
+ */ + int nthreads = spa->spa_alloc_count; + spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) * + nthreads, KM_SLEEP); + + spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri, + nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads); + VERIFY(spa->spa_sync_tq != NULL); + VERIFY(kthreads != NULL); + + spa_syncthread_info_t *ti = spa->spa_syncthreads; + for (int i = 0; i < nthreads; i++, ti++) { + ti->sti_thread = kthreads[i]; + ti->sti_allocator = i; + } + + kmem_free(kthreads, sizeof (*kthreads) * nthreads); + return (spa->spa_sync_tq); +} + +void +spa_sync_tq_destroy(spa_t *spa) +{ + ASSERT(spa->spa_sync_tq != NULL); + + taskq_wait(spa->spa_sync_tq); + taskq_destroy(spa->spa_sync_tq); + kmem_free(spa->spa_syncthreads, + sizeof (spa_syncthread_info_t) * spa->spa_alloc_count); + spa->spa_sync_tq = NULL; +} + +uint_t +spa_acq_allocator(spa_t *spa) +{ + int i; + + if (spa->spa_alloc_count == 1) + return (0); + + mutex_enter(&spa->spa_allocs_use->sau_lock); + uint_t r = spa->spa_allocs_use->sau_rotor; + do { + if (++r == spa->spa_alloc_count) + r = 0; + } while (spa->spa_allocs_use->sau_inuse[r]); + spa->spa_allocs_use->sau_inuse[r] = B_TRUE; + spa->spa_allocs_use->sau_rotor = r; + mutex_exit(&spa->spa_allocs_use->sau_lock); + + spa_syncthread_info_t *ti = spa->spa_syncthreads; + for (i = 0; i < spa->spa_alloc_count; i++, ti++) { + if (ti->sti_thread == curthread) { + ti->sti_allocator = r; + break; + } + } + ASSERT3S(i, <, spa->spa_alloc_count); + return (r); +} + +void +spa_rel_allocator(spa_t *spa, uint_t allocator) +{ + if (spa->spa_alloc_count > 1) + spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE; +} + +void +spa_select_allocator(zio_t *zio) +{ + zbookmark_phys_t *bm = &zio->io_bookmark; + spa_t *spa = zio->io_spa; + + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + + /* + * A gang block (for example) may have inherited its parent's + * allocator, in which case there is nothing further to do here. + */ + if (ZIO_HAS_ALLOCATOR(zio)) + return; + + ASSERT(spa != NULL); + ASSERT(bm != NULL); + + /* + * First try to use an allocator assigned to the syncthread, and set + * the corresponding write issue taskq for the allocator. + * Note, we must have an open pool to do this. + */ + if (spa->spa_sync_tq != NULL) { + spa_syncthread_info_t *ti = spa->spa_syncthreads; + for (int i = 0; i < spa->spa_alloc_count; i++, ti++) { + if (ti->sti_thread == curthread) { + zio->io_allocator = ti->sti_allocator; + return; + } + } + } + + /* + * We want to try to use as many allocators as possible to help improve + * performance, but we also want logically adjacent IOs to be physically + * adjacent to improve sequential read performance. We chunk each object + * into 2^20 block regions, and then hash based on the objset, object, + * level, and region to accomplish both of these goals. 
+ */ + uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, + bm->zb_blkid >> 20); + + zio->io_allocator = (uint_t)hv % spa->spa_alloc_count; +} + /* * ========================================================================== * Miscellaneous routines @@ -9454,6 +10450,7 @@ spa_upgrade(spa_t *spa, uint64_t version) static boolean_t spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav) { + (void) spa; int i; uint64_t vdev_guid; @@ -9699,9 +10696,10 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, DSS_SCANNING); break; case ZPOOL_WAIT_RESILVER: - if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) + *in_progress = vdev_rebuild_active(spa->spa_root_vdev); + if (*in_progress) break; - fallthrough; + zfs_fallthrough; case ZPOOL_WAIT_SCRUB: { boolean_t scanning, paused, is_scrub; @@ -9714,6 +10712,12 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, is_scrub == (activity == ZPOOL_WAIT_SCRUB)); break; } + case ZPOOL_WAIT_RAIDZ_EXPAND: + { + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + *in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING); + break; + } default: panic("unrecognized value for activity %d", activity); } @@ -9817,6 +10821,8 @@ spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP); ev->resource = resource; } +#else + (void) spa, (void) vd, (void) hist_nvl, (void) name; #endif return (ev); } @@ -9829,6 +10835,8 @@ spa_event_post(sysevent_t *ev) zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb); kmem_free(ev, sizeof (*ev)); } +#else + (void) ev; #endif } @@ -9900,10 +10908,14 @@ EXPORT_SYMBOL(spa_prop_clear_bootfs); /* asynchronous event notification */ EXPORT_SYMBOL(spa_event_notify); +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW, + "Percentage of CPUs to run a metaslab preload taskq"); + /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW, "log2 fraction of arc that can be used by inflight I/Os when " "verifying pool during import"); +/* END CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, "Set to traverse metadata on pool import"); @@ -9914,29 +10926,47 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, "Print vdev tree to zfs_dbgmsg during pool import"); -ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW, "Percentage of CPUs to run an IO worker thread"); -ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD, +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW, "Number of threads per IO worker taskqueue"); -ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW, +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, "Allow importing pool with up to this number of missing top-level " "vdevs (in read-only mode)"); +/* END CSTYLED */ -ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW, - "Set the livelist condense zthr to pause"); +ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, + ZMOD_RW, "Set the livelist condense zthr to pause"); -ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, ZMOD_RW, - "Set the livelist condense synctask to pause"); 
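The spa_select_allocator() change above spreads write issue across allocators while keeping logically adjacent I/Os together by hashing the tuple (objset, object, level, blkid >> 20), so all blocks in the same 2^20-block region of an object map to the same allocator. Below is a rough stand-alone illustration of that chunk-then-hash idea; it uses a generic 64-bit mixer in place of cityhash4() (which is internal to ZFS), so the exact mapping is assumed, not the in-tree one:

#include <stdint.h>
#include <stdio.h>

/* Generic 64-bit mixer standing in for cityhash4(); not the ZFS hash. */
static uint64_t
mix64(uint64_t x)
{
	x ^= x >> 33; x *= 0xff51afd7ed558ccdULL;
	x ^= x >> 33; x *= 0xc4ceb9fe1a85ec53ULL;
	x ^= x >> 33;
	return (x);
}

static unsigned
pick_allocator(uint64_t objset, uint64_t object, uint64_t level,
    uint64_t blkid, unsigned alloc_count)
{
	/*
	 * Blocks within the same 2^20-block region of an object hash to
	 * the same allocator, so sequential writes stay physically close;
	 * different regions and objects spread across all allocators.
	 */
	uint64_t region = blkid >> 20;
	uint64_t hv = mix64(objset ^ mix64(object) ^ mix64(level) ^
	    mix64(region));
	return ((unsigned)(hv % alloc_count));
}

int
main(void)
{
	/* Two adjacent blocks of the same object land on the same allocator... */
	printf("%u %u\n", pick_allocator(1, 42, 0, 5, 4),
	    pick_allocator(1, 42, 0, 6, 4));
	/* ...while a block in a distant region may land elsewhere. */
	printf("%u\n", pick_allocator(1, 42, 0, 5ULL << 21, 4));
	return (0);
}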
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, + ZMOD_RW, "Set the livelist condense synctask to pause"); -ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, INT, ZMOD_RW, +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, + INT, ZMOD_RW, "Whether livelist condensing was canceled in the synctask"); -ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, + INT, ZMOD_RW, "Whether livelist condensing was canceled in the zthr function"); -ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, + ZMOD_RW, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); + +#ifdef _KERNEL +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, + spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW, + "Configure IO queues for read IO"); +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, + spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW, + "Configure IO queues for write IO"); +#endif /* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW, + "Number of CPUs per write issue taskq"); diff --git a/sys/contrib/openzfs/module/zfs/spa_checkpoint.c b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c index 09f62996853d..1efff47f87a0 100644 --- a/sys/contrib/openzfs/module/zfs/spa_checkpoint.c +++ b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -158,7 +158,7 @@ * amount of checkpointed data that has been freed within them while * the pool had a checkpoint. 
*/ -unsigned long zfs_spa_discard_memory_limit = 16 * 1024 * 1024; +static uint64_t zfs_spa_discard_memory_limit = 16 * 1024 * 1024; int spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs) @@ -166,7 +166,7 @@ spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs) if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); - bzero(pcs, sizeof (pool_checkpoint_stat_t)); + memset(pcs, 0, sizeof (pool_checkpoint_stat_t)); int error = zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT); @@ -347,7 +347,7 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) if (error != 0) { zfs_panic_recover("zfs: error %lld was returned " "while incrementally destroying the checkpoint " - "space map of vdev %u\n", + "space map of vdev %llu\n", (longlong_t)error, vd->vdev_id); } ASSERT0(words_after); @@ -380,10 +380,10 @@ spa_checkpoint_discard_is_done(spa_t *spa) return (B_TRUE); } -/* ARGSUSED */ boolean_t spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr) { + (void) zthr; spa_t *spa = arg; if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) @@ -450,10 +450,10 @@ spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) } -/* ARGSUSED */ static int spa_checkpoint_check(void *arg, dmu_tx_t *tx) { + (void) arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT)) @@ -465,6 +465,9 @@ spa_checkpoint_check(void *arg, dmu_tx_t *tx) if (spa->spa_removing_phys.sr_state == DSS_SCANNING) return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS)); + if (spa->spa_raidz_expand != NULL) + return (SET_ERROR(ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); + if (spa->spa_checkpoint_txg != 0) return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS)); @@ -474,10 +477,10 @@ spa_checkpoint_check(void *arg, dmu_tx_t *tx) return (0); } -/* ARGSUSED */ static void spa_checkpoint_sync(void *arg, dmu_tx_t *tx) { + (void) arg; dsl_pool_t *dp = dmu_tx_pool(tx); spa_t *spa = dp->dp_spa; uberblock_t checkpoint = spa->spa_ubsync; @@ -571,10 +574,10 @@ spa_checkpoint(const char *pool) return (error); } -/* ARGSUSED */ static int spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx) { + (void) arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) @@ -589,10 +592,10 @@ spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx) return (0); } -/* ARGSUSED */ static void spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx) { + (void) arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, @@ -631,7 +634,7 @@ EXPORT_SYMBOL(spa_checkpoint_discard_thread); EXPORT_SYMBOL(spa_checkpoint_discard_thread_check); /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, U64, ZMOD_RW, "Limit for memory used in prefetching the checkpoint space map done " "on each vdev while discarding the checkpoint"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c index ad82932ce567..a77874ea0dd3 100644 --- a/sys/contrib/openzfs/module/zfs/spa_config.c +++ b/sys/contrib/openzfs/module/zfs/spa_config.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. 
* See the License for the specific language governing permissions * and limitations under the License. * @@ -67,8 +67,10 @@ static uint64_t spa_config_generation = 1; * This can be overridden in userland to preserve an alternate namespace for * userland pools when doing testing. */ -char *spa_config_path = ZPOOL_CACHE; -int zfs_autoimport_disable = 1; +char *spa_config_path = (char *)ZPOOL_CACHE; +#ifdef _KERNEL +static int zfs_autoimport_disable = B_TRUE; +#endif /* * Called when the module is first loaded, this routine loads the configuration @@ -238,11 +240,12 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) * would be required. */ void -spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) +spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent, + boolean_t postblkidevent) { spa_config_dirent_t *dp, *tdp; nvlist_t *nvl; - char *pool_name; + const char *pool_name; boolean_t ccw_failure; int error = 0; @@ -344,6 +347,18 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) if (postsysevent) spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC); + + /* + * Post udev event to sync blkid information if the pool is created + * or a new vdev is added to the pool. + */ + if ((target->spa_root_vdev) && postblkidevent) { + vdev_post_kobj_evt(target->spa_root_vdev); + for (int i = 0; i < target->spa_l2cache.sav_count; i++) + vdev_post_kobj_evt(target->spa_l2cache.sav_vdevs[i]); + for (int i = 0; i < target->spa_spares.sav_count; i++) + vdev_post_kobj_evt(target->spa_spares.sav_vdevs[i]); + } } /* @@ -352,23 +367,24 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration * information for all pool visible within the zone. 
*/ -nvlist_t * -spa_all_configs(uint64_t *generation) +int +spa_all_configs(uint64_t *generation, nvlist_t **pools) { - nvlist_t *pools; spa_t *spa = NULL; if (*generation == spa_config_generation) - return (NULL); + return (SET_ERROR(EEXIST)); - pools = fnvlist_alloc(); + int error = mutex_enter_interruptible(&spa_namespace_lock); + if (error) + return (SET_ERROR(EINTR)); - mutex_enter(&spa_namespace_lock); + *pools = fnvlist_alloc(); while ((spa = spa_next(spa)) != NULL) { if (INGLOBALZONE(curproc) || zone_dataset_visible(spa_name(spa), NULL)) { mutex_enter(&spa->spa_props_lock); - fnvlist_add_nvlist(pools, spa_name(spa), + fnvlist_add_nvlist(*pools, spa_name(spa), spa->spa_config); mutex_exit(&spa->spa_props_lock); } @@ -376,7 +392,7 @@ spa_all_configs(uint64_t *generation) *generation = spa_config_generation; mutex_exit(&spa_namespace_lock); - return (pools); + return (0); } void @@ -403,7 +419,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) unsigned long hostid = 0; boolean_t locked = B_FALSE; uint64_t split_guid; - char *pool_name; + const char *pool_name; if (vd == NULL) { vd = rvd; @@ -598,6 +614,7 @@ spa_config_update(spa_t *spa, int what) */ if (!spa->spa_is_root) { spa_write_cachefile(spa, B_FALSE, + what != SPA_CONFIG_UPDATE_POOL, what != SPA_CONFIG_UPDATE_POOL); } @@ -611,7 +628,6 @@ EXPORT_SYMBOL(spa_config_set); EXPORT_SYMBOL(spa_config_generate); EXPORT_SYMBOL(spa_config_update); -/* BEGIN CSTYLED */ #ifdef __linux__ /* string sysctls require a char array on FreeBSD */ ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD, @@ -620,4 +636,3 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD, ZFS_MODULE_PARAM(zfs, zfs_, autoimport_disable, INT, ZMOD_RW, "Disable pool import at module load"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/spa_errlog.c b/sys/contrib/openzfs/module/zfs/spa_errlog.c index fa5120eb61b3..62d7b4fa2df2 100644 --- a/sys/contrib/openzfs/module/zfs/spa_errlog.c +++ b/sys/contrib/openzfs/module/zfs/spa_errlog.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,7 +20,9 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2014, Delphix. All rights reserved. + * Copyright (c) 2019 Datto Inc. + * Copyright (c) 2021, 2022, George Amanakis. All rights reserved. */ /* @@ -43,6 +45,16 @@ * calculation when the data is requested, storing the result so future queries * will be faster. * + * If the head_errlog feature is enabled, a different on-disk format is used. + * The error log of each head dataset is stored separately in the zap object + * and keyed by the head id. This enables listing every dataset affected in + * userland. In order to be able to track whether an error block has been + * modified or added to snapshots since it was marked as an error, a new tuple + * is introduced: zbookmark_err_phys_t. It allows the storage of the birth + * transaction group of an error block on-disk. The birth transaction group is + * used by check_filesystem() to assess whether this block was freed, + * re-written or added to a snapshot since its marking as an error. 
+ * * This log is then shipped into an nvlist where the key is the dataset name and * the value is the object name. Userland is then responsible for uniquifying * this list and displaying it to the user. @@ -53,7 +65,25 @@ #include <sys/spa_impl.h> #include <sys/zap.h> #include <sys/zio.h> +#include <sys/dsl_dir.h> +#include <sys/dmu_objset.h> +#include <sys/dbuf.h> +#include <sys/zfs_znode.h> + +#define NAME_MAX_LEN 64 + +typedef struct clones { + uint64_t clone_ds; + list_node_t node; +} clones_t; +/* + * spa_upgrade_errlog_limit : A zfs module parameter that controls the number + * of on-disk error log entries that will be converted to the new + * format when enabling head_errlog. Defaults to 0 which converts + * all log entries. + */ +static uint_t spa_upgrade_errlog_limit = 0; /* * Convert a bookmark to a string. @@ -67,9 +97,35 @@ bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len) } /* - * Convert a string to a bookmark + * Convert an err_phys to a string. + */ +static void +errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len) +{ + (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", + (u_longlong_t)zep->zb_object, (u_longlong_t)zep->zb_level, + (u_longlong_t)zep->zb_blkid, (u_longlong_t)zep->zb_birth); +} + +/* + * Convert a string to a err_phys. + */ +void +name_to_errphys(char *buf, zbookmark_err_phys_t *zep) +{ + zep->zb_object = zfs_strtonum(buf, &buf); + ASSERT(*buf == ':'); + zep->zb_level = (int)zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zep->zb_blkid = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zep->zb_birth = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == '\0'); +} + +/* + * Convert a string to a bookmark. */ -#ifdef _KERNEL static void name_to_bookmark(char *buf, zbookmark_phys_t *zb) { @@ -82,7 +138,41 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb) zb->zb_blkid = zfs_strtonum(buf + 1, &buf); ASSERT(*buf == '\0'); } -#endif + +void +zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) +{ + zb->zb_objset = dataset; + zb->zb_object = zep->zb_object; + zb->zb_level = zep->zb_level; + zb->zb_blkid = zep->zb_blkid; +} + +static void +name_to_object(char *buf, uint64_t *obj) +{ + *obj = zfs_strtonum(buf, &buf); + ASSERT(*buf == '\0'); +} + +/* + * Retrieve the head filesystem. + */ +static int get_head_ds(spa_t *spa, uint64_t dsobj, uint64_t *head_ds) +{ + dsl_dataset_t *ds; + int error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, + dsobj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + + if (error != 0) + return (error); + + ASSERT(head_ds); + *head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + + return (error); +} /* * Log an uncorrectable error to the persistent error log. We add it to the @@ -90,7 +180,7 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb) * during spa_errlog_sync(). */ void -spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) +spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t birth) { spa_error_entry_t search; spa_error_entry_t *new; @@ -123,96 +213,856 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); new->se_bookmark = *zb; - avl_insert(tree, new, where); + /* + * If the head_errlog feature is enabled, store the birth txg now. In + * case the file is deleted before spa_errlog_sync() runs, we will not + * be able to retrieve the birth txg. 
+ */ + if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + new->se_zep.zb_object = zb->zb_object; + new->se_zep.zb_level = zb->zb_level; + new->se_zep.zb_blkid = zb->zb_blkid; + new->se_zep.zb_birth = birth; + } + + avl_insert(tree, new, where); mutex_exit(&spa->spa_errlist_lock); } +int +find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, + uint64_t *birth_txg) +{ + objset_t *os; + int error = dmu_objset_from_ds(ds, &os); + if (error != 0) + return (error); + + dnode_t *dn; + blkptr_t bp; + + error = dnode_hold(os, zep->zb_object, FTAG, &dn); + if (error != 0) + return (error); + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL, + NULL); + if (error == 0 && BP_IS_HOLE(&bp)) + error = SET_ERROR(ENOENT); + + *birth_txg = BP_GET_LOGICAL_BIRTH(&bp); + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + return (error); +} + +/* + * This function finds the oldest affected filesystem containing an error + * block. + */ +int +find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, + uint64_t *top_affected_fs) +{ + uint64_t oldest_dsobj; + int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth, + &oldest_dsobj); + if (error != 0) + return (error); + + dsl_dataset_t *ds; + error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, oldest_dsobj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + if (error != 0) + return (error); + + *top_affected_fs = + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + return (0); +} + + +#ifdef _KERNEL +/* + * Copy the bookmark to the end of the user-space buffer which starts at + * uaddr and has *count unused entries, and decrement *count by 1. + */ +static int +copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count) +{ + if (*count == 0) + return (SET_ERROR(ENOMEM)); + + *count -= 1; + if (copyout(zb, (char *)uaddr + (*count) * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) + return (SET_ERROR(EFAULT)); + return (0); +} + /* - * Return the number of errors currently in the error log. This is actually the - * sum of both the last log and the current log, since we don't know the union - * of these logs until we reach userland. + * Each time the error block is referenced by a snapshot or clone, add a + * zbookmark_phys_t entry to the userspace array at uaddr. The array is + * filled from the back and the in-out parameter *count is modified to be the + * number of unused entries at the beginning of the array. The function + * scrub_filesystem() is modelled after this one. */ +static int +check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, + void *uaddr, uint64_t *count, list_t *clones_list) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + + int error = dsl_dataset_hold_obj_flags(dp, head_ds, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + if (error != 0) + return (error); + + uint64_t latest_txg; + uint64_t txg_to_consider = spa->spa_syncing_txg; + boolean_t check_snapshot = B_TRUE; + error = find_birth_txg(ds, zep, &latest_txg); + + /* + * If find_birth_txg() errors out otherwise, let txg_to_consider be + * equal to the spa's syncing txg: if check_filesystem() errors out + * then affected snapshots or clones will not be checked. + */ + if (error == 0 && zep->zb_birth == latest_txg) { + /* Block neither free nor rewritten. 
*/ + zbookmark_phys_t zb; + zep_to_zb(head_ds, zep, &zb); + error = copyout_entry(&zb, uaddr, count); + if (error != 0) { + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + return (error); + } + check_snapshot = B_FALSE; + } else if (error == 0) { + txg_to_consider = latest_txg; + } + + /* + * Retrieve the number of snapshots if the dataset is not a snapshot. + */ + uint64_t snap_count = 0; + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { + + error = zap_count(spa->spa_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); + + if (error != 0) { + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + return (error); + } + } + + if (snap_count == 0) { + /* Filesystem without snapshots. */ + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + return (0); + } + + uint64_t *snap_obj_array = kmem_zalloc(snap_count * sizeof (uint64_t), + KM_SLEEP); + + int aff_snap_count = 0; + uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + uint64_t zap_clone = dsl_dir_phys(ds->ds_dir)->dd_clones; + + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + + /* Check only snapshots created from this file system. */ + while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && + snap_obj_txg <= txg_to_consider) { + + error = dsl_dataset_hold_obj_flags(dp, snap_obj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + if (error != 0) + goto out; + + if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds) { + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + continue; + } + + boolean_t affected = B_TRUE; + if (check_snapshot) { + uint64_t blk_txg; + error = find_birth_txg(ds, zep, &blk_txg); + affected = (error == 0 && zep->zb_birth == blk_txg); + } + + /* Report errors in snapshots. */ + if (affected) { + snap_obj_array[aff_snap_count] = snap_obj; + aff_snap_count++; + + zbookmark_phys_t zb; + zep_to_zb(snap_obj, zep, &zb); + error = copyout_entry(&zb, uaddr, count); + if (error != 0) { + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, + FTAG); + goto out; + } + } + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + } + + if (zap_clone == 0 || aff_snap_count == 0) { + error = 0; + goto out; + } + + /* Check clones. */ + zap_cursor_t *zc; + zap_attribute_t *za; + + zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + + for (zap_cursor_init(zc, spa->spa_meta_objset, zap_clone); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + + dsl_dataset_t *clone; + error = dsl_dataset_hold_obj_flags(dp, za->za_first_integer, + DS_HOLD_FLAG_DECRYPT, FTAG, &clone); + + if (error != 0) + break; + + /* + * Only clones whose origins were affected could also + * have affected snapshots. 
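The snapshot loop in check_filesystem() walks the ds_prev_snap_obj chain newest-to-oldest and stops as soon as a snapshot's creation txg falls outside the window (block birth txg, txg_to_consider]; only snapshots created after the block was born can still reference it. A simplified model of that window test over a prev-pointer chain (the snap struct here is invented for illustration):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Toy model: a snapshot knows its creation txg and its predecessor. */
struct snap {
	uint64_t txg;
	const struct snap *prev;
};

/* Count snapshots that can still reference a block born at birth_txg. */
static unsigned
affected_snapshots(const struct snap *newest, uint64_t birth_txg,
    uint64_t txg_to_consider)
{
	unsigned n = 0;

	for (const struct snap *s = newest; s != NULL &&
	    birth_txg < s->txg && s->txg <= txg_to_consider; s = s->prev)
		n++;
	return (n);
}

int
main(void)
{
	struct snap s1 = { 100, NULL };
	struct snap s2 = { 200, &s1 };
	struct snap s3 = { 300, &s2 };

	/* A block born in txg 150 is visible to the 200 and 300 snapshots. */
	printf("%u snapshots affected\n", affected_snapshots(&s3, 150, 400));
	return (0);
}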
+ */ + boolean_t found = B_FALSE; + for (int i = 0; i < snap_count; i++) { + if (dsl_dir_phys(clone->ds_dir)->dd_origin_obj + == snap_obj_array[i]) + found = B_TRUE; + } + dsl_dataset_rele_flags(clone, DS_HOLD_FLAG_DECRYPT, FTAG); + + if (!found) + continue; + + clones_t *ct = kmem_zalloc(sizeof (*ct), KM_SLEEP); + ct->clone_ds = za->za_first_integer; + list_insert_tail(clones_list, ct); + } + + zap_cursor_fini(zc); + kmem_free(za, sizeof (*za)); + kmem_free(zc, sizeof (*zc)); + +out: + kmem_free(snap_obj_array, sizeof (*snap_obj_array)); + return (error); +} + +static int +process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, + void *uaddr, uint64_t *count) +{ + /* + * If zb_birth == 0 or head_ds == 0 it means we failed to retrieve the + * birth txg or the head filesystem of the block pointer. This may + * happen e.g. when an encrypted filesystem is not mounted or when + * the key is not loaded. In this case do not proceed to + * check_filesystem(), instead do the accounting here. + */ + if (zep->zb_birth == 0 || head_ds == 0) { + zbookmark_phys_t zb; + zep_to_zb(head_ds, zep, &zb); + int error = copyout_entry(&zb, uaddr, count); + if (error != 0) { + return (error); + } + return (0); + } + + uint64_t top_affected_fs; + uint64_t init_count = *count; + int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs); + if (error == 0) { + clones_t *ct; + list_t clones_list; + + list_create(&clones_list, sizeof (clones_t), + offsetof(clones_t, node)); + + error = check_filesystem(spa, top_affected_fs, zep, + uaddr, count, &clones_list); + + while ((ct = list_remove_head(&clones_list)) != NULL) { + error = check_filesystem(spa, ct->clone_ds, zep, + uaddr, count, &clones_list); + kmem_free(ct, sizeof (*ct)); + + if (error) { + while (!list_is_empty(&clones_list)) { + ct = list_remove_head(&clones_list); + kmem_free(ct, sizeof (*ct)); + } + break; + } + } + + list_destroy(&clones_list); + } + if (error == 0 && init_count == *count) { + /* + * If we reach this point, no errors have been detected + * in the checked filesystems/snapshots. Before returning mark + * the error block to be removed from the error lists and logs. + */ + zbookmark_phys_t zb; + zep_to_zb(head_ds, zep, &zb); + spa_remove_error(spa, &zb, zep->zb_birth); + } + + return (error); +} +#endif + +/* Return the number of errors in the error log */ uint64_t -spa_get_errlog_size(spa_t *spa) +spa_get_last_errlog_size(spa_t *spa) { uint64_t total = 0, count; - mutex_enter(&spa->spa_errlog_lock); - if (spa->spa_errlog_scrub != 0 && - zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, - &count) == 0) - total += count; - if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && + if (spa->spa_errlog_last != 0 && zap_count(spa->spa_meta_objset, spa->spa_errlog_last, &count) == 0) total += count; mutex_exit(&spa->spa_errlog_lock); + return (total); +} + +/* + * If a healed bookmark matches an entry in the error log we stash it in a tree + * so that we can later remove the related log entries in sync context. 
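Note how process_error_block() descends into clones without recursion: check_filesystem() appends candidate clone datasets to clones_list, and the caller keeps popping the head of the list (each pop may queue further clones) until it drains. A generic sketch of that worklist pattern, with an invented visit() standing in for check_filesystem():

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct work {
	uint64_t ds_obj;
	struct work *next;
};

static void
push(struct work **head, uint64_t ds_obj)
{
	struct work *w = malloc(sizeof (*w));

	w->ds_obj = ds_obj;
	w->next = *head;
	*head = w;
}

/*
 * Visit one dataset; in the real code this is check_filesystem(), which
 * may queue that dataset's own clones.  Here we just stop at object 0.
 */
static void
visit(uint64_t ds_obj, struct work **head)
{
	printf("checking dataset %llu\n", (unsigned long long)ds_obj);
	if (ds_obj > 0)
		push(head, ds_obj - 1);	/* pretend we found one more clone */
}

int
main(void)
{
	struct work *head = NULL;

	push(&head, 3);			/* top affected filesystem */
	while (head != NULL) {
		struct work *w = head;
		head = w->next;
		visit(w->ds_obj, &head);
		free(w);
	}
	return (0);
}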
+ */ +static void +spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb, + const uint64_t birth) +{ + char name[NAME_MAX_LEN]; + + if (obj == 0) + return; + + boolean_t held_list = B_FALSE; + boolean_t held_log = B_FALSE; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + bookmark_to_name(healed_zb, name, sizeof (name)); + + if (zap_contains(spa->spa_meta_objset, healed_zb->zb_objset, + name) == 0) { + if (!MUTEX_HELD(&spa->spa_errlog_lock)) { + mutex_enter(&spa->spa_errlog_lock); + held_log = B_TRUE; + } + + /* + * Found an error matching healed zb, add zb to our + * tree of healed errors + */ + avl_tree_t *tree = &spa->spa_errlist_healed; + spa_error_entry_t search; + spa_error_entry_t *new; + avl_index_t where; + search.se_bookmark = *healed_zb; + if (!MUTEX_HELD(&spa->spa_errlist_lock)) { + mutex_enter(&spa->spa_errlist_lock); + held_list = B_TRUE; + } + if (avl_find(tree, &search, &where) != NULL) { + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); + return; + } + new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); + new->se_bookmark = *healed_zb; + avl_insert(tree, new, where); + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); + } + return; + } + + zbookmark_err_phys_t healed_zep; + healed_zep.zb_object = healed_zb->zb_object; + healed_zep.zb_level = healed_zb->zb_level; + healed_zep.zb_blkid = healed_zb->zb_blkid; + healed_zep.zb_birth = birth; + + errphys_to_name(&healed_zep, name, sizeof (name)); + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_errlog_last); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + if (zap_contains(spa->spa_meta_objset, za.za_first_integer, + name) == 0) { + if (!MUTEX_HELD(&spa->spa_errlog_lock)) { + mutex_enter(&spa->spa_errlog_lock); + held_log = B_TRUE; + } + + avl_tree_t *tree = &spa->spa_errlist_healed; + spa_error_entry_t search; + spa_error_entry_t *new; + avl_index_t where; + search.se_bookmark = *healed_zb; + + if (!MUTEX_HELD(&spa->spa_errlist_lock)) { + mutex_enter(&spa->spa_errlist_lock); + held_list = B_TRUE; + } + + if (avl_find(tree, &search, &where) != NULL) { + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); + continue; + } + new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); + new->se_bookmark = *healed_zb; + new->se_zep = healed_zep; + avl_insert(tree, new, where); + + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); + } + } + zap_cursor_fini(&zc); +} + +/* + * If this error exists in the given tree remove it. 
+ */ +static void +remove_error_from_list(spa_t *spa, avl_tree_t *t, const zbookmark_phys_t *zb) +{ + spa_error_entry_t search, *found; + avl_index_t where; + + mutex_enter(&spa->spa_errlist_lock); + search.se_bookmark = *zb; + if ((found = avl_find(t, &search, &where)) != NULL) { + avl_remove(t, found); + kmem_free(found, sizeof (spa_error_entry_t)); + } + mutex_exit(&spa->spa_errlist_lock); +} + + +/* + * Removes all of the recv healed errors from both on-disk error logs + */ +static void +spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx) +{ + char name[NAME_MAX_LEN]; + spa_error_entry_t *se; + void *cookie = NULL; + + ASSERT(MUTEX_HELD(&spa->spa_errlog_lock)); + + while ((se = avl_destroy_nodes(&spa->spa_errlist_healed, + &cookie)) != NULL) { + remove_error_from_list(spa, s, &se->se_bookmark); + remove_error_from_list(spa, l, &se->se_bookmark); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + bookmark_to_name(&se->se_bookmark, name, sizeof (name)); + (void) zap_remove(spa->spa_meta_objset, + spa->spa_errlog_last, name, tx); + (void) zap_remove(spa->spa_meta_objset, + spa->spa_errlog_scrub, name, tx); + } else { + errphys_to_name(&se->se_zep, name, sizeof (name)); + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_errlog_last); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + zap_remove(spa->spa_meta_objset, + za.za_first_integer, name, tx); + } + zap_cursor_fini(&zc); + + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_errlog_scrub); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + zap_remove(spa->spa_meta_objset, + za.za_first_integer, name, tx); + } + zap_cursor_fini(&zc); + } + kmem_free(se, sizeof (spa_error_entry_t)); + } +} + +/* + * Stash away healed bookmarks to remove them from the on-disk error logs + * later in spa_remove_healed_errors(). + */ +void +spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, uint64_t birth) +{ + spa_add_healed_error(spa, spa->spa_errlog_last, zb, birth); + spa_add_healed_error(spa, spa->spa_errlog_scrub, zb, birth); +} + +static uint64_t +approx_errlog_size_impl(spa_t *spa, uint64_t spa_err_obj) +{ + if (spa_err_obj == 0) + return (0); + uint64_t total = 0; + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + uint64_t count; + if (zap_count(spa->spa_meta_objset, za.za_first_integer, + &count) == 0) + total += count; + } + zap_cursor_fini(&zc); + return (total); +} + +/* + * Return the approximate number of errors currently in the error log. This + * will be nonzero if there are some errors, but otherwise it may be more + * or less than the number of entries returned by spa_get_errlog(). 
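With head_errlog enabled the on-disk log becomes two levels deep: an outer ZAP keyed by head-dataset object whose values name per-dataset ZAPs of error strings, so approx_errlog_size_impl() sums zap_count() over every inner object. The result may differ from what spa_get_errlog() returns, since that path expands entries per affected snapshot or clone and, as a later comment notes, a block shared by two datasets is counted twice. A toy model of the two-level count, with a flat array standing in for the ZAP objects:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Toy stand-in for one per-dataset error ZAP. */
struct dataset_errlog {
	uint64_t head_ds;	/* outer key: head dataset object */
	uint64_t nerrors;	/* zap_count() of the inner object */
};

static uint64_t
approx_size(const struct dataset_errlog *logs, size_t n)
{
	uint64_t total = 0;

	for (size_t i = 0; i < n; i++)
		total += logs[i].nerrors;
	return (total);
}

int
main(void)
{
	struct dataset_errlog logs[] = {
		{ 54, 3 },	/* 3 error blocks logged under dataset 54 */
		{ 61, 1 },
		{ 75, 7 },
	};

	printf("approx errlog size: %llu\n", (unsigned long long)
	    approx_size(logs, sizeof (logs) / sizeof (logs[0])));
	return (0);
}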
+ */ +uint64_t +spa_approx_errlog_size(spa_t *spa) +{ + uint64_t total = 0; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + mutex_enter(&spa->spa_errlog_lock); + uint64_t count; + if (spa->spa_errlog_scrub != 0 && + zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, + &count) == 0) + total += count; + + if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && + zap_count(spa->spa_meta_objset, spa->spa_errlog_last, + &count) == 0) + total += count; + mutex_exit(&spa->spa_errlog_lock); + + } else { + mutex_enter(&spa->spa_errlog_lock); + total += approx_errlog_size_impl(spa, spa->spa_errlog_last); + total += approx_errlog_size_impl(spa, spa->spa_errlog_scrub); + mutex_exit(&spa->spa_errlog_lock); + } mutex_enter(&spa->spa_errlist_lock); total += avl_numnodes(&spa->spa_errlist_last); total += avl_numnodes(&spa->spa_errlist_scrub); mutex_exit(&spa->spa_errlist_lock); - return (total); } -#ifdef _KERNEL -static int -process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) +/* + * This function sweeps through an on-disk error log and stores all bookmarks + * as error bookmarks in a new ZAP object. At the end we discard the old one, + * and spa_update_errlog() will set the spa's on-disk error log to new ZAP + * object. + */ +static void +sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, + dmu_tx_t *tx) { zap_cursor_t zc; zap_attribute_t za; zbookmark_phys_t zb; + uint64_t count; - if (obj == 0) - return (0); + *newobj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, + DMU_OT_NONE, 0, tx); + + /* + * If we cannnot perform the upgrade we should clear the old on-disk + * error logs. + */ + if (zap_count(spa->spa_meta_objset, spa_err_obj, &count) != 0) { + VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); + return; + } - for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + if (spa_upgrade_errlog_limit != 0 && + zc.zc_cd == spa_upgrade_errlog_limit) + break; - if (*count == 0) { - zap_cursor_fini(&zc); - return (SET_ERROR(ENOMEM)); + name_to_bookmark(za.za_name, &zb); + + zbookmark_err_phys_t zep; + zep.zb_object = zb.zb_object; + zep.zb_level = zb.zb_level; + zep.zb_blkid = zb.zb_blkid; + zep.zb_birth = 0; + + /* + * In case of an error we should simply continue instead of + * returning prematurely. See the next comment. + */ + uint64_t head_ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_dataset_t *ds; + objset_t *os; + + int error = dsl_dataset_hold_obj_flags(dp, zb.zb_objset, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + if (error != 0) + continue; + + head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + + /* + * The objset and the dnode are required for getting the block + * pointer, which is used to determine if BP_IS_HOLE(). If + * getting the objset or the dnode fails, do not create a + * zap entry (presuming we know the dataset) as this may create + * spurious errors that we cannot ever resolve. If an error is + * truly persistent, it should re-appear after a scan. 
+ */ + if (dmu_objset_from_ds(ds, &os) != 0) { + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + continue; } - name_to_bookmark(za.za_name, &zb); + dnode_t *dn; + blkptr_t bp; - if (copyout(&zb, (char *)addr + - (*count - 1) * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) { - zap_cursor_fini(&zc); - return (SET_ERROR(EFAULT)); + if (dnode_hold(os, zep.zb_object, FTAG, &dn) != 0) { + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + continue; } - *count -= 1; - } + rw_enter(&dn->dn_struct_rwlock, RW_READER); + error = dbuf_dnode_findbp(dn, zep.zb_level, zep.zb_blkid, &bp, + NULL, NULL); + if (error == EACCES) + error = 0; + else if (!error) + zep.zb_birth = BP_GET_LOGICAL_BIRTH(&bp); + + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + + if (error != 0 || BP_IS_HOLE(&bp)) + continue; + + uint64_t err_obj; + error = zap_lookup_int_key(spa->spa_meta_objset, *newobj, + head_ds, &err_obj); + + if (error == ENOENT) { + err_obj = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); + + (void) zap_update_int_key(spa->spa_meta_objset, + *newobj, head_ds, err_obj, tx); + } + char buf[64]; + errphys_to_name(&zep, buf, sizeof (buf)); + + const char *name = ""; + (void) zap_update(spa->spa_meta_objset, err_obj, + buf, 1, strlen(name) + 1, name, tx); + } zap_cursor_fini(&zc); - return (0); + VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); +} + +void +spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx) +{ + uint64_t newobj = 0; + + mutex_enter(&spa->spa_errlog_lock); + if (spa->spa_errlog_last != 0) { + sync_upgrade_errlog(spa, spa->spa_errlog_last, &newobj, tx); + spa->spa_errlog_last = newobj; + + (void) zap_update(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, + sizeof (uint64_t), 1, &spa->spa_errlog_last, tx); + } + + if (spa->spa_errlog_scrub != 0) { + sync_upgrade_errlog(spa, spa->spa_errlog_scrub, &newobj, tx); + spa->spa_errlog_scrub = newobj; + + (void) zap_update(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, + sizeof (uint64_t), 1, &spa->spa_errlog_scrub, tx); + } + + mutex_exit(&spa->spa_errlog_lock); } +#ifdef _KERNEL +/* + * If an error block is shared by two datasets it will be counted twice. 
+ */ static int -process_error_list(avl_tree_t *list, void *addr, size_t *count) +process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count) { - spa_error_entry_t *se; + if (obj == 0) + return (0); - for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { + zap_cursor_t *zc; + zap_attribute_t *za; + + zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + for (zap_cursor_init(zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + if (*count == 0) { + zap_cursor_fini(zc); + kmem_free(zc, sizeof (*zc)); + kmem_free(za, sizeof (*za)); + return (SET_ERROR(ENOMEM)); + } + + zbookmark_phys_t zb; + name_to_bookmark(za->za_name, &zb); + + int error = copyout_entry(&zb, uaddr, count); + if (error != 0) { + zap_cursor_fini(zc); + kmem_free(zc, sizeof (*zc)); + kmem_free(za, sizeof (*za)); + return (error); + } + } + zap_cursor_fini(zc); + kmem_free(zc, sizeof (*zc)); + kmem_free(za, sizeof (*za)); + return (0); + } - if (*count == 0) - return (SET_ERROR(ENOMEM)); + for (zap_cursor_init(zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + + zap_cursor_t *head_ds_cursor; + zap_attribute_t *head_ds_attr; + + head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + head_ds_attr = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + + uint64_t head_ds_err_obj = za->za_first_integer; + uint64_t head_ds; + name_to_object(za->za_name, &head_ds); + for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset, + head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor, + head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) { + + zbookmark_err_phys_t head_ds_block; + name_to_errphys(head_ds_attr->za_name, &head_ds_block); + int error = process_error_block(spa, head_ds, + &head_ds_block, uaddr, count); + + if (error != 0) { + zap_cursor_fini(head_ds_cursor); + kmem_free(head_ds_cursor, + sizeof (*head_ds_cursor)); + kmem_free(head_ds_attr, sizeof (*head_ds_attr)); + + zap_cursor_fini(zc); + kmem_free(za, sizeof (*za)); + kmem_free(zc, sizeof (*zc)); + return (error); + } + } + zap_cursor_fini(head_ds_cursor); + kmem_free(head_ds_cursor, sizeof (*head_ds_cursor)); + kmem_free(head_ds_attr, sizeof (*head_ds_attr)); + } + zap_cursor_fini(zc); + kmem_free(za, sizeof (*za)); + kmem_free(zc, sizeof (*zc)); + return (0); +} - if (copyout(&se->se_bookmark, (char *)addr + - (*count - 1) * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) - return (SET_ERROR(EFAULT)); +static int +process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count) +{ + spa_error_entry_t *se; - *count -= 1; + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + for (se = avl_first(list); se != NULL; + se = AVL_NEXT(list, se)) { + int error = + copyout_entry(&se->se_bookmark, uaddr, count); + if (error != 0) { + return (error); + } + } + return (0); } + for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { + uint64_t head_ds = 0; + int error = get_head_ds(spa, se->se_bookmark.zb_objset, + &head_ds); + + /* + * If get_head_ds() errors out, set the head filesystem + * to the filesystem stored in the bookmark of the + * error block. 
+ */ + if (error != 0) + head_ds = se->se_bookmark.zb_objset; + + error = process_error_block(spa, head_ds, + &se->se_zep, uaddr, count); + if (error != 0) + return (error); + } return (0); } #endif @@ -229,11 +1079,18 @@ process_error_list(avl_tree_t *list, void *addr, size_t *count) * the error list lock when we are finished. */ int -spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) +spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count) { int ret = 0; #ifdef _KERNEL + /* + * The pool config lock is needed to hold a dataset_t via (among other + * places) process_error_list() -> process_error_block()-> + * find_top_affected_fs(), and lock ordering requires that we get it + * before the spa_errlog_lock. + */ + dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); mutex_enter(&spa->spa_errlog_lock); ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count); @@ -244,14 +1101,17 @@ spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) mutex_enter(&spa->spa_errlist_lock); if (!ret) - ret = process_error_list(&spa->spa_errlist_scrub, uaddr, + ret = process_error_list(spa, &spa->spa_errlist_scrub, uaddr, count); if (!ret) - ret = process_error_list(&spa->spa_errlist_last, uaddr, + ret = process_error_list(spa, &spa->spa_errlist_last, uaddr, count); mutex_exit(&spa->spa_errlist_lock); mutex_exit(&spa->spa_errlog_lock); + dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); +#else + (void) spa, (void) uaddr, (void) count; #endif return (ret); @@ -297,35 +1157,89 @@ spa_errlog_drain(spa_t *spa) /* * Process a list of errors into the current on-disk log. */ -static void +void sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) { spa_error_entry_t *se; - char buf[64]; + char buf[NAME_MAX_LEN]; void *cookie; - if (avl_numnodes(t) != 0) { - /* create log if necessary */ - if (*obj == 0) - *obj = zap_create(spa->spa_meta_objset, - DMU_OT_ERROR_LOG, DMU_OT_NONE, - 0, tx); + if (avl_numnodes(t) == 0) + return; + + /* create log if necessary */ + if (*obj == 0) + *obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, + DMU_OT_NONE, 0, tx); - /* add errors to the current log */ + /* add errors to the current log */ + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { - char *name = se->se_name ? se->se_name : ""; - bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); + const char *name = se->se_name ? se->se_name : ""; + (void) zap_update(spa->spa_meta_objset, *obj, buf, 1, + strlen(name) + 1, name, tx); + } + } else { + for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { + zbookmark_err_phys_t zep; + zep.zb_object = se->se_zep.zb_object; + zep.zb_level = se->se_zep.zb_level; + zep.zb_blkid = se->se_zep.zb_blkid; + zep.zb_birth = se->se_zep.zb_birth; + + uint64_t head_ds = 0; + int error = get_head_ds(spa, se->se_bookmark.zb_objset, + &head_ds); + + /* + * If get_head_ds() errors out, set the head filesystem + * to the filesystem stored in the bookmark of the + * error block. + */ + if (error != 0) + head_ds = se->se_bookmark.zb_objset; + + uint64_t err_obj; + error = zap_lookup_int_key(spa->spa_meta_objset, + *obj, head_ds, &err_obj); + + if (error == ENOENT) { + err_obj = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); + + (void) zap_update_int_key(spa->spa_meta_objset, + *obj, head_ds, err_obj, tx); + } + errphys_to_name(&zep, buf, sizeof (buf)); + + const char *name = se->se_name ? 
se->se_name : ""; (void) zap_update(spa->spa_meta_objset, - *obj, buf, 1, strlen(name) + 1, name, tx); + err_obj, buf, 1, strlen(name) + 1, name, tx); } + } + /* purge the error list */ + cookie = NULL; + while ((se = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); +} - /* purge the error list */ - cookie = NULL; - while ((se = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(se, sizeof (spa_error_entry_t)); +static void +delete_errlog(spa_t *spa, uint64_t spa_err_obj, dmu_tx_t *tx) +{ + if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + VERIFY0(dmu_object_free(spa->spa_meta_objset, + za.za_first_integer, tx)); + } + zap_cursor_fini(&zc); } + VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); } /* @@ -352,6 +1266,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) */ if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && avl_numnodes(&spa->spa_errlist_last) == 0 && + avl_numnodes(&spa->spa_errlist_healed) == 0 && !spa->spa_scrub_finished) { mutex_exit(&spa->spa_errlist_lock); return; @@ -362,11 +1277,23 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) spa->spa_scrub_finished = B_FALSE; mutex_exit(&spa->spa_errlist_lock); + + /* + * The pool config lock is needed to hold a dataset_t via + * sync_error_list() -> get_head_ds(), and lock ordering + * requires that we get it before the spa_errlog_lock. + */ + dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); mutex_enter(&spa->spa_errlog_lock); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); /* + * Remove healed errors from errors. + */ + spa_remove_healed_errors(spa, &last, &scrub, tx); + + /* * Sync out the current list of errors. 
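spa_get_errlog() and spa_errlog_sync() both take the pool config lock before spa_errlog_lock; sticking to one fixed acquisition order everywhere (and releasing in reverse) is what rules out an ABBA deadlock between the two paths. A minimal pthread illustration of that rule, with made-up lock names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t config_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t errlog_lock = PTHREAD_MUTEX_INITIALIZER;

/* Every path that needs both locks acquires them in this order. */
static void
with_both_locks(const char *who)
{
	pthread_mutex_lock(&config_lock);	/* outer lock first */
	pthread_mutex_lock(&errlog_lock);	/* inner lock second */

	printf("%s: holding config_lock then errlog_lock\n", who);

	pthread_mutex_unlock(&errlog_lock);	/* release in reverse order */
	pthread_mutex_unlock(&config_lock);
}

static void *
worker(void *arg)
{
	(void) arg;
	with_both_locks("reader");
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	with_both_locks("syncer");
	pthread_join(t, NULL);
	return (0);
}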
*/ sync_error_list(spa, &last, &spa->spa_errlog_last, tx); @@ -376,8 +1303,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) */ if (scrub_finished) { if (spa->spa_errlog_last != 0) - VERIFY(dmu_object_free(spa->spa_meta_objset, - spa->spa_errlog_last, tx) == 0); + delete_errlog(spa, spa->spa_errlog_last, tx); spa->spa_errlog_last = spa->spa_errlog_scrub; spa->spa_errlog_scrub = 0; @@ -402,15 +1328,163 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) dmu_tx_commit(tx); mutex_exit(&spa->spa_errlog_lock); + dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); +} + +static void +delete_dataset_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t ds, + dmu_tx_t *tx) +{ + if (spa_err_obj == 0) + return; + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + uint64_t head_ds; + name_to_object(za.za_name, &head_ds); + if (head_ds == ds) { + (void) zap_remove(spa->spa_meta_objset, spa_err_obj, + za.za_name, tx); + VERIFY0(dmu_object_free(spa->spa_meta_objset, + za.za_first_integer, tx)); + break; + } + } + zap_cursor_fini(&zc); +} + +void +spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx) +{ + mutex_enter(&spa->spa_errlog_lock); + delete_dataset_errlog(spa, spa->spa_errlog_scrub, ds, tx); + delete_dataset_errlog(spa, spa->spa_errlog_last, ds, tx); + mutex_exit(&spa->spa_errlog_lock); +} + +static int +find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head, + uint64_t *txg) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + + int error = dsl_dataset_hold_obj_flags(dp, old_head, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + if (error != 0) + return (error); + + uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + + while (prev_obj != 0) { + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + if ((error = dsl_dataset_hold_obj_flags(dp, prev_obj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds)) == 0 && + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj == new_head) + break; + + if (error != 0) + return (error); + + prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + } + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + ASSERT(prev_obj != 0); + *txg = prev_obj_txg; + return (0); +} + +static void +swap_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t new_head, uint64_t + old_head, dmu_tx_t *tx) +{ + if (spa_err_obj == 0) + return; + + uint64_t old_head_errlog; + int error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, + old_head, &old_head_errlog); + + /* If no error log, then there is nothing to do. */ + if (error != 0) + return; + + uint64_t txg; + error = find_txg_ancestor_snapshot(spa, new_head, old_head, &txg); + if (error != 0) + return; + + /* + * Create an error log if the file system being promoted does not + * already have one. 
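swap_errlog() handles promotion by walking the old head's per-dataset error ZAP and moving every entry whose birth txg predates the txg at which the two heads diverged into the promoted head's log. A toy model of that birth-txg filter over an in-memory list (the entry type and the moved flag are illustrative stand-ins for the zap_update/zap_remove pair):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct err_entry {
	uint64_t blkid;
	uint64_t birth;		/* txg the block was written */
	int moved;		/* toy stand-in for zap_update + zap_remove */
};

/* Move entries born before the divergence txg to the new head's log. */
static void
swap_errlog_model(struct err_entry *old_head, size_t n, uint64_t divergence_txg)
{
	for (size_t i = 0; i < n; i++) {
		if (old_head[i].birth < divergence_txg)
			old_head[i].moved = 1;
	}
}

int
main(void)
{
	struct err_entry log[] = {
		{ 0x10, 120, 0 },	/* written before the heads diverged */
		{ 0x20, 480, 0 },	/* written after the heads diverged */
	};

	swap_errlog_model(log, 2, 300);
	for (size_t i = 0; i < 2; i++)
		printf("blkid %llx %s\n", (unsigned long long)log[i].blkid,
		    log[i].moved ? "moved to promoted head" : "stays");
	return (0);
}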
+ */ + uint64_t new_head_errlog; + error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, new_head, + &new_head_errlog); + + if (error != 0) { + new_head_errlog = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); + + (void) zap_update_int_key(spa->spa_meta_objset, spa_err_obj, + new_head, new_head_errlog, tx); + } + + zap_cursor_t zc; + zap_attribute_t za; + zbookmark_err_phys_t err_block; + for (zap_cursor_init(&zc, spa->spa_meta_objset, old_head_errlog); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + + const char *name = ""; + name_to_errphys(za.za_name, &err_block); + if (err_block.zb_birth < txg) { + (void) zap_update(spa->spa_meta_objset, new_head_errlog, + za.za_name, 1, strlen(name) + 1, name, tx); + + (void) zap_remove(spa->spa_meta_objset, old_head_errlog, + za.za_name, tx); + } + } + zap_cursor_fini(&zc); +} + +void +spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds, + dmu_tx_t *tx) +{ + mutex_enter(&spa->spa_errlog_lock); + swap_errlog(spa, spa->spa_errlog_scrub, new_head_ds, old_head_ds, tx); + swap_errlog(spa, spa->spa_errlog_last, new_head_ds, old_head_ds, tx); + mutex_exit(&spa->spa_errlog_lock); } #if defined(_KERNEL) /* error handling */ EXPORT_SYMBOL(spa_log_error); -EXPORT_SYMBOL(spa_get_errlog_size); +EXPORT_SYMBOL(spa_approx_errlog_size); +EXPORT_SYMBOL(spa_get_last_errlog_size); EXPORT_SYMBOL(spa_get_errlog); EXPORT_SYMBOL(spa_errlog_rotate); EXPORT_SYMBOL(spa_errlog_drain); EXPORT_SYMBOL(spa_errlog_sync); EXPORT_SYMBOL(spa_get_errlists); +EXPORT_SYMBOL(spa_delete_dataset_errlog); +EXPORT_SYMBOL(spa_swap_errlog); +EXPORT_SYMBOL(sync_error_list); +EXPORT_SYMBOL(spa_upgrade_errlog); +EXPORT_SYMBOL(find_top_affected_fs); +EXPORT_SYMBOL(find_birth_txg); +EXPORT_SYMBOL(zep_to_zb); +EXPORT_SYMBOL(name_to_errphys); #endif + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_spa, spa_, upgrade_errlog_limit, UINT, ZMOD_RW, + "Limit the number of errors which will be upgraded to the new " + "on-disk error log when enabling head_errlog"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/spa_history.c b/sys/contrib/openzfs/module/zfs/spa_history.c index dae06e46c316..de036d6c3718 100644 --- a/sys/contrib/openzfs/module/zfs/spa_history.c +++ b/sys/contrib/openzfs/module/zfs/spa_history.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -199,7 +199,7 @@ spa_history_log_notify(spa_t *spa, nvlist_t *nvl) { nvlist_t *hist_nvl = fnvlist_alloc(); uint64_t uint64; - char *string; + const char *string; if (nvlist_lookup_string(nvl, ZPOOL_HIST_CMD, &string) == 0) fnvlist_add_string(hist_nvl, ZFS_EV_HIST_CMD, string); @@ -248,7 +248,6 @@ spa_history_log_notify(spa_t *spa, nvlist_t *nvl) /* * Write out a history event. */ -/*ARGSUSED*/ static void spa_history_log_sync(void *arg, dmu_tx_t *tx) { diff --git a/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c index 6fd302b8df34..32158e8c592c 100644 --- a/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c +++ b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -177,7 +177,7 @@ * block size as we expect to be writing a lot of data to them at * once. */ -unsigned long zfs_log_sm_blksz = 1ULL << 17; +static const unsigned long zfs_log_sm_blksz = 1ULL << 17; /* * Percentage of the overall system's memory that ZFS allows to be @@ -188,13 +188,13 @@ unsigned long zfs_log_sm_blksz = 1ULL << 17; * (thus the _ppm suffix; reads as "parts per million"). As an example, * the default of 1000 allows 0.1% of memory to be used. */ -unsigned long zfs_unflushed_max_mem_ppm = 1000; +static uint64_t zfs_unflushed_max_mem_ppm = 1000; /* * Specific hard-limit in memory that ZFS allows to be used for * unflushed changes. */ -unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30; +static uint64_t zfs_unflushed_max_mem_amt = 1ULL << 30; /* * The following tunable determines the number of blocks that can be used for @@ -243,28 +243,33 @@ unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30; * provide upper and lower bounds for the log block limit. * [see zfs_unflushed_log_block_{min,max}] */ -unsigned long zfs_unflushed_log_block_pct = 400; +static uint_t zfs_unflushed_log_block_pct = 400; /* * If the number of metaslabs is small and our incoming rate is high, we could * get into a situation that we are flushing all our metaslabs every TXG. Thus * we always allow at least this many log blocks. */ -unsigned long zfs_unflushed_log_block_min = 1000; +static uint64_t zfs_unflushed_log_block_min = 1000; /* * If the log becomes too big, the import time of the pool can take a hit in * terms of performance. Thus we have a hard limit in the size of the log in * terms of blocks. */ -unsigned long zfs_unflushed_log_block_max = (1ULL << 18); +static uint64_t zfs_unflushed_log_block_max = (1ULL << 17); + +/* + * Also we have a hard limit in the size of the log in terms of dirty TXGs. + */ +static uint64_t zfs_unflushed_log_txg_max = 1000; /* * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and * stability of the flushing algorithm (longer summary) vs its runtime overhead * (smaller summary is faster to traverse). */ -unsigned long zfs_max_logsm_summary_length = 10; +static uint64_t zfs_max_logsm_summary_length = 10; /* * Tunable that sets the lower bound on the metaslabs to flush every TXG. @@ -277,7 +282,7 @@ unsigned long zfs_max_logsm_summary_length = 10; * The point of this tunable is to be used in extreme cases where we really * want to flush more metaslabs than our adaptable heuristic plans to flush. */ -unsigned long zfs_min_metaslabs_to_flush = 1; +static uint64_t zfs_min_metaslabs_to_flush = 1; /* * Tunable that specifies how far in the past do we want to look when trying to @@ -288,7 +293,7 @@ unsigned long zfs_min_metaslabs_to_flush = 1; * average over all the blocks that we walk * [see spa_estimate_incoming_log_blocks]. */ -unsigned long zfs_max_log_walking = 5; +static uint64_t zfs_max_log_walking = 5; /* * This tunable exists solely for testing purposes. 
It ensures that the log @@ -333,9 +338,13 @@ spa_log_sm_set_blocklimit(spa_t *spa) return; } - uint64_t calculated_limit = - (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100; - spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit, + uint64_t msdcount = 0; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e; e = list_next(&spa->spa_log_summary, e)) + msdcount += e->lse_msdcount; + + uint64_t limit = msdcount * zfs_unflushed_log_block_pct / 100; + spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(limit, zfs_unflushed_log_block_min), zfs_unflushed_log_block_max); } @@ -380,8 +389,13 @@ spa_log_summary_verify_counts(spa_t *spa) } static boolean_t -summary_entry_is_full(spa_t *spa, log_summary_entry_t *e) +summary_entry_is_full(spa_t *spa, log_summary_entry_t *e, uint64_t txg) { + if (e->lse_end == txg) + return (0); + if (e->lse_txgcount >= DIV_ROUND_UP(zfs_unflushed_log_txg_max, + zfs_max_logsm_summary_length)) + return (1); uint64_t blocks_per_row = MAX(1, DIV_ROUND_UP(spa_log_sm_blocklimit(spa), zfs_max_logsm_summary_length)); @@ -401,7 +415,7 @@ summary_entry_is_full(spa_t *spa, log_summary_entry_t *e) * the metaslab. */ void -spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg) +spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg, boolean_t dirty) { /* * We don't track summary data for read-only pools and this function @@ -429,6 +443,8 @@ spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg) } target->lse_mscount--; + if (dirty) + target->lse_msdcount--; } /* @@ -490,15 +506,12 @@ spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg) void spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone) { - for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); - e != NULL; e = list_head(&spa->spa_log_summary)) { + log_summary_entry_t *e = list_head(&spa->spa_log_summary); + ASSERT3P(e, !=, NULL); + if (e->lse_txgcount > 0) + e->lse_txgcount--; + for (; e != NULL; e = list_head(&spa->spa_log_summary)) { if (e->lse_blkcount > blocks_gone) { - /* - * Assert that we stopped at an entry that is not - * obsolete. - */ - ASSERT(e->lse_mscount != 0); - e->lse_blkcount -= blocks_gone; blocks_gone = 0; break; @@ -560,31 +573,52 @@ spa_log_sm_increment_current_mscount(spa_t *spa) static void summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed, - uint64_t nblocks) + uint64_t metaslabs_dirty, uint64_t nblocks) { log_summary_entry_t *e = list_tail(&spa->spa_log_summary); - if (e == NULL || summary_entry_is_full(spa, e)) { + if (e == NULL || summary_entry_is_full(spa, e, txg)) { e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP); - e->lse_start = txg; + e->lse_start = e->lse_end = txg; + e->lse_txgcount = 1; list_insert_tail(&spa->spa_log_summary, e); } ASSERT3U(e->lse_start, <=, txg); + if (e->lse_end < txg) { + e->lse_end = txg; + e->lse_txgcount++; + } e->lse_mscount += metaslabs_flushed; + e->lse_msdcount += metaslabs_dirty; e->lse_blkcount += nblocks; } static void spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks) { - summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks); + summary_add_data(spa, spa_syncing_txg(spa), 0, 0, nblocks); +} + +void +spa_log_summary_add_flushed_metaslab(spa_t *spa, boolean_t dirty) +{ + summary_add_data(spa, spa_syncing_txg(spa), 1, dirty ? 
1 : 0, 0); } void -spa_log_summary_add_flushed_metaslab(spa_t *spa) +spa_log_summary_dirty_flushed_metaslab(spa_t *spa, uint64_t txg) { - summary_add_data(spa, spa_syncing_txg(spa), 1, 0); + log_summary_entry_t *target = NULL; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e != NULL; e = list_next(&spa->spa_log_summary, e)) { + if (e->lse_start > txg) + break; + target = e; + } + ASSERT3P(target, !=, NULL); + ASSERT3U(target->lse_mscount, !=, 0); + target->lse_msdcount++; } /* @@ -630,6 +664,11 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) int64_t available_blocks = spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming; + int64_t available_txgs = zfs_unflushed_log_txg_max; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e; e = list_next(&spa->spa_log_summary, e)) + available_txgs -= e->lse_txgcount; + /* * This variable tells us the total number of flushes needed to * keep the log size within the limit when we reach txgs_in_future. @@ -637,9 +676,7 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) uint64_t total_flushes = 0; /* Holds the current maximum of our estimates so far. */ - uint64_t max_flushes_pertxg = - MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed), - zfs_min_metaslabs_to_flush); + uint64_t max_flushes_pertxg = zfs_min_metaslabs_to_flush; /* * For our estimations we only look as far in the future @@ -653,11 +690,15 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) * then keep skipping TXGs accumulating more blocks * based on the incoming rate until we exceed it. */ - if (available_blocks >= 0) { - uint64_t skip_txgs = (available_blocks / incoming) + 1; + if (available_blocks >= 0 && available_txgs >= 0) { + uint64_t skip_txgs = (incoming == 0) ? + available_txgs + 1 : MIN(available_txgs + 1, + (available_blocks / incoming) + 1); available_blocks -= (skip_txgs * incoming); + available_txgs -= skip_txgs; txgs_in_future += skip_txgs; ASSERT3S(available_blocks, >=, -incoming); + ASSERT3S(available_txgs, >=, -1); } /* @@ -666,9 +707,10 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) * based on the current entry in the summary, updating * our available_blocks. */ - ASSERT3S(available_blocks, <, 0); + ASSERT(available_blocks < 0 || available_txgs < 0); available_blocks += e->lse_blkcount; - total_flushes += e->lse_mscount; + available_txgs += e->lse_txgcount; + total_flushes += e->lse_msdcount; /* * Keep the running maximum of the total_flushes that @@ -680,8 +722,6 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) */ max_flushes_pertxg = MAX(max_flushes_pertxg, DIV_ROUND_UP(total_flushes, txgs_in_future)); - ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, - max_flushes_pertxg); } return (max_flushes_pertxg); } @@ -743,7 +783,7 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) * request of flushing everything before we attempt to return * immediately. 
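spa_estimate_metaslabs_to_flush() now runs two budgets side by side: the remaining log-block headroom and the remaining dirty-TXG headroom under zfs_unflushed_log_txg_max, and it may only skip ahead to future TXGs while both stay non-negative. A small sketch of the skip computation with made-up numbers:

#include <stdio.h>
#include <stdint.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

/*
 * How many future TXGs can pass before either the block budget or the
 * TXG budget of the log is exhausted, given the incoming block rate.
 */
static uint64_t
txgs_we_can_skip(int64_t available_blocks, int64_t available_txgs,
    uint64_t incoming)
{
	if (available_blocks < 0 || available_txgs < 0)
		return (0);
	if (incoming == 0)
		return ((uint64_t)available_txgs + 1);
	return (MIN((uint64_t)available_txgs + 1,
	    (uint64_t)available_blocks / incoming + 1));
}

int
main(void)
{
	/* 900 blocks of headroom, 50 TXGs of headroom, ~30 blocks/txg. */
	printf("can skip %llu txgs\n",
	    (unsigned long long)txgs_we_can_skip(900, 50, 30));
	return (0);
}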
*/ - if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && + if (BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg && !dmu_objset_is_dirty(spa_meta_objset(spa), txg) && !spa_flush_all_logs_requested(spa)) return; @@ -771,14 +811,11 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) uint64_t want_to_flush; if (spa_flush_all_logs_requested(spa)) { ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED); - want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed); + want_to_flush = UINT64_MAX; } else { want_to_flush = spa_estimate_metaslabs_to_flush(spa); } - ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, - want_to_flush); - /* Used purely for verification purposes */ uint64_t visited = 0; @@ -809,31 +846,22 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa)) break; - mutex_enter(&curr->ms_sync_lock); - mutex_enter(&curr->ms_lock); - boolean_t flushed = metaslab_flush(curr, tx); - mutex_exit(&curr->ms_lock); - mutex_exit(&curr->ms_sync_lock); - - /* - * If we failed to flush a metaslab (because it was loading), - * then we are done with the block heuristic as it's not - * possible to destroy any log space maps once you've skipped - * a metaslab. In that case we just set our counter to 0 but - * we continue looping in case there is still memory pressure - * due to unflushed changes. Note that, flushing a metaslab - * that is not the oldest flushed in the pool, will never - * destroy any log space maps [see spa_cleanup_old_sm_logs()]. - */ - if (!flushed) { - want_to_flush = 0; - } else if (want_to_flush > 0) { - want_to_flush--; - } + if (metaslab_unflushed_dirty(curr)) { + mutex_enter(&curr->ms_sync_lock); + mutex_enter(&curr->ms_lock); + metaslab_flush(curr, tx); + mutex_exit(&curr->ms_lock); + mutex_exit(&curr->ms_sync_lock); + if (want_to_flush > 0) + want_to_flush--; + } else + metaslab_unflushed_bump(curr, tx, B_FALSE); visited++; } ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited); + + spa_log_sm_set_blocklimit(spa); } /* @@ -904,6 +932,7 @@ spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx) avl_remove(&spa->spa_sm_logs_by_txg, sls); space_map_free_obj(mos, sls->sls_sm_obj, tx); VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx)); + spa_log_summary_decrement_blkcount(spa, sls->sls_nblocks); spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks; kmem_free(sls, sizeof (spa_log_sm_t)); } @@ -963,12 +992,7 @@ spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx) VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); - /* - * If the log space map feature was just enabled, the blocklimit - * has not yet been set. 
- */ - if (spa_log_sm_blocklimit(spa) == 0) - spa_log_sm_set_blocklimit(spa); + spa_log_sm_set_blocklimit(spa); } /* @@ -1094,12 +1118,18 @@ spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg) panic("invalid maptype_t"); break; } + if (!metaslab_unflushed_dirty(ms)) { + metaslab_set_unflushed_dirty(ms, B_TRUE); + spa_log_summary_dirty_flushed_metaslab(spa, + metaslab_unflushed_txg(ms)); + } return (0); } static int spa_ld_log_sm_data(spa_t *spa) { + spa_log_sm_t *sls, *psls; int error = 0; /* @@ -1113,50 +1143,98 @@ spa_ld_log_sm_data(spa_t *spa) ASSERT0(spa->spa_unflushed_stats.sus_memused); hrtime_t read_logs_starttime = gethrtime(); - /* this is a no-op when we don't have space map logs */ - for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); - sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { - space_map_t *sm = NULL; - error = space_map_open(&sm, spa_meta_objset(spa), - sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT); - if (error != 0) { - spa_load_failed(spa, "spa_ld_log_sm_data(): failed at " - "space_map_open(obj=%llu) [error %d]", - (u_longlong_t)sls->sls_sm_obj, error); - goto out; + + /* Prefetch log spacemaps dnodes. */ + for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls; + sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + dmu_prefetch_dnode(spa_meta_objset(spa), sls->sls_sm_obj, + ZIO_PRIORITY_SYNC_READ); + } + + uint_t pn = 0; + uint64_t ps = 0; + uint64_t nsm = 0; + psls = sls = avl_first(&spa->spa_sm_logs_by_txg); + while (sls != NULL) { + /* Prefetch log spacemaps up to 16 TXGs or MBs ahead. */ + if (psls != NULL && pn < 16 && + (pn < 2 || ps < 2 * dmu_prefetch_max)) { + error = space_map_open(&psls->sls_sm, + spa_meta_objset(spa), psls->sls_sm_obj, 0, + UINT64_MAX, SPA_MINBLOCKSHIFT); + if (error != 0) { + spa_load_failed(spa, "spa_ld_log_sm_data(): " + "failed at space_map_open(obj=%llu) " + "[error %d]", + (u_longlong_t)sls->sls_sm_obj, error); + goto out; + } + dmu_prefetch(spa_meta_objset(spa), psls->sls_sm_obj, + 0, 0, space_map_length(psls->sls_sm), + ZIO_PRIORITY_ASYNC_READ); + pn++; + ps += space_map_length(psls->sls_sm); + psls = AVL_NEXT(&spa->spa_sm_logs_by_txg, psls); + continue; } + /* Load TXG log spacemap into ms_unflushed_allocs/frees. */ + kpreempt(KPREEMPT_SYNC); + ASSERT0(sls->sls_nblocks); + sls->sls_nblocks = space_map_nblocks(sls->sls_sm); + spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; + summary_add_data(spa, sls->sls_txg, + sls->sls_mscount, 0, sls->sls_nblocks); + + spa_import_progress_set_notes_nolog(spa, + "Read %llu of %lu log space maps", (u_longlong_t)nsm, + avl_numnodes(&spa->spa_sm_logs_by_txg)); + struct spa_ld_log_sm_arg vla = { .slls_spa = spa, .slls_txg = sls->sls_txg }; - error = space_map_iterate(sm, space_map_length(sm), - spa_ld_log_sm_cb, &vla); + error = space_map_iterate(sls->sls_sm, + space_map_length(sls->sls_sm), spa_ld_log_sm_cb, &vla); if (error != 0) { - space_map_close(sm); spa_load_failed(spa, "spa_ld_log_sm_data(): failed " "at space_map_iterate(obj=%llu) [error %d]", (u_longlong_t)sls->sls_sm_obj, error); goto out; } - ASSERT0(sls->sls_nblocks); - sls->sls_nblocks = space_map_nblocks(sm); - spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; - summary_add_data(spa, sls->sls_txg, - sls->sls_mscount, sls->sls_nblocks); + pn--; + ps -= space_map_length(sls->sls_sm); + nsm++; + space_map_close(sls->sls_sm); + sls->sls_sm = NULL; + sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls); - space_map_close(sm); + /* Update log block limits considering just loaded. 
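The rewritten spa_ld_log_sm_data() pipelines the import-time load: it keeps opening and prefetching log space maps up to 16 objects ahead of the one currently being iterated, bounded in bytes by about twice dmu_prefetch_max, so the next reads are already in flight while the current map is processed. A generic sketch of such a bounded read-ahead window, where prefetch() and process() are placeholders:

#include <stdio.h>

#define	NITEMS		8
#define	MAX_AHEAD	3	/* window depth, like the 16-object cap */

static void
prefetch(int i)
{
	printf("prefetch item %d\n", i);	/* would issue async reads */
}

static void
process(int i)
{
	printf("process  item %d\n", i);	/* consumes prefetched data */
}

int
main(void)
{
	int next_prefetch = 0;

	for (int cur = 0; cur < NITEMS; cur++) {
		/* Keep the window ahead of the consumer, but bounded. */
		while (next_prefetch < NITEMS &&
		    next_prefetch - cur < MAX_AHEAD)
			prefetch(next_prefetch++);
		process(cur);
	}
	return (0);
}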
*/ + spa_log_sm_set_blocklimit(spa); } + hrtime_t read_logs_endtime = gethrtime(); spa_load_note(spa, - "read %llu log space maps (%llu total blocks - blksz = %llu bytes) " - "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg), + "Read %lu log space maps (%llu total blocks - blksz = %llu bytes) " + "in %lld ms", avl_numnodes(&spa->spa_sm_logs_by_txg), (u_longlong_t)spa_log_sm_nblocks(spa), (u_longlong_t)zfs_log_sm_blksz, - (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000)); + (longlong_t)NSEC2MSEC(read_logs_endtime - read_logs_starttime)); out: + if (error != 0) { + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + if (sls->sls_sm) { + space_map_close(sls->sls_sm); + sls->sls_sm = NULL; + } + } + } else { + ASSERT0(pn); + ASSERT0(ps); + } /* * Now that the metaslabs contain their unflushed changes: * [1] recalculate their actual allocated space @@ -1237,6 +1315,9 @@ spa_ld_unflushed_txgs(vdev_t *vd) } ms->ms_unflushed_txg = entry.msp_unflushed_txg; + ms->ms_unflushed_dirty = B_FALSE; + ASSERT(range_tree_is_empty(ms->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(ms->ms_unflushed_frees)); if (ms->ms_unflushed_txg != 0) { mutex_enter(&spa->spa_flushed_ms_lock); avl_add(&spa->spa_metaslabs_by_flushed, ms); @@ -1283,40 +1364,44 @@ spa_ld_log_spacemaps(spa_t *spa) } /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, ULONG, ZMOD_RW, - "Specific hard-limit in memory that ZFS allows to be used for " - "unflushed changes"); +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, U64, ZMOD_RW, + "Specific hard-limit in memory that ZFS allows to be used for " + "unflushed changes"); -ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, ULONG, ZMOD_RW, - "Percentage of the overall system memory that ZFS allows to be " - "used for unflushed changes (value is calculated over 1000000 for " - "finer granularity)"); - -ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, ULONG, ZMOD_RW, - "Hard limit (upper-bound) in the size of the space map log " - "in terms of blocks."); +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, U64, ZMOD_RW, + "Percentage of the overall system memory that ZFS allows to be " + "used for unflushed changes (value is calculated over 1000000 for " + "finer granularity)"); -ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, ULONG, ZMOD_RW, - "Lower-bound limit for the maximum amount of blocks allowed in " - "log spacemap (see zfs_unflushed_log_block_max)"); +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, U64, ZMOD_RW, + "Hard limit (upper-bound) in the size of the space map log " + "in terms of blocks."); -ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW, - "Tunable used to determine the number of blocks that can be used for " - "the spacemap log, expressed as a percentage of the total number of " - "metaslabs in the pool (e.g. 
400 means the number of log blocks is " - "capped at 4 times the number of metaslabs)"); +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, U64, ZMOD_RW, + "Lower-bound limit for the maximum amount of blocks allowed in " + "log spacemap (see zfs_unflushed_log_block_max)"); -ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, ULONG, ZMOD_RW, - "The number of past TXGs that the flushing algorithm of the log " - "spacemap feature uses to estimate incoming log blocks"); +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, U64, ZMOD_RW, + "Hard limit (upper-bound) in the size of the space map log " + "in terms of dirty TXGs."); -ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, ULONG, ZMOD_RW, - "Maximum number of rows allowed in the summary of the spacemap log"); +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, UINT, ZMOD_RW, + "Tunable used to determine the number of blocks that can be used for " + "the spacemap log, expressed as a percentage of the total number of " + "metaslabs in the pool (e.g. 400 means the number of log blocks is " + "capped at 4 times the number of metaslabs)"); -ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, ULONG, ZMOD_RW, - "Minimum number of metaslabs to flush per dirty TXG"); +ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, U64, ZMOD_RW, + "The number of past TXGs that the flushing algorithm of the log " + "spacemap feature uses to estimate incoming log blocks"); ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW, - "Prevent the log spacemaps from being flushed and destroyed " - "during pool export/destroy"); + "Prevent the log spacemaps from being flushed and destroyed " + "during pool export/destroy"); /* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, U64, ZMOD_RW, + "Maximum number of rows allowed in the summary of the spacemap log"); + +ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, U64, ZMOD_RW, + "Minimum number of metaslabs to flush per dirty TXG"); diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index 1ecd2294dba0..d1d41bbe7214 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,16 +20,18 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. 
*/ #include <sys/zfs_context.h> +#include <sys/zfs_chksum.h> #include <sys/spa_impl.h> #include <sys/zio.h> #include <sys/zio_checksum.h> @@ -56,6 +58,7 @@ #include <sys/fs/zfs.h> #include <sys/metaslab_impl.h> #include <sys/arc.h> +#include <sys/brt.h> #include <sys/ddt.h> #include <sys/kstat.h> #include "zfs_prop.h" @@ -79,7 +82,8 @@ * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices - * - Held for the duration of create/destroy/import/export + * - Held for the duration of create/destroy + * - Held at the start and end of import and export * * It does not need to handle recursion. A create or destroy may * reference objects (files or zvols) in other pools, but by @@ -232,17 +236,16 @@ * locking is, always, based on spa_namespace_lock and spa_config_lock[]. */ -static avl_tree_t spa_namespace_avl; +avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; -static kcondvar_t spa_namespace_cv; -int spa_max_replication_override = SPA_DVAS_PER_BP; +kcondvar_t spa_namespace_cv; +static const int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; static avl_tree_t spa_spare_avl; static kmutex_t spa_l2cache_lock; static avl_tree_t spa_l2cache_avl; -kmem_cache_t *spa_buffer_pool; spa_mode_t spa_mode_global = SPA_MODE_UNINIT; #ifdef ZFS_DEBUG @@ -304,25 +307,25 @@ int zfs_free_leak_on_eio = B_FALSE; * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting * in one of three behaviors controlled by zfs_deadman_failmode. */ -unsigned long zfs_deadman_synctime_ms = 600000UL; +uint64_t zfs_deadman_synctime_ms = 600000UL; /* 10 min. */ /* * This value controls the maximum amount of time zio_wait() will block for an * outstanding IO. By default this is 300 seconds at which point the "hung" * behavior will be applied as described for zfs_deadman_synctime_ms. */ -unsigned long zfs_deadman_ziotime_ms = 300000UL; +uint64_t zfs_deadman_ziotime_ms = 300000UL; /* 5 min. */ /* * Check time in milliseconds. This defines the frequency at which we check * for hung I/O. */ -unsigned long zfs_deadman_checktime_ms = 60000UL; +uint64_t zfs_deadman_checktime_ms = 60000UL; /* 1 min. */ /* * By default the deadman is enabled. */ -int zfs_deadman_enabled = 1; +int zfs_deadman_enabled = B_TRUE; /* * Controls the behavior of the deadman when it detects a "hung" I/O. @@ -332,7 +335,7 @@ int zfs_deadman_enabled = 1; * continue - Attempt to recover from a "hung" I/O * panic - Panic the system */ -char *zfs_deadman_failmode = "wait"; +const char *zfs_deadman_failmode = "wait"; /* * The worst case is single-sector max-parity RAID-Z blocks, in which @@ -343,7 +346,7 @@ char *zfs_deadman_failmode = "wait"; * the worst case is: * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 */ -int spa_asize_inflation = 24; +uint_t spa_asize_inflation = 24; /* * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in @@ -383,11 +386,21 @@ int spa_asize_inflation = 24; * * See also the comments in zfs_space_check_t. */ -int spa_slop_shift = 5; -uint64_t spa_min_slop = 128ULL * 1024 * 1024; -uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; -int spa_allocators = 4; +uint_t spa_slop_shift = 5; +static const uint64_t spa_min_slop = 128ULL * 1024 * 1024; +static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; +/* + * Number of allocators to use, per spa instance + */ +static int spa_num_allocators = 4; +static int spa_cpus_per_allocator = 4; + +/* + * Spa active allocator. 
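The spa_asize_inflation default of 24 is exactly the worst case stated in the comment: a single-sector block on triple-parity RAID-Z writes parity + 1 = 4 sectors per logical sector, a block pointer can carry 3 DVAs, and the formula applies a further factor of 2, giving (3 + 1) * 3 * 2 = 24. Spelled out with the header values:

#include <stdio.h>

/* Values as defined in the ZFS headers. */
#define	VDEV_RAIDZ_MAXPARITY	3	/* raidz3 */
#define	SPA_DVAS_PER_BP		3	/* up to three copies per blkptr */

int
main(void)
{
	int inflation = (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2;

	printf("worst-case asize inflation: %dx\n", inflation);	/* 24 */
	return (0);
}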
+ * Valid values are zfs_active_allocator=<dynamic|cursor|new-dynamic>. + */ +const char *zfs_active_allocator = "dynamic"; void spa_load_failed(spa_t *spa, const char *fmt, ...) @@ -415,20 +428,22 @@ spa_load_note(spa_t *spa, const char *fmt, ...) zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); + + spa_import_progress_set_notes_nolog(spa, "%s", buf); } /* * By default dedup and user data indirects land in the special class */ -int zfs_ddt_data_is_special = B_TRUE; -int zfs_user_indirect_is_special = B_TRUE; +static int zfs_ddt_data_is_special = B_TRUE; +static int zfs_user_indirect_is_special = B_TRUE; /* * The percentage of special class final space reserved for metadata only. * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only * let metadata into the class. */ -int zfs_special_class_metadata_reserve_pct = 25; +static uint_t zfs_special_class_metadata_reserve_pct = 25; /* * ========================================================================== @@ -462,7 +477,7 @@ spa_config_lock_destroy(spa_t *spa) } int -spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) +spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; @@ -492,9 +507,11 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) return (1); } -void -spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) +static void +spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, + int mmp_flag) { + (void) tag; int wlocks_held = 0; ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); @@ -507,7 +524,8 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { - while (scl->scl_writer || scl->scl_write_wanted) { + while (scl->scl_writer || + (!mmp_flag && scl->scl_write_wanted)) { cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { @@ -526,8 +544,30 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) } void +spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) +{ + spa_config_enter_impl(spa, locks, tag, rw, 0); +} + +/* + * The spa_config_enter_mmp() allows the mmp thread to cut in front of + * outstanding write lock requests. This is needed since the mmp updates are + * time sensitive and failure to service them promptly will result in a + * suspended pool. This pool suspension has been seen in practice when there is + * a single disk in a pool that is responding slowly and presumably about to + * fail. + */ + +void +spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw) +{ + spa_config_enter_impl(spa, locks, tag, rw, 1); +} + +void spa_config_exit(spa_t *spa, int locks, const void *tag) { + (void) tag; for (int i = SCL_LOCKS - 1; i >= 0; i--) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) @@ -581,6 +621,7 @@ spa_lookup(const char *name) ASSERT(MUTEX_HELD(&spa_namespace_lock)); +retry: (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); /* @@ -592,6 +633,20 @@ spa_lookup(const char *name) *cp = '\0'; spa = avl_find(&spa_namespace_avl, &search, &where); + if (spa == NULL) + return (NULL); + + /* + * Avoid racing with import/export, which don't hold the namespace + * lock for their entire duration. 
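[Editorial note] The spa_config_enter_impl() hunk above gives the MMP thread a priority reader path: a normal reader also yields to threads that have signalled write intent (scl_write_wanted), while the MMP reader only waits for an active writer. A minimal userland sketch of that wait condition, using pthreads and hypothetical names (rwl_t, rwl_read_enter) rather than the ZFS primitives, purely for illustration:

/*
 * Illustrative sketch only, not ZFS code.  Mirrors the condition
 * "scl->scl_writer || (!mmp_flag && scl->scl_write_wanted)" added above.
 */
#include <pthread.h>

typedef struct rwl {
	pthread_mutex_t	lock;
	pthread_cond_t	cv;
	int		writer_active;	/* a writer currently holds the lock */
	int		writer_wanted;	/* a writer is queued and waiting */
	int		readers;
} rwl_t;

static void
rwl_read_enter(rwl_t *l, int priority)
{
	pthread_mutex_lock(&l->lock);
	/*
	 * A priority (MMP-style) reader ignores writer_wanted and only
	 * waits for an active writer; a normal reader waits for both.
	 */
	while (l->writer_active || (!priority && l->writer_wanted))
		pthread_cond_wait(&l->cv, &l->lock);
	l->readers++;
	pthread_mutex_unlock(&l->lock);
}

This is why a slow disk can no longer starve the time-sensitive MMP updates behind queued write-lock requests, as the new spa_config_enter_mmp() comment explains.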
+ */ + if ((spa->spa_load_thread != NULL && + spa->spa_load_thread != curthread) || + (spa->spa_export_thread != NULL && + spa->spa_export_thread != curthread)) { + cv_wait(&spa_namespace_cv, &spa_namespace_lock); + goto retry; + } return (spa); } @@ -684,11 +739,13 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms); spa_set_deadman_failmode(spa, zfs_deadman_failmode); + spa_set_allocator(spa, zfs_active_allocator); zfs_refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); spa_stats_init(spa); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); avl_add(&spa_namespace_avl, spa); /* @@ -697,15 +754,25 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) if (altroot) spa->spa_root = spa_strdup(altroot); - spa->spa_alloc_count = spa_allocators; + /* Do not allow more allocators than fraction of CPUs. */ + spa->spa_alloc_count = MAX(MIN(spa_num_allocators, + boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1); + spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count * sizeof (spa_alloc_t), KM_SLEEP); for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, - sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); } + if (spa->spa_alloc_count > 1) { + spa->spa_allocs_use = kmem_zalloc(offsetof(spa_allocs_use_t, + sau_inuse[spa->spa_alloc_count]), KM_SLEEP); + mutex_init(&spa->spa_allocs_use->sau_lock, NULL, MUTEX_DEFAULT, + NULL); + } + avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg, @@ -746,6 +813,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; spa->spa_min_alloc = INT_MAX; + spa->spa_gcd_alloc = INT_MAX; /* Reset cached value */ spa->spa_dedup_dspace = ~0ULL; @@ -783,13 +851,11 @@ spa_remove(spa_t *spa) nvlist_free(spa->spa_config_splitting); avl_remove(&spa_namespace_avl, spa); - cv_broadcast(&spa_namespace_cv); if (spa->spa_root) spa_strfree(spa->spa_root); - while ((dp = list_head(&spa->spa_config_list)) != NULL) { - list_remove(&spa->spa_config_list, dp); + while ((dp = list_remove_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path != NULL) spa_strfree(dp->scd_path); kmem_free(dp, sizeof (spa_config_dirent_t)); @@ -801,6 +867,11 @@ spa_remove(spa_t *spa) } kmem_free(spa->spa_allocs, spa->spa_alloc_count * sizeof (spa_alloc_t)); + if (spa->spa_alloc_count > 1) { + mutex_destroy(&spa->spa_allocs_use->sau_lock); + kmem_free(spa->spa_allocs_use, offsetof(spa_allocs_use_t, + sau_inuse[spa->spa_alloc_count])); + } avl_destroy(&spa->spa_metaslabs_by_flushed); avl_destroy(&spa->spa_sm_logs_by_txg); @@ -875,22 +946,25 @@ spa_next(spa_t *prev) * have the namespace lock held. */ void -spa_open_ref(spa_t *spa, void *tag) +spa_open_ref(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref || - MUTEX_HELD(&spa_namespace_lock)); + MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); (void) zfs_refcount_add(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t. Must have at least one reference, or - * have the namespace lock held. 
+ * have the namespace lock held or be part of a pool import/export. */ void -spa_close(spa_t *spa, void *tag) +spa_close(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || - MUTEX_HELD(&spa_namespace_lock)); + MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread || + spa->spa_export_thread == curthread); (void) zfs_refcount_remove(&spa->spa_refcount, tag); } @@ -903,20 +977,22 @@ spa_close(spa_t *spa, void *tag) * so the asserts in spa_close() do not apply. */ void -spa_async_close(spa_t *spa, void *tag) +spa_async_close(spa_t *spa, const void *tag) { (void) zfs_refcount_remove(&spa->spa_refcount, tag); } /* * Check to see if the spa refcount is zero. Must be called with - * spa_namespace_lock held. We really compare against spa_minref, which is the - * number of references acquired when opening a pool + * spa_namespace_lock held or be the spa export thread. We really + * compare against spa_minref, which is the number of references + * acquired when opening a pool */ boolean_t spa_refcount_zero(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref); } @@ -1164,6 +1240,8 @@ spa_vdev_enter(spa_t *spa) mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); + ASSERT0(spa->spa_export_thread); + vdev_autotrim_stop_all(spa); return (spa_vdev_config_enter(spa)); @@ -1181,6 +1259,8 @@ spa_vdev_detach_enter(spa_t *spa, uint64_t guid) mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); + ASSERT0(spa->spa_export_thread); + vdev_autotrim_stop_all(spa); if (guid != 0) { @@ -1213,7 +1293,8 @@ spa_vdev_config_enter(spa_t *spa) * of multiple transactions without releasing the spa_namespace_lock. */ void -spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) +spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, + const char *tag) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); @@ -1287,7 +1368,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) * If the config changed, update the config cache. 
*/ if (config_changed) - spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); } /* @@ -1382,7 +1463,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) */ if (config_changed) { mutex_enter(&spa_namespace_lock); - spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); mutex_exit(&spa_namespace_lock); } @@ -1476,8 +1557,7 @@ spa_strdup(const char *s) len = strlen(s); new = kmem_alloc(len + 1, KM_SLEEP); - bcopy(s, new, len); - new[len] = '\0'; + memcpy(new, s, len + 1); return (new); } @@ -1512,8 +1592,8 @@ void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) { char type[256]; - char *checksum = NULL; - char *compress = NULL; + const char *checksum = NULL; + const char *compress = NULL; if (bp != NULL) { if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { @@ -1534,7 +1614,7 @@ snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } - SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum, + SNPRINTF_BLKPTR(kmem_scnprintf, ' ', buf, buflen, bp, type, checksum, compress); } @@ -1652,10 +1732,10 @@ spa_altroot(spa_t *spa, char *buf, size_t buflen) if (spa->spa_root == NULL) buf[0] = '\0'; else - (void) strncpy(buf, spa->spa_root, buflen); + (void) strlcpy(buf, spa->spa_root, buflen); } -int +uint32_t spa_sync_pass(spa_t *spa) { return (spa->spa_sync_pass); @@ -1795,7 +1875,8 @@ spa_get_slop_space(spa_t *spa) * deduplicated data, so since it's not useful to reserve more * space with more deduplicated data, we subtract that out here. */ - space = spa_get_dspace(spa) - spa->spa_dedup_dspace; + space = + spa_get_dspace(spa) - spa->spa_dedup_dspace - brt_get_dspace(spa); slop = MIN(space >> spa_slop_shift, spa_max_slop); /* @@ -1832,37 +1913,28 @@ void spa_update_dspace(spa_t *spa) { spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + - ddt_get_dedup_dspace(spa); - if (spa->spa_vdev_removal != NULL) { + ddt_get_dedup_dspace(spa) + brt_get_dspace(spa); + if (spa->spa_nonallocating_dspace > 0) { /* - * We can't allocate from the removing device, so subtract - * its size if it was included in dspace (i.e. if this is a - * normal-class vdev, not special/dedup). This prevents the - * DMU/DSL from filling up the (now smaller) pool while we - * are in the middle of removing the device. + * Subtract the space provided by all non-allocating vdevs that + * contribute to dspace. If a file is overwritten, its old + * blocks are freed and new blocks are allocated. If there are + * no snapshots of the file, the available space should remain + * the same. The old blocks could be freed from the + * non-allocating vdev, but the new blocks must be allocated on + * other (allocating) vdevs. By reserving the entire size of + * the non-allocating vdevs (including allocated space), we + * ensure that there will be enough space on the allocating + * vdevs for this file overwrite to succeed. * * Note that the DMU/DSL doesn't actually know or care * how much space is allocated (it does its own tracking * of how much space has been logically used). So it * doesn't matter that the data we are moving may be - * allocated twice (on the old device and the new - * device). 
- */ - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - vdev_t *vd = - vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); - /* - * If the stars align, we can wind up here after - * vdev_remove_complete() has cleared vd->vdev_mg but before - * spa->spa_vdev_removal gets cleared, so we must check before - * we dereference. + * allocated twice (on the old device and the new device). */ - if (vd->vdev_mg && - vd->vdev_mg->mg_class == spa_normal_class(spa)) { - spa->spa_dspace -= spa_deflate(spa) ? - vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; - } - spa_config_exit(spa, SCL_VDEV, FTAG); + ASSERT3U(spa->spa_dspace, >=, spa->spa_nonallocating_dspace); + spa->spa_dspace -= spa->spa_nonallocating_dspace; } } @@ -2153,6 +2225,7 @@ typedef struct spa_import_progress { uint64_t pool_guid; /* unique id for updates */ char *pool_name; spa_load_state_t spa_load_state; + char *spa_load_notes; uint64_t mmp_sec_remaining; /* MMP activity check */ uint64_t spa_load_max_txg; /* rewind txg */ procfs_list_node_t smh_node; @@ -2163,9 +2236,9 @@ spa_history_list_t *spa_import_progress_list = NULL; static int spa_import_progress_show_header(struct seq_file *f) { - seq_printf(f, "%-20s %-14s %-14s %-12s %s\n", "pool_guid", + seq_printf(f, "%-20s %-14s %-14s %-12s %-16s %s\n", "pool_guid", "load_state", "multihost_secs", "max_txg", - "pool_name"); + "pool_name", "notes"); return (0); } @@ -2174,11 +2247,12 @@ spa_import_progress_show(struct seq_file *f, void *data) { spa_import_progress_t *sip = (spa_import_progress_t *)data; - seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %s\n", + seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %-16s %s\n", (u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state, (u_longlong_t)sip->mmp_sec_remaining, (u_longlong_t)sip->spa_load_max_txg, - (sip->pool_name ? sip->pool_name : "-")); + (sip->pool_name ? sip->pool_name : "-"), + (sip->spa_load_notes ? 
sip->spa_load_notes : "-")); return (0); } @@ -2192,6 +2266,8 @@ spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size) sip = list_remove_head(&shl->procfs_list.pl_list); if (sip->pool_name) spa_strfree(sip->pool_name); + if (sip->spa_load_notes) + kmem_strfree(sip->spa_load_notes); kmem_free(sip, sizeof (spa_import_progress_t)); shl->size--; } @@ -2247,6 +2323,10 @@ spa_import_progress_set_state(uint64_t pool_guid, sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->spa_load_state = load_state; + if (sip->spa_load_notes != NULL) { + kmem_strfree(sip->spa_load_notes); + sip->spa_load_notes = NULL; + } error = 0; break; } @@ -2256,6 +2336,59 @@ spa_import_progress_set_state(uint64_t pool_guid, return (error); } +static void +spa_import_progress_set_notes_impl(spa_t *spa, boolean_t log_dbgmsg, + const char *fmt, va_list adx) +{ + spa_history_list_t *shl = spa_import_progress_list; + spa_import_progress_t *sip; + uint64_t pool_guid = spa_guid(spa); + + if (shl->size == 0) + return; + + char *notes = kmem_vasprintf(fmt, adx); + + mutex_enter(&shl->procfs_list.pl_lock); + for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; + sip = list_prev(&shl->procfs_list.pl_list, sip)) { + if (sip->pool_guid == pool_guid) { + if (sip->spa_load_notes != NULL) { + kmem_strfree(sip->spa_load_notes); + sip->spa_load_notes = NULL; + } + sip->spa_load_notes = notes; + if (log_dbgmsg) + zfs_dbgmsg("'%s' %s", sip->pool_name, notes); + notes = NULL; + break; + } + } + mutex_exit(&shl->procfs_list.pl_lock); + if (notes != NULL) + kmem_strfree(notes); +} + +void +spa_import_progress_set_notes(spa_t *spa, const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + spa_import_progress_set_notes_impl(spa, B_TRUE, fmt, adx); + va_end(adx); +} + +void +spa_import_progress_set_notes_nolog(spa_t *spa, const char *fmt, ...) 
+{ + va_list adx; + + va_start(adx, fmt); + spa_import_progress_set_notes_impl(spa, B_FALSE, fmt, adx); + va_end(adx); +} + int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg) { @@ -2313,7 +2446,7 @@ spa_import_progress_add(spa_t *spa) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; - char *poolname = NULL; + const char *poolname = NULL; sip = kmem_zalloc(sizeof (spa_import_progress_t), KM_SLEEP); sip->pool_guid = spa_guid(spa); @@ -2324,6 +2457,7 @@ spa_import_progress_add(spa_t *spa) poolname = spa_name(spa); sip->pool_name = spa_strdup(poolname); sip->spa_load_state = spa_load_state(spa); + sip->spa_load_notes = NULL; mutex_enter(&shl->procfs_list.pl_lock); procfs_list_add(&shl->procfs_list, sip); @@ -2343,6 +2477,8 @@ spa_import_progress_remove(uint64_t pool_guid) if (sip->pool_guid == pool_guid) { if (sip->pool_name) spa_strfree(sip->pool_name); + if (sip->spa_load_notes) + spa_strfree(sip->spa_load_notes); list_remove(&shl->procfs_list.pl_list, sip); shl->size--; kmem_free(sip, sizeof (spa_import_progress_t)); @@ -2417,18 +2553,20 @@ spa_init(spa_mode_t mode) unique_init(); zfs_btree_init(); metaslab_stat_init(); + brt_init(); ddt_init(); zio_init(); dmu_init(); zil_init(); - vdev_cache_stat_init(); vdev_mirror_stat_init(); vdev_raidz_math_init(); vdev_file_init(); zfs_prop_init(); + chksum_init(); zpool_prop_init(); zpool_feature_init(); spa_config_load(); + vdev_prop_init(); l2arc_start(); scan_init(); qat_init(); @@ -2443,13 +2581,14 @@ spa_fini(void) spa_evict_all(); vdev_file_fini(); - vdev_cache_stat_fini(); vdev_mirror_stat_fini(); vdev_raidz_math_fini(); + chksum_fini(); zil_fini(); dmu_fini(); zio_fini(); ddt_fini(); + brt_fini(); metaslab_stat_fini(); zfs_btree_fini(); unique_fini(); @@ -2557,10 +2696,18 @@ spa_scan_stat_init(spa_t *spa) spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start; else spa->spa_scan_pass_scrub_pause = 0; + + if (dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) + spa->spa_scan_pass_errorscrub_pause = spa->spa_scan_pass_start; + else + spa->spa_scan_pass_errorscrub_pause = 0; + spa->spa_scan_pass_scrub_spent_paused = 0; spa->spa_scan_pass_exam = 0; spa->spa_scan_pass_issued = 0; - vdev_scan_stat_init(spa->spa_root_vdev); + + // error scrub stats + spa->spa_scan_pass_errorscrub_spent_paused = 0; } /* @@ -2571,9 +2718,11 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) { dsl_scan_t *scn = spa->spa_dsl_pool ? 
spa->spa_dsl_pool->dp_scan : NULL; - if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) + if (scn == NULL || (scn->scn_phys.scn_func == POOL_SCAN_NONE && + scn->errorscrub_phys.dep_func == POOL_SCAN_NONE)) return (SET_ERROR(ENOENT)); - bzero(ps, sizeof (pool_scan_stat_t)); + + memset(ps, 0, sizeof (pool_scan_stat_t)); /* data stored on disk */ ps->pss_func = scn->scn_phys.scn_func; @@ -2582,7 +2731,7 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = scn->scn_phys.scn_to_examine; ps->pss_examined = scn->scn_phys.scn_examined; - ps->pss_to_process = scn->scn_phys.scn_to_process; + ps->pss_skipped = scn->scn_phys.scn_skipped; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; @@ -2595,6 +2744,18 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) ps->pss_issued = scn->scn_issued_before_pass + spa->spa_scan_pass_issued; + /* error scrub data stored on disk */ + ps->pss_error_scrub_func = scn->errorscrub_phys.dep_func; + ps->pss_error_scrub_state = scn->errorscrub_phys.dep_state; + ps->pss_error_scrub_start = scn->errorscrub_phys.dep_start_time; + ps->pss_error_scrub_end = scn->errorscrub_phys.dep_end_time; + ps->pss_error_scrub_examined = scn->errorscrub_phys.dep_examined; + ps->pss_error_scrub_to_be_examined = + scn->errorscrub_phys.dep_to_examine; + + /* error scrub data not stored on disk */ + ps->pss_pass_error_scrub_pause = spa->spa_scan_pass_errorscrub_pause; + return (0); } @@ -2714,8 +2875,7 @@ spa_state_to_name(spa_t *spa) vdev_state_t state = rvd->vdev_state; vdev_aux_t aux = rvd->vdev_stat.vs_aux; - if (spa_suspended(spa) && - (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)) + if (spa_suspended(spa)) return ("SUSPENDED"); switch (state) { @@ -2926,13 +3086,13 @@ ZFS_MODULE_PARAM(zfs, zfs_, recover, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, free_leak_on_eio, INT, ZMOD_RW, "Set to ignore IO errors during free and permanently leak the space"); -ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, checktime_ms, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, checktime_ms, U64, ZMOD_RW, "Dead I/O check interval in milliseconds"); ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, enabled, INT, ZMOD_RW, "Enable deadman timer"); -ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, UINT, ZMOD_RW, "SPA size estimate multiplication factor"); ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW, @@ -2947,17 +3107,23 @@ ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode, "Failmode for deadman timer"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, synctime_ms, - param_set_deadman_synctime, param_get_ulong, ZMOD_RW, + param_set_deadman_synctime, spl_param_get_u64, ZMOD_RW, "Pool sync expiration time in milliseconds"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, ziotime_ms, - param_set_deadman_ziotime, param_get_ulong, ZMOD_RW, + param_set_deadman_ziotime, spl_param_get_u64, ZMOD_RW, "IO expiration time in milliseconds"); -ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, UINT, ZMOD_RW, "Small file blocks in special vdevs depends on this much " "free space available"); /* END CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, - param_get_int, ZMOD_RW, "Reserved free space in pool"); + param_get_uint, ZMOD_RW, "Reserved free space in pool"); + 
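[Editorial note] The spa_num_allocators and spa_cpus_per_allocator tunables registered just below feed the clamp added to spa_add() earlier in this diff: spa_alloc_count = MAX(MIN(spa_num_allocators, boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1). A small standalone sketch of that computation, with a hypothetical helper name, for illustration only:

/*
 * Illustrative sketch of the allocator-count clamp used by spa_add();
 * alloc_count_for() is a hypothetical helper, not ZFS code.
 */
#define	XMIN(a, b)	((a) < (b) ? (a) : (b))
#define	XMAX(a, b)	((a) > (b) ? (a) : (b))

static int
alloc_count_for(int num_allocators, int cpus_per_allocator, int ncpus)
{
	/* Cap allocators at ncpus / cpus_per_allocator, keep at least one. */
	return (XMAX(XMIN(num_allocators,
	    ncpus / XMAX(cpus_per_allocator, 1)), 1));
}

/*
 * With the defaults (4 allocators, 4 CPUs per allocator):
 *   32 CPUs -> XMIN(4, 8) = 4 allocators
 *    4 CPUs -> XMIN(4, 1) = 1 allocator
 */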
+ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW, + "Number of allocators per spa"); + +ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW, + "Minimum number of CPUs per allocators"); diff --git a/sys/contrib/openzfs/module/zfs/spa_stats.c b/sys/contrib/openzfs/module/zfs/spa_stats.c index 534ac72fee7b..17ed2a620b1e 100644 --- a/sys/contrib/openzfs/module/zfs/spa_stats.c +++ b/sys/contrib/openzfs/module/zfs/spa_stats.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -28,22 +28,22 @@ /* * Keeps stats on last N reads per spa_t, disabled by default. */ -int zfs_read_history = 0; +static uint_t zfs_read_history = B_FALSE; /* * Include cache hits in history, disabled by default. */ -int zfs_read_history_hits = 0; +static int zfs_read_history_hits = B_FALSE; /* * Keeps stats on the last 100 txgs by default. */ -int zfs_txg_history = 100; +static uint_t zfs_txg_history = 100; /* * Keeps stats on the last N MMP updates, disabled by default. */ -int zfs_multihost_history = 0; +static uint_t zfs_multihost_history = B_FALSE; /* * ========================================================================== @@ -819,6 +819,41 @@ spa_state_init(spa_t *spa) kmem_strfree(name); } +static int +spa_guid_data(char *buf, size_t size, void *data) +{ + spa_t *spa = (spa_t *)data; + (void) snprintf(buf, size, "%llu\n", (u_longlong_t)spa_guid(spa)); + return (0); +} + +static void +spa_guid_init(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.guid; + char *name; + kstat_t *ksp; + + mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); + + name = kmem_asprintf("zfs/%s", spa_name(spa)); + + ksp = kstat_create(name, 0, "guid", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + + shk->kstat = ksp; + if (ksp) { + ksp->ks_lock = &shk->lock; + ksp->ks_data = NULL; + ksp->ks_private = spa; + ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS; + kstat_set_raw_ops(ksp, NULL, spa_guid_data, spa_state_addr); + kstat_install(ksp); + } + + kmem_strfree(name); +} + static void spa_health_destroy(spa_t *spa) { @@ -830,7 +865,18 @@ spa_health_destroy(spa_t *spa) mutex_destroy(&shk->lock); } -static spa_iostats_t spa_iostats_template = { +static void +spa_guid_destroy(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.guid; + kstat_t *ksp = shk->kstat; + if (ksp) + kstat_delete(ksp); + + mutex_destroy(&shk->lock); +} + +static const spa_iostats_t spa_iostats_template = { { "trim_extents_written", KSTAT_DATA_UINT64 }, { "trim_bytes_written", KSTAT_DATA_UINT64 }, { "trim_extents_skipped", KSTAT_DATA_UINT64 }, @@ -950,6 +996,7 @@ spa_stats_init(spa_t *spa) spa_tx_assign_init(spa); spa_mmp_history_init(spa); spa_state_init(spa); + spa_guid_init(spa); spa_iostats_init(spa); } @@ -962,18 +1009,17 @@ spa_stats_destroy(spa_t *spa) spa_txg_history_destroy(spa); spa_read_history_destroy(spa); spa_mmp_history_destroy(spa); + spa_guid_destroy(spa); } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, read_history, INT, ZMOD_RW, - "Historical statistics for the last N reads"); +ZFS_MODULE_PARAM(zfs, zfs_, read_history, UINT, ZMOD_RW, + "Historical statistics for the last N reads"); ZFS_MODULE_PARAM(zfs, zfs_, read_history_hits, INT, ZMOD_RW, - "Include cache hits in read history"); + "Include cache hits in read 
history"); -ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, history, INT, ZMOD_RW, - "Historical statistics for the last N txgs"); +ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, history, UINT, ZMOD_RW, + "Historical statistics for the last N txgs"); -ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, history, INT, ZMOD_RW, - "Historical statistics for last N multihost writes"); -/* END CSTYLED */ +ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, history, UINT, ZMOD_RW, + "Historical statistics for last N multihost writes"); diff --git a/sys/contrib/openzfs/module/zfs/space_map.c b/sys/contrib/openzfs/module/zfs/space_map.c index 11d4798925e4..a336ff41eadb 100644 --- a/sys/contrib/openzfs/module/zfs/space_map.c +++ b/sys/contrib/openzfs/module/zfs/space_map.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -455,7 +455,8 @@ space_map_histogram_clear(space_map_t *sm) if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) return; - bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram)); + memset(sm->sm_phys->smp_histogram, 0, + sizeof (sm->sm_phys->smp_histogram)); } boolean_t @@ -548,7 +549,7 @@ space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx) static void space_map_write_seg(space_map_t *sm, uint64_t rstart, uint64_t rend, maptype_t maptype, uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, - void *tag, dmu_tx_t *tx) + const void *tag, dmu_tx_t *tx) { ASSERT3U(words, !=, 0); ASSERT3U(words, <=, 2); @@ -896,7 +897,7 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) * will be reset. Do the same in the common case so that * bugs related to the uncommon case do not go unnoticed. */ - bzero(sm->sm_phys->smp_histogram, + memset(sm->sm_phys->smp_histogram, 0, sizeof (sm->sm_phys->smp_histogram)); } diff --git a/sys/contrib/openzfs/module/zfs/space_reftree.c b/sys/contrib/openzfs/module/zfs/space_reftree.c index 080fc6646512..ee11e162dd5b 100644 --- a/sys/contrib/openzfs/module/zfs/space_reftree.c +++ b/sys/contrib/openzfs/module/zfs/space_reftree.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/txg.c b/sys/contrib/openzfs/module/zfs/txg.c index c9eb84bbdb12..5ce6be69be14 100644 --- a/sys/contrib/openzfs/module/zfs/txg.c +++ b/sys/contrib/openzfs/module/zfs/txg.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -108,10 +108,10 @@ * now transition to the syncing state. 
*/ -static void txg_sync_thread(void *arg); -static void txg_quiesce_thread(void *arg); +static __attribute__((noreturn)) void txg_sync_thread(void *arg); +static __attribute__((noreturn)) void txg_quiesce_thread(void *arg); -int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */ +uint_t zfs_txg_timeout = 5; /* max seconds worth of delta per txg */ /* * Prepare the txg subsystem. @@ -121,7 +121,7 @@ txg_init(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; int c; - bzero(tx, sizeof (tx_state_t)); + memset(tx, 0, sizeof (tx_state_t)); tx->tx_cpu = vmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); @@ -186,7 +186,7 @@ txg_fini(dsl_pool_t *dp) vmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); - bzero(tx, sizeof (tx_state_t)); + memset(tx, 0, sizeof (tx_state_t)); } /* @@ -429,7 +429,7 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) } static void -txg_do_callbacks(list_t *cb_list) +txg_do_callbacks(void *cb_list) { dmu_tx_do_callbacks(cb_list, 0); @@ -479,7 +479,7 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) list_move_tail(cb_list, &tc->tc_callbacks[g]); - (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) + (void) taskq_dispatch(tx->tx_commit_cb_taskq, txg_do_callbacks, cb_list, TQ_SLEEP); } } @@ -514,7 +514,7 @@ txg_has_quiesced_to_sync(dsl_pool_t *dp) return (tx->tx_quiesced_txg != 0); } -static void +static __attribute__((noreturn)) void txg_sync_thread(void *arg) { dsl_pool_t *dp = arg; @@ -551,6 +551,15 @@ txg_sync_thread(void *arg) } /* + * When we're suspended, nothing should be changing and for + * MMP we don't want to bump anything that would make it + * harder to detect if another host is changing it when + * resuming after a MMP suspend. + */ + if (spa_suspended(spa)) + continue; + + /* * Wait until the quiesce thread hands off a txg to us, * prompting it to do so if necessary. */ @@ -605,7 +614,7 @@ txg_sync_thread(void *arg) } } -static void +static __attribute__((noreturn)) void txg_quiesce_thread(void *arg) { dsl_pool_t *dp = arg; @@ -895,15 +904,10 @@ txg_list_destroy(txg_list_t *tl) boolean_t txg_all_lists_empty(txg_list_t *tl) { - mutex_enter(&tl->tl_lock); - for (int i = 0; i < TXG_SIZE; i++) { - if (!txg_list_empty_impl(tl, i)) { - mutex_exit(&tl->tl_lock); - return (B_FALSE); - } - } - mutex_exit(&tl->tl_lock); - return (B_TRUE); + boolean_t res = B_TRUE; + for (int i = 0; i < TXG_SIZE; i++) + res &= (tl->tl_head[i] == NULL); + return (res); } /* @@ -1069,7 +1073,5 @@ EXPORT_SYMBOL(txg_wait_callbacks); EXPORT_SYMBOL(txg_stalled); EXPORT_SYMBOL(txg_sync_waiting); -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, timeout, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, timeout, UINT, ZMOD_RW, "Max seconds worth of delta per txg"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/uberblock.c b/sys/contrib/openzfs/module/zfs/uberblock.c index b8857d74d810..22ee8036c473 100644 --- a/sys/contrib/openzfs/module/zfs/uberblock.c +++ b/sys/contrib/openzfs/module/zfs/uberblock.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -70,5 +70,5 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay) } ub->ub_checkpoint_txg = 0; - return (ub->ub_rootbp.blk_birth == txg); + return (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) == txg); } diff --git a/sys/contrib/openzfs/module/zfs/unique.c b/sys/contrib/openzfs/module/zfs/unique.c index 0e076797a002..799e4095db33 100644 --- a/sys/contrib/openzfs/module/zfs/unique.c +++ b/sys/contrib/openzfs/module/zfs/unique.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index 4a67ba85f58a..c74f72159dc9 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -28,7 +28,8 @@ * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. - * Copyright [2021] Hewlett Packard Enterprise Development LP + * Copyright (c) 2021, Klara Inc. + * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. */ #include <sys/zfs_context.h> @@ -57,8 +58,10 @@ #include <sys/abd.h> #include <sys/vdev_initialize.h> #include <sys/vdev_trim.h> +#include <sys/vdev_raidz.h> #include <sys/zvol.h> #include <sys/zfs_ratelimit.h> +#include "zfs_prop.h" /* * One metaslab from each (normal-class) vdev is used by the ZIL. These are @@ -79,22 +82,22 @@ * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab. */ -int zfs_embedded_slog_min_ms = 64; +static uint_t zfs_embedded_slog_min_ms = 64; /* default target for number of metaslabs per top-level vdev */ -int zfs_vdev_default_ms_count = 200; +static uint_t zfs_vdev_default_ms_count = 200; /* minimum number of metaslabs per top-level vdev */ -int zfs_vdev_min_ms_count = 16; +static uint_t zfs_vdev_min_ms_count = 16; /* practical upper limit of total metaslabs per top-level vdev */ -int zfs_vdev_ms_count_limit = 1ULL << 17; +static uint_t zfs_vdev_ms_count_limit = 1ULL << 17; /* lower limit for metaslab size (512M) */ -int zfs_vdev_default_ms_shift = 29; +static uint_t zfs_vdev_default_ms_shift = 29; /* upper limit for metaslab size (16G) */ -int zfs_vdev_max_ms_shift = 34; +static uint_t zfs_vdev_max_ms_shift = 34; int vdev_validate_skip = B_FALSE; @@ -107,18 +110,23 @@ int zfs_vdev_dtl_sm_blksz = (1 << 12); /* * Rate limit slow IO (delay) events to this many per second. */ -unsigned int zfs_slow_io_events_per_second = 20; +static unsigned int zfs_slow_io_events_per_second = 20; + +/* + * Rate limit deadman "hung IO" events to this many per second. + */ +static unsigned int zfs_deadman_events_per_second = 1; /* * Rate limit checksum events after this many checksum errors per second. 
*/ -unsigned int zfs_checksum_events_per_second = 20; +static unsigned int zfs_checksum_events_per_second = 20; /* * Ignore errors during scrub/resilver. Allows to work around resilver * upon import when there are pool errors. */ -int zfs_scan_ignore_errors = 0; +static int zfs_scan_ignore_errors = 0; /* * vdev-wide space maps that have lots of entries written to them at @@ -134,8 +142,16 @@ int zfs_vdev_standard_sm_blksz = (1 << 17); */ int zfs_nocacheflush = 0; -uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX; -uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; +/* + * Maximum and minimum ashift values that can be automatically set based on + * vdev's physical ashift (disk's physical sector size). While ASHIFT_MAX + * is higher than the maximum value, it is intentionally limited here to not + * excessively impact pool space efficiency. Higher ashift values may still + * be forced by vdev logical ashift or by user via ashift property, but won't + * be set automatically as a performance optimization. + */ +uint_t zfs_vdev_max_auto_ashift = 14; +uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) @@ -214,7 +230,7 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent) * Virtual device management. */ -static vdev_ops_t *vdev_ops_table[] = { +static vdev_ops_t *const vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_draid_ops, @@ -236,7 +252,7 @@ static vdev_ops_t *vdev_ops_table[] = { static vdev_ops_t * vdev_getops(const char *type) { - vdev_ops_t *ops, **opspp; + vdev_ops_t *ops, *const *opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) @@ -261,11 +277,12 @@ vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) return (vd->vdev_mg); } -/* ARGSUSED */ void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { + (void) vd, (void) remain_rs; + physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; } @@ -294,13 +311,13 @@ vdev_derive_alloc_bias(const char *bias) * all children. This is what's used by anything other than RAID-Z. */ uint64_t -vdev_default_asize(vdev_t *vd, uint64_t psize) +vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { - csize = vdev_psize_to_asize(vd->vdev_child[c], psize); + csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg); asize = MAX(asize, csize); } @@ -336,7 +353,8 @@ vdev_get_min_asize(vdev_t *vd) * to the nearest metaslab. 
*/ if (vd == vd->vdev_top) - return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); + return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift, + uint64_t)); return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } @@ -378,6 +396,33 @@ vdev_get_nparity(vdev_t *vd) return (nparity); } +static int +vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + uint64_t objid; + int err; + + if (vd->vdev_root_zap != 0) { + objid = vd->vdev_root_zap; + } else if (vd->vdev_top_zap != 0) { + objid = vd->vdev_top_zap; + } else if (vd->vdev_leaf_zap != 0) { + objid = vd->vdev_leaf_zap; + } else { + return (EINVAL); + } + + err = zap_lookup(mos, objid, vdev_prop_to_name(prop), + sizeof (uint64_t), 1, value); + + if (err == ENOENT) + *value = vdev_prop_default_numeric(prop); + + return (err); +} + /* * Get the number of data disks for a top-level vdev. */ @@ -472,7 +517,7 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd) newchild = kmem_alloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { - bcopy(pvd->vdev_child, newchild, oldsize); + memcpy(newchild, pvd->vdev_child, oldsize); kmem_free(pvd->vdev_child, oldsize); } @@ -626,11 +671,21 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) */ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, 1); - zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_slow_io_events_per_second, + zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); + /* + * Default Thresholds for tuning ZED + */ + vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N); + vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T); + vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N); + vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T); + vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N); + vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T); + list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); list_link_init(&vd->vdev_initialize_node); @@ -652,6 +707,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); + cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); @@ -668,7 +724,6 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); - vdev_cache_init(vd); return (vd); } @@ -683,11 +738,11 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; - char *type; + const char *type; uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; - char *tmp = NULL; + const char *tmp = NULL; int rc; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; boolean_t top_level = (parent && !parent->vdev_parent); @@ -742,7 +797,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, return (SET_ERROR(ENOTSUP)); if (top_level && alloctype == VDEV_ALLOC_ADD) { - char *bias; + const char *bias; /* * If creating a top-level vdev, check for allocation @@ -788,8 +843,8 
@@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) - vd->vdev_path = spa_strdup(vd->vdev_path); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0) + vd->vdev_path = spa_strdup(tmp); /* * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a @@ -803,18 +858,17 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) - vd->vdev_devid = spa_strdup(vd->vdev_devid); - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, - &vd->vdev_physpath) == 0) - vd->vdev_physpath = spa_strdup(vd->vdev_physpath); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0) + vd->vdev_devid = spa_strdup(tmp); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0) + vd->vdev_physpath = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, - &vd->vdev_enc_sysfs_path) == 0) - vd->vdev_enc_sysfs_path = spa_strdup(vd->vdev_enc_sysfs_path); + &tmp) == 0) + vd->vdev_enc_sysfs_path = spa_strdup(tmp); - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) - vd->vdev_fru = spa_strdup(vd->vdev_fru); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0) + vd->vdev_fru = spa_strdup(tmp); /* * Set the whole_disk property. If it's not specified, leave the value @@ -844,9 +898,15 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_not_present); /* - * Get the alignment requirement. + * Get the alignment requirement. Ignore pool ashift for vdev + * attach case. */ - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); + if (alloctype != VDEV_ALLOC_ATTACH) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, + &vd->vdev_ashift); + } else { + vd->vdev_attaching = B_TRUE; + } /* * Retrieve the vdev creation time. @@ -854,6 +914,14 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); + if (vd->vdev_ops == &vdev_root_ops && + (alloctype == VDEV_ALLOC_LOAD || + alloctype == VDEV_ALLOC_SPLIT || + alloctype == VDEV_ALLOC_ROOTPOOL)) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP, + &vd->vdev_root_zap); + } + /* * If we're a top-level vdev, try to load the allocation parameters. */ @@ -865,10 +933,14 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, + &vd->vdev_noalloc); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); + vd->vdev_rz_expanding = nvlist_exists(nv, + ZPOOL_CONFIG_RAIDZ_EXPANDING); } else { ASSERT0(vd->vdev_top_zap); } @@ -943,7 +1015,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { - char *aux; + const char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; @@ -1040,7 +1112,6 @@ vdev_free(vdev_t *vd) * Clean up vdev structure. 
*/ vdev_queue_fini(vd); - vdev_cache_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); @@ -1103,6 +1174,7 @@ vdev_free(vdev_t *vd) mutex_destroy(&vd->vdev_trim_io_lock); cv_destroy(&vd->vdev_trim_cv); cv_destroy(&vd->vdev_autotrim_cv); + cv_destroy(&vd->vdev_autotrim_kick_cv); cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); @@ -1131,7 +1203,6 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) ASSERT(tvd == tvd->vdev_top); - tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite; tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; @@ -1183,8 +1254,10 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); + ASSERT0(tvd->vdev_noalloc); ASSERT0(tvd->vdev_removing); ASSERT0(tvd->vdev_rebuilding); + tvd->vdev_noalloc = svd->vdev_noalloc; tvd->vdev_removing = svd->vdev_removing; tvd->vdev_rebuilding = svd->vdev_rebuilding; tvd->vdev_rebuild_config = svd->vdev_rebuild_config; @@ -1200,6 +1273,7 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) svd->vdev_indirect_mapping = NULL; svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; + svd->vdev_noalloc = 0; svd->vdev_removing = 0; svd->vdev_rebuilding = 0; @@ -1335,6 +1409,36 @@ vdev_remove_parent(vdev_t *cvd) vdev_free(mvd); } +/* + * Choose GCD for spa_gcd_alloc. + */ +static uint64_t +vdev_gcd(uint64_t a, uint64_t b) +{ + while (b != 0) { + uint64_t t = b; + b = a % b; + a = t; + } + return (a); +} + +/* + * Set spa_min_alloc and spa_gcd_alloc. + */ +static void +vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) +{ + if (min_alloc < spa->spa_min_alloc) + spa->spa_min_alloc = min_alloc; + if (spa->spa_gcd_alloc == INT_MAX) { + spa->spa_gcd_alloc = min_alloc; + } else { + spa->spa_gcd_alloc = vdev_gcd(min_alloc, + spa->spa_gcd_alloc); + } +} + void vdev_metaslab_group_create(vdev_t *vd) { @@ -1387,8 +1491,7 @@ vdev_metaslab_group_create(vdev_t *vd) spa->spa_min_ashift = vd->vdev_ashift; uint64_t min_alloc = vdev_get_min_alloc(vd); - if (min_alloc < spa->spa_min_alloc) - spa->spa_min_alloc = min_alloc; + vdev_spa_set_alloc(spa, min_alloc); } } } @@ -1418,7 +1521,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (expanding) { - bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); + memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } @@ -1498,11 +1601,15 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* - * If the vdev is being removed we don't activate - * the metaslabs since we want to ensure that no new - * allocations are performed on this device. + * If the vdev is marked as non-allocating then don't + * activate the metaslabs since we want to ensure that + * no allocations are performed on this device. */ - if (!expanding && !vd->vdev_removing) { + if (vd->vdev_noalloc) { + /* track non-allocating vdev space */ + spa->spa_nonallocating_dspace += spa_deflate(spa) ? 
+ vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + } else if (!expanding) { metaslab_group_activate(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_group_activate(vd->vdev_log_mg); @@ -1511,13 +1618,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); - /* - * Regardless whether this vdev was just added or it is being - * expanded, the metaslab count has changed. Recalculate the - * block limit. - */ - spa_log_sm_set_blocklimit(spa); - return (0); } @@ -1565,12 +1665,12 @@ vdev_metaslab_fini(vdev_t *vd) } } ASSERT0(vd->vdev_ms_count); - ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; + boolean_t vps_zio_done_probe; int vps_flags; } vdev_probe_stats_t; @@ -1604,6 +1704,8 @@ vdev_probe_done(zio_t *zio) vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; + vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u", + vd->vdev_cant_read, vd->vdev_cant_write); if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { @@ -1614,6 +1716,17 @@ vdev_probe_done(zio_t *zio) (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); + + /* + * If this probe was initiated from zio pipeline, then + * change the state in a spa_async_request. Probes that + * were initiated from a vdev_open can change the state + * as part of the open call. + */ + if (vps->vps_zio_done_probe) { + vd->vdev_fault_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_FAULT_VDEV); + } } mutex_enter(&vd->vdev_probe_lock); @@ -1663,8 +1776,8 @@ vdev_probe(vdev_t *vd, zio_t *zio) vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | - ZIO_FLAG_TRYHARD; + ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; + vps->vps_zio_done_probe = (zio != NULL); if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* @@ -1691,15 +1804,6 @@ vdev_probe(vdev_t *vd, zio_t *zio) vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); - - /* - * We can't change the vdev state in this context, so we - * kick off an async task to do it on our behalf. - */ - if (zio != NULL) { - vd->vdev_probe_wanted = B_TRUE; - spa_async_request(spa, SPA_ASYNC_PROBE); - } } if (zio != NULL) @@ -1767,6 +1871,7 @@ vdev_uses_zvols(vdev_t *vd) static boolean_t vdev_default_open_children_func(vdev_t *vd) { + (void) vd; return (B_TRUE); } @@ -1825,21 +1930,42 @@ vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) } /* - * Compute the raidz-deflation ratio. Note, we hard-code - * in 128k (1 << 17) because it is the "typical" blocksize. - * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, - * otherwise it would inconsistently account for existing bp's. + * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17) + * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE + * changed, this algorithm can not change, otherwise it would inconsistently + * account for existing bp's. We also hard-code txg 0 for the same reason + * since expanded RAIDZ vdevs can use a different asize for different birth + * txg's. 
*/ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / - (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); + (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> + SPA_MINBLOCKSHIFT); } } /* + * Choose the best of two ashifts, preferring one between logical ashift + * (absolute minimum) and administrator defined maximum, otherwise take + * the biggest of the two. + */ +uint64_t +vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b) +{ + if (a > logical && a <= zfs_vdev_max_auto_ashift) { + if (b <= logical || b > zfs_vdev_max_auto_ashift) + return (a); + else + return (MAX(a, b)); + } else if (b <= logical || b > zfs_vdev_max_auto_ashift) + return (MAX(a, b)); + return (b); +} + +/* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining * administrator defined limits and ensuring it doesn't go below the @@ -1850,7 +1976,8 @@ vdev_ashift_optimize(vdev_t *vd) { ASSERT(vd == vd->vdev_top); - if (vd->vdev_ashift < vd->vdev_physical_ashift) { + if (vd->vdev_ashift < vd->vdev_physical_ashift && + vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) { vd->vdev_ashift = MIN( MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_min_auto_ashift, @@ -1915,6 +2042,14 @@ vdev_open(vdev_t *vd) error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &logical_ashift, &physical_ashift); + + /* Keep the device in removed state if unplugged */ + if (error == ENOENT && vd->vdev_removed) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED, + VDEV_AUX_NONE); + return (error); + } + /* * Physical volume size should never be larger than its max size, unless * the disk has shrunk while we were reading it or the device is buggy @@ -1986,8 +2121,8 @@ vdev_open(vdev_t *vd) } } - osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); - max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); + osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t); + max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { @@ -2062,9 +2197,9 @@ vdev_open(vdev_t *vd) return (SET_ERROR(EDOM)); } - if (vd->vdev_top == vd) { + if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE) vdev_ashift_optimize(vd); - } + vd->vdev_attaching = B_FALSE; } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { @@ -2125,8 +2260,7 @@ vdev_open(vdev_t *vd) if (vd->vdev_top == vd && vd->vdev_ashift != 0 && vd->vdev_islog == 0 && vd->vdev_aux == NULL) { uint64_t min_alloc = vdev_get_min_alloc(vd); - if (min_alloc < spa->spa_min_alloc) - spa->spa_min_alloc = min_alloc; + vdev_spa_set_alloc(spa, min_alloc); } /* @@ -2371,22 +2505,36 @@ vdev_validate(vdev_t *vd) } static void -vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) +vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid) { - char *old, *new; - if (svd->vdev_path != NULL && dvd->vdev_path != NULL) { - if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) { - zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed " - "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, - dvd->vdev_path, svd->vdev_path); - spa_strfree(dvd->vdev_path); - dvd->vdev_path = spa_strdup(svd->vdev_path); + if (svd != NULL && *dvd != NULL) { + if (strcmp(svd, *dvd) != 0) { + zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed " + "from '%s' to '%s'", (u_longlong_t)guid, prefix, + *dvd, 
svd); + spa_strfree(*dvd); + *dvd = spa_strdup(svd); } - } else if (svd->vdev_path != NULL) { - dvd->vdev_path = spa_strdup(svd->vdev_path); + } else if (svd != NULL) { + *dvd = spa_strdup(svd); zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", - (u_longlong_t)dvd->vdev_guid, dvd->vdev_path); + (u_longlong_t)guid, *dvd); } +} + +static void +vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) +{ + char *old, *new; + + vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path, + dvd->vdev_guid); + + vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid, + dvd->vdev_guid); + + vdev_update_path("vdev_physpath", svd->vdev_physpath, + &dvd->vdev_physpath, dvd->vdev_guid); /* * Our enclosure sysfs path may have changed between imports @@ -2527,8 +2675,6 @@ vdev_close(vdev_t *vd) vd->vdev_ops->vdev_op_close(vd); - vdev_cache_purge(vd); - /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that @@ -2615,6 +2761,17 @@ vdev_reopen(vdev_t *vd) } /* + * Recheck if resilver is still needed and cancel any + * scheduled resilver if resilver is unneeded. + */ + if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) && + spa->spa_async_tasks & SPA_ASYNC_RESILVER) { + mutex_enter(&spa->spa_async_lock); + spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER; + mutex_exit(&spa->spa_async_lock); + } + + /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); @@ -2848,6 +3005,8 @@ boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { + (void) dva, (void) psize; + /* Set by sequential resilver. */ if (phys_birth == TXG_UNKNOWN) return (B_TRUE); @@ -3103,32 +3262,71 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); - return; + } else { + mutex_enter(&vd->vdev_dtl_lock); + for (int t = 0; t < DTL_TYPES; t++) { + /* account for child's outage in parent's missing map */ + int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; + if (t == DTL_SCRUB) { + /* leaf vdevs only */ + continue; + } + if (t == DTL_PARTIAL) { + /* i.e. non-zero */ + minref = 1; + } else if (vdev_get_nparity(vd) != 0) { + /* RAIDZ, DRAID */ + minref = vdev_get_nparity(vd) + 1; + } else { + /* any kind of mirror */ + minref = vd->vdev_children; + } + space_reftree_create(&reftree); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + mutex_enter(&cvd->vdev_dtl_lock); + space_reftree_add_map(&reftree, + cvd->vdev_dtl[s], 1); + mutex_exit(&cvd->vdev_dtl_lock); + } + space_reftree_generate_map(&reftree, + vd->vdev_dtl[t], minref); + space_reftree_destroy(&reftree); + } + mutex_exit(&vd->vdev_dtl_lock); } - mutex_enter(&vd->vdev_dtl_lock); - for (int t = 0; t < DTL_TYPES; t++) { - /* account for child's outage in parent's missing map */ - int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; - if (t == DTL_SCRUB) - continue; /* leaf vdevs only */ - if (t == DTL_PARTIAL) - minref = 1; /* i.e. 
non-zero */ - else if (vdev_get_nparity(vd) != 0) - minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */ - else - minref = vd->vdev_children; /* any kind of mirror */ - space_reftree_create(&reftree); - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - mutex_enter(&cvd->vdev_dtl_lock); - space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); - mutex_exit(&cvd->vdev_dtl_lock); - } - space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); - space_reftree_destroy(&reftree); + if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) { + raidz_dtl_reassessed(vd); } - mutex_exit(&vd->vdev_dtl_lock); +} + +/* + * Iterate over all the vdevs except spare, and post kobj events + */ +void +vdev_post_kobj_evt(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_kobj_evt_post && + vd->vdev_kobj_flag == B_FALSE) { + vd->vdev_kobj_flag = B_TRUE; + vd->vdev_ops->vdev_op_kobj_evt_post(vd); + } + + for (int c = 0; c < vd->vdev_children; c++) + vdev_post_kobj_evt(vd->vdev_child[c]); +} + +/* + * Iterate over all the vdevs except spare, and clear kobj events + */ +void +vdev_clear_kobj_evt(vdev_t *vd) +{ + vd->vdev_kobj_flag = B_FALSE; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_clear_kobj_evt(vd->vdev_child[c]); } int @@ -3242,6 +3440,12 @@ vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) vdev_zap_allocation_data(vd, tx); } } + if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap == 0 && + spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { + if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) + spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx); + vd->vdev_root_zap = vdev_create_link_zap(vd, tx); + } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); @@ -3469,6 +3673,12 @@ vdev_load(vdev_t *vd) vdev_set_deflate_ratio(vd); + if (vd->vdev_ops == &vdev_raidz_ops) { + error = vdev_raidz_load(vd); + if (error != 0) + return (error); + } + /* * On spa_load path, grab the allocation bias from our zap */ @@ -3492,6 +3702,26 @@ vdev_load(vdev_t *vd) } } + if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { + spa_t *spa = vd->vdev_spa; + uint64_t failfast; + + error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast), + 1, &failfast); + if (error == 0) { + vd->vdev_failfast = failfast & 1; + } else if (error == ENOENT) { + vd->vdev_failfast = vdev_prop_default_numeric( + VDEV_PROP_FAILFAST); + } else { + vdev_dbgmsg(vd, + "vdev_load: zap_lookup(top_zap=%llu) " + "failed [error=%d]", + (u_longlong_t)vd->vdev_top_zap, error); + } + } + /* * Load any rebuild state from the top-level vdev zap. 
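The failfast load just above, and the checksum/io/slow-io property loads that follow, all use the same lookup-with-default pattern: read the value from the vdev's ZAP and fall back to the property's compiled-in default when the key has never been set. A minimal sketch of that pattern, using a hypothetical helper name (the real code open-codes it or goes through vdev_prop_get_int()):

/*
 * Hypothetical helper illustrating the lookup-with-default pattern used
 * for per-vdev properties in vdev_load(); only zap_lookup(),
 * vdev_prop_to_name() and vdev_prop_default_numeric() are real names here.
 */
static int
load_vdev_prop_or_default(spa_t *spa, uint64_t zapobj, vdev_prop_t prop,
    uint64_t *valp)
{
	int error = zap_lookup(spa->spa_meta_objset, zapobj,
	    vdev_prop_to_name(prop), sizeof (uint64_t), 1, valp);
	if (error == ENOENT) {
		/* Never set on this vdev: use the built-in default. */
		*valp = vdev_prop_default_numeric(prop);
		error = 0;
	}
	return (error);
}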
*/ @@ -3506,6 +3736,51 @@ vdev_load(vdev_t *vd) } } + if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) { + uint64_t zapobj; + + if (vd->vdev_top_zap != 0) + zapobj = vd->vdev_top_zap; + else + zapobj = vd->vdev_leaf_zap; + + error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N, + &vd->vdev_checksum_n); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T, + &vd->vdev_checksum_t); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_IO_N, + &vd->vdev_io_n); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_IO_T, + &vd->vdev_io_t); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N, + &vd->vdev_slow_io_n); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T, + &vd->vdev_slow_io_t); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); + } + /* * If this is a top-level vdev, initialize its metaslabs. */ @@ -3793,10 +4068,22 @@ vdev_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } +/* + * Return the amount of space that should be (or was) allocated for the given + * psize (compressed block size) in the given TXG. Note that for expanded + * RAIDZ vdevs, the size allocated for older BP's may be larger. See + * vdev_raidz_asize(). + */ +uint64_t +vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg) +{ + return (vd->vdev_ops->vdev_op_asize(vd, psize, txg)); +} + uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { - return (vd->vdev_ops->vdev_op_asize(vd, psize)); + return (vdev_psize_to_asize_txg(vd, psize, 0)); } /* @@ -3912,6 +4199,36 @@ vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) return (spa_vdev_state_exit(spa, vd, 0)); } +int +vdev_remove_wanted(spa_t *spa, uint64_t guid) +{ + vdev_t *vd; + + spa_vdev_state_enter(spa, SCL_NONE); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); + + /* + * If the vdev is already removed, or expanding which can trigger + * repartition add/remove events, then don't do anything. + */ + if (vd->vdev_removed || vd->vdev_expanding) + return (spa_vdev_state_exit(spa, NULL, 0)); + + /* + * Confirm the vdev has been removed, otherwise don't do anything. + */ + if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL))) + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST))); + + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_REMOVE); + + return (spa_vdev_state_exit(spa, vd, 0)); +} + + /* * Online the given vdev. 
* @@ -3932,9 +4249,6 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); - wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; @@ -3973,6 +4287,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); + spa->spa_ccw_fail_time = 0; spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } @@ -4002,9 +4317,19 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && - vd->vdev_state >= VDEV_STATE_DEGRADED)) + vd->vdev_state >= VDEV_STATE_DEGRADED)) { spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); + /* + * Asynchronously detach spare vdev if resilver or + * rebuild is not required + */ + if (vd->vdev_unspare && + !dsl_scan_resilvering(spa->spa_dsl_pool) && + !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) && + !vdev_rebuild_active(tvd)) + spa_async_request(spa, SPA_ASYNC_DETACH_SPARE); + } return (spa_vdev_state_exit(spa, vd, 0)); } @@ -4155,9 +4480,9 @@ vdev_clear(spa_t *spa, vdev_t *vd) vdev_clear(spa, vd->vdev_child[c]); /* - * It makes no sense to "clear" an indirect vdev. + * It makes no sense to "clear" an indirect or removed vdev. */ - if (!vdev_is_concrete(vd)) + if (!vdev_is_concrete(vd) || vd->vdev_removed) return; /* @@ -4297,6 +4622,8 @@ vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) static void vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) { + (void) cvd; + int t, b; for (t = 0; t < ZIO_TYPES; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) @@ -4386,11 +4713,9 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); - for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) { - vsx->vsx_active_queue[t] = - vd->vdev_queue.vq_class[t].vqc_active; - vsx->vsx_pend_queue[t] = avl_numnodes( - &vd->vdev_queue.vq_class[t].vqc_queued_tree); + for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { + vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t]; + vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t); } } } @@ -4401,12 +4726,13 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vdev_t *tvd = vd->vdev_top; mutex_enter(&vd->vdev_stat_lock); if (vs) { - bcopy(&vd->vdev_stat, vs, sizeof (*vs)); + memcpy(vs, &vd->vdev_stat, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { + vs->vs_pspace = vd->vdev_psize; vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* @@ -4444,15 +4770,18 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) * can expand. */ if (vd->vdev_aux == NULL && tvd != NULL) { - vs->vs_esize = P2ALIGN( + vs->vs_esize = P2ALIGN_TYPED( vd->vdev_max_asize - vd->vdev_asize, - 1ULL << tvd->vdev_ms_shift); + 1ULL << tvd->vdev_ms_shift, uint64_t); } vs->vs_configured_ashift = vd->vdev_top != NULL ? 
vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; - vs->vs_physical_ashift = vd->vdev_physical_ashift; + if (vd->vdev_physical_ashift <= ASHIFT_MAX) + vs->vs_physical_ashift = vd->vdev_physical_ashift; + else + vs->vs_physical_ashift = 0; /* * Report fragmentation and rebuild progress for top-level, @@ -4469,6 +4798,8 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } + vs->vs_noalloc = MAX(vd->vdev_noalloc, + tvd ? tvd->vdev_noalloc : 0); } vdev_get_stats_ex_impl(vd, vs, vsx); @@ -4512,8 +4843,14 @@ vdev_stat_update(zio_t *zio, uint64_t psize) vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; +/* Suppress ASAN false positive */ +#ifdef __SANITIZE_ADDRESS__ + vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL; + vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL; +#else vdev_stat_t *vs = &vd->vdev_stat; vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; +#endif zio_type_t type = zio->io_type; int flags = zio->io_flags; @@ -4597,11 +4934,11 @@ vdev_stat_update(zio_t *zio, uint64_t psize) /* * TRIM ops and bytes are reported to user space as - * ZIO_TYPE_IOCTL. This is done to preserve the + * ZIO_TYPE_FLUSH. This is done to preserve the * vdev_stat_t structure layout for user space. */ if (type == ZIO_TYPE_TRIM) - vs_type = ZIO_TYPE_IOCTL; + vs_type = ZIO_TYPE_FLUSH; /* * Solely for the purposes of 'zpool iostat -lqrw' @@ -4730,6 +5067,7 @@ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { + (void) defer_delta; int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; @@ -5191,7 +5529,9 @@ vdev_expand(vdev_t *vd, uint64_t txg) vdev_set_deflate_ratio(vd); - if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && + if ((vd->vdev_spa->spa_raidz_expand == NULL || + vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) && + (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); @@ -5207,9 +5547,13 @@ vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; + VERIFY3U(pvd->vdev_children, >, 1); + vdev_remove_child(pvd, vd); vdev_compact_children(pvd); + ASSERT3P(pvd->vdev_child, !=, NULL); + cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); @@ -5219,7 +5563,7 @@ vdev_split(vdev_t *vd) } void -vdev_deadman(vdev_t *vd, char *tag) +vdev_deadman(vdev_t *vd, const char *tag) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; @@ -5231,20 +5575,20 @@ vdev_deadman(vdev_t *vd, char *tag) vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); - if (avl_numnodes(&vq->vq_active_tree) > 0) { + if (vq->vq_active > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; - zfs_dbgmsg("slow vdev: %s has %lu active IOs", - vd->vdev_path, avl_numnodes(&vq->vq_active_tree)); + zfs_dbgmsg("slow vdev: %s has %u active IOs", + vd->vdev_path, vq->vq_active); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. 
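 * (spa_deadman_synctime() yields the threshold in nanoseconds, matching
 * the gethrtime() delta computed below; it is ultimately derived from
 * the zfs_deadman_synctime_ms tunable.)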
*/ - fio = avl_first(&vq->vq_active_tree); + fio = list_head(&vq->vq_active_list); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); @@ -5375,6 +5719,23 @@ vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, } } +static char * +vdev_name(vdev_t *vd, char *buf, int buflen) +{ + if (vd->vdev_path == NULL) { + if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) { + strlcpy(buf, vd->vdev_spa->spa_name, buflen); + } else if (!vd->vdev_ops->vdev_op_leaf) { + snprintf(buf, buflen, "%s-%llu", + vd->vdev_ops->vdev_op_type, + (u_longlong_t)vd->vdev_id); + } + } else { + strlcpy(buf, vd->vdev_path, buflen); + } + return (buf); +} + /* * Look at the vdev tree and determine whether any devices are currently being * replaced. @@ -5404,31 +5765,730 @@ vdev_replace_in_progress(vdev_t *vdev) return (B_FALSE); } +/* + * Add a (source=src, propname=propval) list to an nvlist. + */ +static void +vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval, + uint64_t intval, zprop_source_t src) +{ + nvlist_t *propval; + + propval = fnvlist_alloc(); + fnvlist_add_uint64(propval, ZPROP_SOURCE, src); + + if (strval != NULL) + fnvlist_add_string(propval, ZPROP_VALUE, strval); + else + fnvlist_add_uint64(propval, ZPROP_VALUE, intval); + + fnvlist_add_nvlist(nvl, propname, propval); + nvlist_free(propval); +} + +static void +vdev_props_set_sync(void *arg, dmu_tx_t *tx) +{ + vdev_t *vd; + nvlist_t *nvp = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + objset_t *mos = spa->spa_meta_objset; + nvpair_t *elem = NULL; + uint64_t vdev_guid; + uint64_t objid; + nvlist_t *nvprops; + + vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV); + nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS); + vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); + + /* this vdev could get removed while waiting for this sync task */ + if (vd == NULL) + return; + + /* + * Set vdev property values in the vdev props mos object. 
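Both the sync task being defined here and vdev_prop_set() further down unpack the same request shape: the target vdev's guid under ZPOOL_VDEV_PROPS_SET_VDEV and the properties to apply under ZPOOL_VDEV_PROPS_SET_PROPS. A rough sketch of building such a request (illustrative only; "target_guid" is an assumed variable, and real callers arrive here via the zpool ioctl path):

/* Illustrative construction of the innvl consumed by vdev_prop_set(). */
nvlist_t *props = fnvlist_alloc();
fnvlist_add_uint64(props, vdev_prop_to_name(VDEV_PROP_FAILFAST), 1);

nvlist_t *innvl = fnvlist_alloc();
fnvlist_add_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, target_guid);
fnvlist_add_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS, props);
fnvlist_free(props);
/* innvl can now be handed to vdev_prop_set(vd, innvl, outnvl). */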
+ */ + if (vd->vdev_root_zap != 0) { + objid = vd->vdev_root_zap; + } else if (vd->vdev_top_zap != 0) { + objid = vd->vdev_top_zap; + } else if (vd->vdev_leaf_zap != 0) { + objid = vd->vdev_leaf_zap; + } else { + panic("unexpected vdev type"); + } + + mutex_enter(&spa->spa_props_lock); + + while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { + uint64_t intval; + const char *strval; + vdev_prop_t prop; + const char *propname = nvpair_name(elem); + zprop_type_t proptype; + + switch (prop = vdev_name_to_prop(propname)) { + case VDEV_PROP_USERPROP: + if (vdev_prop_user(propname)) { + strval = fnvpair_value_string(elem); + if (strlen(strval) == 0) { + /* remove the property if value == "" */ + (void) zap_remove(mos, objid, propname, + tx); + } else { + VERIFY0(zap_update(mos, objid, propname, + 1, strlen(strval) + 1, strval, tx)); + } + spa_history_log_internal(spa, "vdev set", tx, + "vdev_guid=%llu: %s=%s", + (u_longlong_t)vdev_guid, nvpair_name(elem), + strval); + } + break; + default: + /* normalize the property name */ + propname = vdev_prop_to_name(prop); + proptype = vdev_prop_get_type(prop); + + if (nvpair_type(elem) == DATA_TYPE_STRING) { + ASSERT(proptype == PROP_TYPE_STRING); + strval = fnvpair_value_string(elem); + VERIFY0(zap_update(mos, objid, propname, + 1, strlen(strval) + 1, strval, tx)); + spa_history_log_internal(spa, "vdev set", tx, + "vdev_guid=%llu: %s=%s", + (u_longlong_t)vdev_guid, nvpair_name(elem), + strval); + } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { + intval = fnvpair_value_uint64(elem); + + if (proptype == PROP_TYPE_INDEX) { + const char *unused; + VERIFY0(vdev_prop_index_to_string( + prop, intval, &unused)); + } + VERIFY0(zap_update(mos, objid, propname, + sizeof (uint64_t), 1, &intval, tx)); + spa_history_log_internal(spa, "vdev set", tx, + "vdev_guid=%llu: %s=%lld", + (u_longlong_t)vdev_guid, + nvpair_name(elem), (longlong_t)intval); + } else { + panic("invalid vdev property type %u", + nvpair_type(elem)); + } + } + + } + + mutex_exit(&spa->spa_props_lock); +} + +int +vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa = vd->vdev_spa; + nvpair_t *elem = NULL; + uint64_t vdev_guid; + nvlist_t *nvprops; + int error = 0; + + ASSERT(vd != NULL); + + /* Check that vdev has a zap we can use */ + if (vd->vdev_root_zap == 0 && + vd->vdev_top_zap == 0 && + vd->vdev_leaf_zap == 0) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, + &vdev_guid) != 0) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS, + &nvprops) != 0) + return (SET_ERROR(EINVAL)); + + if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) + return (SET_ERROR(EINVAL)); + + while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { + const char *propname = nvpair_name(elem); + vdev_prop_t prop = vdev_name_to_prop(propname); + uint64_t intval = 0; + const char *strval = NULL; + + if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) { + error = EINVAL; + goto end; + } + + if (vdev_prop_readonly(prop)) { + error = EROFS; + goto end; + } + + /* Special Processing */ + switch (prop) { + case VDEV_PROP_PATH: + if (vd->vdev_path == NULL) { + error = EROFS; + break; + } + if (nvpair_value_string(elem, &strval) != 0) { + error = EINVAL; + break; + } + /* New path must start with /dev/ */ + if (strncmp(strval, "/dev/", 5)) { + error = EINVAL; + break; + } + error = spa_vdev_setpath(spa, vdev_guid, strval); + break; + case VDEV_PROP_ALLOCATING: + if (nvpair_value_uint64(elem, 
&intval) != 0) { + error = EINVAL; + break; + } + if (intval != vd->vdev_noalloc) + break; + if (intval == 0) + error = spa_vdev_noalloc(spa, vdev_guid); + else + error = spa_vdev_alloc(spa, vdev_guid); + break; + case VDEV_PROP_FAILFAST: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_failfast = intval & 1; + break; + case VDEV_PROP_CHECKSUM_N: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_checksum_n = intval; + break; + case VDEV_PROP_CHECKSUM_T: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_checksum_t = intval; + break; + case VDEV_PROP_IO_N: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_io_n = intval; + break; + case VDEV_PROP_IO_T: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_io_t = intval; + break; + case VDEV_PROP_SLOW_IO_N: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_slow_io_n = intval; + break; + case VDEV_PROP_SLOW_IO_T: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_slow_io_t = intval; + break; + default: + /* Most processing is done in vdev_props_set_sync */ + break; + } +end: + if (error != 0) { + intval = error; + vdev_prop_add_list(outnvl, propname, strval, intval, 0); + return (error); + } + } + + return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync, + innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED)); +} + +int +vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + int err = 0; + uint64_t objid; + uint64_t vdev_guid; + nvpair_t *elem = NULL; + nvlist_t *nvprops = NULL; + uint64_t intval = 0; + char *strval = NULL; + const char *propname = NULL; + vdev_prop_t prop; + + ASSERT(vd != NULL); + ASSERT(mos != NULL); + + if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, + &vdev_guid) != 0) + return (SET_ERROR(EINVAL)); + + nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops); + + if (vd->vdev_root_zap != 0) { + objid = vd->vdev_root_zap; + } else if (vd->vdev_top_zap != 0) { + objid = vd->vdev_top_zap; + } else if (vd->vdev_leaf_zap != 0) { + objid = vd->vdev_leaf_zap; + } else { + return (SET_ERROR(EINVAL)); + } + ASSERT(objid != 0); + + mutex_enter(&spa->spa_props_lock); + + if (nvprops != NULL) { + char namebuf[64] = { 0 }; + + while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { + intval = 0; + strval = NULL; + propname = nvpair_name(elem); + prop = vdev_name_to_prop(propname); + zprop_source_t src = ZPROP_SRC_DEFAULT; + uint64_t integer_size, num_integers; + + switch (prop) { + /* Special Read-only Properties */ + case VDEV_PROP_NAME: + strval = vdev_name(vd, namebuf, + sizeof (namebuf)); + if (strval == NULL) + continue; + vdev_prop_add_list(outnvl, propname, strval, 0, + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_CAPACITY: + /* percent used */ + intval = (vd->vdev_stat.vs_dspace == 0) ? 
0 : + (vd->vdev_stat.vs_alloc * 100 / + vd->vdev_stat.vs_dspace); + vdev_prop_add_list(outnvl, propname, NULL, + intval, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_STATE: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_state, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_GUID: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_guid, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_ASIZE: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_asize, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_PSIZE: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_psize, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_ASHIFT: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_ashift, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_SIZE: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_FREE: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_dspace - + vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_ALLOCATED: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_EXPANDSZ: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_esize, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_FRAGMENTATION: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_fragmentation, + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_PARITY: + vdev_prop_add_list(outnvl, propname, NULL, + vdev_get_nparity(vd), ZPROP_SRC_NONE); + continue; + case VDEV_PROP_PATH: + if (vd->vdev_path == NULL) + continue; + vdev_prop_add_list(outnvl, propname, + vd->vdev_path, 0, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_DEVID: + if (vd->vdev_devid == NULL) + continue; + vdev_prop_add_list(outnvl, propname, + vd->vdev_devid, 0, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_PHYS_PATH: + if (vd->vdev_physpath == NULL) + continue; + vdev_prop_add_list(outnvl, propname, + vd->vdev_physpath, 0, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_ENC_PATH: + if (vd->vdev_enc_sysfs_path == NULL) + continue; + vdev_prop_add_list(outnvl, propname, + vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_FRU: + if (vd->vdev_fru == NULL) + continue; + vdev_prop_add_list(outnvl, propname, + vd->vdev_fru, 0, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_PARENT: + if (vd->vdev_parent != NULL) { + strval = vdev_name(vd->vdev_parent, + namebuf, sizeof (namebuf)); + vdev_prop_add_list(outnvl, propname, + strval, 0, ZPROP_SRC_NONE); + } + continue; + case VDEV_PROP_CHILDREN: + if (vd->vdev_children > 0) + strval = kmem_zalloc(ZAP_MAXVALUELEN, + KM_SLEEP); + for (uint64_t i = 0; i < vd->vdev_children; + i++) { + const char *vname; + + vname = vdev_name(vd->vdev_child[i], + namebuf, sizeof (namebuf)); + if (vname == NULL) + vname = "(unknown)"; + if (strlen(strval) > 0) + strlcat(strval, ",", + ZAP_MAXVALUELEN); + strlcat(strval, vname, ZAP_MAXVALUELEN); + } + if (strval != NULL) { + vdev_prop_add_list(outnvl, propname, + strval, 0, ZPROP_SRC_NONE); + kmem_free(strval, ZAP_MAXVALUELEN); + } + continue; + case VDEV_PROP_NUMCHILDREN: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_children, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_READ_ERRORS: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_read_errors, + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_WRITE_ERRORS: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_write_errors, + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_CHECKSUM_ERRORS: + 
vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_checksum_errors, + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_INITIALIZE_ERRORS: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_initialize_errors, + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_OPS_NULL: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_ops[ZIO_TYPE_NULL], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_OPS_READ: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_ops[ZIO_TYPE_READ], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_OPS_WRITE: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_OPS_FREE: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_ops[ZIO_TYPE_FREE], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_OPS_CLAIM: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_OPS_TRIM: + /* + * TRIM ops and bytes are reported to user + * space as ZIO_TYPE_FLUSH. This is done to + * preserve the vdev_stat_t structure layout + * for user space. + */ + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_BYTES_NULL: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_BYTES_READ: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_bytes[ZIO_TYPE_READ], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_BYTES_WRITE: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_BYTES_FREE: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_BYTES_CLAIM: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_BYTES_TRIM: + /* + * TRIM ops and bytes are reported to user + * space as ZIO_TYPE_FLUSH. This is done to + * preserve the vdev_stat_t structure layout + * for user space. 
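+ * (ZIO_TYPE_FLUSH occupies the reporting slot previously labelled
+ * ZIO_TYPE_IOCTL, so the array layout consumed by user space does
+ * not change.)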
+ */ + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_REMOVING: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_removing, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_RAIDZ_EXPANDING: + /* Only expose this for raidz */ + if (vd->vdev_ops == &vdev_raidz_ops) { + vdev_prop_add_list(outnvl, propname, + NULL, vd->vdev_rz_expanding, + ZPROP_SRC_NONE); + } + continue; + /* Numeric Properites */ + case VDEV_PROP_ALLOCATING: + /* Leaf vdevs cannot have this property */ + if (vd->vdev_mg == NULL && + vd->vdev_top != NULL) { + src = ZPROP_SRC_NONE; + intval = ZPROP_BOOLEAN_NA; + } else { + err = vdev_prop_get_int(vd, prop, + &intval); + if (err && err != ENOENT) + break; + + if (intval == + vdev_prop_default_numeric(prop)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + } + + vdev_prop_add_list(outnvl, propname, NULL, + intval, src); + break; + case VDEV_PROP_FAILFAST: + src = ZPROP_SRC_LOCAL; + strval = NULL; + + err = zap_lookup(mos, objid, nvpair_name(elem), + sizeof (uint64_t), 1, &intval); + if (err == ENOENT) { + intval = vdev_prop_default_numeric( + prop); + err = 0; + } else if (err) { + break; + } + if (intval == vdev_prop_default_numeric(prop)) + src = ZPROP_SRC_DEFAULT; + + vdev_prop_add_list(outnvl, propname, strval, + intval, src); + break; + case VDEV_PROP_CHECKSUM_N: + case VDEV_PROP_CHECKSUM_T: + case VDEV_PROP_IO_N: + case VDEV_PROP_IO_T: + case VDEV_PROP_SLOW_IO_N: + case VDEV_PROP_SLOW_IO_T: + err = vdev_prop_get_int(vd, prop, &intval); + if (err && err != ENOENT) + break; + + if (intval == vdev_prop_default_numeric(prop)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + + vdev_prop_add_list(outnvl, propname, NULL, + intval, src); + break; + /* Text Properties */ + case VDEV_PROP_COMMENT: + /* Exists in the ZAP below */ + /* FALLTHRU */ + case VDEV_PROP_USERPROP: + /* User Properites */ + src = ZPROP_SRC_LOCAL; + + err = zap_length(mos, objid, nvpair_name(elem), + &integer_size, &num_integers); + if (err) + break; + + switch (integer_size) { + case 8: + /* User properties cannot be integers */ + err = EINVAL; + break; + case 1: + /* string property */ + strval = kmem_alloc(num_integers, + KM_SLEEP); + err = zap_lookup(mos, objid, + nvpair_name(elem), 1, + num_integers, strval); + if (err) { + kmem_free(strval, + num_integers); + break; + } + vdev_prop_add_list(outnvl, propname, + strval, 0, src); + kmem_free(strval, num_integers); + break; + } + break; + default: + err = ENOENT; + break; + } + if (err) + break; + } + } else { + /* + * Get all properties from the MOS vdev property object. 
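When the caller passes no explicit property list, the branch below walks every attribute in the vdev's ZAP with a cursor. A condensed skeleton of that walk (value decoding and the string-only filtering are omitted); zap_cursor_retrieve() returns ENOENT once the object is exhausted, which is why the final error check tolerates it:

/* Skeleton of the cursor walk used for the "all properties" case. */
zap_cursor_t zc;
zap_attribute_t za;
int err;

for (zap_cursor_init(&zc, mos, objid);
    (err = zap_cursor_retrieve(&zc, &za)) == 0;
    zap_cursor_advance(&zc)) {
	/*
	 * za.za_name, za.za_integer_length and za.za_num_integers
	 * describe the current entry; only 1-byte (string) values
	 * are surfaced as vdev properties.
	 */
}
zap_cursor_fini(&zc);	/* always release the cursor */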
+ */ + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, mos, objid); + (err = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + intval = 0; + strval = NULL; + zprop_source_t src = ZPROP_SRC_DEFAULT; + propname = za.za_name; + + switch (za.za_integer_length) { + case 8: + /* We do not allow integer user properties */ + /* This is likely an internal value */ + break; + case 1: + /* string property */ + strval = kmem_alloc(za.za_num_integers, + KM_SLEEP); + err = zap_lookup(mos, objid, za.za_name, 1, + za.za_num_integers, strval); + if (err) { + kmem_free(strval, za.za_num_integers); + break; + } + vdev_prop_add_list(outnvl, propname, strval, 0, + src); + kmem_free(strval, za.za_num_integers); + break; + + default: + break; + } + } + zap_cursor_fini(&zc); + } + + mutex_exit(&spa->spa_props_lock); + if (err && err != ENOENT) { + return (err); + } + + return (0); +} + EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW, "Target number of metaslabs per top-level vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, INT, ZMOD_RW, - "Default limit for metaslab size"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW, + "Default lower limit for metaslab size"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW, + "Default upper limit for metaslab size"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW, "Minimum number of metaslabs per top-level vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW, "Practical upper limit of total metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, "Rate limit slow IO (delay) events to this many per second"); +ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW, + "Rate limit hung IO (deadman) events to this many per second"); + +/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " - "(do not set below zed threshold)."); + "(do not set below ZED threshold)."); +/* END CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW, "Ignore errors during resilver/scrub"); @@ -5439,15 +6499,16 @@ ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, "Disable cache flushes"); -ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW, "Minimum number of metaslabs required to dedicate one for log blocks"); +/* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, - param_set_min_auto_ashift, param_get_ulong, ZMOD_RW, + param_set_min_auto_ashift, param_get_uint, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, - param_set_max_auto_ashift, param_get_ulong, ZMOD_RW, + param_set_max_auto_ashift, param_get_uint, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); /* END CSTYLED */ diff --git 
a/sys/contrib/openzfs/module/zfs/vdev_cache.c b/sys/contrib/openzfs/module/zfs/vdev_cache.c deleted file mode 100644 index 6e82184b800d..000000000000 --- a/sys/contrib/openzfs/module/zfs/vdev_cache.c +++ /dev/null @@ -1,437 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. - */ - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/vdev_impl.h> -#include <sys/zio.h> -#include <sys/kstat.h> -#include <sys/abd.h> - -/* - * Virtual device read-ahead caching. - * - * This file implements a simple LRU read-ahead cache. When the DMU reads - * a given block, it will often want other, nearby blocks soon thereafter. - * We take advantage of this by reading a larger disk region and caching - * the result. In the best case, this can turn 128 back-to-back 512-byte - * reads into a single 64k read followed by 127 cache hits; this reduces - * latency dramatically. In the worst case, it can turn an isolated 512-byte - * read into a 64k read, which doesn't affect latency all that much but is - * terribly wasteful of bandwidth. A more intelligent version of the cache - * could keep track of access patterns and not do read-ahead unless it sees - * at least two temporally close I/Os to the same region. Currently, only - * metadata I/O is inflated. A further enhancement could take advantage of - * more semantic information about the I/O. And it could use something - * faster than an AVL tree; that was chosen solely for convenience. - * - * There are five cache operations: allocate, fill, read, write, evict. - * - * (1) Allocate. This reserves a cache entry for the specified region. - * We separate the allocate and fill operations so that multiple threads - * don't generate I/O for the same cache miss. - * - * (2) Fill. When the I/O for a cache miss completes, the fill routine - * places the data in the previously allocated cache entry. - * - * (3) Read. Read data from the cache. - * - * (4) Write. Update cache contents after write completion. - * - * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry - * if the total cache size exceeds zfs_vdev_cache_size. - */ - -/* - * These tunables are for performance analysis. - */ -/* - * All i/os smaller than zfs_vdev_cache_max will be turned into - * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software - * track buffer). At most zfs_vdev_cache_size bytes will be kept in each - * vdev's vdev_cache. - * - * TODO: Note that with the current ZFS code, it turns out that the - * vdev cache is not helpful, and in some cases actually harmful. 
It - * is better if we disable this. Once some time has passed, we should - * actually remove this to simplify the code. For now we just disable - * it by setting the zfs_vdev_cache_size to zero. Note that Solaris 11 - * has made these same changes. - */ -int zfs_vdev_cache_max = 1<<14; /* 16KB */ -int zfs_vdev_cache_size = 0; -int zfs_vdev_cache_bshift = 16; - -#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */ - -kstat_t *vdc_ksp = NULL; - -typedef struct vdc_stats { - kstat_named_t vdc_stat_delegations; - kstat_named_t vdc_stat_hits; - kstat_named_t vdc_stat_misses; -} vdc_stats_t; - -static vdc_stats_t vdc_stats = { - { "delegations", KSTAT_DATA_UINT64 }, - { "hits", KSTAT_DATA_UINT64 }, - { "misses", KSTAT_DATA_UINT64 } -}; - -#define VDCSTAT_BUMP(stat) atomic_inc_64(&vdc_stats.stat.value.ui64); - -static inline int -vdev_cache_offset_compare(const void *a1, const void *a2) -{ - const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; - const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; - - return (TREE_CMP(ve1->ve_offset, ve2->ve_offset)); -} - -static int -vdev_cache_lastused_compare(const void *a1, const void *a2) -{ - const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; - const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; - - int cmp = TREE_CMP(ve1->ve_lastused, ve2->ve_lastused); - if (likely(cmp)) - return (cmp); - - /* - * Among equally old entries, sort by offset to ensure uniqueness. - */ - return (vdev_cache_offset_compare(a1, a2)); -} - -/* - * Evict the specified entry from the cache. - */ -static void -vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) -{ - ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT3P(ve->ve_fill_io, ==, NULL); - ASSERT3P(ve->ve_abd, !=, NULL); - - avl_remove(&vc->vc_lastused_tree, ve); - avl_remove(&vc->vc_offset_tree, ve); - abd_free(ve->ve_abd); - kmem_free(ve, sizeof (vdev_cache_entry_t)); -} - -/* - * Allocate an entry in the cache. At the point we don't have the data, - * we're just creating a placeholder so that multiple threads don't all - * go off and read the same blocks. - */ -static vdev_cache_entry_t * -vdev_cache_allocate(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - uint64_t offset = P2ALIGN(zio->io_offset, VCBS); - vdev_cache_entry_t *ve; - - ASSERT(MUTEX_HELD(&vc->vc_lock)); - - if (zfs_vdev_cache_size == 0) - return (NULL); - - /* - * If adding a new entry would exceed the cache size, - * evict the oldest entry (LRU). - */ - if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > - zfs_vdev_cache_size) { - ve = avl_first(&vc->vc_lastused_tree); - if (ve->ve_fill_io != NULL) - return (NULL); - ASSERT3U(ve->ve_hits, !=, 0); - vdev_cache_evict(vc, ve); - } - - ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); - ve->ve_offset = offset; - ve->ve_lastused = ddi_get_lbolt(); - ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE); - - avl_add(&vc->vc_offset_tree, ve); - avl_add(&vc->vc_lastused_tree, ve); - - return (ve); -} - -static void -vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) -{ - uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); - - ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT3P(ve->ve_fill_io, ==, NULL); - - if (ve->ve_lastused != ddi_get_lbolt()) { - avl_remove(&vc->vc_lastused_tree, ve); - ve->ve_lastused = ddi_get_lbolt(); - avl_add(&vc->vc_lastused_tree, ve); - } - - ve->ve_hits++; - abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size); -} - -/* - * Fill a previously allocated cache entry with data. 
- */ -static void -vdev_cache_fill(zio_t *fio) -{ - vdev_t *vd = fio->io_vd; - vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve = fio->io_private; - zio_t *pio; - - ASSERT3U(fio->io_size, ==, VCBS); - - /* - * Add data to the cache. - */ - mutex_enter(&vc->vc_lock); - - ASSERT3P(ve->ve_fill_io, ==, fio); - ASSERT3U(ve->ve_offset, ==, fio->io_offset); - ASSERT3P(ve->ve_abd, ==, fio->io_abd); - - ve->ve_fill_io = NULL; - - /* - * Even if this cache line was invalidated by a missed write update, - * any reads that were queued up before the missed update are still - * valid, so we can satisfy them from this line before we evict it. - */ - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(fio, &zl)) != NULL) - vdev_cache_hit(vc, ve, pio); - - if (fio->io_error || ve->ve_missed_update) - vdev_cache_evict(vc, ve); - - mutex_exit(&vc->vc_lock); -} - -/* - * Read data from the cache. Returns B_TRUE cache hit, B_FALSE on miss. - */ -boolean_t -vdev_cache_read(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - vdev_cache_entry_t *ve, *ve_search; - uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); - zio_t *fio; - uint64_t cache_phase __maybe_unused = P2PHASE(zio->io_offset, VCBS); - - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - - if (zio->io_flags & ZIO_FLAG_DONT_CACHE) - return (B_FALSE); - - if (zio->io_size > zfs_vdev_cache_max) - return (B_FALSE); - - /* - * If the I/O straddles two or more cache blocks, don't cache it. - */ - if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) - return (B_FALSE); - - ASSERT3U(cache_phase + zio->io_size, <=, VCBS); - - mutex_enter(&vc->vc_lock); - - ve_search = kmem_alloc(sizeof (vdev_cache_entry_t), KM_SLEEP); - ve_search->ve_offset = cache_offset; - ve = avl_find(&vc->vc_offset_tree, ve_search, NULL); - kmem_free(ve_search, sizeof (vdev_cache_entry_t)); - - if (ve != NULL) { - if (ve->ve_missed_update) { - mutex_exit(&vc->vc_lock); - return (B_FALSE); - } - - if ((fio = ve->ve_fill_io) != NULL) { - zio_vdev_io_bypass(zio); - zio_add_child(zio, fio); - mutex_exit(&vc->vc_lock); - VDCSTAT_BUMP(vdc_stat_delegations); - return (B_TRUE); - } - - vdev_cache_hit(vc, ve, zio); - zio_vdev_io_bypass(zio); - - mutex_exit(&vc->vc_lock); - VDCSTAT_BUMP(vdc_stat_hits); - return (B_TRUE); - } - - ve = vdev_cache_allocate(zio); - - if (ve == NULL) { - mutex_exit(&vc->vc_lock); - return (B_FALSE); - } - - fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, - ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, - ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); - - ve->ve_fill_io = fio; - zio_vdev_io_bypass(zio); - zio_add_child(zio, fio); - - mutex_exit(&vc->vc_lock); - zio_nowait(fio); - VDCSTAT_BUMP(vdc_stat_misses); - - return (B_TRUE); -} - -/* - * Update cache contents upon write completion. 
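For orientation while reading this removed file: each cache entry covered one VCBS-sized, VCBS-aligned region; with the default zfs_vdev_cache_bshift of 16 that is 64 KiB, i.e. exactly the 128 back-to-back 512-byte reads cited in the header comment (128 * 512 B = 65536 B). A sketch of the offset math the deleted routines performed, where io_offset/io_start/io_end stand in for a zio's offset and range:

/* VCBS == 1 << zfs_vdev_cache_bshift (64 KiB by default). */
uint64_t line_start = P2ALIGN(io_offset, VCBS);	/* cache-line origin */
uint64_t line_phase = P2PHASE(io_offset, VCBS);	/* offset within the line */
/* A write covering [io_start, io_end) may touch every line between: */
uint64_t first_line = P2ALIGN(io_start, VCBS);
uint64_t limit = P2ROUNDUP(io_end, VCBS);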
- */ -void -vdev_cache_write(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - vdev_cache_entry_t *ve, ve_search; - uint64_t io_start = zio->io_offset; - uint64_t io_end = io_start + zio->io_size; - uint64_t min_offset = P2ALIGN(io_start, VCBS); - uint64_t max_offset = P2ROUNDUP(io_end, VCBS); - avl_index_t where; - - ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); - - mutex_enter(&vc->vc_lock); - - ve_search.ve_offset = min_offset; - ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); - - if (ve == NULL) - ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); - - while (ve != NULL && ve->ve_offset < max_offset) { - uint64_t start = MAX(ve->ve_offset, io_start); - uint64_t end = MIN(ve->ve_offset + VCBS, io_end); - - if (ve->ve_fill_io != NULL) { - ve->ve_missed_update = 1; - } else { - abd_copy_off(ve->ve_abd, zio->io_abd, - start - ve->ve_offset, start - io_start, - end - start); - } - ve = AVL_NEXT(&vc->vc_offset_tree, ve); - } - mutex_exit(&vc->vc_lock); -} - -void -vdev_cache_purge(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve; - - mutex_enter(&vc->vc_lock); - while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) - vdev_cache_evict(vc, ve); - mutex_exit(&vc->vc_lock); -} - -void -vdev_cache_init(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - - mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); - - avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, - sizeof (vdev_cache_entry_t), - offsetof(struct vdev_cache_entry, ve_offset_node)); - - avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, - sizeof (vdev_cache_entry_t), - offsetof(struct vdev_cache_entry, ve_lastused_node)); -} - -void -vdev_cache_fini(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - - vdev_cache_purge(vd); - - avl_destroy(&vc->vc_offset_tree); - avl_destroy(&vc->vc_lastused_tree); - - mutex_destroy(&vc->vc_lock); -} - -void -vdev_cache_stat_init(void) -{ - vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", - KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (vdc_ksp != NULL) { - vdc_ksp->ks_data = &vdc_stats; - kstat_install(vdc_ksp); - } -} - -void -vdev_cache_stat_fini(void) -{ - if (vdc_ksp != NULL) { - kstat_delete(vdc_ksp); - vdc_ksp = NULL; - } -} - -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_max, INT, ZMOD_RW, - "Inflate reads small than max"); - -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_size, INT, ZMOD_RD, - "Total size of the per-disk cache"); - -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_bshift, INT, ZMOD_RW, - "Shift size to inflate reads too"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c index b8f82d52e8f0..13bb33cc6871 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_draid.c +++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -541,7 +541,7 @@ vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp) int vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp) { - for (int i = 0; i <= VDEV_DRAID_MAX_MAPS; i++) { + for (int i = 0; i < VDEV_DRAID_MAX_MAPS; i++) { if (draid_maps[i].dm_children == children) { *mapp = &draid_maps[i]; return (0); @@ -577,8 +577,9 @@ vdev_draid_permute_id(vdev_draid_config_t *vdc, * i.e. vdev_draid_psize_to_asize(). */ static uint64_t -vdev_draid_asize(vdev_t *vd, uint64_t psize) +vdev_draid_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { + (void) txg; vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t ashift = vd->vdev_ashift; @@ -842,6 +843,53 @@ vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr) } /* + * Verify that all empty sectors are zero filled before using them to + * calculate parity. Otherwise, silent corruption in an empty sector will + * result in bad parity being generated. That bad parity will then be + * considered authoritative and overwrite the good parity on disk. This + * is possible because the checksum is only calculated over the data, + * thus it cannot be used to detect damage in empty sectors. + */ +int +vdev_draid_map_verify_empty(zio_t *zio, raidz_row_t *rr) +{ + uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; + uint64_t parity_size = rr->rr_col[0].rc_size; + uint64_t skip_off = parity_size - skip_size; + uint64_t empty_off = 0; + int ret = 0; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + ASSERT3P(rr->rr_abd_empty, !=, NULL); + ASSERT3U(rr->rr_bigcols, >, 0); + + void *zero_buf = kmem_zalloc(skip_size, KM_SLEEP); + + for (int c = rr->rr_bigcols; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + ASSERT3P(rc->rc_abd, !=, NULL); + ASSERT3U(rc->rc_size, ==, parity_size); + + if (abd_cmp_buf_off(rc->rc_abd, zero_buf, skip_off, + skip_size) != 0) { + vdev_raidz_checksum_error(zio, rc, rc->rc_abd); + abd_zero_off(rc->rc_abd, skip_off, skip_size); + rc->rc_error = SET_ERROR(ECKSUM); + ret++; + } + + empty_off += skip_size; + } + + ASSERT3U(empty_off, ==, abd_get_size(rr->rr_abd_empty)); + + kmem_free(zero_buf, skip_size); + + return (ret); +} + +/* * Given a logical address within a dRAID configuration, return the physical * address on the first drive in the group that this address maps to * (at position 'start' in permutation number 'perm'). @@ -913,7 +961,7 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t ashift = vd->vdev_top->vdev_ashift; uint64_t io_size = abd_size; - uint64_t io_asize = vdev_draid_asize(vd, io_size); + uint64_t io_asize = vdev_draid_asize(vd, io_size, 0); uint64_t group = vdev_draid_offset_to_group(vd, io_offset); uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1); @@ -976,15 +1024,11 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, /* The total number of data and parity sectors for this I/O. */ uint64_t tot = psize + (vdc->vdc_nparity * (q + (r == 0 ? 
0 : 1))); - raidz_row_t *rr; - rr = kmem_alloc(offsetof(raidz_row_t, rr_col[groupwidth]), KM_SLEEP); - rr->rr_cols = groupwidth; - rr->rr_scols = groupwidth; + ASSERT3U(vdc->vdc_nparity, >, 0); + + raidz_row_t *rr = vdev_raidz_row_alloc(groupwidth); rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; rr->rr_firstdatacol = vdc->vdc_nparity; - rr->rr_abd_empty = NULL; #ifdef ZFS_DEBUG rr->rr_offset = io_offset; rr->rr_size = io_size; @@ -1004,14 +1048,6 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c); rc->rc_offset = physical_offset; - rc->rc_abd = NULL; - rc->rc_orig_data = NULL; - rc->rc_error = 0; - rc->rc_tried = 0; - rc->rc_skipped = 0; - rc->rc_force_repair = 0; - rc->rc_allow_repair = 1; - rc->rc_need_orig_restore = B_FALSE; if (q == 0 && i >= bc) rc->rc_size = 0; @@ -1080,7 +1116,7 @@ vdev_draid_map_alloc(zio_t *zio) if (size < abd_size) { vdev_t *vd = zio->io_vd; - io_offset += vdev_draid_asize(vd, size); + io_offset += vdev_draid_asize(vd, size, 0); abd_offset += size; abd_size -= size; nrows++; @@ -1102,7 +1138,6 @@ vdev_draid_map_alloc(zio_t *zio) rm->rm_row[0] = rr[0]; if (nrows == 2) rm->rm_row[1] = rr[1]; - return (rm); } @@ -1449,8 +1484,14 @@ vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep, asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1; max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1; logical_ashift = MAX(logical_ashift, cvd->vdev_ashift); - physical_ashift = MAX(physical_ashift, - cvd->vdev_physical_ashift); + } + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_ops == &vdev_draid_spare_ops) + continue; + physical_ashift = vdev_best_ashift(logical_ashift, + physical_ashift, cvd->vdev_physical_ashift); } *asizep = asize; @@ -1678,7 +1719,7 @@ vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp, uint64_t nparity = vdc->vdc_nparity; for (uint64_t spare_id = 0; spare_id < nspares; spare_id++) { - bzero(path, sizeof (path)); + memset(path, 0, sizeof (path)); (void) snprintf(path, sizeof (path) - 1, "%s%llu-%llu-%llu", VDEV_TYPE_DRAID, (u_longlong_t)nparity, @@ -1707,7 +1748,7 @@ vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp, if (n > 0) { (void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES); fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - new_spares, n); + (const nvlist_t **)new_spares, n); } for (int i = 0; i < n; i++) @@ -1728,7 +1769,7 @@ vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t asize = vdev_draid_asize(vd, psize); + uint64_t asize = vdev_draid_asize(vd, psize, 0); if (phys_birth == TXG_UNKNOWN) { /* @@ -1785,7 +1826,7 @@ vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col) range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_draid_asize(vd, rr->rr_size); + vdev_draid_asize(vd, rr->rr_size, 0); raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; @@ -2154,6 +2195,7 @@ vdev_draid_config_generate(vdev_t *vd, nvlist_t *nv) static int vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd) { + (void) spa; uint64_t ndata, nparity, nspares, ngroups; int error; @@ -2382,7 +2424,6 @@ vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset) return (cvd); } -/* ARGSUSED */ static void 
vdev_draid_spare_close(vdev_t *vd) { @@ -2507,24 +2548,20 @@ vdev_draid_read_config_spare(vdev_t *vd) } /* - * Handle any ioctl requested of the distributed spare. Only flushes - * are supported in which case all children must be flushed. + * Handle any flush requested of the distributed spare. All children must be + * flushed. */ static int -vdev_draid_spare_ioctl(zio_t *zio) +vdev_draid_spare_flush(zio_t *zio) { vdev_t *vd = zio->io_vd; int error = 0; - if (zio->io_cmd == DKIOCFLUSHWRITECACHE) { - for (int c = 0; c < vd->vdev_children; c++) { - zio_nowait(zio_vdev_child_io(zio, NULL, - vd->vdev_child[c], zio->io_offset, zio->io_abd, - zio->io_size, zio->io_type, zio->io_priority, 0, - vdev_draid_spare_child_done, zio)); - } - } else { - error = SET_ERROR(ENOTSUP); + for (int c = 0; c < vd->vdev_children; c++) { + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[c], zio->io_offset, zio->io_abd, + zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); } return (error); @@ -2555,8 +2592,8 @@ vdev_draid_spare_io_start(zio_t *zio) } switch (zio->io_type) { - case ZIO_TYPE_IOCTL: - zio->io_error = vdev_draid_spare_ioctl(zio); + case ZIO_TYPE_FLUSH: + zio->io_error = vdev_draid_spare_flush(zio); break; case ZIO_TYPE_WRITE: @@ -2641,10 +2678,10 @@ vdev_draid_spare_io_start(zio_t *zio) zio_execute(zio); } -/* ARGSUSED */ static void vdev_draid_spare_io_done(zio_t *zio) { + (void) zio; } /* @@ -2665,7 +2702,7 @@ vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp, return (SET_ERROR(ENOENT)); } - char *spare_name; + const char *spare_name; error = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &spare_name); if (error != 0) return (SET_ERROR(EINVAL)); @@ -2673,7 +2710,7 @@ vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp, for (int i = 0; i < nspares; i++) { nvlist_t *spare = spares[i]; uint64_t top_guid, spare_id; - char *type, *path; + const char *type, *path; /* Skip non-distributed spares */ error = nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE, &type); diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c index 14ebf5514676..acb725696674 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_indirect.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c @@ -48,8 +48,8 @@ * "vdev_remap" operation that executes a callback on each contiguous * segment of the new location. This function is used in multiple ways: * - * - i/os to this vdev use the callback to determine where the - * data is now located, and issue child i/os for each segment's new + * - I/Os to this vdev use the callback to determine where the + * data is now located, and issue child I/Os for each segment's new * location. * * - frees and claims to this vdev use the callback to free or claim @@ -172,7 +172,7 @@ * object. */ -int zfs_condense_indirect_vdevs_enable = B_TRUE; +static int zfs_condense_indirect_vdevs_enable = B_TRUE; /* * Condense if at least this percent of the bytes in the mapping is @@ -181,7 +181,7 @@ int zfs_condense_indirect_vdevs_enable = B_TRUE; * condenses. Higher values will condense less often (causing less * i/o); lower values will reduce the mapping size more quickly. 
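Taken together with zfs_condense_max_obsolete_bytes and zfs_condense_min_mapping_bytes below, this tunable gates when a condense is worthwhile; with the defaults, a mapping describing 4 GiB of remapped data becomes a candidate once roughly 1 GiB (25%) of it is obsolete, provided the in-core mapping itself is at least 128 KiB. A simplified sketch of how the three values interact (illustration only; the file's real predicate also honors the zfs_condense_indirect_vdevs_enable switch and other pool state):

/*
 * Simplified view of the condense trigger implied by these tunables;
 * the actual check in this file applies further conditions.
 */
static boolean_t
condense_candidate_sketch(uint64_t bytes_mapped, uint64_t bytes_obsolete,
    uint64_t obsolete_sm_bytes, uint64_t mapping_in_core_bytes)
{
	if (mapping_in_core_bytes < zfs_condense_min_mapping_bytes)
		return (B_FALSE);	/* mapping too small to bother */
	if (obsolete_sm_bytes > zfs_condense_max_obsolete_bytes)
		return (B_TRUE);	/* obsolete space map itself too big */
	return ((bytes_obsolete * 100 >=
	    bytes_mapped * zfs_condense_indirect_obsolete_pct) ?
	    B_TRUE : B_FALSE);
}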
*/ -int zfs_condense_indirect_obsolete_pct = 25; +static uint_t zfs_condense_indirect_obsolete_pct = 25; /* * Condense if the obsolete space map takes up more than this amount of @@ -189,14 +189,14 @@ int zfs_condense_indirect_obsolete_pct = 25; * consumed by the obsolete space map; the default of 1GB is small enough * that we typically don't mind "wasting" it. */ -unsigned long zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; +static uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; /* * Don't bother condensing if the mapping uses less than this amount of * memory. The default of 128KB is considered a "trivial" amount of * memory and not worth reducing. */ -unsigned long zfs_condense_min_mapping_bytes = 128 * 1024; +static uint64_t zfs_condense_min_mapping_bytes = 128 * 1024; /* * This is used by the test suite so that it can ensure that certain @@ -204,7 +204,7 @@ unsigned long zfs_condense_min_mapping_bytes = 128 * 1024; * complete too quickly). If used to reduce the performance impact of * condensing in production, a maximum value of 1 should be sufficient. */ -int zfs_condense_indirect_commit_entry_delay_ms = 0; +static uint_t zfs_condense_indirect_commit_entry_delay_ms = 0; /* * If an indirect split block contains more than this many possible unique @@ -214,7 +214,7 @@ int zfs_condense_indirect_commit_entry_delay_ms = 0; * copies to participate fairly in the reconstruction when all combinations * cannot be checked and prevents repeated use of one bad copy. */ -int zfs_reconstruct_indirect_combinations_max = 4096; +uint_t zfs_reconstruct_indirect_combinations_max = 4096; /* * Enable to simulate damaged segments and validate reconstruction. This @@ -270,7 +270,7 @@ typedef struct indirect_split { */ indirect_child_t *is_good_child; - indirect_child_t is_child[1]; /* variable-length */ + indirect_child_t is_child[]; } indirect_split_t; /* @@ -293,17 +293,16 @@ vdev_indirect_map_free(zio_t *zio) indirect_vsd_t *iv = zio->io_vsd; indirect_split_t *is; - while ((is = list_head(&iv->iv_splits)) != NULL) { + while ((is = list_remove_head(&iv->iv_splits)) != NULL) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic->ic_data != NULL) abd_free(ic->ic_data); } - list_remove(&iv->iv_splits, is); indirect_child_t *ic; - while ((ic = list_head(&is->is_unique_child)) != NULL) - list_remove(&is->is_unique_child, ic); + while ((ic = list_remove_head(&is->is_unique_child)) != NULL) + ; list_destroy(&is->is_unique_child); @@ -637,16 +636,15 @@ spa_condense_indirect_generate_new_mapping(vdev_t *vd, } } -/* ARGSUSED */ static boolean_t spa_condense_indirect_thread_check(void *arg, zthr_t *zthr) { + (void) zthr; spa_t *spa = arg; return (spa->spa_condensing_indirect != NULL); } -/* ARGSUSED */ static void spa_condense_indirect_thread(void *arg, zthr_t *zthr) { @@ -941,13 +939,12 @@ vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise) return (error); } -/* ARGSUSED */ static void vdev_indirect_close(vdev_t *vd) { + (void) vd; } -/* ARGSUSED */ static int vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) @@ -1023,7 +1020,7 @@ vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset, size_t copy_length = entries * sizeof (*first_mapping); duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP); - bcopy(first_mapping, duplicate_mappings, copy_length); + memcpy(duplicate_mappings, first_mapping, copy_length); *copied_entries = entries; return 
(duplicate_mappings); @@ -1321,6 +1318,7 @@ vdev_indirect_io_start(zio_t *zio) vdev_indirect_gather_splits, zio); indirect_split_t *first = list_head(&iv->iv_splits); + ASSERT3P(first, !=, NULL); if (first->is_size == zio->io_size) { /* * This is not a split block; we are pointing to the entire @@ -1371,9 +1369,10 @@ vdev_indirect_io_start(zio_t *zio) is != NULL; is = list_next(&iv->iv_splits, is)) { zio_nowait(zio_vdev_child_io(zio, NULL, is->is_vdev, is->is_target_offset, - abd_get_offset(zio->io_abd, - is->is_split_offset), is->is_size, - zio->io_type, zio->io_priority, 0, + abd_get_offset_size(zio->io_abd, + is->is_split_offset, is->is_size), + is->is_size, zio->io_type, + zio->io_priority, 0, vdev_indirect_child_io_done, zio)); } @@ -1399,7 +1398,7 @@ vdev_indirect_checksum_error(zio_t *zio, vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); - zio_bad_cksum_t zbc = {{{ 0 }}}; + zio_bad_cksum_t zbc = { 0 }; abd_t *bad_abd = ic->ic_data; abd_t *good_abd = is->is_good_child->ic_data; (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, @@ -1480,12 +1479,12 @@ vdev_indirect_all_checksum_errors(zio_t *zio) vdev_t *vd = ic->ic_vdev; - (void) zfs_ereport_post_checksum(zio->io_spa, vd, - NULL, zio, is->is_target_offset, is->is_size, - NULL, NULL, NULL); mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); + (void) zfs_ereport_post_checksum(zio->io_spa, vd, + NULL, zio, is->is_target_offset, is->is_size, + NULL, NULL, NULL); } } } @@ -1659,8 +1658,8 @@ out: for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { indirect_child_t *ic; - while ((ic = list_head(&is->is_unique_child)) != NULL) - list_remove(&is->is_unique_child, ic); + while ((ic = list_remove_head(&is->is_unique_child)) != NULL) + ; is->is_unique_children = 0; } @@ -1885,23 +1884,28 @@ EXPORT_SYMBOL(vdev_obsolete_counts_are_precise); EXPORT_SYMBOL(vdev_obsolete_sm_object); /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT, ZMOD_RW, - "Whether to attempt condensing indirect vdev mappings"); +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT, + ZMOD_RW, "Whether to attempt condensing indirect vdev mappings"); -ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_obsolete_pct, INT, ZMOD_RW, - "Minimum obsolete percent of bytes in the mapping to attempt condensing"); +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_obsolete_pct, UINT, + ZMOD_RW, + "Minimum obsolete percent of bytes in the mapping " + "to attempt condensing"); -ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, U64, ZMOD_RW, "Don't bother condensing if the mapping uses less than this amount of " "memory"); -ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, U64, + ZMOD_RW, "Minimum size obsolete spacemap to attempt condensing"); -ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms, + UINT, ZMOD_RW, "Used by tests to ensure certain actions happen in the middle of a " "condense. 
A maximum value of 1 should be sufficient."); -ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max, + UINT, ZMOD_RW, "Maximum number of combinations when reconstructing split segments"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c b/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c index 99b83c392257..65a57e73604f 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c @@ -38,6 +38,8 @@ vdev_indirect_births_verify(vdev_indirect_births_t *vib) return (B_TRUE); } +#else +#define vdev_indirect_births_verify(vib) ((void) sizeof (vib), B_TRUE) #endif uint64_t @@ -150,7 +152,7 @@ vdev_indirect_births_add_entry(vdev_indirect_births_t *vib, new_entries = vmem_alloc(new_size, KM_SLEEP); if (old_size > 0) { - bcopy(vib->vib_entries, new_entries, old_size); + memcpy(new_entries, vib->vib_entries, old_size); vmem_free(vib->vib_entries, old_size); } new_entries[vib->vib_phys->vib_count - 1] = vibe; diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c b/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c index bb484a401b1b..e92495f2dd34 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c @@ -54,6 +54,8 @@ vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim) return (B_TRUE); } +#else +#define vdev_indirect_mapping_verify(vim) ((void) sizeof (vim), B_TRUE) #endif uint64_t @@ -480,7 +482,7 @@ vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, entries_written * sizeof (vdev_indirect_mapping_entry_phys_t)); vim->vim_entries = vmem_alloc(new_size, KM_SLEEP); if (old_size > 0) { - bcopy(old_entries, vim->vim_entries, old_size); + memcpy(vim->vim_entries, old_entries, old_size); vmem_free(old_entries, old_size); } VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size, @@ -582,7 +584,7 @@ vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim) 0, counts_size, counts, DMU_READ_PREFETCH)); } else { - bzero(counts, counts_size); + memset(counts, 0, counts_size); } return (counts); } diff --git a/sys/contrib/openzfs/module/zfs/vdev_initialize.c b/sys/contrib/openzfs/module/zfs/vdev_initialize.c index e9156c32f384..0a7323f58df2 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_initialize.c +++ b/sys/contrib/openzfs/module/zfs/vdev_initialize.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2016, 2024 by Delphix. All rights reserved. */ #include <sys/spa.h> @@ -36,23 +36,20 @@ /* * Value that is written to disk during initialization. 
*/ -#ifdef _ILP32 -unsigned long zfs_initialize_value = 0xdeadbeefUL; -#else -unsigned long zfs_initialize_value = 0xdeadbeefdeadbeeeULL; -#endif +static uint64_t zfs_initialize_value = 0xdeadbeefdeadbeeeULL; /* maximum number of I/Os outstanding per leaf vdev */ -int zfs_initialize_limit = 1; +static const int zfs_initialize_limit = 1; /* size of initializing writes; default 1MiB, see zfs_remove_max_segment */ -unsigned long zfs_initialize_chunk_size = 1024 * 1024; +static uint64_t zfs_initialize_chunk_size = 1024 * 1024; static boolean_t vdev_initialize_should_stop(vdev_t *vd) { return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) || - vd->vdev_detached || vd->vdev_top->vdev_removing); + vd->vdev_detached || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding); } static void @@ -71,7 +68,8 @@ vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + if (vd == NULL || vd->vdev_top->vdev_removing || + !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding) return; uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK]; @@ -101,6 +99,39 @@ vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) } static void +vdev_initialize_zap_remove_sync(void *arg, dmu_tx_t *tx) +{ + uint64_t guid = *(uint64_t *)arg; + + kmem_free(arg, sizeof (uint64_t)); + + vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); + if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + return; + + ASSERT3S(vd->vdev_initialize_state, ==, VDEV_INITIALIZE_NONE); + ASSERT3U(vd->vdev_leaf_zap, !=, 0); + + vd->vdev_initialize_last_offset = 0; + vd->vdev_initialize_action_time = 0; + + objset_t *mos = vd->vdev_spa->spa_meta_objset; + int error; + + error = zap_remove(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, tx); + VERIFY(error == 0 || error == ENOENT); + + error = zap_remove(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_STATE, tx); + VERIFY(error == 0 || error == ENOENT); + + error = zap_remove(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, tx); + VERIFY(error == 0 || error == ENOENT); +} + +static void vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) { ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); @@ -127,8 +158,14 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, - guid, tx); + + if (new_state != VDEV_INITIALIZE_NONE) { + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_initialize_zap_update_sync, guid, tx); + } else { + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_initialize_zap_remove_sync, guid, tx); + } switch (new_state) { case VDEV_INITIALIZE_ACTIVE: @@ -149,6 +186,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) spa_history_log_internal(spa, "initialize", tx, "vdev=%s complete", vd->vdev_path); break; + case VDEV_INITIALIZE_NONE: + spa_history_log_internal(spa, "uninitialize", tx, + "vdev=%s", vd->vdev_path); + break; default: panic("invalid state %llu", (unsigned long long)new_state); } @@ -255,20 +296,15 @@ vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data) * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. 
The ABD * allocation will guarantee these for us. */ -/* ARGSUSED */ static int vdev_initialize_block_fill(void *buf, size_t len, void *unused) { + (void) unused; + ASSERT0(len % sizeof (uint64_t)); -#ifdef _ILP32 - for (uint64_t i = 0; i < len; i += sizeof (uint32_t)) { - *(uint32_t *)((char *)(buf) + i) = zfs_initialize_value; - } -#else for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) { *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value; } -#endif return (0); } @@ -487,7 +523,7 @@ vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg); } -static void +static __attribute__((noreturn)) void vdev_initialize_thread(void *arg) { vdev_t *vd = arg; @@ -597,6 +633,7 @@ vdev_initialize(vdev_t *vd) ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_initialize_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_top->vdev_rz_expanding); vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); vd->vdev_initialize_thread = thread_create(NULL, 0, @@ -604,6 +641,24 @@ vdev_initialize(vdev_t *vd) } /* + * Uninitializes a device. Caller must hold vdev_initialize_lock. + * Device must be a leaf and not already be initializing. + */ +void +vdev_uninitialize(vdev_t *vd) +{ + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + ASSERT(!vd->vdev_detached); + ASSERT(!vd->vdev_initialize_exit_wanted); + ASSERT(!vd->vdev_top->vdev_removing); + + vdev_initialize_change_state(vd, VDEV_INITIALIZE_NONE); +} + +/* * Wait for the initialize thread to be terminated (cancelled or stopped). */ static void @@ -624,9 +679,11 @@ vdev_initialize_stop_wait_impl(vdev_t *vd) void vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list) { + (void) spa; vdev_t *vd; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); while ((vd = list_remove_head(vd_list)) != NULL) { mutex_enter(&vd->vdev_initialize_lock); @@ -668,7 +725,8 @@ vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state, if (vd_list == NULL) { vdev_initialize_stop_wait_impl(vd); } else { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_export_thread == curthread); list_insert_tail(vd_list, vd); } } @@ -700,7 +758,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) spa_t *spa = vd->vdev_spa; list_t vd_list; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); @@ -719,7 +778,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) void vdev_initialize_restart(vdev_t *vd) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_load_thread == curthread); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_leaf_zap != 0) { @@ -738,13 +798,14 @@ vdev_initialize_restart(vdev_t *vd) ASSERT(err == 0 || err == ENOENT); vd->vdev_initialize_action_time = timestamp; - if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || - vd->vdev_offline) { + if ((vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || + vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) { /* load progress for reporting, but don't resume */ 
VERIFY0(vdev_initialize_load(vd)); } else if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) && !vd->vdev_top->vdev_removing && + !vd->vdev_top->vdev_rz_expanding && vd->vdev_initialize_thread == NULL) { vdev_initialize(vd); } @@ -758,15 +819,14 @@ vdev_initialize_restart(vdev_t *vd) } EXPORT_SYMBOL(vdev_initialize); +EXPORT_SYMBOL(vdev_uninitialize); EXPORT_SYMBOL(vdev_initialize_stop); EXPORT_SYMBOL(vdev_initialize_stop_all); EXPORT_SYMBOL(vdev_initialize_stop_wait); EXPORT_SYMBOL(vdev_initialize_restart); -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, U64, ZMOD_RW, "Value written during zpool initialize"); -ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, U64, ZMOD_RW, "Size in bytes of writes by zpool initialize"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c index daf53f0a0c8b..ed592514fded 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_label.c +++ b/sys/contrib/openzfs/module/zfs/vdev_label.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -142,6 +142,7 @@ #include <sys/zap.h> #include <sys/vdev.h> #include <sys/vdev_impl.h> +#include <sys/vdev_raidz.h> #include <sys/vdev_draid.h> #include <sys/uberblock_impl.h> #include <sys/metaslab.h> @@ -423,6 +424,13 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, sizeof (pcs) / sizeof (uint64_t)); } + + pool_raidz_expand_stat_t pres; + if (spa_raidz_expand_get_stats(spa, &pres) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t *)&pres, + sizeof (pres) / sizeof (uint64_t)); + } } static void @@ -486,6 +494,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_isspare) fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); + if (flags & VDEV_CONFIG_L2CACHE) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && vd == vd->vdev_top) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, @@ -496,7 +507,16 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); - if (vd->vdev_removing) { + if (vd->vdev_noalloc) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, + vd->vdev_noalloc); + } + + /* + * Slog devices are removed synchronously so don't + * persist the vdev_removing flag to the label. 
+ */ + if (vd->vdev_removing && !vd->vdev_islog) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, vd->vdev_removing); } @@ -564,6 +584,12 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vd->vdev_top_zap); } + if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap != 0 && + spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP, + vd->vdev_root_zap); + } + if (vd->vdev_resilver_deferred) { ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(spa->spa_resilver_deferred); @@ -640,35 +666,22 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (!vd->vdev_ops->vdev_op_leaf) { nvlist_t **child; - int c, idx; + uint64_t c; ASSERT(!vd->vdev_ishole); child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); - for (c = 0, idx = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - - /* - * If we're generating an nvlist of removing - * vdevs then skip over any device which is - * not being removed. - */ - if ((flags & VDEV_CONFIG_REMOVING) && - !cvd->vdev_removing) - continue; - - child[idx++] = vdev_config_generate(spa, cvd, + for (c = 0; c < vd->vdev_children; c++) { + child[c] = vdev_config_generate(spa, vd->vdev_child[c], getstats, flags); } - if (idx) { - fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - child, idx); - } + fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + (const nvlist_t * const *)child, vd->vdev_children); - for (c = 0; c < idx; c++) + for (c = 0; c < vd->vdev_children; c++) nvlist_free(child[c]); kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); @@ -1018,6 +1031,10 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) int error; uint64_t spare_guid = 0, l2cache_guid = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + boolean_t reason_spare = (reason == VDEV_LABEL_SPARE || (reason == + VDEV_LABEL_REMOVE && vd->vdev_isspare)); + boolean_t reason_l2cache = (reason == VDEV_LABEL_L2CACHE || (reason == + VDEV_LABEL_REMOVE && vd->vdev_isl2cache)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); @@ -1103,36 +1120,58 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * really part of an active pool just yet. The labels will * be written again with a meaningful txg by spa_sync(). */ - if (reason == VDEV_LABEL_SPARE || - (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) { + if (reason_spare || reason_l2cache) { /* - * For inactive hot spares, we generate a special label that - * identifies as a mutually shared hot spare. We write the - * label if we are adding a hot spare, or if we are removing an - * active hot spare (in which case we want to revert the - * labels). + * For inactive hot spares and level 2 ARC devices, we generate + * a special label that identifies as a mutually shared hot + * spare or l2cache device. We write the label in case of + * addition or removal of hot spare or l2cache vdev (in which + * case we want to revert the labels). */ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, - POOL_STATE_SPARE) == 0); + reason_spare ? POOL_STATE_SPARE : POOL_STATE_L2CACHE) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); - } else if (reason == VDEV_LABEL_L2CACHE || - (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) { + /* - * For level 2 ARC devices, add a special label. 
+ * This is merely to facilitate reporting the ashift of the + * cache device through zdb. The actual retrieval of the + * ashift (in vdev_alloc()) uses the nvlist + * spa->spa_l2cache->sav_config (populated in + * spa_ld_open_aux_vdevs()). */ - VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if (reason_l2cache) { + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_ASHIFT, + vd->vdev_ashift) == 0); + } - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, - spa_version(spa)) == 0); - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, - POOL_STATE_L2CACHE) == 0); - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, - vd->vdev_guid) == 0); + /* + * Add path information to help find it during pool import + */ + if (vd->vdev_path != NULL) { + VERIFY(nvlist_add_string(label, ZPOOL_CONFIG_PATH, + vd->vdev_path) == 0); + } + if (vd->vdev_devid != NULL) { + VERIFY(nvlist_add_string(label, ZPOOL_CONFIG_DEVID, + vd->vdev_devid) == 0); + } + if (vd->vdev_physpath != NULL) { + VERIFY(nvlist_add_string(label, ZPOOL_CONFIG_PHYS_PATH, + vd->vdev_physpath) == 0); + } + + /* + * When spare or l2cache (aux) vdev is added during pool + * creation, spa->spa_uberblock is not written until this + * point. Write it on next config sync. + */ + if (uberblock_verify(&spa->spa_uberblock)) + spa->spa_aux_sync_uber = B_TRUE; } else { uint64_t txg = 0ULL; @@ -1164,8 +1203,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * Initialize uberblock template. */ ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE); - abd_zero(ub_abd, VDEV_UBERBLOCK_RING); abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t)); + abd_zero_off(ub_abd, sizeof (uberblock_t), + VDEV_UBERBLOCK_RING - sizeof (uberblock_t)); ub = abd_to_buf(ub_abd); ub->ub_txg = 0; @@ -1320,7 +1360,7 @@ vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv) nvlist_free(config); break; } - fallthrough; + zfs_fallthrough; default: /* Check for FreeBSD zfs bootonce command string */ buf = abd_to_buf(abd); @@ -1355,6 +1395,7 @@ vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env) int error; size_t nvsize; char *nvbuf; + const char *tmp; error = nvlist_size(env, &nvsize, NV_ENCODE_XDR); if (error != 0) @@ -1394,8 +1435,8 @@ vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env) bootenv->vbe_version = fnvlist_lookup_uint64(env, BOOTENV_VERSION); switch (bootenv->vbe_version) { case VB_RAW: - if (nvlist_lookup_string(env, GRUB_ENVMAP, &nvbuf) == 0) { - (void) strlcpy(bootenv->vbe_bootenv, nvbuf, nvsize); + if (nvlist_lookup_string(env, GRUB_ENVMAP, &tmp) == 0) { + (void) strlcpy(bootenv->vbe_bootenv, tmp, nvsize); } error = 0; break; @@ -1488,7 +1529,8 @@ vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) } struct ubl_cbdata { - uberblock_t *ubl_ubbest; /* Best uberblock */ + uberblock_t ubl_latest; /* Most recent uberblock */ + uberblock_t *ubl_ubbest; /* Best uberblock (w/r/t max_txg) */ vdev_t *ubl_vd; /* vdev associated with the above */ }; @@ -1505,6 +1547,9 @@ vdev_uberblock_load_done(zio_t *zio) if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); + if (vdev_uberblock_compare(ub, &cbp->ubl_latest) > 0) { + cbp->ubl_latest = *ub; + } if (ub->ub_txg <= spa->spa_load_max_txg && vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { /* @@ -1561,11 +1606,11 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) ASSERT(ub); ASSERT(config); - bzero(ub, sizeof (uberblock_t)); + memset(ub, 0, sizeof (uberblock_t)); + memset(&cb, 0, sizeof (cb)); *config = 
NULL; cb.ubl_ubbest = ub; - cb.ubl_vd = NULL; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); zio = zio_root(spa, NULL, &cb, flags); @@ -1582,6 +1627,22 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. " "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg); + if (ub->ub_raidz_reflow_info != + cb.ubl_latest.ub_raidz_reflow_info) { + vdev_dbgmsg(cb.ubl_vd, + "spa=%s best uberblock (txg=%llu info=0x%llx) " + "has different raidz_reflow_info than latest " + "uberblock (txg=%llu info=0x%llx)", + spa->spa_name, + (u_longlong_t)ub->ub_txg, + (u_longlong_t)ub->ub_raidz_reflow_info, + (u_longlong_t)cb.ubl_latest.ub_txg, + (u_longlong_t)cb.ubl_latest.ub_raidz_reflow_info); + memset(ub, 0, sizeof (uberblock_t)); + spa_config_exit(spa, SCL_ALL, FTAG); + return; + } + *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); if (*config == NULL && spa->spa_extreme_rewind) { vdev_dbgmsg(cb.ubl_vd, "failed to read label config. " @@ -1703,13 +1764,29 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, vd->vdev_copy_uberblocks = B_FALSE; } + /* + * We chose a slot based on the txg. If this uberblock has a special + * RAIDZ expansion state, then it is essentially an update of the + * current uberblock (it has the same txg). However, the current + * state is committed, so we want to write it to a different slot. If + * we overwrote the same slot, and we lose power during the uberblock + * write, and the disk does not do single-sector overwrites + * atomically (even though it is required to - i.e. we should see + * either the old or the new uberblock), then we could lose this + * txg's uberblock. Rewinding to the previous txg's uberblock may not + * be possible because RAIDZ expansion may have already overwritten + * some of the data, so we need the progress indicator in the + * uberblock. + */ int m = spa_multihost(vd->vdev_spa) ? 
MMP_BLOCKS_PER_LABEL : 0; - int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m); + int n = (ub->ub_txg - (RRSS_GET_STATE(ub) == RRSS_SCRATCH_VALID)) % + (VDEV_UBERBLOCK_COUNT(vd) - m); /* Copy the uberblock_t into the ABD */ abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); - abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); + abd_zero_off(ub_abd, sizeof (uberblock_t), + VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t)); for (int l = 0; l < VDEV_LABELS; l++) vdev_label_write(zio, vd, l, ub_abd, @@ -1721,7 +1798,7 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, } /* Sync the uberblocks to all vdevs in svd[] */ -static int +int vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) { spa_t *spa = svd[0]->vdev_spa; @@ -1733,6 +1810,16 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) for (int v = 0; v < svdcount; v++) vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags); + if (spa->spa_aux_sync_uber) { + for (int v = 0; v < spa->spa_spares.sav_count; v++) { + vdev_uberblock_sync(zio, &good_writes, ub, + spa->spa_spares.sav_vdevs[v], flags); + } + for (int v = 0; v < spa->spa_l2cache.sav_count; v++) { + vdev_uberblock_sync(zio, &good_writes, ub, + spa->spa_l2cache.sav_vdevs[v], flags); + } + } (void) zio_wait(zio); /* @@ -1747,6 +1834,19 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) zio_flush(zio, svd[v]); } } + if (spa->spa_aux_sync_uber) { + spa->spa_aux_sync_uber = B_FALSE; + for (int v = 0; v < spa->spa_spares.sav_count; v++) { + if (vdev_writeable(spa->spa_spares.sav_vdevs[v])) { + zio_flush(zio, spa->spa_spares.sav_vdevs[v]); + } + } + for (int v = 0; v < spa->spa_l2cache.sav_count; v++) { + if (vdev_writeable(spa->spa_l2cache.sav_vdevs[v])) { + zio_flush(zio, spa->spa_l2cache.sav_vdevs[v]); + } + } + } (void) zio_wait(zio); @@ -1927,6 +2027,7 @@ retry: /* * If this isn't a resync due to I/O errors, * and nothing changed in this transaction group, + * and multihost protection isn't enabled, * and the vdev configuration hasn't changed, * then there's nothing to do. */ @@ -1934,7 +2035,8 @@ retry: boolean_t changed = uberblock_update(ub, spa->spa_root_vdev, txg, spa->spa_mmp.mmp_delay); - if (!changed && list_is_empty(&spa->spa_config_dirty_list)) + if (!changed && list_is_empty(&spa->spa_config_dirty_list) && + !spa_multihost(spa)) return (0); } diff --git a/sys/contrib/openzfs/module/zfs/vdev_mirror.c b/sys/contrib/openzfs/module/zfs/vdev_mirror.c index 5eb331046953..102eacb03349 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_mirror.c +++ b/sys/contrib/openzfs/module/zfs/vdev_mirror.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
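The slot computation just above biases the uberblock ring index by one when the RAIDZ-expansion scratch state is valid, so the reflow update of a txg lands in a different slot than the committed copy of the same txg. A small sketch of that arithmetic follows; UB_COUNT and uberblock_slot() are stand-ins chosen here (VDEV_UBERBLOCK_COUNT(vd) depends on the vdev), and scratch_valid stands in for RRSS_GET_STATE(ub) == RRSS_SCRATCH_VALID.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define UB_COUNT		32	/* stand-in for VDEV_UBERBLOCK_COUNT(vd) */
#define MMP_BLOCKS_PER_LABEL	1

/*
 * Pick the uberblock ring slot for a txg.  When the RAIDZ expansion
 * scratch state is valid, the txg is biased by one so the rewrite of the
 * same txg does not overwrite the committed uberblock's slot.
 */
static int
uberblock_slot(uint64_t txg, bool multihost, bool scratch_valid)
{
	int m = multihost ? MMP_BLOCKS_PER_LABEL : 0;

	return ((int)((txg - (scratch_valid ? 1 : 0)) % (UB_COUNT - m)));
}

int
main(void)
{
	uint64_t txg = 1000;

	/* The committed uberblock and its reflow update go to adjacent slots. */
	printf("committed slot %d\n", uberblock_slot(txg, false, false));
	printf("reflow slot    %d\n", uberblock_slot(txg, false, true));
	return (0);
}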
* @@ -35,6 +35,7 @@ #include <sys/vdev_impl.h> #include <sys/vdev_draid.h> #include <sys/zio.h> +#include <sys/zio_checksum.h> #include <sys/abd.h> #include <sys/fs/zfs.h> @@ -102,6 +103,7 @@ vdev_mirror_stat_fini(void) */ typedef struct mirror_child { vdev_t *mc_vd; + abd_t *mc_abd; uint64_t mc_offset; int mc_error; int mc_load; @@ -121,7 +123,7 @@ typedef struct mirror_map { mirror_child_t mm_child[]; } mirror_map_t; -static int vdev_mirror_shift = 21; +static const int vdev_mirror_shift = 21; /* * The load configuration settings below are tuned by default for @@ -407,8 +409,14 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); - *physical_ashift = MAX(*physical_ashift, - cvd->vdev_physical_ashift); + } + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error) + continue; + *physical_ashift = vdev_best_ashift(*logical_ashift, + *physical_ashift, cvd->vdev_physical_ashift); } if (numerrors == vd->vdev_children) { @@ -439,32 +447,6 @@ vdev_mirror_child_done(zio_t *zio) mc->mc_skipped = 0; } -static void -vdev_mirror_scrub_done(zio_t *zio) -{ - mirror_child_t *mc = zio->io_private; - - if (zio->io_error == 0) { - zio_t *pio; - zio_link_t *zl = NULL; - - mutex_enter(&zio->io_lock); - while ((pio = zio_walk_parents(zio, &zl)) != NULL) { - mutex_enter(&pio->io_lock); - ASSERT3U(zio->io_size, >=, pio->io_size); - abd_copy(pio->io_abd, zio->io_abd, pio->io_size); - mutex_exit(&pio->io_lock); - } - mutex_exit(&zio->io_lock); - } - - abd_free(zio->io_abd); - - mc->mc_error = zio->io_error; - mc->mc_tried = 1; - mc->mc_skipped = 0; -} - /* * Check the other, lower-index DVAs to see if they're on the same * vdev as the child we picked. If they are, use them since they @@ -549,7 +531,7 @@ vdev_mirror_child_select(zio_t *zio) uint64_t txg = zio->io_txg; int c, lowest_load; - ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); + ASSERT(zio->io_bp == NULL || BP_GET_BIRTH(zio->io_bp) == txg); lowest_load = INT_MAX; mm->mm_preferred_cnt = 0; @@ -637,16 +619,15 @@ vdev_mirror_io_start(zio_t *zio) } if (zio->io_type == ZIO_TYPE_READ) { - if (zio->io_bp != NULL && - (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) { + if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) { /* - * For scrubbing reads (if we can verify the - * checksum here, as indicated by io_bp being - * non-NULL) we need to allocate a read buffer for - * each child and issue reads to all children. If - * any child succeeds, it will copy its data into - * zio->io_data in vdev_mirror_scrub_done. + * For scrubbing reads we need to issue reads to all + * children. One child can reuse parent buffer, but + * for others we have to allocate separate ones to + * verify checksums if io_bp is non-NULL, or compare + * them in vdev_mirror_io_done() otherwise. */ + boolean_t first = B_TRUE; for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; @@ -658,12 +639,15 @@ vdev_mirror_io_start(zio_t *zio) continue; } - zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, + mc->mc_abd = first ? 
zio->io_abd : abd_alloc_sametype(zio->io_abd, - zio->io_size), zio->io_size, - zio->io_type, zio->io_priority, 0, - vdev_mirror_scrub_done, mc)); + zio->io_size); + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + mc->mc_vd, mc->mc_offset, mc->mc_abd, + zio->io_size, zio->io_type, + zio->io_priority, 0, + vdev_mirror_child_done, mc)); + first = B_FALSE; } zio_execute(zio); return; @@ -731,6 +715,7 @@ vdev_mirror_io_done(zio_t *zio) int c; int good_copies = 0; int unexpected_errors = 0; + int last_good_copy = -1; if (mm == NULL) return; @@ -742,6 +727,7 @@ vdev_mirror_io_done(zio_t *zio) if (!mc->mc_skipped) unexpected_errors++; } else if (mc->mc_tried) { + last_good_copy = c; good_copies++; } } @@ -755,7 +741,6 @@ vdev_mirror_io_done(zio_t *zio) * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. */ - /* XXPOLICY */ if (good_copies != mm->mm_children) { /* * Always require at least one good copy. @@ -782,7 +767,6 @@ vdev_mirror_io_done(zio_t *zio) /* * If we don't have a good copy yet, keep trying other children. */ - /* XXPOLICY */ if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { ASSERT(c >= 0 && c < mm->mm_children); mc = &mm->mm_child[c]; @@ -794,7 +778,80 @@ vdev_mirror_io_done(zio_t *zio) return; } - /* XXPOLICY */ + if (zio->io_flags & ZIO_FLAG_SCRUB && !mm->mm_resilvering) { + abd_t *best_abd = NULL; + if (last_good_copy >= 0) + best_abd = mm->mm_child[last_good_copy].mc_abd; + + /* + * If we're scrubbing but don't have a BP available (because + * this vdev is under a raidz or draid vdev) then the best we + * can do is compare all of the copies read. If they're not + * identical then return a checksum error and the most likely + * correct data. The raidz code will issue a repair I/O if + * possible. + */ + if (zio->io_bp == NULL) { + ASSERT(zio->io_vd->vdev_ops == &vdev_replacing_ops || + zio->io_vd->vdev_ops == &vdev_spare_ops); + + abd_t *pref_abd = NULL; + for (c = 0; c < last_good_copy; c++) { + mc = &mm->mm_child[c]; + if (mc->mc_error || !mc->mc_tried) + continue; + + if (abd_cmp(mc->mc_abd, best_abd) != 0) + zio->io_error = SET_ERROR(ECKSUM); + + /* + * The distributed spare is always prefered + * by vdev_mirror_child_select() so it's + * considered to be the best candidate. + */ + if (pref_abd == NULL && + mc->mc_vd->vdev_ops == + &vdev_draid_spare_ops) + pref_abd = mc->mc_abd; + + /* + * In the absence of a preferred copy, use + * the parent pointer to avoid a memory copy. + */ + if (mc->mc_abd == zio->io_abd) + best_abd = mc->mc_abd; + } + if (pref_abd) + best_abd = pref_abd; + } else { + + /* + * If we have a BP available, then checksums are + * already verified and we just need a buffer + * with valid data, preferring parent one to + * avoid a memory copy. 
+ */ + for (c = 0; c < last_good_copy; c++) { + mc = &mm->mm_child[c]; + if (mc->mc_error || !mc->mc_tried) + continue; + if (mc->mc_abd == zio->io_abd) { + best_abd = mc->mc_abd; + break; + } + } + } + + if (best_abd && best_abd != zio->io_abd) + abd_copy(zio->io_abd, best_abd, zio->io_size); + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + if (mc->mc_abd != zio->io_abd) + abd_free(mc->mc_abd); + mc->mc_abd = NULL; + } + } + if (good_copies == 0) { zio->io_error = vdev_mirror_worst_error(mm); ASSERT(zio->io_error != 0); @@ -880,6 +937,8 @@ static uint64_t vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize, uint64_t max_segment) { + (void) start; + uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE); @@ -961,20 +1020,21 @@ vdev_ops_t vdev_spare_ops = { .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW, - "Rotating media load increment for non-seeking I/O's"); + "Rotating media load increment for non-seeking I/Os"); -ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT, ZMOD_RW, - "Rotating media load increment for seeking I/O's"); +ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT, + ZMOD_RW, "Rotating media load increment for seeking I/Os"); -ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT, ZMOD_RW, +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT, + ZMOD_RW, "Offset in bytes from the last I/O which triggers " "a reduced rotating media seek increment"); +/* END CSTYLED */ -ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT, ZMOD_RW, - "Non-rotating media load increment for non-seeking I/O's"); +ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT, + ZMOD_RW, "Non-rotating media load increment for non-seeking I/Os"); -ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT, ZMOD_RW, - "Non-rotating media load increment for seeking I/O's"); -/* END CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT, + ZMOD_RW, "Non-rotating media load increment for seeking I/Os"); diff --git a/sys/contrib/openzfs/module/zfs/vdev_missing.c b/sys/contrib/openzfs/module/zfs/vdev_missing.c index e9145fd012d7..d3580882c3e0 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_missing.c +++ b/sys/contrib/openzfs/module/zfs/vdev_missing.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -42,7 +42,6 @@ #include <sys/fs/zfs.h> #include <sys/zio.h> -/* ARGSUSED */ static int vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *ashift, uint64_t *pshift) @@ -53,6 +52,7 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we * will fail the GUID sum check before ever trying to open the pool. 
*/ + (void) vd; *psize = 0; *max_psize = 0; *ashift = 0; @@ -60,13 +60,12 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, return (0); } -/* ARGSUSED */ static void vdev_missing_close(vdev_t *vd) { + (void) vd; } -/* ARGSUSED */ static void vdev_missing_io_start(zio_t *zio) { @@ -74,10 +73,10 @@ vdev_missing_io_start(zio_t *zio) zio_execute(zio); } -/* ARGSUSED */ static void vdev_missing_io_done(zio_t *zio) { + (void) zio; } vdev_ops_t vdev_missing_ops = { diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c index cc5b15b8c028..092b3f375be0 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_queue.c +++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -121,7 +121,7 @@ * The maximum number of i/os active to each device. Ideally, this will be >= * the sum of each queue's max_active. */ -uint32_t zfs_vdev_max_active = 1000; +uint_t zfs_vdev_max_active = 1000; /* * Per-queue limits on the number of i/os active to each device. If the @@ -141,24 +141,24 @@ uint32_t zfs_vdev_max_active = 1000; * more quickly, but reads and writes to have higher latency and lower * throughput. */ -uint32_t zfs_vdev_sync_read_min_active = 10; -uint32_t zfs_vdev_sync_read_max_active = 10; -uint32_t zfs_vdev_sync_write_min_active = 10; -uint32_t zfs_vdev_sync_write_max_active = 10; -uint32_t zfs_vdev_async_read_min_active = 1; -uint32_t zfs_vdev_async_read_max_active = 3; -uint32_t zfs_vdev_async_write_min_active = 2; -uint32_t zfs_vdev_async_write_max_active = 10; -uint32_t zfs_vdev_scrub_min_active = 1; -uint32_t zfs_vdev_scrub_max_active = 3; -uint32_t zfs_vdev_removal_min_active = 1; -uint32_t zfs_vdev_removal_max_active = 2; -uint32_t zfs_vdev_initializing_min_active = 1; -uint32_t zfs_vdev_initializing_max_active = 1; -uint32_t zfs_vdev_trim_min_active = 1; -uint32_t zfs_vdev_trim_max_active = 2; -uint32_t zfs_vdev_rebuild_min_active = 1; -uint32_t zfs_vdev_rebuild_max_active = 3; +static uint_t zfs_vdev_sync_read_min_active = 10; +static uint_t zfs_vdev_sync_read_max_active = 10; +static uint_t zfs_vdev_sync_write_min_active = 10; +static uint_t zfs_vdev_sync_write_max_active = 10; +static uint_t zfs_vdev_async_read_min_active = 1; +/* */ uint_t zfs_vdev_async_read_max_active = 3; +static uint_t zfs_vdev_async_write_min_active = 2; +/* */ uint_t zfs_vdev_async_write_max_active = 10; +static uint_t zfs_vdev_scrub_min_active = 1; +static uint_t zfs_vdev_scrub_max_active = 3; +static uint_t zfs_vdev_removal_min_active = 1; +static uint_t zfs_vdev_removal_max_active = 2; +static uint_t zfs_vdev_initializing_min_active = 1; +static uint_t zfs_vdev_initializing_max_active = 1; +static uint_t zfs_vdev_trim_min_active = 1; +static uint_t zfs_vdev_trim_max_active = 2; +static uint_t zfs_vdev_rebuild_min_active = 1; +static uint_t zfs_vdev_rebuild_max_active = 3; /* * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent @@ -167,8 +167,8 @@ uint32_t zfs_vdev_rebuild_max_active = 3; * zfs_vdev_async_write_max_active. The value is linearly interpolated * between min and max. 
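The comment above describes how vdev_queue_max_async_writes() scales the active async write count with dirty data: below the min dirty percent use min_active, above the max dirty percent use max_active, and interpolate linearly in between. A userland sketch of just that interpolation follows; the function name and parameters here are illustrative, and any additional special casing in the kernel function is omitted.

#include <stdint.h>
#include <stdio.h>

static unsigned int min_active = 2;      /* zfs_vdev_async_write_min_active */
static unsigned int max_active = 10;     /* zfs_vdev_async_write_max_active */
static unsigned int min_dirty_pct = 30;  /* ..._active_min_dirty_percent */
static unsigned int max_dirty_pct = 60;  /* ..._active_max_dirty_percent */

/*
 * Clamp to min/max outside the dirty-percent window, interpolate linearly
 * inside it.
 */
static unsigned int
async_write_max_active(uint64_t dirty, uint64_t dirty_data_max)
{
	uint64_t min_bytes = dirty_data_max * min_dirty_pct / 100;
	uint64_t max_bytes = dirty_data_max * max_dirty_pct / 100;

	if (dirty < min_bytes)
		return (min_active);
	if (dirty > max_bytes)
		return (max_active);
	return ((unsigned int)((dirty - min_bytes) * (max_active - min_active) /
	    (max_bytes - min_bytes)) + min_active);
}

int
main(void)
{
	uint64_t dirty_data_max = 4ULL << 30;	/* pretend zfs_dirty_data_max = 4 GiB */

	for (unsigned int pct = 0; pct <= 100; pct += 15) {
		uint64_t dirty = dirty_data_max * pct / 100;
		printf("%3u%% dirty -> %u async writes\n", pct,
		    async_write_max_active(dirty, dirty_data_max));
	}
	return (0);
}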
*/ -int zfs_vdev_async_write_active_min_dirty_percent = 30; -int zfs_vdev_async_write_active_max_dirty_percent = 60; +uint_t zfs_vdev_async_write_active_min_dirty_percent = 30; +uint_t zfs_vdev_async_write_active_max_dirty_percent = 60; /* * For non-interactive I/O (scrub, resilver, removal, initialize and rebuild), @@ -178,7 +178,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60; * interactive I/O, then the vdev is considered to be "idle", and the number * of concurrently-active non-interactive I/O's is increased to *_max_active. */ -uint_t zfs_vdev_nia_delay = 5; +static uint_t zfs_vdev_nia_delay = 5; /* * Some HDDs tend to prioritize sequential I/O so high that concurrent @@ -190,7 +190,7 @@ uint_t zfs_vdev_nia_delay = 5; * I/Os. This enforced wait ensures the HDD services the interactive I/O * within a reasonable amount of time. */ -uint_t zfs_vdev_nia_credit = 5; +static uint_t zfs_vdev_nia_credit = 5; /* * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. @@ -198,10 +198,10 @@ uint_t zfs_vdev_nia_credit = 5; * we include spans of optional I/Os to aid aggregation at the disk even when * they aren't able to help us aggregate at this level. */ -int zfs_vdev_aggregation_limit = 1 << 20; -int zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE; -int zfs_vdev_read_gap_limit = 32 << 10; -int zfs_vdev_write_gap_limit = 4 << 10; +static uint_t zfs_vdev_aggregation_limit = 1 << 20; +static uint_t zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE; +static uint_t zfs_vdev_read_gap_limit = 32 << 10; +static uint_t zfs_vdev_write_gap_limit = 4 << 10; /* * Define the queue depth percentage for each top-level. This percentage is @@ -214,9 +214,9 @@ int zfs_vdev_write_gap_limit = 4 << 10; * to 30 allocations per device. */ #ifdef _KERNEL -int zfs_vdev_queue_depth_pct = 1000; +uint_t zfs_vdev_queue_depth_pct = 1000; #else -int zfs_vdev_queue_depth_pct = 300; +uint_t zfs_vdev_queue_depth_pct = 300; #endif /* @@ -226,14 +226,7 @@ int zfs_vdev_queue_depth_pct = 300; * we assume that the average allocation size is 4k, so we need the queue depth * to be 32 per allocator to get good aggregation of sequential writes. */ -int zfs_vdev_def_queue_depth = 32; - -/* - * Allow TRIM I/Os to be aggregated. This should normally not be needed since - * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted - * by the TRIM code in zfs_trim.c. 
- */ -int zfs_vdev_aggregate_trim = 0; +uint_t zfs_vdev_def_queue_depth = 32; static int vdev_queue_offset_compare(const void *x1, const void *x2) @@ -249,39 +242,64 @@ vdev_queue_offset_compare(const void *x1, const void *x2) return (TREE_PCMP(z1, z2)); } -static inline avl_tree_t * -vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) -{ - return (&vq->vq_class[p].vqc_queued_tree); -} - -static inline avl_tree_t * -vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) -{ - ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM); - if (t == ZIO_TYPE_READ) - return (&vq->vq_read_offset_tree); - else if (t == ZIO_TYPE_WRITE) - return (&vq->vq_write_offset_tree); - else - return (&vq->vq_trim_offset_tree); -} +#define VDQ_T_SHIFT 29 static int -vdev_queue_timestamp_compare(const void *x1, const void *x2) +vdev_queue_to_compare(const void *x1, const void *x2) { const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; - int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp); + int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT, + z2->io_timestamp >> VDQ_T_SHIFT); + int ocmp = TREE_CMP(z1->io_offset, z2->io_offset); + int cmp = tcmp ? tcmp : ocmp; - if (likely(cmp)) + if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE))) return (cmp); return (TREE_PCMP(z1, z2)); } -static int +static inline boolean_t +vdev_queue_class_fifo(zio_priority_t p) +{ + return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE || + p == ZIO_PRIORITY_TRIM); +} + +static void +vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio) +{ + zio_priority_t p = zio->io_priority; + vq->vq_cqueued |= 1U << p; + if (vdev_queue_class_fifo(p)) { + list_insert_tail(&vq->vq_class[p].vqc_list, zio); + vq->vq_class[p].vqc_list_numnodes++; + } + else + avl_add(&vq->vq_class[p].vqc_tree, zio); +} + +static void +vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio) +{ + zio_priority_t p = zio->io_priority; + uint32_t empty; + if (vdev_queue_class_fifo(p)) { + list_t *list = &vq->vq_class[p].vqc_list; + list_remove(list, zio); + empty = list_is_empty(list); + vq->vq_class[p].vqc_list_numnodes--; + } else { + avl_tree_t *tree = &vq->vq_class[p].vqc_tree; + avl_remove(tree, zio); + empty = avl_is_empty(tree); + } + vq->vq_cqueued &= ~(empty << p); +} + +static uint_t vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { @@ -313,10 +331,10 @@ vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p) } } -static int +static uint_t vdev_queue_max_async_writes(spa_t *spa) { - int writes; + uint_t writes; uint64_t dirty = 0; dsl_pool_t *dp = spa_get_dsl(spa); uint64_t min_bytes = zfs_dirty_data_max * @@ -359,8 +377,8 @@ vdev_queue_max_async_writes(spa_t *spa) return (writes); } -static int -vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) +static uint_t +vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: @@ -370,7 +388,7 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_max_active); case ZIO_PRIORITY_ASYNC_WRITE: - return (vdev_queue_max_async_writes(spa)); + return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa)); case ZIO_PRIORITY_SCRUB: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, @@ -408,16 +426,16 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) } /* - * Return the i/o class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if + * Return the i/o class to 
issue from, or ZIO_PRIORITY_NUM_QUEUEABLE if * there is no eligible class. */ static zio_priority_t vdev_queue_class_to_issue(vdev_queue_t *vq) { - spa_t *spa = vq->vq_vdev->vdev_spa; - zio_priority_t p, n; + uint32_t cq = vq->vq_cqueued; + zio_priority_t p, p1; - if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) + if (cq == 0 || vq->vq_active >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); /* @@ -425,14 +443,18 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) * Do round-robin to reduce starvation due to zfs_vdev_max_active * and vq_nia_credit limits. */ - for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) { - p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE; - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_min_active(vq, p)) { - vq->vq_last_prio = p; - return (p); - } + p1 = vq->vq_last_prio + 1; + if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE) + p1 = 0; + for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_min_active(vq, p)) + goto found; + } + for (p = 0; p < p1; p++) { + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_min_active(vq, p)) + goto found; } /* @@ -440,16 +462,14 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) * maximum # outstanding i/os. */ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_max_active(spa, vq, p)) { - vq->vq_last_prio = p; - return (p); - } + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_max_active(vq, p)) + break; } - /* No eligible queued i/os */ - return (ZIO_PRIORITY_NUM_QUEUEABLE); +found: + vq->vq_last_prio = p; + return (p); } void @@ -458,42 +478,30 @@ vdev_queue_init(vdev_t *vd) vdev_queue_t *vq = &vd->vdev_queue; zio_priority_t p; - mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); vq->vq_vdev = vd; - taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent); - - avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - int (*compfn) (const void *, const void *); - - /* - * The synchronous/trim i/o queues are dispatched in FIFO rather - * than LBA order. This provides more consistent latency for - * these i/os. 
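The replacement comparator earlier in this hunk (vdev_queue_to_compare, with VDQ_T_SHIFT = 29) orders the non-FIFO classes first by a coarse timestamp bucket and then by offset, so LBA elevator ordering only applies among I/Os queued within roughly the same half-second window and older I/Os are not starved. A sketch of that bucketing follows, assuming nanosecond timestamps as returned by gethrtime(); the struct and function names are illustrative and the pointer tie-break is omitted.

#include <stdint.h>
#include <stdio.h>

#define VDQ_T_SHIFT	29	/* 2^29 ns is roughly 0.54 s per bucket */

struct io {
	uint64_t timestamp;	/* nanoseconds, as from gethrtime() */
	uint64_t offset;	/* byte offset on the vdev */
};

/*
 * Compare two queued I/Os: coarse time bucket first (prevents starvation),
 * then offset (elevator order within a bucket).
 */
static int
io_compare(const struct io *a, const struct io *b)
{
	uint64_t ta = a->timestamp >> VDQ_T_SHIFT;
	uint64_t tb = b->timestamp >> VDQ_T_SHIFT;

	if (ta != tb)
		return (ta < tb ? -1 : 1);
	if (a->offset != b->offset)
		return (a->offset < b->offset ? -1 : 1);
	return (0);
}

int
main(void)
{
	struct io old_far = { .timestamp = 100000000ULL, .offset = 900 << 20 };
	struct io new_near = { .timestamp = 2000000000ULL, .offset = 1 << 20 };

	/* The older I/O sorts first even though its offset is much larger. */
	printf("%d\n", io_compare(&old_far, &new_near));	/* -1 */
	return (0);
}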
- */ - if (p == ZIO_PRIORITY_SYNC_READ || - p == ZIO_PRIORITY_SYNC_WRITE || - p == ZIO_PRIORITY_TRIM) { - compfn = vdev_queue_timestamp_compare; + if (vdev_queue_class_fifo(p)) { + list_create(&vq->vq_class[p].vqc_list, + sizeof (zio_t), + offsetof(struct zio, io_queue_node.l)); } else { - compfn = vdev_queue_offset_compare; + avl_create(&vq->vq_class[p].vqc_tree, + vdev_queue_to_compare, sizeof (zio_t), + offsetof(struct zio, io_queue_node.a)); } - avl_create(vdev_queue_class_tree(vq, p), compfn, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); } + avl_create(&vq->vq_read_offset_tree, + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); + avl_create(&vq->vq_write_offset_tree, + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); vq->vq_last_offset = 0; + list_create(&vq->vq_active_list, sizeof (struct zio), + offsetof(struct zio, io_queue_node.l)); + mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); } void @@ -501,30 +509,39 @@ vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; - for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) - avl_destroy(vdev_queue_class_tree(vq, p)); - avl_destroy(&vq->vq_active_tree); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM)); + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if (vdev_queue_class_fifo(p)) + list_destroy(&vq->vq_class[p].vqc_list); + else + avl_destroy(&vq->vq_class[p].vqc_tree); + } + avl_destroy(&vq->vq_read_offset_tree); + avl_destroy(&vq->vq_write_offset_tree); + list_destroy(&vq->vq_active_list); mutex_destroy(&vq->vq_lock); } static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); + zio->io_queue_state = ZIO_QS_QUEUED; + vdev_queue_class_add(vq, zio); + if (zio->io_type == ZIO_TYPE_READ) + avl_add(&vq->vq_read_offset_tree, zio); + else if (zio->io_type == ZIO_TYPE_WRITE) + avl_add(&vq->vq_write_offset_tree, zio); } static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); - avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); + vdev_queue_class_remove(vq, zio); + if (zio->io_type == ZIO_TYPE_READ) + avl_remove(&vq->vq_read_offset_tree, zio); + else if (zio->io_type == ZIO_TYPE_WRITE) + avl_remove(&vq->vq_write_offset_tree, zio); + zio->io_queue_state = ZIO_QS_NONE; } static boolean_t @@ -546,14 +563,16 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active++; + vq->vq_cactive[zio->io_priority]++; + vq->vq_active++; if (vdev_queue_is_interactive(zio->io_priority)) { if (++vq->vq_ia_active == 1) vq->vq_nia_credit = 1; } else if (vq->vq_ia_active > 0) { vq->vq_nia_credit--; } - avl_add(&vq->vq_active_tree, zio); + zio->io_queue_state = ZIO_QS_ACTIVE; + list_insert_tail(&vq->vq_active_list, zio); } static void @@ -561,7 +580,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active--; + 
vq->vq_cactive[zio->io_priority]--; + vq->vq_active--; if (vdev_queue_is_interactive(zio->io_priority)) { if (--vq->vq_ia_active == 0) vq->vq_nia_credit = 0; @@ -569,7 +589,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) vq->vq_nia_credit = zfs_vdev_nia_credit; } else if (vq->vq_ia_active == 0) vq->vq_nia_credit++; - avl_remove(&vq->vq_active_tree, zio); + list_remove(&vq->vq_active_list, zio); + zio->io_queue_state = ZIO_QS_NONE; } static void @@ -602,29 +623,28 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) uint64_t maxgap = 0; uint64_t size; uint64_t limit; - int maxblocksize; boolean_t stretch = B_FALSE; - avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); - enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; uint64_t next_offset; abd_t *abd; + avl_tree_t *t; + + /* + * TRIM aggregation should not be needed since code in zfs_trim.c can + * submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M). + */ + if (zio->io_type == ZIO_TYPE_TRIM) + return (NULL); + + if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) + return (NULL); - maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa); if (vq->vq_vdev->vdev_nonrot) limit = zfs_vdev_aggregation_limit_non_rotating; else limit = zfs_vdev_aggregation_limit; - limit = MAX(MIN(limit, maxblocksize), 0); - - if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0) - return (NULL); - - /* - * While TRIM commands could be aggregated based on offset this - * behavior is disabled until it's determined to be beneficial. - */ - if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim) + if (limit == 0) return (NULL); + limit = MIN(limit, SPA_MAXBLOCKSIZE); /* * I/Os to distributed spares are directly dispatched to the dRAID @@ -635,8 +655,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) first = last = zio; - if (zio->io_type == ZIO_TYPE_READ) + if (zio->io_type == ZIO_TYPE_READ) { maxgap = zfs_vdev_read_gap_limit; + t = &vq->vq_read_offset_tree; + } else { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + t = &vq->vq_write_offset_tree; + } /* * We can aggregate I/Os that are sufficiently adjacent and of @@ -657,6 +682,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) * Walk backwards through sufficiently contiguous I/Os * recording the last non-optional I/O. */ + zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; while ((dio = AVL_PREV(t, first)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(dio, last) <= limit && @@ -686,7 +712,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && (IO_SPAN(first, dio) <= limit || (dio->io_flags & ZIO_FLAG_OPTIONAL)) && - IO_SPAN(first, dio) <= maxblocksize && + IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE && IO_GAP(last, dio) <= maxgap && dio->io_type == zio->io_type) { last = dio; @@ -725,6 +751,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) * after our span is mandatory. 
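The aggregation code below walks backwards and forwards through offset-adjacent queued I/Os, bounded by the aggregation limit and, for reads, a permitted gap. The span and gap it tests are plain offset arithmetic; the sketch that follows assumes the conventional definitions (span from the start of the first I/O to the end of the last, gap between the end of one I/O and the start of the next) rather than quoting the IO_SPAN()/IO_GAP() macros, and the helper names and limits are parameters chosen for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct io {
	uint64_t offset;
	uint64_t size;
};

/* Assumed equivalents of the IO_SPAN()/IO_GAP() checks used by the queue. */
static uint64_t
io_span(const struct io *first, const struct io *last)
{
	return (last->offset + last->size - first->offset);
}

static uint64_t
io_gap(const struct io *prev, const struct io *next)
{
	return (next->offset >= prev->offset + prev->size ?
	    next->offset - (prev->offset + prev->size) : 0);
}

/* Would appending 'next' to an aggregate starting at 'first' still fit? */
static bool
can_extend(const struct io *first, const struct io *prev,
    const struct io *next, uint64_t limit, uint64_t maxgap)
{
	return (io_span(first, next) <= limit && io_gap(prev, next) <= maxgap);
}

int
main(void)
{
	struct io a = { .offset = 0,          .size = 128 << 10 };
	struct io b = { .offset = 160 << 10,  .size = 128 << 10 };	/* 32 KiB gap */

	/* Reads allow a 32 KiB gap by default, writes only 4 KiB. */
	printf("read aggregates:  %d\n", can_extend(&a, &a, &b, 1 << 20, 32 << 10));
	printf("write aggregates: %d\n", can_extend(&a, &a, &b, 1 << 20, 4 << 10));
	return (0);
}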
*/ dio = AVL_NEXT(t, last); + ASSERT3P(dio, !=, NULL); dio->io_flags &= ~ZIO_FLAG_OPTIONAL; } else { /* do not include the optional i/o */ @@ -739,7 +766,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) return (NULL); size = IO_SPAN(first, last); - ASSERT3U(size, <=, maxblocksize); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); abd = abd_alloc_gang(); if (abd == NULL) @@ -747,8 +774,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, abd, size, first->io_type, zio->io_priority, - flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, - vdev_queue_agg_io_done, NULL); + flags | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; nio = first; @@ -756,6 +782,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) do { dio = nio; nio = AVL_NEXT(t, dio); + ASSERT3P(dio, !=, NULL); zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); @@ -823,19 +850,30 @@ again: return (NULL); } - /* - * For LBA-ordered queues (async / scrub / initializing), issue the - * i/o which follows the most recently issued i/o in LBA (offset) order. - * - * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp. - */ - tree = vdev_queue_class_tree(vq, p); - vq->vq_io_search.io_timestamp = 0; - vq->vq_io_search.io_offset = vq->vq_last_offset - 1; - VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL); - zio = avl_nearest(tree, idx, AVL_AFTER); - if (zio == NULL) - zio = avl_first(tree); + if (vdev_queue_class_fifo(p)) { + zio = list_head(&vq->vq_class[p].vqc_list); + } else { + /* + * For LBA-ordered queues (async / scrub / initializing), + * issue the I/O which follows the most recently issued I/O + * in LBA (offset) order, but to avoid starvation only within + * the same 0.5 second interval as the first I/O. + */ + tree = &vq->vq_class[p].vqc_tree; + zio = aio = avl_first(tree); + if (zio->io_offset < vq->vq_last_offset) { + vq->vq_io_search.io_timestamp = zio->io_timestamp; + vq->vq_io_search.io_offset = vq->vq_last_offset; + zio = avl_find(tree, &vq->vq_io_search, &idx); + if (zio == NULL) { + zio = avl_nearest(tree, idx, AVL_AFTER); + if (zio == NULL || + (zio->io_timestamp >> VDQ_T_SHIFT) != + (aio->io_timestamp >> VDQ_T_SHIFT)) + zio = aio; + } + } + } ASSERT3U(zio->io_priority, ==, p); aio = vdev_queue_aggregate(vq, zio); @@ -905,7 +943,7 @@ vdev_queue_io(zio_t *zio) ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM); } - zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; + zio->io_flags |= ZIO_FLAG_DONT_QUEUE; zio->io_timestamp = gethrtime(); mutex_enter(&vq->vq_lock); @@ -966,7 +1004,6 @@ void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; - avl_tree_t *tree; /* * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio @@ -1001,12 +1038,11 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) * Otherwise, the zio is currently active and we cannot change its * priority. 
*/ - tree = vdev_queue_class_tree(vq, zio->io_priority); - if (avl_find(tree, zio, NULL) == zio) { - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); + if (zio->io_queue_state == ZIO_QS_QUEUED) { + vdev_queue_class_remove(vq, zio); zio->io_priority = priority; - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) { + vdev_queue_class_add(vq, zio); + } else if (zio->io_queue_state == ZIO_QS_NONE) { zio->io_priority = priority; } @@ -1019,10 +1055,10 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) * vq_lock mutex use here, instead we prefer to keep it lock free for * performance. */ -int +uint32_t vdev_queue_length(vdev_t *vd) { - return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); + return (vd->vdev_queue.vq_active); } uint64_t @@ -1031,91 +1067,99 @@ vdev_queue_last_offset(vdev_t *vd) return (vd->vdev_queue.vq_last_offset); } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, INT, ZMOD_RW, - "Max vdev I/O aggregation size"); +uint64_t +vdev_queue_class_length(vdev_t *vd, zio_priority_t p) +{ + vdev_queue_t *vq = &vd->vdev_queue; + if (vdev_queue_class_fifo(p)) + return (vq->vq_class[p].vqc_list_numnodes); + else + return (avl_numnodes(&vq->vq_class[p].vqc_tree)); +} -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, INT, ZMOD_RW, - "Max vdev I/O aggregation size for non-rotating media"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW, + "Max vdev I/O aggregation size"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, INT, ZMOD_RW, - "Allow TRIM I/O to be aggregated"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT, + ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW, "Aggregate read I/O over gap"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, write_gap_limit, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, write_gap_limit, UINT, ZMOD_RW, "Aggregate write I/O over gap"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_active, UINT, ZMOD_RW, "Maximum number of active I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_max_dirty_percent, INT, ZMOD_RW, - "Async write concurrency max threshold"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_max_dirty_percent, + UINT, ZMOD_RW, "Async write concurrency max threshold"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_min_dirty_percent, INT, ZMOD_RW, - "Async write concurrency min threshold"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_min_dirty_percent, + UINT, ZMOD_RW, "Async write concurrency min threshold"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_max_active, UINT, ZMOD_RW, "Max active async read I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_min_active, UINT, ZMOD_RW, "Min active async read I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_max_active, UINT, ZMOD_RW, "Max active async write I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_min_active, INT, ZMOD_RW, 
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_min_active, UINT, ZMOD_RW, "Min active async write I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_max_active, UINT, ZMOD_RW, "Max active initializing I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_min_active, UINT, ZMOD_RW, "Min active initializing I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_max_active, UINT, ZMOD_RW, "Max active removal I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_min_active, UINT, ZMOD_RW, "Min active removal I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_max_active, UINT, ZMOD_RW, "Max active scrub I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_min_active, UINT, ZMOD_RW, "Min active scrub I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_max_active, UINT, ZMOD_RW, "Max active sync read I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_min_active, UINT, ZMOD_RW, "Min active sync read I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_max_active, UINT, ZMOD_RW, "Max active sync write I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_min_active, UINT, ZMOD_RW, "Min active sync write I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, UINT, ZMOD_RW, "Max active trim/discard I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, UINT, ZMOD_RW, "Min active trim/discard I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, UINT, ZMOD_RW, "Max active rebuild I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, UINT, ZMOD_RW, "Min active rebuild I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, UINT, ZMOD_RW, "Number of non-interactive I/Os to allow in sequence"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, UINT, ZMOD_RW, "Number of non-interactive I/Os before _max_active"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, UINT, ZMOD_RW, "Queue depth percentage for each top-level vdev"); -/* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, def_queue_depth, UINT, ZMOD_RW, + "Default queue depth for each allocator"); diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c index 
7e7202ec1e55..15c8b8ca6016 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -27,15 +27,22 @@ #include <sys/zfs_context.h> #include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zap.h> #include <sys/vdev_impl.h> +#include <sys/metaslab_impl.h> #include <sys/zio.h> #include <sys/zio_checksum.h> +#include <sys/dmu_tx.h> #include <sys/abd.h> +#include <sys/zfs_rlock.h> #include <sys/fs/zfs.h> #include <sys/fm/fs/zfs.h> #include <sys/vdev_raidz.h> #include <sys/vdev_raidz_impl.h> #include <sys/vdev_draid.h> +#include <sys/uberblock_impl.h> +#include <sys/dsl_scan.h> #ifdef ZFS_DEBUG #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */ @@ -135,6 +142,237 @@ VDEV_RAIDZ_64MUL_2((x), mask); \ } + +/* + * Big Theory Statement for how a RAIDZ VDEV is expanded + * + * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion + * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs + * that have been previously expanded can be expanded again. + * + * The RAIDZ VDEV must be healthy (must be able to write to all the drives in + * the VDEV) when an expansion starts. And the expansion will pause if any + * disk in the VDEV fails, and resume once the VDEV is healthy again. All other + * operations on the pool can continue while an expansion is in progress (e.g. + * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim, + * and zpool initialize which can't be run during an expansion. Following a + * reboot or export/import, the expansion resumes where it left off. + * + * == Reflowing the Data == + * + * The expansion involves reflowing (copying) the data from the current set + * of disks to spread it across the new set which now has one more disk. This + * reflow operation is similar to reflowing text when the column width of a + * text editor window is expanded. The text doesn’t change but the location of + * the text changes to accommodate the new width. An example reflow result for + * a 4-wide RAIDZ1 to a 5-wide is shown below. 
+ * + * Reflow End State + * Each letter indicates a parity group (logical stripe) + * + * Before expansion After Expansion + * D1 D2 D3 D4 D1 D2 D3 D4 D5 + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | A | A | A | A | | A | A | A | A | B | + * | 1| 2| 3| 4| | 1| 2| 3| 4| 5| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | B | B | C | C | | B | C | C | C | C | + * | 5| 6| 7| 8| | 6| 7| 8| 9| 10| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | C | C | D | D | | D | D | E | E | E | + * | 9| 10| 11| 12| | 11| 12| 13| 14| 15| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | E | E | E | E | --> | E | F | F | G | G | + * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | F | F | G | G | | G | G | H | H | H | + * | 17| 18| 19| 20| | 21| 22| 23| 24| 25| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | G | G | H | H | | H | I | I | J | J | + * | 21| 22| 23| 24| | 26| 27| 28| 29| 30| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | H | H | I | I | | J | J | | | K | + * | 25| 26| 27| 28| | 31| 32| 33| 34| 35| + * +------+------+------+------+ +------+------+------+------+------+ + * + * This reflow approach has several advantages. There is no need to read or + * modify the block pointers or recompute any block checksums. The reflow + * doesn’t need to know where the parity sectors reside. We can read and write + * data sequentially and the copy can occur in a background thread in open + * context. The design also allows for fast discovery of what data to copy. + * + * The VDEV metaslabs are processed, one at a time, to copy the block data to + * have it flow across all the disks. The metaslab is disabled for allocations + * during the copy. As an optimization, we only copy the allocated data which + * can be determined by looking at the metaslab range tree. During the copy we + * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still + * need to be able to survive losing parity count disks). This means we + * cannot overwrite data during the reflow that would be needed if a disk is + * lost. + * + * After the reflow completes, all newly-written blocks will have the new + * layout, i.e., they will have the parity to data ratio implied by the new + * number of disks in the RAIDZ group. Even though the reflow copies all of + * the allocated space (data and parity), it is only rearranged, not changed. + * + * This act of reflowing the data has a few implications about blocks + * that were written before the reflow completes: + * + * - Old blocks will still use the same amount of space (i.e., they will have + * the parity to data ratio implied by the old number of disks in the RAIDZ + * group). + * - Reading old blocks will be slightly slower than before the reflow, for + * two reasons. First, we will have to read from all disks in the RAIDZ + * VDEV, rather than being able to skip the children that contain only + * parity of this block (because the data of a single block is now spread + * out across all the disks). Second, in most cases there will be an extra + * bcopy, needed to rearrange the data back to its original layout in memory. 
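As a small, hypothetical illustration of the remapping just described (not part of the patch): if a flat sector index is laid out row by row across the children, growing the width only changes which child and row each sector lands on; the data itself is untouched. The example reproduces the diagram's sector "B6" moving from D2 to D1.

#include <stdint.h>
#include <stdio.h>

/* Where a flat sector index lands when striped across `width` children. */
static void
locate(uint64_t sector, uint64_t width, uint64_t *child, uint64_t *row)
{
    *child = sector % width;
    *row = sector / width;
}

int
main(void)
{
    uint64_t child, row;

    /* Sector 6 in the diagram (zero-based index 5), before expansion. */
    locate(5, 4, &child, &row);
    printf("4-wide: child D%llu, row %llu\n",
        (unsigned long long)child + 1, (unsigned long long)row);

    /* The same sector after the vdev has grown to 5 children. */
    locate(5, 5, &child, &row);
    printf("5-wide: child D%llu, row %llu\n",
        (unsigned long long)child + 1, (unsigned long long)row);
    return (0);
}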
+ * + * == Scratch Area == + * + * As we copy the block data, we can only progress to the point that writes + * will not overlap with blocks whose progress has not yet been recorded on + * disk. Since partially-copied rows are always read from the old location, + * we need to stop one row before the sector-wise overlap, to prevent any + * row-wise overlap. For example, in the diagram above, when we reflow sector + * B6 it will overwite the original location for B5. + * + * To get around this, a scratch space is used so that we can start copying + * without risking data loss by overlapping the row. As an added benefit, it + * improves performance at the beginning of the reflow, but that small perf + * boost wouldn't be worth the complexity on its own. + * + * Ideally we want to copy at least 2 * (new_width)^2 so that we have a + * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max + * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice + * the widths will likely be single digits so we can get a substantial chuck + * size using only a few MB of scratch per disk. + * + * The scratch area is persisted to disk which holds a large amount of reflowed + * state. We can always read the partially written stripes when a disk fails or + * the copy is interrupted (crash) during the initial copying phase and also + * get past a small chunk size restriction. At a minimum, the scratch space + * must be large enough to get us to the point that one row does not overlap + * itself when moved (i.e new_width^2). But going larger is even better. We + * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels + * as our scratch space to handle overwriting the initial part of the VDEV. + * + * 0 256K 512K 4M + * +------+------+-----------------------+----------------------------- + * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ... + * | L0 | L1 | Reserved | (Metaslabs) + * +------+------+-----------------------+------------------------------- + * Scratch Area + * + * == Reflow Progress Updates == + * After the initial scratch-based reflow, the expansion process works + * similarly to device removal. We create a new open context thread which + * reflows the data, and periodically kicks off sync tasks to update logical + * state. In this case, state is the committed progress (offset of next data + * to copy). We need to persist the completed offset on disk, so that if we + * crash we know which format each VDEV offset is in. + * + * == Time Dependent Geometry == + * + * In non-expanded RAIDZ, blocks are read from disk in a column by column + * fashion. For a multi-row block, the second sector is in the first column + * not in the second column. This allows us to issue full reads for each + * column directly into the request buffer. The block data is thus laid out + * sequentially in a column-by-column fashion. + * + * For example, in the before expansion diagram above, one logical block might + * be sectors G19-H26. The parity is in G19,H23; and the data is in + * G20,H24,G21,H25,G22,H26. + * + * After a block is reflowed, the sectors that were all in the original column + * data can now reside in different columns. When reading from an expanded + * VDEV, we need to know the logical stripe width for each block so we can + * reconstitute the block’s data after the reads are completed. Likewise, + * when we perform the combinatorial reconstruction we need to know the + * original width so we can retry combinations from the past layouts. 
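Referring back to the Scratch Area sizing above, a quick worked calculation (illustrative only) of the "2 * new_width^2 sectors" rule at the maximum raidz width with 4 KiB sectors:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t new_width = 255;               /* maximum raidz width */
    uint64_t sector = 4096;                 /* 4 KiB sectors (ashift=12) */
    uint64_t sectors = 2 * new_width * new_width;
    uint64_t per_disk = (sectors / new_width) * sector;

    /* Prints 130050 sectors total, 2088960 bytes (~2 MiB) per disk. */
    printf("%llu sectors, %llu bytes per disk\n",
        (unsigned long long)sectors, (unsigned long long)per_disk);
    return (0);
}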
+ * + * Time dependent geometry is what we call having blocks with different layouts + * (stripe widths) in the same VDEV. This time-dependent geometry uses the + * block’s birth time (+ the time expansion ended) to establish the correct + * width for a given block. After an expansion completes, we record the time + * for blocks written with a particular width (geometry). + * + * == On Disk Format Changes == + * + * New pool feature flag, 'raidz_expansion' whose reference count is the number + * of RAIDZ VDEVs that have been expanded. + * + * The blocks on expanded RAIDZ VDEV can have different logical stripe widths. + * + * Since the uberblock can point to arbitrary blocks, which might be on the + * expanding RAIDZ, and might or might not have been expanded. We need to know + * which way a block is laid out before reading it. This info is the next + * offset that needs to be reflowed and we persist that in the uberblock, in + * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label. + * After the expansion is complete, we then use the raidz_expand_txgs array + * (see below) to determine how to read a block and the ub_raidz_reflow_info + * field no longer required. + * + * The uberblock's ub_raidz_reflow_info field also holds the scratch space + * state (i.e., active or not) which is also required before reading a block + * during the initial phase of reflowing the data. + * + * The top-level RAIDZ VDEV has two new entries in the nvlist: + * + * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here + * and used after the expansion is complete to + * determine how to read a raidz block + * 'raidz_expanding' boolean: present during reflow and removed after completion + * used during a spa import to resume an unfinished + * expansion + * + * And finally the VDEVs top zap adds the following informational entries: + * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE + * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME + * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME + * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED + */ + +/* + * For testing only: pause the raidz expansion after reflowing this amount. + * (accessed by ZTS and ztest) + */ +#ifdef _KERNEL +static +#endif /* _KERNEL */ +unsigned long raidz_expand_max_reflow_bytes = 0; + +/* + * For testing only: pause the raidz expansion at a certain point. + */ +uint_t raidz_expand_pause_point = 0; + +/* + * Maximum amount of copy io's outstanding at once. + */ +static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE; + +/* + * Apply raidz map abds aggregation if the number of rows in the map is equal + * or greater than the value below. + */ +static unsigned long raidz_io_aggregate_rows = 4; + +/* + * Automatically start a pool scrub when a RAIDZ expansion completes in + * order to verify the checksums of all blocks which have been copied + * during the expansion. Automatic scrubbing is enabled by default and + * is strongly recommended. 
+ */ +static int zfs_scrub_after_expand = 1; + static void vdev_raidz_row_free(raidz_row_t *rr) { @@ -159,6 +397,17 @@ vdev_raidz_map_free(raidz_map_t *rm) for (int i = 0; i < rm->rm_nrows; i++) vdev_raidz_row_free(rm->rm_row[i]); + if (rm->rm_nphys_cols) { + for (int i = 0; i < rm->rm_nphys_cols; i++) { + if (rm->rm_phys_col[i].rc_abd != NULL) + abd_free(rm->rm_phys_col[i].rc_abd); + } + + kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * + rm->rm_nphys_cols); + } + + ASSERT3P(rm->rm_lr, ==, NULL); kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } @@ -170,10 +419,37 @@ vdev_raidz_map_free_vsd(zio_t *zio) vdev_raidz_map_free(rm); } +static int +vdev_raidz_reflow_compare(const void *x1, const void *x2) +{ + const reflow_node_t *l = x1; + const reflow_node_t *r = x2; + + return (TREE_CMP(l->re_txg, r->re_txg)); +} + const zio_vsd_ops_t vdev_raidz_vsd_ops = { .vsd_free = vdev_raidz_map_free_vsd, }; +raidz_row_t * +vdev_raidz_row_alloc(int cols) +{ + raidz_row_t *rr = + kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); + + rr->rr_cols = cols; + rr->rr_scols = cols; + + for (int c = 0; c < cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_shadow_devidx = INT_MAX; + rc->rc_shadow_offset = UINT64_MAX; + rc->rc_allow_repair = 1; + } + return (rr); +} + static void vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) { @@ -302,7 +578,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t f = b % dcols; /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << ashift; - uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + uint64_t acols, scols; raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); @@ -312,22 +588,22 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. */ - q = s / (dcols - nparity); + uint64_t q = s / (dcols - nparity); /* * "Remainder": The number of partial stripe data sectors in this I/O. * This will add a sector to some, but not all, child vdevs. */ - r = s - q * (dcols - nparity); + uint64_t r = s - q * (dcols - nparity); /* The number of "big columns" - those which contain remainder data. */ - bc = (r == 0 ? 0 : r + nparity); + uint64_t bc = (r == 0 ? 0 : r + nparity); /* * The total number of data and parity sectors associated with * this I/O. */ - tot = s + nparity * (q + (r == 0 ? 0 : 1)); + uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); /* * acols: The columns that will be accessed. 
@@ -343,43 +619,28 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, } ASSERT3U(acols, <=, scols); - - rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP); + rr = vdev_raidz_row_alloc(scols); rm->rm_row[0] = rr; - rr->rr_cols = acols; - rr->rr_scols = scols; rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; rr->rr_firstdatacol = nparity; - rr->rr_abd_empty = NULL; - rr->rr_nempty = 0; #ifdef ZFS_DEBUG rr->rr_offset = zio->io_offset; rr->rr_size = zio->io_size; #endif - asize = 0; + uint64_t asize = 0; - for (c = 0; c < scols; c++) { + for (uint64_t c = 0; c < scols; c++) { raidz_col_t *rc = &rr->rr_col[c]; - col = f + c; - coff = o; + uint64_t col = f + c; + uint64_t coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << ashift; } rc->rc_devidx = col; rc->rc_offset = coff; - rc->rc_abd = NULL; - rc->rc_orig_data = NULL; - rc->rc_error = 0; - rc->rc_tried = 0; - rc->rc_skipped = 0; - rc->rc_force_repair = 0; - rc->rc_allow_repair = 1; - rc->rc_need_orig_restore = B_FALSE; if (c >= acols) rc->rc_size = 0; @@ -419,13 +680,12 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { - devidx = rr->rr_col[0].rc_devidx; + uint64_t devidx = rr->rr_col[0].rc_devidx; o = rr->rr_col[0].rc_offset; rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; rr->rr_col[1].rc_devidx = devidx; rr->rr_col[1].rc_offset = o; - if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } @@ -435,7 +695,338 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, } else { vdev_raidz_map_alloc_read(zio, rm); } + /* init RAIDZ parity ops */ + rm->rm_ops = vdev_raidz_math_get_ops(); + + return (rm); +} + +/* + * Everything before reflow_offset_synced should have been moved to the new + * location (read and write completed). However, this may not yet be reflected + * in the on-disk format (e.g. raidz_reflow_sync() has been called but the + * uberblock has not yet been written). If reflow is not in progress, + * reflow_offset_synced should be UINT64_MAX. For each row, if the row is + * entirely before reflow_offset_synced, it will come from the new location. + * Otherwise this row will come from the old location. Therefore, rows that + * straddle the reflow_offset_synced will come from the old location. + * + * For writes, reflow_offset_next is the next offset to copy. If a sector has + * been copied, but not yet reflected in the on-disk progress + * (reflow_offset_synced), it will also be written to the new (already copied) + * offset. + */ +noinline raidz_map_t * +vdev_raidz_map_alloc_expanded(zio_t *zio, + uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, + uint64_t nparity, uint64_t reflow_offset_synced, + uint64_t reflow_offset_next, boolean_t use_scratch) +{ + abd_t *abd = zio->io_abd; + uint64_t offset = zio->io_offset; + uint64_t size = zio->io_size; + + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = size >> ashift; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + * AKA "full rows" + */ + uint64_t q = s / (logical_cols - nparity); + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. 
+ */ + uint64_t r = s - q * (logical_cols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ + uint64_t bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ + uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + /* How many rows contain data (not skip) */ + uint64_t rows = howmany(tot, logical_cols); + int cols = MIN(tot, logical_cols); + + raidz_map_t *rm = + kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), + KM_SLEEP); + rm->rm_nrows = rows; + rm->rm_nskip = roundup(tot, nparity + 1) - tot; + rm->rm_skipstart = bc; + uint64_t asize = 0; + + for (uint64_t row = 0; row < rows; row++) { + boolean_t row_use_scratch = B_FALSE; + raidz_row_t *rr = vdev_raidz_row_alloc(cols); + rm->rm_row[row] = rr; + + /* The starting RAIDZ (parent) vdev sector of the row. */ + uint64_t b = (offset >> ashift) + row * logical_cols; + + /* + * If we are in the middle of a reflow, and the copying has + * not yet completed for any part of this row, then use the + * old location of this row. Note that reflow_offset_synced + * reflects the i/o that's been completed, because it's + * updated by a synctask, after zio_wait(spa_txg_zio[]). + * This is sufficient for our check, even if that progress + * has not yet been recorded to disk (reflected in + * spa_ubsync). Also note that we consider the last row to + * be "full width" (`cols`-wide rather than `bc`-wide) for + * this calculation. This causes a tiny bit of unnecessary + * double-writes but is safe and simpler to calculate. + */ + int row_phys_cols = physical_cols; + if (b + cols > reflow_offset_synced >> ashift) + row_phys_cols--; + else if (use_scratch) + row_use_scratch = B_TRUE; + + /* starting child of this row */ + uint64_t child_id = b % row_phys_cols; + /* The starting byte offset on each child vdev. */ + uint64_t child_offset = (b / row_phys_cols) << ashift; + + /* + * Note, rr_cols is the entire width of the block, even + * if this row is shorter. This is needed because parity + * generation (for Q and R) needs to know the entire width, + * because it treats the short row as though it was + * full-width (and the "phantom" sectors were zero-filled). + * + * Another approach to this would be to set cols shorter + * (to just the number of columns that we might do i/o to) + * and have another mechanism to tell the parity generation + * about the "entire width". Reconstruction (at least + * vdev_raidz_reconstruct_general()) would also need to + * know about the "entire width". + */ + rr->rr_firstdatacol = nparity; +#ifdef ZFS_DEBUG + /* + * note: rr_size is PSIZE, not ASIZE + */ + rr->rr_offset = b << ashift; + rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift; +#endif + + for (int c = 0; c < rr->rr_cols; c++, child_id++) { + if (child_id >= row_phys_cols) { + child_id -= row_phys_cols; + child_offset += 1ULL << ashift; + } + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_devidx = child_id; + rc->rc_offset = child_offset; + + /* + * Get this from the scratch space if appropriate. + * This only happens if we crashed in the middle of + * raidz_reflow_scratch_sync() (while it's running, + * the rangelock prevents us from doing concurrent + * io), and even then only during zpool import or + * when the pool is imported readonly. 
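A hypothetical standalone helper mirroring the per-row decision shown above: a row whose sectors are not yet entirely below the synced reflow offset is still read from the old, one-disk-narrower layout, and its starting child and offset are derived from that width. The names and the struct are illustrative; `b` is the row's first sector on the parent vdev and offsets are in bytes.

#include <stdbool.h>
#include <stdint.h>

struct row_loc {
    bool old_layout;        /* row still lives at the pre-copy location */
    uint64_t start_child;   /* first child vdev the row touches */
    uint64_t start_offset;  /* byte offset of the row on that child */
};

static struct row_loc
row_locate(uint64_t b, uint64_t cols, uint64_t physical_cols,
    uint64_t reflow_offset_synced, uint64_t ashift)
{
    struct row_loc loc;
    uint64_t row_phys_cols = physical_cols;

    /* Any overlap with not-yet-synced space means the old width applies. */
    if (b + cols > (reflow_offset_synced >> ashift))
        row_phys_cols--;

    loc.old_layout = (row_phys_cols != physical_cols);
    loc.start_child = b % row_phys_cols;
    loc.start_offset = (b / row_phys_cols) << ashift;
    return (loc);
}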
+ */ + if (row_use_scratch) + rc->rc_offset -= VDEV_BOOT_SIZE; + + uint64_t dc = c - rr->rr_firstdatacol; + if (c < rr->rr_firstdatacol) { + rc->rc_size = 1ULL << ashift; + + /* + * Parity sectors' rc_abd's are set below + * after determining if this is an aggregation. + */ + } else if (row == rows - 1 && bc != 0 && c >= bc) { + /* + * Past the end of the block (even including + * skip sectors). This sector is part of the + * map so that we have full rows for p/q parity + * generation. + */ + rc->rc_size = 0; + rc->rc_abd = NULL; + } else { + /* "data column" (col excluding parity) */ + uint64_t off; + + if (c < bc || r == 0) { + off = dc * rows + row; + } else { + off = r * rows + + (dc - r) * (rows - 1) + row; + } + rc->rc_size = 1ULL << ashift; + rc->rc_abd = abd_get_offset_struct( + &rc->rc_abdstruct, abd, off << ashift, + rc->rc_size); + } + + if (rc->rc_size == 0) + continue; + + /* + * If any part of this row is in both old and new + * locations, the primary location is the old + * location. If this sector was already copied to the + * new location, we need to also write to the new, + * "shadow" location. + * + * Note, `row_phys_cols != physical_cols` indicates + * that the primary location is the old location. + * `b+c < reflow_offset_next` indicates that the copy + * to the new location has been initiated. We know + * that the copy has completed because we have the + * rangelock, which is held exclusively while the + * copy is in progress. + */ + if (row_use_scratch || + (row_phys_cols != physical_cols && + b + c < reflow_offset_next >> ashift)) { + rc->rc_shadow_devidx = (b + c) % physical_cols; + rc->rc_shadow_offset = + ((b + c) / physical_cols) << ashift; + if (row_use_scratch) + rc->rc_shadow_offset -= VDEV_BOOT_SIZE; + } + + asize += rc->rc_size; + } + + /* + * See comment in vdev_raidz_map_alloc() + */ + if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && + (offset & (1ULL << 20))) { + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); + + int devidx0 = rr->rr_col[0].rc_devidx; + uint64_t offset0 = rr->rr_col[0].rc_offset; + int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx; + uint64_t shadow_offset0 = + rr->rr_col[0].rc_shadow_offset; + + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[0].rc_shadow_devidx = + rr->rr_col[1].rc_shadow_devidx; + rr->rr_col[0].rc_shadow_offset = + rr->rr_col[1].rc_shadow_offset; + + rr->rr_col[1].rc_devidx = devidx0; + rr->rr_col[1].rc_offset = offset0; + rr->rr_col[1].rc_shadow_devidx = shadow_devidx0; + rr->rr_col[1].rc_shadow_offset = shadow_offset0; + } + } + ASSERT3U(asize, ==, tot << ashift); + + /* + * Determine if the block is contiguous, in which case we can use + * an aggregation. + */ + if (rows >= raidz_io_aggregate_rows) { + rm->rm_nphys_cols = physical_cols; + rm->rm_phys_col = + kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols, + KM_SLEEP); + + /* + * Determine the aggregate io's offset and size, and check + * that the io is contiguous. 
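Complementing the previous sketch, this hypothetical helper reproduces the shadow-column assignment made earlier in this hunk: a sector that has already been copied is additionally written at the position it will occupy in the new, full-width layout.

#include <stdint.h>

struct shadow_loc {
    uint64_t devidx;    /* child that holds the copied sector */
    uint64_t offset;    /* byte offset of the copy on that child */
};

static struct shadow_loc
shadow_locate(uint64_t b, uint64_t c, uint64_t physical_cols, uint64_t ashift)
{
    struct shadow_loc loc;

    loc.devidx = (b + c) % physical_cols;
    loc.offset = ((b + c) / physical_cols) << ashift;
    return (loc);
}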
+ */ + for (int i = 0; + i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + + if (rc->rc_size == 0) + continue; + + if (prc->rc_size == 0) { + ASSERT0(prc->rc_offset); + prc->rc_offset = rc->rc_offset; + } else if (prc->rc_offset + prc->rc_size != + rc->rc_offset) { + /* + * This block is not contiguous and + * therefore can't be aggregated. + * This is expected to be rare, so + * the cost of allocating and then + * freeing rm_phys_col is not + * significant. + */ + kmem_free(rm->rm_phys_col, + sizeof (raidz_col_t) * + rm->rm_nphys_cols); + rm->rm_phys_col = NULL; + rm->rm_nphys_cols = 0; + break; + } + prc->rc_size += rc->rc_size; + } + } + } + if (rm->rm_phys_col != NULL) { + /* + * Allocate aggregate ABD's. + */ + for (int i = 0; i < rm->rm_nphys_cols; i++) { + raidz_col_t *prc = &rm->rm_phys_col[i]; + + prc->rc_devidx = i; + + if (prc->rc_size == 0) + continue; + + prc->rc_abd = + abd_alloc_linear(rm->rm_phys_col[i].rc_size, + B_FALSE); + } + /* + * Point the parity abd's into the aggregate abd's. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + rc->rc_abd = + abd_get_offset_struct(&rc->rc_abdstruct, + prc->rc_abd, + rc->rc_offset - prc->rc_offset, + rc->rc_size); + } + } + } else { + /* + * Allocate new abd's for the parity sectors. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_abd = + abd_alloc_linear(rc->rc_size, + B_TRUE); + } + } + } /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); @@ -453,11 +1044,11 @@ vdev_raidz_p_func(void *buf, size_t size, void *private) { struct pqr_struct *pqr = private; const uint64_t *src = buf; - int i, cnt = size / sizeof (src[0]); + int cnt = size / sizeof (src[0]); ASSERT(pqr->p && !pqr->q && !pqr->r); - for (i = 0; i < cnt; i++, src++, pqr->p++) + for (int i = 0; i < cnt; i++, src++, pqr->p++) *pqr->p ^= *src; return (0); @@ -469,11 +1060,11 @@ vdev_raidz_pq_func(void *buf, size_t size, void *private) struct pqr_struct *pqr = private; const uint64_t *src = buf; uint64_t mask; - int i, cnt = size / sizeof (src[0]); + int cnt = size / sizeof (src[0]); ASSERT(pqr->p && pqr->q && !pqr->r); - for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { + for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { *pqr->p ^= *src; VDEV_RAIDZ_64MUL_2(*pqr->q, mask); *pqr->q ^= *src; @@ -488,11 +1079,11 @@ vdev_raidz_pqr_func(void *buf, size_t size, void *private) struct pqr_struct *pqr = private; const uint64_t *src = buf; uint64_t mask; - int i, cnt = size / sizeof (src[0]); + int cnt = size / sizeof (src[0]); ASSERT(pqr->p && pqr->q && pqr->r); - for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { + for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { *pqr->p ^= *src; VDEV_RAIDZ_64MUL_2(*pqr->q, mask); *pqr->q ^= *src; @@ -618,7 +1209,15 @@ vdev_raidz_generate_parity_pqr(raidz_row_t *rr) void vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) { - ASSERT3U(rr->rr_cols, !=, 0); + if (rr->rr_cols == 0) { + /* + * We are handling this block one row at a time (because + * this block has a different logical vs physical width, + * due 
to RAIDZ expansion), and this is a pad-only row, + * which has no parity. + */ + return; + } /* Generate using the new math implementation */ if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) @@ -648,10 +1247,10 @@ vdev_raidz_generate_parity(raidz_map_t *rm) } } -/* ARGSUSED */ static int vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) { + (void) private; uint64_t *dst = dbuf; uint64_t *src = sbuf; int cnt = size / sizeof (src[0]); @@ -663,11 +1262,11 @@ vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) return (0); } -/* ARGSUSED */ static int vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, void *private) { + (void) private; uint64_t *dst = dbuf; uint64_t *src = sbuf; uint64_t mask; @@ -681,10 +1280,10 @@ vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, return (0); } -/* ARGSUSED */ static int vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) { + (void) private; uint64_t *dst = buf; uint64_t mask; int cnt = size / sizeof (dst[0]); @@ -770,6 +1369,9 @@ vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) int x = tgts[0]; abd_t *dst, *src; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x); + ASSERT3U(ntgts, ==, 1); ASSERT3U(x, >=, rr->rr_firstdatacol); ASSERT3U(x, <, rr->rr_cols); @@ -802,6 +1404,9 @@ vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) int c, exp; abd_t *dst, *src; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x); + ASSERT(ntgts == 1); ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); @@ -848,6 +1453,9 @@ vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) int y = tgts[1]; abd_t *xd, *yd; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y); + ASSERT(ntgts == 2); ASSERT(x < y); ASSERT(x >= rr->rr_firstdatacol); @@ -926,7 +1534,6 @@ vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; } -/* BEGIN CSTYLED */ /* * In the general case of reconstruction, we must solve the system of linear * equations defined by the coefficients used to generate parity as well as @@ -1078,7 +1685,6 @@ vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) * that reason, we only build the coefficients in the rows that correspond to * targeted columns. */ -/* END CSTYLED */ static void vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, @@ -1285,8 +1891,9 @@ vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, static void vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { - int n, i, c, t, tt; - int nmissing_rows; + int i, c, t, tt; + unsigned int n; + unsigned int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; @@ -1297,11 +1904,14 @@ vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) abd_t **bufs = NULL; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); /* * Matrix reconstruction can't use scatter ABDs yet, so we allocate * temporary linear ABDs if any non-linear ABDs are found. 
*/ for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { + ASSERT(rr->rr_col[i].rc_abd != NULL); if (!abd_is_linear(rr->rr_col[i].rc_abd)) { bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), KM_PUSHPAGE); @@ -1429,10 +2039,23 @@ vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { + zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", + rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, + (int)rr->rr_missingparity); + } + nbadparity = rr->rr_firstdatacol; nbaddata = rr->rr_cols - nbadparity; ntgts = 0; for (i = 0, c = 0; c < rr->rr_cols; c++) { + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { + zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " + "offset=%llx error=%u)", + rr, c, (int)rr->rr_col[c].rc_devidx, + (long long)rr->rr_col[c].rc_offset, + (int)rr->rr_col[c].rc_error); + } if (c < rr->rr_firstdatacol) parity_valid[c] = B_FALSE; @@ -1529,12 +2152,25 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); - *physical_ashift = MAX(*physical_ashift, - cvd->vdev_physical_ashift); } + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error != 0) + continue; + *physical_ashift = vdev_best_ashift(*logical_ashift, + *physical_ashift, cvd->vdev_physical_ashift); + } + + if (vd->vdev_rz_expanding) { + *asize *= vd->vdev_children - 1; + *max_asize *= vd->vdev_children - 1; - *asize *= vd->vdev_children; - *max_asize *= vd->vdev_children; + vd->vdev_min_asize = *asize; + } else { + *asize *= vd->vdev_children; + *max_asize *= vd->vdev_children; + } if (numerrors > nparity) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; @@ -1553,19 +2189,70 @@ vdev_raidz_close(vdev_t *vd) } } +/* + * Return the logical width to use, given the txg in which the allocation + * happened. Note that BP_GET_BIRTH() is usually the txg in which the + * BP was allocated. Remapped BP's (that were relocated due to device + * removal, see remap_blkptr_cb()), will have a more recent physical birth + * which reflects when the BP was relocated, but we can ignore these because + * they can't be on RAIDZ (device removal doesn't support RAIDZ). + */ static uint64_t -vdev_raidz_asize(vdev_t *vd, uint64_t psize) +vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) +{ + reflow_node_t lookup = { + .re_txg = txg, + }; + avl_index_t where; + + uint64_t width; + mutex_enter(&vdrz->vd_expand_lock); + reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); + if (re != NULL) { + width = re->re_logical_width; + } else { + re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); + if (re != NULL) + width = re->re_logical_width; + else + width = vdrz->vd_original_width; + } + mutex_exit(&vdrz->vd_expand_lock); + return (width); +} + +/* + * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated + * more space due to the lower data-to-parity ratio. In this case it's + * important to pass in the correct txg. Note that vdev_gang_header_asize() + * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, + * regardless of txg. This is assured because for a single data sector, we + * allocate P+1 sectors regardless of width ("cols", which is at least P+1). 
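A compact sketch of the allocated-size arithmetic implemented just below, simplified to take the column count directly instead of resolving it from the block's birth txg: one parity sector is added for every cols - nparity data sectors, and the total is padded to a whole number of nparity + 1 sector groups.

#include <stdint.h>

static uint64_t
raidz_asize_example(uint64_t psize, uint64_t ashift, uint64_t cols,
    uint64_t nparity)
{
    uint64_t asize = ((psize - 1) >> ashift) + 1;   /* data sectors */

    /* Parity sectors: one per (cols - nparity) data sectors, rounded up. */
    asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));

    /* Pad to a multiple of (nparity + 1) sectors. */
    asize = ((asize + nparity) / (nparity + 1)) * (nparity + 1);

    return (asize << ashift);
}

For a 128 KiB block with ashift=12 and raidz1, this yields 44 sectors at the original width of 4 but only 40 at the expanded width of 5, which is why blocks written before the expansion keep the larger footprint.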
+ */ +static uint64_t +vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vdrz->vd_logical_width; + uint64_t cols = vdrz->vd_original_width; uint64_t nparity = vdrz->vd_nparity; + cols = vdev_raidz_get_logical_width(vdrz, txg); + asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); asize = roundup(asize, nparity + 1) << ashift; +#ifdef ZFS_DEBUG + uint64_t asize_new = ((psize - 1) >> ashift) + 1; + uint64_t ncols_new = vdrz->vd_physical_width; + asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / + (ncols_new - nparity)); + asize_new = roundup(asize_new, nparity + 1) << ashift; + VERIFY3U(asize_new, <=, asize); +#endif + return (asize); } @@ -1592,21 +2279,37 @@ vdev_raidz_child_done(zio_t *zio) } static void -vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) +vdev_raidz_shadow_child_done(zio_t *zio) { -#ifdef ZFS_DEBUG - vdev_t *tvd = vd->vdev_top; + raidz_col_t *rc = zio->io_private; + + rc->rc_shadow_error = zio->io_error; +} +static void +vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) +{ + (void) rm; +#ifdef ZFS_DEBUG range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_raidz_asize(vd, rr->rr_size); + vdev_raidz_asize(zio->io_vd, rr->rr_size, + BP_GET_BIRTH(zio->io_bp)); raidz_col_t *rc = &rr->rr_col[col]; - vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); ASSERT(vdev_xlate_is_empty(&remain_rs)); + if (vdev_xlate_is_empty(&physical_rs)) { + /* + * If we are in the middle of expansion, the + * physical->logical mapping is changing so vdev_xlate() + * can't give us a reliable answer. + */ + return; + } ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); /* @@ -1617,7 +2320,7 @@ vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) */ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + - rc->rc_size + (1 << tvd->vdev_ashift)); + rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); } else { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); } @@ -1625,7 +2328,7 @@ vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) } static void -vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) +vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) { vdev_t *vd = zio->io_vd; raidz_map_t *rm = zio->io_vsd; @@ -1637,31 +2340,66 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; /* Verify physical to logical translation */ - vdev_raidz_io_verify(vd, rr, c); + vdev_raidz_io_verify(zio, rm, rr, c); - if (rc->rc_size > 0) { - ASSERT3P(rc->rc_abd, !=, NULL); - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, - abd_get_size(rc->rc_abd), zio->io_type, - zio->io_priority, 0, vdev_raidz_child_done, rc)); - } else { - /* - * Generate optional write for skip sector to improve - * aggregation contiguity. 
- */ - ASSERT3P(rc->rc_abd, ==, NULL); - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, NULL, 1ULL << ashift, - zio->io_type, zio->io_priority, - ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, - NULL)); + if (rc->rc_size == 0) + continue; + + ASSERT3U(rc->rc_offset + rc->rc_size, <, + cvd->vdev_psize - VDEV_LABEL_END_SIZE); + + ASSERT3P(rc->rc_abd, !=, NULL); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, + abd_get_size(rc->rc_abd), zio->io_type, + zio->io_priority, 0, vdev_raidz_child_done, rc)); + + if (rc->rc_shadow_devidx != INT_MAX) { + vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; + + ASSERT3U( + rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, + cvd2->vdev_psize - VDEV_LABEL_END_SIZE); + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, + rc->rc_shadow_offset, rc->rc_abd, + abd_get_size(rc->rc_abd), + zio->io_type, zio->io_priority, 0, + vdev_raidz_shadow_child_done, rc)); } } } +/* + * Generate optional I/Os for skip sectors to improve aggregation contiguity. + * This only works for vdev_raidz_map_alloc() (not _expanded()). + */ +static void +raidz_start_skip_writes(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + uint64_t ashift = vd->vdev_top->vdev_ashift; + raidz_map_t *rm = zio->io_vsd; + ASSERT3U(rm->rm_nrows, ==, 1); + raidz_row_t *rr = rm->rm_row[0]; + for (int c = 0; c < rr->rr_scols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + if (rc->rc_size != 0) + continue; + ASSERT3P(rc->rc_abd, ==, NULL); + + ASSERT3U(rc->rc_offset, <, + cvd->vdev_psize - VDEV_LABEL_END_SIZE); + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, + NULL, 1ULL << ashift, zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); + } +} + static void -vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) +vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) { vdev_t *vd = zio->io_vd; @@ -1693,7 +2431,8 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) rc->rc_skipped = 1; continue; } - if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || + if (forceparity || + c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, @@ -1703,6 +2442,56 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) } } +static void +vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) +{ + vdev_t *vd = zio->io_vd; + + for (int i = 0; i < rm->rm_nphys_cols; i++) { + raidz_col_t *prc = &rm->rm_phys_col[i]; + if (prc->rc_size == 0) + continue; + + ASSERT3U(prc->rc_devidx, ==, i); + vdev_t *cvd = vd->vdev_child[i]; + if (!vdev_readable(cvd)) { + prc->rc_error = SET_ERROR(ENXIO); + prc->rc_tried = 1; /* don't even try */ + prc->rc_skipped = 1; + continue; + } + if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { + prc->rc_error = SET_ERROR(ESTALE); + prc->rc_skipped = 1; + continue; + } + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + prc->rc_offset, prc->rc_abd, prc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, prc)); + } +} + +static void +vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) +{ + /* + * If there are multiple rows, we will be hitting + * all disks, so go ahead and read the parity so + * that we are reading in decent size chunks. 
+ */ + boolean_t forceparity = rm->rm_nrows > 1; + + if (rm->rm_phys_col) { + vdev_raidz_io_start_read_phys_cols(zio, rm); + } else { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + vdev_raidz_io_start_read_row(zio, rr, forceparity); + } + } +} + /* * Start an IO operation on a RAIDZ VDev * @@ -1726,24 +2515,83 @@ vdev_raidz_io_start(zio_t *zio) vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; vdev_raidz_t *vdrz = vd->vdev_tsd; + raidz_map_t *rm; + + uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, + BP_GET_BIRTH(zio->io_bp)); + if (logical_width != vdrz->vd_physical_width) { + zfs_locked_range_t *lr = NULL; + uint64_t synced_offset = UINT64_MAX; + uint64_t next_offset = UINT64_MAX; + boolean_t use_scratch = B_FALSE; + /* + * Note: when the expansion is completing, we set + * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) + * in a later txg than when we last update spa_ubsync's state + * (see the end of spa_raidz_expand_thread()). Therefore we + * may see vre_state!=SCANNING before + * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected + * on disk, but the copying progress has been synced to disk + * (and reflected in spa_ubsync). In this case it's fine to + * treat the expansion as completed, since if we crash there's + * no additional copying to do. + */ + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, + &vdrz->vn_vre); + lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, + zio->io_offset, zio->io_size, RL_READER); + use_scratch = + (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == + RRSS_SCRATCH_VALID); + synced_offset = + RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); + next_offset = vdrz->vn_vre.vre_offset; + /* + * If we haven't resumed expanding since importing the + * pool, vre_offset won't have been set yet. In + * this case the next offset to be copied is the same + * as what was synced. + */ + if (next_offset == UINT64_MAX) { + next_offset = synced_offset; + } + } + if (use_scratch) { + zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" + "%lld next_offset=%lld use_scratch=%u", + zio, + zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ", + (long long)zio->io_offset, + (long long)synced_offset, + (long long)next_offset, + use_scratch); + } + + rm = vdev_raidz_map_alloc_expanded(zio, + tvd->vdev_ashift, vdrz->vd_physical_width, + logical_width, vdrz->vd_nparity, + synced_offset, next_offset, use_scratch); + rm->rm_lr = lr; + } else { + rm = vdev_raidz_map_alloc(zio, + tvd->vdev_ashift, logical_width, vdrz->vd_nparity); + } + rm->rm_original_width = vdrz->vd_original_width; - raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, - vdrz->vd_logical_width, vdrz->vd_nparity); zio->io_vsd = rm; zio->io_vsd_ops = &vdev_raidz_vsd_ops; - - /* - * Until raidz expansion is implemented all maps for a raidz vdev - * contain a single row. - */ - ASSERT3U(rm->rm_nrows, ==, 1); - raidz_row_t *rr = rm->rm_row[0]; - if (zio->io_type == ZIO_TYPE_WRITE) { - vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift); + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_io_start_write(zio, rm->rm_row[i]); + } + + if (logical_width == vdrz->vd_physical_width) { + raidz_start_skip_writes(zio); + } } else { ASSERT(zio->io_type == ZIO_TYPE_READ); - vdev_raidz_io_start_read(zio, rr); + vdev_raidz_io_start_read(zio, rm); } zio_execute(zio); @@ -1752,8 +2600,8 @@ vdev_raidz_io_start(zio_t *zio) /* * Report a checksum error for a child of a RAID-Z device. 
*/ -static void -raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) +void +vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; @@ -1765,12 +2613,12 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; - (void) zfs_ereport_post_checksum(zio->io_spa, vd, - &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, - rc->rc_abd, bad_data, &zbc); mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); + (void) zfs_ereport_post_checksum(zio->io_spa, vd, + &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, + rc->rc_abd, bad_data, &zbc); } } @@ -1781,11 +2629,9 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) static int raidz_checksum_verify(zio_t *zio) { - zio_bad_cksum_t zbc; + zio_bad_cksum_t zbc = {0}; raidz_map_t *rm = zio->io_vsd; - bzero(&zbc, sizeof (zio_bad_cksum_t)); - int ret = zio_checksum_error(zio, &zbc); if (ret != 0 && zbc.zbc_injected != 0) rm->rm_ecksuminjected = 1; @@ -1819,11 +2665,19 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) if (!rc->rc_tried || rc->rc_error != 0) continue; - orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size); - abd_copy(orig[c], rc->rc_abd, rc->rc_size); + orig[c] = rc->rc_abd; + ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); + rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); } /* + * Verify any empty sectors are zero filled to ensure the parity + * is calculated correctly even if these non-data sectors are damaged. + */ + if (rr->rr_nempty && rr->rr_abd_empty != NULL) + ret += vdev_draid_map_verify_empty(zio, rr); + + /* * Regenerates parity even for !tried||rc_error!=0 columns. This * isn't harmful but it does have the side effect of fixing stuff * we didn't realize was necessary (i.e. even if we return 0). 
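The parity-verify step above can be pictured with plain buffers and P (XOR) parity only; this simplified model (not the ABD-based kernel path) keeps the parity read from disk, regenerates it from the data columns, and reports any mismatch.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Regenerate P parity by XOR-ing all data columns into `p`. */
static void
gen_p(uint8_t *p, uint8_t *const *data, int ndata, size_t len)
{
    memset(p, 0, len);
    for (int c = 0; c < ndata; c++)
        for (size_t i = 0; i < len; i++)
            p[i] ^= data[c][i];
}

/* Compare the on-disk parity against a freshly computed copy. */
static bool
parity_matches(const uint8_t *p_on_disk, uint8_t *const *data, int ndata,
    size_t len, uint8_t *scratch)
{
    gen_p(scratch, data, ndata, len);
    return (memcmp(p_on_disk, scratch, len) == 0);
}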
@@ -1837,7 +2691,9 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) continue; if (abd_cmp(orig[c], rc->rc_abd) != 0) { - raidz_checksum_error(zio, rc, orig[c]); + zfs_dbgmsg("found error on col=%u devidx=%u off %llx", + c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); + vdev_raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; } @@ -1852,8 +2708,10 @@ vdev_raidz_worst_error(raidz_row_t *rr) { int error = 0; - for (int c = 0; c < rr->rr_cols; c++) + for (int c = 0; c < rr->rr_cols; c++) { error = zio_worst_error(error, rr->rr_col[c].rc_error); + error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); + } return (error); } @@ -1882,6 +2740,9 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { parity_untried++; } + + if (rc->rc_force_repair) + unexpected_errors++; } /* @@ -1897,7 +2758,6 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) (zio->io_flags & ZIO_FLAG_RESILVER)) { int n = raidz_parity_verify(zio, rr); unexpected_errors += n; - ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol); } if (zio->io_error == 0 && spa_writeable(zio->io_spa) && @@ -1917,6 +2777,10 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) continue; } + zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " + "offset=%llx", + zio, c, rc->rc_devidx, (long long)rc->rc_offset); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, @@ -1926,6 +2790,42 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } + + /* + * Scrub or resilver i/o's: overwrite any shadow locations with the + * good data. This ensures that if we've already copied this sector, + * it will be corrected if it was damaged. This writes more than is + * necessary, but since expansion is paused during scrub/resilver, at + * most a single row will have a shadow location. + */ + if (zio->io_error == 0 && spa_writeable(zio->io_spa) && + (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *vd = zio->io_vd; + + if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) + continue; + vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; + + /* + * Note: We don't want to update the repair stats + * because that would incorrectly indicate that there + * was bad data to repair, which we aren't sure about. + * By clearing the SCAN_THREAD flag, we prevent this + * from happening, despite having the REPAIR flag set. + * We need to set SELF_HEAL so that this i/o can't be + * bypassed by zio_vdev_io_start(). + */ + zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, + rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, + NULL, NULL); + cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; + zio_nowait(cio); + } + } } static void @@ -1945,6 +2845,43 @@ raidz_restore_orig_data(raidz_map_t *rm) } /* + * During raidz_reconstruct() for expanded VDEV, we need special consideration + * failure simulations. See note in raidz_reconstruct() on simulating failure + * of a pre-expansion device. + * + * Treating logical child i as failed, return TRUE if the given column should + * be treated as failed. The idea of logical children allows us to imagine + * that a disk silently failed before a RAIDZ expansion (reads from this disk + * succeed but return the wrong data). 
Since the expansion doesn't verify + * checksums, the incorrect data will be moved to new locations spread among + * the children (going diagonally across them). + * + * Higher "logical child failures" (values of `i`) indicate these + * "pre-expansion failures". The first physical_width values imagine that a + * current child failed; the next physical_width-1 values imagine that a + * child failed before the most recent expansion; the next physical_width-2 + * values imagine a child failed in the expansion before that, etc. + */ +static boolean_t +raidz_simulate_failure(int physical_width, int original_width, int ashift, + int i, raidz_col_t *rc) +{ + uint64_t sector_id = + physical_width * (rc->rc_offset >> ashift) + + rc->rc_devidx; + + for (int w = physical_width; w >= original_width; w--) { + if (i < w) { + return (sector_id % w == i); + } else { + i -= w; + } + } + ASSERT(!"invalid logical child id"); + return (B_FALSE); +} + +/* * returns EINVAL if reconstruction of the block will not be possible * returns ECKSUM if this specific reconstruction failed * returns 0 on successful reconstruction @@ -1953,6 +2890,15 @@ static int raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) { raidz_map_t *rm = zio->io_vsd; + int physical_width = zio->io_vd->vdev_children; + int original_width = (rm->rm_original_width != 0) ? + rm->rm_original_width : physical_width; + int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; + + if (dbgmsg) { + zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " + "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); + } /* Reconstruct each row */ for (int r = 0; r < rm->rm_nrows; r++) { @@ -1962,6 +2908,9 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) int dead = 0; int dead_data = 0; + if (dbgmsg) + zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); + for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; ASSERT0(rc->rc_need_orig_restore); @@ -1974,7 +2923,10 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) if (rc->rc_size == 0) continue; for (int lt = 0; lt < ntgts; lt++) { - if (rc->rc_devidx == ltgts[lt]) { + if (raidz_simulate_failure(physical_width, + original_width, + zio->io_vd->vdev_top->vdev_ashift, + ltgts[lt], rc)) { if (rc->rc_orig_data == NULL) { rc->rc_orig_data = abd_alloc_linear( @@ -1987,13 +2939,37 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) dead++; if (c >= nparity) dead_data++; - my_tgts[t++] = c; + /* + * Note: simulating failure of a + * pre-expansion device can hit more + * than one column, in which case we + * might try to simulate more failures + * than can be reconstructed, which is + * also more than the size of my_tgts. + * This check prevents accessing past + * the end of my_tgts. The "dead > + * nparity" check below will fail this + * reconstruction attempt. 
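A worked example of the mapping above, with hypothetical numbers: take physical_width = 5, original_width = 4 and ashift = 12, so sector_id = 5 * (rc_offset >> 12) + rc_devidx. Logical children 0 through 4 reduce to "current child i failed", because (5 * row + devidx) % 5 is simply devidx. Logical child 6 belongs to the width-4 era (i - 5 = 1) and fails every column whose sector_id % 4 == 1: in row 0 that is only devidx 1, but in row 1 the sector_ids are 5 through 9 with remainders 1, 2, 3, 0, 1, so both devidx 0 and devidx 4 are treated as failed. That is the "can hit more than one column" case the note above guards against.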
+ */ + if (t < VDEV_RAIDZ_MAXPARITY) { + my_tgts[t++] = c; + if (dbgmsg) { + zfs_dbgmsg("simulating " + "failure of col %u " + "devidx %u", c, + (int)rc->rc_devidx); + } + } break; } } } if (dead > nparity) { /* reconstruction not possible */ + if (dbgmsg) { + zfs_dbgmsg("reconstruction not possible; " + "too many failures"); + } raidz_restore_orig_data(rm); return (EINVAL); } @@ -2023,7 +2999,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) */ if (rc->rc_error == 0 && c >= rr->rr_firstdatacol) { - raidz_checksum_error(zio, + vdev_raidz_checksum_error(zio, rc, rc->rc_orig_data); rc->rc_error = SET_ERROR(ECKSUM); @@ -2037,11 +3013,19 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) zio_checksum_verified(zio); + if (dbgmsg) { + zfs_dbgmsg("reconstruction successful " + "(checksum verified)"); + } return (0); } /* Reconstruction failed - restore original data */ raidz_restore_orig_data(rm); + if (dbgmsg) { + zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " + "failed", zio); + } return (ECKSUM); } @@ -2056,7 +3040,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) * The order that we find the various possible combinations of failed * disks is dictated by these rules: * - Examine each "slot" (the "i" in tgts[i]) - * - Try to increment this slot (tgts[i] = tgts[i] + 1) + * - Try to increment this slot (tgts[i] += 1) * - if we can't increment because it runs into the next slot, * reset our slot to the minimum, and examine the next slot * @@ -2087,18 +3071,22 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) * * This strategy works for dRAID but is less efficient when there are a large * number of child vdevs and therefore permutations to check. Furthermore, - * since the raidz_map_t rows likely do not overlap reconstruction would be + * since the raidz_map_t rows likely do not overlap, reconstruction would be * possible as long as there are no more than nparity data errors per row. * These additional permutations are not currently checked but could be as * a future improvement. + * + * Returns 0 on success, ECKSUM on failure. */ static int vdev_raidz_combrec(zio_t *zio) { int nparity = vdev_get_nparity(zio->io_vd); raidz_map_t *rm = zio->io_vsd; + int physical_width = zio->io_vd->vdev_children; + int original_width = (rm->rm_original_width != 0) ? + rm->rm_original_width : physical_width; - /* Check if there's enough data to attempt reconstrution. */ for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; int total_errors = 0; @@ -2116,8 +3104,16 @@ vdev_raidz_combrec(zio_t *zio) int tstore[VDEV_RAIDZ_MAXPARITY + 2]; int *ltgts = &tstore[1]; /* value is logical child ID */ - /* Determine number of logical children, n */ - int n = zio->io_vd->vdev_children; + + /* + * Determine number of logical children, n. See comment + * above raidz_simulate_failure(). + */ + int n = 0; + for (int w = physical_width; + w >= original_width; w--) { + n += w; + } ASSERT3U(num_failures, <=, nparity); ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); @@ -2148,6 +3144,14 @@ vdev_raidz_combrec(zio_t *zio) if (ltgts[t] == n) { /* try more failures */ ASSERT3U(t, ==, num_failures - 1); + if (zfs_flags & + ZFS_DEBUG_RAIDZ_RECONSTRUCT) { + zfs_dbgmsg("reconstruction " + "failed for num_failures=" + "%u; tried all " + "combinations", + num_failures); + } break; } @@ -2159,7 +3163,7 @@ vdev_raidz_combrec(zio_t *zio) * Try the next combination. 
*/ if (ltgts[t] != ltgts[t + 1]) - break; + break; // found next combination /* * Otherwise, reset this tgt to the minimum, @@ -2174,7 +3178,8 @@ vdev_raidz_combrec(zio_t *zio) break; } } - + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruction failed for all num_failures"); return (ECKSUM); } @@ -2199,7 +3204,8 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) static void vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) { - int total_errors = 0; + int normal_errors = 0; + int shadow_errors = 0; ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); @@ -2208,24 +3214,31 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; - if (rc->rc_error) { + if (rc->rc_error != 0) { ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ - - total_errors++; + normal_errors++; + } + if (rc->rc_shadow_error != 0) { + ASSERT(rc->rc_shadow_error != ECKSUM); + shadow_errors++; } } /* * Treat partial writes as a success. If we couldn't write enough - * columns to reconstruct the data, the I/O failed. Otherwise, - * good enough. + * columns to reconstruct the data, the I/O failed. Otherwise, good + * enough. Note that in the case of a shadow write (during raidz + * expansion), depending on if we crash, either the normal (old) or + * shadow (new) location may become the "real" version of the block, + * so both locations must have sufficient redundancy. * * Now that we support write reallocation, it would be better * to treat partial failure as real failure unless there are * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. */ - if (total_errors > rr->rr_firstdatacol) { + if (normal_errors > rr->rr_firstdatacol || + shadow_errors > rr->rr_firstdatacol) { zio->io_error = zio_worst_error(zio->io_error, vdev_raidz_worst_error(rr)); } @@ -2242,14 +3255,24 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; - if (rc->rc_error) { - ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ + /* + * If scrubbing and a replacing/sparing child vdev determined + * that not all of its children have an identical copy of the + * data, then clear the error so the column is treated like + * any other read and force a repair to correct the damage. + */ + if (rc->rc_error == ECKSUM) { + ASSERT(zio->io_flags & ZIO_FLAG_SCRUB); + vdev_raidz_checksum_error(zio, rc, rc->rc_abd); + rc->rc_force_repair = 1; + rc->rc_error = 0; + } + if (rc->rc_error) { if (c < rr->rr_firstdatacol) parity_errors++; else @@ -2314,7 +3337,7 @@ vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) * for a normal read then allocate an ABD for them now so they * may be read, verified, and any needed repairs performed. 
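Stepping back to the slot-advance walk in vdev_raidz_combrec() shown a little earlier: the increment, reset-to-minimum, and carry scheme is simply an enumeration of strictly increasing index tuples. A self-contained sketch of the same traversal (the helper name next_combination() is made up for illustration; this is not the kernel code):

/*
 * Advance idx[0..k-1], a strictly increasing combination drawn from
 * {0, ..., n-1}, to the next combination in the same order
 * vdev_raidz_combrec() visits them.  Returns B_FALSE once every
 * combination has been tried.
 */
static boolean_t
next_combination(int *idx, int k, int n)
{
	for (int t = 0; t < k; t++) {
		int limit = (t == k - 1) ? n : idx[t + 1];
		if (idx[t] + 1 < limit) {
			idx[t]++;	/* this slot can advance */
			return (B_TRUE);
		}
		idx[t] = t;		/* reset to minimum, carry to next slot */
	}
	return (B_FALSE);		/* all combinations exhausted */
}

Starting from idx = {0, 1, ..., k-1}, repeated calls visit every strictly increasing k-tuple over 0..n-1 exactly once, in the order the enumeration comment above describes.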
*/ - if (rr->rr_nempty && rr->rr_abd_empty == NULL) + if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) vdev_draid_map_alloc_empty(zio, rr); for (int c = 0; c < rr->rr_cols; c++) { @@ -2357,12 +3380,12 @@ vdev_raidz_io_done_unrecoverable(zio_t *zio) zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; - (void) zfs_ereport_start_checksum(zio->io_spa, - cvd, &zio->io_bookmark, zio, rc->rc_offset, - rc->rc_size, &zbc); mutex_enter(&cvd->vdev_stat_lock); cvd->vdev_stat.vs_checksum_errors++; mutex_exit(&cvd->vdev_stat_lock); + (void) zfs_ereport_start_checksum(zio->io_spa, + cvd, &zio->io_bookmark, zio, rc->rc_offset, + rc->rc_size, &zbc); } } } @@ -2372,11 +3395,48 @@ vdev_raidz_io_done(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; + ASSERT(zio->io_bp != NULL); if (zio->io_type == ZIO_TYPE_WRITE) { for (int i = 0; i < rm->rm_nrows; i++) { vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); } } else { + if (rm->rm_phys_col) { + /* + * This is an aggregated read. Copy the data and status + * from the aggregate abd's to the individual rows. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_tried || rc->rc_size == 0) + continue; + + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + rc->rc_error = prc->rc_error; + rc->rc_tried = prc->rc_tried; + rc->rc_skipped = prc->rc_skipped; + if (c >= rr->rr_firstdatacol) { + /* + * Note: this is slightly faster + * than using abd_copy_off(). + */ + char *physbuf = abd_to_buf( + prc->rc_abd); + void *physloc = physbuf + + rc->rc_offset - + prc->rc_offset; + + abd_copy_from_buf(rc->rc_abd, + physloc, rc->rc_size); + } + } + } + } + for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_reconstruct_known_missing(zio, @@ -2423,7 +3483,54 @@ vdev_raidz_io_done(zio_t *zio) zio_vdev_io_redone(zio); return; } - + /* + * It would be too expensive to try every possible + * combination of failed sectors in every row, so + * instead we try every combination of failed current or + * past physical disk. This means that if the incorrect + * sectors were all on Nparity disks at any point in the + * past, we will find the correct data. The only known + * case where this is less durable than a non-expanded + * RAIDZ, is if we have a silent failure during + * expansion. In that case, one block could be + * partially in the old format and partially in the + * new format, so we'd lost some sectors from the old + * format and some from the new format. + * + * e.g. logical_width=4 physical_width=6 + * the 15 (6+5+4) possible failed disks are: + * width=6 child=0 + * width=6 child=1 + * width=6 child=2 + * width=6 child=3 + * width=6 child=4 + * width=6 child=5 + * width=5 child=0 + * width=5 child=1 + * width=5 child=2 + * width=5 child=3 + * width=5 child=4 + * width=4 child=0 + * width=4 child=1 + * width=4 child=2 + * width=4 child=3 + * And we will try every combination of Nparity of these + * failing. + * + * As a first pass, we can generate every combo, + * and try reconstructing, ignoring any known + * failures. If any row has too many known + simulated + * failures, then we bail on reconstructing with this + * number of simulated failures. As an improvement, + * we could detect the number of whole known failures + * (i.e. we have known failures on these disks for + * every row; the disks never succeeded), and + * subtract that from the max # failures to simulate. 
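To put numbers on the example above: with logical_width = 4 expanded to physical_width = 6 there are 6 + 5 + 4 = 15 logical children, so combinatorial reconstruction tries at most 15 single failures for raidz1, 15 + C(15,2) = 120 combinations for raidz2, and 15 + 105 + C(15,3) = 575 for raidz3, stopping as soon as one combination reconstructs data that passes the checksum.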
+ * We could go even further like the current + * combrec code, but that doesn't seem like it + * gains us very much. If we simulate a failure + * that is also a known failure, that's fine. + */ zio->io_error = vdev_raidz_combrec(zio); if (zio->io_error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -2431,6 +3538,10 @@ vdev_raidz_io_done(zio_t *zio) } } } + if (rm->rm_lr != NULL) { + zfs_rangelock_exit(rm->rm_lr); + rm->rm_lr = NULL; + } } static void @@ -2457,6 +3568,14 @@ vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { vdev_raidz_t *vdrz = vd->vdev_tsd; + + /* + * If we're in the middle of a RAIDZ expansion, this block may be in + * the old and/or new location. For simplicity, always resilver it. + */ + if (vdrz->vn_vre.vre_state == DSS_SCANNING) + return (B_TRUE); + uint64_t dcols = vd->vdev_children; uint64_t nparity = vdrz->vd_nparity; uint64_t ashift = vd->vdev_top->vdev_ashift; @@ -2496,10 +3615,29 @@ static void vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { + (void) remain_rs; + vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); - uint64_t width = raidvd->vdev_children; + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + /* + * We're in the middle of expansion, in which case the + * translation is in flux. Any answer we give may be wrong + * by the time we return, so it isn't safe for the caller to + * act on it. Therefore we say that this range isn't present + * on any children. The only consumers of this are "zpool + * initialize" and trimming, both of which are "best effort" + * anyway. + */ + physical_rs->rs_start = physical_rs->rs_end = 0; + remain_rs->rs_start = remain_rs->rs_end = 0; + return; + } + + uint64_t width = vdrz->vd_physical_width; uint64_t tgt_col = cvd->vdev_id; uint64_t ashift = raidvd->vdev_top->vdev_ashift; @@ -2525,15 +3663,1156 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, logical_rs->rs_end - logical_rs->rs_start); } +static void +raidz_reflow_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + /* + * Ensure there are no i/os to the range that is being committed. + */ + uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); + ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); + + mutex_enter(&vre->vre_lock); + uint64_t new_offset = + MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); + /* + * We should not have committed anything that failed. + */ + VERIFY3U(vre->vre_failed_offset, >=, old_offset); + mutex_exit(&vre->vre_lock); + + zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, + old_offset, new_offset - old_offset, + RL_WRITER); + + /* + * Update the uberblock that will be written when this txg completes. 
+ */ + RAIDZ_REFLOW_SET(&spa->spa_uberblock, + RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset); + vre->vre_offset_pertxg[txgoff] = 0; + zfs_rangelock_exit(lr); + + mutex_enter(&vre->vre_lock); + vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; + vre->vre_bytes_copied_pertxg[txgoff] = 0; + mutex_exit(&vre->vre_lock); + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, + sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); +} + +static void +raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + + for (int i = 0; i < TXG_SIZE; i++) + VERIFY0(vre->vre_offset_pertxg[i]); + + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; + re->re_logical_width = vdrz->vd_physical_width; + mutex_enter(&vdrz->vd_expand_lock); + avl_add(&vdrz->vd_expand_txgs, re); + mutex_exit(&vdrz->vd_expand_lock); + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + + /* + * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS + * will get written (based on vd_expand_txgs). + */ + vdev_config_dirty(vd); + + /* + * Before we change vre_state, the on-disk state must reflect that we + * have completed all copying, so that vdev_raidz_io_start() can use + * vre_state to determine if the reflow is in progress. See also the + * end of spa_raidz_expand_thread(). + */ + VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, + raidvd->vdev_ms_count << raidvd->vdev_ms_shift); + + vre->vre_end_time = gethrestime_sec(); + vre->vre_state = DSS_FINISHED; + + uint64_t state = vre->vre_state; + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state, tx)); + + uint64_t end_time = vre->vre_end_time; + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, + sizeof (end_time), 1, &end_time, tx)); + + spa->spa_uberblock.ub_raidz_reflow_info = 0; + + spa_history_log_internal(spa, "raidz vdev expansion completed", tx, + "%s vdev %llu new width %llu", spa_name(spa), + (unsigned long long)vd->vdev_id, + (unsigned long long)vd->vdev_children); + + spa->spa_raidz_expand = NULL; + raidvd->vdev_rz_expanding = B_FALSE; + + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); + spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); + spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); + + spa_notify_waiters(spa); + + /* + * While we're in syncing context take the opportunity to + * setup a scrub. All the data has been sucessfully copied + * but we have not validated any checksums. + */ + pool_scan_func_t func = POOL_SCAN_SCRUB; + if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0) + dsl_scan_setup_sync(&func, tx); +} + +/* + * Struct for one copy zio. + */ +typedef struct raidz_reflow_arg { + vdev_raidz_expand_t *rra_vre; + zfs_locked_range_t *rra_lr; + uint64_t rra_txg; +} raidz_reflow_arg_t; + +/* + * The write of the new location is done. 
+ */ +static void +raidz_reflow_write_done(zio_t *zio) +{ + raidz_reflow_arg_t *rra = zio->io_private; + vdev_raidz_expand_t *vre = rra->rra_vre; + + abd_free(zio->io_abd); + + mutex_enter(&vre->vre_lock); + if (zio->io_error != 0) { + /* Force a reflow pause on errors */ + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + } + ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); + vre->vre_outstanding_bytes -= zio->io_size; + if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < + vre->vre_failed_offset) { + vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += + zio->io_size; + } + cv_signal(&vre->vre_cv); + mutex_exit(&vre->vre_lock); + + zfs_rangelock_exit(rra->rra_lr); + + kmem_free(rra, sizeof (*rra)); + spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); +} + +/* + * The read of the old location is done. The parent zio is the write to + * the new location. Allow it to start. + */ +static void +raidz_reflow_read_done(zio_t *zio) +{ + raidz_reflow_arg_t *rra = zio->io_private; + vdev_raidz_expand_t *vre = rra->rra_vre; + + /* + * If the read failed, or if it was done on a vdev that is not fully + * healthy (e.g. a child that has a resilver in progress), we may not + * have the correct data. Note that it's OK if the write proceeds. + * It may write garbage but the location is otherwise unused and we + * will retry later due to vre_failed_offset. + */ + if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) { + zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu " + "err=%u partial_dtl_empty=%u missing_dtl_empty=%u", + (long long)rra->rra_lr->lr_offset, + (long long)rra->rra_lr->lr_length, + (long long)rra->rra_txg, + zio->io_error, + vdev_dtl_empty(zio->io_vd, DTL_PARTIAL), + vdev_dtl_empty(zio->io_vd, DTL_MISSING)); + mutex_enter(&vre->vre_lock); + /* Force a reflow pause on errors */ + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + mutex_exit(&vre->vre_lock); + } + + zio_nowait(zio_unique_parent(zio)); +} + +static void +raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset, + dmu_tx_t *tx) +{ + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (offset == 0) + return; + + mutex_enter(&vre->vre_lock); + ASSERT3U(vre->vre_offset, <=, offset); + vre->vre_offset = offset; + mutex_exit(&vre->vre_lock); + + if (vre->vre_offset_pertxg[txgoff] == 0) { + dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync, + spa, tx); + } + vre->vre_offset_pertxg[txgoff] = offset; +} + +static boolean_t +vdev_raidz_expand_child_replacing(vdev_t *raidz_vd) +{ + for (int i = 0; i < raidz_vd->vdev_children; i++) { + /* Quick check if a child is being replaced */ + if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, + dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + int ashift = vd->vdev_top->vdev_ashift; + uint64_t offset, size; + + if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize, + &offset, &size)) { + return (B_FALSE); + } + ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); + ASSERT3U(size, >=, 1 << ashift); + uint64_t length = 1 << ashift; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + uint64_t blkid = offset >> ashift; + + int old_children = vd->vdev_children - 1; + + /* + * We can only progress to the point that writes will not overlap + * with blocks whose progress has not yet been recorded on disk. 
+ * Since partially-copied rows are still read from the old location, + * we need to stop one row before the sector-wise overlap, to prevent + * row-wise overlap. + * + * Note that even if we are skipping over a large unallocated region, + * we can't move the on-disk progress to `offset`, because concurrent + * writes/allocations could still use the currently-unallocated + * region. + */ + uint64_t ubsync_blkid = + RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; + uint64_t next_overwrite_blkid = ubsync_blkid + + ubsync_blkid / old_children - old_children; + VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); + + if (blkid >= next_overwrite_blkid) { + raidz_reflow_record_progress(vre, + next_overwrite_blkid << ashift, tx); + return (B_TRUE); + } + + range_tree_remove(rt, offset, length); + + raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP); + rra->rra_vre = vre; + rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, + offset, length, RL_WRITER); + rra->rra_txg = dmu_tx_get_txg(tx); + + raidz_reflow_record_progress(vre, offset + length, tx); + + mutex_enter(&vre->vre_lock); + vre->vre_outstanding_bytes += length; + mutex_exit(&vre->vre_lock); + + /* + * SCL_STATE will be released when the read and write are done, + * by raidz_reflow_write_done(). + */ + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + /* check if a replacing vdev was added, if so treat it as an error */ + if (vdev_raidz_expand_child_replacing(vd)) { + zfs_dbgmsg("replacing vdev encountered, reflow paused at " + "offset=%llu txg=%llu", + (long long)rra->rra_lr->lr_offset, + (long long)rra->rra_txg); + + mutex_enter(&vre->vre_lock); + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + cv_signal(&vre->vre_cv); + mutex_exit(&vre->vre_lock); + + /* drop everything we acquired */ + zfs_rangelock_exit(rra->rra_lr); + kmem_free(rra, sizeof (*rra)); + spa_config_exit(spa, SCL_STATE, spa); + return (B_TRUE); + } + + zio_t *pio = spa->spa_txg_zio[txgoff]; + abd_t *abd = abd_alloc_for_io(length, B_FALSE); + zio_t *write_zio = zio_vdev_child_io(pio, NULL, + vd->vdev_child[blkid % vd->vdev_children], + (blkid / vd->vdev_children) << ashift, + abd, length, + ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + raidz_reflow_write_done, rra); + + zio_nowait(zio_vdev_child_io(write_zio, NULL, + vd->vdev_child[blkid % old_children], + (blkid / old_children) << ashift, + abd, length, + ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + raidz_reflow_read_done, rra)); + + return (B_FALSE); +} + +/* + * For testing (ztest specific) + */ +static void +raidz_expand_pause(uint_t pause_point) +{ + while (raidz_expand_pause_point != 0 && + raidz_expand_pause_point <= pause_point) + delay(hz); +} + +static void +raidz_scratch_child_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + mutex_enter(&pio->io_lock); + pio->io_error = zio_worst_error(pio->io_error, zio->io_error); + mutex_exit(&pio->io_lock); +} + +/* + * Reflow the beginning portion of the vdev into an intermediate scratch area + * in memory and on disk. This operation must be persisted on disk before we + * proceed to overwrite the beginning portion with the reflowed data. + * + * This multi-step task can fail to complete if disk errors are encountered + * and we can return here after a pause (waiting for disk to become healthy). 
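A concrete illustration of the placement math in raidz_reflow_impl() above, with hypothetical numbers: with vdev_children = 5 (old_children = 4) and ashift = 12, logical sector blkid = 13 is read from child 13 % 4 = 1 at byte offset (13 / 4) << 12 = 3 * 4096 and rewritten to child 13 % 5 = 3 at offset (13 / 5) << 12 = 2 * 4096. For the overlap guard, if the on-disk progress is ubsync_blkid = 1000 then next_overwrite_blkid = 1000 + 1000 / 4 - 4 = 1246: sector 1250 is the first whose new-layout row (row 250) would overwrite the old-layout row holding sectors 1000 through 1003, which are not yet recorded as copied, and the formula stops about one row short of that, as the comment above requires.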
+ */ +static void +raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) +{ + vdev_raidz_expand_t *vre = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + zio_t *pio; + int error; + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + int ashift = raidvd->vdev_ashift; + uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift, + uint64_t); + uint64_t logical_size = write_size * raidvd->vdev_children; + uint64_t read_size = + P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), + 1 << ashift); + + /* + * The scratch space must be large enough to get us to the point + * that one row does not overlap itself when moved. This is checked + * by vdev_raidz_attach_check(). + */ + VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); + VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); + VERIFY3U(write_size, <=, read_size); + + zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, + 0, logical_size, RL_WRITER); + + abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), + KM_SLEEP); + for (int i = 0; i < raidvd->vdev_children; i++) { + abds[i] = abd_alloc_linear(read_size, B_FALSE); + } + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); + + /* + * If we have already written the scratch area then we must read from + * there, since new writes were redirected there while we were paused + * or the original location may have been partially overwritten with + * reflowed data. + */ + if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { + VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); + /* + * Read from scratch space. + */ + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children; i++) { + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE + * to the offset to calculate the physical offset to + * write to. Passing in a negative offset makes us + * access the scratch area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, + raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + zfs_dbgmsg("reflow: error %d reading scratch location", + error); + goto io_error_exit; + } + goto overwrite; + } + + /* + * Read from original location. + */ + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children - 1; i++) { + ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], read_size, ZIO_TYPE_READ, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + zfs_dbgmsg("reflow: error %d reading original location", error); +io_error_exit: + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + zfs_rangelock_exit(lr); + spa_config_exit(spa, SCL_STATE, FTAG); + return; + } + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); + + /* + * Reflow in memory. 
+ */ + uint64_t logical_sectors = logical_size >> ashift; + for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { + int oldchild = i % (raidvd->vdev_children - 1); + uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; + + int newchild = i % raidvd->vdev_children; + uint64_t newoff = (i / raidvd->vdev_children) << ashift; + + /* a single sector should not be copying over itself */ + ASSERT(!(newchild == oldchild && newoff == oldoff)); + + abd_copy_off(abds[newchild], abds[oldchild], + newoff, oldoff, 1 << ashift); + } + + /* + * Verify that we filled in everything we intended to (write_size on + * each child). + */ + VERIFY0(logical_sectors % raidvd->vdev_children); + VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, + write_size); + + /* + * Write to scratch location (boot area). + */ + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children; i++) { + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to + * the offset to calculate the physical offset to write to. + * Passing in a negative offset lets us access the boot area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + zfs_dbgmsg("reflow: error %d writing scratch location", error); + goto io_error_exit; + } + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", + (long long)logical_size); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); + + /* + * Update uberblock to indicate that scratch space is valid. This is + * needed because after this point, the real location may be + * overwritten. If we crash, we need to get the data from the + * scratch space, rather than the real location. + * + * Note: ub_timestamp is bumped so that vdev_uberblock_compare() + * will prefer this uberblock. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); + spa->spa_ubsync.ub_timestamp++; + ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + + zfs_dbgmsg("reflow: uberblock updated " + "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); + + /* + * Overwrite with reflow'ed data. + */ +overwrite: + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children; i++) { + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], write_size, ZIO_TYPE_WRITE, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, + raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + /* + * When we exit early here and drop the range lock, new + * writes will go into the scratch area so we'll need to + * read from there when we return after pausing. + */ + zfs_dbgmsg("reflow: error %d writing real location", error); + /* + * Update the uberblock that is written when this txg completes. 
+ */ + RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, + logical_size); + goto io_error_exit; + } + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", + (long long)logical_size); + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); + + /* + * Update uberblock to indicate that the initial part has been + * reflow'ed. This is needed because after this point (when we exit + * the rangelock), we allow regular writes to this region, which will + * be written to the new location only (because reflow_offset_next == + * reflow_offset_synced). If we crashed and re-copied from the + * scratch space, we would lose the regular writes. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, + logical_size); + spa->spa_ubsync.ub_timestamp++; + ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + + zfs_dbgmsg("reflow: uberblock updated " + "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); + + /* + * Update progress. + */ + vre->vre_offset = logical_size; + zfs_rangelock_exit(lr); + spa_config_exit(spa, SCL_STATE, FTAG); + + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vre->vre_offset_pertxg[txgoff] = vre->vre_offset; + vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; + /* + * Note - raidz_reflow_sync() will update the uberblock state to + * RRSS_SCRATCH_INVALID_SYNCED_REFLOW + */ + raidz_reflow_sync(spa, tx); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); +} + +/* + * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work + * here. No other i/o can be in progress, so we don't need the vre_rangelock. + */ +void +vdev_raidz_reflow_copy_scratch(spa_t *spa) +{ + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); + ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + ASSERT0(logical_size % raidvd->vdev_children); + uint64_t write_size = logical_size / raidvd->vdev_children; + + zio_t *pio; + + /* + * Read from scratch space. + */ + abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), + KM_SLEEP); + for (int i = 0; i < raidvd->vdev_children; i++) { + abds[i] = abd_alloc_linear(write_size, B_FALSE); + } + + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children; i++) { + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to + * the offset to calculate the physical offset to write to. + * Passing in a negative offset lets us access the boot area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_READ, + ZIO_PRIORITY_ASYNC_READ, 0, + raidz_scratch_child_done, pio)); + } + zio_wait(pio); + + /* + * Overwrite real location with reflow'ed data. 
+ */ + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children; i++) { + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], write_size, ZIO_TYPE_WRITE, + ZIO_PRIORITY_ASYNC_WRITE, 0, + raidz_scratch_child_done, pio)); + } + zio_wait(pio); + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " + "to real location", (long long)logical_size); + + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + + /* + * Update uberblock. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, + RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size); + spa->spa_ubsync.ub_timestamp++; + VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + + zfs_dbgmsg("reflow recovery: uberblock updated " + "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, + spa_first_txg(spa)); + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vre->vre_offset = logical_size; + vre->vre_offset_pertxg[txgoff] = vre->vre_offset; + vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; + /* + * Note that raidz_reflow_sync() will update the uberblock once more + */ + raidz_reflow_sync(spa, tx); + + dmu_tx_commit(tx); + + spa_config_exit(spa, SCL_STATE, FTAG); +} + +static boolean_t +spa_raidz_expand_thread_check(void *arg, zthr_t *zthr) +{ + (void) zthr; + spa_t *spa = arg; + + return (spa->spa_raidz_expand != NULL && + !spa->spa_raidz_expand->vre_waiting_for_resilver); +} + +/* + * RAIDZ expansion background thread + * + * Can be called multiple times if the reflow is paused + */ +static void +spa_raidz_expand_thread(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) + vre->vre_offset = 0; + else + vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); + + /* Reflow the begining portion using the scratch area */ + if (vre->vre_offset == 0) { + VERIFY0(dsl_sync_task(spa_name(spa), + NULL, raidz_reflow_scratch_sync, + vre, 0, ZFS_SPACE_CHECK_NONE)); + + /* if we encountered errors then pause */ + if (vre->vre_offset == 0) { + mutex_enter(&vre->vre_lock); + vre->vre_waiting_for_resilver = B_TRUE; + mutex_exit(&vre->vre_lock); + return; + } + } + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + + uint64_t guid = raidvd->vdev_guid; + + /* Iterate over all the remaining metaslabs */ + for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; + i < raidvd->vdev_ms_count && + !zthr_iscancelled(zthr) && + vre->vre_failed_offset == UINT64_MAX; i++) { + metaslab_t *msp = raidvd->vdev_ms[i]; + + metaslab_disable(msp); + mutex_enter(&msp->ms_lock); + + /* + * The metaslab may be newly created (for the expanded + * space), in which case its trees won't exist yet, + * so we need to bail out early. + */ + if (msp->ms_new) { + mutex_exit(&msp->ms_lock); + metaslab_enable(msp, B_FALSE, B_FALSE); + continue; + } + + VERIFY0(metaslab_load(msp)); + + /* + * We want to copy everything except the free (allocatable) + * space. Note that there may be a little bit more free + * space (e.g. 
in ms_defer), and it's fine to copy that too. + */ + range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64, + NULL, 0, 0); + range_tree_add(rt, msp->ms_start, msp->ms_size); + range_tree_walk(msp->ms_allocatable, range_tree_remove, rt); + mutex_exit(&msp->ms_lock); + + /* + * Force the last sector of each metaslab to be copied. This + * ensures that we advance the on-disk progress to the end of + * this metaslab while the metaslab is disabled. Otherwise, we + * could move past this metaslab without advancing the on-disk + * progress, and then an allocation to this metaslab would not + * be copied. + */ + int sectorsz = 1 << raidvd->vdev_ashift; + uint64_t ms_last_offset = msp->ms_start + + msp->ms_size - sectorsz; + if (!range_tree_contains(rt, ms_last_offset, sectorsz)) { + range_tree_add(rt, ms_last_offset, sectorsz); + } + + /* + * When we are resuming from a paused expansion (i.e. + * when importing a pool with a expansion in progress), + * discard any state that we have already processed. + */ + range_tree_clear(rt, 0, vre->vre_offset); + + while (!zthr_iscancelled(zthr) && + !range_tree_is_empty(rt) && + vre->vre_failed_offset == UINT64_MAX) { + + /* + * We need to periodically drop the config lock so that + * writers can get in. Additionally, we can't wait + * for a txg to sync while holding a config lock + * (since a waiting writer could cause a 3-way deadlock + * with the sync thread, which also gets a config + * lock for reader). So we can't hold the config lock + * while calling dmu_tx_assign(). + */ + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * If requested, pause the reflow when the amount + * specified by raidz_expand_max_reflow_bytes is reached + * + * This pause is only used during testing or debugging. + */ + while (raidz_expand_max_reflow_bytes != 0 && + raidz_expand_max_reflow_bytes <= + vre->vre_bytes_copied && !zthr_iscancelled(zthr)) { + delay(hz); + } + + mutex_enter(&vre->vre_lock); + while (vre->vre_outstanding_bytes > + raidz_expand_max_copy_bytes) { + cv_wait(&vre->vre_cv, &vre->vre_lock); + } + mutex_exit(&vre->vre_lock); + + dmu_tx_t *tx = + dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + /* + * Reacquire the vdev_config lock. Theoretically, the + * vdev_t that we're expanding may have changed. + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + + boolean_t needsync = + raidz_reflow_impl(raidvd, vre, rt, tx); + + dmu_tx_commit(tx); + + if (needsync) { + spa_config_exit(spa, SCL_CONFIG, FTAG); + txg_wait_synced(spa->spa_dsl_pool, txg); + spa_config_enter(spa, SCL_CONFIG, FTAG, + RW_READER); + } + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + metaslab_enable(msp, B_FALSE, B_FALSE); + range_tree_vacate(rt, NULL, NULL); + range_tree_destroy(rt); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * The txg_wait_synced() here ensures that all reflow zio's have + * completed, and vre_failed_offset has been set if necessary. It + * also ensures that the progress of the last raidz_reflow_sync() is + * written to disk before raidz_reflow_complete_sync() changes the + * in-memory vre_state. vdev_raidz_io_start() uses vre_state to + * determine if a reflow is in progress, in which case we may need to + * write to both old and new locations. 
Therefore we can only change + * vre_state once this is not necessary, which is once the on-disk + * progress (in spa_ubsync) has been set past any possible writes (to + * the end of the last metaslab). + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + + if (!zthr_iscancelled(zthr) && + vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { + /* + * We are not being canceled or paused, so the reflow must be + * complete. In that case also mark it as completed on disk. + */ + ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + raidz_reflow_complete_sync, spa, + 0, ZFS_SPACE_CHECK_NONE)); + (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); + } else { + /* + * Wait for all copy zio's to complete and for all the + * raidz_reflow_sync() synctasks to be run. + */ + spa_history_log_internal(spa, "reflow pause", + NULL, "offset=%llu failed_offset=%lld", + (long long)vre->vre_offset, + (long long)vre->vre_failed_offset); + mutex_enter(&vre->vre_lock); + if (vre->vre_failed_offset != UINT64_MAX) { + /* + * Reset progress so that we will retry everything + * after the point that something failed. + */ + vre->vre_offset = vre->vre_failed_offset; + vre->vre_failed_offset = UINT64_MAX; + vre->vre_waiting_for_resilver = B_TRUE; + } + mutex_exit(&vre->vre_lock); + } +} + +void +spa_start_raidz_expansion_thread(spa_t *spa) +{ + ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); + spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", + spa_raidz_expand_thread_check, spa_raidz_expand_thread, + spa, defclsyspri); +} + +void +raidz_dtl_reassessed(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + if (spa->spa_raidz_expand != NULL) { + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + /* + * we get called often from vdev_dtl_reassess() so make + * sure it's our vdev and any replacing is complete + */ + if (vd->vdev_top->vdev_id == vre->vre_vdev_id && + !vdev_raidz_expand_child_replacing(vd->vdev_top)) { + mutex_enter(&vre->vre_lock); + if (vre->vre_waiting_for_resilver) { + vdev_dbgmsg(vd, "DTL reassessed, " + "continuing raidz expansion"); + vre->vre_waiting_for_resilver = B_FALSE; + zthr_wakeup(spa->spa_raidz_expand_zthr); + } + mutex_exit(&vre->vre_lock); + } + } +} + +int +vdev_raidz_attach_check(vdev_t *new_child) +{ + vdev_t *raidvd = new_child->vdev_parent; + uint64_t new_children = raidvd->vdev_children; + + /* + * We use the "boot" space as scratch space to handle overwriting the + * initial part of the vdev. If it is too small, then this expansion + * is not allowed. This would be very unusual (e.g. ashift > 13 and + * >200 children). 
+ */ + if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { + return (EINVAL); + } + return (0); +} + +void +vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) +{ + vdev_t *new_child = arg; + spa_t *spa = new_child->vdev_spa; + vdev_t *raidvd = new_child->vdev_parent; + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); + ASSERT3P(raidvd->vdev_top, ==, raidvd); + ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); + ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); + ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, + new_child); + + spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); + + vdrz->vd_physical_width++; + + VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); + vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; + vdrz->vn_vre.vre_offset = 0; + vdrz->vn_vre.vre_failed_offset = UINT64_MAX; + spa->spa_raidz_expand = &vdrz->vn_vre; + zthr_wakeup(spa->spa_raidz_expand_zthr); + + /* + * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get + * written to the config. + */ + vdev_config_dirty(raidvd); + + vdrz->vn_vre.vre_start_time = gethrestime_sec(); + vdrz->vn_vre.vre_end_time = 0; + vdrz->vn_vre.vre_state = DSS_SCANNING; + vdrz->vn_vre.vre_bytes_copied = 0; + + uint64_t state = vdrz->vn_vre.vre_state; + VERIFY0(zap_update(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state, tx)); + + uint64_t start_time = vdrz->vn_vre.vre_start_time; + VERIFY0(zap_update(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, + sizeof (start_time), 1, &start_time, tx)); + + (void) zap_remove(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); + (void) zap_remove(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); + + spa_history_log_internal(spa, "raidz vdev expansion started", tx, + "%s vdev %llu new width %llu", spa_name(spa), + (unsigned long long)raidvd->vdev_id, + (unsigned long long)raidvd->vdev_children); +} + +int +vdev_raidz_load(vdev_t *vd) +{ + vdev_raidz_t *vdrz = vd->vdev_tsd; + int err; + + uint64_t state = DSS_NONE; + uint64_t start_time = 0; + uint64_t end_time = 0; + uint64_t bytes_copied = 0; + + if (vd->vdev_top_zap != 0) { + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, + sizeof (start_time), 1, &start_time); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, + sizeof (end_time), 1, &end_time); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, + sizeof (bytes_copied), 1, &bytes_copied); + if (err != 0 && err != ENOENT) + return (err); + } + + /* + * If we are in the middle of expansion, vre_state should have + * already been set by vdev_raidz_init(). 
+ */ + EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); + vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; + vdrz->vn_vre.vre_start_time = start_time; + vdrz->vn_vre.vre_end_time = end_time; + vdrz->vn_vre.vre_bytes_copied = bytes_copied; + + return (0); +} + +int +spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) +{ + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + if (vre == NULL) { + /* no removal in progress; find most recent completed */ + for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; + if (vd->vdev_ops == &vdev_raidz_ops) { + vdev_raidz_t *vdrz = vd->vdev_tsd; + + if (vdrz->vn_vre.vre_end_time != 0 && + (vre == NULL || + vdrz->vn_vre.vre_end_time > + vre->vre_end_time)) { + vre = &vdrz->vn_vre; + } + } + } + } + + if (vre == NULL) { + return (SET_ERROR(ENOENT)); + } + + pres->pres_state = vre->vre_state; + pres->pres_expanding_vdev = vre->vre_vdev_id; + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + pres->pres_to_reflow = vd->vdev_stat.vs_alloc; + + mutex_enter(&vre->vre_lock); + pres->pres_reflowed = vre->vre_bytes_copied; + for (int i = 0; i < TXG_SIZE; i++) + pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; + mutex_exit(&vre->vre_lock); + + pres->pres_start_time = vre->vre_start_time; + pres->pres_end_time = vre->vre_end_time; + pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; + + return (0); +} + /* * Initialize private RAIDZ specific fields from the nvlist. */ static int vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) { - vdev_raidz_t *vdrz; - uint64_t nparity; - uint_t children; nvlist_t **child; int error = nvlist_lookup_nvlist_array(nv, @@ -2541,6 +4820,7 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) if (error != 0) return (SET_ERROR(EINVAL)); + uint64_t nparity; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (SET_ERROR(EINVAL)); @@ -2567,10 +4847,56 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) nparity = 1; } - vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); - vdrz->vd_logical_width = children; + vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); + vdrz->vn_vre.vre_vdev_id = -1; + vdrz->vn_vre.vre_offset = UINT64_MAX; + vdrz->vn_vre.vre_failed_offset = UINT64_MAX; + mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); + zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); + mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, + sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); + + vdrz->vd_physical_width = children; vdrz->vd_nparity = nparity; + /* note, the ID does not exist when creating a pool */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, + &vdrz->vn_vre.vre_vdev_id); + + boolean_t reflow_in_progress = + nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); + if (reflow_in_progress) { + spa->spa_raidz_expand = &vdrz->vn_vre; + vdrz->vn_vre.vre_state = DSS_SCANNING; + } + + vdrz->vd_original_width = children; + uint64_t *txgs; + unsigned int txgs_size = 0; + error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, + &txgs, &txgs_size); + if (error == 0) { + for (int i = 0; i < txgs_size; i++) { + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = txgs[txgs_size - i - 1]; + re->re_logical_width = vdrz->vd_physical_width - i; 
+ + if (reflow_in_progress) + re->re_logical_width--; + + avl_add(&vdrz->vd_expand_txgs, re); + } + + vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; + } + if (reflow_in_progress) { + vdrz->vd_original_width--; + zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", + children, txgs_size); + } + *tsd = vdrz; return (0); @@ -2579,7 +4905,20 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) static void vdev_raidz_fini(vdev_t *vd) { - kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t)); + vdev_raidz_t *vdrz = vd->vdev_tsd; + if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) + vd->vdev_spa->spa_raidz_expand = NULL; + reflow_node_t *re; + void *cookie = NULL; + avl_tree_t *tree = &vdrz->vd_expand_txgs; + while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) + kmem_free(re, sizeof (*re)); + avl_destroy(&vdrz->vd_expand_txgs); + mutex_destroy(&vdrz->vd_expand_lock); + mutex_destroy(&vdrz->vn_vre.vre_lock); + cv_destroy(&vdrz->vn_vre.vre_cv); + zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); + kmem_free(vdrz, sizeof (*vdrz)); } /* @@ -2607,6 +4946,29 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) * it. */ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); + + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); + } + + mutex_enter(&vdrz->vd_expand_lock); + if (!avl_is_empty(&vdrz->vd_expand_txgs)) { + uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); + uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, + KM_SLEEP); + uint64_t i = 0; + + for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); + re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { + txgs[i++] = re->re_txg; + } + + fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, + txgs, count); + + kmem_free(txgs, sizeof (uint64_t) * count); + } + mutex_exit(&vdrz->vd_expand_lock); } static uint64_t @@ -2646,3 +5008,15 @@ vdev_ops_t vdev_raidz_ops = { .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, + "For testing, pause RAIDZ expansion after reflowing this many bytes"); +ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, + "Max amount of concurrent i/o for RAIDZ expansion"); +ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, + "For expanded RAIDZ, aggregate reads that have more rows than this"); +ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, + "For expanded RAIDZ, automatically start a pool scrub when expansion " + "completes"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c index 03df2df5adaf..e12b96170f55 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -22,6 +22,7 @@ * Copyright (C) 2016 Gvozden Nešković. All rights reserved. 
*/ +#include <sys/simd.h> #include <sys/zfs_context.h> #include <sys/types.h> #include <sys/zio.h> @@ -29,7 +30,6 @@ #include <sys/zfs_debug.h> #include <sys/vdev_raidz.h> #include <sys/vdev_raidz_impl.h> -#include <sys/simd.h> /* Opaque implementation with NULL methods to represent original methods */ static const raidz_impl_ops_t vdev_raidz_original_impl = { @@ -43,7 +43,7 @@ static raidz_impl_ops_t vdev_raidz_fastest_impl = { }; /* All compiled in implementations */ -const raidz_impl_ops_t *raidz_all_maths[] = { +static const raidz_impl_ops_t *const raidz_all_maths[] = { &vdev_raidz_original_impl, &vdev_raidz_scalar_impl, #if defined(__x86_64) && defined(HAVE_SSE2) /* only x86_64 for now */ @@ -268,10 +268,10 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, raidz_row_t *rr, return (rec_fn(rr, dt)); } -const char *raidz_gen_name[] = { +const char *const raidz_gen_name[] = { "gen_p", "gen_pq", "gen_pqr" }; -const char *raidz_rec_name[] = { +const char *const raidz_rec_name[] = { "rec_p", "rec_q", "rec_r", "rec_pq", "rec_pr", "rec_qr", "rec_pqr" }; @@ -283,22 +283,19 @@ const char *raidz_rec_name[] = { static int raidz_math_kstat_headers(char *buf, size_t size) { - int i; - ssize_t off; - ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN); - off = snprintf(buf, size, "%-17s", "implementation"); + ssize_t off = kmem_scnprintf(buf, size, "%-17s", "implementation"); - for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) - off += snprintf(buf + off, size - off, "%-16s", + for (int i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) + off += kmem_scnprintf(buf + off, size - off, "%-16s", raidz_gen_name[i]); - for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) - off += snprintf(buf + off, size - off, "%-16s", + for (int i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) + off += kmem_scnprintf(buf + off, size - off, "%-16s", raidz_rec_name[i]); - (void) snprintf(buf + off, size - off, "\n"); + (void) kmem_scnprintf(buf + off, size - off, "\n"); return (0); } @@ -314,34 +311,35 @@ raidz_math_kstat_data(char *buf, size_t size, void *data) ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN); if (cstat == fstat) { - off += snprintf(buf + off, size - off, "%-17s", "fastest"); + off += kmem_scnprintf(buf + off, size - off, "%-17s", + "fastest"); for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) { int id = fstat->gen[i]; - off += snprintf(buf + off, size - off, "%-16s", + off += kmem_scnprintf(buf + off, size - off, "%-16s", raidz_supp_impl[id]->name); } for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) { int id = fstat->rec[i]; - off += snprintf(buf + off, size - off, "%-16s", + off += kmem_scnprintf(buf + off, size - off, "%-16s", raidz_supp_impl[id]->name); } } else { ptrdiff_t id = cstat - raidz_impl_kstats; - off += snprintf(buf + off, size - off, "%-17s", + off += kmem_scnprintf(buf + off, size - off, "%-17s", raidz_supp_impl[id]->name); for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) - off += snprintf(buf + off, size - off, "%-16llu", + off += kmem_scnprintf(buf + off, size - off, "%-16llu", (u_longlong_t)cstat->gen[i]); for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) - off += snprintf(buf + off, size - off, "%-16llu", + off += kmem_scnprintf(buf + off, size - off, "%-16llu", (u_longlong_t)cstat->rec[i]); } - (void) snprintf(buf + off, size - off, "\n"); + (void) kmem_scnprintf(buf + off, size - off, "\n"); return (0); } @@ -566,7 +564,7 @@ vdev_raidz_math_fini(void) } static const struct { - char *name; + const char *name; uint32_t sel; } math_impl_opts[] = { { "cycle", IMPL_CYCLE }, @@ -655,13 +653,15 @@ zfs_vdev_raidz_impl_get(char 
*buffer, zfs_kernel_param_t *kp) /* list mandatory options */ for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) { fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s "; - cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name); + cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, + math_impl_opts[i].name); } /* list all supported implementations */ for (i = 0; i < raidz_supp_impl_cnt; i++) { fmt = (i == impl) ? "[%s] " : "%s "; - cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name); + cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, + raidz_supp_impl[i]->name); } return (cnt); diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c index 0a67ceb84920..4aa7bc2b9708 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h index e46b2536546c..f0f6546f7f71 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c index e072f51cd635..bd9de91a4ba8 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -210,9 +210,13 @@ DEFINE_GEN_METHODS(aarch64_neonx2); * If compiled with -O0, gcc doesn't do any stack frame coalescing * and -Wframe-larger-than=1024 is triggered in debug mode. */ +#if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic ignored "-Wframe-larger-than=" +#endif DEFINE_REC_METHODS(aarch64_neonx2); +#if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic pop +#endif static boolean_t raidz_will_aarch64_neonx2_work(void) diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c index 65e4bebce8fa..e5bbc7decbfa 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c index f06b469023eb..3b709ed34fc4 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c index aab653b77491..5ec71a04133a 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h index 35e016fc65a5..5d77c5d046d5 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -214,9 +214,10 @@ raidz_copy_abd_cb(void *dc, void *sc, size_t size, void *private) } -#define raidz_copy(dabd, sabd, size) \ +#define raidz_copy(dabd, sabd, off, size) \ { \ - abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_copy_abd_cb, NULL);\ + abd_iterate_func2(dabd, sabd, off, off, size, raidz_copy_abd_cb, \ + NULL); \ } /* @@ -254,9 +255,10 @@ raidz_add_abd_cb(void *dc, void *sc, size_t size, void *private) return (0); } -#define raidz_add(dabd, sabd, size) \ +#define raidz_add(dabd, sabd, off, size) \ { \ - abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_add_abd_cb, NULL);\ + abd_iterate_func2(dabd, sabd, off, off, size, raidz_add_abd_cb, \ + NULL); \ } /* @@ -343,7 +345,10 @@ raidz_mul_abd_cb(void *dc, size_t size, void *private) * the parity/syndrome if data column is shorter. * * P parity is calculated using raidz_add_abd(). + * + * For CPU L2 cache blocking we process 64KB at a time. 
*/ +#define CHUNK 65536 /* * Generate P parity (RAIDZ1) @@ -357,20 +362,26 @@ raidz_generate_p_impl(raidz_row_t * const rr) const size_t ncols = rr->rr_cols; const size_t psize = rr->rr_col[CODE_P].rc_size; abd_t *pabd = rr->rr_col[CODE_P].rc_abd; - size_t size; - abd_t *dabd; + size_t off, size; raidz_math_begin(); - /* start with first data column */ - raidz_copy(pabd, rr->rr_col[1].rc_abd, psize); + for (off = 0; off < psize; off += CHUNK) { + + /* start with first data column */ + size = MIN(CHUNK, psize - off); + raidz_copy(pabd, rr->rr_col[1].rc_abd, off, size); - for (c = 2; c < ncols; c++) { - dabd = rr->rr_col[c].rc_abd; - size = rr->rr_col[c].rc_size; + for (c = 2; c < ncols; c++) { + size = rr->rr_col[c].rc_size; + if (size <= off) + continue; - /* add data column */ - raidz_add(pabd, dabd, size); + /* add data column */ + size = MIN(CHUNK, size - off); + abd_t *dabd = rr->rr_col[c].rc_abd; + raidz_add(pabd, dabd, off, size); + } } raidz_math_end(); @@ -423,7 +434,7 @@ raidz_generate_pq_impl(raidz_row_t * const rr) size_t c; const size_t ncols = rr->rr_cols; const size_t csize = rr->rr_col[CODE_P].rc_size; - size_t dsize; + size_t off, size, dsize; abd_t *dabd; abd_t *cabds[] = { rr->rr_col[CODE_P].rc_abd, @@ -432,15 +443,20 @@ raidz_generate_pq_impl(raidz_row_t * const rr) raidz_math_begin(); - raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, csize); - raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, csize); + for (off = 0; off < csize; off += CHUNK) { - for (c = 3; c < ncols; c++) { - dabd = rr->rr_col[c].rc_abd; - dsize = rr->rr_col[c].rc_size; + size = MIN(CHUNK, csize - off); + raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, off, size); + raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, off, size); - abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2, - raidz_gen_pq_add); + for (c = 3; c < ncols; c++) { + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; + dsize = (dsize > off) ? 
MIN(CHUNK, dsize - off) : 0; + + abd_raidz_gen_iterate(cabds, dabd, off, size, dsize, 2, + raidz_gen_pq_add); + } } raidz_math_end(); @@ -460,8 +476,8 @@ static void raidz_gen_pqr_add(void **c, const void *dc, const size_t csize, const size_t dsize) { - v_t *p = (v_t *)c[0]; - v_t *q = (v_t *)c[1]; + v_t *p = (v_t *)c[CODE_P]; + v_t *q = (v_t *)c[CODE_Q]; v_t *r = (v_t *)c[CODE_R]; const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); @@ -486,7 +502,7 @@ raidz_gen_pqr_add(void **c, const void *dc, const size_t csize, /* - * Generate PQR parity (RAIDZ2) + * Generate PQR parity (RAIDZ3) * * @rr RAIDZ row */ @@ -496,7 +512,7 @@ raidz_generate_pqr_impl(raidz_row_t * const rr) size_t c; const size_t ncols = rr->rr_cols; const size_t csize = rr->rr_col[CODE_P].rc_size; - size_t dsize; + size_t off, size, dsize; abd_t *dabd; abd_t *cabds[] = { rr->rr_col[CODE_P].rc_abd, @@ -506,16 +522,21 @@ raidz_generate_pqr_impl(raidz_row_t * const rr) raidz_math_begin(); - raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, csize); - raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, csize); - raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, csize); + for (off = 0; off < csize; off += CHUNK) { - for (c = 4; c < ncols; c++) { - dabd = rr->rr_col[c].rc_abd; - dsize = rr->rr_col[c].rc_size; + size = MIN(CHUNK, csize - off); + raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, off, size); + raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, off, size); + raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, off, size); - abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3, - raidz_gen_pqr_add); + for (c = 4; c < ncols; c++) { + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; + dsize = (dsize > off) ? MIN(CHUNK, dsize - off) : 0; + + abd_raidz_gen_iterate(cabds, dabd, off, size, dsize, 3, + raidz_gen_pqr_add); + } } raidz_math_end(); @@ -592,26 +613,31 @@ raidz_reconstruct_p_impl(raidz_row_t *rr, const int *tgtidx) const size_t x = tgtidx[TARGET_X]; const size_t xsize = rr->rr_col[x].rc_size; abd_t *xabd = rr->rr_col[x].rc_abd; - size_t size; - abd_t *dabd; + size_t off, size; if (xabd == NULL) return (1 << CODE_P); raidz_math_begin(); - /* copy P into target */ - raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, xsize); + for (off = 0; off < xsize; off += CHUNK) { - /* generate p_syndrome */ - for (c = firstdc; c < ncols; c++) { - if (c == x) - continue; + /* copy P into target */ + size = MIN(CHUNK, xsize - off); + raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, off, size); - dabd = rr->rr_col[c].rc_abd; - size = MIN(rr->rr_col[c].rc_size, xsize); + /* generate p_syndrome */ + for (c = firstdc; c < ncols; c++) { + if (c == x) + continue; + size = rr->rr_col[c].rc_size; + if (size <= off) + continue; - raidz_add(xabd, dabd, size); + size = MIN(CHUNK, MIN(size, xsize) - off); + abd_t *dabd = rr->rr_col[c].rc_abd; + raidz_add(xabd, dabd, off, size); + } } raidz_math_end(); @@ -683,7 +709,7 @@ raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize); } else { raidz_zero(xabd, xsize); } @@ -698,12 +724,12 @@ raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx) dsize = rr->rr_col[c].rc_size; } - abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, + abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 1, raidz_syn_q_abd); } /* add Q to the syndrome */ - raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, xsize); + 
raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, 0, xsize); /* transform the syndrome */ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff); @@ -777,7 +803,7 @@ raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize); } else { raidz_zero(xabd, xsize); } @@ -793,12 +819,12 @@ raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx) dsize = rr->rr_col[c].rc_size; } - abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, + abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 1, raidz_syn_r_abd); } /* add R to the syndrome */ - raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, xsize); + raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, 0, xsize); /* transform the syndrome */ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff); @@ -934,8 +960,8 @@ raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -951,7 +977,7 @@ raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx) dsize = rr->rr_col[c].rc_size; } - abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 2, raidz_syn_pq_abd); } @@ -959,7 +985,7 @@ raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx) /* Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize); raidz_math_end(); @@ -1094,8 +1120,8 @@ raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -1111,7 +1137,7 @@ raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx) dsize = rr->rr_col[c].rc_size; } - abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 2, raidz_syn_pr_abd); } @@ -1121,7 +1147,7 @@ raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx) * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize); raidz_math_end(); @@ -1261,8 +1287,8 @@ raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -1278,7 +1304,7 @@ raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx) dsize = rr->rr_col[c].rc_size; } - abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 2, raidz_syn_qr_abd); } @@ -1288,7 +1314,7 @@ 
raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx) * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize); raidz_math_end(); @@ -1456,9 +1482,9 @@ raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); - raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize); + raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, 0, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -1475,7 +1501,7 @@ raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx) dsize = rr->rr_col[c].rc_size; } - abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3, + abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 3, raidz_syn_pqr_abd); } @@ -1485,9 +1511,9 @@ raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx) * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize); if (zsize < xsize) - raidz_copy(rr->rr_col[z].rc_abd, zabd, zsize); + raidz_copy(rr->rr_col[z].rc_abd, zabd, 0, zsize); raidz_math_end(); diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c index 1db2c4cd3a47..ff493b8b7bc0 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h index 3842f5fd637c..f76eb47a9c66 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -26,10 +26,6 @@ #include <sys/types.h> #include <sys/simd.h> -#ifdef __linux__ -#define __asm __asm__ __volatile__ -#endif - #define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N #define REG_CNT(r...) 
_REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1) @@ -142,7 +138,7 @@ typedef struct v { { \ switch (REG_CNT(r)) { \ case 8: \ - __asm( \ + __asm__ __volatile__( \ "lvx 21,0,%[SRC0]\n" \ "lvx 20,0,%[SRC1]\n" \ "lvx 19,0,%[SRC2]\n" \ @@ -172,7 +168,7 @@ typedef struct v { : "v18", "v19", "v20", "v21"); \ break; \ case 4: \ - __asm( \ + __asm__ __volatile__( \ "lvx 21,0,%[SRC0]\n" \ "lvx 20,0,%[SRC1]\n" \ "lvx 19,0,%[SRC2]\n" \ @@ -189,7 +185,7 @@ typedef struct v { : "v18", "v19", "v20", "v21"); \ break; \ case 2: \ - __asm( \ + __asm__ __volatile__( \ "lvx 21,0,%[SRC0]\n" \ "lvx 20,0,%[SRC1]\n" \ "vxor " VR0(r) "," VR0(r) ",21\n" \ @@ -208,7 +204,7 @@ typedef struct v { { \ switch (REG_CNT(r)) { \ case 8: \ - __asm( \ + __asm__ __volatile__( \ "vxor " VR4(r) "," VR4(r) "," VR0(r) "\n" \ "vxor " VR5(r) "," VR5(r) "," VR1(r) "\n" \ "vxor " VR6(r) "," VR6(r) "," VR2(r) "\n" \ @@ -217,7 +213,7 @@ typedef struct v { : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \ break; \ case 4: \ - __asm( \ + __asm__ __volatile__( \ "vxor " VR2(r) "," VR2(r) "," VR0(r) "\n" \ "vxor " VR3(r) "," VR3(r) "," VR1(r) "\n" \ : UVR2(r), UVR3(r) \ @@ -232,7 +228,7 @@ typedef struct v { { \ switch (REG_CNT(r)) { \ case 8: \ - __asm( \ + __asm__ __volatile__( \ "vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \ "vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \ "vxor " VR2(r) "," VR2(r) "," VR2(r) "\n" \ @@ -245,7 +241,7 @@ typedef struct v { WVR4(r), WVR5(r), WVR6(r), WVR7(r)); \ break; \ case 4: \ - __asm( \ + __asm__ __volatile__( \ "vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \ "vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \ "vxor " VR2(r) "," VR2(r) "," VR2(r) "\n" \ @@ -253,7 +249,7 @@ typedef struct v { : WVR0(r), WVR1(r), WVR2(r), WVR3(r)); \ break; \ case 2: \ - __asm( \ + __asm__ __volatile__( \ "vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \ "vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \ : WVR0(r), WVR1(r)); \ @@ -267,7 +263,7 @@ typedef struct v { { \ switch (REG_CNT(r)) { \ case 8: \ - __asm( \ + __asm__ __volatile__( \ "vor " VR4(r) "," VR0(r) "," VR0(r) "\n" \ "vor " VR5(r) "," VR1(r) "," VR1(r) "\n" \ "vor " VR6(r) "," VR2(r) "," VR2(r) "\n" \ @@ -276,7 +272,7 @@ typedef struct v { : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \ break; \ case 4: \ - __asm( \ + __asm__ __volatile__( \ "vor " VR2(r) "," VR0(r) "," VR0(r) "\n" \ "vor " VR3(r) "," VR1(r) "," VR1(r) "\n" \ : WVR2(r), WVR3(r) \ @@ -291,7 +287,7 @@ typedef struct v { { \ switch (REG_CNT(r)) { \ case 8: \ - __asm( \ + __asm__ __volatile__( \ "lvx " VR0(r) " ,0,%[SRC0]\n" \ "lvx " VR1(r) " ,0,%[SRC1]\n" \ "lvx " VR2(r) " ,0,%[SRC2]\n" \ @@ -312,7 +308,7 @@ typedef struct v { [SRC7] "r" ((OFFSET(src, 112)))); \ break; \ case 4: \ - __asm( \ + __asm__ __volatile__( \ "lvx " VR0(r) " ,0,%[SRC0]\n" \ "lvx " VR1(r) " ,0,%[SRC1]\n" \ "lvx " VR2(r) " ,0,%[SRC2]\n" \ @@ -324,7 +320,7 @@ typedef struct v { [SRC3] "r" ((OFFSET(src, 48)))); \ break; \ case 2: \ - __asm( \ + __asm__ __volatile__( \ "lvx " VR0(r) " ,0,%[SRC0]\n" \ "lvx " VR1(r) " ,0,%[SRC1]\n" \ : WVR0(r), WVR1(r) \ @@ -340,7 +336,7 @@ typedef struct v { { \ switch (REG_CNT(r)) { \ case 8: \ - __asm( \ + __asm__ __volatile__( \ "stvx " VR0(r) " ,0,%[DST0]\n" \ "stvx " VR1(r) " ,0,%[DST1]\n" \ "stvx " VR2(r) " ,0,%[DST2]\n" \ @@ -362,7 +358,7 @@ typedef struct v { : "memory"); \ break; \ case 4: \ - __asm( \ + __asm__ __volatile__( \ "stvx " VR0(r) " ,0,%[DST0]\n" \ "stvx " VR1(r) " ,0,%[DST1]\n" \ "stvx " VR2(r) " ,0,%[DST2]\n" \ @@ -375,7 +371,7 @@ typedef struct v { : "memory"); \ break; \ case 2: \ - __asm( \ + __asm__ __volatile__( \ "stvx " 
VR0(r) " ,0,%[DST0]\n" \ "stvx " VR1(r) " ,0,%[DST1]\n" \ : : [DST0] "r" ((OFFSET(dst, 0))), \ @@ -400,7 +396,7 @@ typedef struct v { #define MUL2_SETUP() \ { \ - __asm( \ + __asm__ __volatile__( \ "vspltisb " VR(16) ",14\n" \ "vspltisb " VR(17) ",15\n" \ "vaddubm " VR(16) "," VR(17) "," VR(16) "\n" \ @@ -412,7 +408,7 @@ typedef struct v { { \ switch (REG_CNT(r)) { \ case 4: \ - __asm( \ + __asm__ __volatile__( \ "vcmpgtsb 19," VR(17) "," VR0(r) "\n" \ "vcmpgtsb 18," VR(17) "," VR1(r) "\n" \ "vcmpgtsb 21," VR(17) "," VR2(r) "\n" \ @@ -434,7 +430,7 @@ typedef struct v { : "v18", "v19", "v20", "v21"); \ break; \ case 2: \ - __asm( \ + __asm__ __volatile__( \ "vcmpgtsb 19," VR(17) "," VR0(r) "\n" \ "vcmpgtsb 18," VR(17) "," VR1(r) "\n" \ "vand 19,19," VR(16) "\n" \ @@ -478,7 +474,7 @@ typedef struct v { { \ switch (REG_CNT(r)) { \ case 2: \ - __asm( \ + __asm__ __volatile__( \ /* lts for upper part */ \ "vspltisb 15,15\n" \ "lvx 10,0,%[lt0]\n" \ diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c index 9e9c15ff4ba2..b51352b4e90b 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -142,7 +142,7 @@ static const struct { a.b[6] = mul_lt[a.b[6]]; \ a.b[5] = mul_lt[a.b[5]]; \ a.b[4] = mul_lt[a.b[4]]; \ - fallthrough; \ + zfs_fallthrough; \ case 4: \ a.b[3] = mul_lt[a.b[3]]; \ a.b[2] = mul_lt[a.b[2]]; \ diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c index 56a0b123d952..02b5d6a609ab 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c index 5ddc079a4f5d..244f137b3d09 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c index 4d7de0c6c44c..8a8b02cab5c6 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c +++ b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -22,6 +22,8 @@ * * Copyright (c) 2018, Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. + * Copyright (c) 2024 by Delphix. All rights reserved. */ #include <sys/vdev_impl.h> @@ -33,6 +35,7 @@ #include <sys/zio.h> #include <sys/dmu_tx.h> #include <sys/arc.h> +#include <sys/arc_impl.h> #include <sys/zap.h> /* @@ -103,7 +106,7 @@ * Size of rebuild reads; defaults to 1MiB per data disk and is capped at * SPA_MAXBLOCKSIZE. */ -unsigned long zfs_rebuild_max_segment = 1024 * 1024; +static uint64_t zfs_rebuild_max_segment = 1024 * 1024; /* * Maximum number of parallelly executed bytes per leaf vdev caused by a @@ -115,25 +118,25 @@ unsigned long zfs_rebuild_max_segment = 1024 * 1024; * segment size is also large (zfs_rebuild_max_segment=1M). This helps keep * the queue depth short. * - * 32MB was selected as the default value to achieve good performance with - * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential - * rebuild was unable to saturate all of the drives using smaller values. - * With a value of 32MB the sequential resilver write rate was measured at - * 800MB/s sustained while rebuilding to a distributed spare. + * 64MB was observed to deliver the best performance and set as the default. + * Testing was performed with a 106-drive dRAID HDD pool (draid2:11d:106c) + * and a rebuild rate of 1.2GB/s was measured to the distribute spare. + * Smaller values were unable to fully saturate the available pool I/O. */ -unsigned long zfs_rebuild_vdev_limit = 32 << 20; +static uint64_t zfs_rebuild_vdev_limit = 64 << 20; /* * Automatically start a pool scrub when the last active sequential resilver * completes in order to verify the checksums of all blocks which have been * resilvered. This option is enabled by default and is strongly recommended. */ -int zfs_rebuild_scrub_enabled = 1; +static int zfs_rebuild_scrub_enabled = 1; /* * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync(). */ -static void vdev_rebuild_thread(void *arg); +static __attribute__((noreturn)) void vdev_rebuild_thread(void *arg); +static void vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx); /* * Clear the per-vdev rebuild bytes value for a vdev tree. @@ -227,7 +230,7 @@ vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx) spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); mutex_enter(&vd->vdev_rebuild_lock); - bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE; vrp->vrp_min_txg = 0; vrp->vrp_max_txg = dmu_tx_get_txg(tx); @@ -260,7 +263,7 @@ vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx) } static void -vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, char *name) +vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, const char *name) { nvlist_t *aux = fnvlist_alloc(); @@ -307,6 +310,17 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); + + /* + * Handle a second device failure if it occurs after all rebuild I/O + * has completed but before this sync task has been executed. 
+ */ + if (vd->vdev_rebuild_reset_wanted) { + mutex_exit(&vd->vdev_rebuild_lock); + vdev_rebuild_reset_sync(arg, tx); + return; + } + vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE; vrp->vrp_end_time = gethrestime_sec(); @@ -448,7 +462,7 @@ vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx) } clear_rebuild_bytes(vd); - bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) { @@ -558,8 +572,10 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) vdev_rebuild_blkptr_init(&blk, vd, start, size); uint64_t psize = BP_GET_PSIZE(&blk); - if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) + if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) { + vr->vr_pass_bytes_skipped += size; return (0); + } mutex_enter(&vr->vr_io_lock); @@ -701,7 +717,7 @@ vdev_rebuild_load(vdev_t *vd) vd->vdev_rebuilding = B_FALSE; if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) { - bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); mutex_exit(&vd->vdev_rebuild_lock); return (SET_ERROR(ENOTSUP)); } @@ -718,7 +734,7 @@ vdev_rebuild_load(vdev_t *vd) * status allowing a new resilver/rebuild to be started. */ if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) { - bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); } else if (err) { mutex_exit(&vd->vdev_rebuild_lock); return (err); @@ -736,11 +752,12 @@ vdev_rebuild_load(vdev_t *vd) * Each scan thread is responsible for rebuilding a top-level vdev. The * rebuild progress in tracked on-disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS. */ -static void +static __attribute__((noreturn)) void vdev_rebuild_thread(void *arg) { vdev_t *vd = arg; spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; int error = 0; /* @@ -760,7 +777,6 @@ vdev_rebuild_thread(void *arg) ASSERT(vd->vdev_rebuilding); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD)); ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE); - ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; @@ -773,9 +789,7 @@ vdev_rebuild_thread(void *arg) vr->vr_pass_start_time = gethrtime(); vr->vr_pass_bytes_scanned = 0; vr->vr_pass_bytes_issued = 0; - - vr->vr_bytes_inflight_max = MAX(1ULL << 20, - zfs_rebuild_vdev_limit * vd->vdev_children); + vr->vr_pass_bytes_skipped = 0; uint64_t update_est_time = gethrtime(); vdev_rebuild_update_bytes_est(vd, 0); @@ -793,6 +807,17 @@ vdev_rebuild_thread(void *arg) vr->vr_scan_msp = msp; /* + * Calculate the max number of in-flight bytes for top-level + * vdev scanning operations (minimum 1MB, maximum 1/2 of + * arc_c_max shared by all top-level vdevs). Limits for the + * issuing phase are done per top-level vdev and are handled + * separately. + */ + uint64_t limit = (arc_c_max / 2) / MAX(rvd->vdev_children, 1); + vr->vr_bytes_inflight_max = MIN(limit, MAX(1ULL << 20, + zfs_rebuild_vdev_limit * vd->vdev_children)); + + /* * Removal of vdevs from the vdev tree may eliminate the need * for the rebuild, in which case it should be canceled. 
The * vdev_rebuild_cancel_wanted flag is set until the sync task @@ -1047,7 +1072,8 @@ vdev_rebuild_restart_impl(vdev_t *vd) void vdev_rebuild_restart(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); vdev_rebuild_restart_impl(spa->spa_root_vdev); } @@ -1061,7 +1087,8 @@ vdev_rebuild_stop_wait(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); if (vd == spa->spa_root_vdev) { for (uint64_t i = 0; i < vd->vdev_children; i++) @@ -1111,7 +1138,7 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS); if (error == ENOENT) { - bzero(vrs, sizeof (vdev_rebuild_stat_t)); + memset(vrs, 0, sizeof (vdev_rebuild_stat_t)); vrs->vrs_state = VDEV_REBUILD_NONE; error = 0; } else if (error == 0) { @@ -1132,19 +1159,18 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) vr->vr_pass_start_time); vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned; vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued; + vrs->vrs_pass_bytes_skipped = vr->vr_pass_bytes_skipped; mutex_exit(&tvd->vdev_rebuild_lock); } return (error); } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, U64, ZMOD_RW, "Max segment size in bytes of rebuild reads"); -ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, U64, ZMOD_RW, "Max bytes in flight per leaf vdev for sequential resilvers"); ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW, "Automatically scrub after sequential resilver completes"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c index f762c1df96aa..1249657f9d72 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_removal.c +++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -94,7 +94,7 @@ typedef struct vdev_copy_arg { * doing a device removal. This determines how much i/o we can have * in flight concurrently. */ -int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; +static const uint_t zfs_remove_max_copy_bytes = 64 * 1024 * 1024; /* * The largest contiguous segment that we will attempt to allocate when @@ -104,7 +104,7 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; * * See also the accessor function spa_remove_max_segment(). */ -int zfs_remove_max_segment = SPA_MAXBLOCKSIZE; +uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE; /* * Ignore hard IO errors during device removal. When set if a device @@ -112,7 +112,7 @@ int zfs_remove_max_segment = SPA_MAXBLOCKSIZE; * not be cancelled. This can result in a normally recoverable block * becoming permanently damaged and is not recommended. */ -int zfs_removal_ignore_errors = 0; +static int zfs_removal_ignore_errors = 0; /* * Allow a remap segment to span free chunks of at most this size. 
The main @@ -130,7 +130,7 @@ int zfs_removal_ignore_errors = 0; * - we'll do larger allocations, which may fail and fall back on smaller * allocations */ -int vdev_removal_max_span = 32 * 1024; +uint_t vdev_removal_max_span = 32 * 1024; /* * This is used by the test suite so that it can ensure that certain @@ -140,7 +140,7 @@ int zfs_removal_suspend_progress = 0; #define VDEV_REMOVAL_ZAP_OBJS "lzap" -static void spa_vdev_remove_thread(void *arg); +static __attribute__((noreturn)) void spa_vdev_remove_thread(void *arg); static int spa_vdev_remove_cancel_impl(spa_t *spa); static void @@ -168,8 +168,178 @@ spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) } static void -spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, - nvlist_t *dev_to_remove) +vdev_activate(vdev_t *vd) +{ + metaslab_group_t *mg = vd->vdev_mg; + spa_t *spa = vd->vdev_spa; + uint64_t vdev_space = spa_deflate(spa) ? + vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + + ASSERT(!vd->vdev_islog); + ASSERT(vd->vdev_noalloc); + + metaslab_group_activate(mg); + metaslab_group_activate(vd->vdev_log_mg); + + ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space); + + spa->spa_nonallocating_dspace -= vdev_space; + + vd->vdev_noalloc = B_FALSE; +} + +static int +vdev_passivate(vdev_t *vd, uint64_t *txg) +{ + spa_t *spa = vd->vdev_spa; + int error; + + ASSERT(!vd->vdev_noalloc); + + vdev_t *rvd = spa->spa_root_vdev; + metaslab_group_t *mg = vd->vdev_mg; + metaslab_class_t *normal = spa_normal_class(spa); + if (mg->mg_class == normal) { + /* + * We must check that this is not the only allocating device in + * the pool before passivating, otherwise we will not be able + * to make progress because we can't allocate from any vdevs. + */ + boolean_t last = B_TRUE; + for (uint64_t id = 0; id < rvd->vdev_children; id++) { + vdev_t *cvd = rvd->vdev_child[id]; + + if (cvd == vd || + cvd->vdev_ops == &vdev_indirect_ops) + continue; + + metaslab_class_t *mc = cvd->vdev_mg->mg_class; + if (mc != normal) + continue; + + if (!cvd->vdev_noalloc) { + last = B_FALSE; + break; + } + } + if (last) + return (SET_ERROR(EINVAL)); + } + + metaslab_group_passivate(mg); + ASSERT(!vd->vdev_islog); + metaslab_group_passivate(vd->vdev_log_mg); + + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + + /* + * We must ensure that no "stubby" log blocks are allocated + * on the device to be removed. These blocks could be + * written at any time, including while we are in the middle + * of copying them. + */ + error = spa_reset_logs(spa); + + *txg = spa_vdev_config_enter(spa); + + if (error != 0) { + metaslab_group_activate(mg); + ASSERT(!vd->vdev_islog); + if (vd->vdev_log_mg != NULL) + metaslab_group_activate(vd->vdev_log_mg); + return (error); + } + + spa->spa_nonallocating_dspace += spa_deflate(spa) ? + vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + vd->vdev_noalloc = B_TRUE; + + return (0); +} + +/* + * Turn off allocations for a top-level device from the pool. + * + * Turning off allocations for a top-level device can take a significant + * amount of time. As a result we use the spa_vdev_config_[enter/exit] + * functions which allow us to grab and release the spa_config_lock while + * still holding the namespace lock. During each step the configuration + * is synced out. 
+ */ +int +spa_vdev_noalloc(spa_t *spa, uint64_t guid) +{ + vdev_t *vd; + uint64_t txg; + int error = 0; + + ASSERT(!MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_writeable(spa)); + + txg = spa_vdev_enter(spa); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + vd = spa_lookup_by_guid(spa, guid, B_FALSE); + + if (vd == NULL) + error = SET_ERROR(ENOENT); + else if (vd->vdev_mg == NULL) + error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP); + else if (!vd->vdev_noalloc) + error = vdev_passivate(vd, &txg); + + if (error == 0) { + vdev_dirty_leaves(vd, VDD_DTL, txg); + vdev_config_dirty(vd); + } + + error = spa_vdev_exit(spa, NULL, txg, error); + + return (error); +} + +int +spa_vdev_alloc(spa_t *spa, uint64_t guid) +{ + vdev_t *vd; + uint64_t txg; + int error = 0; + + ASSERT(!MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_writeable(spa)); + + txg = spa_vdev_enter(spa); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + vd = spa_lookup_by_guid(spa, guid, B_FALSE); + + if (vd == NULL) + error = SET_ERROR(ENOENT); + else if (vd->vdev_mg == NULL) + error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP); + else if (!vd->vdev_removing) + vdev_activate(vd); + + if (error == 0) { + vdev_dirty_leaves(vd, VDD_DTL, txg); + vdev_config_dirty(vd); + } + + (void) spa_vdev_exit(spa, NULL, txg, error); + + return (error); +} + +static void +spa_vdev_remove_aux(nvlist_t *config, const char *name, nvlist_t **dev, + int count, nvlist_t *dev_to_remove) { nvlist_t **newdev = NULL; @@ -183,7 +353,8 @@ spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, } VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); - VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); + fnvlist_add_nvlist_array(config, name, (const nvlist_t * const *)newdev, + count - 1); for (int i = 0; i < count - 1; i++) nvlist_free(newdev[i]); @@ -997,11 +1168,11 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, metaslab_class_t *mc = mg->mg_class; if (mc->mc_groups == 0) mc = spa_normal_class(spa); - int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0, - zal, 0); + int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, + METASLAB_DONT_THROTTLE, zal, 0); if (error == ENOSPC && mc != spa_normal_class(spa)) { error = metaslab_alloc_dva(spa, spa_normal_class(spa), size, - &dst, 0, NULL, txg, 0, zal, 0); + &dst, 0, NULL, txg, METASLAB_DONT_THROTTLE, zal, 0); } if (error != 0) return (error); @@ -1193,6 +1364,10 @@ vdev_remove_complete(spa_t *spa) ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); + vdev_rebuild_stop_wait(vd); + ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); + uint64_t vdev_space = spa_deflate(spa) ? + vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; sysevent_t *ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); @@ -1200,6 +1375,12 @@ vdev_remove_complete(spa_t *spa) zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)txg); + ASSERT3U(0, !=, vdev_space); + ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space); + + /* the vdev is no longer part of the dspace */ + spa->spa_nonallocating_dspace -= vdev_space; + /* * Discard allocation state. 
*/ @@ -1207,7 +1388,6 @@ vdev_remove_complete(spa_t *spa) vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; - spa_log_sm_set_blocklimit(spa); } if (vd->vdev_log_mg != NULL) { ASSERT0(vd->vdev_ms_count); @@ -1410,7 +1590,7 @@ spa_remove_max_segment(spa_t *spa) * TXG have completed (see spa_txg_zio) and writes the new mappings to disk * (see vdev_mapping_sync()). */ -static void +static __attribute__((noreturn)) void spa_vdev_remove_thread(void *arg) { spa_t *spa = arg; @@ -1619,10 +1799,32 @@ spa_vdev_remove_suspend(spa_t *spa) mutex_exit(&svr->svr_lock); } -/* ARGSUSED */ +/* + * Return true if the "allocating" property has been set to "off" + */ +static boolean_t +vdev_prop_allocating_off(vdev_t *vd) +{ + uint64_t objid = vd->vdev_top_zap; + uint64_t allocating = 1; + + /* no vdev property object => no props */ + if (objid != 0) { + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + + mutex_enter(&spa->spa_props_lock); + (void) zap_lookup(mos, objid, "allocating", sizeof (uint64_t), + 1, &allocating); + mutex_exit(&spa->spa_props_lock); + } + return (allocating == 0); +} + static int spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx) { + (void) arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; if (spa->spa_vdev_removal == NULL) @@ -1634,10 +1836,10 @@ spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx) * Cancel a removal by freeing all entries from the partial mapping * and marking the vdev as no longer being removing. */ -/* ARGSUSED */ static void spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) { + (void) arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); @@ -1761,6 +1963,13 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) spa_finish_removal(spa, DSS_CANCELED, tx); vd->vdev_removing = B_FALSE; + + if (!vdev_prop_allocating_off(vd)) { + spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); + vdev_activate(vd); + spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); + } + vdev_config_dirty(vd); zfs_dbgmsg("canceled device removal for vdev %llu in %llu", @@ -1774,21 +1983,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) static int spa_vdev_remove_cancel_impl(spa_t *spa) { - uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id; - int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED); - - if (error == 0) { - spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); - vdev_t *vd = vdev_lookup_top(spa, vdid); - metaslab_group_activate(vd->vdev_mg); - ASSERT(!vd->vdev_islog); - metaslab_group_activate(vd->vdev_log_mg); - spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); - } - return (error); } @@ -1935,7 +2132,6 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) * metaslab_class_histogram_verify() */ vdev_metaslab_fini(vd); - spa_log_sm_set_blocklimit(spa); spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); *txg = spa_vdev_config_enter(spa); @@ -1984,6 +2180,11 @@ spa_vdev_remove_top_check(vdev_t *vd) if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) return (SET_ERROR(ENOTSUP)); + /* + * This device is already being removed + */ + if (vd->vdev_removing) + return (SET_ERROR(EALREADY)); metaslab_class_t *mc = vd->vdev_mg->mg_class; metaslab_class_t *normal = spa_normal_class(spa); @@ -2002,20 +2203,12 @@ spa_vdev_remove_top_check(vdev_t *vd) ASSERT3U(available, >=, vd->vdev_stat.vs_alloc); if (available < vd->vdev_stat.vs_alloc) return 
(SET_ERROR(ENOSPC)); - } else { + } else if (!vd->vdev_noalloc) { /* available space in the pool's normal class */ uint64_t available = dsl_dir_space_available( spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE); - if (available < - vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { - /* - * This is a normal device. There has to be enough free - * space to remove the device and leave double the - * "slop" space (i.e. we must leave at least 3% of the - * pool free, in addition to the normal slop space). - */ + if (available < vd->vdev_stat.vs_dspace) return (SET_ERROR(ENOSPC)); - } } /* @@ -2058,7 +2251,6 @@ spa_vdev_remove_top_check(vdev_t *vd) * and not be raidz or draid. */ vdev_t *rvd = spa->spa_root_vdev; - int num_indirect = 0; for (uint64_t id = 0; id < rvd->vdev_children; id++) { vdev_t *cvd = rvd->vdev_child[id]; @@ -2074,8 +2266,6 @@ spa_vdev_remove_top_check(vdev_t *vd) if (cvd->vdev_ashift != 0 && cvd->vdev_alloc_bias == VDEV_BIAS_NONE) ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift); - if (cvd->vdev_ops == &vdev_indirect_ops) - num_indirect++; if (!vdev_is_concrete(cvd)) continue; if (vdev_get_nparity(cvd) != 0) @@ -2108,6 +2298,7 @@ static int spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) { spa_t *spa = vd->vdev_spa; + boolean_t set_noalloc = B_FALSE; int error; /* @@ -2116,8 +2307,6 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) * are errors. */ error = spa_vdev_remove_top_check(vd); - if (error != 0) - return (error); /* * Stop allocating from this vdev. Note that we must check @@ -2127,31 +2316,22 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) * The above check for sufficient free space serves this * purpose. */ - metaslab_group_t *mg = vd->vdev_mg; - metaslab_group_passivate(mg); - ASSERT(!vd->vdev_islog); - metaslab_group_passivate(vd->vdev_log_mg); - - /* - * Wait for the youngest allocations and frees to sync, - * and then wait for the deferral of those frees to finish. - */ - spa_vdev_config_exit(spa, NULL, - *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + if (error == 0 && !vd->vdev_noalloc) { + set_noalloc = B_TRUE; + error = vdev_passivate(vd, txg); + } - /* - * We must ensure that no "stubby" log blocks are allocated - * on the device to be removed. These blocks could be - * written at any time, including while we are in the middle - * of copying them. - */ - error = spa_reset_logs(spa); + if (error != 0) + return (error); /* * We stop any initializing and TRIM that is currently in progress * but leave the state as "active". This will allow the process to * resume if the removal is canceled sometime later. */ + + spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); + vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_wait(vd); @@ -2162,13 +2342,11 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) * Things might have changed while the config lock was dropped * (e.g. space usage). Check for errors again. 
*/ - if (error == 0) - error = spa_vdev_remove_top_check(vd); + error = spa_vdev_remove_top_check(vd); if (error != 0) { - metaslab_group_activate(mg); - ASSERT(!vd->vdev_islog); - metaslab_group_activate(vd->vdev_log_mg); + if (set_noalloc) + vdev_activate(vd); spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); @@ -2206,7 +2384,8 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) int error = 0, error_log; boolean_t locked = MUTEX_HELD(&spa_namespace_lock); sysevent_t *ev = NULL; - char *vd_type = NULL, *vd_path = NULL; + const char *vd_type = NULL; + char *vd_path = NULL; ASSERT(spa_writeable(spa)); @@ -2235,7 +2414,7 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) * in this pool. */ if (vd == NULL || unspare) { - char *type; + const char *type; boolean_t draid_spare = B_FALSE; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) @@ -2363,17 +2542,17 @@ spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs) return (0); } -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_ignore_errors, INT, ZMOD_RW, "Ignore hard IO errors when removing device"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_segment, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_segment, UINT, ZMOD_RW, "Largest contiguous segment to allocate when removing device"); -ZFS_MODULE_PARAM(zfs_vdev, vdev_, removal_max_span, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, vdev_, removal_max_span, UINT, ZMOD_RW, "Largest span of free chunks a remap segment can span"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_suspend_progress, INT, ZMOD_RW, +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_suspend_progress, UINT, ZMOD_RW, "Pause device removal after this many bytes are copied " "(debug use only - causes removal to hang)"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_root.c b/sys/contrib/openzfs/module/zfs/vdev_root.c index 45ddc2f71927..e132643dc330 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_root.c +++ b/sys/contrib/openzfs/module/zfs/vdev_root.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev_trim.c b/sys/contrib/openzfs/module/zfs/vdev_trim.c index deea7fedd770..9cf10332e8bf 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_trim.c +++ b/sys/contrib/openzfs/module/zfs/vdev_trim.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,9 +20,10 @@ */ /* - * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2016, 2024 by Delphix. All rights reserved. * Copyright (c) 2019 by Lawrence Livermore National Security, LLC. * Copyright (c) 2021 Hewlett Packard Enterprise Development LP + * Copyright 2023 RackTop Systems, Inc. */ #include <sys/spa.h> @@ -96,12 +97,12 @@ /* * Maximum size of TRIM I/O, ranges will be chunked in to 128MiB lengths. 
*/ -unsigned int zfs_trim_extent_bytes_max = 128 * 1024 * 1024; +static unsigned int zfs_trim_extent_bytes_max = 128 * 1024 * 1024; /* * Minimum size of TRIM I/O, extents smaller than 32Kib will be skipped. */ -unsigned int zfs_trim_extent_bytes_min = 32 * 1024; +static unsigned int zfs_trim_extent_bytes_min = 32 * 1024; /* * Skip uninitialized metaslabs during the TRIM process. This option is @@ -118,7 +119,7 @@ unsigned int zfs_trim_metaslab_skip = 0; * concurrent TRIM I/Os issued to the device is controlled by the * zfs_vdev_trim_min_active and zfs_vdev_trim_max_active module options. */ -unsigned int zfs_trim_queue_limit = 10; +static unsigned int zfs_trim_queue_limit = 10; /* * The minimum number of transaction groups between automatic trims of a @@ -134,7 +135,7 @@ unsigned int zfs_trim_queue_limit = 10; * has the opposite effect. The default value of 32 was determined though * testing to be a reasonable compromise. */ -unsigned int zfs_trim_txg_batch = 32; +static unsigned int zfs_trim_txg_batch = 32; /* * The trim_args are a control structure which describe how a leaf vdev @@ -168,7 +169,8 @@ static boolean_t vdev_trim_should_stop(vdev_t *vd) { return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) || - vd->vdev_detached || vd->vdev_top->vdev_removing); + vd->vdev_detached || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding); } /* @@ -179,10 +181,31 @@ vdev_autotrim_should_stop(vdev_t *tvd) { return (tvd->vdev_autotrim_exit_wanted || !vdev_writeable(tvd) || tvd->vdev_removing || + tvd->vdev_rz_expanding || spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF); } /* + * Wait for given number of kicks, return true if the wait is aborted due to + * vdev_autotrim_exit_wanted. + */ +static boolean_t +vdev_autotrim_wait_kick(vdev_t *vd, int num_of_kick) +{ + mutex_enter(&vd->vdev_autotrim_lock); + for (int i = 0; i < num_of_kick; i++) { + if (vd->vdev_autotrim_exit_wanted) + break; + cv_wait_idle(&vd->vdev_autotrim_kick_cv, + &vd->vdev_autotrim_lock); + } + boolean_t exit_wanted = vd->vdev_autotrim_exit_wanted; + mutex_exit(&vd->vdev_autotrim_lock); + + return (exit_wanted); +} + +/* * The sync task for updating the on-disk state of a manual TRIM. This * is scheduled by vdev_trim_change_state(). */ @@ -202,7 +225,8 @@ vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx) kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + if (vd == NULL || vd->vdev_top->vdev_removing || + !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding) return; uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK]; @@ -572,6 +596,7 @@ vdev_trim_ranges(trim_args_t *ta) uint64_t extent_bytes_max = ta->trim_extent_bytes_max; uint64_t extent_bytes_min = ta->trim_extent_bytes_min; spa_t *spa = vd->vdev_spa; + int error = 0; ta->trim_start_time = gethrtime(); ta->trim_bytes_done = 0; @@ -591,19 +616,32 @@ vdev_trim_ranges(trim_args_t *ta) uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1; for (uint64_t w = 0; w < writes_required; w++) { - int error; - error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE + rs_get_start(rs, ta->trim_tree) + (w *extent_bytes_max), MIN(size - (w * extent_bytes_max), extent_bytes_max)); if (error != 0) { - return (error); + goto done; } } } - return (0); +done: + /* + * Make sure all TRIMs for this metaslab have completed before + * returning. 
TRIM zios have lower priority over regular or syncing + * zios, so all TRIM zios for this metaslab must complete before the + * metaslab is re-enabled. Otherwise it's possible write zios to + * this metaslab could cut ahead of still queued TRIM zios for this + * metaslab causing corruption if the ranges overlap. + */ + mutex_enter(&vd->vdev_trim_io_lock); + while (vd->vdev_trim_inflight[0] > 0) { + cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); + } + mutex_exit(&vd->vdev_trim_io_lock); + + return (error); } static void @@ -834,7 +872,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) * by its ms_allocatable. While a metaslab is undergoing trimming it is * not eligible for new allocations. */ -static void +static __attribute__((noreturn)) void vdev_trim_thread(void *arg) { vdev_t *vd = arg; @@ -922,11 +960,6 @@ vdev_trim_thread(void *arg) } spa_config_exit(spa, SCL_CONFIG, FTAG); - mutex_enter(&vd->vdev_trim_io_lock); - while (vd->vdev_trim_inflight[0] > 0) { - cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); - } - mutex_exit(&vd->vdev_trim_io_lock); range_tree_destroy(ta.trim_tree); @@ -976,6 +1009,7 @@ vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure) ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_trim_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_rz_expanding); vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure); vd->vdev_trim_thread = thread_create(NULL, 0, @@ -1003,9 +1037,11 @@ vdev_trim_stop_wait_impl(vdev_t *vd) void vdev_trim_stop_wait(spa_t *spa, list_t *vd_list) { + (void) spa; vdev_t *vd; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); while ((vd = list_remove_head(vd_list)) != NULL) { mutex_enter(&vd->vdev_trim_lock); @@ -1044,7 +1080,8 @@ vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list) if (vd_list == NULL) { vdev_trim_stop_wait_impl(vd); } else { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_export_thread == curthread); list_insert_tail(vd_list, vd); } } @@ -1080,7 +1117,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) list_t vd_list; vdev_t *vd_l2cache; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); @@ -1113,7 +1151,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) void vdev_trim_restart(vdev_t *vd) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_load_thread == curthread); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_leaf_zap != 0) { @@ -1132,12 +1171,13 @@ vdev_trim_restart(vdev_t *vd) ASSERT(err == 0 || err == ENOENT); vd->vdev_trim_action_time = timestamp; - if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || - vd->vdev_offline) { + if ((vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || + vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) { /* load progress for reporting, but don't resume */ VERIFY0(vdev_trim_load(vd)); } else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE && vdev_writeable(vd) && !vd->vdev_top->vdev_removing && + !vd->vdev_top->vdev_rz_expanding && vd->vdev_trim_thread == NULL) { VERIFY0(vdev_trim_load(vd)); vdev_trim(vd, vd->vdev_trim_rate, @@ -1174,7 +1214,7 @@ vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size) * N.B. 
This behavior is different from a manual TRIM where a thread * is created for each leaf vdev, instead of each top-level vdev. */ -static void +static __attribute__((noreturn)) void vdev_autotrim_thread(void *arg) { vdev_t *vd = arg; @@ -1187,12 +1227,10 @@ vdev_autotrim_thread(void *arg) mutex_exit(&vd->vdev_autotrim_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - uint64_t extent_bytes_max = zfs_trim_extent_bytes_max; - uint64_t extent_bytes_min = zfs_trim_extent_bytes_min; - while (!vdev_autotrim_should_stop(vd)) { int txgs_per_trim = MAX(zfs_trim_txg_batch, 1); - boolean_t issued_trim = B_FALSE; + uint64_t extent_bytes_max = zfs_trim_extent_bytes_max; + uint64_t extent_bytes_min = zfs_trim_extent_bytes_min; /* * All of the metaslabs are divided in to groups of size @@ -1224,6 +1262,8 @@ vdev_autotrim_thread(void *arg) i += txgs_per_trim) { metaslab_t *msp = vd->vdev_ms[i]; range_tree_t *trim_tree; + boolean_t issued_trim = B_FALSE; + boolean_t wait_aborted = B_FALSE; spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_disable(msp); @@ -1374,7 +1414,18 @@ vdev_autotrim_thread(void *arg) range_tree_vacate(trim_tree, NULL, NULL); range_tree_destroy(trim_tree); - metaslab_enable(msp, issued_trim, B_FALSE); + /* + * Wait for couples of kicks, to ensure the trim io is + * synced. If the wait is aborted due to + * vdev_autotrim_exit_wanted, we need to signal + * metaslab_enable() to wait for sync. + */ + if (issued_trim) { + wait_aborted = vdev_autotrim_wait_kick(vd, + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE); + } + + metaslab_enable(msp, wait_aborted, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (uint64_t c = 0; c < children; c++) { @@ -1388,17 +1439,14 @@ vdev_autotrim_thread(void *arg) } kmem_free(tap, sizeof (trim_args_t) * children); + + if (vdev_autotrim_should_stop(vd)) + break; } spa_config_exit(spa, SCL_CONFIG, FTAG); - /* - * After completing the group of metaslabs wait for the next - * open txg. This is done to make sure that a minimum of - * zfs_trim_txg_batch txgs will occur before these metaslabs - * are trimmed again. 
- */ - txg_wait_open(spa_get_dsl(spa), 0, issued_trim); + vdev_autotrim_wait_kick(vd, 1); shift++; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -1454,7 +1502,8 @@ vdev_autotrim(spa_t *spa) mutex_enter(&tvd->vdev_autotrim_lock); if (vdev_writeable(tvd) && !tvd->vdev_removing && - tvd->vdev_autotrim_thread == NULL) { + tvd->vdev_autotrim_thread == NULL && + !tvd->vdev_rz_expanding) { ASSERT3P(tvd->vdev_top, ==, tvd); tvd->vdev_autotrim_thread = thread_create(NULL, 0, @@ -1476,11 +1525,9 @@ vdev_autotrim_stop_wait(vdev_t *tvd) mutex_enter(&tvd->vdev_autotrim_lock); if (tvd->vdev_autotrim_thread != NULL) { tvd->vdev_autotrim_exit_wanted = B_TRUE; - - while (tvd->vdev_autotrim_thread != NULL) { - cv_wait(&tvd->vdev_autotrim_cv, - &tvd->vdev_autotrim_lock); - } + cv_broadcast(&tvd->vdev_autotrim_kick_cv); + cv_wait(&tvd->vdev_autotrim_cv, + &tvd->vdev_autotrim_lock); ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL); tvd->vdev_autotrim_exit_wanted = B_FALSE; @@ -1488,6 +1535,24 @@ vdev_autotrim_stop_wait(vdev_t *tvd) mutex_exit(&tvd->vdev_autotrim_lock); } +void +vdev_autotrim_kick(spa_t *spa) +{ + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + + vdev_t *root_vd = spa->spa_root_vdev; + vdev_t *tvd; + + for (uint64_t i = 0; i < root_vd->vdev_children; i++) { + tvd = root_vd->vdev_child[i]; + + mutex_enter(&tvd->vdev_autotrim_lock); + if (tvd->vdev_autotrim_thread != NULL) + cv_broadcast(&tvd->vdev_autotrim_kick_cv); + mutex_exit(&tvd->vdev_autotrim_lock); + } +} + /* * Wait for all of the vdev_autotrim_thread associated with the pool to * be terminated (canceled or stopped). @@ -1507,19 +1572,19 @@ vdev_autotrim_stop_all(spa_t *spa) void vdev_autotrim_restart(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); if (spa->spa_autotrim) vdev_autotrim(spa); } -static void +static __attribute__((noreturn)) void vdev_trim_l2arc_thread(void *arg) { vdev_t *vd = arg; spa_t *spa = vd->vdev_spa; l2arc_dev_t *dev = l2arc_vdev_get(vd); - trim_args_t ta; + trim_args_t ta = {0}; range_seg64_t physical_rs; ASSERT(vdev_is_concrete(vd)); @@ -1530,7 +1595,6 @@ vdev_trim_l2arc_thread(void *arg) vd->vdev_trim_partial = 0; vd->vdev_trim_secure = 0; - bzero(&ta, sizeof (ta)); ta.trim_vdev = vd; ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); ta.trim_type = TRIM_TYPE_MANUAL; @@ -1590,7 +1654,7 @@ vdev_trim_l2arc_thread(void *arg) */ spa_config_enter(vd->vdev_spa, SCL_L2ARC, vd, RW_READER); - bzero(dev->l2ad_dev_hdr, dev->l2ad_dev_hdr_asize); + memset(dev->l2ad_dev_hdr, 0, dev->l2ad_dev_hdr_asize); l2arc_dev_hdr_update(dev); spa_config_exit(vd->vdev_spa, SCL_L2ARC, vd); @@ -1654,9 +1718,9 @@ vdev_trim_l2arc(spa_t *spa) int vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) { - trim_args_t ta; - range_seg64_t physical_rs; - int error; + trim_args_t ta = {0}; + range_seg64_t physical_rs; + int error; physical_rs.rs_start = start; physical_rs.rs_end = start + size; @@ -1664,8 +1728,8 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_top->vdev_rz_expanding); - bzero(&ta, sizeof (ta)); ta.trim_vdev = vd; ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); ta.trim_type = TRIM_TYPE_SIMPLE; @@ -1708,19 +1772,17 @@ EXPORT_SYMBOL(vdev_autotrim_restart); EXPORT_SYMBOL(vdev_trim_l2arc); EXPORT_SYMBOL(vdev_trim_simple); -/* BEGIN CSTYLED */ 
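The vdev_trim.c changes above replace the old per-pass txg_wait_open() with a condition-variable handshake: a caller outside this hunk broadcasts vdev_autotrim_kick_cv through vdev_autotrim_kick() (presumably once per txg sync), vdev_autotrim_thread() counts those wakeups in vdev_autotrim_wait_kick(), and vdev_autotrim_stop_wait() broadcasts the same cv so a stopping thread is not left sleeping. A minimal userspace sketch of the wait-for-N-kicks pattern, using pthreads and illustrative names rather than the kernel cv_* primitives:

/*
 * Minimal sketch of the kick/wait handshake above; names are
 * illustrative stand-ins, not the ZFS primitives.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t kick_cv = PTHREAD_COND_INITIALIZER;
static bool exit_wanted;

/* Rough analogue of vdev_autotrim_wait_kick(): sleep for n kicks or exit. */
static bool
wait_kick(int n)
{
	pthread_mutex_lock(&lock);
	for (int i = 0; i < n; i++) {
		if (exit_wanted)
			break;
		pthread_cond_wait(&kick_cv, &lock);
	}
	bool aborted = exit_wanted;
	pthread_mutex_unlock(&lock);
	return (aborted);
}

/* Rough analogue of vdev_autotrim_kick(): one broadcast per "txg sync". */
static void
kick(void)
{
	pthread_mutex_lock(&lock);
	pthread_cond_broadcast(&kick_cv);
	pthread_mutex_unlock(&lock);
}

static void *
syncer(void *arg)
{
	(void) arg;
	for (int txg = 0; txg < 10; txg++) {
		usleep(1000);
		kick();
	}
	pthread_mutex_lock(&lock);	/* analogue of the stop/exit path */
	exit_wanted = true;
	pthread_cond_broadcast(&kick_cv);
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t t;
	pthread_create(&t, NULL, syncer, NULL);
	printf("aborted=%d\n", wait_kick(8));	/* e.g. wait for 8 kicks */
	pthread_join(t, NULL);
	return (0);
}

Counting wakeups rather than sleeping for a fixed number of txgs lets the thread react immediately when vdev_autotrim_exit_wanted is set; if the wait is cut short that way, the aborted flag is handed to metaslab_enable() above so the TRIMs still get synced.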
ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_max, UINT, ZMOD_RW, - "Max size of TRIM commands, larger will be split"); + "Max size of TRIM commands, larger will be split"); ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_min, UINT, ZMOD_RW, - "Min size of TRIM commands, smaller will be skipped"); + "Min size of TRIM commands, smaller will be skipped"); ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, metaslab_skip, UINT, ZMOD_RW, - "Skip metaslabs which have never been initialized"); + "Skip metaslabs which have never been initialized"); ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, txg_batch, UINT, ZMOD_RW, - "Min number of txgs to aggregate frees before issuing TRIM"); + "Min number of txgs to aggregate frees before issuing TRIM"); ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, queue_limit, UINT, ZMOD_RW, - "Max queued TRIMs outstanding per leaf vdev"); -/* END CSTYLED */ + "Max queued TRIMs outstanding per leaf vdev"); diff --git a/sys/contrib/openzfs/module/zfs/zap.c b/sys/contrib/openzfs/module/zfs/zap.c index 6f03beef3bdb..03b76ea1b7bf 100644 --- a/sys/contrib/openzfs/module/zfs/zap.c +++ b/sys/contrib/openzfs/module/zfs/zap.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -22,6 +22,8 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2023 Alexander Stetsenko <alex.stetsenko@gmail.com> + * Copyright (c) 2023, Klara Inc. */ /* @@ -41,6 +43,7 @@ #include <sys/spa.h> #include <sys/dmu.h> +#include <sys/dnode.h> #include <sys/zfs_context.h> #include <sys/zfs_znode.h> #include <sys/fs/zfs.h> @@ -76,13 +79,18 @@ * the zfs-specific implementation of the directory's st_size (which is * the number of entries). */ -int zap_iterate_prefetch = B_TRUE; +static int zap_iterate_prefetch = B_TRUE; -int fzap_default_block_shift = 14; /* 16k blocksize */ +/* + * Enable ZAP shrinking. When enabled, empty sibling leaf blocks will be + * collapsed into a single block. 
+ */ +int zap_shrink_enabled = B_TRUE; -extern inline zap_phys_t *zap_f_phys(zap_t *zap); +int fzap_default_block_shift = 14; /* 16k blocksize */ static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); +static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx); void fzap_byteswap(void *vbuf, size_t size) @@ -114,7 +122,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) * explicitly zero it since it might be coming from an * initialized microzap */ - bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size); + memset(zap->zap_dbuf->db_data, 0, zap->zap_dbuf->db_size); zp->zap_block_type = ZBT_HEADER; zp->zap_magic = ZAP_MAGIC; @@ -135,7 +143,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) * set up block 1 - the first leaf */ dmu_buf_t *db; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db, tx); @@ -184,7 +192,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); tbl->zt_nextblk = newblk; ASSERT0(tbl->zt_blks_copied); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, tbl->zt_blk << bs, tbl->zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); } @@ -195,21 +203,21 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, uint64_t b = tbl->zt_blks_copied; dmu_buf_t *db_old; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); if (err != 0) return (err); /* first half of entries in old[b] go to new[2*b+0] */ dmu_buf_t *db_new; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func(db_old->db_data, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func((uint64_t *)db_old->db_data + hepb, @@ -257,7 +265,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, uint64_t off = idx & ((1<<(bs-3))-1); dmu_buf_t *db; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); @@ -269,7 +277,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, uint64_t off2 = idx2 & ((1<<(bs-3))-1); dmu_buf_t *db2; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, DMU_READ_NO_PREFETCH); if (err != 0) { @@ -298,16 +306,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) uint64_t blk = idx >> (bs-3); uint64_t off = idx & ((1<<(bs-3))-1); - /* - * Note: this is equivalent to dmu_buf_hold(), but we use - * _dnode_enter / _by_dnode because it's faster because we don't - * have to hold the dnode. 
- */ - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); dmu_buf_t *db; - int err = dmu_buf_hold_by_dnode(dn, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); *valp = ((uint64_t *)db->db_data)[off]; @@ -321,11 +322,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) */ blk = (idx*2) >> (bs-3); - dn = dmu_buf_dnode_enter(zap->zap_dbuf); - err = dmu_buf_hold_by_dnode(dn, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_nextblk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err == 0) dmu_buf_rele(db, FTAG); } @@ -370,7 +369,7 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) uint64_t newblk = zap_allocate_blocks(zap, 1); dmu_buf_t *db_new; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, DMU_READ_NO_PREFETCH); if (err != 0) @@ -426,20 +425,36 @@ zap_leaf_evict_sync(void *dbu) static zap_leaf_t * zap_create_leaf(zap_t *zap, dmu_tx_t *tx) { - zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL); - rw_enter(&l->l_rwlock, RW_WRITER); - l->l_blkid = zap_allocate_blocks(zap, 1); - l->l_dbuf = NULL; + uint64_t blkid = zap_allocate_blocks(zap, 1); + dmu_buf_t *db = NULL; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, - l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, + blkid << FZAP_BLOCK_SHIFT(zap), NULL, &db, DMU_READ_NO_PREFETCH)); - dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); - VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu)); + + /* + * Create the leaf structure and stash it on the dbuf. If zap was + * recent shrunk or truncated, the dbuf might have been sitting in the + * cache waiting to be evicted, and so still have the old leaf attached + * to it. If so, just reuse it. 
+ */ + zap_leaf_t *l = dmu_buf_get_user(db); + if (l == NULL) { + l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + l->l_blkid = blkid; + l->l_dbuf = db; + rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL); + dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, + &l->l_dbuf); + dmu_buf_set_user(l->l_dbuf, &l->l_dbu); + } else { + ASSERT3U(l->l_blkid, ==, blkid); + ASSERT3P(l->l_dbuf, ==, db); + } + + rw_enter(&l->l_rwlock, RW_WRITER); dmu_buf_will_dirty(l->l_dbuf, tx); zap_leaf_init(l, zap->zap_normflags != 0); @@ -535,10 +550,8 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, return (SET_ERROR(ENOENT)); int bs = FZAP_BLOCK_SHIFT(zap); - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); - int err = dmu_buf_hold_by_dnode(dn, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); @@ -600,6 +613,72 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) } static int +zap_set_idx_range_to_blk(zap_t *zap, uint64_t idx, uint64_t nptrs, uint64_t blk, + dmu_tx_t *tx) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + int epb = bs >> 3; /* entries per block */ + int err = 0; + + ASSERT(tx != NULL); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + /* + * Check for i/o errors + */ + for (int i = 0; i < nptrs; i += epb) { + uint64_t blk; + err = zap_idx_to_blk(zap, idx + i, &blk); + if (err != 0) { + return (err); + } + } + + for (int i = 0; i < nptrs; i++) { + err = zap_set_idx_to_blk(zap, idx + i, blk, tx); + ASSERT0(err); /* we checked for i/o errors above */ + if (err != 0) + break; + } + + return (err); +} + +#define ZAP_PREFIX_HASH(pref, pref_len) ((pref) << (64 - (pref_len))) + +/* + * Each leaf has a single range of entries (block pointers) in the ZAP ptrtbl. + * If two leaves are siblings, their ranges are adjacent and contain the same + * number of entries. In order to find out if a leaf has a sibling, we need to + * check the range corresponding to the sibling leaf. There is no need to check + * all entries in the range; we only need to check the first and the last one.
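As a concrete illustration of that range check (a standalone sketch; the two macros are re-derived for the example rather than pulled from the ZAP headers): with a pointer table of 2^12 entries, a leaf with prefix 0b0110 and prefix_len 4 has sibling prefix 0b0111, and the sibling's range is the 2^(12-4) = 256 consecutive ptrtbl entries starting at index 0x700. If the first and the last pointer of that range name the same block, that block is the sibling leaf.

#include <stdint.h>
#include <stdio.h>

#define	EX_PREFIX_HASH(pref, len)	((uint64_t)(pref) << (64 - (len)))
#define	EX_HASH_IDX(hash, shift)	((shift) == 0 ? 0 : ((hash) >> (64 - (shift))))

int
main(void)
{
	uint64_t zt_shift = 12;			/* ptrtbl holds 2^12 pointers */
	uint64_t prefix = 0x6, prefix_len = 4;	/* leaf prefix 0b0110 */

	uint64_t sl_prefix = prefix ^ 1;	/* sibling prefix 0b0111 */
	uint64_t nptrs = 1ULL << (zt_shift - prefix_len);
	uint64_t idx = EX_HASH_IDX(EX_PREFIX_HASH(sl_prefix, prefix_len),
	    zt_shift);

	/* The sibling, if any, owns ptrtbl[idx .. idx + nptrs - 1]. */
	printf("sibling 0x%llx: %llu pointers starting at index 0x%llx\n",
	    (unsigned long long)sl_prefix, (unsigned long long)nptrs,
	    (unsigned long long)idx);
	return (0);
}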
+ */ +static uint64_t +check_sibling_ptrtbl_range(zap_t *zap, uint64_t prefix, uint64_t prefix_len) +{ + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + uint64_t h = ZAP_PREFIX_HASH(prefix, prefix_len); + uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + uint64_t pref_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - prefix_len; + uint64_t nptrs = (1 << pref_diff); + uint64_t first; + uint64_t last; + + ASSERT3U(idx+nptrs, <=, (1UL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); + + if (zap_idx_to_blk(zap, idx, &first) != 0) + return (0); + + if (zap_idx_to_blk(zap, idx + nptrs - 1, &last) != 0) + return (0); + + if (first != last) + return (0); + return (first); +} + +static int zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) { uint64_t blk; @@ -628,7 +707,7 @@ zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) static int zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, - void *tag, dmu_tx_t *tx, zap_leaf_t **lp) + const void *tag, dmu_tx_t *tx, zap_leaf_t **lp) { zap_t *zap = zn->zn_zap; uint64_t hash = zn->zn_hash; @@ -648,6 +727,7 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, uint64_t object = zap->zap_object; zap_put_leaf(l); + *lp = l = NULL; zap_unlockdir(zap, tag); err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); @@ -717,7 +797,7 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, static void zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, - void *tag, dmu_tx_t *tx) + const void *tag, dmu_tx_t *tx) { zap_t *zap = zn->zn_zap; int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; @@ -826,7 +906,7 @@ fzap_lookup(zap_name_t *zn, int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, - const void *val, uint32_t cd, void *tag, dmu_tx_t *tx) + const void *val, uint32_t cd, const void *tag, dmu_tx_t *tx) { zap_leaf_t *l; int err; @@ -857,28 +937,24 @@ retry: } else if (err == EAGAIN) { err = zap_expand_leaf(zn, l, tag, tx, &l); zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ - if (err == 0) { + if (err == 0) goto retry; - } else if (err == ENOSPC) { - /* - * If we failed to expand the leaf, then bailout - * as there is no point trying - * zap_put_leaf_maybe_grow_ptrtbl(). 
- */ - return (err); - } } out: - if (zap != NULL) - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + if (l != NULL) { + if (err == ENOSPC) + zap_put_leaf(l); + else + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + } return (err); } int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, - const void *val, void *tag, dmu_tx_t *tx) + const void *val, const void *tag, dmu_tx_t *tx) { int err = fzap_check(zn, integer_size, num_integers); if (err != 0) @@ -891,7 +967,7 @@ fzap_add(zap_name_t *zn, int fzap_update(zap_name_t *zn, int integer_size, uint64_t num_integers, const void *val, - void *tag, dmu_tx_t *tx) + const void *tag, dmu_tx_t *tx) { zap_leaf_t *l; int err; @@ -928,8 +1004,12 @@ retry: goto retry; } - if (zap != NULL) - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + if (l != NULL) { + if (err == ENOSPC) + zap_put_leaf(l); + else + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + } return (err); } @@ -948,9 +1028,9 @@ fzap_length(zap_name_t *zn, if (err != 0) goto out; - if (integer_size != 0) + if (integer_size != NULL) *integer_size = zeh.zeh_integer_size; - if (num_integers != 0) + if (num_integers != NULL) *num_integers = zeh.zeh_num_integers; out: zap_put_leaf(l); @@ -971,6 +1051,10 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx) if (err == 0) { zap_entry_remove(&zeh); zap_increment_num_entries(zn->zn_zap, -1, tx); + + if (zap_leaf_phys(l)->l_hdr.lh_nentries == 0 && + zap_shrink_enabled) + return (zap_shrink(zn, l, tx)); } zap_put_leaf(l); return (err); @@ -987,7 +1071,7 @@ fzap_prefetch(zap_name_t *zn) if (zap_idx_to_blk(zap, idx, &blk) != 0) return; int bs = FZAP_BLOCK_SHIFT(zap); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs, ZIO_PRIORITY_SYNC_READ); } @@ -1230,18 +1314,24 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) */ if (zc->zc_hash == 0 && zap_iterate_prefetch && zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { - dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0, zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), ZIO_PRIORITY_ASYNC_READ); } - if (zc->zc_leaf && - (ZAP_HASH_IDX(zc->zc_hash, - zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != - zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { + if (zc->zc_leaf) { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; + + /* + * The leaf was either shrunk or split. 
+ */ + if ((zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_block_type == 0) || + (ZAP_HASH_IDX(zc->zc_hash, + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + } } again: @@ -1250,8 +1340,6 @@ again: &zc->zc_leaf); if (err != 0) return (err); - } else { - rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); } l = zc->zc_leaf; @@ -1358,7 +1446,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); } else { - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); @@ -1368,7 +1456,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) dmu_buf_t *db; int err; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) { @@ -1380,7 +1468,242 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) } } -/* BEGIN CSTYLED */ +/* + * Find the last allocated block and update freeblk. + */ +static void +zap_trunc(zap_t *zap) +{ + uint64_t nentries; + uint64_t lastblk; + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + if (zap_f_phys(zap)->zap_ptrtbl.zt_blk > 0) { + /* External ptrtbl */ + nentries = (1 << zap_f_phys(zap)->zap_ptrtbl.zt_shift); + lastblk = zap_f_phys(zap)->zap_ptrtbl.zt_blk + + zap_f_phys(zap)->zap_ptrtbl.zt_numblks - 1; + } else { + /* Embedded ptrtbl */ + nentries = (1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + lastblk = 0; + } + + for (uint64_t idx = 0; idx < nentries; idx++) { + uint64_t blk; + if (zap_idx_to_blk(zap, idx, &blk) != 0) + return; + if (blk > lastblk) + lastblk = blk; + } + + ASSERT3U(lastblk, <, zap_f_phys(zap)->zap_freeblk); + + zap_f_phys(zap)->zap_freeblk = lastblk + 1; +} + +/* + * ZAP shrinking algorithm. + * + * We shrink ZAP recursively, removing empty leaves. We can remove an empty leaf + * only if it has a sibling. Sibling leaves have the same prefix length and + * their prefixes differ only by the least significant (sibling) bit. We require + * both siblings to be empty. This eliminates the need to rehash the non-empty + * remaining leaf. When we have removed one of the two empty siblings, we set + * the ptrtbl entries of the removed leaf to point to the remaining leaf. The + * prefix length of the remaining leaf is decremented. As a result, it has a new + * prefix and it might have a new sibling. So, we repeat the process. + * + * Steps: + * 1. Check if a sibling leaf (sl) exists and is empty. + * 2. Release the leaf (l) if it has the sibling bit (slbit) equal to 1. + * 3. Release the sibling (sl) so it can be dereferenced again with a WRITER lock. + * 4. Upgrade the zapdir lock to WRITER (once). + * 5. Dereference the released leaves again. + * 6. If needed, recheck whether both leaves are still siblings and empty. + * 7. Set the ptrtbl pointers of the removed leaf (slbit 1) to point to the blkid + * of the remaining leaf (slbit 0). + * 8. Free the disk block of the removed leaf (dmu_free_range). + * 9. Decrement prefix_len of the remaining leaf. + * 10. Repeat the steps.
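A toy walk-through of the prefix bookkeeping in steps 9 and 10, assuming each successive sibling also turns out to be empty; this models only the arithmetic on lh_prefix/lh_prefix_len, with no locking or I/O:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t lh_prefix = 0x6;	/* 0b0110: the empty leaf we keep */
	uint64_t lh_prefix_len = 4;

	while (lh_prefix_len > 0) {
		uint64_t sibling = lh_prefix ^ 1;
		printf("prefix 0x%llx/%llu collapses with sibling 0x%llx\n",
		    (unsigned long long)lh_prefix,
		    (unsigned long long)lh_prefix_len,
		    (unsigned long long)sibling);
		/* Step 9: the surviving leaf drops its lowest prefix bit. */
		lh_prefix >>= 1;
		lh_prefix_len--;
	}
	return (0);
}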
+ */ +static int +zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) +{ + zap_t *zap = zn->zn_zap; + int64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + uint64_t hash = zn->zn_hash; + uint64_t prefix = zap_leaf_phys(l)->l_hdr.lh_prefix; + uint64_t prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; + boolean_t trunc = B_FALSE; + int err = 0; + + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0); + ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix); + + boolean_t writer = B_FALSE; + + /* + * To avoid deadlock always deref leaves in the same order - + * sibling 0 first, then sibling 1. + */ + while (prefix_len) { + zap_leaf_t *sl; + int64_t prefix_diff = zt_shift - prefix_len; + uint64_t sl_prefix = prefix ^ 1; + uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len); + int slbit = prefix & 1; + + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0); + + /* + * Check if there is a sibling by reading ptrtbl ptrs. + */ + if (check_sibling_ptrtbl_range(zap, sl_prefix, prefix_len) == 0) + break; + + /* + * sibling 1, unlock it - we haven't yet dereferenced sibling 0. + */ + if (slbit == 1) { + zap_put_leaf(l); + l = NULL; + } + + /* + * Dereference sibling leaf and check if it is empty. + */ + if ((err = zap_deref_leaf(zap, sl_hash, tx, RW_READER, + &sl)) != 0) + break; + + ASSERT3U(ZAP_HASH_IDX(sl_hash, prefix_len), ==, sl_prefix); + + /* + * Check if we have a sibling and it is empty. + */ + if (zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len || + zap_leaf_phys(sl)->l_hdr.lh_nentries != 0) { + zap_put_leaf(sl); + break; + } + + zap_put_leaf(sl); + + /* + * If there two empty sibling, we have work to do, so + * we need to lock ZAP ptrtbl as WRITER. + */ + if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) { + /* We failed to upgrade */ + if (l != NULL) { + zap_put_leaf(l); + l = NULL; + } + + /* + * Usually, the right way to upgrade from a READER lock + * to a WRITER lock is to call zap_unlockdir() and + * zap_lockdir(), but we do not have a tag. Instead, + * we do it in more sophisticated way. + */ + rw_exit(&zap->zap_rwlock); + rw_enter(&zap->zap_rwlock, RW_WRITER); + dmu_buf_will_dirty(zap->zap_dbuf, tx); + + zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + writer = B_TRUE; + } + + /* + * Here we have WRITER lock for ptrtbl. + * Now, we need a WRITER lock for both siblings leaves. + * Also, we have to recheck if the leaves are still siblings + * and still empty. + */ + if (l == NULL) { + /* sibling 0 */ + if ((err = zap_deref_leaf(zap, (slbit ? sl_hash : hash), + tx, RW_WRITER, &l)) != 0) + break; + + /* + * The leaf isn't empty anymore or + * it was shrunk/split while our locks were down. + */ + if (zap_leaf_phys(l)->l_hdr.lh_nentries != 0 || + zap_leaf_phys(l)->l_hdr.lh_prefix_len != prefix_len) + break; + } + + /* sibling 1 */ + if ((err = zap_deref_leaf(zap, (slbit ? hash : sl_hash), tx, + RW_WRITER, &sl)) != 0) + break; + + /* + * The leaf isn't empty anymore or + * it was shrunk/split while our locks were down. + */ + if (zap_leaf_phys(sl)->l_hdr.lh_nentries != 0 || + zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len) { + zap_put_leaf(sl); + break; + } + + /* If we have gotten here, we have a leaf to collapse */ + uint64_t idx = (slbit ? 
prefix : sl_prefix) << prefix_diff; + uint64_t nptrs = (1ULL << prefix_diff); + uint64_t sl_blkid = sl->l_blkid; + + /* + * Set ptrtbl entries to point out to the slibling 0 blkid + */ + if ((err = zap_set_idx_range_to_blk(zap, idx, nptrs, l->l_blkid, + tx)) != 0) { + zap_put_leaf(sl); + break; + } + + /* + * Free sibling 1 disk block. + */ + int bs = FZAP_BLOCK_SHIFT(zap); + if (sl_blkid == zap_f_phys(zap)->zap_freeblk - 1) + trunc = B_TRUE; + + (void) dmu_free_range(zap->zap_objset, zap->zap_object, + sl_blkid << bs, 1 << bs, tx); + zap_put_leaf(sl); + + zap_f_phys(zap)->zap_num_leafs--; + + /* + * Update prefix and prefix_len. + */ + zap_leaf_phys(l)->l_hdr.lh_prefix >>= 1; + zap_leaf_phys(l)->l_hdr.lh_prefix_len--; + + prefix = zap_leaf_phys(l)->l_hdr.lh_prefix; + prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; + } + + if (trunc) + zap_trunc(zap); + + if (l != NULL) + zap_put_leaf(l); + + return (err); +} + +/* CSTYLED */ ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW, "When iterating ZAP object, prefetch it"); -/* END CSTYLED */ + +/* CSTYLED */ +ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW, + "Enable ZAP shrinking"); diff --git a/sys/contrib/openzfs/module/zfs/zap_leaf.c b/sys/contrib/openzfs/module/zfs/zap_leaf.c index aa6c298c3b4b..032aca92695e 100644 --- a/sys/contrib/openzfs/module/zfs/zap_leaf.c +++ b/sys/contrib/openzfs/module/zfs/zap_leaf.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -41,7 +41,8 @@ #include <sys/zap_leaf.h> #include <sys/arc.h> -static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); +static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, + uint16_t entry); #define CHAIN_END 0xffff /* end of the chunk chain */ @@ -52,18 +53,6 @@ static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); #define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)]) -extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l); - -static void -zap_memset(void *a, int c, size_t n) -{ - char *cp = a; - char *cpend = cp + n; - - while (cp < cpend) - *cp++ = c; -} - static void stv(int len, void *addr, uint64_t value) { @@ -81,7 +70,7 @@ stv(int len, void *addr, uint64_t value) *(uint64_t *)addr = value; return; default: - cmn_err(CE_PANIC, "bad int len %d", len); + PANIC("bad int len %d", len); } } @@ -98,13 +87,13 @@ ldv(int len, const void *addr) case 8: return (*(uint64_t *)addr); default: - cmn_err(CE_PANIC, "bad int len %d", len); + PANIC("bad int len %d", len); } return (0xFEEDFACEDEADBEEFULL); } void -zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) +zap_leaf_byteswap(zap_leaf_phys_t *buf, size_t size) { zap_leaf_t l; dmu_buf_t l_dbuf; @@ -121,10 +110,10 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); - for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) + for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); struct zap_leaf_entry *le; @@ 
-162,11 +151,11 @@ void zap_leaf_init(zap_leaf_t *l, boolean_t sort) { l->l_bs = highbit64(l->l_dbuf->db_size) - 1; - zap_memset(&zap_leaf_phys(l)->l_hdr, 0, + memset(&zap_leaf_phys(l)->l_hdr, 0, sizeof (struct zap_leaf_header)); - zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; } @@ -187,7 +176,7 @@ zap_leaf_chunk_alloc(zap_leaf_t *l) { ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0); - int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; + uint_t chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); @@ -209,7 +198,7 @@ zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) zlf->lf_type = ZAP_CHUNK_FREE; zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist; - bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */ + memset(zlf->lf_pad, 0, sizeof (zlf->lf_pad)); /* help it to compress */ zap_leaf_phys(l)->l_hdr.lh_freelist = chunk; zap_leaf_phys(l)->l_hdr.lh_nfree++; @@ -225,28 +214,29 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf, { uint16_t chunk_head; uint16_t *chunkp = &chunk_head; - int byten = 0; + int byten = integer_size; uint64_t value = 0; int shift = (integer_size - 1) * 8; int len = num_integers; ASSERT3U(num_integers * integer_size, <=, ZAP_MAXVALUELEN); + if (len > 0) + value = ldv(integer_size, buf); while (len > 0) { uint16_t chunk = zap_leaf_chunk_alloc(l); struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; la->la_type = ZAP_CHUNK_ARRAY; for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { - if (byten == 0) - value = ldv(integer_size, buf); la->la_array[i] = value >> shift; value <<= 8; - if (++byten == integer_size) { - byten = 0; - buf += integer_size; + if (--byten == 0) { if (--len == 0) break; + byten = integer_size; + buf += integer_size; + value = ldv(integer_size, buf); } } @@ -266,7 +256,7 @@ zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) *chunkp = CHAIN_END; while (chunk != CHAIN_END) { - int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; + uint_t nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, ZAP_CHUNK_ARRAY); zap_leaf_chunk_free(l, chunk); @@ -306,7 +296,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, while (chunk != CHAIN_END) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES); + memcpy(p, la->la_array, ZAP_LEAF_ARRAY_BYTES); p += ZAP_LEAF_ARRAY_BYTES; chunk = la->la_next; } @@ -317,7 +307,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { + for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { value = (value << 8) | la->la_array[i]; byten++; if (byten == array_int_len) { @@ -335,7 +325,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, static boolean_t zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, - int chunk, int array_numints) + uint_t chunk, int array_numints) { int bseen = 0; @@ -346,7 +336,7 @@ zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints, sizeof (*thiskey), array_numints, 
thiskey); - boolean_t match = bcmp(thiskey, zn->zn_key_orig, + boolean_t match = memcmp(thiskey, zn->zn_key_orig, array_numints * sizeof (*thiskey)) == 0; kmem_free(thiskey, array_numints * sizeof (*thiskey)); return (match); @@ -374,7 +364,8 @@ zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread)) + if (memcmp(la->la_array, (char *)zn->zn_key_orig + bseen, + toread)) break; chunk = la->la_next; bseen += toread; @@ -563,7 +554,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, uint64_t valuelen = integer_size * num_integers; - int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * + uint_t numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) return (SET_ERROR(E2BIG)); @@ -625,7 +616,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, /* link it into the hash chain */ /* XXX if we did the search above, we could just use that */ - uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk); + uint16_t *chunkp = zap_leaf_rehash_entry(l, le, chunk); zap_leaf_phys(l)->l_hdr.lh_nentries++; @@ -647,7 +638,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, * form of the name. But all callers have one of these on hand anyway, * so might as well take advantage. A cleaner but slower interface * would accept neither argument, and compute the normalized name as - * needed (using zap_name_alloc(zap_entry_read_name(zeh))). + * needed (using zap_name_alloc_str(zap_entry_read_name(zeh))). 
*/ boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, @@ -668,7 +659,7 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, continue; if (zn == NULL) { - zn = zap_name_alloc(zap, name, MT_NORMALIZE); + zn = zap_name_alloc_str(zap, name, MT_NORMALIZE); allocdzn = B_TRUE; } if (zap_leaf_array_match(zeh->zeh_leaf, zn, @@ -688,9 +679,8 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, */ static uint16_t * -zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) +zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, uint16_t entry) { - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); struct zap_leaf_entry *le2; uint16_t *chunkp; @@ -723,7 +713,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) &ZAP_LEAF_CHUNK(nl, nchunk).l_array; struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int nextchunk = la->la_next; + uint_t nextchunk = la->la_next; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); @@ -740,7 +730,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) } static void -zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) +zap_leaf_transfer_entry(zap_leaf_t *l, uint_t entry, zap_leaf_t *nl) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); @@ -749,7 +739,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk); *nle = *le; /* structure assignment */ - (void) zap_leaf_rehash_entry(nl, chunk); + (void) zap_leaf_rehash_entry(nl, nle, chunk); nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); nle->le_value_chunk = @@ -767,7 +757,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) { - int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; + uint_t bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* set new prefix and prefix_len */ zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1; @@ -778,7 +768,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* break existing hash chains */ - zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); if (sort) @@ -793,7 +783,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) * but this accesses memory more sequentially, and when we're * called, the block is usually pretty full. 
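The split described above comes down to one bit test: an entry migrates to the new leaf exactly when the bit that extends the prefix is set in its hash. A standalone sketch of that test, with bit computed from the pre-split prefix length the same way zap_leaf_split() does:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t lh_prefix_len = 4;		/* prefix length before the split */
	uint64_t bit = 64 - 1 - lh_prefix_len;	/* the new distinguishing bit */
	/* Both hashes share the current 4-bit prefix 0b1010. */
	uint64_t hashes[] = { 0xa800000000000000ULL, 0xa000000000000000ULL };

	for (int i = 0; i < 2; i++) {
		printf("hash 0x%016llx -> %s leaf\n",
		    (unsigned long long)hashes[i],
		    (hashes[i] & (1ULL << bit)) ? "new" : "old");
	}
	return (0);
}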
*/ - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); if (le->le_type != ZAP_CHUNK_ENTRY) continue; @@ -801,14 +791,14 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) if (le->le_hash & (1ULL << bit)) zap_leaf_transfer_entry(l, i, nl); else - (void) zap_leaf_rehash_entry(l, i); + (void) zap_leaf_rehash_entry(l, le, i); } } void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) { - int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - + uint_t n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - zap_leaf_phys(l)->l_hdr.lh_prefix_len; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_leafs_with_2n_pointers[n]++; @@ -824,9 +814,9 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_blocks_n_tenths_full[n]++; - for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { - int nentries = 0; - int chunk = zap_leaf_phys(l)->l_hash[i]; + for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { + uint_t nentries = 0; + uint_t chunk = zap_leaf_phys(l)->l_hash[i]; while (chunk != CHAIN_END) { struct zap_leaf_entry *le = diff --git a/sys/contrib/openzfs/module/zfs/zap_micro.c b/sys/contrib/openzfs/module/zfs/zap_micro.c index b4611685b204..d806988af96d 100644 --- a/sys/contrib/openzfs/module/zfs/zap_micro.c +++ b/sys/contrib/openzfs/module/zfs/zap_micro.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -33,7 +33,7 @@ #include <sys/zap.h> #include <sys/zap_impl.h> #include <sys/zap_leaf.h> -#include <sys/avl.h> +#include <sys/btree.h> #include <sys/arc.h> #include <sys/dmu_objset.h> @@ -41,10 +41,10 @@ #include <sys/sunddi.h> #endif -extern inline mzap_phys_t *zap_m_phys(zap_t *zap); +int zap_micro_max_size = MZAP_MAX_BLKSZ; static int mzap_upgrade(zap_t **zapp, - void *tag, dmu_tx_t *tx, zap_flags_t flags); + const void *tag, dmu_tx_t *tx, zap_flags_t flags); uint64_t zap_getflags(zap_t *zap) @@ -94,7 +94,7 @@ zap_hash(zap_name_t *zn) wp++, i++) { uint64_t word = *wp; - for (int j = 0; j < zn->zn_key_intlen; j++) { + for (int j = 0; j < 8; j++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ word) & 0xFF]; word >>= NBBY; @@ -164,18 +164,25 @@ zap_match(zap_name_t *zn, const char *matchname) } } +static zap_name_t * +zap_name_alloc(zap_t *zap) +{ + zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); + zn->zn_zap = zap; + return (zn); +} + void zap_name_free(zap_name_t *zn) { kmem_free(zn, sizeof (zap_name_t)); } -zap_name_t * -zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) +static int +zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt) { - zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); + zap_t *zap = zn->zn_zap; - zn->zn_zap = zap; zn->zn_key_intlen = sizeof (*key); zn->zn_key_orig = key; zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; @@ -196,17 +203,13 @@ zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) * what the hash is computed from. 
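For MT_NORMALIZE lookups the hash has to be taken over the normalized key, as the comment above explains; otherwise "Foo" and "foo" could hash into different leaves even though a case-insensitive match must find both. A toy stand-in makes the idea concrete (the real code normalizes with zap_normalize() and hashes with the pool's salted CRC-64):

#include <ctype.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for normalization with case folding; illustration only. */
static void
normalize(const char *in, char *out, size_t outlen)
{
	size_t i;
	for (i = 0; in[i] != '\0' && i + 1 < outlen; i++)
		out[i] = (char)tolower((unsigned char)in[i]);
	out[i] = '\0';
}

/* Stand-in hash; not the ZAP CRC-64. */
static uint64_t
toy_hash(const char *s)
{
	uint64_t h = 0;
	while (*s != '\0')
		h = h * 1099511628211ULL + (uint8_t)*s++;
	return (h);
}

int
main(void)
{
	char a[64], b[64];
	normalize("Foo", a, sizeof (a));
	normalize("foo", b, sizeof (b));
	/* Same hash, so both spellings land in the same leaf: prints 1. */
	printf("%d\n", toy_hash(a) == toy_hash(b));
	return (0);
}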
*/ if (zap_normalize(zap, key, zn->zn_normbuf, - zap->zap_normflags) != 0) { - zap_name_free(zn); - return (NULL); - } + zap->zap_normflags) != 0) + return (SET_ERROR(ENOTSUP)); zn->zn_key_norm = zn->zn_normbuf; zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } else { - if (mt != 0) { - zap_name_free(zn); - return (NULL); - } + if (mt != 0) + return (SET_ERROR(ENOTSUP)); zn->zn_key_norm = zn->zn_key_orig; zn->zn_key_norm_numints = zn->zn_key_orig_numints; } @@ -219,13 +222,22 @@ zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) * what the matching is based on. (Not the hash!) */ if (zap_normalize(zap, key, zn->zn_normbuf, - zn->zn_normflags) != 0) { - zap_name_free(zn); - return (NULL); - } + zn->zn_normflags) != 0) + return (SET_ERROR(ENOTSUP)); zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } + return (0); +} + +zap_name_t * +zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt) +{ + zap_name_t *zn = zap_name_alloc(zap); + if (zap_name_init_str(zn, key, mt) != 0) { + zap_name_free(zn); + return (NULL); + } return (zn); } @@ -273,51 +285,56 @@ zap_byteswap(void *buf, size_t size) } } +__attribute__((always_inline)) inline static int mze_compare(const void *arg1, const void *arg2) { const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze2 = arg2; - int cmp = TREE_CMP(mze1->mze_hash, mze2->mze_hash); - if (likely(cmp)) - return (cmp); - - return (TREE_CMP(mze1->mze_cd, mze2->mze_cd)); + return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd, + (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd)); } +ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t, + mze_compare) + static void -mze_insert(zap_t *zap, int chunkid, uint64_t hash) +mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash) { + mzap_ent_t mze; + ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); - mze->mze_chunkid = chunkid; - mze->mze_hash = hash; - mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; - ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); - avl_add(&zap->zap_m.zap_avl, mze); + mze.mze_chunkid = chunkid; + ASSERT0(hash & 0xffffffff); + mze.mze_hash = hash >> 32; + ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff); + mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd; + ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0); + zfs_btree_add(&zap->zap_m.zap_tree, &mze); } static mzap_ent_t * -mze_find(zap_name_t *zn) +mze_find(zap_name_t *zn, zfs_btree_index_t *idx) { mzap_ent_t mze_tofind; mzap_ent_t *mze; - avl_index_t idx; - avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; + zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree; ASSERT(zn->zn_zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); - mze_tofind.mze_hash = zn->zn_hash; + ASSERT0(zn->zn_hash & 0xffffffff); + mze_tofind.mze_hash = zn->zn_hash >> 32; mze_tofind.mze_cd = 0; - mze = avl_find(avl, &mze_tofind, &idx); + mze = zfs_btree_find(tree, &mze_tofind, idx); if (mze == NULL) - mze = avl_nearest(avl, idx, AVL_AFTER); - for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { + mze = zfs_btree_next(tree, idx, idx); + for (; mze && mze->mze_hash == mze_tofind.mze_hash; + mze = zfs_btree_next(tree, idx, idx)) { ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) return (mze); @@ -330,18 +347,21 @@ static uint32_t mze_find_unused_cd(zap_t *zap, uint64_t hash) { mzap_ent_t mze_tofind; - avl_index_t idx; - avl_tree_t *avl = &zap->zap_m.zap_avl; + zfs_btree_index_t idx; + 
zfs_btree_t *tree = &zap->zap_m.zap_tree; ASSERT(zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT0(hash & 0xffffffff); + hash >>= 32; mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; uint32_t cd = 0; - for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx); - mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { + for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx); + mze && mze->mze_hash == hash; + mze = zfs_btree_next(tree, &idx, &idx)) { if (mze->mze_cd != cd) break; cd++; @@ -366,16 +386,18 @@ mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash) { zap_t *zap = zn->zn_zap; mzap_ent_t mze_tofind; - mzap_ent_t *mze; - avl_index_t idx; - avl_tree_t *avl = &zap->zap_m.zap_avl; + zfs_btree_index_t idx; + zfs_btree_t *tree = &zap->zap_m.zap_tree; uint32_t mzap_ents = 0; + ASSERT0(hash & 0xffffffff); + hash >>= 32; mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; - for (mze = avl_find(avl, &mze_tofind, &idx); - mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { + for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx); + mze && mze->mze_hash == hash; + mze = zfs_btree_next(tree, &idx, &idx)) { mzap_ents++; } @@ -386,28 +408,14 @@ mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash) } static void -mze_remove(zap_t *zap, mzap_ent_t *mze) -{ - ASSERT(zap->zap_ismicro); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - avl_remove(&zap->zap_m.zap_avl, mze); - kmem_free(mze, sizeof (mzap_ent_t)); -} - -static void mze_destroy(zap_t *zap) { - mzap_ent_t *mze; - void *avlcookie = NULL; - - while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))) - kmem_free(mze, sizeof (mzap_ent_t)); - avl_destroy(&zap->zap_m.zap_avl); + zfs_btree_clear(&zap->zap_m.zap_tree); + zfs_btree_destroy(&zap->zap_m.zap_tree); } static zap_t * -mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) +mzap_open(dmu_buf_t *db) { zap_t *winner; uint64_t *zap_hdr = (uint64_t *)db->db_data; @@ -419,8 +427,8 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL); rw_enter(&zap->zap_rwlock, RW_WRITER); - zap->zap_objset = os; - zap->zap_object = obj; + zap->zap_objset = dmu_buf_get_objset(db); + zap->zap_object = db->db_object; zap->zap_dbuf = db; if (zap_block_type != ZBT_MICRO) { @@ -450,21 +458,26 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap->zap_salt = zap_m_phys(zap)->mz_salt; zap->zap_normflags = zap_m_phys(zap)->mz_normflags; zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; - avl_create(&zap->zap_m.zap_avl, mze_compare, - sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); - for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { + /* + * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove() + * overhead on massive inserts below. It still allows to store + * 62 entries before we have to add 2KB B-tree core node. 
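The 62-entry figure above checks out with simple arithmetic if the in-core mzap_ent_t now occupies 8 bytes (a 32-bit truncated hash plus 16-bit cd and chunk id, as mze_insert() above suggests) and a b-tree leaf spends roughly 16 bytes on its header; both sizes are assumptions for this back-of-the-envelope sketch, not values taken from the headers:

#include <stdio.h>

int
main(void)
{
	int leaf_size = 512;	/* custom leaf size passed above */
	int hdr_size = 16;	/* assumed per-leaf b-tree header */
	int elem_size = 8;	/* assumed sizeof (mzap_ent_t) */

	/* (512 - 16) / 8 = 62 entries per leaf. */
	printf("entries per leaf: %d\n", (leaf_size - hdr_size) / elem_size);
	return (0);
}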
+ */ + zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare, + mze_find_in_buf, sizeof (mzap_ent_t), 512); + + zap_name_t *zn = zap_name_alloc(zap); + for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0]) { - zap_name_t *zn; - zap->zap_m.zap_num_entries++; - zn = zap_name_alloc(zap, mze->mze_name, 0); + zap_name_init_str(zn, mze->mze_name, 0); mze_insert(zap, i, zn->zn_hash); - zap_name_free(zn); } } + zap_name_free(zn); } else { zap->zap_salt = zap_f_phys(zap)->zap_salt; zap->zap_normflags = zap_f_phys(zap)->zap_normflags; @@ -505,7 +518,7 @@ handle_winner: * have the specified tag. */ static int -zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, +zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { ASSERT0(db->db_offset); @@ -515,13 +528,13 @@ zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, *zapp = NULL; - dmu_object_info_from_db(db, &doi); + dmu_object_info_from_dnode(dn, &doi); if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) return (SET_ERROR(EINVAL)); zap_t *zap = dmu_buf_get_user(db); if (zap == NULL) { - zap = mzap_open(os, obj, db); + zap = mzap_open(db); if (zap == NULL) { /* * mzap_open() didn't like what it saw on-disk. @@ -550,6 +563,7 @@ zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, } zap->zap_objset = os; + zap->zap_dnode = dn; if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); @@ -561,7 +575,7 @@ zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, if (zap->zap_ismicro && tx && adding && zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; - if (newsz > MZAP_MAX_BLKSZ) { + if (newsz > zap_micro_max_size) { dprintf("upgrading obj %llu: num_entries=%u\n", (u_longlong_t)obj, zap->zap_m.zap_num_entries); *zapp = zap; @@ -581,60 +595,58 @@ zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, static int zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) + krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, + zap_t **zapp) { dmu_buf_t *db; + int err; - int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); - if (err != 0) { + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) return (err); - } -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); - } -#endif - - err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) { + err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) dmu_buf_rele(db, tag); - } + else + VERIFY(dnode_add_ref(dn, tag)); return (err); } int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) + krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, + zap_t **zapp) { + dnode_t *dn; dmu_buf_t *db; + int err; - int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); + err = dnode_hold(os, obj, tag, &dn); if (err != 0) return (err); -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) { + dnode_rele(dn, tag); + return (err); } -#endif - err = 
zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) + err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) { dmu_buf_rele(db, tag); + dnode_rele(dn, tag); + } return (err); } void -zap_unlockdir(zap_t *zap, void *tag) +zap_unlockdir(zap_t *zap, const void *tag) { rw_exit(&zap->zap_rwlock); + dnode_rele(zap->zap_dnode, tag); dmu_buf_rele(zap->zap_dbuf, tag); } static int -mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) +mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags) { int err = 0; zap_t *zap = *zapp; @@ -643,7 +655,7 @@ mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) int sz = zap->zap_dbuf->db_size; mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP); - bcopy(zap->zap_dbuf->db_data, mzp, sz); + memcpy(mzp, zap->zap_dbuf->db_data, sz); int nchunks = zap->zap_m.zap_num_chunks; if (!flags) { @@ -657,24 +669,25 @@ mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) dprintf("upgrading obj=%llu with %u chunks\n", (u_longlong_t)zap->zap_object, nchunks); - /* XXX destroy the avl later, so we can use the stored hash value */ + /* XXX destroy the tree later, so we can use the stored hash value */ mze_destroy(zap); fzap_upgrade(zap, tx, flags); + zap_name_t *zn = zap_name_alloc(zap); for (int i = 0; i < nchunks; i++) { mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; if (mze->mze_name[0] == 0) continue; dprintf("adding %s=%llu\n", mze->mze_name, (u_longlong_t)mze->mze_value); - zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0); + zap_name_init_str(zn, mze->mze_name, 0); /* If we fail here, we would end up losing entries */ VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tag, tx)); zap = zn->zn_zap; /* fzap_add_cd() may change zap */ - zap_name_free(zn); } + zap_name_free(zn); vmem_free(mzp, sz); *zapp = zap; return (0); @@ -714,7 +727,8 @@ mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx) if (flags != 0) { zap_t *zap; /* Only fat zap supports flags; upgrade immediately. */ - VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, + VERIFY(dnode_add_ref(dn, FTAG)); + VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER, B_FALSE, B_FALSE, &zap)); VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); zap_unlockdir(zap, FTAG); @@ -727,7 +741,7 @@ static uint64_t zap_create_impl(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, - dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx) + dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) { uint64_t obj; @@ -859,7 +873,7 @@ uint64_t zap_create_hold(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, - dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx) + dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) { return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, indirect_blockshift, bonustype, bonuslen, dnodesize, @@ -916,22 +930,23 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) * See also the comment above zap_entry_normalization_conflict(). 
*/ static boolean_t -mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) +mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze, + zfs_btree_index_t *idx) { - int direction = AVL_BEFORE; boolean_t allocdzn = B_FALSE; + mzap_ent_t *other; + zfs_btree_index_t oidx; if (zap->zap_normflags == 0) return (B_FALSE); -again: - for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction); + for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx); other && other->mze_hash == mze->mze_hash; - other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { + other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) { if (zn == NULL) { - zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, - MT_NORMALIZE); + zn = zap_name_alloc_str(zap, + MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE); allocdzn = B_TRUE; } if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { @@ -941,9 +956,20 @@ again: } } - if (direction == AVL_BEFORE) { - direction = AVL_AFTER; - goto again; + for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx); + other && other->mze_hash == mze->mze_hash; + other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) { + + if (zn == NULL) { + zn = zap_name_alloc_str(zap, + MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE); + allocdzn = B_TRUE; + } + if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { + if (allocdzn) + zap_name_free(zn); + return (B_TRUE); + } } if (allocdzn) @@ -971,7 +997,7 @@ zap_lookup_impl(zap_t *zap, const char *name, { int err = 0; - zap_name_t *zn = zap_name_alloc(zap, name, mt); + zap_name_t *zn = zap_name_alloc_str(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); @@ -979,7 +1005,8 @@ zap_lookup_impl(zap_t *zap, const char *name, err = fzap_lookup(zn, integer_size, num_integers, buf, realname, rn_len, ncp); } else { - mzap_ent_t *mze = mze_find(zn); + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { @@ -990,11 +1017,13 @@ zap_lookup_impl(zap_t *zap, const char *name, } else { *(uint64_t *)buf = MZE_PHYS(zap, mze)->mze_value; - (void) strlcpy(realname, - MZE_PHYS(zap, mze)->mze_name, rn_len); + if (realname != NULL) + (void) strlcpy(realname, + MZE_PHYS(zap, mze)->mze_name, + rn_len); if (ncp) { *ncp = mzap_normalization_conflict(zap, - zn, mze); + zn, mze, &idx); } } } @@ -1031,7 +1060,7 @@ zap_prefetch(objset_t *os, uint64_t zapobj, const char *name) err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err) return (err); - zn = zap_name_alloc(zap, name, 0); + zn = zap_name_alloc_str(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1134,7 +1163,7 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc(zap, name, 0); + zap_name_t *zn = zap_name_alloc_str(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1142,7 +1171,8 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, if (!zap->zap_ismicro) { err = fzap_length(zn, integer_size, num_integers); } else { - mzap_ent_t *mze = mze_find(zn); + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { @@ -1182,7 +1212,7 @@ static void mzap_addent(zap_name_t *zn, uint64_t value) { zap_t *zap = zn->zn_zap; - int start = zap->zap_m.zap_alloc_next; + uint16_t start = zap->zap_m.zap_alloc_next; 
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); @@ -1198,7 +1228,7 @@ mzap_addent(zap_name_t *zn, uint64_t value) ASSERT(cd < zap_maxcd(zap)); again: - for (int i = start; i < zap->zap_m.zap_num_chunks; i++) { + for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0] == 0) { mze->mze_value = value; @@ -1224,12 +1254,12 @@ again: static int zap_add_impl(zap_t *zap, const char *key, int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx, void *tag) + const void *val, dmu_tx_t *tx, const void *tag) { const uint64_t *intval = val; int err = 0; - zap_name_t *zn = zap_name_alloc(zap, key, 0); + zap_name_t *zn = zap_name_alloc_str(zap, key, 0); if (zn == NULL) { zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); @@ -1247,7 +1277,8 @@ zap_add_impl(zap_t *zap, const char *key, } zap = zn->zn_zap; /* fzap_add() may change zap */ } else { - if (mze_find(zn) != NULL) { + zfs_btree_index_t idx; + if (mze_find(zn, &idx) != NULL) { err = SET_ERROR(EEXIST); } else { mzap_addent(zn, *intval); @@ -1292,6 +1323,26 @@ zap_add_by_dnode(dnode_t *dn, const char *key, return (err); } +static int +zap_add_uint64_impl(zap_t *zap, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx, const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_add(zn, integer_size, num_integers, val, tag, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap, tag); + return (err); +} + int zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, @@ -1303,16 +1354,26 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); - zap = zn->zn_zap; /* fzap_add() may change zap */ - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap, FTAG); + err = zap_add_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_add_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_add_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_add_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1327,7 +1388,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc(zap, name, 0); + zap_name_t *zn = zap_name_alloc_str(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1348,7 +1409,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, } zap = zn->zn_zap; /* fzap_update() may change zap */ } else 
{ - mzap_ent_t *mze = mze_find(zn); + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); if (mze != NULL) { MZE_PHYS(zap, mze)->mze_value = *intval; } else { @@ -1362,27 +1424,56 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, return (err); } -int -zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +static int +zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, + const void *tag) { - zap_t *zap; + int err; - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { - zap_unlockdir(zap, FTAG); + zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } - err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); + err = fzap_update(zn, integer_size, num_integers, val, tag, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlockdir(zap, FTAG); + zap_unlockdir(zap, tag); + return (err); +} + +int +zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, const void *val, + dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_update_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_update_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_update_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_update_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1398,20 +1489,20 @@ zap_remove_impl(zap_t *zap, const char *name, { int err = 0; - zap_name_t *zn = zap_name_alloc(zap, name, mt); + zap_name_t *zn = zap_name_alloc_str(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); if (!zap->zap_ismicro) { err = fzap_remove(zn, tx); } else { - mzap_ent_t *mze = mze_find(zn); + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { zap->zap_m.zap_num_entries--; - bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid], - sizeof (mzap_ent_phys_t)); - mze_remove(zap, mze); + memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t)); + zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx); } } zap_name_free(zn); @@ -1447,6 +1538,23 @@ zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) return (err); } +static int +zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, + dmu_tx_t *tx, const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_remove(zn, tx); + zap_name_free(zn); + zap_unlockdir(zap, tag); + return (err); +} + int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx) @@ -1457,14 +1565,23 @@ 
zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_remove(zn, tx); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); + err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); + /* zap_remove_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); + /* zap_remove_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1582,29 +1699,30 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) if (!zc->zc_zap->zap_ismicro) { err = fzap_cursor_retrieve(zc->zc_zap, zc, za); } else { - avl_index_t idx; + zfs_btree_index_t idx; mzap_ent_t mze_tofind; - mze_tofind.mze_hash = zc->zc_hash; + mze_tofind.mze_hash = zc->zc_hash >> 32; mze_tofind.mze_cd = zc->zc_cd; - mzap_ent_t *mze = - avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); + mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree, + &mze_tofind, &idx); if (mze == NULL) { - mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, - idx, AVL_AFTER); + mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree, + &idx, &idx); } if (mze) { mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); za->za_normalization_conflict = - mzap_normalization_conflict(zc->zc_zap, NULL, mze); + mzap_normalization_conflict(zc->zc_zap, NULL, + mze, &idx); za->za_integer_length = 8; za->za_num_integers = 1; za->za_first_integer = mzep->mze_value; (void) strlcpy(za->za_name, mzep->mze_name, sizeof (za->za_name)); - zc->zc_hash = mze->mze_hash; + zc->zc_hash = (uint64_t)mze->mze_hash << 32; zc->zc_cd = mze->mze_cd; err = 0; } else { @@ -1634,7 +1752,7 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) if (err != 0) return (err); - bzero(zs, sizeof (zap_stats_t)); + memset(zs, 0, sizeof (zap_stats_t)); if (zap->zap_ismicro) { zs->zs_blocksize = zap->zap_dbuf->db_size; @@ -1669,14 +1787,17 @@ EXPORT_SYMBOL(zap_prefetch_uint64); EXPORT_SYMBOL(zap_add); EXPORT_SYMBOL(zap_add_by_dnode); EXPORT_SYMBOL(zap_add_uint64); +EXPORT_SYMBOL(zap_add_uint64_by_dnode); EXPORT_SYMBOL(zap_update); EXPORT_SYMBOL(zap_update_uint64); +EXPORT_SYMBOL(zap_update_uint64_by_dnode); EXPORT_SYMBOL(zap_length); EXPORT_SYMBOL(zap_length_uint64); EXPORT_SYMBOL(zap_remove); EXPORT_SYMBOL(zap_remove_by_dnode); EXPORT_SYMBOL(zap_remove_norm); EXPORT_SYMBOL(zap_remove_uint64); +EXPORT_SYMBOL(zap_remove_uint64_by_dnode); EXPORT_SYMBOL(zap_count); EXPORT_SYMBOL(zap_value_search); EXPORT_SYMBOL(zap_join); @@ -1695,4 +1816,8 @@ EXPORT_SYMBOL(zap_cursor_advance); EXPORT_SYMBOL(zap_cursor_serialize); EXPORT_SYMBOL(zap_cursor_init_serialized); EXPORT_SYMBOL(zap_get_stats); + +/* CSTYLED */ +ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW, + "Maximum micro ZAP size, before converting to a fat ZAP, in bytes"); #endif diff --git a/sys/contrib/openzfs/module/zfs/zcp.c b/sys/contrib/openzfs/module/zfs/zcp.c index f724b44baf1d..7c279162a9d1 100644 --- a/sys/contrib/openzfs/module/zfs/zcp.c +++ b/sys/contrib/openzfs/module/zfs/zcp.c @@ -108,9 +108,9 @@ #define ZCP_NVLIST_MAX_DEPTH 20 
-uint64_t zfs_lua_check_instrlimit_interval = 100; -unsigned long zfs_lua_max_instrlimit = ZCP_MAX_INSTRLIMIT; -unsigned long zfs_lua_max_memlimit = ZCP_MAX_MEMLIMIT; +static const uint64_t zfs_lua_check_instrlimit_interval = 100; +uint64_t zfs_lua_max_instrlimit = ZCP_MAX_INSTRLIMIT; +uint64_t zfs_lua_max_memlimit = ZCP_MAX_MEMLIMIT; /* * Forward declarations for mutually recursive functions @@ -277,9 +277,9 @@ zcp_table_to_nvlist(lua_State *state, int index, int depth) } break; case LUA_TNUMBER: - VERIFY3U(sizeof (buf), >, - snprintf(buf, sizeof (buf), "%lld", - (longlong_t)lua_tonumber(state, -2))); + (void) snprintf(buf, sizeof (buf), "%lld", + (longlong_t)lua_tonumber(state, -2)); + key = buf; if (saw_str_could_collide) { key_could_collide = B_TRUE; @@ -544,7 +544,7 @@ zcp_nvpair_value_to_lua(lua_State *state, nvpair_t *pair, fnvpair_value_nvlist(pair), errbuf, errbuf_len); break; case DATA_TYPE_STRING_ARRAY: { - char **strarr; + const char **strarr; uint_t nelem; (void) nvpair_value_string_array(pair, &strarr, &nelem); lua_newtable(state); @@ -622,7 +622,7 @@ zcp_dataset_hold_error(lua_State *state, dsl_pool_t *dp, const char *dsname, */ dsl_dataset_t * zcp_dataset_hold(lua_State *state, dsl_pool_t *dp, const char *dsname, - void *tag) + const void *tag) { dsl_dataset_t *ds; int error = dsl_dataset_hold(dp, dsname, tag, &ds); @@ -631,11 +631,11 @@ zcp_dataset_hold(lua_State *state, dsl_pool_t *dp, const char *dsname, } static int zcp_debug(lua_State *); -static zcp_lib_info_t zcp_debug_info = { +static const zcp_lib_info_t zcp_debug_info = { .name = "debug", .func = zcp_debug, .pargs = { - { .za_name = "debug string", .za_lua_type = LUA_TSTRING}, + { .za_name = "debug string", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -648,7 +648,7 @@ zcp_debug(lua_State *state) { const char *dbgstring; zcp_run_info_t *ri = zcp_run_info(state); - zcp_lib_info_t *libinfo = &zcp_debug_info; + const zcp_lib_info_t *libinfo = &zcp_debug_info; zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); @@ -661,11 +661,11 @@ zcp_debug(lua_State *state) } static int zcp_exists(lua_State *); -static zcp_lib_info_t zcp_exists_info = { +static const zcp_lib_info_t zcp_exists_info = { .name = "exists", .func = zcp_exists, .pargs = { - { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, + { .za_name = "dataset", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -678,7 +678,7 @@ zcp_exists(lua_State *state) { zcp_run_info_t *ri = zcp_run_info(state); dsl_pool_t *dp = ri->zri_pool; - zcp_lib_info_t *libinfo = &zcp_exists_info; + const zcp_lib_info_t *libinfo = &zcp_exists_info; zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); @@ -769,10 +769,10 @@ zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize) } } -/* ARGSUSED */ static void zcp_lua_counthook(lua_State *state, lua_Debug *ar) { + (void) ar; lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY); zcp_run_info_t *ri = lua_touserdata(state, -1); @@ -780,8 +780,7 @@ zcp_lua_counthook(lua_State *state, lua_Debug *ar) * Check if we were canceled while waiting for the * txg to sync or from our open context thread */ - if (ri->zri_canceled || - (!ri->zri_sync && issig(JUSTLOOKING) && issig(FORREAL))) { + if (ri->zri_canceled || (!ri->zri_sync && issig())) { ri->zri_canceled = B_TRUE; (void) lua_pushstring(state, "Channel program was canceled."); (void) lua_error(state); @@ -958,12 +957,12 @@ zcp_eval_impl(dmu_tx_t *tx, zcp_run_info_t *ri) } static void -zcp_pool_error(zcp_run_info_t *ri, const char 
*poolname) +zcp_pool_error(zcp_run_info_t *ri, const char *poolname, int error) { ri->zri_result = SET_ERROR(ECHRNG); lua_settop(ri->zri_state, 0); - (void) lua_pushfstring(ri->zri_state, "Could not open pool: %s", - poolname); + (void) lua_pushfstring(ri->zri_state, "Could not open pool: %s " + "errno: %d", poolname, error); zcp_convert_return_values(ri->zri_state, ri->zri_outnvl, ZCP_RET_ERROR, &ri->zri_result); @@ -974,10 +973,10 @@ zcp_pool_error(zcp_run_info_t *ri, const char *poolname) * The txg_wait_synced_sig will continue to wait for the txg to complete * after calling this callback. */ -/* ARGSUSED */ static void zcp_eval_sig(void *arg, dmu_tx_t *tx) { + (void) tx; zcp_run_info_t *ri = arg; ri->zri_canceled = B_TRUE; @@ -1013,7 +1012,7 @@ zcp_eval_open(zcp_run_info_t *ri, const char *poolname) error = dsl_pool_hold(poolname, FTAG, &dp); if (error != 0) { - zcp_pool_error(ri, poolname); + zcp_pool_error(ri, poolname, error); return; } @@ -1159,7 +1158,7 @@ zcp_eval(const char *poolname, const char *program, boolean_t sync, err = dsl_sync_task_sig(poolname, NULL, zcp_eval_sync, zcp_eval_sig, &runinfo, 0, ZFS_SPACE_CHECK_ZCP_EVAL); if (err != 0) - zcp_pool_error(&runinfo, poolname); + zcp_pool_error(&runinfo, poolname, err); } else { zcp_eval_open(&runinfo, poolname); } @@ -1443,10 +1442,8 @@ zcp_parse_args(lua_State *state, const char *fname, const zcp_arg_t *pargs, } } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_instrlimit, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_instrlimit, U64, ZMOD_RW, "Max instruction limit that can be specified for a channel program"); -ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_memlimit, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_memlimit, U64, ZMOD_RW, "Max memory limit that can be specified for a channel program"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/zcp_get.c b/sys/contrib/openzfs/module/zfs/zcp_get.c index 7256e4de1915..6fd45151d92a 100644 --- a/sys/contrib/openzfs/module/zfs/zcp_get.c +++ b/sys/contrib/openzfs/module/zfs/zcp_get.c @@ -76,9 +76,8 @@ get_objset_type(dsl_dataset_t *ds, zfs_type_t *type) static int get_objset_type_name(dsl_dataset_t *ds, char *str) { - int error; - zfs_type_t type; - error = get_objset_type(ds, &type); + zfs_type_t type = ZFS_TYPE_INVALID; + int error = get_objset_type(ds, &type); if (error != 0) return (error); switch (type) { @@ -230,7 +229,7 @@ get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); char setpoint[ZFS_MAX_DATASET_NAME_LEN] = "Internal error - setpoint not determined"; - zfs_type_t ds_type; + zfs_type_t ds_type = ZFS_TYPE_INVALID; zprop_type_t prop_type = zfs_prop_get_type(zfs_prop); (void) get_objset_type(ds, &ds_type); @@ -344,19 +343,13 @@ get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, } break; case ZFS_PROP_RECEIVE_RESUME_TOKEN: { - char *token = get_receive_resume_stats_impl(ds); - - (void) strlcpy(strval, token, ZAP_MAXVALUELEN); - if (strcmp(strval, "") == 0) { - char *childval = get_child_receive_stats(ds); - - (void) strlcpy(strval, childval, ZAP_MAXVALUELEN); - if (strcmp(strval, "") == 0) - error = ENOENT; - - kmem_strfree(childval); + char *token = get_receive_resume_token(ds); + if (token != NULL) { + (void) strlcpy(strval, token, ZAP_MAXVALUELEN); + kmem_strfree(token); + } else { + error = ENOENT; } - kmem_strfree(token); break; } case ZFS_PROP_VOLSIZE: @@ -398,7 +391,7 @@ get_special_prop(lua_State *state, dsl_dataset_t 
*ds, const char *dsname, dsl_dataset_crypt_stats(ds, nvl); if (nvlist_lookup_nvlist(nvl, zfs_prop_to_name(zfs_prop), &propval) == 0) { - char *source; + const char *source; (void) nvlist_lookup_uint64(propval, ZPROP_VALUE, &numval); @@ -410,6 +403,10 @@ get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, break; } + case ZFS_PROP_SNAPSHOTS_CHANGED: + numval = dsl_dir_snap_cmtime(ds->ds_dir).tv_sec; + break; + default: /* Did not match these props, check in the dsl_dir */ error = get_dsl_dir_prop(ds, zfs_prop, &numval); @@ -470,11 +467,13 @@ get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop) } else { error = dsl_prop_get_ds(ds, prop_name, sizeof (numval), 1, &numval, setpoint); - + if (error != 0) + goto out; #ifdef _KERNEL /* Fill in temporary value for prop, if applicable */ (void) zfs_get_temporary_prop(ds, zfs_prop, &numval, setpoint); #else + kmem_free(strval, ZAP_MAXVALUELEN); return (luaL_error(state, "temporary properties only supported in kernel mode", prop_name)); @@ -491,6 +490,7 @@ get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop) (void) lua_pushnumber(state, numval); } } +out: kmem_free(strval, ZAP_MAXVALUELEN); if (error == 0) get_prop_src(state, setpoint, zfs_prop); @@ -503,8 +503,7 @@ get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop) boolean_t prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop) { - int error; - zfs_type_t zfs_type; + zfs_type_t zfs_type = ZFS_TYPE_INVALID; /* properties not supported */ if ((zfs_prop == ZFS_PROP_ISCSIOPTIONS) || @@ -515,7 +514,7 @@ prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop) if ((zfs_prop == ZFS_PROP_ORIGIN) && (!dsl_dir_is_clone(ds->ds_dir))) return (B_FALSE); - error = get_objset_type(ds, &zfs_type); + int error = get_objset_type(ds, &zfs_type); if (error != 0) return (B_FALSE); return (zfs_prop_valid_for_type(zfs_prop, zfs_type, B_FALSE)); @@ -611,8 +610,7 @@ parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type, */ int domain_len = strrchr(cp, '-') - cp; domain_val = kmem_alloc(domain_len + 1, KM_SLEEP); - (void) strncpy(domain_val, cp, domain_len); - domain_val[domain_len] = '\0'; + (void) strlcpy(domain_val, cp, domain_len + 1); cp += domain_len + 1; (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid); @@ -743,12 +741,12 @@ zcp_get_written_prop(lua_State *state, dsl_pool_t *dp, } static int zcp_get_prop(lua_State *state); -static zcp_lib_info_t zcp_get_prop_info = { +static const zcp_lib_info_t zcp_get_prop_info = { .name = "get_prop", .func = zcp_get_prop, .pargs = { - { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, - { .za_name = "property", .za_lua_type = LUA_TSTRING}, + { .za_name = "dataset", .za_lua_type = LUA_TSTRING }, + { .za_name = "property", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -762,7 +760,7 @@ zcp_get_prop(lua_State *state) const char *dataset_name; const char *property_name; dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - zcp_lib_info_t *libinfo = &zcp_get_prop_info; + const zcp_lib_info_t *libinfo = &zcp_get_prop_info; zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); diff --git a/sys/contrib/openzfs/module/zfs/zcp_iter.c b/sys/contrib/openzfs/module/zfs/zcp_iter.c index f727c56f212d..2da0bf9740e5 100644 --- a/sys/contrib/openzfs/module/zfs/zcp_iter.c +++ b/sys/contrib/openzfs/module/zfs/zcp_iter.c @@ -107,12 +107,12 @@ zcp_clones_iter(lua_State *state) } static int zcp_clones_list(lua_State *); -static zcp_list_info_t zcp_clones_list_info = { +static 
const zcp_list_info_t zcp_clones_list_info = { .name = "clones", .func = zcp_clones_list, .gc = NULL, .pargs = { - { .za_name = "snapshot", .za_lua_type = LUA_TSTRING}, + { .za_name = "snapshot", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -194,12 +194,12 @@ zcp_snapshots_iter(lua_State *state) } static int zcp_snapshots_list(lua_State *); -static zcp_list_info_t zcp_snapshots_list_info = { +static const zcp_list_info_t zcp_snapshots_list_info = { .name = "snapshots", .func = zcp_snapshots_list, .gc = NULL, .pargs = { - { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING}, + { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -281,12 +281,12 @@ zcp_children_iter(lua_State *state) } static int zcp_children_list(lua_State *); -static zcp_list_info_t zcp_children_list_info = { +static const zcp_list_info_t zcp_children_list_info = { .name = "children", .func = zcp_children_list, .gc = NULL, .pargs = { - { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING}, + { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -333,7 +333,7 @@ zcp_user_props_list_gc(lua_State *state) static int zcp_user_props_iter(lua_State *state) { - char *source, *val; + const char *source, *val; nvlist_t *nvprop; nvlist_t **props = lua_touserdata(state, lua_upvalueindex(1)); nvpair_t *pair = lua_touserdata(state, lua_upvalueindex(2)); @@ -361,13 +361,13 @@ zcp_user_props_iter(lua_State *state) } static int zcp_user_props_list(lua_State *); -static zcp_list_info_t zcp_user_props_list_info = { +static const zcp_list_info_t zcp_user_props_list_info = { .name = "user_properties", .func = zcp_user_props_list, .gc = zcp_user_props_list_gc, .pargs = { { .za_name = "filesystem | snapshot | volume", - .za_lua_type = LUA_TSTRING}, + .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -383,13 +383,13 @@ static zcp_list_info_t zcp_user_props_list_info = { * versions of ZFS, we declare 'properties' as an alias for * 'user_properties'. 
*/ -static zcp_list_info_t zcp_props_list_info = { +static const zcp_list_info_t zcp_props_list_info = { .name = "properties", .func = zcp_user_props_list, .gc = zcp_user_props_list_gc, .pargs = { { .za_name = "filesystem | snapshot | volume", - .za_lua_type = LUA_TSTRING}, + .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -444,11 +444,11 @@ zcp_dataset_system_props(dsl_dataset_t *ds, nvlist_t *nv) } static int zcp_system_props_list(lua_State *); -static zcp_list_info_t zcp_system_props_list_info = { +static const zcp_list_info_t zcp_system_props_list_info = { .name = "system_properties", .func = zcp_system_props_list, .pargs = { - { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, + { .za_name = "dataset", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -467,7 +467,7 @@ zcp_system_props_list(lua_State *state) char errbuf[128]; const char *dataset_name; dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - zcp_list_info_t *libinfo = &zcp_system_props_list_info; + const zcp_list_info_t *libinfo = &zcp_system_props_list_info; zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); dataset_name = lua_tostring(state, 1); nvlist_t *nv = fnvlist_alloc(); @@ -566,11 +566,11 @@ zcp_bookmarks_iter(lua_State *state) } static int zcp_bookmarks_list(lua_State *); -static zcp_list_info_t zcp_bookmarks_list_info = { +static const zcp_list_info_t zcp_bookmarks_list_info = { .name = "bookmarks", .func = zcp_bookmarks_list, .pargs = { - { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, + { .za_name = "dataset", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -654,12 +654,12 @@ zcp_holds_iter(lua_State *state) } static int zcp_holds_list(lua_State *); -static zcp_list_info_t zcp_holds_list_info = { +static const zcp_list_info_t zcp_holds_list_info = { .name = "holds", .func = zcp_holds_list, .gc = NULL, .pargs = { - { .za_name = "snapshot", .za_lua_type = LUA_TSTRING}, + { .za_name = "snapshot", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -710,8 +710,7 @@ zcp_list_func(lua_State *state) int zcp_load_list_lib(lua_State *state) { - int i; - zcp_list_info_t *zcp_list_funcs[] = { + const zcp_list_info_t *zcp_list_funcs[] = { &zcp_children_list_info, &zcp_snapshots_list_info, &zcp_user_props_list_info, @@ -725,8 +724,8 @@ zcp_load_list_lib(lua_State *state) lua_newtable(state); - for (i = 0; zcp_list_funcs[i] != NULL; i++) { - zcp_list_info_t *info = zcp_list_funcs[i]; + for (int i = 0; zcp_list_funcs[i] != NULL; i++) { + const zcp_list_info_t *info = zcp_list_funcs[i]; if (info->gc != NULL) { /* @@ -741,10 +740,9 @@ zcp_load_list_lib(lua_State *state) lua_pop(state, 1); } - lua_pushlightuserdata(state, info); + lua_pushlightuserdata(state, (void *)(uintptr_t)info); lua_pushcclosure(state, &zcp_list_func, 1); lua_setfield(state, -2, info->name); - info++; } return (1); diff --git a/sys/contrib/openzfs/module/zfs/zcp_synctask.c b/sys/contrib/openzfs/module/zfs/zcp_synctask.c index c6ade59b9ced..058910054d97 100644 --- a/sys/contrib/openzfs/module/zfs/zcp_synctask.c +++ b/sys/contrib/openzfs/module/zfs/zcp_synctask.c @@ -114,25 +114,25 @@ zcp_sync_task(lua_State *state, dsl_checkfunc_t *checkfunc, static int zcp_synctask_destroy(lua_State *, boolean_t, nvlist_t *); -static zcp_synctask_info_t zcp_synctask_destroy_info = { +static const zcp_synctask_info_t zcp_synctask_destroy_info = { .name = "destroy", .func = zcp_synctask_destroy, .pargs = { - {.za_name = "filesystem | snapshot", .za_lua_type = LUA_TSTRING}, + {.za_name = "filesystem | snapshot", 
.za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { - {.za_name = "defer", .za_lua_type = LUA_TBOOLEAN}, + {.za_name = "defer", .za_lua_type = LUA_TBOOLEAN }, {NULL, 0} }, .space_check = ZFS_SPACE_CHECK_DESTROY, .blocks_modified = 0 }; -/* ARGSUSED */ static int zcp_synctask_destroy(lua_State *state, boolean_t sync, nvlist_t *err_details) { + (void) err_details; int err; const char *dsname = lua_tostring(state, 1); @@ -167,11 +167,11 @@ zcp_synctask_destroy(lua_State *state, boolean_t sync, nvlist_t *err_details) } static int zcp_synctask_promote(lua_State *, boolean_t, nvlist_t *); -static zcp_synctask_info_t zcp_synctask_promote_info = { +static const zcp_synctask_info_t zcp_synctask_promote_info = { .name = "promote", .func = zcp_synctask_promote, .pargs = { - {.za_name = "clone", .za_lua_type = LUA_TSTRING}, + {.za_name = "clone", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -205,13 +205,13 @@ zcp_synctask_promote(lua_State *state, boolean_t sync, nvlist_t *err_details) } static int zcp_synctask_rollback(lua_State *, boolean_t, nvlist_t *err_details); -static zcp_synctask_info_t zcp_synctask_rollback_info = { +static const zcp_synctask_info_t zcp_synctask_rollback_info = { .name = "rollback", .func = zcp_synctask_rollback, .space_check = ZFS_SPACE_CHECK_RESERVED, .blocks_modified = 1, .pargs = { - {.za_name = "filesystem", .za_lua_type = LUA_TSTRING}, + {.za_name = "filesystem", .za_lua_type = LUA_TSTRING }, {0, 0} }, .kwargs = { @@ -236,12 +236,12 @@ zcp_synctask_rollback(lua_State *state, boolean_t sync, nvlist_t *err_details) } static int zcp_synctask_snapshot(lua_State *, boolean_t, nvlist_t *); -static zcp_synctask_info_t zcp_synctask_snapshot_info = { +static const zcp_synctask_info_t zcp_synctask_snapshot_info = { .name = "snapshot", .func = zcp_synctask_snapshot, .pargs = { {.za_name = "filesystem@snapname | volume@snapname", - .za_lua_type = LUA_TSTRING}, + .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -251,10 +251,10 @@ static zcp_synctask_info_t zcp_synctask_snapshot_info = { .blocks_modified = 3 }; -/* ARGSUSED */ static int zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details) { + (void) err_details; int err; dsl_dataset_snapshot_arg_t ddsa = { 0 }; const char *dsname = lua_tostring(state, 1); @@ -302,9 +302,45 @@ zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details) return (err); } +static int zcp_synctask_rename_snapshot(lua_State *, boolean_t, nvlist_t *); +static const zcp_synctask_info_t zcp_synctask_rename_snapshot_info = { + .name = "rename_snapshot", + .func = zcp_synctask_rename_snapshot, + .pargs = { + {.za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING }, + {.za_name = "oldsnapname", .za_lua_type = LUA_TSTRING }, + {.za_name = "newsnapname", .za_lua_type = LUA_TSTRING }, + {NULL, 0} + }, + .space_check = ZFS_SPACE_CHECK_RESERVED, + .blocks_modified = 1 +}; + +static int +zcp_synctask_rename_snapshot(lua_State *state, boolean_t sync, + nvlist_t *err_details) +{ + (void) err_details; + int err; + const char *fsname = lua_tostring(state, 1); + const char *oldsnapname = lua_tostring(state, 2); + const char *newsnapname = lua_tostring(state, 3); + + struct dsl_dataset_rename_snapshot_arg ddrsa = { 0 }; + ddrsa.ddrsa_fsname = fsname; + ddrsa.ddrsa_oldsnapname = oldsnapname; + ddrsa.ddrsa_newsnapname = newsnapname; + ddrsa.ddrsa_recursive = B_FALSE; + + err = zcp_sync_task(state, dsl_dataset_rename_snapshot_check, + dsl_dataset_rename_snapshot_sync, &ddrsa, sync, NULL); + + return 
(err); +} + static int zcp_synctask_inherit_prop(lua_State *, boolean_t, nvlist_t *err_details); -static zcp_synctask_info_t zcp_synctask_inherit_prop_info = { +static const zcp_synctask_info_t zcp_synctask_inherit_prop_info = { .name = "inherit", .func = zcp_synctask_inherit_prop, .space_check = ZFS_SPACE_CHECK_RESERVED, @@ -325,7 +361,7 @@ zcp_synctask_inherit_prop_check(void *arg, dmu_tx_t *tx) zcp_inherit_prop_arg_t *args = arg; zfs_prop_t prop = zfs_name_to_prop(args->zipa_prop); - if (prop == ZPROP_INVAL) { + if (prop == ZPROP_USERPROP) { if (zfs_prop_user(args->zipa_prop)) return (0); @@ -354,6 +390,7 @@ static int zcp_synctask_inherit_prop(lua_State *state, boolean_t sync, nvlist_t *err_details) { + (void) err_details; int err; zcp_inherit_prop_arg_t zipa = { 0 }; dsl_props_set_arg_t *dpsa = &zipa.zipa_dpsa; @@ -381,12 +418,12 @@ zcp_synctask_inherit_prop(lua_State *state, boolean_t sync, } static int zcp_synctask_bookmark(lua_State *, boolean_t, nvlist_t *); -static zcp_synctask_info_t zcp_synctask_bookmark_info = { +static const zcp_synctask_info_t zcp_synctask_bookmark_info = { .name = "bookmark", .func = zcp_synctask_bookmark, .pargs = { - {.za_name = "snapshot | bookmark", .za_lua_type = LUA_TSTRING}, - {.za_name = "bookmark", .za_lua_type = LUA_TSTRING}, + {.za_name = "snapshot | bookmark", .za_lua_type = LUA_TSTRING }, + {.za_name = "bookmark", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -396,10 +433,10 @@ static zcp_synctask_info_t zcp_synctask_bookmark_info = { .blocks_modified = 1, }; -/* ARGSUSED */ static int zcp_synctask_bookmark(lua_State *state, boolean_t sync, nvlist_t *err_details) { + (void) err_details; int err; const char *source = lua_tostring(state, 1); const char *new = lua_tostring(state, 2); @@ -424,15 +461,15 @@ zcp_synctask_bookmark(lua_State *state, boolean_t sync, nvlist_t *err_details) } static int zcp_synctask_set_prop(lua_State *, boolean_t, nvlist_t *err_details); -static zcp_synctask_info_t zcp_synctask_set_prop_info = { +static const zcp_synctask_info_t zcp_synctask_set_prop_info = { .name = "set_prop", .func = zcp_synctask_set_prop, .space_check = ZFS_SPACE_CHECK_RESERVED, .blocks_modified = 2, .pargs = { - { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, - { .za_name = "property", .za_lua_type = LUA_TSTRING}, - { .za_name = "value", .za_lua_type = LUA_TSTRING}, + { .za_name = "dataset", .za_lua_type = LUA_TSTRING }, + { .za_name = "property", .za_lua_type = LUA_TSTRING }, + { .za_name = "value", .za_lua_type = LUA_TSTRING }, { NULL, 0 } }, .kwargs = { @@ -443,6 +480,7 @@ static zcp_synctask_info_t zcp_synctask_set_prop_info = { static int zcp_synctask_set_prop(lua_State *state, boolean_t sync, nvlist_t *err_details) { + (void) err_details; int err; zcp_set_prop_arg_t args = { 0 }; @@ -522,12 +560,12 @@ zcp_synctask_wrapper(lua_State *state) int zcp_load_synctask_lib(lua_State *state, boolean_t sync) { - int i; - zcp_synctask_info_t *zcp_synctask_funcs[] = { + const zcp_synctask_info_t *zcp_synctask_funcs[] = { &zcp_synctask_destroy_info, &zcp_synctask_promote_info, &zcp_synctask_rollback_info, &zcp_synctask_snapshot_info, + &zcp_synctask_rename_snapshot_info, &zcp_synctask_inherit_prop_info, &zcp_synctask_bookmark_info, &zcp_synctask_set_prop_info, @@ -536,13 +574,12 @@ zcp_load_synctask_lib(lua_State *state, boolean_t sync) lua_newtable(state); - for (i = 0; zcp_synctask_funcs[i] != NULL; i++) { - zcp_synctask_info_t *info = zcp_synctask_funcs[i]; - lua_pushlightuserdata(state, info); + for (int i = 0; zcp_synctask_funcs[i] != 
NULL; i++) { + const zcp_synctask_info_t *info = zcp_synctask_funcs[i]; + lua_pushlightuserdata(state, (void *)(uintptr_t)info); lua_pushboolean(state, sync); lua_pushcclosure(state, &zcp_synctask_wrapper, 2); lua_setfield(state, -2, info->name); - info++; } return (1); diff --git a/sys/contrib/openzfs/module/zfs/zfeature.c b/sys/contrib/openzfs/module/zfs/zfeature.c index 9d16fff81d0a..1d25bc406866 100644 --- a/sys/contrib/openzfs/module/zfs/zfeature.c +++ b/sys/contrib/openzfs/module/zfs/zfeature.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -389,6 +389,13 @@ feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) !spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION) && feature->fi_feature == SPA_FEATURE_BOOKMARK_V2) spa->spa_errata = 0; + + /* + * Convert the old on-disk error log to the new format when activating + * the head_errlog feature. + */ + if (feature->fi_feature == SPA_FEATURE_HEAD_ERRLOG) + spa_upgrade_errlog(spa, tx); } static void diff --git a/sys/contrib/openzfs/module/zfs/zfs_byteswap.c b/sys/contrib/openzfs/module/zfs/zfs_byteswap.c index cd35849c3f37..8666883f09a2 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_byteswap.c +++ b/sys/contrib/openzfs/module/zfs/zfs_byteswap.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -36,9 +36,7 @@ static void zfs_oldace_byteswap(ace_t *ace, int ace_cnt) { - int i; - - for (i = 0; i != ace_cnt; i++, ace++) { + for (int i = 0; i != ace_cnt; i++, ace++) { ace->a_who = BSWAP_32(ace->a_who); ace->a_access_mask = BSWAP_32(ace->a_access_mask); ace->a_flags = BSWAP_16(ace->a_flags); @@ -138,23 +136,16 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) } } -/* ARGSUSED */ void zfs_oldacl_byteswap(void *buf, size_t size) { - int cnt; - /* * Arggh, since we don't know how many ACEs are in * the array, we have to swap the entire block */ - - cnt = size / sizeof (ace_t); - - zfs_oldace_byteswap((ace_t *)buf, cnt); + zfs_oldace_byteswap((ace_t *)buf, size / sizeof (ace_t)); } -/* ARGSUSED */ void zfs_acl_byteswap(void *buf, size_t size) { diff --git a/sys/contrib/openzfs/module/zfs/zfs_chksum.c b/sys/contrib/openzfs/module/zfs/zfs_chksum.c new file mode 100644 index 000000000000..acedeab7a163 --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/zfs_chksum.c @@ -0,0 +1,379 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de> + */ + +#include <sys/zio_checksum.h> +#include <sys/zfs_context.h> +#include <sys/zfs_chksum.h> +#include <sys/zfs_impl.h> + +#include <sys/blake3.h> +#include <sys/sha2.h> + +/* limit benchmarking to max 256KiB, when EdonR is slower then this: */ +#define LIMIT_PERF_MBS 300 + +typedef struct { + const char *name; + const char *impl; + uint64_t bs1k; + uint64_t bs4k; + uint64_t bs16k; + uint64_t bs64k; + uint64_t bs256k; + uint64_t bs1m; + uint64_t bs4m; + uint64_t bs16m; + zio_cksum_salt_t salt; + zio_checksum_t *(func); + zio_checksum_tmpl_init_t *(init); + zio_checksum_tmpl_free_t *(free); +} chksum_stat_t; + +static chksum_stat_t *chksum_stat_data = 0; +static int chksum_stat_cnt = 0; +static kstat_t *chksum_kstat = NULL; + +/* + * Sample output on i3-1005G1 System: + * + * implementation 1k 4k 16k 64k 256k 1m 4m 16m + * edonr-generic 1278 1625 1769 1776 1783 1778 1771 1767 + * skein-generic 548 594 613 623 621 623 621 486 + * sha256-generic 255 270 281 278 279 281 283 283 + * sha256-x64 288 310 316 317 318 317 317 316 + * sha256-ssse3 304 342 351 355 356 357 356 356 + * sha256-avx 311 348 359 362 362 363 363 362 + * sha256-avx2 330 378 389 395 395 395 395 395 + * sha256-shani 908 1127 1212 1230 1233 1234 1223 1230 + * sha512-generic 359 409 431 427 429 430 428 423 + * sha512-x64 420 473 490 496 497 497 496 495 + * sha512-avx 406 522 546 560 560 560 556 560 + * sha512-avx2 464 568 601 606 609 610 607 608 + * blake3-generic 330 327 324 323 324 320 323 322 + * blake3-sse2 424 1366 1449 1468 1458 1453 1395 1408 + * blake3-sse41 453 1554 1658 1703 1689 1669 1622 1630 + * blake3-avx2 452 2013 3225 3351 3356 3261 3076 3101 + * blake3-avx512 498 2869 5269 5926 5872 5643 5014 5005 + */ +static int +chksum_kstat_headers(char *buf, size_t size) +{ + ssize_t off = 0; + + off += kmem_scnprintf(buf + off, size, "%-23s", "implementation"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "1k"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "4k"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "16k"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "64k"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "256k"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "1m"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "4m"); + (void) kmem_scnprintf(buf + off, size - off, "%8s\n", "16m"); + + return (0); +} + +static int +chksum_kstat_data(char *buf, size_t size, void *data) +{ + chksum_stat_t *cs; + ssize_t off = 0; + char b[24]; + + cs = (chksum_stat_t *)data; + kmem_scnprintf(b, 23, "%s-%s", cs->name, cs->impl); + off += kmem_scnprintf(buf + off, size - off, "%-23s", b); + off += kmem_scnprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs1k); + off += kmem_scnprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs4k); + off += kmem_scnprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs16k); + off += kmem_scnprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs64k); + off += kmem_scnprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs256k); + off += kmem_scnprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs1m); + off += kmem_scnprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs4m); + (void) 
kmem_scnprintf(buf + off, size - off, "%8llu\n", + (u_longlong_t)cs->bs16m); + + return (0); +} + +static void * +chksum_kstat_addr(kstat_t *ksp, loff_t n) +{ + if (n < chksum_stat_cnt) + ksp->ks_private = (void *)(chksum_stat_data + n); + else + ksp->ks_private = NULL; + + return (ksp->ks_private); +} + +static void +chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round, + uint64_t *result) +{ + hrtime_t start; + uint64_t run_bw, run_time_ns, run_count = 0, size = 0; + uint32_t l, loops = 0; + zio_cksum_t zcp; + + switch (round) { + case 1: /* 1k */ + size = 1<<10; loops = 128; break; + case 2: /* 2k */ + size = 1<<12; loops = 64; break; + case 3: /* 4k */ + size = 1<<14; loops = 32; break; + case 4: /* 16k */ + size = 1<<16; loops = 16; break; + case 5: /* 256k */ + size = 1<<18; loops = 8; break; + case 6: /* 1m */ + size = 1<<20; loops = 4; break; + case 7: /* 4m */ + size = 1<<22; loops = 1; break; + case 8: /* 16m */ + size = 1<<24; loops = 1; break; + } + + kpreempt_disable(); + start = gethrtime(); + do { + for (l = 0; l < loops; l++, run_count++) + cs->func(abd, size, ctx, &zcp); + + run_time_ns = gethrtime() - start; + } while (run_time_ns < MSEC2NSEC(1)); + kpreempt_enable(); + + run_bw = size * run_count * NANOSEC; + run_bw /= run_time_ns; /* B/s */ + *result = run_bw/1024/1024; /* MiB/s */ +} + +#define LIMIT_INIT 0 +#define LIMIT_NEEDED 1 +#define LIMIT_NOLIMIT 2 + +static void +chksum_benchit(chksum_stat_t *cs) +{ + abd_t *abd; + void *ctx = 0; + void *salt = &cs->salt.zcs_bytes; + static int chksum_stat_limit = LIMIT_INIT; + + memset(salt, 0, sizeof (cs->salt.zcs_bytes)); + if (cs->init) + ctx = cs->init(&cs->salt); + + /* allocate test memory via abd linear interface */ + abd = abd_alloc_linear(1<<20, B_FALSE); + chksum_run(cs, abd, ctx, 1, &cs->bs1k); + chksum_run(cs, abd, ctx, 2, &cs->bs4k); + chksum_run(cs, abd, ctx, 3, &cs->bs16k); + chksum_run(cs, abd, ctx, 4, &cs->bs64k); + chksum_run(cs, abd, ctx, 5, &cs->bs256k); + + /* check if we ran on a slow cpu */ + if (chksum_stat_limit == LIMIT_INIT) { + if (cs->bs1k < LIMIT_PERF_MBS) { + chksum_stat_limit = LIMIT_NEEDED; + } else { + chksum_stat_limit = LIMIT_NOLIMIT; + } + } + + /* skip benchmarks >= 1MiB when the CPU is to slow */ + if (chksum_stat_limit == LIMIT_NEEDED) + goto abort; + + chksum_run(cs, abd, ctx, 6, &cs->bs1m); + abd_free(abd); + + /* allocate test memory via abd non linear interface */ + abd = abd_alloc(1<<24, B_FALSE); + chksum_run(cs, abd, ctx, 7, &cs->bs4m); + chksum_run(cs, abd, ctx, 8, &cs->bs16m); + +abort: + abd_free(abd); + + /* free up temp memory */ + if (cs->free) + cs->free(ctx); +} + +/* + * Initialize and benchmark all supported implementations. 
+ */ +static void +chksum_benchmark(void) +{ +#ifndef _KERNEL + /* we need the benchmark only for the kernel module */ + return; +#endif + + chksum_stat_t *cs; + uint64_t max; + uint32_t id, cbid = 0, id_save; + const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); + const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256"); + const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512"); + + /* count implementations */ + chksum_stat_cnt = 2; + chksum_stat_cnt += sha256->getcnt(); + chksum_stat_cnt += sha512->getcnt(); + chksum_stat_cnt += blake3->getcnt(); + chksum_stat_data = kmem_zalloc( + sizeof (chksum_stat_t) * chksum_stat_cnt, KM_SLEEP); + + /* edonr - needs to be the first one here (slow CPU check) */ + cs = &chksum_stat_data[cbid++]; + + /* edonr */ + cs->init = abd_checksum_edonr_tmpl_init; + cs->func = abd_checksum_edonr_native; + cs->free = abd_checksum_edonr_tmpl_free; + cs->name = "edonr"; + cs->impl = "generic"; + chksum_benchit(cs); + + /* skein */ + cs = &chksum_stat_data[cbid++]; + cs->init = abd_checksum_skein_tmpl_init; + cs->func = abd_checksum_skein_native; + cs->free = abd_checksum_skein_tmpl_free; + cs->name = "skein"; + cs->impl = "generic"; + chksum_benchit(cs); + + /* sha256 */ + id_save = sha256->getid(); + for (max = 0, id = 0; id < sha256->getcnt(); id++) { + sha256->setid(id); + cs = &chksum_stat_data[cbid++]; + cs->init = 0; + cs->func = abd_checksum_sha256; + cs->free = 0; + cs->name = sha256->name; + cs->impl = sha256->getname(); + chksum_benchit(cs); + if (cs->bs256k > max) { + max = cs->bs256k; + sha256->set_fastest(id); + } + } + sha256->setid(id_save); + + /* sha512 */ + id_save = sha512->getid(); + for (max = 0, id = 0; id < sha512->getcnt(); id++) { + sha512->setid(id); + cs = &chksum_stat_data[cbid++]; + cs->init = 0; + cs->func = abd_checksum_sha512_native; + cs->free = 0; + cs->name = sha512->name; + cs->impl = sha512->getname(); + chksum_benchit(cs); + if (cs->bs256k > max) { + max = cs->bs256k; + sha512->set_fastest(id); + } + } + sha512->setid(id_save); + + /* blake3 */ + id_save = blake3->getid(); + for (max = 0, id = 0; id < blake3->getcnt(); id++) { + blake3->setid(id); + cs = &chksum_stat_data[cbid++]; + cs->init = abd_checksum_blake3_tmpl_init; + cs->func = abd_checksum_blake3_native; + cs->free = abd_checksum_blake3_tmpl_free; + cs->name = blake3->name; + cs->impl = blake3->getname(); + chksum_benchit(cs); + if (cs->bs256k > max) { + max = cs->bs256k; + blake3->set_fastest(id); + } + } + blake3->setid(id_save); +} + +void +chksum_init(void) +{ +#ifdef _KERNEL + blake3_per_cpu_ctx_init(); +#endif + + /* Benchmark supported implementations */ + chksum_benchmark(); + + /* Install kstats for all implementations */ + chksum_kstat = kstat_create("zfs", 0, "chksum_bench", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + + if (chksum_kstat != NULL) { + chksum_kstat->ks_data = NULL; + chksum_kstat->ks_ndata = UINT32_MAX; + kstat_set_raw_ops(chksum_kstat, + chksum_kstat_headers, + chksum_kstat_data, + chksum_kstat_addr); + kstat_install(chksum_kstat); + } +} + +void +chksum_fini(void) +{ + if (chksum_kstat != NULL) { + kstat_delete(chksum_kstat); + chksum_kstat = NULL; + } + + if (chksum_stat_cnt) { + kmem_free(chksum_stat_data, + sizeof (chksum_stat_t) * chksum_stat_cnt); + chksum_stat_cnt = 0; + chksum_stat_data = 0; + } + +#ifdef _KERNEL + blake3_per_cpu_ctx_fini(); +#endif +} diff --git a/sys/contrib/openzfs/module/zfs/zfs_fm.c b/sys/contrib/openzfs/module/zfs/zfs_fm.c index 007f31b4e7b3..2f43c4aa41b8 100644 --- 
a/sys/contrib/openzfs/module/zfs/zfs_fm.c +++ b/sys/contrib/openzfs/module/zfs/zfs_fm.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -59,7 +59,7 @@ * read I/Os, there are basically three 'types' of I/O, which form a roughly * layered diagram: * - * +---------------+ + * +---------------+ * | Aggregate I/O | No associated logical data or device * +---------------+ * | @@ -124,14 +124,14 @@ static taskqid_t recent_events_cleaner_tqid; * This setting can be changed dynamically and setting it to zero * disables duplicate detection. */ -unsigned int zfs_zevent_retain_max = 2000; +static unsigned int zfs_zevent_retain_max = 2000; /* * The lifespan for a recent ereport entry. The default of 15 minutes is * intended to outlive the zfs diagnosis engine's threshold of 10 errors * over a period of 10 minutes. */ -unsigned int zfs_zevent_retain_expire_secs = 900; +static unsigned int zfs_zevent_retain_expire_secs = 900; typedef enum zfs_subclass { ZSC_IO, @@ -200,12 +200,53 @@ recent_events_compare(const void *a, const void *b) return (0); } +/* + * workaround: vdev properties don't have inheritance + */ +static uint64_t +vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop) +{ + uint64_t propdef, propval; + + propdef = vdev_prop_default_numeric(prop); + switch (prop) { + case VDEV_PROP_CHECKSUM_N: + propval = vd->vdev_checksum_n; + break; + case VDEV_PROP_CHECKSUM_T: + propval = vd->vdev_checksum_t; + break; + case VDEV_PROP_IO_N: + propval = vd->vdev_io_n; + break; + case VDEV_PROP_IO_T: + propval = vd->vdev_io_t; + break; + case VDEV_PROP_SLOW_IO_N: + propval = vd->vdev_slow_io_n; + break; + case VDEV_PROP_SLOW_IO_T: + propval = vd->vdev_slow_io_t; + break; + default: + propval = propdef; + break; + } + + if (propval != propdef) + return (propval); + + if (vd->vdev_parent == NULL) + return (propdef); + + return (vdev_prop_get_inherited(vd->vdev_parent, prop)); +} + static void zfs_ereport_schedule_cleaner(void); /* * background task to clean stale recent event nodes. 
*/ -/*ARGSUSED*/ static void zfs_ereport_cleaner(void *arg) { @@ -254,7 +295,6 @@ void zfs_ereport_clear(spa_t *spa, vdev_t *vd) { uint64_t vdev_guid, pool_guid; - int cnt = 0; ASSERT(vd != NULL || spa != NULL); if (vd == NULL) { @@ -278,7 +318,6 @@ zfs_ereport_clear(spa_t *spa, vdev_t *vd) avl_remove(&recent_events_tree, entry); list_remove(&recent_events_list, entry); kmem_free(entry, sizeof (*entry)); - cnt++; } } @@ -665,6 +704,69 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, DATA_TYPE_UINT64, zb->zb_blkid, NULL); } + /* + * Payload for tuning the zed + */ + if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) { + uint64_t cksum_n, cksum_t; + + cksum_n = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_N); + if (cksum_n != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N, + DATA_TYPE_UINT64, + cksum_n, + NULL); + + cksum_t = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_T); + if (cksum_t != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T, + DATA_TYPE_UINT64, + cksum_t, + NULL); + } + + if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) { + uint64_t io_n, io_t; + + io_n = vdev_prop_get_inherited(vd, VDEV_PROP_IO_N); + if (io_n != vdev_prop_default_numeric(VDEV_PROP_IO_N)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N, + DATA_TYPE_UINT64, + io_n, + NULL); + + io_t = vdev_prop_get_inherited(vd, VDEV_PROP_IO_T); + if (io_t != vdev_prop_default_numeric(VDEV_PROP_IO_T)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T, + DATA_TYPE_UINT64, + io_t, + NULL); + } + + if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { + uint64_t slow_io_n, slow_io_t; + + slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N); + if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N, + DATA_TYPE_UINT64, + slow_io_n, + NULL); + + slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T); + if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T, + DATA_TYPE_UINT64, + slow_io_t, + NULL); + } + mutex_exit(&spa->spa_errlist_lock); *ereport_out = ereport; @@ -678,10 +780,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, #define MAX_RANGES 16 typedef struct zfs_ecksum_info { - /* histograms of set and cleared bits by bit number in a 64-bit word */ - uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY]; - uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; - /* inline arrays of bits set and cleared. 
*/ uint64_t zei_bits_set[ZFM_MAX_INLINE]; uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; @@ -705,7 +803,7 @@ typedef struct zfs_ecksum_info { } zfs_ecksum_info_t; static void -update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count) +update_bad_bits(uint64_t value_arg, uint32_t *count) { size_t i; size_t bits = 0; @@ -713,10 +811,8 @@ update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count) /* We store the bits in big-endian (largest-first) order */ for (i = 0; i < 64; i++) { - if (value & (1ull << i)) { - hist[63 - i]++; + if (value & (1ull << i)) ++bits; - } } /* update the count of bits changed */ *count += bits; @@ -826,9 +922,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, const uint64_t *good; const uint64_t *bad; - uint64_t allset = 0; - uint64_t allcleared = 0; - size_t nui64s = size / sizeof (uint64_t); size_t inline_size; @@ -847,14 +940,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, if (info != NULL && info->zbc_has_cksum) { fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, - DATA_TYPE_UINT64_ARRAY, - sizeof (info->zbc_expected) / sizeof (uint64_t), - (uint64_t *)&info->zbc_expected, - FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, - DATA_TYPE_UINT64_ARRAY, - sizeof (info->zbc_actual) / sizeof (uint64_t), - (uint64_t *)&info->zbc_actual, FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, DATA_TYPE_STRING, info->zbc_checksum_name, @@ -930,9 +1015,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, // bits set in good, but not in bad cleared = (good[idx] & (~bad[idx])); - allset |= set; - allcleared |= cleared; - if (!no_inline) { ASSERT3U(offset, <, inline_size); eip->zei_bits_set[offset] = set; @@ -940,10 +1022,8 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, offset++; } - update_histogram(set, eip->zei_histogram_set, - &eip->zei_range_sets[range]); - update_histogram(cleared, eip->zei_histogram_cleared, - &eip->zei_range_clears[range]); + update_bad_bits(set, &eip->zei_range_sets[range]); + update_bad_bits(cleared, &eip->zei_range_clears[range]); } /* convert to byte offsets */ @@ -979,23 +1059,14 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, DATA_TYPE_UINT8_ARRAY, inline_size, (uint8_t *)eip->zei_bits_cleared, NULL); - } else { - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, - DATA_TYPE_UINT32_ARRAY, - NBBY * sizeof (uint64_t), eip->zei_histogram_set, - FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, - DATA_TYPE_UINT32_ARRAY, - NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, - NULL); } return (eip); } #else -/*ARGSUSED*/ void zfs_ereport_clear(spa_t *spa, vdev_t *vd) { + (void) spa, (void) vd; } #endif @@ -1025,10 +1096,7 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) return (B_FALSE); if (zio != NULL) { - /* - * If this is not a read or write zio, ignore the error. This - * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. 
- */ + /* If this is not a read or write zio, ignore the error */ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) return (B_FALSE); @@ -1072,6 +1140,8 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) (zio != NULL) && (!zio->io_timestamp)) { return (B_FALSE); } +#else + (void) subclass, (void) spa, (void) vd, (void) zio; #endif return (B_TRUE); } @@ -1112,6 +1182,9 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, /* Cleanup is handled by the callback function */ rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); +#else + (void) subclass, (void) spa, (void) vd, (void) zb, (void) zio, + (void) state; #endif return (rc); } @@ -1141,6 +1214,8 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) return (SET_ERROR(EBUSY)); +#else + (void) zb, (void) offset; #endif report = kmem_zalloc(sizeof (*report), KM_SLEEP); @@ -1150,7 +1225,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, /* copy the checksum failure information if it was provided */ if (info != NULL) { report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); - bcopy(info, report->zcr_ckinfo, sizeof (*info)); + memcpy(report->zcr_ckinfo, info, sizeof (*info)); } report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift; @@ -1193,6 +1268,9 @@ zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, report->zcr_ereport = report->zcr_detector = NULL; if (info != NULL) kmem_free(info, sizeof (*info)); +#else + (void) report, (void) good_data, (void) bad_data, + (void) drop_if_identical; #endif } @@ -1257,6 +1335,9 @@ zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); kmem_free(info, sizeof (*info)); } +#else + (void) spa, (void) vd, (void) zb, (void) zio, (void) offset, + (void) length, (void) good_data, (void) bad_data, (void) zbc; #endif return (rc); } @@ -1321,7 +1402,8 @@ zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, while ((elem = nvlist_next_nvpair(aux, elem)) != NULL) (void) nvlist_add_nvpair(resource, elem); } - +#else + (void) spa, (void) vd, (void) type, (void) name, (void) aux; #endif return (resource); } @@ -1336,6 +1418,8 @@ zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name, resource = zfs_event_create(spa, vd, type, name, aux); if (resource) zfs_zevent_post(resource, NULL, zfs_zevent_post_cb); +#else + (void) spa, (void) vd, (void) type, (void) name, (void) aux; #endif } @@ -1380,17 +1464,17 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) aux = fm_nvlist_create(NULL); if (vd && aux) { if (vd->vdev_physpath) { - (void) nvlist_add_string(aux, + fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH, vd->vdev_physpath); } if (vd->vdev_enc_sysfs_path) { - (void) nvlist_add_string(aux, + fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, vd->vdev_enc_sysfs_path); } - (void) nvlist_add_uint64(aux, + fnvlist_add_uint64(aux, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate); } @@ -1399,6 +1483,8 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) if (aux) fm_nvlist_destroy(aux, FM_NVA_FREE); +#else + (void) spa, (void) vd, (void) laststate; #endif } @@ -1434,9 +1520,8 @@ zfs_ereport_fini(void) { recent_events_node_t *entry; - while ((entry = list_head(&recent_events_list)) != NULL) { + while ((entry = 
list_remove_head(&recent_events_list)) != NULL) { avl_remove(&recent_events_tree, entry); - list_remove(&recent_events_list, entry); kmem_free(entry, sizeof (*entry)); } avl_destroy(&recent_events_tree); @@ -1450,7 +1535,7 @@ zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name) nvlist_t *aux; aux = fm_nvlist_create(NULL); - nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name); + fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name); zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux); fm_nvlist_destroy(aux, FM_NVA_FREE); @@ -1485,12 +1570,12 @@ zfs_ereport_zvol_post(const char *subclass, const char *name, return; aux = fm_nvlist_create(NULL); - nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name); - nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME, + fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name); + fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME, raw_name); r = strchr(name, '/'); if (r && r[1]) - nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]); + fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]); zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux); fm_nvlist_destroy(aux, FM_NVA_FREE); diff --git a/sys/contrib/openzfs/module/zfs/zfs_fuid.c b/sys/contrib/openzfs/module/zfs/zfs_fuid.c index a90bf5feeea1..add4241dcc99 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_fuid.c +++ b/sys/contrib/openzfs/module/zfs/zfs_fuid.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -61,7 +61,7 @@ typedef struct fuid_domain { uint64_t f_idx; } fuid_domain_t; -static char *nulldomain = ""; +static const char *const nulldomain = ""; /* * Compare two indexes. @@ -133,7 +133,7 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, for (i = 0; i != count; i++) { fuid_domain_t *domnode; - char *domain; + const char *domain; uint64_t idx; VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN, @@ -171,7 +171,7 @@ zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree) avl_destroy(idx_tree); } -char * +const char * zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx) { fuid_domain_t searchnode, *findnode; @@ -258,8 +258,8 @@ zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx) VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN, domnode->f_ksid->kd_name) == 0); } - VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, - fuids, numnodes) == 0); + fnvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, + (const nvlist_t * const *)fuids, numnodes); for (i = 0; i != numnodes; i++) nvlist_free(fuids[i]); kmem_free(fuids, numnodes * sizeof (void *)); @@ -290,9 +290,9 @@ zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx) * necessary for the caller or another thread to detect the dirty table * and sync out the changes. 
*/ -int +static int zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, - char **retdomain, boolean_t addok) + const char **retdomain, boolean_t addok) { fuid_domain_t searchnode, *findnode; avl_index_t loc; @@ -358,7 +358,7 @@ retry: const char * zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) { - char *domain; + const char *domain; if (idx == 0 || !zfsvfs->z_use_fuids) return (NULL); @@ -518,8 +518,7 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, uint64_t idx; ksid_t *ksid; uint32_t rid; - char *kdomain; - const char *domain; + const char *kdomain, *domain; uid_t id; VERIFY(type == ZFS_OWNER || type == ZFS_GROUP); @@ -574,8 +573,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp) { #ifdef HAVE_KSID - const char *domain; - char *kdomain; + const char *domain, *kdomain; uint32_t fuid_idx = FUID_INDEX(id); uint32_t rid = 0; idmap_stat status; @@ -624,7 +622,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, rid = FUID_RID(fuidp->z_fuid_group); idx = FUID_INDEX(fuidp->z_fuid_group); break; - }; + } domain = fuidp->z_domain_table[idx - 1]; } else { if (type == ZFS_OWNER || type == ZFS_ACE_USER) @@ -701,19 +699,15 @@ zfs_fuid_info_free(zfs_fuid_info_t *fuidp) zfs_fuid_t *zfuid; zfs_fuid_domain_t *zdomain; - while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) { - list_remove(&fuidp->z_fuids, zfuid); + while ((zfuid = list_remove_head(&fuidp->z_fuids)) != NULL) kmem_free(zfuid, sizeof (zfs_fuid_t)); - } if (fuidp->z_domain_table != NULL) kmem_free(fuidp->z_domain_table, (sizeof (char *)) * fuidp->z_domain_cnt); - while ((zdomain = list_head(&fuidp->z_domains)) != NULL) { - list_remove(&fuidp->z_domains, zdomain); + while ((zdomain = list_remove_head(&fuidp->z_domains)) != NULL) kmem_free(zdomain, sizeof (zfs_fuid_domain_t)); - } kmem_free(fuidp, sizeof (zfs_fuid_info_t)); } diff --git a/sys/contrib/openzfs/module/zfs/spa_boot.c b/sys/contrib/openzfs/module/zfs/zfs_impl.c index 674394650f82..20322ff98b31 100644 --- a/sys/contrib/openzfs/module/zfs/spa_boot.c +++ b/sys/contrib/openzfs/module/zfs/zfs_impl.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,31 +20,42 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de> */ -#ifdef _KERNEL +#include <sys/zio_checksum.h> +#include <sys/zfs_context.h> +#include <sys/zfs_impl.h> -#include <sys/zio.h> -#include <sys/spa_boot.h> -#include <sys/sunddi.h> +#include <sys/blake3.h> +#include <sys/sha2.h> -char * -spa_get_bootprop(char *propname) +/* + * impl_ops - backend for implementations of algorithms + */ +const zfs_impl_t *impl_ops[] = { + &zfs_blake3_ops, + &zfs_sha256_ops, + &zfs_sha512_ops, + NULL +}; + +/* + * zfs_impl_get_ops - Get the API functions for an impl backend + */ +const zfs_impl_t * +zfs_impl_get_ops(const char *algo) { - char *value; + const zfs_impl_t **ops = impl_ops; - if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), - DDI_PROP_DONTPASS, propname, &value) != DDI_SUCCESS) - return (NULL); - return (value); -} + if (!algo || !*algo) + return (*ops); -void -spa_free_bootprop(char *value) -{ - ddi_prop_free(value); -} + for (; *ops; ops++) { + if (strcmp(algo, (*ops)->name) == 0) + break; + } -#endif /* _KERNEL */ + ASSERT3P(ops, !=, NULL); + return (*ops); +} diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index 96a021acbc95..7b527eb75e83 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -23,11 +23,11 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. - * Portions Copyright 2012 Pawel Jakub Dawidek <pawel@dawidek.net> + * Copyright (c) 2012 Pawel Jakub Dawidek * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -38,8 +38,9 @@ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2021, 2024, Klara Inc. * Copyright (c) 2019, Allan Jude + * Copyright 2024 Oxide Computer Company */ /* @@ -222,23 +223,22 @@ #include <sys/zfs_ioctl_impl.h> kmutex_t zfsdev_state_lock; -zfsdev_state_t *zfsdev_state_list; +static zfsdev_state_t zfsdev_state_listhead; /* * Limit maximum nvlist size. We don't want users passing in insane values * for zc->zc_nvlist_src_size, since we will need to allocate that much memory. * Defaults to 0=auto which is handled by platform code. */ -unsigned long zfs_max_nvlist_src_size = 0; +uint64_t zfs_max_nvlist_src_size = 0; /* * When logging the output nvlist of an ioctl in the on-disk history, limit * the logged size to this many bytes. This must be less than DMU_MAX_ACCESS. * This applies primarily to zfs_ioc_channel_program(). 
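/*
 * Editorial sketch, not part of the patch: zfs_impl_get_ops() in the new
 * zfs_impl.c above walks a NULL-terminated table of checksum backends,
 * returns the first entry when no algorithm name is given, and yields NULL
 * when nothing matches. The same pattern stripped to stand-alone C
 * (struct and function names here are illustrative):
 */
#include <stddef.h>
#include <string.h>

typedef struct backend {
	const char *name;
} backend_t;

static const backend_t *
backend_lookup(const backend_t *const *table, const char *algo)
{
	const backend_t *const *ops = table;

	if (algo == NULL || *algo == '\0')
		return (*ops);		/* default: first registered backend */

	for (; *ops != NULL; ops++) {
		if (strcmp(algo, (*ops)->name) == 0)
			break;
	}
	return (*ops);			/* NULL if no name matched */
}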
*/ -unsigned long zfs_history_output_max = 1024 * 1024; +static uint64_t zfs_history_output_max = 1024 * 1024; -uint_t zfs_fsyncer_key; uint_t zfs_allow_log_key; /* DATA_TYPE_ANY is used when zkey_type can vary. */ @@ -373,10 +373,10 @@ zfs_log_history(zfs_cmd_t *zc) * Policy for top-level read operations (list pools). Requires no privileges, * and can be used in the local zone, as there is no associated dataset. */ -/* ARGSUSED */ static int zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc, (void) innvl, (void) cr; return (0); } @@ -384,10 +384,10 @@ zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) * Policy for dataset read operations (list children, get statistics). Requires * no privileges, but must be visible in the local zone. */ -/* ARGSUSED */ static int zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl, (void) cr; if (INGLOBALZONE(curproc) || zone_dataset_visible(zc->zc_name, NULL)) return (0); @@ -604,7 +604,7 @@ static int zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, cred_t *cr) { - char *strval; + const char *strval; /* * Check permissions for special properties. @@ -656,35 +656,29 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } -/* ARGSUSED */ static int zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - int error; - - error = zfs_dozonecheck(zc->zc_name, cr); - if (error != 0) - return (error); - /* * permission to set permissions will be evaluated later in * dsl_deleg_can_allow() */ - return (0); + (void) innvl; + return (zfs_dozonecheck(zc->zc_name, cr)); } -/* ARGSUSED */ static int zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_ROLLBACK, cr)); } -/* ARGSUSED */ static int zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; dsl_pool_t *dp; dsl_dataset_t *ds; const char *cp; @@ -717,10 +711,10 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) return (error); } -/* ARGSUSED */ static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_SEND, cr)); } @@ -728,12 +722,14 @@ zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) static int zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc, (void) innvl, (void) cr; return (SET_ERROR(ENOTSUP)); } static int zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc, (void) innvl, (void) cr; return (SET_ERROR(ENOTSUP)); } @@ -745,7 +741,7 @@ zfs_get_parent(const char *datasetname, char *parent, int parentsize) /* * Remove the @bla or /bla from the end of the name to get the parent. 
*/ - (void) strncpy(parent, datasetname, parentsize); + (void) strlcpy(parent, datasetname, parentsize); cp = strrchr(parent, '@'); if (cp != NULL) { cp[0] = '\0'; @@ -771,10 +767,10 @@ zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); } -/* ARGSUSED */ static int zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } @@ -782,10 +778,10 @@ zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) * Destroying snapshots with delegated permissions requires * descendant mount and destroy permissions. */ -/* ARGSUSED */ static int zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc; nvlist_t *snaps; nvpair_t *pair, *nextpair; int error = 0; @@ -844,17 +840,17 @@ zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) return (error); } -/* ARGSUSED */ static int zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); } -/* ARGSUSED */ static int zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; dsl_pool_t *dp; dsl_dataset_t *clone; int error; @@ -899,10 +895,10 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) return (error); } -/* ARGSUSED */ static int zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, @@ -917,13 +913,6 @@ zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) ZFS_DELEG_PERM_CREATE, cr)); } -/* ARGSUSED */ -static int -zfs_secpolicy_recv_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - return (zfs_secpolicy_recv(zc, innvl, cr)); -} - int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { @@ -934,10 +923,10 @@ zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) /* * Check for permission to create each snapshot in the nvlist. */ -/* ARGSUSED */ static int zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc; nvlist_t *snaps; int error = 0; nvpair_t *pair; @@ -946,7 +935,7 @@ zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { - char *name = nvpair_name(pair); + char *name = (char *)nvpair_name(pair); char *atp = strchr(name, '@'); if (atp == NULL) { @@ -965,15 +954,15 @@ zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) /* * Check for permission to create each bookmark in the nvlist. 
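/*
 * Editorial sketch, not part of the patch: the zfs_get_parent() change at
 * the top of this hunk swaps strncpy() for strlcpy(), which always
 * NUL-terminates the destination (for a non-zero size), so the strrchr()
 * calls that follow operate on a proper C string. Minimal illustration,
 * assuming a platform that provides strlcpy():
 */
#include <string.h>

static void
copy_parent_name(char *parent, const char *datasetname, size_t parentsize)
{
	/* truncates safely and terminates; strncpy() may leave no '\0' */
	(void) strlcpy(parent, datasetname, parentsize);
}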
*/ -/* ARGSUSED */ static int zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc; int error = 0; for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { - char *name = nvpair_name(pair); + char *name = (char *)nvpair_name(pair); char *hashp = strchr(name, '#'); if (hashp == NULL) { @@ -990,16 +979,16 @@ zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) return (error); } -/* ARGSUSED */ static int zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc; nvpair_t *pair, *nextpair; int error = 0; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nextpair) { - char *name = nvpair_name(pair); + char *name = (char *)nvpair_name(pair); char *hashp = strchr(name, '#'); nextpair = nvlist_next_nvpair(innvl, pair); @@ -1031,10 +1020,10 @@ zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) return (error); } -/* ARGSUSED */ static int zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc, (void) innvl, (void) cr; /* * Even root must have a proper TSD so that we know what pool * to log to. @@ -1047,9 +1036,9 @@ zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) static int zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - char parentname[ZFS_MAX_DATASET_NAME_LEN]; - int error; - char *origin; + char parentname[ZFS_MAX_DATASET_NAME_LEN]; + int error; + const char *origin; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) @@ -1072,10 +1061,11 @@ zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires * SYS_CONFIG privilege, which is not available in a local zone. */ -/* ARGSUSED */ int zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc, (void) innvl; + if (secpolicy_sys_config(cr, B_FALSE) != 0) return (SET_ERROR(EPERM)); @@ -1085,13 +1075,13 @@ zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) /* * Policy for object to name lookups. */ -/* ARGSUSED */ static int zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; int error; - if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) + if (secpolicy_sys_config(cr, B_FALSE) == 0) return (0); error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); @@ -1101,20 +1091,20 @@ zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) /* * Policy for fault injection. Requires all privileges. 
*/ -/* ARGSUSED */ static int zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc, (void) innvl; return (secpolicy_zinject(cr)); } -/* ARGSUSED */ static int zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); - if (prop == ZPROP_INVAL) { + if (prop == ZPROP_USERPROP) { if (!zfs_prop_user(zc->zc_value)) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, @@ -1174,18 +1164,18 @@ zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) userquota_perms[zc->zc_objset_type], cr)); } -/* ARGSUSED */ static int zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr)); } -/* ARGSUSED */ static int zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc; nvpair_t *pair; nvlist_t *holds; int error; @@ -1206,10 +1196,10 @@ zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) return (0); } -/* ARGSUSED */ static int zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc; nvpair_t *pair; int error; @@ -1240,8 +1230,8 @@ zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) */ int error; - if ((error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_DIFF, cr)) == 0) + if (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_DIFF, cr) == 0) return (0); error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); @@ -1289,8 +1279,7 @@ get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) packed = vmem_alloc(size, KM_SLEEP); - if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, - iflag)) != 0) { + if (ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag) != 0) { vmem_free(packed, size); return (SET_ERROR(EFAULT)); } @@ -1407,7 +1396,8 @@ getzfsvfs(const char *dsname, zfsvfs_t **zfvp) * which prevents all inode ops from running. 
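/*
 * Editorial sketch, not part of the patch: the recurring edit in these
 * zfs_secpolicy_*() hunks drops the old lint-style ARGSUSED comments and
 * instead casts the unused parameters to void at the top of the function,
 * which also satisfies -Wunused-parameter. The shape of the idiom, with
 * illustrative parameter types:
 */
static int
example_secpolicy(void *zc, void *innvl, void *cr)
{
	(void) zc, (void) innvl, (void) cr;	/* intentionally unused */
	return (0);
}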
*/ static int -zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) +zfsvfs_hold(const char *name, const void *tag, zfsvfs_t **zfvp, + boolean_t writer) { int error = 0; @@ -1432,7 +1422,7 @@ zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) } static void -zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) +zfsvfs_rele(zfsvfs_t *zfsvfs, const void *tag) { ZFS_TEARDOWN_EXIT(zfsvfs, tag); @@ -1470,7 +1460,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) nvlist_t *nvl = NULL; nvlist_t *hidden_args = NULL; uint64_t version = SPA_VERSION; - char *tname; + const char *tname; (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); @@ -1592,8 +1582,9 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc) nvlist_t *configs; int error; - if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) - return (SET_ERROR(EEXIST)); + error = spa_all_configs(&zc->zc_cookie, &configs); + if (error) + return (error); error = put_nvlist(zc, configs); @@ -1695,6 +1686,47 @@ zfs_ioc_pool_scan(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * poolname name of the pool + * scan_type scan func (pool_scan_func_t) + * scan_command scrub pause/resume flag (pool_scrub_cmd_t) + */ +static const zfs_ioc_key_t zfs_keys_pool_scrub[] = { + {"scan_type", DATA_TYPE_UINT64, 0}, + {"scan_command", DATA_TYPE_UINT64, 0}, +}; + +static int +zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa; + int error; + uint64_t scan_type, scan_cmd; + + if (nvlist_lookup_uint64(innvl, "scan_type", &scan_type) != 0) + return (SET_ERROR(EINVAL)); + if (nvlist_lookup_uint64(innvl, "scan_command", &scan_cmd) != 0) + return (SET_ERROR(EINVAL)); + + if (scan_cmd >= POOL_SCRUB_FLAGS_END) + return (SET_ERROR(EINVAL)); + + if ((error = spa_open(poolname, &spa, FTAG)) != 0) + return (error); + + if (scan_cmd == POOL_SCRUB_PAUSE) { + error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); + } else if (scan_type == POOL_SCAN_NONE) { + error = spa_scan_stop(spa); + } else { + error = spa_scan(spa, scan_type); + } + + spa_close(spa, FTAG); + return (error); +} + static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { @@ -1855,7 +1887,7 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); if (error == 0) { - error = spa_vdev_add(spa, config); + error = spa_vdev_add(spa, config, zc->zc_flags); nvlist_free(config); } spa_close(spa, FTAG); @@ -1921,6 +1953,10 @@ zfs_ioc_vdev_set_state(zfs_cmd_t *zc) error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); break; + case VDEV_STATE_REMOVED: + error = vdev_remove_wanted(spa, zc->zc_guid); + break; + default: error = SET_ERROR(EINVAL); } @@ -2044,7 +2080,7 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) dmu_objset_fast_stat(os, &zc->zc_objset_stats); - if (zc->zc_nvlist_dst != 0 && + if (!zc->zc_simple && zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_all(os, &nv)) == 0) { dmu_objset_stats(os, nv); /* @@ -2300,7 +2336,7 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) } while (error == 0) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { + if (issig()) { error = SET_ERROR(EINTR); break; } @@ -2331,6 +2367,7 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) } if (zc->zc_simple) { + dsl_dataset_fast_stat(ds, &zc->zc_objset_stats); dsl_dataset_rele(ds, FTAG); break; } @@ -2416,7 +2453,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, const char *strval = NULL; int err = -1; - if (prop == ZPROP_INVAL) { + if (prop == ZPROP_USERPROP) { if 
(zfs_prop_userquota(propname)) return (zfs_prop_set_userquota(dsname, pair)); return (-1); @@ -2486,11 +2523,27 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); break; - case ZFS_PROP_SNAPDEV: - err = zvol_set_snapdev(dsname, source, intval); + case ZFS_PROP_VOLTHREADING: + err = zvol_set_volthreading(dsname, intval); + /* + * Set err to -1 to force the zfs_set_prop_nvlist code down the + * default path to set the value in the nvlist. + */ + if (err == 0) + err = -1; break; + case ZFS_PROP_SNAPDEV: case ZFS_PROP_VOLMODE: - err = zvol_set_volmode(dsname, source, intval); + err = zvol_set_common(dsname, prop, source, intval); + break; + case ZFS_PROP_READONLY: + err = zvol_set_ro(dsname, intval); + /* + * Set err to -1 to force the zfs_set_prop_nvlist code down the + * default path to set the value in the nvlist. + */ + if (err == 0) + err = -1; break; case ZFS_PROP_VERSION: { @@ -2558,6 +2611,7 @@ zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, nvpair_t *pair; nvpair_t *propval; int rv = 0; + int err; uint64_t intval; const char *strval; boolean_t should_update_mount_cache = B_FALSE; @@ -2569,7 +2623,7 @@ retry: while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); - int err = 0; + err = 0; /* decode the property value */ propval = pair; @@ -2586,7 +2640,7 @@ retry: /* inherited properties are expected to be booleans */ if (nvpair_type(propval) != DATA_TYPE_BOOLEAN) err = SET_ERROR(EINVAL); - } else if (err == 0 && prop == ZPROP_INVAL) { + } else if (err == 0 && prop == ZPROP_USERPROP) { if (zfs_prop_user(propname)) { if (nvpair_type(propval) != DATA_TYPE_STRING) err = SET_ERROR(EINVAL); @@ -2668,47 +2722,52 @@ retry: goto retry; } - if (!nvlist_empty(genericnvl) && - dsl_props_set(dsname, source, genericnvl) != 0) { - /* - * If this fails, we still want to set as many properties as we - * can, so try setting them individually. - */ - pair = NULL; - while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { - const char *propname = nvpair_name(pair); - int err = 0; - - propval = pair; - if (nvpair_type(pair) == DATA_TYPE_NVLIST) { - nvlist_t *attrs; - attrs = fnvpair_value_nvlist(pair); - propval = fnvlist_lookup_nvpair(attrs, - ZPROP_VALUE); - } + if (nvlist_empty(genericnvl)) + goto out; - if (nvpair_type(propval) == DATA_TYPE_STRING) { - strval = fnvpair_value_string(propval); - err = dsl_prop_set_string(dsname, propname, - source, strval); - } else if (nvpair_type(propval) == DATA_TYPE_BOOLEAN) { - err = dsl_prop_inherit(dsname, propname, - source); - } else { - intval = fnvpair_value_uint64(propval); - err = dsl_prop_set_int(dsname, propname, source, - intval); - } + /* + * Try to set them all in one batch. + */ + err = dsl_props_set(dsname, source, genericnvl); + if (err == 0) + goto out; - if (err != 0) { - if (errlist != NULL) { - fnvlist_add_int32(errlist, propname, - err); - } - rv = err; + /* + * If batching fails, we still want to set as many properties as we + * can, so try setting them individually. 
+ */ + pair = NULL; + while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { + const char *propname = nvpair_name(pair); + + propval = pair; + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + attrs = fnvpair_value_nvlist(pair); + propval = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); + } + + if (nvpair_type(propval) == DATA_TYPE_STRING) { + strval = fnvpair_value_string(propval); + err = dsl_prop_set_string(dsname, propname, + source, strval); + } else if (nvpair_type(propval) == DATA_TYPE_BOOLEAN) { + err = dsl_prop_inherit(dsname, propname, source); + } else { + intval = fnvpair_value_uint64(propval); + err = dsl_prop_set_int(dsname, propname, source, + intval); + } + + if (err != 0) { + if (errlist != NULL) { + fnvlist_add_int32(errlist, propname, err); } + rv = err; } } + +out: if (should_update_mount_cache) zfs_ioctl_update_mount_cache(dsname); @@ -2856,11 +2915,11 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc) * and reservation to the received or default values even though * they are not considered inheritable. */ - if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) + if (prop != ZPROP_USERPROP && !zfs_prop_inheritable(prop)) return (SET_ERROR(EINVAL)); } - if (prop == ZPROP_INVAL) { + if (prop == ZPROP_USERPROP) { if (!zfs_prop_user(propname)) return (SET_ERROR(EINVAL)); @@ -2928,7 +2987,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); - spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); } mutex_exit(&spa_namespace_lock); if (spa != NULL) { @@ -2982,6 +3041,96 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc) } /* + * innvl: { + * "vdevprops_set_vdev" -> guid + * "vdevprops_set_props" -> { prop -> value } + * } + * + * outnvl: propname -> error code (int32) + */ +static const zfs_ioc_key_t zfs_keys_vdev_set_props[] = { + {ZPOOL_VDEV_PROPS_SET_VDEV, DATA_TYPE_UINT64, 0}, + {ZPOOL_VDEV_PROPS_SET_PROPS, DATA_TYPE_NVLIST, 0} +}; + +static int +zfs_ioc_vdev_set_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa; + int error; + vdev_t *vd; + uint64_t vdev_guid; + + /* Early validation */ + if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, + &vdev_guid) != 0) + return (SET_ERROR(EINVAL)); + + if (outnvl == NULL) + return (SET_ERROR(EINVAL)); + + if ((error = spa_open(poolname, &spa, FTAG)) != 0) + return (error); + + ASSERT(spa_writeable(spa)); + + if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOENT)); + } + + error = vdev_prop_set(vd, innvl, outnvl); + + spa_close(spa, FTAG); + + return (error); +} + +/* + * innvl: { + * "vdevprops_get_vdev" -> guid + * (optional) "vdevprops_get_props" -> { propname -> propid } + * } + * + * outnvl: propname -> value + */ +static const zfs_ioc_key_t zfs_keys_vdev_get_props[] = { + {ZPOOL_VDEV_PROPS_GET_VDEV, DATA_TYPE_UINT64, 0}, + {ZPOOL_VDEV_PROPS_GET_PROPS, DATA_TYPE_NVLIST, ZK_OPTIONAL} +}; + +static int +zfs_ioc_vdev_get_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa; + int error; + vdev_t *vd; + uint64_t vdev_guid; + + /* Early validation */ + if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, + &vdev_guid) != 0) + return (SET_ERROR(EINVAL)); + + if (outnvl == NULL) + return (SET_ERROR(EINVAL)); + + if ((error = spa_open(poolname, &spa, FTAG)) != 0) + return (error); + + if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { + 
spa_close(spa, FTAG); + return (SET_ERROR(ENOENT)); + } + + error = vdev_prop_get(vd, innvl, outnvl); + + spa_close(spa, FTAG); + + return (error); +} + +/* * inputs: * zc_name name of filesystem * zc_nvlist_src{_size} nvlist of delegated permissions @@ -3002,7 +3151,7 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc) /* * Verify nvlist is constructed correctly */ - if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { + if (zfs_deleg_verify_nvlist(fsaclnv) != 0) { nvlist_free(fsaclnv); return (SET_ERROR(EINVAL)); } @@ -3052,7 +3201,6 @@ zfs_ioc_get_fsacl(zfs_cmd_t *zc) return (error); } -/* ARGSUSED */ static void zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { @@ -3414,11 +3562,11 @@ static const zfs_ioc_key_t zfs_keys_remap[] = { /* no nvl keys */ }; -/* ARGSUSED */ static int zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { /* This IOCTL is no longer supported. */ + (void) fsname, (void) innvl, (void) outnvl; return (0); } @@ -3506,10 +3654,10 @@ static const zfs_ioc_key_t zfs_keys_log_history[] = { {"message", DATA_TYPE_STRING, 0}, }; -/* ARGSUSED */ static int zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { + (void) unused, (void) outnvl; const char *message; char *poolname; spa_t *spa; @@ -3612,10 +3760,10 @@ zfs_unmount_snap(const char *snapname) (void) zfsctl_snapshot_unmount(snapname, MNT_FORCE); } -/* ARGSUSED */ static int zfs_unmount_snap_cb(const char *snapname, void *arg) { + (void) arg; zfs_unmount_snap(snapname); return (0); } @@ -3659,7 +3807,6 @@ static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = { {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; -/* ARGSUSED */ static int zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { @@ -3712,10 +3859,10 @@ static const zfs_ioc_key_t zfs_keys_bookmark[] = { {"<bookmark>...", DATA_TYPE_STRING, ZK_WILDCARDLIST}, }; -/* ARGSUSED */ static int zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { + (void) poolname; return (dsl_bookmark_create(innvl, outnvl)); } @@ -3752,11 +3899,11 @@ static const zfs_ioc_key_t zfs_keys_get_bookmark_props[] = { /* no nvl keys */ }; -/* ARGSUSED */ static int zfs_ioc_get_bookmark_props(const char *bookmark, nvlist_t *innvl, nvlist_t *outnvl) { + (void) innvl; char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *bmname; @@ -3827,7 +3974,7 @@ static int zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { - char *program; + const char *program; uint64_t instrlimit, memlimit; boolean_t sync_flag; nvpair_t *nvarg = NULL; @@ -3861,10 +4008,10 @@ static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = { /* no nvl keys */ }; -/* ARGSUSED */ static int zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { + (void) innvl, (void) outnvl; return (spa_checkpoint(poolname)); } @@ -3876,11 +4023,11 @@ static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = { /* no nvl keys */ }; -/* ARGSUSED */ static int zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { + (void) innvl, (void) outnvl; return (spa_checkpoint_discard(poolname)); } @@ -3981,7 +4128,8 @@ zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) if (!(cmd_type == POOL_INITIALIZE_CANCEL || cmd_type == POOL_INITIALIZE_START || - cmd_type == POOL_INITIALIZE_SUSPEND)) { + cmd_type == POOL_INITIALIZE_SUSPEND || + cmd_type == POOL_INITIALIZE_UNINIT)) { return (SET_ERROR(EINVAL)); } @@ -4242,13 +4390,12 @@ static const zfs_ioc_key_t 
zfs_keys_rollback[] = { {"target", DATA_TYPE_STRING, ZK_OPTIONAL}, }; -/* ARGSUSED */ static int zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { zfsvfs_t *zfsvfs; zvol_state_handle_t *zv; - char *target = NULL; + const char *target = NULL; int error; (void) nvlist_lookup_string(innvl, "target", &target); @@ -4314,16 +4461,17 @@ recursive_unmount(const char *fsname, void *arg) * outnvl is unused */ -/* ARGSUSED */ static const zfs_ioc_key_t zfs_keys_redact[] = { {"bookname", DATA_TYPE_STRING, 0}, {"snapnv", DATA_TYPE_NVLIST, 0}, }; + static int zfs_ioc_redact(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { + (void) outnvl; nvlist_t *redactnvl = NULL; - char *redactbook = NULL; + const char *redactbook = NULL; if (nvlist_lookup_nvlist(innvl, "snapnv", &redactnvl) != 0) return (SET_ERROR(EINVAL)); @@ -4403,7 +4551,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) uint64_t intval, compval; int err; - if (prop == ZPROP_INVAL) { + if (prop == ZPROP_USERPROP) { if (zfs_prop_user(propname)) { if ((err = zfs_secpolicy_write_perms(dsname, ZFS_DELEG_PERM_USERPROP, cr))) @@ -4719,10 +4867,10 @@ propval_equals(nvpair_t *p1, nvpair_t *p2) return (B_FALSE); if (nvpair_type(p1) == DATA_TYPE_STRING) { - char *valstr1, *valstr2; + const char *valstr1, *valstr2; - VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); - VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); + VERIFY(nvpair_value_string(p1, &valstr1) == 0); + VERIFY(nvpair_value_string(p2, &valstr2) == 0); return (strcmp(valstr1, valstr2) == 0); } else { uint64_t intval1, intval2; @@ -4787,6 +4935,11 @@ extract_delay_props(nvlist_t *props) static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, ZFS_PROP_KEYLOCATION, + /* + * Setting ZFS_PROP_SHARESMB requires the objset type to be + * known, which is not possible prior to receipt of raw sends. + */ + ZFS_PROP_SHARESMB, 0 }; int i; @@ -4838,9 +4991,9 @@ static boolean_t zfs_ioc_recv_inject_err; * encountered errors, if any. It's the callers responsibility to free. 
*/ static int -zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, - nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force, - boolean_t resumable, int input_fd, +zfs_ioc_recv_impl(char *tofs, char *tosnap, const char *origin, + nvlist_t *recvprops, nvlist_t *localprops, nvlist_t *hidden_args, + boolean_t force, boolean_t heal, boolean_t resumable, int input_fd, dmu_replay_record_t *begin_record, uint64_t *read_bytes, uint64_t *errflags, nvlist_t **errors) { @@ -4850,6 +5003,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, offset_t off, noff; nvlist_t *local_delayprops = NULL; nvlist_t *recv_delayprops = NULL; + nvlist_t *inherited_delayprops = NULL; nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *origrecvd = NULL; /* existing received properties */ boolean_t first_recvd_props = B_FALSE; @@ -4865,7 +5019,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, return (SET_ERROR(EBADF)); noff = off = zfs_file_off(input_fp); - error = dmu_recv_begin(tofs, tosnap, begin_record, force, + error = dmu_recv_begin(tofs, tosnap, begin_record, force, heal, resumable, localprops, hidden_args, origin, &drc, input_fp, &off); if (error != 0) @@ -4949,7 +5103,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, /* -x property */ const char *name = nvpair_name(nvp); zfs_prop_t prop = zfs_name_to_prop(name); - if (prop != ZPROP_INVAL) { + if (prop != ZPROP_USERPROP) { if (!zfs_prop_inheritable(prop)) continue; } else if (!zfs_prop_user(name)) @@ -4964,6 +5118,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, local_delayprops = extract_delay_props(oprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, oprops, *errors); + inherited_delayprops = extract_delay_props(xprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, xprops, *errors); @@ -5021,6 +5176,10 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, local_delayprops, *errors); } + if (inherited_delayprops != NULL && error == 0) { + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, + inherited_delayprops, *errors); + } } /* @@ -5040,6 +5199,10 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0); nvlist_free(local_delayprops); } + if (inherited_delayprops != NULL) { + ASSERT(nvlist_merge(localprops, inherited_delayprops, 0) == 0); + nvlist_free(inherited_delayprops); + } *read_bytes = off - noff; #ifdef ZFS_DEBUG @@ -5176,15 +5339,16 @@ zfs_ioc_recv(zfs_cmd_t *zc) nvlist_t *errors = NULL; nvlist_t *recvdprops = NULL; nvlist_t *localprops = NULL; - char *origin = NULL; + const char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; int error = 0; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || - strchr(zc->zc_value, '%')) + strchr(zc->zc_value, '%') != NULL) { return (SET_ERROR(EINVAL)); + } (void) strlcpy(tofs, zc->zc_value, sizeof (tofs)); tosnap = strchr(tofs, '@'); @@ -5192,13 +5356,15 @@ zfs_ioc_recv(zfs_cmd_t *zc) if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &recvdprops)) != 0) - return (error); + zc->zc_iflags, &recvdprops)) != 0) { + goto out; + } if (zc->zc_nvlist_conf != 0 && (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - zc->zc_iflags, 
&localprops)) != 0) - return (error); + zc->zc_iflags, &localprops)) != 0) { + goto out; + } if (zc->zc_string[0]) origin = zc->zc_string; @@ -5208,10 +5374,8 @@ zfs_ioc_recv(zfs_cmd_t *zc) begin_record.drr_u.drr_begin = zc->zc_begin_record; error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops, - NULL, zc->zc_guid, B_FALSE, zc->zc_cookie, &begin_record, + NULL, zc->zc_guid, B_FALSE, B_FALSE, zc->zc_cookie, &begin_record, &zc->zc_cookie, &zc->zc_obj, &errors); - nvlist_free(recvdprops); - nvlist_free(localprops); /* * Now that all props, initial and delayed, are set, report the prop @@ -5227,7 +5391,10 @@ zfs_ioc_recv(zfs_cmd_t *zc) error = SET_ERROR(EINVAL); } +out: nvlist_free(errors); + nvlist_free(recvdprops); + nvlist_free(localprops); return (error); } @@ -5241,6 +5408,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) * "begin_record" -> non-byteswapped dmu_replay_record_t * "input_fd" -> file descriptor to read stream from (int32) * (optional) "force" -> force flag (value ignored) + * (optional) "heal" -> use send stream to heal data corruption * (optional) "resumable" -> resumable flag (value ignored) * (optional) "cleanup_fd" -> unused * (optional) "action_handle" -> unused @@ -5261,6 +5429,7 @@ static const zfs_ioc_key_t zfs_keys_recv_new[] = { {"begin_record", DATA_TYPE_BYTE_ARRAY, 0}, {"input_fd", DATA_TYPE_INT32, 0}, {"force", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"heal", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resumable", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"action_handle", DATA_TYPE_UINT64, ZK_OPTIONAL}, @@ -5276,11 +5445,12 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) nvlist_t *recvprops = NULL; nvlist_t *localprops = NULL; nvlist_t *hidden_args = NULL; - char *snapname; - char *origin = NULL; + const char *snapname; + const char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; boolean_t force; + boolean_t heal; boolean_t resumable; uint64_t read_bytes = 0; uint64_t errflags = 0; @@ -5291,8 +5461,9 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) if (dataset_namecheck(snapname, NULL, NULL) != 0 || strchr(snapname, '@') == NULL || - strchr(snapname, '%')) + strchr(snapname, '%') != NULL) { return (SET_ERROR(EINVAL)); + } (void) strlcpy(tofs, snapname, sizeof (tofs)); tosnap = strchr(tofs, '@'); @@ -5310,36 +5481,47 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) input_fd = fnvlist_lookup_int32(innvl, "input_fd"); force = nvlist_exists(innvl, "force"); + heal = nvlist_exists(innvl, "heal"); resumable = nvlist_exists(innvl, "resumable"); /* we still use "props" here for backwards compatibility */ error = nvlist_lookup_nvlist(innvl, "props", &recvprops); if (error && error != ENOENT) - return (error); + goto out; error = nvlist_lookup_nvlist(innvl, "localprops", &localprops); if (error && error != ENOENT) - return (error); + goto out; error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); if (error && error != ENOENT) - return (error); + goto out; error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops, - hidden_args, force, resumable, input_fd, begin_record, + hidden_args, force, heal, resumable, input_fd, begin_record, &read_bytes, &errflags, &errors); fnvlist_add_uint64(outnvl, "read_bytes", read_bytes); fnvlist_add_uint64(outnvl, "error_flags", errflags); fnvlist_add_nvlist(outnvl, "errors", errors); +out: nvlist_free(errors); nvlist_free(recvprops); nvlist_free(localprops); + nvlist_free(hidden_args); return 
(error); } +/* + * When stack space is limited, we write replication stream data to the target + * on a separate taskq thread, to make sure there's enough stack space. + */ +#ifndef HAVE_LARGE_STACKS +#define USE_SEND_TASKQ 1 +#endif + typedef struct dump_bytes_io { zfs_file_t *dbi_fp; caddr_t dbi_buf; @@ -5360,31 +5542,65 @@ dump_bytes_cb(void *arg) dbi->dbi_err = zfs_file_write(fp, buf, dbi->dbi_len, NULL); } +typedef struct dump_bytes_arg { + zfs_file_t *dba_fp; +#ifdef USE_SEND_TASKQ + taskq_t *dba_tq; + taskq_ent_t dba_tqent; +#endif +} dump_bytes_arg_t; + static int dump_bytes(objset_t *os, void *buf, int len, void *arg) { + dump_bytes_arg_t *dba = (dump_bytes_arg_t *)arg; dump_bytes_io_t dbi; - dbi.dbi_fp = arg; + dbi.dbi_fp = dba->dba_fp; dbi.dbi_buf = buf; dbi.dbi_len = len; -#if defined(HAVE_LARGE_STACKS) - dump_bytes_cb(&dbi); +#ifdef USE_SEND_TASKQ + taskq_dispatch_ent(dba->dba_tq, dump_bytes_cb, &dbi, TQ_SLEEP, + &dba->dba_tqent); + taskq_wait(dba->dba_tq); #else - /* - * The vn_rdwr() call is performed in a taskq to ensure that there is - * always enough stack space to write safely to the target filesystem. - * The ZIO_TYPE_FREE threads are used because there can be a lot of - * them and they are used in vdev_file.c for a similar purpose. - */ - spa_taskq_dispatch_sync(dmu_objset_spa(os), ZIO_TYPE_FREE, - ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP); -#endif /* HAVE_LARGE_STACKS */ + dump_bytes_cb(&dbi); +#endif return (dbi.dbi_err); } +static int +dump_bytes_init(dump_bytes_arg_t *dba, int fd, dmu_send_outparams_t *out) +{ + zfs_file_t *fp = zfs_file_get(fd); + if (fp == NULL) + return (SET_ERROR(EBADF)); + + dba->dba_fp = fp; +#ifdef USE_SEND_TASKQ + dba->dba_tq = taskq_create("z_send", 1, defclsyspri, 0, 0, 0); + taskq_init_ent(&dba->dba_tqent); +#endif + + memset(out, 0, sizeof (dmu_send_outparams_t)); + out->dso_outfunc = dump_bytes; + out->dso_arg = dba; + out->dso_dryrun = B_FALSE; + + return (0); +} + +static void +dump_bytes_fini(dump_bytes_arg_t *dba) +{ + zfs_file_put(dba->dba_fp); +#ifdef USE_SEND_TASKQ + taskq_destroy(dba->dba_tq); +#endif +} + /* * inputs: * zc_name name of snapshot to send @@ -5469,21 +5685,18 @@ zfs_ioc_send(zfs_cmd_t *zc) dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } else { - zfs_file_t *fp; - dmu_send_outparams_t out = {0}; - - if ((fp = zfs_file_get(zc->zc_cookie)) == NULL) - return (SET_ERROR(EBADF)); + dump_bytes_arg_t dba; + dmu_send_outparams_t out; + error = dump_bytes_init(&dba, zc->zc_cookie, &out); + if (error) + return (error); - off = zfs_file_off(fp); - out.dso_outfunc = dump_bytes; - out.dso_arg = fp; - out.dso_dryrun = B_FALSE; + off = zfs_file_off(dba.dba_fp); error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj, embedok, large_block_ok, compressok, rawok, savedok, zc->zc_cookie, &off, &out); - zfs_file_put(fp); + dump_bytes_fini(&dba); } return (error); } @@ -5585,17 +5798,12 @@ zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; - size_t count = (size_t)zc->zc_nvlist_dst_size; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, - &count); - if (error == 0) - zc->zc_nvlist_dst_size = count; - else - zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); + &zc->zc_nvlist_dst_size); spa_close(spa, FTAG); @@ -5654,10 +5862,13 @@ zfs_ioc_clear(zfs_cmd_t *zc) /* * If multihost is enabled, resuming I/O is unsafe as another - * host may have imported the pool. + * host may have imported the pool. 
Check for remote activity. */ - if (spa_multihost(spa) && spa_suspended(spa)) - return (SET_ERROR(EINVAL)); + if (spa_multihost(spa) && spa_suspended(spa) && + spa_mmp_remote_host_activity(spa)) { + spa_close(spa, FTAG); + return (SET_ERROR(EREMOTEIO)); + } spa_vdev_state_enter(spa, SCL_NONE); @@ -5703,10 +5914,10 @@ static const zfs_ioc_key_t zfs_keys_pool_reopen[] = { {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; -/* ARGSUSED */ static int zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) { + (void) outnvl; spa_t *spa; int error; boolean_t rc, scrub_restart = B_TRUE; @@ -6005,10 +6216,6 @@ zfs_ioc_share(zfs_cmd_t *zc) return (SET_ERROR(ENOSYS)); } -ace_t full_access[] = { - {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} -}; - /* * inputs: * zc_name name of containing filesystem @@ -6117,10 +6324,10 @@ static const zfs_ioc_key_t zfs_keys_hold[] = { {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, }; -/* ARGSUSED */ static int zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) { + (void) pool; nvpair_t *pair; nvlist_t *holds; int cleanup_fd = -1; @@ -6133,7 +6340,7 @@ zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) /* make sure the user didn't pass us any invalid (empty) tags */ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { - char *htag; + const char *htag; error = nvpair_value_string(pair, &htag); if (error != 0) @@ -6169,10 +6376,10 @@ static const zfs_ioc_key_t zfs_keys_get_holds[] = { /* no nvl keys */ }; -/* ARGSUSED */ static int zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { + (void) args; return (dsl_dataset_get_holds(snapname, outnvl)); } @@ -6191,10 +6398,10 @@ static const zfs_ioc_key_t zfs_keys_release[] = { {"<snapname>...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST}, }; -/* ARGSUSED */ static int zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { + (void) pool; return (dsl_dataset_user_release(holds, errlist)); } @@ -6252,7 +6459,7 @@ zfs_ioc_events_next(zfs_cmd_t *zc) static int zfs_ioc_events_clear(zfs_cmd_t *zc) { - int count; + uint_t count; zfs_zevent_drain_all(&count); zc->zc_cookie = count; @@ -6353,7 +6560,7 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) int error; dsl_pool_t *dp; dsl_dataset_t *new, *old; - char *firstsnap; + const char *firstsnap; uint64_t used, comp, uncomp; firstsnap = fnvlist_lookup_string(innvl, "firstsnap"); @@ -6428,15 +6635,14 @@ static const zfs_ioc_key_t zfs_keys_send_new[] = { {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, }; -/* ARGSUSED */ static int zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { + (void) outnvl; int error; offset_t off; - char *fromname = NULL; + const char *fromname = NULL; int fd; - zfs_file_t *fp; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; @@ -6444,7 +6650,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) boolean_t savedok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; - char *redactbook = NULL; + const char *redactbook = NULL; fd = fnvlist_lookup_int32(innvl, "fd"); @@ -6461,28 +6667,28 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) (void) nvlist_lookup_string(innvl, "redactbook", &redactbook); - if ((fp = zfs_file_get(fd)) == NULL) - return (SET_ERROR(EBADF)); - - off = zfs_file_off(fp); + dump_bytes_arg_t dba; + dmu_send_outparams_t out; + error = dump_bytes_init(&dba, fd, &out); + if (error) + return (error); - 
dmu_send_outparams_t out = {0}; - out.dso_outfunc = dump_bytes; - out.dso_arg = fp; - out.dso_dryrun = B_FALSE; + off = zfs_file_off(dba.dba_fp); error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactbook, fd, &off, &out); - zfs_file_put(fp); + dump_bytes_fini(&dba); + return (error); } -/* ARGSUSED */ static int send_space_sum(objset_t *os, void *buf, int len, void *arg) { + (void) os, (void) buf; uint64_t *size = arg; + *size += len; return (0); } @@ -6533,8 +6739,8 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; int error; - char *fromname = NULL; - char *redactlist_book = NULL; + const char *fromname = NULL; + const char *redactlist_book = NULL; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; @@ -6674,10 +6880,10 @@ static const zfs_ioc_key_t zfs_keys_pool_sync[] = { {"force", DATA_TYPE_BOOLEAN_VALUE, 0}, }; -/* ARGSUSED */ static int zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) { + (void) onvl; int err; boolean_t rc, force = B_FALSE; spa_t *spa; @@ -6717,10 +6923,10 @@ static const zfs_ioc_key_t zfs_keys_load_key[] = { {"noop", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; -/* ARGSUSED */ static int zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { + (void) outnvl; int ret; dsl_crypto_params_t *dcp = NULL; nvlist_t *hidden_args; @@ -6759,10 +6965,10 @@ static const zfs_ioc_key_t zfs_keys_unload_key[] = { /* no nvl keys */ }; -/* ARGSUSED */ static int zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { + (void) innvl, (void) outnvl; int ret = 0; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { @@ -6780,7 +6986,7 @@ out: /* * Changes a user's wrapping key used to decrypt a dataset. The keyformat, - * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified + * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified * here to change how the key is derived in userspace. 
* * innvl: { @@ -6797,10 +7003,10 @@ static const zfs_ioc_key_t zfs_keys_change_key[] = { {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; -/* ARGSUSED */ static int zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { + (void) outnvl; int ret; uint64_t cmd = DCP_CMD_NONE; dsl_crypto_params_t *dcp = NULL; @@ -7029,7 +7235,7 @@ zfs_ioctl_init(void) ARRAY_SIZE(zfs_keys_destroy_bookmarks)); zfs_ioctl_register("receive", ZFS_IOC_RECV_NEW, - zfs_ioc_recv_new, zfs_secpolicy_recv_new, DATASET_NAME, + zfs_ioc_recv_new, zfs_secpolicy_recv, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_recv_new, ARRAY_SIZE(zfs_keys_recv_new)); zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY, @@ -7107,6 +7313,21 @@ zfs_ioctl_init(void) POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE, zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv)); + zfs_ioctl_register("zpool_vdev_get_props", ZFS_IOC_VDEV_GET_PROPS, + zfs_ioc_vdev_get_props, zfs_secpolicy_read, POOL_NAME, + POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_vdev_get_props, + ARRAY_SIZE(zfs_keys_vdev_get_props)); + + zfs_ioctl_register("zpool_vdev_set_props", ZFS_IOC_VDEV_SET_PROPS, + zfs_ioc_vdev_set_props, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, + zfs_keys_vdev_set_props, ARRAY_SIZE(zfs_keys_vdev_set_props)); + + zfs_ioctl_register("scrub", ZFS_IOC_POOL_SCRUB, + zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_NONE, B_TRUE, B_TRUE, + zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub)); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, @@ -7268,7 +7489,7 @@ zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec) */ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { - char *name = nvpair_name(pair); + const char *name = nvpair_name(pair); data_type_t type = nvpair_type(pair); boolean_t identified = B_FALSE; @@ -7359,7 +7580,7 @@ zfsdev_getminor(zfs_file_t *fp, minor_t *minorp) mutex_enter(&zfsdev_state_lock); - for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { + for (zs = &zfsdev_state_listhead; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) continue; @@ -7381,9 +7602,9 @@ zfsdev_get_state(minor_t minor, enum zfsdev_state_type which) { zfsdev_state_t *zs; - for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { + for (zs = &zfsdev_state_listhead; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == minor) { - smp_rmb(); + membar_consumer(); switch (which) { case ZST_ONEXIT: return (zs->zs_onexit); @@ -7435,7 +7656,7 @@ zfsdev_state_init(void *priv) if (minor == 0) return (SET_ERROR(ENXIO)); - for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { + for (zs = &zfsdev_state_listhead; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) break; zsprev = zs; @@ -7719,13 +7940,11 @@ zfs_kmod_init(void) zfs_ioctl_init(); mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); - zfsdev_state_list = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); - zfsdev_state_list->zs_minor = -1; + zfsdev_state_listhead.zs_minor = -1; if ((error = zfsdev_attach()) != 0) goto out; - tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); @@ -7747,13 +7966,14 @@ zfs_kmod_fini(void) mutex_destroy(&zfsdev_state_lock); - for (zs = zfsdev_state_list; zs != NULL; zs = zsnext) { + for (zs = &zfsdev_state_listhead; zs != NULL; 
zs = zsnext) { zsnext = zs->zs_next; if (zs->zs_onexit) zfs_onexit_destroy(zs->zs_onexit); if (zs->zs_zevent) zfs_zevent_destroy(zs->zs_zevent); - kmem_free(zs, sizeof (zfsdev_state_t)); + if (zs != &zfsdev_state_listhead) + kmem_free(zs, sizeof (zfsdev_state_t)); } zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */ @@ -7761,15 +7981,12 @@ zfs_kmod_fini(void) spa_fini(); zvol_fini(); - tsd_destroy(&zfs_fsyncer_key); tsd_destroy(&rrw_tsd_key); tsd_destroy(&zfs_allow_log_key); } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, ULONG, ZMOD_RW, - "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls"); +ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, U64, ZMOD_RW, + "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls"); -ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, ULONG, ZMOD_RW, - "Maximum size in bytes of ZFS ioctl output that will be logged"); -/* END CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, U64, ZMOD_RW, + "Maximum size in bytes of ZFS ioctl output that will be logged"); diff --git a/sys/contrib/openzfs/module/zfs/zfs_log.c b/sys/contrib/openzfs/module/zfs/zfs_log.c index e248dc3cc4e8..fa4e7093ca46 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_log.c +++ b/sys/contrib/openzfs/module/zfs/zfs_log.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2015, 2018 by Delphix. All rights reserved. + * Copyright (c) 2022 by Pawel Jakub Dawidek */ @@ -107,86 +108,81 @@ zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap) static void zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) { - uint32_t *bitmap; - uint64_t *attrs; - uint64_t *crtime; - xoptattr_t *xoap; - void *scanstamp; - int i; + xoptattr_t *xoap; xoap = xva_getxoptattr(xvap); ASSERT(xoap); lrattr->lr_attr_masksize = xvap->xva_mapsize; - bitmap = &lrattr->lr_attr_bitmap; - for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) { + uint32_t *bitmap = &lrattr->lr_attr_bitmap; + for (int i = 0; i != xvap->xva_mapsize; i++, bitmap++) *bitmap = xvap->xva_reqattrmap[i]; - } - /* Now pack the attributes up in a single uint64_t */ - attrs = (uint64_t *)bitmap; - *attrs = 0; - crtime = attrs + 1; - bzero(crtime, 2 * sizeof (uint64_t)); - scanstamp = (caddr_t)(crtime + 2); - bzero(scanstamp, AV_SCANSTAMP_SZ); + lr_attr_end_t *end = (lr_attr_end_t *)bitmap; + end->lr_attr_attrs = 0; + end->lr_attr_crtime[0] = 0; + end->lr_attr_crtime[1] = 0; + memset(end->lr_attr_scanstamp, 0, AV_SCANSTAMP_SZ); + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) - *attrs |= (xoap->xoa_readonly == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_readonly == 0) ? 0 : XAT0_READONLY; if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) - *attrs |= (xoap->xoa_hidden == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_hidden == 0) ? 0 : XAT0_HIDDEN; if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) - *attrs |= (xoap->xoa_system == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_system == 0) ? 0 : XAT0_SYSTEM; if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) - *attrs |= (xoap->xoa_archive == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_archive == 0) ? 
0 : XAT0_ARCHIVE; if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) - *attrs |= (xoap->xoa_immutable == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_immutable == 0) ? 0 : XAT0_IMMUTABLE; if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) - *attrs |= (xoap->xoa_nounlink == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_nounlink == 0) ? 0 : XAT0_NOUNLINK; if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) - *attrs |= (xoap->xoa_appendonly == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_appendonly == 0) ? 0 : XAT0_APPENDONLY; if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) - *attrs |= (xoap->xoa_opaque == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_opaque == 0) ? 0 : XAT0_APPENDONLY; if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) - *attrs |= (xoap->xoa_nodump == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_nodump == 0) ? 0 : XAT0_NODUMP; if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) - *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_av_quarantined == 0) ? 0 : XAT0_AV_QUARANTINED; if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) - *attrs |= (xoap->xoa_av_modified == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_av_modified == 0) ? 0 : XAT0_AV_MODIFIED; if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) - ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime); + ZFS_TIME_ENCODE(&xoap->xoa_createtime, end->lr_attr_crtime); if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID)); - bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); + memcpy(end->lr_attr_scanstamp, xoap->xoa_av_scanstamp, + AV_SCANSTAMP_SZ); } else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { /* * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid * at the same time, so we can share the same space. */ - bcopy(&xoap->xoa_projid, scanstamp, sizeof (uint64_t)); + memcpy(end->lr_attr_scanstamp, &xoap->xoa_projid, + sizeof (uint64_t)); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) - *attrs |= (xoap->xoa_reparse == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_reparse == 0) ? 0 : XAT0_REPARSE; if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) - *attrs |= (xoap->xoa_offline == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_offline == 0) ? 0 : XAT0_OFFLINE; if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) - *attrs |= (xoap->xoa_sparse == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_sparse == 0) ? 0 : XAT0_SPARSE; if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) - *attrs |= (xoap->xoa_projinherit == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_projinherit == 0) ? 
0 : XAT0_PROJINHERIT; } @@ -214,7 +210,7 @@ zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start) if (fuidp->z_domain_str_sz != 0) { for (zdomain = list_head(&fuidp->z_domains); zdomain; zdomain = list_next(&fuidp->z_domains, zdomain)) { - bcopy((void *)zdomain->z_domain, start, + memcpy(start, zdomain->z_domain, strlen(zdomain->z_domain) + 1); start = (caddr_t)start + strlen(zdomain->z_domain) + 1; @@ -392,7 +388,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, else lracl->lr_acl_flags = 0; - bcopy(vsecp->vsa_aclentp, end, aclsize); + memcpy(end, vsecp->vsa_aclentp, aclsize); end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize); } @@ -404,7 +400,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, /* * Now place file name in log record */ - bcopy(name, end, namesize); + memcpy(end, name, namesize); zil_itx_assign(zilog, itx, tx); } @@ -426,7 +422,7 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, itx = zil_itx_create(txtype, sizeof (*lr) + namesize); lr = (lr_remove_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; - bcopy(name, (char *)(lr + 1), namesize); + memcpy(lr + 1, name, namesize); itx->itx_oid = foid; @@ -462,7 +458,7 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, lr = (lr_link_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; lr->lr_link_obj = zp->z_id; - bcopy(name, (char *)(lr + 1), namesize); + memcpy(lr + 1, name, namesize); zil_itx_assign(zilog, itx, tx); } @@ -493,8 +489,31 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, sizeof (uint64_t)); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), lr->lr_crtime, sizeof (uint64_t) * 2); - bcopy(name, (char *)(lr + 1), namesize); - bcopy(link, (char *)(lr + 1) + namesize, linksize); + memcpy((char *)(lr + 1), name, namesize); + memcpy((char *)(lr + 1) + namesize, link, linksize); + + zil_itx_assign(zilog, itx, tx); +} + +static void +do_zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, + const char *sname, znode_t *tdzp, const char *dname, znode_t *szp) +{ + itx_t *itx; + lr_rename_t *lr; + size_t snamesize = strlen(sname) + 1; + size_t dnamesize = strlen(dname) + 1; + + if (zil_replaying(zilog, tx)) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); + lr = (lr_rename_t *)&itx->itx_lr; + lr->lr_sdoid = sdzp->z_id; + lr->lr_tdoid = tdzp->z_id; + memcpy((char *)(lr + 1), sname, snamesize); + memcpy((char *)(lr + 1) + snamesize, dname, dnamesize); + itx->itx_oid = szp->z_id; zil_itx_assign(zilog, itx, tx); } @@ -506,20 +525,73 @@ void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, znode_t *szp) { + txtype |= TX_RENAME; + do_zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp); +} + +/* + * Handles TX_RENAME_EXCHANGE transactions. + */ +void +zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, + znode_t *szp) +{ + txtype |= TX_RENAME_EXCHANGE; + do_zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp); +} + +/* + * Handles TX_RENAME_WHITEOUT transactions. + * + * Unfortunately we cannot reuse do_zfs_log_rename because we we need to call + * zfs_mknode() on replay which requires stashing bits as with TX_CREATE. 
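The do_zfs_log_rename() helper added above sizes and fills its log record as in the following standalone sketch; struct fake_lr_rename and build_rename_record are illustrative names, not the in-tree lr_rename_t:

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct fake_lr_rename {
	uint64_t lr_sdoid;	/* source directory object id */
	uint64_t lr_tdoid;	/* target directory object id */
	/* NUL-terminated sname and dname follow the header */
};

static void *
build_rename_record(uint64_t sdoid, uint64_t tdoid,
    const char *sname, const char *dname, size_t *sizep)
{
	size_t snamesize = strlen(sname) + 1;
	size_t dnamesize = strlen(dname) + 1;
	size_t size = sizeof (struct fake_lr_rename) + snamesize + dnamesize;
	struct fake_lr_rename *lr = calloc(1, size);

	if (lr == NULL)
		return (NULL);
	lr->lr_sdoid = sdoid;
	lr->lr_tdoid = tdoid;
	/* Names are stored back to back immediately after the header. */
	memcpy((char *)(lr + 1), sname, snamesize);
	memcpy((char *)(lr + 1) + snamesize, dname, dnamesize);
	*sizep = size;
	return (lr);
}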
+ */ +void +zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, + znode_t *szp, znode_t *wzp) +{ itx_t *itx; - lr_rename_t *lr; + lr_rename_whiteout_t *lr; size_t snamesize = strlen(sname) + 1; size_t dnamesize = strlen(dname) + 1; if (zil_replaying(zilog, tx)) return; + txtype |= TX_RENAME_WHITEOUT; itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); - lr = (lr_rename_t *)&itx->itx_lr; - lr->lr_sdoid = sdzp->z_id; - lr->lr_tdoid = tdzp->z_id; - bcopy(sname, (char *)(lr + 1), snamesize); - bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize); + lr = (lr_rename_whiteout_t *)&itx->itx_lr; + lr->lr_rename.lr_sdoid = sdzp->z_id; + lr->lr_rename.lr_tdoid = tdzp->z_id; + + /* + * RENAME_WHITEOUT will create an entry at the source znode, so we need + * to store the same data that the equivalent call to zfs_log_create() + * would. + */ + lr->lr_wfoid = wzp->z_id; + LR_FOID_SET_SLOTS(lr->lr_wfoid, wzp->z_dnodesize >> DNODE_SHIFT); + (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(wzp)), &lr->lr_wgen, + sizeof (uint64_t)); + (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(wzp)), + lr->lr_wcrtime, sizeof (uint64_t) * 2); + lr->lr_wmode = wzp->z_mode; + lr->lr_wuid = (uint64_t)KUID_TO_SUID(ZTOUID(wzp)); + lr->lr_wgid = (uint64_t)KGID_TO_SGID(ZTOGID(wzp)); + + /* + * This rdev will always be makdevice(0, 0) but because the ZIL log and + * replay code needs to be platform independent (and there is no + * platform independent makdev()) we need to copy the one created + * during the rename operation. + */ + (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_RDEV(ZTOZSB(wzp)), &lr->lr_wrdev, + sizeof (lr->lr_wrdev)); + + memcpy((char *)(lr + 1), sname, snamesize); + memcpy((char *)(lr + 1) + snamesize, dname, dnamesize); itx->itx_oid = szp->z_id; zil_itx_assign(zilog, itx, tx); @@ -530,17 +602,16 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, * called as soon as the write is on stable storage (be it via a DMU sync or a * ZIL commit). 
*/ -long zfs_immediate_write_sz = 32768; +static int64_t zfs_immediate_write_sz = 32768; void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t resid, int ioflag, + znode_t *zp, offset_t off, ssize_t resid, boolean_t commit, zil_callback_t callback, void *callback_data) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); uint32_t blocksize = zp->z_blksz; itx_wr_state_t write_state; - uintptr_t fsync_cnt; uint64_t gen = 0; ssize_t size = resid; @@ -556,15 +627,11 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, else if (!spa_has_slogs(zilog->zl_spa) && resid >= zfs_immediate_write_sz) write_state = WR_INDIRECT; - else if (ioflag & (O_SYNC | O_DSYNC)) + else if (commit) write_state = WR_COPIED; else write_state = WR_NEED_COPY; - if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) { - (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1)); - } - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &gen, sizeof (gen)); @@ -615,12 +682,9 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, BP_ZERO(&lr->lr_blkptr); itx->itx_private = ZTOZSB(zp); + itx->itx_sync = (zp->z_sync_cnt != 0); itx->itx_gen = gen; - if (!(ioflag & (O_SYNC | O_DSYNC)) && (zp->z_sync_cnt == 0) && - (fsync_cnt == 0)) - itx->itx_sync = B_FALSE; - itx->itx_callback = callback; itx->itx_callback_data = callback_data; zil_itx_assign(zilog, itx, tx); @@ -721,6 +785,40 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, } /* + * Handles TX_SETSAXATTR transactions. + */ +void +zfs_log_setsaxattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, const char *name, const void *value, size_t size) +{ + itx_t *itx; + lr_setsaxattr_t *lr; + size_t recsize = sizeof (lr_setsaxattr_t); + void *xattrstart; + int namelen; + + if (zil_replaying(zilog, tx) || zp->z_unlinked) + return; + + namelen = strlen(name) + 1; + recsize += (namelen + size); + itx = zil_itx_create(txtype, recsize); + lr = (lr_setsaxattr_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + xattrstart = (char *)(lr + 1); + memcpy(xattrstart, name, namelen); + if (value != NULL) { + memcpy((char *)xattrstart + namelen, value, size); + lr->lr_size = size; + } else { + lr->lr_size = 0; + } + + itx->itx_sync = (zp->z_sync_cnt != 0); + zil_itx_assign(zilog, itx, tx); +} + +/* * Handles TX_ACL transactions. */ void @@ -768,11 +866,11 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, if (txtype == TX_ACL_V0) { lrv0 = (lr_acl_v0_t *)lr; - bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes); + memcpy(lrv0 + 1, vsecp->vsa_aclentp, aclbytes); } else { void *start = (ace_t *)(lr + 1); - bcopy(vsecp->vsa_aclentp, start, aclbytes); + memcpy(start, vsecp->vsa_aclentp, aclbytes); start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes); @@ -786,7 +884,52 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, zil_itx_assign(zilog, itx, tx); } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, LONG, ZMOD_RW, +/* + * Handles TX_CLONE_RANGE transactions. 
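The write-state selection in the zfs_log_write() hunk above, now driven by a single commit flag instead of ioflag and the removed fsync-counter TSD, reduces to roughly this sketch; logbias_throughput and slog_present are illustrative stand-ins for the logbias check preceding the hunk and for spa_has_slogs():

#include <stdint.h>

typedef enum { WR_INDIRECT, WR_COPIED, WR_NEED_COPY } wr_state_t;

static wr_state_t
pick_write_state(int logbias_throughput, int slog_present,
    int64_t resid, int64_t immediate_write_sz, int commit)
{
	if (logbias_throughput)
		return (WR_INDIRECT);	/* log only the block pointer */
	if (!slog_present && resid >= immediate_write_sz)
		return (WR_INDIRECT);	/* large write and no SLOG */
	if (commit)
		return (WR_COPIED);	/* O_(D)SYNC or sync=always */
	return (WR_NEED_COPY);		/* copy lazily at commit time */
}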
+ */ +void +zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, + uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, + size_t nbps) +{ + itx_t *itx; + lr_clone_range_t *lr; + uint64_t partlen, max_log_data; + size_t partnbps; + + if (zil_replaying(zilog, tx) || zp->z_unlinked) + return; + + max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t)); + + while (nbps > 0) { + partnbps = MIN(nbps, max_log_data / sizeof (bps[0])); + partlen = partnbps * blksz; + ASSERT3U(partlen, <, len + blksz); + partlen = MIN(partlen, len); + + itx = zil_itx_create(txtype, + sizeof (*lr) + sizeof (bps[0]) * partnbps); + lr = (lr_clone_range_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + lr->lr_offset = off; + lr->lr_length = partlen; + lr->lr_blksz = blksz; + lr->lr_nbps = partnbps; + memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps); + + itx->itx_sync = (zp->z_sync_cnt != 0); + + zil_itx_assign(zilog, itx, tx); + + bps += partnbps; + ASSERT3U(nbps, >=, partnbps); + nbps -= partnbps; + off += partlen; + ASSERT3U(len, >=, partlen); + len -= partlen; + } +} + +ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, S64, ZMOD_RW, "Largest data block to write to zil"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/zfs_onexit.c b/sys/contrib/openzfs/module/zfs/zfs_onexit.c index 7c56dd9c97f5..7bf804b67790 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_onexit.c +++ b/sys/contrib/openzfs/module/zfs/zfs_onexit.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -87,8 +87,7 @@ zfs_onexit_destroy(zfs_onexit_t *zo) zfs_onexit_action_node_t *ap; mutex_enter(&zo->zo_lock); - while ((ap = list_head(&zo->zo_actions)) != NULL) { - list_remove(&zo->zo_actions, ap); + while ((ap = list_remove_head(&zo->zo_actions)) != NULL) { mutex_exit(&zo->zo_lock); ap->za_func(ap->za_data); kmem_free(ap, sizeof (zfs_onexit_action_node_t)); @@ -151,7 +150,7 @@ zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) */ int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, - uint64_t *action_handle) + uintptr_t *action_handle) { zfs_onexit_t *zo; zfs_onexit_action_node_t *ap; @@ -170,7 +169,7 @@ zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, list_insert_tail(&zo->zo_actions, ap); mutex_exit(&zo->zo_lock); if (action_handle) - *action_handle = (uint64_t)(uintptr_t)ap; + *action_handle = (uintptr_t)ap; return (0); } diff --git a/sys/contrib/openzfs/module/zfs/zfs_quota.c b/sys/contrib/openzfs/module/zfs/zfs_quota.c index e61db5c7ab83..9b351eefc04e 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_quota.c +++ b/sys/contrib/openzfs/module/zfs/zfs_quota.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,8 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>. - * All rights reserved. 
+ * Copyright (c) 2011 Pawel Jakub Dawidek * Copyright (c) 2012, 2015, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. All rights reserved. diff --git a/sys/contrib/openzfs/module/zfs/zfs_ratelimit.c b/sys/contrib/openzfs/module/zfs/zfs_ratelimit.c index b18b480ce527..091562ca6852 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ratelimit.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ratelimit.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/zfs_replay.c b/sys/contrib/openzfs/module/zfs/zfs_replay.c index e6ed3e738e40..2e0af60f6db4 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_replay.c +++ b/sys/contrib/openzfs/module/zfs/zfs_replay.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 Cyril Plisko. All rights reserved. * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ #include <sys/types.h> @@ -47,6 +48,8 @@ #include <sys/atomic.h> #include <sys/cred.h> #include <sys/zpl.h> +#include <sys/dmu_objset.h> +#include <sys/zfeature.h> /* * NB: FreeBSD expects to be able to do vnode locking in lookup and @@ -68,7 +71,7 @@ static void zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) { - bzero(vap, sizeof (*vap)); + memset(vap, 0, sizeof (*vap)); vap->va_mask = (uint_t)mask; vap->va_mode = mode; #if defined(__FreeBSD__) || defined(__APPLE__) @@ -80,10 +83,10 @@ zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, vap->va_nodeid = nodeid; } -/* ARGSUSED */ static int zfs_replay_error(void *arg1, void *arg2, boolean_t byteswap) { + (void) arg1, (void) arg2, (void) byteswap; return (SET_ERROR(ENOTSUP)); } @@ -141,13 +144,13 @@ zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID)); - bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ); + memcpy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); } else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { /* * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid * at the same time, so we can share the same space. 
*/ - bcopy(scanstamp, &xoap->xoa_projid, sizeof (uint64_t)); + memcpy(&xoap->xoa_projid, scanstamp, sizeof (uint64_t)); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0); @@ -306,6 +309,8 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) uint64_t dnodesize; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lracl)); + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lracl, sizeof (*lracl)); @@ -362,7 +367,7 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); - fallthrough; + zfs_fallthrough; case TX_CREATE_ACL_ATTR: if (name == NULL) { lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); @@ -384,8 +389,13 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) lr->lr_uid, lr->lr_gid); } +#if defined(__linux__) + error = zfs_create(dzp, name, &xva.xva_vattr, + 0, 0, &zp, kcred, vflg, &vsec, zfs_init_idmap); +#else error = zfs_create(dzp, name, &xva.xva_vattr, - 0, 0, &zp, kcred, vflg, &vsec); + 0, 0, &zp, kcred, vflg, &vsec, NULL); +#endif break; case TX_MKDIR_ACL: aclstart = (caddr_t)(lracl + 1); @@ -394,7 +404,7 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); - fallthrough; + zfs_fallthrough; case TX_MKDIR_ACL_ATTR: if (name == NULL) { lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); @@ -414,8 +424,13 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); } +#if defined(__linux__) error = zfs_mkdir(dzp, name, &xva.xva_vattr, - &zp, kcred, vflg, &vsec); + &zp, kcred, vflg, &vsec, zfs_init_idmap); +#else + error = zfs_mkdir(dzp, name, &xva.xva_vattr, + &zp, kcred, vflg, &vsec, NULL); +#endif break; default: error = SET_ERROR(ENOTSUP); @@ -457,6 +472,8 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) uint64_t dnodesize; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); @@ -500,9 +517,9 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) * * The _ATTR versions will grab the fuid info in their subcases. 
*/ - if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK && - (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR && - (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) { + if (txtype != TX_SYMLINK && + txtype != TX_MKDIR_ATTR && + txtype != TX_CREATE_ATTR) { start = (lr + 1); zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, @@ -519,14 +536,19 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); name = (char *)start; - fallthrough; + zfs_fallthrough; case TX_CREATE: if (name == NULL) name = (char *)start; +#if defined(__linux__) + error = zfs_create(dzp, name, &xva.xva_vattr, + 0, 0, &zp, kcred, vflg, NULL, zfs_init_idmap); +#else error = zfs_create(dzp, name, &xva.xva_vattr, - 0, 0, &zp, kcred, vflg, NULL); + 0, 0, &zp, kcred, vflg, NULL, NULL); +#endif break; case TX_MKDIR_ATTR: lrattr = (lr_attr_t *)(caddr_t)(lr + 1); @@ -537,14 +559,20 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); name = (char *)start; - fallthrough; + zfs_fallthrough; case TX_MKDIR: if (name == NULL) name = (char *)(lr + 1); +#if defined(__linux__) error = zfs_mkdir(dzp, name, &xva.xva_vattr, - &zp, kcred, vflg, NULL); + &zp, kcred, vflg, NULL, zfs_init_idmap); +#else + error = zfs_mkdir(dzp, name, &xva.xva_vattr, + &zp, kcred, vflg, NULL, NULL); +#endif + break; case TX_MKXATTR: error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &zp, kcred); @@ -552,8 +580,13 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) case TX_SYMLINK: name = (char *)(lr + 1); link = name + strlen(name) + 1; +#if defined(__linux__) + error = zfs_symlink(dzp, name, &xva.xva_vattr, + link, &zp, kcred, vflg, zfs_init_idmap); +#else error = zfs_symlink(dzp, name, &xva.xva_vattr, - link, &zp, kcred, vflg); + link, &zp, kcred, vflg, NULL); +#endif break; default: error = SET_ERROR(ENOTSUP); @@ -584,6 +617,8 @@ zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap) int error; int vflg = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -619,6 +654,8 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) int error; int vflg = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -641,18 +678,21 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) } static int -zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) +do_zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, char *sname, + char *tname, uint64_t rflags, vattr_t *wo_vap) { - zfsvfs_t *zfsvfs = arg1; - lr_rename_t *lr = arg2; - char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ - char *tname = sname + strlen(sname) + 1; znode_t *sdzp, *tdzp; - int error; - int vflg = 0; + int error, vflg = 0; - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); + /* Only Linux currently supports RENAME_* flags. */ +#ifdef __linux__ + VERIFY0(rflags & ~(RENAME_EXCHANGE | RENAME_WHITEOUT)); + + /* wo_vap must be non-NULL iff. 
we're doing RENAME_WHITEOUT */ + VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); +#else + VERIFY0(rflags); +#endif if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) return (error); @@ -665,7 +705,13 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; - error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg); +#if defined(__linux__) + error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, rflags, + wo_vap, zfs_init_idmap); +#else + error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, rflags, + wo_vap, NULL); +#endif zrele(tdzp); zrele(sdzp); @@ -673,6 +719,92 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) } static int +zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_rename_t *lr = arg2; + + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; + return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, 0, NULL)); +} + +static int +zfs_replay_rename_exchange(void *arg1, void *arg2, boolean_t byteswap) +{ +#ifdef __linux__ + zfsvfs_t *zfsvfs = arg1; + lr_rename_t *lr = arg2; + + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; + return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, RENAME_EXCHANGE, + NULL)); +#else + return (SET_ERROR(ENOTSUP)); +#endif +} + +static int +zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap) +{ +#ifdef __linux__ + zfsvfs_t *zfsvfs = arg1; + lr_rename_whiteout_t *lr = arg2; + int error; + /* For the whiteout file. */ + xvattr_t xva; + uint64_t objid; + uint64_t dnodesize; + + ASSERT3U(lr->lr_rename.lr_common.lrc_reclen, >, sizeof (*lr)); + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + objid = LR_FOID_GET_OBJ(lr->lr_wfoid); + dnodesize = LR_FOID_GET_SLOTS(lr->lr_wfoid) << DNODE_SHIFT; + + xva_init(&xva); + zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, + lr->lr_wmode, lr->lr_wuid, lr->lr_wgid, lr->lr_wrdev, objid); + + /* + * As with TX_CREATE, RENAME_WHITEOUT ends up in zfs_mknode(), which + * assigns the object's creation time, generation number, and dnode + * slot count. The generic zfs_rename() has no concept of these + * attributes, so we smuggle the values inside the vattr's otherwise + * unused va_ctime, va_nblocks, and va_fsid fields. 
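A minimal sketch of the attribute smuggling described above, assuming a simplified vattr layout: the replay code that follows packs the log record's values this way before calling zfs_rename(), and zfs_mknode() (not shown in this diff) is expected to read them back out of the same fields:

#include <stdint.h>

struct fake_vattr {
	uint64_t va_ctime[2];	/* carries lr_wcrtime */
	uint64_t va_nblocks;	/* carries lr_wgen */
	uint64_t va_fsid;	/* carries the dnode size in bytes */
};

static void
pack_whiteout_attrs(struct fake_vattr *va, const uint64_t crtime[2],
    uint64_t gen, uint64_t dnodesize)
{
	va->va_ctime[0] = crtime[0];
	va->va_ctime[1] = crtime[1];
	va->va_nblocks = gen;
	va->va_fsid = dnodesize;
}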
+ */ + ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_wcrtime); + xva.xva_vattr.va_nblocks = lr->lr_wgen; + xva.xva_vattr.va_fsid = dnodesize; + + error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT); + if (error) + return (error); + + /* sname and tname follow lr_rename_whiteout_t */ + char *sname = (char *)(lr + 1); + char *tname = sname + strlen(sname) + 1; + return (do_zfs_replay_rename(zfsvfs, &lr->lr_rename, sname, tname, + RENAME_WHITEOUT, &xva.xva_vattr)); +#else + return (SET_ERROR(ENOTSUP)); +#endif +} + +static int zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; @@ -682,6 +814,8 @@ zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) int error; uint64_t eod, offset, length; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -745,6 +879,8 @@ zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap) int error; uint64_t end; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -789,16 +925,17 @@ zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) zfsvfs_t *zfsvfs = arg1; lr_truncate_t *lr = arg2; znode_t *zp; - flock64_t fl; + flock64_t fl = {0}; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); - bzero(&fl, sizeof (fl)); fl.l_type = F_WRLCK; fl.l_whence = SEEK_SET; fl.l_start = lr->lr_offset; @@ -823,6 +960,8 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) int error; void *start; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + xva_init(&xva); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); @@ -859,7 +998,11 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); - error = zfs_setattr(zp, vap, 0, kcred); +#if defined(__linux__) + error = zfs_setattr(zp, vap, 0, kcred, zfs_init_idmap); +#else + error = zfs_setattr(zp, vap, 0, kcred, NULL); +#endif zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; @@ -869,15 +1012,102 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) } static int +zfs_replay_setsaxattr(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_setsaxattr_t *lr = arg2; + znode_t *zp; + nvlist_t *nvl; + size_t sa_size; + char *name; + char *value; + size_t size; + int error = 0; + + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr) + lr->lr_size); + + ASSERT(spa_feature_is_active(zfsvfs->z_os->os_spa, + SPA_FEATURE_ZILSAXATTR)); + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + + rw_enter(&zp->z_xattr_lock, RW_WRITER); + mutex_enter(&zp->z_lock); + if (zp->z_xattr_cached == NULL) + error = zfs_sa_get_xattr(zp); + mutex_exit(&zp->z_lock); + + if (error) + goto out; + + ASSERT(zp->z_xattr_cached); + nvl = zp->z_xattr_cached; + + /* Get xattr name, value and size from log record */ + size = lr->lr_size; + name = (char *)(lr + 1); + if (size == 0) { + value = NULL; + error = nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY); + } else { + value = name + strlen(name) + 1; + /* Limited to 32k to keep nvpair memory allocations small */ + if (size > DXATTR_MAX_ENTRY_SIZE) { + error = 
SET_ERROR(EFBIG); + goto out; + } + + /* Prevent the DXATTR SA from consuming the entire SA region */ + error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); + if (error) + goto out; + + if (sa_size > DXATTR_MAX_SA_SIZE) { + error = SET_ERROR(EFBIG); + goto out; + } + + error = nvlist_add_byte_array(nvl, name, (uchar_t *)value, + size); + } + + /* + * Update the SA for additions, modifications, and removals. On + * error drop the inconsistent cached version of the nvlist, it + * will be reconstructed from the ARC when next accessed. + */ + if (error == 0) + error = zfs_sa_set_xattr(zp, name, value, size); + + if (error) { + nvlist_free(nvl); + zp->z_xattr_cached = NULL; + } + +out: + rw_exit(&zp->z_xattr_lock); + zrele(zp); + return (error); +} + +static int zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_acl_v0_t *lr = arg2; ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ - vsecattr_t vsa; + vsecattr_t vsa = {0}; znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) + + sizeof (ace_t) * lr->lr_aclcnt); + if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); zfs_oldace_byteswap(ace, lr->lr_aclcnt); @@ -886,7 +1116,6 @@ zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap) if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); - bzero(&vsa, sizeof (vsa)); vsa.vsa_mask = VSA_ACE | VSA_ACECNT; vsa.vsa_aclcnt = lr->lr_aclcnt; vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt; @@ -920,10 +1149,13 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) zfsvfs_t *zfsvfs = arg1; lr_acl_t *lr = arg2; ace_t *ace = (ace_t *)(lr + 1); - vsecattr_t vsa; + vsecattr_t vsa = {0}; znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) + lr->lr_acl_bytes); + if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE); @@ -937,7 +1169,6 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); - bzero(&vsa, sizeof (vsa)); vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS; vsa.vsa_aclcnt = lr->lr_aclcnt; vsa.vsa_aclentp = ace; @@ -964,10 +1195,42 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) return (error); } +static int +zfs_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_clone_range_t *lr = arg2; + znode_t *zp; + int error; + + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * Clones can be logged out of order, so don't be surprised if + * the file is gone - just return success. 
+ */ + if (error == ENOENT) + error = 0; + return (error); + } + + error = zfs_clone_range_replay(zp, lr->lr_offset, lr->lr_length, + lr->lr_blksz, lr->lr_bps, lr->lr_nbps); + + zrele(zp); + return (error); +} + /* * Callback vectors for replaying records */ -zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { +zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_error, /* no such type */ zfs_replay_create, /* TX_CREATE */ zfs_replay_create, /* TX_MKDIR */ @@ -989,4 +1252,8 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_create, /* TX_MKDIR_ATTR */ zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ zfs_replay_write2, /* TX_WRITE2 */ + zfs_replay_setsaxattr, /* TX_SETSAXATTR */ + zfs_replay_rename_exchange, /* TX_RENAME_EXCHANGE */ + zfs_replay_rename_whiteout, /* TX_RENAME_WHITEOUT */ + zfs_replay_clone_range, /* TX_CLONE_RANGE */ }; diff --git a/sys/contrib/openzfs/module/zfs/zfs_rlock.c b/sys/contrib/openzfs/module/zfs/zfs_rlock.c index 06a5e031a7df..f42661df82e4 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_rlock.c +++ b/sys/contrib/openzfs/module/zfs/zfs_rlock.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/zfs_sa.c b/sys/contrib/openzfs/module/zfs/zfs_sa.c index 67be131da63b..fb2443b756f8 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_sa.c +++ b/sys/contrib/openzfs/module/zfs/zfs_sa.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -29,6 +29,7 @@ #include <sys/zfs_sa.h> #include <sys/dmu_objset.h> #include <sys/sa_impl.h> +#include <sys/zfeature.h> /* * ZPL attribute registration table. @@ -43,7 +44,7 @@ * this version of ZFS won't change or delete them. 
*/ -sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { +const sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, @@ -69,7 +70,10 @@ sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { {NULL, 0, 0, 0} }; + #ifdef _KERNEL +static int zfs_zil_saxattr = 1; + int zfs_sa_readlink(znode_t *zp, zfs_uio_t *uio) { @@ -103,8 +107,8 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) { VERIFY0(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx)); if (len) { - bcopy(link, (caddr_t)db->db_data + - ZFS_OLD_ZNODE_PHYS_SIZE, len); + memcpy((caddr_t)db->db_data + + ZFS_OLD_ZNODE_PHYS_SIZE, link, len); } } else { dmu_buf_t *dbp; @@ -116,7 +120,7 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) dmu_buf_will_dirty(dbp, tx); ASSERT3U(len, <=, dbp->db_size); - bcopy(link, dbp->db_data, len); + memcpy(dbp->db_data, link, len); dmu_buf_rele(dbp, FTAG); } } @@ -219,13 +223,14 @@ zfs_sa_get_xattr(znode_t *zp) } int -zfs_sa_set_xattr(znode_t *zp) +zfs_sa_set_xattr(znode_t *zp, const char *name, const void *value, size_t vsize) { zfsvfs_t *zfsvfs = ZTOZSB(zp); + zilog_t *zilog; dmu_tx_t *tx; char *obj; size_t size; - int error; + int error, logsaxattr = 0; ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock)); ASSERT(zp->z_xattr_cached); @@ -244,6 +249,17 @@ zfs_sa_set_xattr(znode_t *zp) if (error) goto out_free; + zilog = zfsvfs->z_log; + + /* + * Users enable ZIL logging of xattr=sa operations by enabling the + * SPA_FEATURE_ZILSAXATTR feature on the pool. Feature is activated + * during zil_process_commit_list/zil_create, if enabled. + */ + if (spa_feature_is_enabled(zfsvfs->z_os->os_spa, + SPA_FEATURE_ZILSAXATTR) && zfs_zil_saxattr) + logsaxattr = 1; + tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa_create(tx, size); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); @@ -256,6 +272,10 @@ zfs_sa_set_xattr(znode_t *zp) sa_bulk_attr_t bulk[2]; uint64_t ctime[2]; + if (logsaxattr) + zfs_log_setsaxattr(zilog, tx, TX_SETSAXATTR, zp, name, + value, vsize); + zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DXATTR(zfsvfs), NULL, obj, size); @@ -264,6 +284,8 @@ zfs_sa_set_xattr(znode_t *zp) VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); dmu_tx_commit(tx); + if (logsaxattr && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); } out_free: vmem_free(obj, size); @@ -396,8 +418,9 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) /* if scanstamp then add scanstamp */ if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { - bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - scanstamp, AV_SCANSTAMP_SZ); + memcpy(scanstamp, + (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + AV_SCANSTAMP_SZ); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), NULL, scanstamp, AV_SCANSTAMP_SZ); zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; @@ -433,6 +456,9 @@ zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp) } } +ZFS_MODULE_PARAM(zfs, zfs_, zil_saxattr, INT, ZMOD_RW, + "Disable xattr=sa extended attribute logging in ZIL by settng 0."); + EXPORT_SYMBOL(zfs_attr_table); EXPORT_SYMBOL(zfs_sa_readlink); EXPORT_SYMBOL(zfs_sa_symlink); diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c index 7cbb70f499af..f3db953eab46 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c @@ -6,7 
+6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -24,6 +24,7 @@ * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ /* Portions Copyright 2007 Jeremy Teo */ @@ -46,34 +47,53 @@ #include <sys/fs/zfs.h> #include <sys/dmu.h> #include <sys/dmu_objset.h> +#include <sys/dsl_crypt.h> #include <sys/spa.h> #include <sys/txg.h> #include <sys/dbuf.h> #include <sys/policy.h> +#include <sys/zfeature.h> #include <sys/zfs_vnops.h> #include <sys/zfs_quota.h> #include <sys/zfs_vfsops.h> #include <sys/zfs_znode.h> +/* + * Enable the experimental block cloning feature. If this setting is 0, then + * even if feature@block_cloning is enabled, attempts to clone blocks will act + * as though the feature is disabled. + */ +int zfs_bclone_enabled = 1; -static ulong_t zfs_fsync_sync_cnt = 4; +/* + * When set zfs_clone_range() waits for dirty data to be written to disk. + * This allows the clone operation to reliably succeed when a file is modified + * and then immediately cloned. For small files this may be slower than making + * a copy of the file and is therefore not the default. However, in certain + * scenarios this behavior may be desirable so a tunable is provided. + */ +static int zfs_bclone_wait_dirty = 0; + +/* + * Maximum bytes to read per chunk in zfs_read(). + */ +static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; int zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) { + int error = 0; zfsvfs_t *zfsvfs = ZTOZSB(zp); - (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); - if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); + atomic_inc_32(&zp->z_sync_writes_cnt); zil_commit(zfsvfs->z_log, zp->z_id); - ZFS_EXIT(zfsvfs); + atomic_dec_32(&zp->z_sync_writes_cnt); + zfs_exit(zfsvfs, FTAG); } - tsd_set(zfs_fsyncer_key, NULL); - - return (0); + return (error); } @@ -102,10 +122,10 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) hole = B_FALSE; /* Flush any mmap()'d data to disk */ - if (zn_has_cached_data(zp)) - zn_flush_cached_data(zp, B_FALSE); + if (zn_has_cached_data(zp, 0, file_sz - 1)) + zn_flush_cached_data(zp, B_TRUE); - lr = zfs_rangelock_enter(&zp->z_rangelock, 0, file_sz, RL_READER); + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER); error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); zfs_rangelock_exit(lr); @@ -144,37 +164,44 @@ zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off) zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); error = zfs_holey_common(zp, cmd, off); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } #endif /* SEEK_HOLE && SEEK_DATA */ -/*ARGSUSED*/ int zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); if (flag & V_ACE_MASK) - error = zfs_zaccess(zp, mode, flag, 
B_FALSE, cr); +#if defined(__linux__) + error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, + zfs_init_idmap); +#else + error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, + NULL); +#endif else - error = zfs_zaccess_rwx(zp, mode, flag, cr); +#if defined(__linux__) + error = zfs_zaccess_rwx(zp, mode, flag, cr, zfs_init_idmap); +#else + error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL); +#endif - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } -static unsigned long zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ - /* * Read bytes from specified file into supplied buffer. * @@ -192,25 +219,25 @@ static unsigned long zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ * Side Effects: * inode - atime updated if byte count > 0 */ -/* ARGSUSED */ int zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { + (void) cr; int error = 0; boolean_t frsync = B_FALSE; zfsvfs_t *zfsvfs = ZTOZSB(zp); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); if (zp->z_pflags & ZFS_AV_QUARANTINED) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EACCES)); } /* We don't copy out anything useful for directories. */ if (Z_ISDIR(ZTOTYPE(zp))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EISDIR)); } @@ -218,7 +245,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * Validate file offset */ if (zfs_uio_offset(uio) < (offset_t)0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -226,7 +253,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * Fasttrack empty reads */ if (zfs_uio_resid(uio) == 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -275,7 +302,8 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) error = mappedread_sf(zp, nbytes, uio); else #endif - if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) { + if (zn_has_cached_data(zp, zfs_uio_offset(uio), + zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) { error = mappedread(zp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), @@ -311,10 +339,65 @@ out: zfs_rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } +static void +zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr, + uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx) +{ + zilog_t *zilog = zfsvfs->z_log; + const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); + + ASSERT(clear_setid_bits_txgp != NULL); + ASSERT(tx != NULL); + + /* + * Clear Set-UID/Set-GID bits on successful write if not + * privileged and at least one of the execute bits is set. + * + * It would be nice to do this after all writes have + * been done, but that would still expose the ISUID/ISGID + * to another app after the partial write is committed. + * + * Note: we don't call zfs_fuid_map_id() here because + * user 0 is not an ephemeral uid. + */ + mutex_enter(&zp->z_acl_lock); + if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && + (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(zp, cr, + ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { + uint64_t newmode; + + zp->z_mode &= ~(S_ISUID | S_ISGID); + newmode = zp->z_mode; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + (void *)&newmode, sizeof (uint64_t), tx); + + mutex_exit(&zp->z_acl_lock); + + /* + * Make sure SUID/SGID bits will be removed when we replay the + * log. 
If the setid bits are keep coming back, don't log more + * than one TX_SETATTR per transaction group. + */ + if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) { + vattr_t va = {0}; + + va.va_mask = ATTR_MODE; + va.va_nodeid = zp->z_id; + va.va_mode = newmode; + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va, + ATTR_MODE, NULL); + *clear_setid_bits_txgp = dmu_tx_get_txg(tx); + } + } else { + mutex_exit(&zp->z_acl_lock); + } +} + /* * Write the bytes to a file. * @@ -333,13 +416,12 @@ out: * Timestamps: * ip - ctime|mtime updated if byte count > 0 */ - -/* ARGSUSED */ int zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { - int error = 0; + int error = 0, error1; ssize_t start_resid = zfs_uio_resid(uio); + uint64_t clear_setid_bits_txg = 0; /* * Fasttrack empty write @@ -349,8 +431,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) return (0); zfsvfs_t *zfsvfs = ZTOZSB(zp); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); sa_bulk_attr_t bulk[4]; int count = 0; @@ -367,7 +449,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } @@ -379,7 +461,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) if ((zp->z_pflags & ZFS_IMMUTABLE) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && (zfs_uio_offset(uio) < zp->z_size))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } @@ -388,19 +470,17 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) */ offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio); if (woff < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } - const uint64_t max_blksz = zfsvfs->z_max_blksz; - /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. - * Skip this if uio contains loaned arc_buf. */ - if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { - ZFS_EXIT(zfsvfs); + ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1); + if (zfs_uio_prefaultpages(pfbytes, uio)) { + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFAULT)); } @@ -433,9 +513,9 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); } - if (zn_rlimit_fsize(zp, uio)) { + if (zn_rlimit_fsize_uio(zp, uio)) { zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFBIG)); } @@ -443,7 +523,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) if (woff >= limit) { zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFBIG)); } @@ -452,6 +532,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) uint64_t end_size = MAX(zp->z_size, woff + n); zilog_t *zilog = zfsvfs->z_log; + boolean_t commit = (ioflag & (O_SYNC | O_DSYNC)) || + (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS); const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); @@ -474,10 +556,31 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) break; } + uint64_t blksz; + if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) { + if (zp->z_blksz > zfsvfs->z_max_blksz && + !ISP2(zp->z_blksz)) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. 
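The block-size choice in this hunk reduces to the following standalone sketch; next_pow2() and the 512-byte rounding stand in for highbit64() and P2ROUNDUP(..., SPA_MINBLOCKSIZE), and max_blksz plays the role of zfsvfs->z_max_blksz (the dataset recordsize):

#include <stdint.h>

static uint64_t
next_pow2(uint64_t x)
{
	uint64_t p = 1;

	while (p < x)
		p <<= 1;
	return (p);
}

/*
 * Only used when the range lock covers the whole file and the file still
 * fits in a single block (the lr_length == UINT64_MAX case above).
 */
static uint64_t
pick_write_blksz(uint64_t cur_blksz, uint64_t max_blksz, uint64_t end_size)
{
	uint64_t blksz;

	if (cur_blksz > max_blksz && (cur_blksz & (cur_blksz - 1)) != 0) {
		/* Already above recordsize: only grow to the next power of 2. */
		blksz = next_pow2(cur_blksz);
	} else {
		blksz = max_blksz;
	}
	/* No larger than needed, rounded up to the 512-byte minimum... */
	uint64_t rounded = (end_size + 511) & ~(uint64_t)511;
	if (blksz > rounded)
		blksz = rounded;
	/* ...and never smaller than the current block size. */
	return (blksz < cur_blksz ? cur_blksz : blksz);
}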
+ */ + blksz = 1 << highbit64(zp->z_blksz); + } else { + blksz = zfsvfs->z_max_blksz; + } + blksz = MIN(blksz, P2ROUNDUP(end_size, + SPA_MINBLOCKSIZE)); + blksz = MAX(blksz, zp->z_blksz); + } else { + blksz = zp->z_blksz; + } + arc_buf_t *abuf = NULL; - if (n >= max_blksz && woff >= zp->z_size && - P2PHASE(woff, max_blksz) == 0 && - zp->z_blksz == max_blksz) { + ssize_t nbytes = n; + if (n >= blksz && woff >= zp->z_size && + P2PHASE(woff, blksz) == 0 && + (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter @@ -485,18 +588,26 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ - size_t cbytes; - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - max_blksz); + blksz); ASSERT(abuf != NULL); - ASSERT(arc_buf_size(abuf) == max_blksz); - if ((error = zfs_uiocopy(abuf->b_data, max_blksz, - UIO_WRITE, uio, &cbytes))) { + ASSERT(arc_buf_size(abuf) == blksz); + if ((error = zfs_uiocopy(abuf->b_data, blksz, + UIO_WRITE, uio, &nbytes))) { dmu_return_arcbuf(abuf); break; } - ASSERT3S(cbytes, ==, max_blksz); + ASSERT3S(nbytes, ==, blksz); + } else { + nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) - + P2PHASE(woff, blksz)); + if (pfbytes < nbytes) { + if (zfs_uio_prefaultpages(nbytes, uio)) { + error = SET_ERROR(EFAULT); + break; + } + pfbytes = nbytes; + } } /* @@ -506,8 +617,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); DB_DNODE_ENTER(db); - dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, - MIN(n, max_blksz)); + dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes); DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); @@ -519,37 +629,21 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) } /* + * NB: We must call zfs_clear_setid_bits_if_necessary before + * committing the transaction! + */ + + /* * If rangelock_enter() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since rangelock_reduce() will * shrink down lr_length to the appropriate size. */ if (lr->lr_length == UINT64_MAX) { - uint64_t new_blksz; - - if (zp->z_blksz > max_blksz) { - /* - * File's blocksize is already larger than the - * "recordsize" property. Only let it grow to - * the next power of 2. - */ - ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end_size, - 1 << highbit64(zp->z_blksz)); - } else { - new_blksz = MIN(end_size, max_blksz); - } - zfs_grow_blocksize(zp, new_blksz, tx); + zfs_grow_blocksize(zp, blksz, tx); zfs_rangelock_reduce(lr, woff, n); } - /* - * XXX - should we really limit each write to z_max_blksz? - * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 
- */ - const ssize_t nbytes = - MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - ssize_t tx_bytes; if (abuf == NULL) { tx_bytes = zfs_uio_resid(uio); @@ -559,6 +653,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) zfs_uio_fault_disable(uio, B_FALSE); #ifdef __linux__ if (error == EFAULT) { + zfs_clear_setid_bits_if_necessary(zfsvfs, zp, + cr, &clear_setid_bits_txg, tx); dmu_tx_commit(tx); /* * Account for partial writes before @@ -567,30 +663,23 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * zfs_uio_prefaultpages, or prefaultpages may * error, and we may break the loop early. */ - if (tx_bytes != zfs_uio_resid(uio)) - n -= tx_bytes - zfs_uio_resid(uio); - if (zfs_uio_prefaultpages(MIN(n, max_blksz), - uio)) { - break; - } + n -= tx_bytes - zfs_uio_resid(uio); + pfbytes -= tx_bytes - zfs_uio_resid(uio); continue; } #endif - if (error != 0) { + /* + * On FreeBSD, EFAULT should be propagated back to the + * VFS, which will handle faulting and will retry. + */ + if (error != 0 && error != EFAULT) { + zfs_clear_setid_bits_if_necessary(zfsvfs, zp, + cr, &clear_setid_bits_txg, tx); dmu_tx_commit(tx); break; } tx_bytes -= zfs_uio_resid(uio); } else { - /* Implied by abuf != NULL: */ - ASSERT3S(n, >=, max_blksz); - ASSERT0(P2PHASE(woff, max_blksz)); - /* - * We can simplify nbytes to MIN(n, max_blksz) since - * P2PHASE(woff, max_blksz) is 0, and knowing - * n >= max_blksz lets us simplify further: - */ - ASSERT3S(nbytes, ==, max_blksz); /* * Thus, we're writing a full block at a block-aligned * offset and extending the file past EOF. @@ -601,6 +690,13 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) error = dmu_assign_arcbuf_by_dbuf( sa_get_db(zp->z_sa_hdl), woff, abuf, tx); if (error != 0) { + /* + * XXX This might not be necessary if + * dmu_assign_arcbuf_by_dbuf is guaranteed + * to be atomic. + */ + zfs_clear_setid_bits_if_necessary(zfsvfs, zp, + cr, &clear_setid_bits_txg, tx); dmu_return_arcbuf(abuf); dmu_tx_commit(tx); break; @@ -609,7 +705,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) zfs_uioskip(uio, nbytes); tx_bytes = nbytes; } - if (tx_bytes && zn_has_cached_data(zp) && + if (tx_bytes && + zn_has_cached_data(zp, woff, woff + tx_bytes - 1) && !(ioflag & O_DIRECT)) { update_pages(zp, woff, tx_bytes, zfsvfs->z_os); } @@ -626,30 +723,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) break; } - /* - * Clear Set-UID/Set-GID bits on successful write if not - * privileged and at least one of the execute bits is set. - * - * It would be nice to do this after all writes have - * been done, but that would still expose the ISUID/ISGID - * to another app after the partial write is committed. - * - * Note: we don't call zfs_fuid_map_id() here because - * user 0 is not an ephemeral uid. 
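The setid test removed in this hunk (and factored into zfs_clear_setid_bits_if_necessary() earlier in the file) reduces to a check like the following; may_retain stands in for the secpolicy_vnode_setid_retain() policy call:

#include <stdint.h>
#include <sys/stat.h>

static int
should_clear_setid(uint64_t mode, int may_retain)
{
	/* S_IXUSR shifted right by 3 and 6 gives the group and other bits. */
	uint64_t any_exec = S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6);

	return ((mode & any_exec) != 0 &&
	    (mode & (S_ISUID | S_ISGID)) != 0 &&
	    !may_retain);
}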
- */ - mutex_enter(&zp->z_acl_lock); - if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | - (S_IXUSR >> 6))) != 0 && - (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(zp, cr, - ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { - uint64_t newmode; - zp->z_mode &= ~(S_ISUID | S_ISGID); - newmode = zp->z_mode; - (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), - (void *)&newmode, sizeof (uint64_t), tx); - } - mutex_exit(&zp->z_acl_lock); + zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, + &clear_setid_bits_txg, tx); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); @@ -660,7 +735,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) while ((end_size = zp->z_size) < zfs_uio_offset(uio)) { (void) atomic_cas_64(&zp->z_size, end_size, zfs_uio_offset(uio)); - ASSERT(error == 0); + ASSERT(error == 0 || error == EFAULT); } /* * If we are replaying and eof is non zero then force @@ -670,23 +745,26 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + if (error1 != 0) + /* Avoid clobbering EFAULT. */ + error = error1; - zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, + /* + * NB: During replay, the TX_SETATTR record logged by + * zfs_clear_setid_bits_if_necessary must precede any of + * the TX_WRITE records logged here. + */ + zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit, NULL, NULL); + dmu_tx_commit(tx); if (error != 0) break; ASSERT3S(tx_bytes, ==, nbytes); n -= nbytes; - - if (n > 0) { - if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { - error = SET_ERROR(EFAULT); - break; - } - } + pfbytes -= nbytes; } zfs_znode_update_vfs(zp); @@ -699,23 +777,21 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) */ if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid || error == EFAULT) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } - if (ioflag & (O_SYNC | O_DSYNC) || - zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + if (commit) zil_commit(zilog, zp->z_id); const int64_t nwritten = start_resid - zfs_uio_resid(uio); dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); task_io_account_write(nwritten); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } -/*ARGSUSED*/ int zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { @@ -723,32 +799,31 @@ zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); error = zfs_getacl(zp, vsecp, skipaclchk, cr); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } -/*ARGSUSED*/ int zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; - zilog_t *zilog = zfsvfs->z_log; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + zilog_t *zilog; + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); + zilog = zfsvfs->z_log; error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -777,7 +852,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, uint64_t zp_gen; ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); /* @@ -804,7 +878,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, return (SET_ERROR(ENOENT)); } - zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; zgd->zgd_private = zp; @@ -827,6 +901,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ + ASSERT3P(zio, !=, NULL); /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated @@ -855,8 +930,8 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, } #endif if (error == 0) - error = dmu_buf_hold(os, object, offset, zgd, &db, - DMU_READ_NO_PREFETCH); + error = dmu_buf_hold_noread(os, object, offset, zgd, + &db); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; @@ -901,10 +976,10 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, } -/* ARGSUSED */ static void zfs_get_done(zgd_t *zgd, int error) { + (void) error; znode_t *zp = zgd->zgd_private; if (zgd->zgd_db) @@ -921,6 +996,551 @@ zfs_get_done(zgd_t *zgd, int error) kmem_free(zgd, sizeof (zgd_t)); } +static int +zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) +{ + int error; + + /* Swap. Not sure if the order of zfs_enter()s is important. */ + if (zfsvfs1 > zfsvfs2) { + zfsvfs_t *tmpzfsvfs; + + tmpzfsvfs = zfsvfs2; + zfsvfs2 = zfsvfs1; + zfsvfs1 = tmpzfsvfs; + } + + error = zfs_enter(zfsvfs1, tag); + if (error != 0) + return (error); + if (zfsvfs1 != zfsvfs2) { + error = zfs_enter(zfsvfs2, tag); + if (error != 0) { + zfs_exit(zfsvfs1, tag); + return (error); + } + } + + return (0); +} + +static void +zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) +{ + + zfs_exit(zfsvfs1, tag); + if (zfsvfs1 != zfsvfs2) + zfs_exit(zfsvfs2, tag); +} + +/* + * We split each clone request into chunks that can fit into a single ZIL + * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning + * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives + * us room for storing 1022 block pointers. + * + * On success, the function returns the number of bytes copied in *lenp. + * Note that it doesn't return how many bytes are left to be copied. + * On errors caused by file system or BRT limitations `EINVAL` is + * returned. In most cases the user requested bad parameters: it may be + * possible to clone the file, but some parameters don't match the + * requirements.
+ */ +int +zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, + uint64_t *outoffp, uint64_t *lenp, cred_t *cr) +{ + zfsvfs_t *inzfsvfs, *outzfsvfs; + objset_t *inos, *outos; + zfs_locked_range_t *inlr, *outlr; + dmu_buf_impl_t *db; + dmu_tx_t *tx; + zilog_t *zilog; + uint64_t inoff, outoff, len, done; + uint64_t outsize, size; + int error; + int count = 0; + sa_bulk_attr_t bulk[3]; + uint64_t mtime[2], ctime[2]; + uint64_t uid, gid, projid; + blkptr_t *bps; + size_t maxblocks, nbps; + uint_t inblksz; + uint64_t clear_setid_bits_txg = 0; + uint64_t last_synced_txg = 0; + + inoff = *inoffp; + outoff = *outoffp; + len = *lenp; + done = 0; + + inzfsvfs = ZTOZSB(inzp); + outzfsvfs = ZTOZSB(outzp); + + /* + * We need to call zfs_enter() potentially on two different datasets, + * so we need a dedicated function for that. + */ + error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG); + if (error != 0) + return (error); + + inos = inzfsvfs->z_os; + outos = outzfsvfs->z_os; + + /* + * Both source and destination have to belong to the same storage pool. + */ + if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EXDEV)); + } + + /* + * outos and inos belongs to the same storage pool. + * see a few lines above, only one check. + */ + if (!spa_feature_is_enabled(dmu_objset_spa(outos), + SPA_FEATURE_BLOCK_CLONING)) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EOPNOTSUPP)); + } + + ASSERT(!outzfsvfs->z_replay); + + /* + * Block cloning from an unencrypted dataset into an encrypted + * dataset and vice versa is not supported. + */ + if (inos->os_encrypted != outos->os_encrypted) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EXDEV)); + } + + /* + * Cloning across encrypted datasets is possible only if they + * share the same master key. + */ + if (inos != outos && inos->os_encrypted && + !dmu_objset_crypto_key_equal(inos, outos)) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EXDEV)); + } + + error = zfs_verify_zp(inzp); + if (error == 0) + error = zfs_verify_zp(outzp); + if (error != 0) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (error); + } + + /* + * We don't copy source file's flags that's why we don't allow to clone + * files that are in quarantine. + */ + if (inzp->z_pflags & ZFS_AV_QUARANTINED) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EACCES)); + } + + if (inoff >= inzp->z_size) { + *lenp = 0; + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (0); + } + if (len > inzp->z_size - inoff) { + len = inzp->z_size - inoff; + } + if (len == 0) { + *lenp = 0; + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (0); + } + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. + */ + if (zfs_is_readonly(outzfsvfs)) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EROFS)); + } + + /* + * If immutable or not appending then return EPERM. + * Intentionally allow ZFS_READONLY through here. + * See zfs_zaccess_common() + */ + if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EPERM)); + } + + /* + * No overlapping if we are cloning within the same file. 
+ */ + if (inzp == outzp) { + if (inoff < outoff + len && outoff < inoff + len) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EINVAL)); + } + } + + /* Flush any mmap()'d data to disk */ + if (zn_has_cached_data(inzp, inoff, inoff + len - 1)) + zn_flush_cached_data(inzp, B_TRUE); + + /* + * Maintain predictable lock order. + */ + if (inzp < outzp || (inzp == outzp && inoff < outoff)) { + inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len, + RL_READER); + outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len, + RL_WRITER); + } else { + outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len, + RL_WRITER); + inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len, + RL_READER); + } + + inblksz = inzp->z_blksz; + + /* + * We cannot clone into a file with different block size if we can't + * grow it (block size is already bigger, has more than one block, or + * not locked for growth). There are other possible reasons for the + * grow to fail, but we cover what we can before opening the + * transaction and detect the rest after we try to do it. + */ + if (inblksz < outzp->z_blksz) { + error = SET_ERROR(EINVAL); + goto unlock; + } + if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz || + outlr->lr_length != UINT64_MAX)) { + error = SET_ERROR(EINVAL); + goto unlock; + } + + /* + * Block size must be power-of-2 if destination offset != 0. + * There can be no multiple blocks of non-power-of-2 size. + */ + if (outoff != 0 && !ISP2(inblksz)) { + error = SET_ERROR(EINVAL); + goto unlock; + } + + /* + * Offsets and len must be at block boundaries. + */ + if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) { + error = SET_ERROR(EINVAL); + goto unlock; + } + /* + * Length must be a multiple of blksz, except for the end of the file. + */ + if ((len % inblksz) != 0 && + (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) { + error = SET_ERROR(EINVAL); + goto unlock; + } + + /* + * If we are copying only one block and it is smaller than the + * recordsize property, do not allow the destination to grow beyond one + * block if it is not there yet. Otherwise the destination will get + * stuck with that block size forever, which can be as small as 512 + * bytes, no matter how big the destination grows later. + */ + if (len <= inblksz && inblksz < outzfsvfs->z_max_blksz && + outzp->z_size <= inblksz && outoff + len > inblksz) { + error = SET_ERROR(EINVAL); + goto unlock; + } + + error = zn_rlimit_fsize(outoff + len); + if (error != 0) { + goto unlock; + } + + if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) { + error = SET_ERROR(EFBIG); + goto unlock; + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL, + &outzp->z_size, 8); + + zilog = outzfsvfs->z_log; + maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) / + sizeof (bps[0]); + + uid = KUID_TO_SUID(ZTOUID(outzp)); + gid = KGID_TO_SGID(ZTOGID(outzp)); + projid = outzp->z_projid; + + bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP); + + /* + * Clone the file in reasonably sized chunks. Each chunk is cloned + * in a separate transaction; this keeps the intent log records small + * and allows us to do more fine-grained space accounting.
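The chunking arithmetic above can be sanity-checked in isolation. A minimal standalone sketch, assuming sizeof (blkptr_t) is 128 bytes, the 130816-byte record payload quoted in the function comment, and an example 128 KiB block size (all assumptions for illustration, not values read out of this code):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	const uint64_t payload = 130816;	/* assumed ZIL record payload */
	const uint64_t bpsize = 128;		/* assumed sizeof (blkptr_t) */
	const uint64_t recordsize = 131072;	/* example 128 KiB block size */

	uint64_t maxblocks = payload / bpsize;		/* 1022 block pointers */
	uint64_t chunk = maxblocks * recordsize;	/* bytes cloned per tx */

	printf("maxblocks = %llu, chunk = %llu bytes (~%llu MiB)\n",
	    (unsigned long long)maxblocks,
	    (unsigned long long)chunk,
	    (unsigned long long)(chunk >> 20));
	return (0);
}

With those numbers each transaction can reference up to 1022 blocks, i.e. roughly 127 MiB per chunk at a 128 KiB block size, which is why the loop below calls size = MIN(inblksz * maxblocks, len).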
+ */ + while (len > 0) { + size = MIN(inblksz * maxblocks, len); + + if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT, + uid) || + zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT, + gid) || + (projid != ZFS_DEFAULT_PROJID && + zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT, + projid))) { + error = SET_ERROR(EDQUOT); + break; + } + + nbps = maxblocks; + last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos)); + error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps, + &nbps); + if (error != 0) { + /* + * If we are trying to clone a block that was created + * in the current transaction group, the error will be + * EAGAIN here. Based on zfs_bclone_wait_dirty either + * return a shortened range to the caller so it can + * fall back, or wait for the next TXG and check again. + */ + if (error == EAGAIN && zfs_bclone_wait_dirty) { + txg_wait_synced(dmu_objset_pool(inos), + last_synced_txg + 1); + continue; + } + + break; + } + + /* + * Start a transaction. + */ + tx = dmu_tx_create(outos); + dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE); + db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl); + DB_DNODE_ENTER(db); + dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size); + DB_DNODE_EXIT(db); + zfs_sa_upgrade_txholds(tx, outzp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + break; + } + + /* + * Copy source znode's block size. This is done only if the + * whole znode is locked (see zfs_rangelock_cb()) and only + * on the first iteration since zfs_rangelock_reduce() will + * shrink down lr_length to the appropriate size. + */ + if (outlr->lr_length == UINT64_MAX) { + zfs_grow_blocksize(outzp, inblksz, tx); + + /* + * Block growth may fail for many reasons we cannot + * predict here. If it happens, the cloning is doomed. + */ + if (inblksz != outzp->z_blksz) { + error = SET_ERROR(EINVAL); + dmu_tx_abort(tx); + break; + } + + /* + * Round range lock up to the block boundary, so we + * prevent appends until we are done. + */ + zfs_rangelock_reduce(outlr, outoff, + ((len - 1) / inblksz + 1) * inblksz); + } + + error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, + bps, nbps); + if (error != 0) { + dmu_tx_commit(tx); + break; + } + + if (zn_has_cached_data(outzp, outoff, outoff + size - 1)) { + update_pages(outzp, outoff, size, outos); + } + + zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr, + &clear_setid_bits_txg, tx); + + zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime); + + /* + * Update the file size (zp_size) if it has changed; + * account for possible concurrent updates. + */ + while ((outsize = outzp->z_size) < outoff + size) { + (void) atomic_cas_64(&outzp->z_size, outsize, + outoff + size); + } + + error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx); + + zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff, + size, inblksz, bps, nbps); + + dmu_tx_commit(tx); + + if (error != 0) + break; + + inoff += size; + outoff += size; + len -= size; + done += size; + + if (issig()) { + error = SET_ERROR(EINTR); + break; + } + } + + vmem_free(bps, sizeof (bps[0]) * maxblocks); + zfs_znode_update_vfs(outzp); + +unlock: + zfs_rangelock_exit(outlr); + zfs_rangelock_exit(inlr); + + if (done > 0) { + /* + * If we have made at least partial progress, reset the error.
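Because the function may stop after partial progress (a shortened *lenp with the offsets already advanced), callers have to loop and be prepared to fall back to an ordinary copy. A hedged user-space illustration using copy_file_range(2), which is typically how block cloning is exercised from user space on FreeBSD and Linux; whether a given call actually clones or copies is decided by the kernel and by the checks above:

#define _FILE_OFFSET_BITS 64
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char *argv[])
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return (1);
	}

	int in = open(argv[1], O_RDONLY);
	int out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (in < 0 || out < 0) {
		perror("open");
		return (1);
	}

	off_t inoff = 0, outoff = 0;
	off_t left = lseek(in, 0, SEEK_END);	/* bytes we would like cloned */

	while (left > 0) {
		ssize_t done = copy_file_range(in, &inoff, out, &outoff,
		    (size_t)left, 0);
		if (done < 0) {
			perror("copy_file_range");	/* caller would fall back */
			return (1);
		}
		if (done == 0)
			break;			/* source EOF reached */
		left -= done;			/* offsets already advanced */
	}
	return (0);
}

The loop mirrors the in-kernel contract sketched above: consumed bytes advance the offsets, and a short or failed call is the signal to retry or to copy the remainder by other means.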
+ */ + error = 0; + + ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp); + + if (outos->os_sync == ZFS_SYNC_ALWAYS) { + zil_commit(zilog, outzp->z_id); + } + + *inoffp += done; + *outoffp += done; + *lenp = done; + } else { + /* + * If we made no progress, there must be a good reason. + * EOF is handled explicitly above, before the loop. + */ + ASSERT3S(error, !=, 0); + } + + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + + return (error); +} + +/* + * Usual pattern would be to call zfs_clone_range() from zfs_replay_clone(), + * but we cannot do that, because when replaying we don't have source znode + * available. This is why we need a dedicated replay function. + */ +int +zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz, + const blkptr_t *bps, size_t nbps) +{ + zfsvfs_t *zfsvfs; + dmu_buf_impl_t *db; + dmu_tx_t *tx; + int error; + int count = 0; + sa_bulk_attr_t bulk[3]; + uint64_t mtime[2], ctime[2]; + + ASSERT3U(off, <, MAXOFFSET_T); + ASSERT3U(len, >, 0); + ASSERT3U(nbps, >, 0); + + zfsvfs = ZTOZSB(zp); + + ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os), + SPA_FEATURE_BLOCK_CLONING)); + + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); + + ASSERT(zfsvfs->z_replay); + ASSERT(!zfs_is_readonly(zfsvfs)); + + if ((off % blksz) != 0) { + zfs_exit(zfsvfs, FTAG); + return (SET_ERROR(EINVAL)); + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + + /* + * Start a transaction. + */ + tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); + DB_DNODE_ENTER(db); + dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len); + DB_DNODE_EXIT(db); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + zfs_exit(zfsvfs, FTAG); + return (error); + } + + if (zp->z_blksz < blksz) + zfs_grow_blocksize(zp, blksz, tx); + + dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps); + + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + + if (zp->z_size < off + len) + zp->z_size = off + len; + + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + + /* + * zil_replaying() not only check if we are replaying ZIL, but also + * updates the ZIL header to record replay progress. 
+ */ + VERIFY(zil_replaying(zfsvfs->z_log, tx)); + + dmu_tx_commit(tx); + + zfs_znode_update_vfs(zp); + + zfs_exit(zfsvfs, FTAG); + + return (error); +} + EXPORT_SYMBOL(zfs_access); EXPORT_SYMBOL(zfs_fsync); EXPORT_SYMBOL(zfs_holey); @@ -928,6 +1548,14 @@ EXPORT_SYMBOL(zfs_read); EXPORT_SYMBOL(zfs_write); EXPORT_SYMBOL(zfs_getsecattr); EXPORT_SYMBOL(zfs_setsecattr); +EXPORT_SYMBOL(zfs_clone_range); +EXPORT_SYMBOL(zfs_clone_range_replay); -ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW, "Bytes to read per chunk"); + +ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW, + "Enable block cloning"); + +ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW, + "Wait for dirty blocks when cloning"); diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index 640e805d093a..34be54b337fd 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -43,6 +43,8 @@ #include <sys/metaslab.h> #include <sys/trace_zfs.h> #include <sys/abd.h> +#include <sys/brt.h> +#include <sys/wmsum.h> /* * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system @@ -89,12 +91,12 @@ * committed to stable storage. Please refer to the zil_commit_waiter() * function (and the comments within it) for more details. */ -int zfs_commit_timeout_pct = 5; +static uint_t zfs_commit_timeout_pct = 10; /* * See zil.h for more information about these fields. */ -zil_stats_t zil_stats = { +static zil_kstat_values_t zil_stats = { { "zil_commit_count", KSTAT_DATA_UINT64 }, { "zil_commit_writer_count", KSTAT_DATA_UINT64 }, { "zil_itx_count", KSTAT_DATA_UINT64 }, @@ -106,11 +108,16 @@ zil_stats_t zil_stats = { { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 }, }; -static kstat_t *zil_ksp; +static zil_sums_t zil_sums_global; +static kstat_t *zil_kstats_global; /* * Disable intent logging replay. This global ZIL switch affects all pools. @@ -118,25 +125,25 @@ static kstat_t *zil_ksp; int zil_replay_disable = 0; /* - * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to - * the disk(s) by the ZIL after an LWB write has completed. Setting this - * will cause ZIL corruption on power loss if a volatile out-of-order - * write cache is enabled. + * Disable the flush commands that are normally sent to the disk(s) by the ZIL + * after an LWB write has completed. Setting this will cause ZIL corruption on + * power loss if a volatile out-of-order write cache is enabled. */ -int zil_nocacheflush = 0; +static int zil_nocacheflush = 0; /* * Limit SLOG write size per commit executed with synchronous priority. 
* Any writes above that will be executed with lower (asynchronous) priority * to limit potential SLOG device abuse by single active ZIL writer. */ -unsigned long zil_slog_bulk = 768 * 1024; +static uint64_t zil_slog_bulk = 64 * 1024 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; -#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ - sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) +static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); +static itx_t *zil_itx_clone(itx_t *oitx); +static uint64_t zil_max_waste_space(zilog_t *zilog); static int zil_bp_compare(const void *x1, const void *x2) @@ -213,16 +220,30 @@ zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) zc->zc_word[ZIL_ZC_SEQ] = 1ULL; } +static int +zil_kstats_global_update(kstat_t *ksp, int rw) +{ + zil_kstat_values_t *zs = ksp->ks_data; + ASSERT3P(&zil_stats, ==, zs); + + if (rw == KSTAT_WRITE) { + return (SET_ERROR(EACCES)); + } + + zil_kstat_values_update(zs, &zil_sums_global); + + return (0); +} + /* * Read a log block and make sure it's valid. */ static int zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, - blkptr_t *nbp, void *dst, char **end) + blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf) { - enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; + zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; @@ -239,7 +260,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, - &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); + abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { zio_cksum_t cksum = bp->blk_cksum; @@ -254,39 +275,35 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, */ cksum.zc_word[ZIL_ZC_SEQ]++; + uint64_t size = BP_GET_LSIZE(bp); if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = abuf->b_data; + zil_chain_t *zilc = (*abuf)->b_data; char *lr = (char *)(zilc + 1); - uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); - if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { + if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, + sizeof (cksum)) || + zilc->zc_nused < sizeof (*zilc) || + zilc->zc_nused > size) { error = SET_ERROR(ECKSUM); } else { - ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE); - bcopy(lr, dst, len); - *end = (char *)dst + len; + *begin = lr; + *end = lr + zilc->zc_nused - sizeof (*zilc); *nbp = zilc->zc_next_blk; } } else { - char *lr = abuf->b_data; - uint64_t size = BP_GET_LSIZE(bp); + char *lr = (*abuf)->b_data; zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; - if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || + if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, + sizeof (cksum)) || (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { - ASSERT3U(zilc->zc_nused, <=, - SPA_OLD_MAXBLOCKSIZE); - bcopy(lr, dst, zilc->zc_nused); - *end = (char *)dst + zilc->zc_nused; + *begin = lr; + *end = lr + zilc->zc_nused; *nbp = zilc->zc_next_blk; } } - - arc_buf_destroy(abuf, &abuf); } return (error); @@ -298,7 +315,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, static int zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) { - enum zio_flag zio_flags = 
ZIO_FLAG_CANFAIL; + zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; const blkptr_t *bp = &lr->lr_blkptr; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; @@ -307,7 +324,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) if (BP_IS_HOLE(bp)) { if (wbuf != NULL) - bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length)); + memset(wbuf, 0, MAX(BP_GET_LSIZE(bp), lr->lr_length)); return (0); } @@ -322,6 +339,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) if (wbuf == NULL) zio_flags |= ZIO_FLAG_RAW; + ASSERT3U(BP_GET_LSIZE(bp), !=, 0); SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); @@ -330,13 +348,96 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) if (error == 0) { if (wbuf != NULL) - bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); + memcpy(wbuf, abuf->b_data, arc_buf_size(abuf)); arc_buf_destroy(abuf, &abuf); } return (error); } +void +zil_sums_init(zil_sums_t *zs) +{ + wmsum_init(&zs->zil_commit_count, 0); + wmsum_init(&zs->zil_commit_writer_count, 0); + wmsum_init(&zs->zil_itx_count, 0); + wmsum_init(&zs->zil_itx_indirect_count, 0); + wmsum_init(&zs->zil_itx_indirect_bytes, 0); + wmsum_init(&zs->zil_itx_copied_count, 0); + wmsum_init(&zs->zil_itx_copied_bytes, 0); + wmsum_init(&zs->zil_itx_needcopy_count, 0); + wmsum_init(&zs->zil_itx_needcopy_bytes, 0); + wmsum_init(&zs->zil_itx_metaslab_normal_count, 0); + wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0); + wmsum_init(&zs->zil_itx_metaslab_normal_write, 0); + wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0); + wmsum_init(&zs->zil_itx_metaslab_slog_count, 0); + wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0); + wmsum_init(&zs->zil_itx_metaslab_slog_write, 0); + wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0); +} + +void +zil_sums_fini(zil_sums_t *zs) +{ + wmsum_fini(&zs->zil_commit_count); + wmsum_fini(&zs->zil_commit_writer_count); + wmsum_fini(&zs->zil_itx_count); + wmsum_fini(&zs->zil_itx_indirect_count); + wmsum_fini(&zs->zil_itx_indirect_bytes); + wmsum_fini(&zs->zil_itx_copied_count); + wmsum_fini(&zs->zil_itx_copied_bytes); + wmsum_fini(&zs->zil_itx_needcopy_count); + wmsum_fini(&zs->zil_itx_needcopy_bytes); + wmsum_fini(&zs->zil_itx_metaslab_normal_count); + wmsum_fini(&zs->zil_itx_metaslab_normal_bytes); + wmsum_fini(&zs->zil_itx_metaslab_normal_write); + wmsum_fini(&zs->zil_itx_metaslab_normal_alloc); + wmsum_fini(&zs->zil_itx_metaslab_slog_count); + wmsum_fini(&zs->zil_itx_metaslab_slog_bytes); + wmsum_fini(&zs->zil_itx_metaslab_slog_write); + wmsum_fini(&zs->zil_itx_metaslab_slog_alloc); +} + +void +zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums) +{ + zs->zil_commit_count.value.ui64 = + wmsum_value(&zil_sums->zil_commit_count); + zs->zil_commit_writer_count.value.ui64 = + wmsum_value(&zil_sums->zil_commit_writer_count); + zs->zil_itx_count.value.ui64 = + wmsum_value(&zil_sums->zil_itx_count); + zs->zil_itx_indirect_count.value.ui64 = + wmsum_value(&zil_sums->zil_itx_indirect_count); + zs->zil_itx_indirect_bytes.value.ui64 = + wmsum_value(&zil_sums->zil_itx_indirect_bytes); + zs->zil_itx_copied_count.value.ui64 = + wmsum_value(&zil_sums->zil_itx_copied_count); + zs->zil_itx_copied_bytes.value.ui64 = + wmsum_value(&zil_sums->zil_itx_copied_bytes); + zs->zil_itx_needcopy_count.value.ui64 = + wmsum_value(&zil_sums->zil_itx_needcopy_count); + zs->zil_itx_needcopy_bytes.value.ui64 = + wmsum_value(&zil_sums->zil_itx_needcopy_bytes); + zs->zil_itx_metaslab_normal_count.value.ui64 = + 
wmsum_value(&zil_sums->zil_itx_metaslab_normal_count); + zs->zil_itx_metaslab_normal_bytes.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes); + zs->zil_itx_metaslab_normal_write.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_normal_write); + zs->zil_itx_metaslab_normal_alloc.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc); + zs->zil_itx_metaslab_slog_count.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_slog_count); + zs->zil_itx_metaslab_slog_bytes.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes); + zs->zil_itx_metaslab_slog_write.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_slog_write); + zs->zil_itx_metaslab_slog_alloc.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc); +} + /* * Parse the intent log, and call parse_func for each valid record within. */ @@ -353,12 +454,9 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, uint64_t max_lr_seq = 0; uint64_t blk_count = 0; uint64_t lr_count = 0; - blkptr_t blk, next_blk; - char *lrbuf, *lrp; + blkptr_t blk, next_blk = {{{{0}}}}; int error = 0; - bzero(&next_blk, sizeof (blkptr_t)); - /* * Old logs didn't record the maximum zh_claim_lr_seq. */ @@ -374,13 +472,13 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, * If the log has been claimed, stop if we encounter a sequence * number greater than the highest claimed sequence number. */ - lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); zil_bp_tree_init(zilog); for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; int reclen; - char *end = NULL; + char *lrp, *end; + arc_buf_t *abuf = NULL; if (blk_seq > claim_blk_seq) break; @@ -396,24 +494,42 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, break; error = zil_read_log_block(zilog, decrypt, &blk, &next_blk, - lrbuf, &end); - if (error != 0) + &lrp, &end, &abuf); + if (error != 0) { + if (abuf) + arc_buf_destroy(abuf, &abuf); + if (claimed) { + char name[ZFS_MAX_DATASET_NAME_LEN]; + + dmu_objset_name(zilog->zl_os, name); + + cmn_err(CE_WARN, "ZFS read log block error %d, " + "dataset %s, seq 0x%llx\n", error, name, + (u_longlong_t)blk_seq); + } break; + } - for (lrp = lrbuf; lrp < end; lrp += reclen) { + for (; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); - if (lr->lrc_seq > claim_lr_seq) + ASSERT3U(reclen, <=, end - lrp); + if (lr->lrc_seq > claim_lr_seq) { + arc_buf_destroy(abuf, &abuf); goto done; + } error = parse_lr_func(zilog, lr, arg, txg); - if (error != 0) + if (error != 0) { + arc_buf_destroy(abuf, &abuf); goto done; + } ASSERT3U(max_lr_seq, <, lr->lrc_seq); max_lr_seq = lr->lrc_seq; lr_count++; } + arc_buf_destroy(abuf, &abuf); } done: zilog->zl_parse_error = error; @@ -422,21 +538,16 @@ done: zilog->zl_parse_blk_count = blk_count; zilog->zl_parse_lr_count = lr_count; - ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || - (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq) || - (decrypt && error == EIO)); - zil_bp_tree_fini(zilog); - zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE); return (error); } -/* ARGSUSED */ static int zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t first_txg) { + (void) tx; ASSERT(!BP_IS_HOLE(bp)); /* @@ -445,7 +556,7 @@ zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, * that we rewind to is invalid. Thus, we return -1 so * zil_parse() doesn't attempt to read it. 
*/ - if (bp->blk_birth >= first_txg) + if (BP_GET_LOGICAL_BIRTH(bp) >= first_txg) return (-1); if (zil_bp_tree_add(zilog, bp) != 0) @@ -455,11 +566,11 @@ zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, return (0); } -/* ARGSUSED */ static int zil_noop_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { + (void) zilog, (void) lrc, (void) tx, (void) first_txg; return (0); } @@ -471,7 +582,7 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, * Claim log block if not already committed and not already claimed. * If tx == NULL, just verify that the block is claimable. */ - if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || + if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) < first_txg || zil_bp_tree_add(zilog, bp) != 0) return (0); @@ -481,14 +592,12 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, } static int -zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, - uint64_t first_txg) +zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { lr_write_t *lr = (lr_write_t *)lrc; int error; - if (lrc->lrc_txtype != TX_WRITE) - return (0); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); /* * If the block is not readable, don't claim it. This can happen @@ -498,7 +607,7 @@ zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, * waited for all writes to be stable first), so it is semantically * correct to declare this the end of the log. */ - if (lr->lr_blkptr.blk_birth >= first_txg) { + if (BP_GET_LOGICAL_BIRTH(&lr->lr_blkptr) >= first_txg) { error = zil_read_log_data(zilog, lr, NULL); if (error != 0) return (error); @@ -507,35 +616,156 @@ zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); } -/* ARGSUSED */ +static int +zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx, + uint64_t first_txg) +{ + const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; + const blkptr_t *bp; + spa_t *spa = zilog->zl_spa; + uint_t ii; + + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + + if (tx == NULL) { + return (0); + } + + /* + * XXX: Do we need to byteswap lr? + */ + + for (ii = 0; ii < lr->lr_nbps; ii++) { + bp = &lr->lr_bps[ii]; + + /* + * When data is embedded into the BP there is no need to create + * BRT entry as there is no data block. Just copy the BP as it + * contains the data. + */ + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + continue; + + /* + * We can not handle block pointers from the future, since they + * are not yet allocated. It should not normally happen, but + * just in case lets be safe and just stop here now instead of + * corrupting the pool. + */ + if (BP_GET_BIRTH(bp) >= first_txg) + return (SET_ERROR(ENOENT)); + + /* + * Assert the block is really allocated before we reference it. 
+ */ + metaslab_check_free(spa, bp); + } + + for (ii = 0; ii < lr->lr_nbps; ii++) { + bp = &lr->lr_bps[ii]; + if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) + brt_pending_add(spa, bp, tx); + } + + return (0); +} + +static int +zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, + uint64_t first_txg) +{ + + switch (lrc->lrc_txtype) { + case TX_WRITE: + return (zil_claim_write(zilog, lrc, tx, first_txg)); + case TX_CLONE_RANGE: + return (zil_claim_clone_range(zilog, lrc, tx, first_txg)); + default: + return (0); + } +} + static int zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t claim_txg) { + (void) claim_txg; + zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); return (0); } static int -zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, - uint64_t claim_txg) +zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); + /* * If we previously claimed it, we need to free it. */ - if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE && - bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && - !BP_IS_HOLE(bp)) + if (BP_GET_LOGICAL_BIRTH(bp) >= claim_txg && + zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) { zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); + } return (0); } static int +zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) +{ + const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; + const blkptr_t *bp; + spa_t *spa; + uint_t ii; + + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + + if (tx == NULL) { + return (0); + } + + spa = zilog->zl_spa; + + for (ii = 0; ii < lr->lr_nbps; ii++) { + bp = &lr->lr_bps[ii]; + + if (!BP_IS_HOLE(bp)) { + zio_free(spa, dmu_tx_get_txg(tx), bp); + } + } + + return (0); +} + +static int +zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, + uint64_t claim_txg) +{ + + if (claim_txg == 0) { + return (0); + } + + switch (lrc->lrc_txtype) { + case TX_WRITE: + return (zil_free_write(zilog, lrc, tx, claim_txg)); + case TX_CLONE_RANGE: + return (zil_free_clone_range(zilog, lrc, tx)); + default: + return (0); + } +} + +static int zil_lwb_vdev_compare(const void *x1, const void *x2) { const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; @@ -544,41 +774,54 @@ zil_lwb_vdev_compare(const void *x1, const void *x2) return (TREE_CMP(v1, v2)); } +/* + * Allocate a new lwb. We may already have a block pointer for it, in which + * case we get size and version from there. Or we may not yet, in which case + * we choose them here and later make the block allocation match. 
+ */ static lwb_t * -zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg, - boolean_t fastwrite) +zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog, + uint64_t txg, lwb_state_t state) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); lwb->lwb_zilog = zilog; - lwb->lwb_blk = *bp; - lwb->lwb_fastwrite = fastwrite; + if (bp) { + lwb->lwb_blk = *bp; + lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2); + sz = BP_GET_LSIZE(bp); + } else { + BP_ZERO(&lwb->lwb_blk); + lwb->lwb_slim = (spa_version(zilog->zl_spa) >= + SPA_VERSION_SLIM_ZIL); + } lwb->lwb_slog = slog; - lwb->lwb_state = LWB_STATE_CLOSED; - lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); - lwb->lwb_max_txg = txg; + lwb->lwb_error = 0; + if (lwb->lwb_slim) { + lwb->lwb_nmax = sz; + lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); + } else { + lwb->lwb_nmax = sz - sizeof (zil_chain_t); + lwb->lwb_nused = lwb->lwb_nfilled = 0; + } + lwb->lwb_sz = sz; + lwb->lwb_state = state; + lwb->lwb_buf = zio_buf_alloc(sz); + lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; - lwb->lwb_tx = NULL; lwb->lwb_issued_timestamp = 0; - if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - lwb->lwb_nused = sizeof (zil_chain_t); - lwb->lwb_sz = BP_GET_LSIZE(bp); - } else { - lwb->lwb_nused = 0; - lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); - } + lwb->lwb_issued_txg = 0; + lwb->lwb_alloc_txg = txg; + lwb->lwb_max_txg = 0; mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); + if (state != LWB_STATE_NEW) + zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); - ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); - ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); - VERIFY(list_is_empty(&lwb->lwb_waiters)); - VERIFY(list_is_empty(&lwb->lwb_itxs)); - return (lwb); } @@ -586,15 +829,17 @@ static void zil_free_lwb(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); - ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); - VERIFY(list_is_empty(&lwb->lwb_waiters)); - VERIFY(list_is_empty(&lwb->lwb_itxs)); - ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + ASSERT(lwb->lwb_state == LWB_STATE_NEW || + lwb->lwb_state == LWB_STATE_FLUSH_DONE); + ASSERT3P(lwb->lwb_child_zio, ==, NULL); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); + ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa)); ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); - ASSERT(lwb->lwb_state == LWB_STATE_CLOSED || - lwb->lwb_state == LWB_STATE_FLUSH_DONE); + VERIFY(list_is_empty(&lwb->lwb_itxs)); + VERIFY(list_is_empty(&lwb->lwb_waiters)); + ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); /* * Clear the zilog's field to indicate this lwb is no longer @@ -663,6 +908,36 @@ zilog_is_dirty(zilog_t *zilog) } /* + * Its called in zil_commit context (zil_process_commit_list()/zil_create()). + * It activates SPA_FEATURE_ZILSAXATTR feature, if its enabled. + * Check dsl_dataset_feature_is_active to avoid txg_wait_synced() on every + * zil_commit. 
+ */ +static void +zil_commit_activate_saxattr_feature(zilog_t *zilog) +{ + dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); + uint64_t txg = 0; + dmu_tx_t *tx = NULL; + + if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && + dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL && + !dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) { + tx = dmu_tx_create(zilog->zl_os); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + dsl_dataset_dirty(ds, tx); + txg = dmu_tx_get_txg(tx); + + mutex_enter(&ds->ds_lock); + ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = + (void *)B_TRUE; + mutex_exit(&ds->ds_lock); + dmu_tx_commit(tx); + txg_wait_synced(zilog->zl_dmu_pool, txg); + } +} + +/* * Create an on-disk intent log. */ static lwb_t * @@ -674,8 +949,9 @@ zil_create(zilog_t *zilog) dmu_tx_t *tx = NULL; blkptr_t blk; int error = 0; - boolean_t fastwrite = FALSE; boolean_t slog = FALSE; + dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); + /* * Wait for any previous destroy to complete. @@ -705,8 +981,6 @@ zil_create(zilog_t *zilog) error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk, ZIL_MIN_BLKSZ, &slog); - fastwrite = TRUE; - if (error == 0) zil_init_log_chain(zilog, &blk); } @@ -715,7 +989,7 @@ zil_create(zilog_t *zilog) * Allocate a log write block (lwb) for the first log block. */ if (error == 0) - lwb = zil_alloc_lwb(zilog, &blk, slog, txg, fastwrite); + lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW); /* * If we just allocated the first log block, commit our transaction @@ -723,11 +997,35 @@ zil_create(zilog_t *zilog) * (zh is part of the MOS, so we cannot modify it in open context.) */ if (tx != NULL) { + /* + * If "zilsaxattr" feature is enabled on zpool, then activate + * it now when we're creating the ZIL chain. We can't wait with + * this until we write the first xattr log record because we + * need to wait for the feature activation to sync out. + */ + if (spa_feature_is_enabled(zilog->zl_spa, + SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != + DMU_OST_ZVOL) { + mutex_enter(&ds->ds_lock); + ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = + (void *)B_TRUE; + mutex_exit(&ds->ds_lock); + } + dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); + } else { + /* + * This branch covers the case where we enable the feature on a + * zpool that has existing ZIL headers. + */ + zil_commit_activate_saxattr_feature(zilog); } + IMPLY(spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && + dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL, + dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)); - ASSERT(error != 0 || bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); + ASSERT(error != 0 || memcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); IMPLY(error == 0, lwb != NULL); return (lwb); @@ -741,8 +1039,9 @@ zil_create(zilog_t *zilog) * txg_wait_synced() here either when keep_first is set, because both * zil_create() and zil_destroy() will wait for any in-progress destroys * to complete. + * Return B_TRUE if there were any entries to replay. 
*/ -void +boolean_t zil_destroy(zilog_t *zilog, boolean_t keep_first) { const zil_header_t *zh = zilog->zl_header; @@ -758,7 +1057,7 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) zilog->zl_old_header = *zh; /* debugging aid */ if (BP_IS_HOLE(&zh->zh_log)) - return; + return (B_FALSE); tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); @@ -774,15 +1073,11 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) if (!list_is_empty(&zilog->zl_lwb_list)) { ASSERT(zh->zh_claim_txg == 0); VERIFY(!keep_first); - while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { - if (lwb->lwb_fastwrite) - metaslab_fastwrite_unmark(zilog->zl_spa, - &lwb->lwb_blk); - - list_remove(&zilog->zl_lwb_list, lwb); + while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) { if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); - zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); + if (!BP_IS_HOLE(&lwb->lwb_blk)) + zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); } } else if (!keep_first) { @@ -791,6 +1086,8 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) mutex_exit(&zilog->zl_lock); dmu_tx_commit(tx); + + return (B_TRUE); } void @@ -911,10 +1208,10 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) * Checksum errors are ok as they indicate the end of the chain. * Any other error (no device or read failure) returns an error. */ -/* ARGSUSED */ int zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) { + (void) dp; zilog_t *zilog; objset_t *os; blkptr_t *bp; @@ -1008,21 +1305,21 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) { /* * The lwb_waiters field of the lwb is protected by the zilog's - * zl_lock, thus it must be held when calling this function. + * zl_issuer_lock while the lwb is open and zl_lock otherwise. + * zl_issuer_lock also protects leaving the open state. + * zcw_lwb setting is protected by zl_issuer_lock and state != + * flush_done, which transition is protected by zl_lock. 
*/ - ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); + ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock)); + IMPLY(lwb->lwb_state != LWB_STATE_OPENED, + MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - mutex_enter(&zcw->zcw_lock); ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); - ASSERT3P(lwb, !=, NULL); - ASSERT(lwb->lwb_state == LWB_STATE_OPENED || - lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE); - list_insert_tail(&lwb->lwb_waiters, zcw); + ASSERT3P(zcw->zcw_lwb, ==, NULL); zcw->zcw_lwb = lwb; - mutex_exit(&zcw->zcw_lock); } /* @@ -1033,11 +1330,9 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) static void zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) { - mutex_enter(&zcw->zcw_lock); ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); list_insert_tail(nolwb, zcw); - mutex_exit(&zcw->zcw_lock); + ASSERT3P(zcw->zcw_lwb, ==, NULL); } void @@ -1049,6 +1344,9 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) int ndvas = BP_GET_NDVAS(bp); int i; + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); + if (zil_nocacheflush) return; @@ -1107,48 +1405,33 @@ zil_lwb_add_txg(lwb_t *lwb, uint64_t txg) } /* - * This function is a called after all vdevs associated with a given lwb - * write have completed their DKIOCFLUSHWRITECACHE command; or as soon - * as the lwb write completes, if "zil_nocacheflush" is set. Further, - * all "previous" lwb's will have completed before this function is - * called; i.e. this function is called for all previous lwbs before - * it's called for "this" lwb (enforced via zio the dependencies - * configured in zil_lwb_set_zio_dependency()). + * This function is a called after all vdevs associated with a given lwb write + * have completed their flush command; or as soon as the lwb write completes, + * if "zil_nocacheflush" is set. Further, all "previous" lwb's will have + * completed before this function is called; i.e. this function is called for + * all previous lwbs before it's called for "this" lwb (enforced via zio the + * dependencies configured in zil_lwb_set_zio_dependency()). * - * The intention is for this function to be called as soon as the - * contents of an lwb are considered "stable" on disk, and will survive - * any sudden loss of power. At this point, any threads waiting for the - * lwb to reach this state are signalled, and the "waiter" structures - * are marked "done". + * The intention is for this function to be called as soon as the contents of + * an lwb are considered "stable" on disk, and will survive any sudden loss of + * power. At this point, any threads waiting for the lwb to reach this state + * are signalled, and the "waiter" structures are marked "done". */ static void zil_lwb_flush_vdevs_done(zio_t *zio) { lwb_t *lwb = zio->io_private; zilog_t *zilog = lwb->lwb_zilog; - dmu_tx_t *tx = lwb->lwb_tx; zil_commit_waiter_t *zcw; itx_t *itx; spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp; mutex_enter(&zilog->zl_lock); - /* - * Ensure the lwb buffer pointer is cleared before releasing the - * txg. 
If we have had an allocation failure and the txg is - * waiting to sync then we want zil_sync() to remove the lwb so - * that it's not picked up as the next new one in - * zil_process_commit_list(). zil_sync() will only remove the - * lwb if lwb_buf is null. - */ - lwb->lwb_buf = NULL; - lwb->lwb_tx = NULL; - - ASSERT3U(lwb->lwb_issued_timestamp, >, 0); - zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp; + zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 7 + t) / 8; lwb->lwb_root_zio = NULL; @@ -1165,17 +1448,12 @@ zil_lwb_flush_vdevs_done(zio_t *zio) zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } - while ((itx = list_head(&lwb->lwb_itxs)) != NULL) { - list_remove(&lwb->lwb_itxs, itx); + while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) zil_itx_destroy(itx); - } - while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) { + while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { mutex_enter(&zcw->zcw_lock); - ASSERT(list_link_active(&zcw->zcw_node)); - list_remove(&lwb->lwb_waiters, zcw); - ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; /* @@ -1202,28 +1480,65 @@ zil_lwb_flush_vdevs_done(zio_t *zio) mutex_exit(&zcw->zcw_lock); } + uint64_t txg = lwb->lwb_issued_txg; + + /* Once we drop the lock, lwb may be freed by zil_sync(). */ mutex_exit(&zilog->zl_lock); - /* - * Now that we've written this log block, we have a stable pointer - * to the next block in the chain, so it's OK to let the txg in - * which we allocated the next block sync. - */ - dmu_tx_commit(tx); + mutex_enter(&zilog->zl_lwb_io_lock); + ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0); + zilog->zl_lwb_inflight[txg & TXG_MASK]--; + if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0) + cv_broadcast(&zilog->zl_lwb_io_cv); + mutex_exit(&zilog->zl_lwb_io_lock); } /* - * This is called when an lwb's write zio completes. The callback's - * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs - * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved - * in writing out this specific lwb's data, and in the case that cache - * flushes have been deferred, vdevs involved in writing the data for - * previous lwbs. The writes corresponding to all the vdevs in the - * lwb_vdev_tree will have completed by the time this is called, due to - * the zio dependencies configured in zil_lwb_set_zio_dependency(), - * which takes deferred flushes into account. The lwb will be "done" - * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio - * completion callback for the lwb's root zio. + * Wait for the completion of all issued write/flush of that txg provided. + * It guarantees zil_lwb_flush_vdevs_done() is called and returned. 
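The zl_lwb_inflight/zl_lwb_io_cv scheme used by zil_lwb_flush_vdevs_done() and zil_lwb_flush_wait_all() is a plain counter-drain pattern: completions decrement the per-txg counter under a lock and broadcast, while the waiter sleeps until the count reaches zero. A standalone pthreads sketch of the same idea (the names here are illustrative, not OpenZFS symbols):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t io_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t io_cv = PTHREAD_COND_INITIALIZER;
static int inflight = 4;			/* issued but not yet flushed */

static void *
io_done(void *arg)
{
	(void) arg;
	usleep(1000);				/* pretend the flush took time */
	pthread_mutex_lock(&io_lock);
	if (--inflight == 0)
		pthread_cond_broadcast(&io_cv);	/* cv_broadcast() analogue */
	pthread_mutex_unlock(&io_lock);
	return (NULL);
}

int
main(void)
{
	pthread_t tid[4];

	for (int i = 0; i < 4; i++)
		pthread_create(&tid[i], NULL, io_done, NULL);

	pthread_mutex_lock(&io_lock);
	while (inflight > 0)			/* cv_wait() analogue */
		pthread_cond_wait(&io_cv, &io_lock);
	pthread_mutex_unlock(&io_lock);
	printf("all in-flight lwb writes/flushes drained\n");

	for (int i = 0; i < 4; i++)
		pthread_join(tid[i], NULL);
	return (0);
}

Build with cc -pthread; the kernel version additionally indexes the counter by txg & TXG_MASK so that each syncing txg drains independently.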
+ */ +static void +zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg) +{ + ASSERT3U(txg, ==, spa_syncing_txg(zilog->zl_spa)); + + mutex_enter(&zilog->zl_lwb_io_lock); + while (zilog->zl_lwb_inflight[txg & TXG_MASK] > 0) + cv_wait(&zilog->zl_lwb_io_cv, &zilog->zl_lwb_io_lock); + mutex_exit(&zilog->zl_lwb_io_lock); + +#ifdef ZFS_DEBUG + mutex_enter(&zilog->zl_lock); + mutex_enter(&zilog->zl_lwb_io_lock); + lwb_t *lwb = list_head(&zilog->zl_lwb_list); + while (lwb != NULL) { + if (lwb->lwb_issued_txg <= txg) { + ASSERT(lwb->lwb_state != LWB_STATE_ISSUED); + ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE); + IMPLY(lwb->lwb_issued_txg > 0, + lwb->lwb_state == LWB_STATE_FLUSH_DONE); + } + IMPLY(lwb->lwb_state == LWB_STATE_WRITE_DONE || + lwb->lwb_state == LWB_STATE_FLUSH_DONE, + lwb->lwb_buf == NULL); + lwb = list_next(&zilog->zl_lwb_list, lwb); + } + mutex_exit(&zilog->zl_lwb_io_lock); + mutex_exit(&zilog->zl_lock); +#endif +} + +/* + * This is called when an lwb's write zio completes. The callback's purpose is + * to issue the flush commands for the vdevs in the lwb's lwb_vdev_tree. The + * tree will contain the vdevs involved in writing out this specific lwb's + * data, and in the case that cache flushes have been deferred, vdevs involved + * in writing the data for previous lwbs. The writes corresponding to all the + * vdevs in the lwb_vdev_tree will have completed by the time this is called, + * due to the zio dependencies configured in zil_lwb_set_zio_dependency(), + * which takes deferred flushes into account. The lwb will be "done" once + * zil_lwb_flush_vdevs_done() is called, which occurs in the zio completion + * callback for the lwb's root zio. */ static void zil_lwb_write_done(zio_t *zio) @@ -1238,22 +1553,25 @@ zil_lwb_write_done(zio_t *zio) ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); - ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); - ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); - ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); - ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); - ASSERT(!BP_IS_GANG(zio->io_bp)); - ASSERT(!BP_IS_HOLE(zio->io_bp)); - ASSERT(BP_GET_FILL(zio->io_bp) == 0); - abd_free(zio->io_abd); + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + lwb->lwb_buf = NULL; mutex_enter(&zilog->zl_lock); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); lwb->lwb_state = LWB_STATE_WRITE_DONE; + lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; - lwb->lwb_fastwrite = FALSE; + + /* + * If nlwb is not yet issued, zil_lwb_set_zio_dependency() is not + * called for it yet, and when it will be, it won't be able to make + * its write ZIO a parent this ZIO. In such case we can not defer + * our flushes or below may be a race between the done callbacks. + */ nlwb = list_next(&zilog->zl_lwb_list, lwb); + if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED) + nlwb = NULL; mutex_exit(&zilog->zl_lock); if (avl_numnodes(t) == 0) @@ -1279,21 +1597,20 @@ zil_lwb_write_done(zio_t *zio) } /* - * If this lwb does not have any threads waiting for it to - * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE - * command to the vdevs written to by "this" lwb, and instead - * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE - * command for those vdevs. Thus, we merge the vdev tree of - * "this" lwb with the vdev tree of the "next" lwb in the list, - * and assume the "next" lwb will handle flushing the vdevs (or - * deferring the flush(s) again). 
+ * If this lwb does not have any threads waiting for it to complete, we + * want to defer issuing the flush command to the vdevs written to by + * "this" lwb, and instead rely on the "next" lwb to handle the flush + * command for those vdevs. Thus, we merge the vdev tree of "this" lwb + * with the vdev tree of the "next" lwb in the list, and assume the + * "next" lwb will handle flushing the vdevs (or deferring the flush(s) + * again). * - * This is a useful performance optimization, especially for - * workloads with lots of async write activity and few sync - * write and/or fsync activity, as it has the potential to - * coalesce multiple flush commands to a vdev into one. + * This is a useful performance optimization, especially for workloads + * with lots of async write activity and few sync write and/or fsync + * activity, as it has the potential to coalesce multiple flush + * commands to a vdev into one. */ - if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) { + if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) { zil_lwb_flush_defer(lwb, nlwb); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); return; @@ -1316,338 +1633,487 @@ zil_lwb_write_done(zio_t *zio) } } +/* + * Build the zio dependency chain, which is used to preserve the ordering of + * lwb completions that is required by the semantics of the ZIL. Each new lwb + * zio becomes a parent of the previous lwb zio, such that the new lwb's zio + * cannot complete until the previous lwb's zio completes. + * + * This is required by the semantics of zil_commit(): the commit waiters + * attached to the lwbs will be woken in the lwb zio's completion callback, + * so this zio dependency graph ensures the waiters are woken in the correct + * order (the same order the lwbs were created). + */ static void zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) { - lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(MUTEX_HELD(&zilog->zl_lock)); + lwb_t *prev_lwb = list_prev(&zilog->zl_lwb_list, lwb); + if (prev_lwb == NULL || + prev_lwb->lwb_state == LWB_STATE_FLUSH_DONE) + return; + /* - * The zilog's "zl_last_lwb_opened" field is used to build the - * lwb/zio dependency chain, which is used to preserve the - * ordering of lwb completions that is required by the semantics - * of the ZIL. Each new lwb zio becomes a parent of the - * "previous" lwb zio, such that the new lwb's zio cannot - * complete until the "previous" lwb's zio completes. + * If the previous lwb's write hasn't already completed, we also want + * to order the completion of the lwb write zios (above, we only order + * the completion of the lwb root zios). This is required because of + * how we can defer the flush commands for each lwb. * - * This is required by the semantics of zil_commit(); the commit - * waiters attached to the lwbs will be woken in the lwb zio's - * completion callback, so this zio dependency graph ensures the - * waiters are woken in the correct order (the same order the - * lwbs were created). 
- */ - if (last_lwb_opened != NULL && - last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) { - ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || - last_lwb_opened->lwb_state == LWB_STATE_ISSUED || - last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE); - - ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL); - zio_add_child(lwb->lwb_root_zio, - last_lwb_opened->lwb_root_zio); - - /* - * If the previous lwb's write hasn't already completed, - * we also want to order the completion of the lwb write - * zios (above, we only order the completion of the lwb - * root zios). This is required because of how we can - * defer the DKIOCFLUSHWRITECACHE commands for each lwb. - * - * When the DKIOCFLUSHWRITECACHE commands are deferred, - * the previous lwb will rely on this lwb to flush the - * vdevs written to by that previous lwb. Thus, we need - * to ensure this lwb doesn't issue the flush until - * after the previous lwb's write completes. We ensure - * this ordering by setting the zio parent/child - * relationship here. - * - * Without this relationship on the lwb's write zio, - * it's possible for this lwb's write to complete prior - * to the previous lwb's write completing; and thus, the - * vdevs for the previous lwb would be flushed prior to - * that lwb's data being written to those vdevs (the - * vdevs are flushed in the lwb write zio's completion - * handler, zil_lwb_write_done()). - */ - if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) { - ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || - last_lwb_opened->lwb_state == LWB_STATE_ISSUED); - - ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL); - zio_add_child(lwb->lwb_write_zio, - last_lwb_opened->lwb_write_zio); - } + * When the flush commands are deferred, the previous lwb will rely on + * this lwb to flush the vdevs written to by that previous lwb. Thus, + * we need to ensure this lwb doesn't issue the flush until after the + * previous lwb's write completes. We ensure this ordering by setting + * the zio parent/child relationship here. + * + * Without this relationship on the lwb's write zio, it's possible for + * this lwb's write to complete prior to the previous lwb's write + * completing; and thus, the vdevs for the previous lwb would be + * flushed prior to that lwb's data being written to those vdevs (the + * vdevs are flushed in the lwb write zio's completion handler, + * zil_lwb_write_done()). + */ + if (prev_lwb->lwb_state == LWB_STATE_ISSUED) { + ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL); + zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio); + } else { + ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); } + + ASSERT3P(prev_lwb->lwb_root_zio, !=, NULL); + zio_add_child(lwb->lwb_root_zio, prev_lwb->lwb_root_zio); } /* * This function's purpose is to "open" an lwb such that it is ready to - * accept new itxs being committed to it. To do this, the lwb's zio - * structures are created, and linked to the lwb. This function is - * idempotent; if the passed in lwb has already been opened, this - * function is essentially a no-op. + * accept new itxs being committed to it. This function is idempotent; if + * the passed in lwb has already been opened, it is essentially a no-op. 
*/ static void zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) { - zbookmark_phys_t zb; - zio_priority_t prio; - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT3P(lwb, !=, NULL); - EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); - EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); - SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], - ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, - lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); + if (lwb->lwb_state != LWB_STATE_NEW) { + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + return; + } - /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ mutex_enter(&zilog->zl_lock); - if (lwb->lwb_root_zio == NULL) { - abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, - BP_GET_LSIZE(&lwb->lwb_blk)); + lwb->lwb_state = LWB_STATE_OPENED; + zilog->zl_last_lwb_opened = lwb; + mutex_exit(&zilog->zl_lock); +} - if (!lwb->lwb_fastwrite) { - metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); - lwb->lwb_fastwrite = 1; - } +/* + * Maximum block size used by the ZIL. This is picked up when the ZIL is + * initialized. Otherwise this should not be used directly; see + * zl_max_block_size instead. + */ +static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; - if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) - prio = ZIO_PRIORITY_SYNC_WRITE; - else - prio = ZIO_PRIORITY_ASYNC_WRITE; +/* + * Plan splitting of the provided burst size between several blocks. + */ +static uint_t +zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) +{ + uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t); - lwb->lwb_root_zio = zio_root(zilog->zl_spa, - zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); + if (size <= md) { + /* + * Small bursts are written as-is in one block. + */ + *minsize = size; + return (size); + } else if (size > 8 * md) { + /* + * Big bursts use maximum blocks. The first block size + * is hard to predict, but it does not really matter. + */ + *minsize = 0; + return (md); + } - lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, - zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, - BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, - prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); + /* + * Medium bursts try to divide evenly to better utilize several SLOG + * VDEVs. The first block size we predict assuming the worst case of + * maxing out others. Fall back to using maximum blocks if due to + * large records or wasted space we can not predict anything better. + */ + uint_t s = size; + uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t)); + uint_t chunk = DIV_ROUND_UP(s, n); + uint_t waste = zil_max_waste_space(zilog); + waste = MAX(waste, zilog->zl_cur_max); + if (chunk <= md - waste) { + *minsize = MAX(s - (md - waste) * (n - 1), waste); + return (chunk); + } else { + *minsize = 0; + return (md); + } +} - lwb->lwb_state = LWB_STATE_OPENED; +/* + * Try to predict next block size based on previous history. Make prediction + * sufficient for 7 of 8 previous bursts. Don't try to save if the saving is + * less then 50%, extra writes may cost more, but we don't want single spike + * to badly affect our predictions. + */ +static uint_t +zil_lwb_predict(zilog_t *zilog) +{ + uint_t m, o; - zil_lwb_set_zio_dependency(zilog, lwb); - zilog->zl_last_lwb_opened = lwb; + /* If we are in the middle of a burst, take it into account also. 
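The three cases in zil_lwb_plan() are easier to follow with concrete numbers. The standalone sketch below restates the same arithmetic; MAX_BLK, CHAIN_HDR and WRITE_HDR are placeholder values standing in for zl_max_block_size, sizeof (zil_chain_t) and sizeof (lr_write_t) rather than the real sizes, and zil_max_waste_space() is inlined as the 1/16th share defined later in this diff.

#include <stdio.h>

#define DIV_ROUND_UP(a, b)      (((a) + (b) - 1) / (b))
#define MAX_BLK         (128u * 1024u)  /* assumed zl_max_block_size */
#define CHAIN_HDR       128u            /* placeholder: sizeof (zil_chain_t) */
#define WRITE_HDR       192u            /* placeholder: sizeof (lr_write_t) */

static unsigned
plan(unsigned size, unsigned cur_max, unsigned *minsize)
{
        unsigned md = MAX_BLK - CHAIN_HDR;

        if (size <= md) {               /* small burst: written as-is */
                *minsize = size;
                return (size);
        }
        if (size > 8 * md) {            /* big burst: just use maximum blocks */
                *minsize = 0;
                return (md);
        }

        /* Medium burst: split evenly over the fewest blocks that can hold it. */
        unsigned n = DIV_ROUND_UP(size, md - WRITE_HDR);
        unsigned chunk = DIV_ROUND_UP(size, n);
        unsigned waste = (MAX_BLK - CHAIN_HDR - WRITE_HDR) / 16;
        if (waste < cur_max)
                waste = cur_max;        /* allow slack for the largest record */
        if (chunk <= md - waste) {
                unsigned first = size - (md - waste) * (n - 1);
                *minsize = (first > waste) ? first : waste;
                return (chunk);
        }
        *minsize = 0;
        return (md);
}

int
main(void)
{
        unsigned m;
        unsigned chunk = plan(300u * 1024u, 16u * 1024u, &m);

        printf("chunk=%u minsize=%u\n", chunk, m);      /* chunk=102400 minsize=78080 */
        return (0);
}

With a 128 KiB block limit, a roughly 300 KiB burst whose largest record so far is 16 KiB is planned as three equal 100 KiB chunks, and the caller is told the first block should hold at least 78080 bytes.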
*/ + if (zilog->zl_cur_size > 0) { + o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m); + } else { + o = UINT_MAX; + m = 0; } - mutex_exit(&zilog->zl_lock); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + /* Find minimum optimal size. We don't need to go below that. */ + for (int i = 0; i < ZIL_BURSTS; i++) + o = MIN(o, zilog->zl_prev_opt[i]); + + /* Find two biggest minimal first block sizes above the optimal. */ + uint_t m1 = MAX(m, o), m2 = o; + for (int i = 0; i < ZIL_BURSTS; i++) { + m = zilog->zl_prev_min[i]; + if (m >= m1) { + m2 = m1; + m1 = m; + } else if (m > m2) { + m2 = m; + } + } + + /* + * If second minimum size gives 50% saving -- use it. It may cost us + * one additional write later, but the space saving is just too big. + */ + return ((m1 < m2 * 2) ? m1 : m2); } /* - * Define a limited set of intent log block sizes. - * - * These must be a multiple of 4KB. Note only the amount used (again - * aligned to 4KB) actually gets written. However, we can't always just - * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. + * Close the log block for being issued and allocate the next one. + * Has to be called under zl_issuer_lock to chain more lwbs. */ -struct { - uint64_t limit; - uint64_t blksz; -} zil_block_buckets[] = { - { 4096, 4096 }, /* non TX_WRITE */ - { 8192 + 4096, 8192 + 4096 }, /* database */ - { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ - { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ - { 131072, 131072 }, /* < 128KB writes */ - { 131072 +4096, 65536 + 4096 }, /* 128KB writes */ - { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ -}; +static lwb_t * +zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) +{ + uint64_t blksz, plan, plan2; -/* - * Maximum block size used by the ZIL. This is picked up when the ZIL is - * initialized. Otherwise this should not be used directly; see - * zl_max_block_size instead. - */ -int zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; + ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + lwb->lwb_state = LWB_STATE_CLOSED; + + /* + * If there was an allocation failure then returned NULL will trigger + * zil_commit_writer_stall() at the caller. This is inherently racy, + * since allocation may not have happened yet. + */ + if (lwb->lwb_error != 0) + return (NULL); + + /* + * Log blocks are pre-allocated. Here we select the size of the next + * block, based on what's left of this burst and the previous history. + * While we try to only write used part of the block, we can't just + * always allocate the maximum block size because we can exhaust all + * available pool log space, so we try to be reasonable. + */ + if (zilog->zl_cur_left > 0) { + /* + * We are in the middle of a burst and know how much is left. + * But if workload is multi-threaded there may be more soon. + * Try to predict what can it be and plan for the worst case. + */ + uint_t m; + plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m); + if (zilog->zl_parallel) { + plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left + + zil_lwb_predict(zilog), &m); + if (plan < plan2) + plan = plan2; + } + } else { + /* + * The previous burst is done and we can only predict what + * will come next. 
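A worked example makes the "sufficient for 7 of 8 previous bursts" behaviour of zil_lwb_predict() concrete. The sketch below mirrors the selection logic above; ZIL_BURSTS is assumed here to be 8, and the two history arrays are made-up samples of the values zil_burst_done() (added later in this diff) would have recorded.

#include <limits.h>
#include <stdio.h>

#define NBURSTS 8       /* stands in for ZIL_BURSTS */

static unsigned
predict(const unsigned *prev_opt, const unsigned *prev_min,
    unsigned cur_opt, unsigned cur_min)
{
        unsigned o = cur_opt;   /* UINT_MAX when no burst is in progress */
        unsigned m = cur_min;
        unsigned m1, m2;

        /* Smallest recent "optimal" size: no need to predict below it. */
        for (int i = 0; i < NBURSTS; i++)
                if (prev_opt[i] < o)
                        o = prev_opt[i];

        /* Two largest minimal first-block sizes at or above that optimum. */
        m1 = (m > o) ? m : o;
        m2 = o;
        for (int i = 0; i < NBURSTS; i++) {
                m = prev_min[i];
                if (m >= m1) {
                        m2 = m1;
                        m1 = m;
                } else if (m > m2) {
                        m2 = m;
                }
        }

        /* Use the bigger candidate unless the smaller one saves >= 50%. */
        return ((m1 < m2 * 2) ? m1 : m2);
}

int
main(void)
{
        unsigned opt[NBURSTS] =
            { 40960, 40960, 36864, 40960, 40960, 40960, 40960, 40960 };
        unsigned min[NBURSTS] =
            { 32768, 28672, 32768, 32768, 98304, 32768, 32768, 32768 };

        printf("%u\n", predict(opt, min, UINT_MAX, 0)); /* prints 36864 */
        return (0);
}

Here the single 96 KiB spike in the minimum-size history is not allowed to inflate the prediction: the result, 36864 bytes, still covers the other seven bursts, and skipping the spike saves well over half the space.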
+ */ + plan = zil_lwb_predict(zilog); + } + blksz = plan + sizeof (zil_chain_t); + blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t); + blksz = MIN(blksz, zilog->zl_max_block_size); + DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz, + uint64_t, plan); + + return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state)); +} /* - * Start a log block write and advance to the next log block. - * Calls are serialized. + * Finalize previously closed block and issue the write zio. */ -static lwb_t * +static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) { - lwb_t *nlwb = NULL; - zil_chain_t *zilc; spa_t *spa = zilog->zl_spa; - blkptr_t *bp; - dmu_tx_t *tx; - uint64_t txg; - uint64_t zil_blksz, wsz; - int i, error; + zil_chain_t *zilc; boolean_t slog; + zbookmark_phys_t zb; + zio_priority_t prio; + int error; - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { - zilc = (zil_chain_t *)lwb->lwb_buf; - bp = &zilc->zc_next_blk; - } else { - zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); - bp = &zilc->zc_next_blk; - } + /* Actually fill the lwb with the data. */ + for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; + itx = list_next(&lwb->lwb_itxs, itx)) + zil_lwb_commit(zilog, lwb, itx); + lwb->lwb_nused = lwb->lwb_nfilled; + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); - ASSERT(lwb->lwb_nused <= lwb->lwb_sz); + lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb, + ZIO_FLAG_CANFAIL); /* - * Allocate the next block and save its address in this block - * before writing it in order to establish the log chain. - * Note that if the allocation of nlwb synced before we wrote - * the block that points at it (lwb), we'd leak it if we crashed. - * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done(). - * We dirty the dataset to ensure that zil_sync() will be called - * to clean up in the event of allocation failure or I/O failure. + * The lwb is now ready to be issued, but it can be only if it already + * got its block pointer allocated or the allocation has failed. + * Otherwise leave it as-is, relying on some other thread to issue it + * after allocating its block pointer via calling zil_lwb_write_issue() + * for the previous lwb(s) in the chain. */ + mutex_enter(&zilog->zl_lock); + lwb->lwb_state = LWB_STATE_READY; + if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) { + mutex_exit(&zilog->zl_lock); + return; + } + mutex_exit(&zilog->zl_lock); - tx = dmu_tx_create(zilog->zl_os); +next_lwb: + if (lwb->lwb_slim) + zilc = (zil_chain_t *)lwb->lwb_buf; + else + zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax); + int wsz = lwb->lwb_sz; + if (lwb->lwb_error == 0) { + abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); + if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk) + prio = ZIO_PRIORITY_SYNC_WRITE; + else + prio = ZIO_PRIORITY_ASYNC_WRITE; + SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, + lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); + lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0, + &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done, + lwb, prio, ZIO_FLAG_CANFAIL, &zb); + zil_lwb_add_block(lwb, &lwb->lwb_blk); + + if (lwb->lwb_slim) { + /* For Slim ZIL only write what is used. 
*/ + wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, + int); + ASSERT3S(wsz, <=, lwb->lwb_sz); + zio_shrink(lwb->lwb_write_zio, wsz); + wsz = lwb->lwb_write_zio->io_size; + } + memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); + zilc->zc_pad = 0; + zilc->zc_nused = lwb->lwb_nused; + zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; + } else { + /* + * We can't write the lwb if there was an allocation failure, + * so create a null zio instead just to maintain dependencies. + */ + lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL, + zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL); + lwb->lwb_write_zio->io_error = lwb->lwb_error; + } + if (lwb->lwb_child_zio) + zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio); /* - * Since we are not going to create any new dirty data, and we - * can even help with clearing the existing dirty data, we - * should not be subject to the dirty data based delays. We - * use TXG_NOTHROTTLE to bypass the delay mechanism. + * Open transaction to allocate the next block pointer. */ + dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - txg = dmu_tx_get_txg(tx); - - lwb->lwb_tx = tx; + uint64_t txg = dmu_tx_get_txg(tx); /* - * Log blocks are pre-allocated. Here we select the size of the next - * block, based on size used in the last block. - * - first find the smallest bucket that will fit the block from a - * limited set of block sizes. This is because it's faster to write - * blocks allocated from the same metaslab as they are adjacent or - * close. - * - next find the maximum from the new suggested size and an array of - * previous sizes. This lessens a picket fence effect of wrongly - * guessing the size if we have a stream of say 2k, 64k, 2k, 64k - * requests. - * - * Note we only write what is used, but we can't just allocate - * the maximum block size because we can exhaust the available - * pool log space. - */ - zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); - for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) - continue; - zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); - zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; - for (i = 0; i < ZIL_PREV_BLKS; i++) - zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); - zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); - + * Allocate next the block pointer unless we are already in error. + */ + lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb); + blkptr_t *bp = &zilc->zc_next_blk; BP_ZERO(bp); - error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog); - if (slog) { - ZIL_STAT_BUMP(zil_itx_metaslab_slog_count); - ZIL_STAT_INCR(zil_itx_metaslab_slog_bytes, lwb->lwb_nused); - } else { - ZIL_STAT_BUMP(zil_itx_metaslab_normal_count); - ZIL_STAT_INCR(zil_itx_metaslab_normal_bytes, lwb->lwb_nused); + error = lwb->lwb_error; + if (error == 0) { + error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz, + &slog); } if (error == 0) { - ASSERT3U(bp->blk_birth, ==, txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), ==, txg); + BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 : + ZIO_CHECKSUM_ZILOG); bp->blk_cksum = lwb->lwb_blk.blk_cksum; bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; - - /* - * Allocate a new log write block (lwb). - */ - nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE); } - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { - /* For Slim ZIL only write what is used. 
*/ - wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); - ASSERT3U(wsz, <=, lwb->lwb_sz); - zio_shrink(lwb->lwb_write_zio, wsz); - - } else { - wsz = lwb->lwb_sz; - } + /* + * Reduce TXG open time by incrementing inflight counter and committing + * the transaciton. zil_sync() will wait for it to return to zero. + */ + mutex_enter(&zilog->zl_lwb_io_lock); + lwb->lwb_issued_txg = txg; + zilog->zl_lwb_inflight[txg & TXG_MASK]++; + zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg); + mutex_exit(&zilog->zl_lwb_io_lock); + dmu_tx_commit(tx); - zilc->zc_pad = 0; - zilc->zc_nused = lwb->lwb_nused; - zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; + spa_config_enter(spa, SCL_STATE, lwb, RW_READER); /* - * clear unused data for security + * We've completed all potentially blocking operations. Update the + * nlwb and allow it proceed without possible lock order reversals. */ - bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused); + mutex_enter(&zilog->zl_lock); + zil_lwb_set_zio_dependency(zilog, lwb); + lwb->lwb_state = LWB_STATE_ISSUED; - spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); + if (nlwb) { + nlwb->lwb_blk = *bp; + nlwb->lwb_error = error; + nlwb->lwb_slog = slog; + nlwb->lwb_alloc_txg = txg; + if (nlwb->lwb_state != LWB_STATE_READY) + nlwb = NULL; + } + mutex_exit(&zilog->zl_lock); - zil_lwb_add_block(lwb, &lwb->lwb_blk); + if (lwb->lwb_slog) { + ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, + lwb->lwb_nused); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write, + wsz); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc, + BP_GET_LSIZE(&lwb->lwb_blk)); + } else { + ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes, + lwb->lwb_nused); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write, + wsz); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc, + BP_GET_LSIZE(&lwb->lwb_blk)); + } lwb->lwb_issued_timestamp = gethrtime(); - lwb->lwb_state = LWB_STATE_ISSUED; - - zio_nowait(lwb->lwb_root_zio); + if (lwb->lwb_child_zio) + zio_nowait(lwb->lwb_child_zio); zio_nowait(lwb->lwb_write_zio); + zio_nowait(lwb->lwb_root_zio); /* - * If there was an allocation failure then nlwb will be null which - * forces a txg_wait_synced(). + * If nlwb was ready when we gave it the block pointer, + * it is on us to issue it and possibly following ones. */ - return (nlwb); + lwb = nlwb; + if (lwb) + goto next_lwb; } /* - * Maximum amount of write data that can be put into single log block. + * Maximum amount of data that can be put into single log block. */ uint64_t -zil_max_log_data(zilog_t *zilog) +zil_max_log_data(zilog_t *zilog, size_t hdrsize) { - return (zilog->zl_max_block_size - - sizeof (zil_chain_t) - sizeof (lr_write_t)); + return (zilog->zl_max_block_size - sizeof (zil_chain_t) - hdrsize); } /* * Maximum amount of log space we agree to waste to reduce number of - * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%). + * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~6%). */ static inline uint64_t zil_max_waste_space(zilog_t *zilog) { - return (zil_max_log_data(zilog) / 8); + return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 16); } /* * Maximum amount of write data for WR_COPIED. For correctness, consumers * must fall back to WR_NEED_COPY if we can't fit the entire record into one * maximum sized log block, because each WR_COPIED record must fit in a - * single log block. 
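The hand-off described above, where an lwb parks in LWB_STATE_READY until its predecessor delivers a block pointer and is then issued by whichever side arrives second, follows a common two-flag pattern. The standalone sketch below illustrates that pattern with invented names (toy_lwb, mark_ready, install_bp); in zil.c the same exchange happens under zl_lock between the thread running zil_lwb_write_issue() for this lwb and the one running it for the previous lwb.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy lwb: "ready" is set by the filling thread, "has_bp" by the predecessor. */
struct toy_lwb {
        pthread_mutex_t lock;
        bool ready;
        bool has_bp;
};

static void
issue(struct toy_lwb *lwb)
{
        /* The real code submits the write zio here, then allocates a block
         * pointer for its successor and repeats the same hand-off with it. */
        (void) lwb;
        printf("issued\n");
}

/* Called by the thread that filled the lwb. */
static void
mark_ready(struct toy_lwb *lwb)
{
        bool do_issue;

        pthread_mutex_lock(&lwb->lock);
        lwb->ready = true;
        do_issue = lwb->has_bp;         /* predecessor already delivered a bp */
        pthread_mutex_unlock(&lwb->lock);
        if (do_issue)
                issue(lwb);
}

/* Called by the predecessor once it has allocated the next block pointer. */
static void
install_bp(struct toy_lwb *nlwb)
{
        bool do_issue;

        pthread_mutex_lock(&nlwb->lock);
        nlwb->has_bp = true;
        do_issue = nlwb->ready;         /* owner finished filling it already */
        pthread_mutex_unlock(&nlwb->lock);
        if (do_issue)
                issue(nlwb);
}

int
main(void)
{
        struct toy_lwb lwb = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .ready = false,
                .has_bp = false,
        };

        mark_ready(&lwb);       /* no bp yet: left for the predecessor */
        install_bp(&lwb);       /* predecessor delivers the bp and issues it */
        return (0);
}

Whichever of the two calls happens second sees both flags set and performs the issue exactly once; if the order is reversed, the roles simply swap.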
For space efficiency, we want to fit two records into a - * max-sized log block. + * single log block. Below that it is a tradeoff of additional memory copy + * and possibly worse log space efficiency vs additional range lock/unlock. */ +static uint_t zil_maxcopied = 7680; + uint64_t zil_max_copied_data(zilog_t *zilog) { - return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 - - sizeof (lr_write_t)); + uint64_t max_data = zil_max_log_data(zilog, sizeof (lr_write_t)); + return (MIN(max_data, zil_maxcopied)); } +static uint64_t +zil_itx_record_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + + if (lr->lrc_txtype == TX_COMMIT) + return (0); + ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t)); + return (lr->lrc_reclen); +} + +static uint64_t +zil_itx_data_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + lr_write_t *lrw = (lr_write_t *)lr; + + if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + ASSERT3U(lr->lrc_reclen, ==, sizeof (lr_write_t)); + return (P2ROUNDUP_TYPED(lrw->lr_length, sizeof (uint64_t), + uint64_t)); + } + return (0); +} + +static uint64_t +zil_itx_full_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + + if (lr->lrc_txtype == TX_COMMIT) + return (0); + ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t)); + return (lr->lrc_reclen + zil_itx_data_size(itx)); +} + +/* + * Estimate space needed in the lwb for the itx. Allocate more lwbs or + * split the itx as needed, but don't touch the actual transaction data. + * Has to be called under zl_issuer_lock to call zil_lwb_write_close() + * to chain more lwbs. + */ static lwb_t * -zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) +zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) { - lr_t *lrcb, *lrc; - lr_write_t *lrwb, *lrw; - char *lr_buf; - uint64_t dlen, dnow, dpad, lwb_sp, reclen, txg, max_log_data; + itx_t *citx; + lr_t *lr, *clr; + lr_write_t *lrw; + uint64_t dlen, dnow, lwb_sp, reclen, max_log_data; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); @@ -1655,8 +2121,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) zil_lwb_write_open(zilog, lwb); - lrc = &itx->itx_lr; - lrw = (lr_write_t *)lrc; + lr = &itx->itx_lr; + lrw = (lr_write_t *)lr; /* * A commit itx doesn't represent any on-disk state; instead @@ -1670,150 +2136,207 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) * * For more details, see the comment above zil_commit(). */ - if (lrc->lrc_txtype == TX_COMMIT) { - mutex_enter(&zilog->zl_lock); + if (lr->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_lwb(itx->itx_private, lwb); - itx->itx_private = NULL; - mutex_exit(&zilog->zl_lock); + list_insert_tail(&lwb->lwb_itxs, itx); return (lwb); } - if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { - dlen = P2ROUNDUP_TYPED( - lrw->lr_length, sizeof (uint64_t), uint64_t); - dpad = dlen - lrw->lr_length; - } else { - dlen = dpad = 0; - } - reclen = lrc->lrc_reclen; - zilog->zl_cur_used += (reclen + dlen); - txg = lrc->lrc_txg; - - ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen)); + reclen = lr->lrc_reclen; + ASSERT3U(reclen, >=, sizeof (lr_t)); + ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0)); + dlen = zil_itx_data_size(itx); cont: /* * If this record won't fit in the current log block, start a new one. * For WR_NEED_COPY optimize layout for minimal number of chunks. 
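Plugging numbers into the three limits above helps. In the sketch below CHAIN_HDR and WRITE_HDR are again placeholders for sizeof (zil_chain_t) and sizeof (lr_write_t), and MAXCOPIED is the zil_maxcopied default introduced by this change; with a 128 KiB block the waste allowance works out to roughly 1/16th of the usable space, which is the "~6%" the comment refers to.

#include <stdio.h>

#define MAX_BLK         (128u * 1024u)  /* assumed zl_max_block_size */
#define CHAIN_HDR       128u            /* placeholder: sizeof (zil_chain_t) */
#define WRITE_HDR       192u            /* placeholder: sizeof (lr_write_t) */
#define MAXCOPIED       7680u           /* zil_maxcopied default from this diff */

int
main(void)
{
        unsigned max_log_data = MAX_BLK - CHAIN_HDR - WRITE_HDR;
        unsigned max_waste = max_log_data / 16;
        unsigned max_copied =
            (max_log_data < MAXCOPIED) ? max_log_data : MAXCOPIED;

        /* 130752 bytes of payload, 8172 bytes of tolerated waste (about 6%
         * of the block), and a 7680-byte ceiling for WR_COPIED records. */
        printf("%u %u %u\n", max_log_data, max_waste, max_copied);
        return (0);
}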
*/ - lwb_sp = lwb->lwb_sz - lwb->lwb_nused; - max_log_data = zil_max_log_data(zilog); + lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; + max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t)); if (reclen > lwb_sp || (reclen + dlen > lwb_sp && lwb_sp < zil_max_waste_space(zilog) && (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { - lwb = zil_lwb_write_issue(zilog, lwb); + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED); if (lwb == NULL) return (NULL); - zil_lwb_write_open(zilog, lwb); - ASSERT(LWB_EMPTY(lwb)); - lwb_sp = lwb->lwb_sz - lwb->lwb_nused; - - /* - * There must be enough space in the new, empty log block to - * hold reclen. For WR_COPIED, we need to fit the whole - * record in one block, and reclen is the header size + the - * data size. For WR_NEED_COPY, we can create multiple - * records, splitting the data into multiple blocks, so we - * only need to fit one word of data per block; in this case - * reclen is just the header size (no data). - */ - ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); + lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; } + /* + * There must be enough space in the log block to hold reclen. + * For WR_COPIED, we need to fit the whole record in one block, + * and reclen is the write record header size + the data size. + * For WR_NEED_COPY, we can create multiple records, splitting + * the data into multiple blocks, so we only need to fit one + * word of data per block; in this case reclen is just the header + * size (no data). + */ + ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); + dnow = MIN(dlen, lwb_sp - reclen); - lr_buf = lwb->lwb_buf + lwb->lwb_nused; - bcopy(lrc, lr_buf, reclen); - lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ - lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ + if (dlen > dnow) { + ASSERT3U(lr->lrc_txtype, ==, TX_WRITE); + ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY); + citx = zil_itx_clone(itx); + clr = &citx->itx_lr; + lr_write_t *clrw = (lr_write_t *)clr; + clrw->lr_length = dnow; + lrw->lr_offset += dnow; + lrw->lr_length -= dnow; + zilog->zl_cur_left -= dnow; + } else { + citx = itx; + clr = lr; + } + + /* + * We're actually making an entry, so update lrc_seq to be the + * log record sequence number. Note that this is generally not + * equal to the itx sequence number because not all transactions + * are synchronous, and sometimes spa_sync() gets there first. + */ + clr->lrc_seq = ++zilog->zl_lr_seq; + + lwb->lwb_nused += reclen + dnow; + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); + ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); + + zil_lwb_add_txg(lwb, lr->lrc_txg); + list_insert_tail(&lwb->lwb_itxs, citx); + + dlen -= dnow; + if (dlen > 0) + goto cont; + + if (lr->lrc_txtype == TX_WRITE && + lr->lrc_txg > spa_freeze_txg(zilog->zl_spa)) + txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); + + return (lwb); +} + +/* + * Fill the actual transaction data into the lwb, following zil_lwb_assign(). + * Does not require locking. 
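The split between zil_lwb_assign() and zil_lwb_commit() is a reserve-then-fill scheme: space accounting and ordering decisions happen under zl_issuer_lock, while the actual copy of the record data is deferred until zil_lwb_write_issue() runs, outside that lock. The sketch below shows the general shape of such a scheme with invented names (logbuf, assign, commit); it is not the exact bookkeeping used here, where the copy offset is tracked in lwb_nfilled as the lwb's itx list is replayed at issue time.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

/* Two-phase buffer fill: reserve space under a lock (cheap, preserves the
 * record order), copy the payload later without the lock (expensive). */
struct logbuf {
        pthread_mutex_t lock;   /* stands in for zl_issuer_lock */
        char buf[4096];
        size_t nused;           /* space reserved so far ("assign" phase) */
        size_t nfilled;         /* bytes actually copied ("commit" phase) */
};

static size_t
assign(struct logbuf *lb, size_t reclen)
{
        size_t off;

        pthread_mutex_lock(&lb->lock);
        off = lb->nused;        /* this record's slot, fixed once and for all */
        lb->nused += reclen;
        pthread_mutex_unlock(&lb->lock);
        return (off);
}

static void
commit(struct logbuf *lb, size_t off, const void *rec, size_t reclen)
{
        /* No lock needed: the slot [off, off + reclen) belongs to this record,
         * and in zil.c all commits for one lwb run in the issuing thread. */
        memcpy(lb->buf + off, rec, reclen);
        lb->nfilled += reclen;
}

int
main(void)
{
        struct logbuf lb = { .lock = PTHREAD_MUTEX_INITIALIZER };
        size_t off = assign(&lb, 16);

        commit(&lb, off, "0123456789abcdef", 16);
        printf("used=%zu filled=%zu\n", lb.nused, lb.nfilled);
        return (0);
}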
+ */ +static void +zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) +{ + lr_t *lr, *lrb; + lr_write_t *lrw, *lrwb; + char *lr_buf; + uint64_t dlen, reclen; + + lr = &itx->itx_lr; + lrw = (lr_write_t *)lr; - ZIL_STAT_BUMP(zil_itx_count); + if (lr->lrc_txtype == TX_COMMIT) + return; + + reclen = lr->lrc_reclen; + dlen = zil_itx_data_size(itx); + ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled); + + lr_buf = lwb->lwb_buf + lwb->lwb_nfilled; + memcpy(lr_buf, lr, reclen); + lrb = (lr_t *)lr_buf; /* Like lr, but inside lwb. */ + lrwb = (lr_write_t *)lrb; /* Like lrw, but inside lwb. */ + + ZIL_STAT_BUMP(zilog, zil_itx_count); /* * If it's a write, fetch the data or get its blkptr as appropriate. */ - if (lrc->lrc_txtype == TX_WRITE) { - if (txg > spa_freeze_txg(zilog->zl_spa)) - txg_wait_synced(zilog->zl_dmu_pool, txg); + if (lr->lrc_txtype == TX_WRITE) { if (itx->itx_wr_state == WR_COPIED) { - ZIL_STAT_BUMP(zil_itx_copied_count); - ZIL_STAT_INCR(zil_itx_copied_bytes, lrw->lr_length); + ZIL_STAT_BUMP(zilog, zil_itx_copied_count); + ZIL_STAT_INCR(zilog, zil_itx_copied_bytes, + lrw->lr_length); } else { char *dbuf; int error; if (itx->itx_wr_state == WR_NEED_COPY) { dbuf = lr_buf + reclen; - lrcb->lrc_reclen += dnow; - if (lrwb->lr_length > dnow) - lrwb->lr_length = dnow; - lrw->lr_offset += dnow; - lrw->lr_length -= dnow; - ZIL_STAT_BUMP(zil_itx_needcopy_count); - ZIL_STAT_INCR(zil_itx_needcopy_bytes, dnow); + lrb->lrc_reclen += dlen; + ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count); + ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes, + dlen); } else { ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT); dbuf = NULL; - ZIL_STAT_BUMP(zil_itx_indirect_count); - ZIL_STAT_INCR(zil_itx_indirect_bytes, + ZIL_STAT_BUMP(zilog, zil_itx_indirect_count); + ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes, lrw->lr_length); + if (lwb->lwb_child_zio == NULL) { + lwb->lwb_child_zio = zio_null(NULL, + zilog->zl_spa, NULL, NULL, NULL, + ZIO_FLAG_CANFAIL); + } } /* - * We pass in the "lwb_write_zio" rather than - * "lwb_root_zio" so that the "lwb_write_zio" - * becomes the parent of any zio's created by - * the "zl_get_data" callback. The vdevs are - * flushed after the "lwb_write_zio" completes, - * so we want to make sure that completion - * callback waits for these additional zio's, - * such that the vdevs used by those zio's will - * be included in the lwb's vdev tree, and those - * vdevs will be properly flushed. If we passed - * in "lwb_root_zio" here, then these additional - * vdevs may not be flushed; e.g. if these zio's - * completed after "lwb_write_zio" completed. + * The "lwb_child_zio" we pass in will become a child of + * "lwb_write_zio", when one is created, so one will be + * a parent of any zio's created by the "zl_get_data". + * This way "lwb_write_zio" will first wait for children + * block pointers before own writing, and then for their + * writing completion before the vdev cache flushing. */ error = zilog->zl_get_data(itx->itx_private, itx->itx_gen, lrwb, dbuf, lwb, - lwb->lwb_write_zio); - if (dbuf != NULL && error == 0 && dnow == dlen) + lwb->lwb_child_zio); + if (dbuf != NULL && error == 0) { /* Zero any padding bytes in the last block. 
*/ - bzero((char *)dbuf + lrwb->lr_length, dpad); - - if (error == EIO) { - txg_wait_synced(zilog->zl_dmu_pool, txg); - return (lwb); + memset((char *)dbuf + lrwb->lr_length, 0, + dlen - lrwb->lr_length); } - if (error != 0) { - ASSERT(error == ENOENT || error == EEXIST || - error == EALREADY); - return (lwb); + + /* + * Typically, the only return values we should see from + * ->zl_get_data() are 0, EIO, ENOENT, EEXIST or + * EALREADY. However, it is also possible to see other + * error values such as ENOSPC or EINVAL from + * dmu_read() -> dnode_hold() -> dnode_hold_impl() or + * ENXIO as well as a multitude of others from the + * block layer through dmu_buf_hold() -> dbuf_read() + * -> zio_wait(), as well as through dmu_read() -> + * dnode_hold() -> dnode_hold_impl() -> dbuf_read() -> + * zio_wait(). When these errors happen, we can assume + * that neither an immediate write nor an indirect + * write occurred, so we need to fall back to + * txg_wait_synced(). This is unusual, so we print to + * dmesg whenever one of these errors occurs. + */ + switch (error) { + case 0: + break; + default: + cmn_err(CE_WARN, "zil_lwb_commit() received " + "unexpected error %d from ->zl_get_data()" + ". Falling back to txg_wait_synced().", + error); + zfs_fallthrough; + case EIO: + txg_wait_synced(zilog->zl_dmu_pool, + lr->lrc_txg); + zfs_fallthrough; + case ENOENT: + zfs_fallthrough; + case EEXIST: + zfs_fallthrough; + case EALREADY: + return; } } } - /* - * We're actually making an entry, so update lrc_seq to be the - * log record sequence number. Note that this is generally not - * equal to the itx sequence number because not all transactions - * are synchronous, and sometimes spa_sync() gets there first. - */ - lrcb->lrc_seq = ++zilog->zl_lr_seq; - lwb->lwb_nused += reclen + dnow; - - zil_lwb_add_txg(lwb, txg); - - ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); - ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); - - dlen -= dnow; - if (dlen > 0) { - zilog->zl_cur_used += reclen; - goto cont; - } - - return (lwb); + lwb->lwb_nfilled += reclen + dlen; + ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused); + ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t))); } itx_t * @@ -1822,14 +2345,16 @@ zil_itx_create(uint64_t txtype, size_t olrsize) size_t itxsize, lrsize; itx_t *itx; + ASSERT3U(olrsize, >=, sizeof (lr_t)); lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t); + ASSERT3U(lrsize, >=, olrsize); itxsize = offsetof(itx_t, itx_lr) + lrsize; itx = zio_data_buf_alloc(itxsize); itx->itx_lr.lrc_txtype = txtype; itx->itx_lr.lrc_reclen = lrsize; itx->itx_lr.lrc_seq = 0; /* defensive */ - bzero((char *)&itx->itx_lr + olrsize, lrsize - olrsize); + memset((char *)&itx->itx_lr + olrsize, 0, lrsize - olrsize); itx->itx_sync = B_TRUE; /* default is synchronous */ itx->itx_callback = NULL; itx->itx_callback_data = NULL; @@ -1838,9 +2363,26 @@ zil_itx_create(uint64_t txtype, size_t olrsize) return (itx); } +static itx_t * +zil_itx_clone(itx_t *oitx) +{ + ASSERT3U(oitx->itx_size, >=, sizeof (itx_t)); + ASSERT3U(oitx->itx_size, ==, + offsetof(itx_t, itx_lr) + oitx->itx_lr.lrc_reclen); + + itx_t *itx = zio_data_buf_alloc(oitx->itx_size); + memcpy(itx, oitx, oitx->itx_size); + itx->itx_callback = NULL; + itx->itx_callback_data = NULL; + return (itx); +} + void zil_itx_destroy(itx_t *itx) { + ASSERT3U(itx->itx_size, >=, sizeof (itx_t)); + ASSERT3U(itx->itx_lr.lrc_reclen, ==, + itx->itx_size - offsetof(itx_t, itx_lr)); IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL); IMPLY(itx->itx_callback != 
NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); @@ -1865,11 +2407,11 @@ zil_itxg_clean(void *arg) itx_async_node_t *ian; list = &itxs->i_sync_list; - while ((itx = list_head(list)) != NULL) { + while ((itx = list_remove_head(list)) != NULL) { /* * In the general case, commit itxs will not be found * here, as they'll be committed to an lwb via - * zil_lwb_commit(), and free'd in that function. Having + * zil_lwb_assign(), and free'd in that function. Having * said that, it is still possible for commit itxs to be * found here, due to the following race: * @@ -1888,7 +2430,6 @@ zil_itxg_clean(void *arg) if (itx->itx_lr.lrc_txtype == TX_COMMIT) zil_commit_waiter_skip(itx->itx_private); - list_remove(list, itx); zil_itx_destroy(itx); } @@ -1896,8 +2437,7 @@ zil_itxg_clean(void *arg) t = &itxs->i_async_tree; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list = &ian->ia_list; - while ((itx = list_head(list)) != NULL) { - list_remove(list, itx); + while ((itx = list_remove_head(list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); @@ -1926,7 +2466,7 @@ void zil_remove_async(zilog_t *zilog, uint64_t oid) { uint64_t otxg, txg; - itx_async_node_t *ian; + itx_async_node_t *ian, ian_search; avl_tree_t *t; avl_index_t where; list_t clean_list; @@ -1953,13 +2493,13 @@ zil_remove_async(zilog_t *zilog, uint64_t oid) * Locate the object node and append its list. */ t = &itxg->itxg_itxs->i_async_tree; - ian = avl_find(t, &oid, &where); + ian_search.ia_foid = oid; + ian = avl_find(t, &ian_search, &where); if (ian != NULL) list_move_tail(&clean_list, &ian->ia_list); mutex_exit(&itxg->itxg_lock); } - while ((itx = list_head(&clean_list)) != NULL) { - list_remove(&clean_list, itx); + while ((itx = list_remove_head(&clean_list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); @@ -2090,10 +2630,10 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg) * This function will traverse the queue of itxs that need to be * committed, and move them onto the ZIL's zl_itx_commit_list. */ -static void +static uint64_t zil_get_commit_list(zilog_t *zilog) { - uint64_t otxg, txg; + uint64_t otxg, txg, wtxg = 0; list_t *commit_list = &zilog->zl_itx_commit_list; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); @@ -2127,10 +2667,33 @@ zil_get_commit_list(zilog_t *zilog) */ ASSERT(zilog_is_dirty_in_txg(zilog, txg) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); - list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); + list_t *sync_list = &itxg->itxg_itxs->i_sync_list; + itx_t *itx = NULL; + if (unlikely(zilog->zl_suspend > 0)) { + /* + * ZIL was just suspended, but we lost the race. + * Allow all earlier itxs to be committed, but ask + * caller to do txg_wait_synced(txg) for any new. 
+ */ + if (!list_is_empty(sync_list)) + wtxg = MAX(wtxg, txg); + } else { + itx = list_head(sync_list); + list_move_tail(commit_list, sync_list); + } mutex_exit(&itxg->itxg_lock); + + while (itx != NULL) { + uint64_t s = zil_itx_full_size(itx); + zilog->zl_cur_size += s; + zilog->zl_cur_left += s; + s = zil_itx_record_size(itx); + zilog->zl_cur_max = MAX(zilog->zl_cur_max, s); + itx = list_next(commit_list, itx); + } } + return (wtxg); } /* @@ -2140,7 +2703,7 @@ void zil_async_to_sync(zilog_t *zilog, uint64_t foid) { uint64_t otxg, txg; - itx_async_node_t *ian; + itx_async_node_t *ian, ian_search; avl_tree_t *t; avl_index_t where; @@ -2170,7 +2733,8 @@ zil_async_to_sync(zilog_t *zilog, uint64_t foid) */ t = &itxg->itxg_itxs->i_async_tree; if (foid != 0) { - ian = avl_find(t, &foid, &where); + ian_search.ia_foid = foid; + ian = avl_find(t, &ian_search, &where); if (ian != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); @@ -2224,7 +2788,6 @@ zil_prune_commit_list(zilog_t *zilog) zil_commit_waiter_skip(itx->itx_private); } else { zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); - itx->itx_private = NULL; } mutex_exit(&zilog->zl_lock); @@ -2261,7 +2824,27 @@ zil_commit_writer_stall(zilog_t *zilog) */ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); txg_wait_synced(zilog->zl_dmu_pool, 0); - ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); + ASSERT(list_is_empty(&zilog->zl_lwb_list)); +} + +static void +zil_burst_done(zilog_t *zilog) +{ + if (!list_is_empty(&zilog->zl_itx_commit_list) || + zilog->zl_cur_size == 0) + return; + + if (zilog->zl_parallel) + zilog->zl_parallel--; + + uint_t r = (zilog->zl_prev_rotor + 1) & (ZIL_BURSTS - 1); + zilog->zl_prev_rotor = r; + zilog->zl_prev_opt[r] = zil_lwb_plan(zilog, zilog->zl_cur_size, + &zilog->zl_prev_min[r]); + + zilog->zl_cur_size = 0; + zilog->zl_cur_max = 0; + zilog->zl_cur_left = 0; } /* @@ -2271,12 +2854,12 @@ zil_commit_writer_stall(zilog_t *zilog) * lwb will be issued to the zio layer to be written to disk. */ static void -zil_process_commit_list(zilog_t *zilog) +zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) { spa_t *spa = zilog->zl_spa; list_t nolwb_itxs; list_t nolwb_waiters; - lwb_t *lwb; + lwb_t *lwb, *plwb; itx_t *itx; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); @@ -2285,7 +2868,7 @@ zil_process_commit_list(zilog_t *zilog) * Return if there's nothing to commit before we dirty the fs by * calling zil_create(). */ - if (list_head(&zilog->zl_itx_commit_list) == NULL) + if (list_is_empty(&zilog->zl_itx_commit_list)) return; list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); @@ -2296,12 +2879,32 @@ zil_process_commit_list(zilog_t *zilog) if (lwb == NULL) { lwb = zil_create(zilog); } else { - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); + /* + * Activate SPA_FEATURE_ZILSAXATTR for the cases where ZIL will + * have already been created (zl_lwb_list not empty). + */ + zil_commit_activate_saxattr_feature(zilog); + ASSERT(lwb->lwb_state == LWB_STATE_NEW || + lwb->lwb_state == LWB_STATE_OPENED); + + /* + * If the lwb is still opened, it means the workload is really + * multi-threaded and we won the chance of write aggregation. 
+ * If it is not opened yet, but previous lwb is still not + * flushed, it still means the workload is multi-threaded, but + * there was too much time between the commits to aggregate, so + * we try aggregation next times, but without too much hopes. + */ + if (lwb->lwb_state == LWB_STATE_OPENED) { + zilog->zl_parallel = ZIL_BURSTS; + } else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) + != NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) { + zilog->zl_parallel = MAX(zilog->zl_parallel, + ZIL_BURSTS / 2); + } } - while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) { + while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) { lr_t *lrc = &itx->itx_lr; uint64_t txg = lrc->lrc_txg; @@ -2315,8 +2918,6 @@ zil_process_commit_list(zilog_t *zilog) zilog_t *, zilog, itx_t *, itx); } - list_remove(&zilog->zl_itx_commit_list, itx); - boolean_t synced = txg <= spa_last_synced_txg(spa); boolean_t frozen = txg > spa_freeze_txg(spa); @@ -2366,22 +2967,31 @@ zil_process_commit_list(zilog_t *zilog) */ if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { if (lwb != NULL) { - lwb = zil_lwb_commit(zilog, itx, lwb); - - if (lwb == NULL) + lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs); + if (lwb == NULL) { list_insert_tail(&nolwb_itxs, itx); - else - list_insert_tail(&lwb->lwb_itxs, itx); + } else if ((zcw->zcw_lwb != NULL && + zcw->zcw_lwb != lwb) || zcw->zcw_done) { + /* + * Our lwb is done, leave the rest of + * itx list to somebody else who care. + */ + zilog->zl_parallel = ZIL_BURSTS; + zilog->zl_cur_left -= + zil_itx_full_size(itx); + break; + } } else { if (lrc->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_nolwb( itx->itx_private, &nolwb_waiters); } - list_insert_tail(&nolwb_itxs, itx); } + zilog->zl_cur_left -= zil_itx_full_size(itx); } else { ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT); + zilog->zl_cur_left -= zil_itx_full_size(itx); zil_itx_destroy(itx); } } @@ -2393,6 +3003,8 @@ zil_process_commit_list(zilog_t *zilog) * the ZIL write pipeline; see the comment within * zil_commit_writer_stall() for more details. */ + while ((lwb = list_remove_head(ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); /* @@ -2402,54 +3014,45 @@ zil_process_commit_list(zilog_t *zilog) * normal. */ zil_commit_waiter_t *zcw; - while ((zcw = list_head(&nolwb_waiters)) != NULL) { + while ((zcw = list_remove_head(&nolwb_waiters)) != NULL) zil_commit_waiter_skip(zcw); - list_remove(&nolwb_waiters, zcw); - } /* * And finally, we have to destroy the itx's that * couldn't be committed to an lwb; this will also call * the itx's callback if one exists for the itx. */ - while ((itx = list_head(&nolwb_itxs)) != NULL) { - list_remove(&nolwb_itxs, itx); + while ((itx = list_remove_head(&nolwb_itxs)) != NULL) zil_itx_destroy(itx); - } } else { ASSERT(list_is_empty(&nolwb_waiters)); ASSERT3P(lwb, !=, NULL); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); + ASSERT(lwb->lwb_state == LWB_STATE_NEW || + lwb->lwb_state == LWB_STATE_OPENED); /* * At this point, the ZIL block pointed at by the "lwb" - * variable is in one of the following states: "closed" - * or "open". + * variable is in "new" or "opened" state. * - * If it's "closed", then no itxs have been committed to - * it, so there's no point in issuing its zio (i.e. it's - * "empty"). + * If it's "new", then no itxs have been committed to it, so + * there's no point in issuing its zio (i.e. it's "empty"). 
* - * If it's "open", then it contains one or more itxs that + * If it's "opened", then it contains one or more itxs that * eventually need to be committed to stable storage. In * this case we intentionally do not issue the lwb's zio * to disk yet, and instead rely on one of the following * two mechanisms for issuing the zio: * - * 1. Ideally, there will be more ZIL activity occurring - * on the system, such that this function will be - * immediately called again (not necessarily by the same - * thread) and this lwb's zio will be issued via - * zil_lwb_commit(). This way, the lwb is guaranteed to - * be "full" when it is issued to disk, and we'll make - * use of the lwb's size the best we can. + * 1. Ideally, there will be more ZIL activity occurring on + * the system, such that this function will be immediately + * called again by different thread and this lwb will be + * closed by zil_lwb_assign(). This way, the lwb will be + * "full" when it is issued to disk, and we'll make use of + * the lwb's size the best we can. * * 2. If there isn't sufficient ZIL activity occurring on - * the system, such that this lwb's zio isn't issued via - * zil_lwb_commit(), zil_commit_waiter() will issue the - * lwb's zio. If this occurs, the lwb is not guaranteed + * the system, zil_commit_waiter() will close it and issue + * the zio. If this occurs, the lwb is not guaranteed * to be "full" by the time its zio is issued, and means * the size of the lwb was "too large" given the amount * of ZIL activity occurring on the system at that time. @@ -2470,6 +3073,16 @@ zil_process_commit_list(zilog_t *zilog) * possible, without significantly impacting the latency * of each individual itx. */ + if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) { + zil_burst_done(zilog); + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); + if (lwb == NULL) { + while ((lwb = list_remove_head(ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); + zil_commit_writer_stall(zilog); + } + } } } @@ -2487,12 +3100,17 @@ zil_process_commit_list(zilog_t *zilog) * not issued, we rely on future calls to zil_commit_writer() to issue * the lwb, or the timeout mechanism found in zil_commit_waiter(). 
*/ -static void +static uint64_t zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) { + list_t ilwbs; + lwb_t *lwb; + uint64_t wtxg = 0; + ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(spa_writeable(zilog->zl_spa)); + list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node)); mutex_enter(&zilog->zl_issuer_lock); if (zcw->zcw_lwb != NULL || zcw->zcw_done) { @@ -2515,14 +3133,18 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) goto out; } - ZIL_STAT_BUMP(zil_commit_writer_count); + ZIL_STAT_BUMP(zilog, zil_commit_writer_count); - zil_get_commit_list(zilog); + wtxg = zil_get_commit_list(zilog); zil_prune_commit_list(zilog); - zil_process_commit_list(zilog); + zil_process_commit_list(zilog, zcw, &ilwbs); out: mutex_exit(&zilog->zl_issuer_lock); + while ((lwb = list_remove_head(&ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); + list_destroy(&ilwbs); + return (wtxg); } static void @@ -2534,7 +3156,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) lwb_t *lwb = zcw->zcw_lwb; ASSERT3P(lwb, !=, NULL); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); /* * If the lwb has already been issued by another thread, we can @@ -2543,13 +3165,11 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * do this prior to acquiring the zl_issuer_lock, to avoid * acquiring it when it's not necessary to do so. */ - if (lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) + if (lwb->lwb_state != LWB_STATE_OPENED) return; /* - * In order to call zil_lwb_write_issue() we must hold the + * In order to call zil_lwb_write_close() we must hold the * zilog's "zl_issuer_lock". We can't simply acquire that lock, * since we're already holding the commit waiter's "zcw_lock", * and those two locks are acquired in the opposite order @@ -2567,8 +3187,10 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * the waiter is marked "done"), so without this check we could * wind up with a use-after-free error below. */ - if (zcw->zcw_done) - goto out; + if (zcw->zcw_done) { + mutex_exit(&zilog->zl_issuer_lock); + return; + } ASSERT3P(lwb, ==, zcw->zcw_lwb); @@ -2578,26 +3200,33 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * second time while holding the lock. * * We don't need to hold the zl_lock since the lwb cannot transition - * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb - * _can_ transition from ISSUED to DONE, but it's OK to race with + * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb + * _can_ transition from CLOSED to DONE, but it's OK to race with * that transition since we treat the lwb the same, whether it's in - * the ISSUED or DONE states. + * the CLOSED, ISSUED or DONE states. * * The important thing, is we treat the lwb differently depending on - * if it's ISSUED or OPENED, and block any other threads that might - * attempt to issue this lwb. For that reason we hold the + * if it's OPENED or CLOSED, and block any other threads that might + * attempt to close/issue this lwb. For that reason we hold the * zl_issuer_lock when checking the lwb_state; we must not call - * zil_lwb_write_issue() if the lwb had already been issued. + * zil_lwb_write_close() if the lwb had already been closed/issued. * * See the comment above the lwb_state_t structure definition for * more details on the lwb states, and locking requirements. 
*/ - if (lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) - goto out; + if (lwb->lwb_state != LWB_STATE_OPENED) { + mutex_exit(&zilog->zl_issuer_lock); + return; + } - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + /* + * We do not need zcw_lock once we hold zl_issuer_lock and know lwb + * is still open. But we have to drop it to avoid a deadlock in case + * callback of zio issued by zil_lwb_write_issue() try to get it, + * while zil_lwb_write_issue() is blocked on attempt to issue next + * lwb it found in LWB_STATE_READY state. + */ + mutex_exit(&zcw->zcw_lock); /* * As described in the comments above zil_commit_waiter() and @@ -2605,55 +3234,27 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ - lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb); + zil_burst_done(zilog); + lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); - IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED); - - /* - * Since the lwb's zio hadn't been issued by the time this thread - * reached its timeout, we reset the zilog's "zl_cur_used" field - * to influence the zil block size selection algorithm. - * - * By having to issue the lwb's zio here, it means the size of the - * lwb was too large, given the incoming throughput of itxs. By - * setting "zl_cur_used" to zero, we communicate this fact to the - * block size selection algorithm, so it can take this information - * into account, and potentially select a smaller size for the - * next lwb block that is allocated. - */ - zilog->zl_cur_used = 0; + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); if (nlwb == NULL) { /* - * When zil_lwb_write_issue() returns NULL, this + * When zil_lwb_write_close() returns NULL, this * indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. When this occurs, the ZIL write * pipeline must be stalled; see the comment within the * zil_commit_writer_stall() function for more details. - * - * We must drop the commit waiter's lock prior to - * calling zil_commit_writer_stall() or else we can wind - * up with the following deadlock: - * - * - This thread is waiting for the txg to sync while - * holding the waiter's lock; txg_wait_synced() is - * used within txg_commit_writer_stall(). - * - * - The txg can't sync because it is waiting for this - * lwb's zio callback to call dmu_tx_commit(). - * - * - The lwb's zio callback can't call dmu_tx_commit() - * because it's blocked trying to acquire the waiter's - * lock, which occurs prior to calling dmu_tx_commit() */ - mutex_exit(&zcw->zcw_lock); + zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); - mutex_enter(&zcw->zcw_lock); + mutex_exit(&zilog->zl_issuer_lock); + } else { + mutex_exit(&zilog->zl_issuer_lock); + zil_lwb_write_issue(zilog, lwb); } - -out: - mutex_exit(&zilog->zl_issuer_lock); - ASSERT(MUTEX_HELD(&zcw->zcw_lock)); + mutex_enter(&zcw->zcw_lock); } /* @@ -2667,7 +3268,7 @@ out: * waited "long enough" and the lwb is still in the "open" state. * * Given a sufficient amount of itxs being generated and written using - * the ZIL, the lwb's zio will be issued via the zil_lwb_commit() + * the ZIL, the lwb's zio will be issued via the zil_lwb_assign() * function. If this does not occur, this secondary responsibility will * ensure the lwb is issued even if there is not other synchronous * activity on the system. 
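The locking dance above follows a standard shape: a thread holding only the lower-ranked lock must drop it, take the higher-ranked lock, retake the lower one, and then revalidate whatever it learned before dropping it; and it must not hold the waiter's lock across anything whose completion callback may also want that lock. The sketch below shows that shape with invented names (struct waiter, issuer_lock, timeout_path); it is a simplification of zil_commit_waiter_timeout(), not a copy of it.

#include <pthread.h>
#include <stdbool.h>

struct waiter {
        pthread_mutex_t lock;   /* lower-ranked lock, like zcw_lock */
        bool done;
};

static pthread_mutex_t issuer_lock = PTHREAD_MUTEX_INITIALIZER; /* higher-ranked */

static void
timeout_path(struct waiter *w)
{
        /* Entered and exited holding w->lock, like the real timeout path. */
        pthread_mutex_unlock(&w->lock);
        pthread_mutex_lock(&issuer_lock);
        pthread_mutex_lock(&w->lock);

        if (w->done) {          /* state may have changed while unlocked */
                pthread_mutex_unlock(&issuer_lock);
                return;
        }

        /*
         * Drop w->lock again before doing anything whose completion callback
         * also needs it, then do the work holding only issuer_lock.
         */
        pthread_mutex_unlock(&w->lock);
        /* ... close and issue the lwb here ... */
        pthread_mutex_unlock(&issuer_lock);
        pthread_mutex_lock(&w->lock);
}

int
main(void)
{
        struct waiter w = { .lock = PTHREAD_MUTEX_INITIALIZER, .done = false };

        pthread_mutex_lock(&w.lock);    /* callers enter holding the waiter lock */
        timeout_path(&w);
        pthread_mutex_unlock(&w.lock);
        return (0);
}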
@@ -2718,7 +3319,7 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) * where it's "zcw_lwb" field is NULL, and it hasn't yet * been skipped, so it's "zcw_done" field is still B_FALSE. */ - IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED); + IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW); if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { ASSERT3B(timedout, ==, B_FALSE); @@ -2766,6 +3367,8 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) */ IMPLY(lwb != NULL, + lwb->lwb_state == LWB_STATE_CLOSED || + lwb->lwb_state == LWB_STATE_READY || lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE); @@ -2812,7 +3415,14 @@ static void zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) { dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + /* + * Since we are not going to create any new dirty data, and we + * can even help with clearing the existing dirty data, we + * should not be subject to the dirty data based delays. We + * use TXG_NOTHROTTLE to bypass the delay mechanism. + */ + VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); itx->itx_sync = B_TRUE; @@ -2869,7 +3479,7 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) * queue prior to zil_commit() having been called, and which itxs were * added after zil_commit() was called. * - * The commit it is special; it doesn't have any on-disk representation. + * The commit itx is special; it doesn't have any on-disk representation. * When a commit itx is "committed" to an lwb, the waiter associated * with it is linked onto the lwb's list of waiters. Then, when that lwb * completes, each waiter on the lwb's list is marked done and signaled @@ -2884,8 +3494,8 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) * callback of the lwb's zio[*]. * * * Actually, the waiters are signaled in the zio completion - * callback of the root zio for the DKIOCFLUSHWRITECACHE commands - * that are sent to the vdevs upon completion of the lwb zio. + * callback of the root zio for the flush commands that are sent to + * the vdevs upon completion of the lwb zio. * * 2. 
When the itxs are inserted into the ZIL's queue of uncommitted * itxs, the order in which they are inserted is preserved[*]; as @@ -2992,7 +3602,7 @@ zil_commit(zilog_t *zilog, uint64_t foid) void zil_commit_impl(zilog_t *zilog, uint64_t foid) { - ZIL_STAT_BUMP(zil_commit_count); + ZIL_STAT_BUMP(zilog, zil_commit_count); /* * Move the "async" itxs for the specified foid to the "sync" @@ -3023,7 +3633,7 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid) zil_commit_waiter_t *zcw = zil_alloc_commit_waiter(); zil_commit_itx_assign(zilog, zcw); - zil_commit_writer(zilog, zcw); + uint64_t wtxg = zil_commit_writer(zilog, zcw); zil_commit_waiter(zilog, zcw); if (zcw->zcw_zio_error != 0) { @@ -3038,6 +3648,8 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid) DTRACE_PROBE2(zil__commit__io__error, zilog_t *, zilog, zil_commit_waiter_t *, zcw); txg_wait_synced(zilog->zl_dmu_pool, 0); + } else if (wtxg != 0) { + txg_wait_synced(zilog->zl_dmu_pool, wtxg); } zil_free_commit_waiter(zcw); @@ -3062,6 +3674,8 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) if (spa_sync_pass(spa) != 1) return; + zil_lwb_flush_wait_all(zilog, txg); + mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_stop_sync == 0); @@ -3074,11 +3688,13 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; + dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); - ASSERT(list_head(&zilog->zl_lwb_list) == NULL); + ASSERT(list_is_empty(&zilog->zl_lwb_list)); - bzero(zh, sizeof (zil_header_t)); - bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); + memset(zh, 0, sizeof (zil_header_t)); + memset(zilog->zl_replayed_seq, 0, + sizeof (zilog->zl_replayed_seq)); if (zilog->zl_keep_first) { /* @@ -3091,15 +3707,27 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) */ zil_init_log_chain(zilog, &blk); zh->zh_log = blk; + } else { + /* + * A destroyed ZIL chain can't contain any TX_SETSAXATTR + * records. So, deactivate the feature for this dataset. + * We activate it again when we start a new ZIL chain. + */ + if (dsl_dataset_feature_is_active(ds, + SPA_FEATURE_ZILSAXATTR)) + dsl_dataset_deactivate_feature(ds, + SPA_FEATURE_ZILSAXATTR, tx); } } while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; - if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) + if (lwb->lwb_state != LWB_STATE_FLUSH_DONE || + lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); - zio_free(spa, txg, &lwb->lwb_blk); + if (!BP_IS_HOLE(&lwb->lwb_blk)) + zio_free(spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); /* @@ -3108,29 +3736,17 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) * out the zil_header blkptr so that we don't end * up freeing the same block twice. */ - if (list_head(&zilog->zl_lwb_list) == NULL) + if (list_is_empty(&zilog->zl_lwb_list)) BP_ZERO(&zh->zh_log); } - /* - * Remove fastwrite on any blocks that have been pre-allocated for - * the next commit. This prevents fastwrite counter pollution by - * unused, long-lived LWBs. 
- */ - for (; lwb != NULL; lwb = list_next(&zilog->zl_lwb_list, lwb)) { - if (lwb->lwb_fastwrite && !lwb->lwb_write_zio) { - metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); - lwb->lwb_fastwrite = 0; - } - } - mutex_exit(&zilog->zl_lock); } -/* ARGSUSED */ static int zil_lwb_cons(void *vbuf, void *unused, int kmflag) { + (void) unused, (void) kmflag; lwb_t *lwb = vbuf; list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), @@ -3141,10 +3757,10 @@ zil_lwb_cons(void *vbuf, void *unused, int kmflag) return (0); } -/* ARGSUSED */ static void zil_lwb_dest(void *vbuf, void *unused) { + (void) unused; lwb_t *lwb = vbuf; mutex_destroy(&lwb->lwb_vdev_lock); avl_destroy(&lwb->lwb_vdev_tree); @@ -3161,13 +3777,16 @@ zil_init(void) zil_zcw_cache = kmem_cache_create("zil_zcw_cache", sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - zil_ksp = kstat_create("zfs", 0, "zil", "misc", + zil_sums_init(&zil_sums_global); + zil_kstats_global = kstat_create("zfs", 0, "zil", "misc", KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); - if (zil_ksp != NULL) { - zil_ksp->ks_data = &zil_stats; - kstat_install(zil_ksp); + if (zil_kstats_global != NULL) { + zil_kstats_global->ks_data = &zil_stats; + zil_kstats_global->ks_update = zil_kstats_global_update; + zil_kstats_global->ks_private = NULL; + kstat_install(zil_kstats_global); } } @@ -3177,10 +3796,12 @@ zil_fini(void) kmem_cache_destroy(zil_zcw_cache); kmem_cache_destroy(zil_lwb_cache); - if (zil_ksp != NULL) { - kstat_delete(zil_ksp); - zil_ksp = NULL; + if (zil_kstats_global != NULL) { + kstat_delete(zil_kstats_global); + zil_kstats_global = NULL; } + + zil_sums_fini(&zil_sums_global); } void @@ -3212,10 +3833,13 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) zilog->zl_dirty_max_txg = 0; zilog->zl_last_lwb_opened = NULL; zilog->zl_last_lwb_latency = 0; - zilog->zl_max_block_size = zil_maxblocksize; + zilog->zl_max_block_size = MIN(MAX(P2ALIGN_TYPED(zil_maxblocksize, + ZIL_MIN_BLKSZ, uint64_t), ZIL_MIN_BLKSZ), + spa_maxblocksize(dmu_objset_spa(os))); mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zilog->zl_lwb_io_lock, NULL, MUTEX_DEFAULT, NULL); for (int i = 0; i < TXG_SIZE; i++) { mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, @@ -3229,6 +3853,12 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) offsetof(itx_t, itx_node)); cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); + cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL); + + for (int i = 0; i < ZIL_BURSTS; i++) { + zilog->zl_prev_opt[i] = zilog->zl_max_block_size - + sizeof (zil_chain_t); + } return (zilog); } @@ -3264,8 +3894,10 @@ zil_free(zilog_t *zilog) mutex_destroy(&zilog->zl_issuer_lock); mutex_destroy(&zilog->zl_lock); + mutex_destroy(&zilog->zl_lwb_io_lock); cv_destroy(&zilog->zl_cv_suspend); + cv_destroy(&zilog->zl_lwb_io_cv); kmem_free(zilog, sizeof (zilog_t)); } @@ -3274,7 +3906,7 @@ zil_free(zilog_t *zilog) * Open an intent log. 
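The new clamp on zl_max_block_size in zil_alloc() keeps the zil_maxblocksize tunable sane: the value is rounded down to the ZIL block granularity, floored at one minimum block, and capped at the pool's maximum block size. The sketch below assumes ZIL_MIN_BLKSZ is 4 KiB (consistent with the "multiple of 4KB" rule in the removed bucket table earlier in this diff) and that spa_maxblocksize() is at least that large; clamp_max_block is an invented helper, not a function in zil.c.

#include <stdint.h>
#include <stdio.h>

#define ZIL_MIN_BLKSZ   4096u   /* assumed 4 KiB ZIL block granularity */

static uint64_t
clamp_max_block(uint64_t tunable, uint64_t spa_max)
{
        uint64_t v = tunable & ~(uint64_t)(ZIL_MIN_BLKSZ - 1);  /* align down */

        if (v < ZIL_MIN_BLKSZ)
                v = ZIL_MIN_BLKSZ;      /* floor at one minimum block */
        if (v > spa_max)
                v = spa_max;            /* cap at the pool maximum */
        return (v);
}

int
main(void)
{
        /* An odd zil_maxblocksize of 200000 rounds down to 196608 (48 * 4096);
         * a nonsense value of 1000 is pulled up to the 4096-byte floor. */
        printf("%llu %llu\n",
            (unsigned long long)clamp_max_block(200000, 16u << 20),
            (unsigned long long)clamp_max_block(1000, 16u << 20));
        return (0);
}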
*/ zilog_t * -zil_open(objset_t *os, zil_get_data_t *get_data) +zil_open(objset_t *os, zil_get_data_t *get_data, zil_sums_t *zil_sums) { zilog_t *zilog = dmu_objset_zil(os); @@ -3283,6 +3915,7 @@ zil_open(objset_t *os, zil_get_data_t *get_data) ASSERT(list_is_empty(&zilog->zl_lwb_list)); zilog->zl_get_data = get_data; + zilog->zl_sums = zil_sums; return (zilog); } @@ -3299,23 +3932,33 @@ zil_close(zilog_t *zilog) if (!dmu_objset_is_snapshot(zilog->zl_os)) { zil_commit(zilog, 0); } else { - ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); + ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT0(zilog->zl_dirty_max_txg); ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE); } mutex_enter(&zilog->zl_lock); + txg = zilog->zl_dirty_max_txg; lwb = list_tail(&zilog->zl_lwb_list); - if (lwb == NULL) - txg = zilog->zl_dirty_max_txg; - else - txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg); + if (lwb != NULL) { + txg = MAX(txg, lwb->lwb_alloc_txg); + txg = MAX(txg, lwb->lwb_max_txg); + } mutex_exit(&zilog->zl_lock); /* - * We need to use txg_wait_synced() to wait long enough for the - * ZIL to be clean, and to wait for all pending lwbs to be - * written out. + * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends + * on the time when the dmu_tx transaction is assigned in + * zil_lwb_write_issue(). + */ + mutex_enter(&zilog->zl_lwb_io_lock); + txg = MAX(zilog->zl_lwb_max_issued_txg, txg); + mutex_exit(&zilog->zl_lwb_io_lock); + + /* + * We need to use txg_wait_synced() to wait until that txg is synced. + * zil_sync() will guarantee all lwbs up to that txg have been + * written out, flushed, and cleaned. */ if (txg != 0) txg_wait_synced(zilog->zl_dmu_pool, txg); @@ -3332,22 +3975,17 @@ zil_close(zilog_t *zilog) * We should have only one lwb left on the list; remove it now. */ mutex_enter(&zilog->zl_lock); - lwb = list_head(&zilog->zl_lwb_list); + lwb = list_remove_head(&zilog->zl_lwb_list); if (lwb != NULL) { - ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list)); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - - if (lwb->lwb_fastwrite) - metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); - - list_remove(&zilog->zl_lwb_list, lwb); + ASSERT(list_is_empty(&zilog->zl_lwb_list)); + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zil_free_lwb(zilog, lwb); } mutex_exit(&zilog->zl_lock); } -static char *suspend_tag = "zil suspending"; +static const char *suspend_tag = "zil suspending"; /* * Suspend an intent log. While in suspended mode, we still honor @@ -3461,7 +4099,7 @@ zil_suspend(const char *osname, void **cookiep) /* * We need to use zil_commit_impl to ensure we wait for all - * LWB_STATE_OPENED and LWB_STATE_ISSUED lwbs to be committed + * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed * to disk before proceeding. If we used zil_commit instead, it * would just call txg_wait_synced(), because zl_suspend is set. * txg_wait_synced() doesn't wait for these lwb's to be @@ -3508,7 +4146,7 @@ zil_resume(void *cookie) } typedef struct zil_replay_arg { - zil_replay_func_t **zr_replay; + zil_replay_func_t *const *zr_replay; void *zr_arg; boolean_t zr_byteswap; char *zr_lr; @@ -3570,7 +4208,7 @@ zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra, /* * Make a copy of the data so we can revise and extend it. */ - bcopy(lr, zr->zr_lr, reclen); + memcpy(zr->zr_lr, lr, reclen); /* * If this is a TX_WRITE with a blkptr, suck in the data. 
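In the zil_close() hunk above, the txg to wait on now also covers the last lwb's allocation txg and the new zl_lwb_max_issued_txg. A stand-alone sketch of that max computation, with illustrative names only:

#include <stdint.h>

static uint64_t
illus_max(uint64_t a, uint64_t b)
{
	return (a > b ? a : b);
}

static uint64_t
illus_zil_close_txg(uint64_t dirty_max_txg, int have_lwb,
    uint64_t lwb_alloc_txg, uint64_t lwb_max_txg, uint64_t max_issued_txg)
{
	uint64_t txg = dirty_max_txg;
	if (have_lwb) {
		txg = illus_max(txg, lwb_alloc_txg);	/* block allocation */
		txg = illus_max(txg, lwb_max_txg);	/* records in the lwb */
	}
	txg = illus_max(txg, max_issued_txg);	/* txg assigned at issue time */
	return (txg);	/* 0 means there is nothing to wait for */
}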
@@ -3615,10 +4253,11 @@ zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra, return (0); } -/* ARGSUSED */ static int zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) { + (void) bp, (void) arg, (void) claim_txg; + zilog->zl_replay_blks++; return (0); @@ -3626,17 +4265,18 @@ zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) /* * If this dataset has a non-empty intent log, replay it and destroy it. + * Return B_TRUE if there were any entries to replay. */ -void -zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) +boolean_t +zil_replay(objset_t *os, void *arg, + zil_replay_func_t *const replay_func[TX_MAX_TYPE]) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; zil_replay_arg_t zr; if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { - zil_destroy(zilog, B_TRUE); - return; + return (zil_destroy(zilog, B_TRUE)); } zr.zr_replay = replay_func; @@ -3659,6 +4299,8 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) zil_destroy(zilog, B_FALSE); txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_replay = B_FALSE; + + return (B_TRUE); } boolean_t @@ -3677,13 +4319,12 @@ zil_replaying(zilog_t *zilog, dmu_tx_t *tx) return (B_FALSE); } -/* ARGSUSED */ int zil_reset(const char *osname, void *arg) { - int error; + (void) arg; - error = zil_suspend(osname, NULL); + int error = zil_suspend(osname, NULL); /* EACCES means crypto key not loaded */ if ((error == EACCES) || (error == EBUSY)) return (SET_ERROR(error)); @@ -3714,9 +4355,11 @@ EXPORT_SYMBOL(zil_lwb_add_block); EXPORT_SYMBOL(zil_bp_tree_add); EXPORT_SYMBOL(zil_set_sync); EXPORT_SYMBOL(zil_set_logbias); +EXPORT_SYMBOL(zil_sums_init); +EXPORT_SYMBOL(zil_sums_fini); +EXPORT_SYMBOL(zil_kstat_values_update); -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW, "ZIL block open timeout percentage"); ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, @@ -3725,9 +4368,11 @@ ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW, "Disable ZIL cache flushes"); -ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW, "Limit in bytes slog sync writes per commit"); -ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW, "Limit in bytes of ZIL log block size"); -/* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW, + "Limit in bytes WR_COPIED size"); diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index c016fa323b41..d68d5ababe79 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,10 +20,10 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2022 by Delphix. All rights reserved. 
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2023, 2024, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2021, Datto, Inc. */ @@ -41,6 +41,7 @@ #include <sys/zio_checksum.h> #include <sys/dmu_objset.h> #include <sys/arc.h> +#include <sys/brt.h> #include <sys/ddt.h> #include <sys/blkptr.h> #include <sys/zfeature.h> @@ -57,33 +58,33 @@ * I/O type descriptions * ========================================================================== */ -const char *zio_type_name[ZIO_TYPES] = { +const char *const zio_type_name[ZIO_TYPES] = { /* * Note: Linux kernel thread name length is limited * so these names will differ from upstream open zfs. */ - "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim" + "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_flush", "z_trim" }; int zio_dva_throttle_enabled = B_TRUE; -int zio_deadman_log_all = B_FALSE; +static int zio_deadman_log_all = B_FALSE; /* * ========================================================================== * I/O kmem caches * ========================================================================== */ -kmem_cache_t *zio_cache; -kmem_cache_t *zio_link_cache; +static kmem_cache_t *zio_cache; +static kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #if defined(ZFS_DEBUG) && !defined(_KERNEL) -uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; -uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; +static uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; +static uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #endif /* Mark IOs as "slow" if they take longer than 30 seconds */ -int zio_slow_io_ms = (30 * MILLISEC); +static uint_t zio_slow_io_ms = (30 * MILLISEC); #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) @@ -114,9 +115,15 @@ int zio_slow_io_ms = (30 * MILLISEC); * fragmented systems, which may have very few free segments of this size, * and may need to load new metaslabs to satisfy 128K allocations. */ -int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ -int zfs_sync_pass_dont_compress = 8; /* don't compress starting in this pass */ -int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ + +/* defer frees starting in this pass */ +uint_t zfs_sync_pass_deferred_free = 2; + +/* don't compress starting in this pass */ +static uint_t zfs_sync_pass_dont_compress = 8; + +/* rewrite new bps starting in this pass */ +static uint_t zfs_sync_pass_rewrite = 2; /* * An allocating zio is one that either currently has the DVA allocate @@ -129,12 +136,12 @@ int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ * allocations as well. */ int zio_exclude_metadata = 0; -int zio_requeue_io_start_cut_in_line = 1; +static int zio_requeue_io_start_cut_in_line = 1; #ifdef ZFS_DEBUG -int zio_buf_debug_limit = 16384; +static const int zio_buf_debug_limit = 16384; #else -int zio_buf_debug_limit = 0; +static const int zio_buf_debug_limit = 0; #endif static inline void __zio_execute(zio_t *zio); @@ -151,32 +158,22 @@ zio_init(void) zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - /* - * For small buffers, we want a cache for each multiple of - * SPA_MINBLOCKSIZE. 
For larger buffers, we want a cache - * for each quarter-power of 2. - */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; - size_t p2 = size; - size_t align = 0; - size_t data_cflags, cflags; - - data_cflags = KMC_NODEBUG; - cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? - KMC_NODEBUG : 0; + size_t align, cflags, data_cflags; + char name[32]; -#if defined(_ILP32) && defined(_KERNEL) /* - * Cache size limited to 1M on 32-bit platforms until ARC - * buffers no longer require virtual address space. + * Create cache for each half-power of 2 size, starting from + * SPA_MINBLOCKSIZE. It should give us memory space efficiency + * of ~7/8, sufficient for transient allocations mostly using + * these caches. */ - if (size > zfs_max_recordsize) - break; -#endif - + size_t p2 = size; while (!ISP2(p2)) p2 &= p2 - 1; + if (!IS_P2ALIGNED(size, p2 / 2)) + continue; #ifndef _KERNEL /* @@ -187,47 +184,37 @@ zio_init(void) */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; - /* - * Here's the problem - on 4K native devices in userland on - * Linux using O_DIRECT, buffers must be 4K aligned or I/O - * will fail with EINVAL, causing zdb (and others) to coredump. - * Since userland probably doesn't need optimized buffer caches, - * we just force 4K alignment on everything. - */ - align = 8 * SPA_MINBLOCKSIZE; -#else - if (size < PAGESIZE) { - align = SPA_MINBLOCKSIZE; - } else if (IS_P2ALIGNED(size, p2 >> 2)) { - align = PAGESIZE; - } #endif - if (align != 0) { - char name[36]; - if (cflags == data_cflags) { - /* - * Resulting kmem caches would be identical. - * Save memory by creating only one. - */ - (void) snprintf(name, sizeof (name), - "zio_buf_comb_%lu", (ulong_t)size); - zio_buf_cache[c] = kmem_cache_create(name, - size, align, NULL, NULL, NULL, NULL, NULL, - cflags); - zio_data_buf_cache[c] = zio_buf_cache[c]; - continue; - } - (void) snprintf(name, sizeof (name), "zio_buf_%lu", - (ulong_t)size); - zio_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, NULL, cflags); - - (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", - (ulong_t)size); - zio_data_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, NULL, data_cflags); + if (IS_P2ALIGNED(size, PAGESIZE)) + align = PAGESIZE; + else + align = 1 << (highbit64(size ^ (size - 1)) - 1); + + cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? + KMC_NODEBUG : 0; + data_cflags = KMC_NODEBUG; + if (cflags == data_cflags) { + /* + * Resulting kmem caches would be identical. + * Save memory by creating only one. + */ + (void) snprintf(name, sizeof (name), + "zio_buf_comb_%lu", (ulong_t)size); + zio_buf_cache[c] = kmem_cache_create(name, size, align, + NULL, NULL, NULL, NULL, NULL, cflags); + zio_data_buf_cache[c] = zio_buf_cache[c]; + continue; } + (void) snprintf(name, sizeof (name), "zio_buf_%lu", + (ulong_t)size); + zio_buf_cache[c] = kmem_cache_create(name, size, align, + NULL, NULL, NULL, NULL, NULL, cflags); + + (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", + (ulong_t)size); + zio_data_buf_cache[c] = kmem_cache_create(name, size, align, + NULL, NULL, NULL, NULL, NULL, data_cflags); } while (--c != 0) { @@ -308,6 +295,53 @@ zio_fini(void) * ========================================================================== */ +#ifdef ZFS_DEBUG +static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b; +#endif + +/* + * Use empty space after the buffer to detect overflows. 
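The rewritten zio_init() loop above only creates kmem caches for "half power of two" sizes and derives each cache's alignment from the size itself. The stand-alone program below reproduces that selection rule so you can see which size classes survive and what alignment they get; the constants (512-byte SPA_MINBLOCKSIZE, 4 KiB pages) are assumptions and the names are not ZFS symbols.

#include <stdint.h>
#include <stdio.h>

#define	ILLUS_SPA_MINBLOCKSIZE	512ULL
#define	ILLUS_PAGESIZE		4096ULL

static int
is_p2aligned(uint64_t x, uint64_t a)
{
	return ((x & (a - 1)) == 0);
}

int
main(void)
{
	for (uint64_t size = ILLUS_SPA_MINBLOCKSIZE; size <= 16384;
	    size += ILLUS_SPA_MINBLOCKSIZE) {
		uint64_t p2 = size;
		while (p2 & (p2 - 1))
			p2 &= p2 - 1;	/* largest power of two <= size */
		if (!is_p2aligned(size, p2 / 2))
			continue;	/* no dedicated cache for this size */
		uint64_t align = is_p2aligned(size, ILLUS_PAGESIZE) ?
		    ILLUS_PAGESIZE : (size & -size);
		printf("cache for %6llu bytes, align %llu\n",
		    (unsigned long long)size, (unsigned long long)align);
	}
	return (0);
}

With these assumptions it reports caches at 512, 1024, 1536, 2048, 3072, 4096, 6144, and so on, matching the half-power-of-two spacing described in the new comment.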
+ * + * Since zio_init() creates kmem caches only for certain set of buffer sizes, + * allocations of different sizes may have some unused space after the data. + * Filling part of that space with a known pattern on allocation and checking + * it on free should allow us to detect some buffer overflows. + */ +static void +zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) +{ +#ifdef ZFS_DEBUG + size_t off = P2ROUNDUP(size, sizeof (ulong_t)); + ulong_t *canary = p + off / sizeof (ulong_t); + size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; + if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT && + cache[c] == cache[c + 1]) + asize = (c + 2) << SPA_MINBLOCKSHIFT; + for (; off < asize; canary++, off += sizeof (ulong_t)) + *canary = zio_buf_canary; +#endif +} + +static void +zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) +{ +#ifdef ZFS_DEBUG + size_t off = P2ROUNDUP(size, sizeof (ulong_t)); + ulong_t *canary = p + off / sizeof (ulong_t); + size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; + if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT && + cache[c] == cache[c + 1]) + asize = (c + 2) << SPA_MINBLOCKSHIFT; + for (; off < asize; canary++, off += sizeof (ulong_t)) { + if (unlikely(*canary != zio_buf_canary)) { + PANIC("ZIO buffer overflow %p (%zu) + %zu %#lx != %#lx", + p, size, (canary - p) * sizeof (ulong_t), + *canary, zio_buf_canary); + } + } +#endif +} + /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's @@ -324,7 +358,9 @@ zio_buf_alloc(size_t size) atomic_add_64(&zio_buf_cache_allocs[c], 1); #endif - return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); + void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE); + zio_buf_put_canary(p, size, zio_buf_cache, c); + return (p); } /* @@ -340,7 +376,9 @@ zio_data_buf_alloc(size_t size) VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); + void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE); + zio_buf_put_canary(p, size, zio_data_buf_cache, c); + return (p); } void @@ -353,6 +391,7 @@ zio_buf_free(void *buf, size_t size) atomic_add_64(&zio_buf_cache_frees[c], 1); #endif + zio_buf_check_canary(buf, size, zio_buf_cache, c); kmem_cache_free(zio_buf_cache[c], buf); } @@ -363,12 +402,14 @@ zio_data_buf_free(void *buf, size_t size) VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + zio_buf_check_canary(buf, size, zio_data_buf_cache, c); kmem_cache_free(zio_data_buf_cache[c], buf); } static void zio_abd_free(void *abd, size_t size) { + (void) size; abd_free((abd_t *)abd); } @@ -514,8 +555,9 @@ zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) /* * If this is an authenticated block, just check the MAC. It would be - * nice to separate this out into its own flag, but for the moment - * enum zio_flag is out of bits. + * nice to separate this out into its own flag, but when this was done, + * we had run out of bits in what is now zio_flag_t. Future cleanup + * could make this a flag bit. 
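The canary helpers above fill the unused tail of a size-class buffer with a known pattern and verify it on free. Below is a stand-alone demonstration of the same idea against plain malloc(); the constant and helper names are illustrative, not the ZFS ones.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define	ILLUS_CANARY	((uint64_t)0xdeadc0dedead210bULL)

static void
put_canary(void *buf, size_t used, size_t cache_size)
{
	uint64_t *p = (uint64_t *)((char *)buf + ((used + 7) & ~(size_t)7));
	uint64_t *end = (uint64_t *)((char *)buf + cache_size);
	while (p < end)
		*p++ = ILLUS_CANARY;	/* pattern the slack space */
}

static int
check_canary(const void *buf, size_t used, size_t cache_size)
{
	const uint64_t *p = (const uint64_t *)
	    ((const char *)buf + ((used + 7) & ~(size_t)7));
	const uint64_t *end =
	    (const uint64_t *)((const char *)buf + cache_size);
	while (p < end) {
		if (*p++ != ILLUS_CANARY)
			return (0);	/* overflow detected */
	}
	return (1);
}

int
main(void)
{
	/* A 1000-byte request served from a 1536-byte size class. */
	void *buf = malloc(1536);
	put_canary(buf, 1000, 1536);
	memset(buf, 0xab, 1010);	/* write 10 bytes past the request */
	printf("canary intact: %d\n", check_canary(buf, 1000, 1536));
	free(buf);
	return (0);
}

The memset deliberately runs 10 bytes past the requested size, so the check reports the canary as broken.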
*/ if (BP_IS_AUTHENTICATED(bp)) { if (ot == DMU_OT_OBJSET) { @@ -570,7 +612,8 @@ error: if (ret == ECKSUM) { zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { - spa_log_error(spa, &zio->io_bookmark); + spa_log_error(spa, &zio->io_bookmark, + BP_GET_LOGICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, &zio->io_bookmark, zio, 0); } @@ -625,8 +668,6 @@ zio_unique_parent(zio_t *cio) void zio_add_child(zio_t *pio, zio_t *cio) { - zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); - /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. @@ -635,6 +676,12 @@ zio_add_child(zio_t *pio, zio_t *cio) */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); + /* Parent should not have READY stage if child doesn't have it. */ + IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 && + (cio->io_child_type != ZIO_CHILD_VDEV), + (pio->io_pipeline & ZIO_STAGE_READY) == 0); + + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; @@ -643,19 +690,53 @@ zio_add_child(zio_t *pio, zio_t *cio) ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); + uint64_t *countp = pio->io_children[cio->io_child_type]; for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; + countp[w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); - pio->io_child_count++; - cio->io_parent_count++; - mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); } +void +zio_add_child_first(zio_t *pio, zio_t *cio) +{ + /* + * Logical I/Os can have logical, gang, or vdev children. + * Gang I/Os can have gang or vdev children. + * Vdev I/Os can only have vdev children. + * The following ASSERT captures all of these constraints. + */ + ASSERT3S(cio->io_child_type, <=, pio->io_child_type); + + /* Parent should not have READY stage if child doesn't have it. */ + IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 && + (cio->io_child_type != ZIO_CHILD_VDEV), + (pio->io_pipeline & ZIO_STAGE_READY) == 0); + + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); + zl->zl_parent = pio; + zl->zl_child = cio; + + ASSERT(list_is_empty(&cio->io_parent_list)); + list_insert_head(&cio->io_parent_list, zl); + + mutex_enter(&pio->io_lock); + + ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); + + uint64_t *countp = pio->io_children[cio->io_child_type]; + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + countp[w] += !cio->io_state[w]; + + list_insert_head(&pio->io_child_list, zl); + + mutex_exit(&pio->io_lock); +} + static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { @@ -668,9 +749,6 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); - pio->io_child_count--; - cio->io_parent_count--; - mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); kmem_cache_free(zio_link_cache, zl); @@ -725,7 +803,10 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, /* * If we can tell the caller to execute this parent next, do - * so. Otherwise dispatch the parent zio as its own task. + * so. We do this if the parent's zio type matches the child's + * type, or if it's a zio_null() with no done callback, and so + * has no actual work to do. Otherwise dispatch the parent zio + * in its own taskq. 
* * Having the caller execute the parent when possible reduces * locking on the zio taskq's, reduces context switch @@ -744,7 +825,9 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, * parent-child relationships, as we do with the "mega zio" * of writes for spa_sync(), and the chain of ZIL blocks. */ - if (next_to_executep != NULL && *next_to_executep == NULL) { + if (next_to_executep != NULL && *next_to_executep == NULL && + (pio->io_type == zio->io_type || + (pio->io_type == ZIO_TYPE_NULL && !pio->io_done))) { *next_to_executep = pio; } else { zio_taskq_dispatch(pio, type, B_FALSE); @@ -804,7 +887,7 @@ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, - enum zio_flag flags, vdev_t *vd, uint64_t offset, + zio_flag_t flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { @@ -821,7 +904,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); - bzero(zio, sizeof (zio_t)); + memset(zio, 0, sizeof (zio_t)); mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); @@ -842,12 +925,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { - zio->io_bp = (blkptr_t *)bp; - zio->io_bp_copy = *bp; - zio->io_bp_orig = *bp; if (type != ZIO_TYPE_WRITE || - zio->io_child_type == ZIO_CHILD_DDT) + zio->io_child_type == ZIO_CHILD_DDT) { + zio->io_bp_copy = *bp; zio->io_bp = &zio->io_bp_copy; /* so caller can free */ + } else { + zio->io_bp = (blkptr_t *)bp; + } + zio->io_bp_orig = *bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) @@ -869,8 +954,10 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; + zio->io_allocator = ZIO_ALLOCATOR_NONE; - zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); + zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) || + (pipeline & ZIO_STAGE_READY) == 0; zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) @@ -882,7 +969,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; - zio_add_child(pio, zio); + zio_add_child_first(pio, zio); } taskq_init_ent(&zio->io_tqent); @@ -890,7 +977,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, return (zio); } -static void +void zio_destroy(zio_t *zio) { metaslab_trace_fini(&zio->io_alloc_list); @@ -901,9 +988,13 @@ zio_destroy(zio_t *zio) kmem_cache_free(zio_cache, zio); } +/* + * ZIO intended to be between others. Provides synchronization at READY + * and DONE pipeline stages and calls the respective callbacks. + */ zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, - void *private, enum zio_flag flags) + void *private, zio_flag_t flags) { zio_t *zio; @@ -914,10 +1005,22 @@ zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, return (zio); } +/* + * ZIO intended to be a root of a tree. 
Unlike null ZIO does not have a + * READY pipeline stage (is ready on creation), so it should not be used + * as child of any ZIO that may need waiting for grandchildren READY stage + * (any other ZIO type). + */ zio_t * -zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) +zio_root(spa_t *spa, zio_done_func_t *done, void *private, zio_flag_t flags) { - return (zio_null(NULL, spa, NULL, done, private, flags)); + zio_t *zio; + + zio = zio_create(NULL, spa, 0, NULL, NULL, 0, 0, done, private, + ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, + ZIO_STAGE_OPEN, ZIO_ROOT_PIPELINE); + + return (zio); } static int @@ -931,9 +1034,35 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); + zfs_dbgmsg("bad blkptr at %px: " + "DVA[0]=%#llx/%#llx " + "DVA[1]=%#llx/%#llx " + "DVA[2]=%#llx/%#llx " + "prop=%#llx " + "pad=%#llx,%#llx " + "phys_birth=%#llx " + "birth=%#llx " + "fill=%#llx " + "cksum=%#llx/%#llx/%#llx/%#llx", + bp, + (long long)bp->blk_dva[0].dva_word[0], + (long long)bp->blk_dva[0].dva_word[1], + (long long)bp->blk_dva[1].dva_word[0], + (long long)bp->blk_dva[1].dva_word[1], + (long long)bp->blk_dva[2].dva_word[0], + (long long)bp->blk_dva[2].dva_word[1], + (long long)bp->blk_prop, + (long long)bp->blk_pad[0], + (long long)bp->blk_pad[1], + (long long)BP_GET_PHYSICAL_BIRTH(bp), + (long long)BP_GET_LOGICAL_BIRTH(bp), + (long long)bp->blk_fill, + (long long)bp->blk_cksum.zc_word[0], + (long long)bp->blk_cksum.zc_word[1], + (long long)bp->blk_cksum.zc_word[2], + (long long)bp->blk_cksum.zc_word[3]); switch (blk_verify) { case BLK_VERIFY_HALT: - dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp); zfs_panic_recover("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_LOG: @@ -954,49 +1083,54 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, * If everything checks out B_TRUE is returned. The zfs_blkptr_verify * argument controls the behavior when an invalid field is detected. 
* - * Modes for zfs_blkptr_verify: - * 1) BLK_VERIFY_ONLY (evaluate the block) - * 2) BLK_VERIFY_LOG (evaluate the block and log problems) - * 3) BLK_VERIFY_HALT (call zfs_panic_recover on error) + * Values for blk_verify_flag: + * BLK_VERIFY_ONLY: evaluate the block + * BLK_VERIFY_LOG: evaluate the block and log problems + * BLK_VERIFY_HALT: call zfs_panic_recover on error + * + * Values for blk_config_flag: + * BLK_CONFIG_HELD: caller holds SCL_VDEV for writer + * BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be + * obtained for reader + * BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better + * performance */ boolean_t -zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, - enum blk_verify_flag blk_verify) +zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, + enum blk_config_flag blk_config, enum blk_verify_flag blk_verify) { int errors = 0; if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid TYPE %llu", + "blkptr at %px has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } - if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || - BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { + if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid CHECKSUM %llu", + "blkptr at %px has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } - if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || - BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { + if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid COMPRESS %llu", + "blkptr at %px has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid LSIZE %llu", + "blkptr at %px has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid PSIZE %llu", + "blkptr at %px has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid ETYPE %llu", + "blkptr at %px has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } @@ -1008,17 +1142,27 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, if (!spa->spa_trust_config) return (errors == 0); - if (!config_held) - spa_config_enter(spa, SCL_VDEV, bp, RW_READER); - else + switch (blk_config) { + case BLK_CONFIG_HELD: ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER)); + break; + case BLK_CONFIG_NEEDED: + spa_config_enter(spa, SCL_VDEV, bp, RW_READER); + break; + case BLK_CONFIG_SKIP: + return (errors == 0); + default: + panic("invalid blk_config %u", blk_config); + } + /* * Pool-specific checks. * - * Note: it would be nice to verify that the blk_birth and - * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() - * allows the birth time of log blocks (and dmu_sync()-ed blocks - * that are in the log) to be arbitrarily large. + * Note: it would be nice to verify that the logical birth + * and physical birth are not too large. However, + * spa_freeze() allows the birth time of log blocks (and + * dmu_sync()-ed blocks that are in the log) to be arbitrarily + * large. 
*/ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { const dva_t *dva = &bp->blk_dva[i]; @@ -1026,20 +1170,20 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, if (vdevid >= spa->spa_root_vdev->vdev_children) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has invalid VDEV %llu", + "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has invalid VDEV %llu", + "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_hole_ops) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has hole VDEV %llu", + "blkptr at %px DVA %u has hole VDEV %llu", bp, i, (longlong_t)vdevid); continue; } @@ -1057,13 +1201,11 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, asize = vdev_gang_header_asize(vd); if (offset + asize > vd->vdev_asize) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has invalid OFFSET %llu", + "blkptr at %px DVA %u has invalid OFFSET %llu", bp, i, (longlong_t)offset); } } - if (errors > 0) - dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp); - if (!config_held) + if (blk_config == BLK_CONFIG_NEEDED) spa_config_exit(spa, SCL_VDEV, bp); return (errors == 0); @@ -1072,6 +1214,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) { + (void) bp; uint64_t vdevid = DVA_GET_VDEV(dva); if (vdevid >= spa->spa_root_vdev->vdev_children) @@ -1102,11 +1245,11 @@ zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) + zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; - zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, + zio = zio_create(pio, spa, BP_GET_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
@@ -1119,9 +1262,8 @@ zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, - zio_done_func_t *physdone, zio_done_func_t *done, - void *private, zio_priority_t priority, enum zio_flag flags, - const zbookmark_phys_t *zb) + zio_done_func_t *done, void *private, zio_priority_t priority, + zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; @@ -1141,7 +1283,6 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio->io_ready = ready; zio->io_children_ready = children_ready; - zio->io_physdone = physdone; zio->io_prop = *zp; /* @@ -1163,7 +1304,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) + zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb) { zio_t *zio; @@ -1175,12 +1316,14 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, } void -zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) +zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite, + boolean_t brtwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); + ASSERT(!brtwrite || !nopwrite); /* * We must reset the io_prop to match the values that existed @@ -1189,6 +1332,7 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) */ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; zio->io_prop.zp_nopwrite = nopwrite; + zio->io_prop.zp_brtwrite = brtwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } @@ -1197,7 +1341,7 @@ void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { - (void) zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_HALT); + (void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); /* * The check for EMBEDDED is a performance optimization. 
We @@ -1206,7 +1350,6 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) */ if (BP_IS_EMBEDDED(bp)) return; - metaslab_check_free(spa, bp); /* * Frees that are for the currently-syncing txg, are not going to be @@ -1222,7 +1365,9 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free && - !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))) { + !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) || + brt_maybe_exists(spa, bp)) { + metaslab_check_free(spa, bp); bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL); @@ -1236,7 +1381,7 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) */ zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - enum zio_flag flags) + zio_flag_t flags) { ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); @@ -1248,11 +1393,13 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, arc_freed(spa, bp); dsl_scan_freed(spa, bp); - if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) { + if (BP_IS_GANG(bp) || + BP_GET_DEDUP(bp) || + brt_maybe_exists(spa, bp)) { /* - * GANG and DEDUP blocks can induce a read (for the gang block - * header, or the DDT), so issue them asynchronously so that - * this thread is not tied up. + * GANG, DEDUP and BRT blocks can induce a read (for the gang + * block header, the DDT or the BRT), so issue them + * asynchronously so that this thread is not tied up. */ enum zio_stage stage = ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC; @@ -1269,12 +1416,12 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - zio_done_func_t *done, void *private, enum zio_flag flags) + zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; - (void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER, - BLK_VERIFY_HALT); + (void) zfs_blkptr_verify(spa, bp, (flags & ZIO_FLAG_CONFIG_WRITER) ? + BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); @@ -1291,7 +1438,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. 
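The zio_free() hunk above adds brt_maybe_exists() to the set of conditions that push a free onto the per-txg bplist instead of freeing it synchronously. A compact stand-alone predicate that mirrors that condition, with purely illustrative parameter names:

#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch (not ZFS code): a free is deferred when it may need extra work
 * (gang header, DDT or BRT reads), when it is not for the currently
 * syncing txg, or late in the sync passes without the log spacemap
 * feature active.
 */
static bool
illus_defer_free(bool is_gang, bool is_dedup, bool brt_may_exist,
    uint64_t txg, uint64_t syncing_txg, uint64_t sync_pass,
    uint64_t deferred_free_pass, bool log_spacemap_active)
{
	return (is_gang || is_dedup || brt_may_exist ||
	    txg != syncing_txg ||
	    (sync_pass >= deferred_free_pass && !log_spacemap_active));
}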
*/ - ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, + ASSERT3U(BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp), <, spa_min_claim_txg(spa)); ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */ @@ -1305,33 +1452,9 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, } zio_t * -zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, enum zio_flag flags) -{ - zio_t *zio; - int c; - - if (vd->vdev_children == 0) { - zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, - ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, - ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); - - zio->io_cmd = cmd; - } else { - zio = zio_null(pio, spa, NULL, NULL, NULL, flags); - - for (c = 0; c < vd->vdev_children; c++) - zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, - done, private, flags)); - } - - return (zio); -} - -zio_t * zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, - enum zio_flag flags, enum trim_flag trim_flags) + zio_flag_t flags, enum trim_flag trim_flags) { zio_t *zio; @@ -1351,7 +1474,7 @@ zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, boolean_t labels) + zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; @@ -1372,7 +1495,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, boolean_t labels) + zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; @@ -1409,7 +1532,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, - enum zio_flag flags, zio_done_func_t *done, void *private) + zio_flag_t flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; @@ -1468,22 +1591,17 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, flags &= ~ZIO_FLAG_IO_ALLOCATING; } - zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); - zio->io_physdone = pio->io_physdone; - if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) - zio->io_logical->io_phys_children++; - return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, - zio_type_t type, zio_priority_t priority, enum zio_flag flags, + zio_type_t type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *private) { zio_t *zio; @@ -1499,12 +1617,29 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, return (zio); } + +/* + * Send a flush command to the given vdev. Unlike most zio creation functions, + * the flush zios are issued immediately. You can wait on pio to pause until + * the flushes complete. 
+ */ void -zio_flush(zio_t *zio, vdev_t *vd) +zio_flush(zio_t *pio, vdev_t *vd) { - zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, - NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); + const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY; + + if (vd->vdev_nowritecache) + return; + + if (vd->vdev_children == 0) { + zio_nowait(zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0, + NULL, NULL, ZIO_TYPE_FLUSH, ZIO_PRIORITY_NOW, flags, vd, 0, + NULL, ZIO_STAGE_OPEN, ZIO_FLUSH_PIPELINE)); + } else { + for (uint64_t c = 0; c < vd->vdev_children; c++) + zio_flush(pio, vd->vdev_child[c]); + } } void @@ -1528,6 +1663,19 @@ zio_shrink(zio_t *zio, uint64_t size) } /* + * Round provided allocation size up to a value that can be allocated + * by at least some vdev(s) in the pool with minimum or no additional + * padding and without extra space usage on others + */ +static uint64_t +zio_roundup_alloc_size(spa_t *spa, uint64_t size) +{ + if (size > spa->spa_min_alloc) + return (roundup(size, spa->spa_gcd_alloc)); + return (spa->spa_min_alloc); +} + +/* * ========================================================================== * Prepare to read and write logical blocks * ========================================================================== @@ -1565,15 +1713,8 @@ zio_read_bp_init(zio_t *zio) abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); } - if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) - zio->io_flags |= ZIO_FLAG_DONT_CACHE; - - if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) - zio->io_flags |= ZIO_FLAG_DONT_CACHE; - if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; @@ -1592,12 +1733,16 @@ zio_write_bp_init(zio_t *zio) blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; - ASSERT(bp->blk_birth != zio->io_txg); - ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + if (zp->zp_brtwrite) + return (zio); + + ASSERT(!BP_GET_DEDUP(zio->io_bp_override)); + if (BP_IS_EMBEDDED(bp)) return (zio); @@ -1649,7 +1794,7 @@ zio_write_compress(zio_t *zio) blkptr_t *bp = zio->io_bp; uint64_t lsize = zio->io_lsize; uint64_t psize = zio->io_size; - int pass = 1; + uint32_t pass = 1; /* * If our children haven't all reached the ready stage, @@ -1676,7 +1821,7 @@ zio_write_compress(zio_t *zio) ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); - if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { + if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to @@ -1696,19 +1841,23 @@ zio_write_compress(zio_t *zio) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ - ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), - spa_max_replication(spa)) == BP_GET_NDVAS(bp)); + ASSERT(BP_IS_EMBEDDED(bp) || BP_IS_GANG(bp) || + MIN(zp->zp_copies, spa_max_replication(spa)) + == BP_GET_NDVAS(bp)); } /* If it's a compressed write that is not raw, compress the buffer. 
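zio_roundup_alloc_size() above rounds an allocation up using two pool-wide values; as I read the hunk, spa_min_alloc is the pool's minimum allocation size and spa_gcd_alloc the GCD of the vdevs' allocation sizes. A stand-alone mirror with made-up values:

#include <stdint.h>
#include <stdio.h>

static uint64_t
roundup_u64(uint64_t x, uint64_t align)
{
	return (((x + align - 1) / align) * align);
}

static uint64_t
illus_roundup_alloc_size(uint64_t size, uint64_t min_alloc,
    uint64_t gcd_alloc)
{
	if (size > min_alloc)
		return (roundup_u64(size, gcd_alloc));
	return (min_alloc);
}

int
main(void)
{
	/* Assume a pool whose vdevs all allocate in 4 KiB units. */
	printf("%llu\n", (unsigned long long)
	    illus_roundup_alloc_size(2000, 4096, 4096));	/* -> 4096 */
	printf("%llu\n", (unsigned long long)
	    illus_roundup_alloc_size(9000, 4096, 4096));	/* -> 12288 */
	return (0);
}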
*/ if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { - void *cbuf = zio_buf_alloc(lsize); - psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize, + void *cbuf = NULL; + psize = zio_compress_data(compress, zio->io_abd, &cbuf, lsize, zp->zp_complevel); - if (psize == 0 || psize >= lsize) { + if (psize == 0) { compress = ZIO_COMPRESS_OFF; - zio_buf_free(cbuf, lsize); + } else if (psize >= lsize) { + compress = ZIO_COMPRESS_OFF; + if (cbuf != NULL) + zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && !zp->zp_encrypt && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && @@ -1719,7 +1868,7 @@ zio_write_compress(zio_t *zio) BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); zio_buf_free(cbuf, lsize); - bp->blk_birth = zio->io_txg; + BP_SET_LOGICAL_BIRTH(bp, zio->io_txg); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); @@ -1733,9 +1882,8 @@ zio_write_compress(zio_t *zio) * in that we charge for the padding used to fill out * the last sector. */ - ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT); - size_t rounded = (size_t)roundup(psize, - spa->spa_min_alloc); + size_t rounded = (size_t)zio_roundup_alloc_size(spa, + psize); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); @@ -1771,9 +1919,15 @@ zio_write_compress(zio_t *zio) zio->io_abd, NULL, lsize, zp->zp_complevel); if (psize == 0 || psize >= lsize) compress = ZIO_COMPRESS_OFF; - } else if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) { - size_t rounded = MIN((size_t)roundup(psize, - spa->spa_min_alloc), lsize); + } else if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS && + !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) { + /* + * If we are raw receiving an encrypted dataset we should not + * take this codepath because it will change the on-disk block + * and decryption will fail. + */ + size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize), + lsize); if (rounded != psize) { abd_t *cdata = abd_alloc_linear(rounded, B_TRUE); @@ -1795,7 +1949,7 @@ zio_write_compress(zio_t *zio) * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. */ - if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && + if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { VERIFY3U(psize, !=, 0); @@ -1809,7 +1963,7 @@ zio_write_compress(zio_t *zio) } if (psize == 0) { - if (zio->io_bp_orig.blk_birth != 0 && + if (BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig) != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); @@ -1869,7 +2023,6 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; - int flags = (cutinline ? TQ_FRONT : 0); /* * If we're a config writer or a probe, the normal issue and @@ -1887,23 +2040,18 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) /* * If this is a high priority I/O, then use the high priority taskq if - * available. + * available or cut the line otherwise. 
*/ - if ((zio->io_priority == ZIO_PRIORITY_NOW || - zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) && - spa->spa_zio_taskq[t][q + 1].stqs_count != 0) - q++; + if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) { + if (spa->spa_zio_taskq[t][q + 1].stqs_count != 0) + q++; + else + cutinline = B_TRUE; + } ASSERT3U(q, <, ZIO_TASKQ_TYPES); - /* - * NB: We are assuming that the zio can only be dispatched - * to a single taskq at a time. It would be a grievous error - * to dispatch the zio to another taskq at the same time. - */ - ASSERT(taskq_empty_ent(&zio->io_tqent)); - spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags, - &zio->io_tqent); + spa_taskq_dispatch(spa, t, q, zio_execute, zio, cutinline); } static boolean_t @@ -1928,8 +2076,8 @@ zio_taskq_member(zio_t *zio, zio_taskq_type_t q) static zio_t * zio_issue_async(zio_t *zio) { + ASSERT((zio->io_type != ZIO_TYPE_WRITE) || ZIO_HAS_ALLOCATOR(zio)); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); - return (NULL); } @@ -2027,7 +2175,7 @@ zio_deadman_impl(zio_t *pio, int ziodepth) "delta=%llu queued=%llu io=%llu " "path=%s " "last=%llu type=%d " - "priority=%d flags=0x%x stage=0x%x " + "priority=%d flags=0x%llx stage=0x%x " "pipeline=0x%x pipeline-trace=0x%x " "objset=%llu object=%llu " "level=%llu blkid=%llu " @@ -2037,8 +2185,8 @@ zio_deadman_impl(zio_t *pio, int ziodepth) (u_longlong_t)delta, pio->io_delta, pio->io_delay, vd ? vd->vdev_path : "NULL", vq ? vq->vq_io_complete_ts : 0, pio->io_type, - pio->io_priority, pio->io_flags, pio->io_stage, - pio->io_pipeline, pio->io_pipeline_trace, + pio->io_priority, (u_longlong_t)pio->io_flags, + pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, (u_longlong_t)pio->io_offset, (u_longlong_t)pio->io_size, @@ -2065,7 +2213,7 @@ zio_deadman_impl(zio_t *pio, int ziodepth) * using the zfs_dbgmsg() interface then post deadman event for the ZED. 
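The dispatch change above treats ZIO_PRIORITY_SYNC_WRITE specially: bump to the _HIGH taskq when one exists, otherwise cut to the front of the normal queue. A tiny stand-alone sketch of that branch (not ZFS code; names are illustrative):

#include <stdbool.h>

static void
illus_pick_queue(bool is_sync_write, bool high_queue_exists,
    int *queue_index, bool *cut_in_line)
{
	if (is_sync_write) {
		if (high_queue_exists)
			(*queue_index)++;	/* use the _HIGH taskq */
		else
			*cut_in_line = true;	/* front of the normal queue */
	}
}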
*/ void -zio_deadman(zio_t *pio, char *tag) +zio_deadman(zio_t *pio, const char *tag) { spa_t *spa = pio->io_spa; char *name = spa_name(spa); @@ -2143,6 +2291,8 @@ zio_execute_stack_check(zio_t *zio) !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH)) return (B_TRUE); +#else + (void) zio; #endif /* HAVE_LARGE_STACKS */ return (B_FALSE); @@ -2241,6 +2391,9 @@ zio_wait(zio_t *zio) ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); + if (zio->io_type == ZIO_TYPE_WRITE) { + spa_select_allocator(zio); + } __zio_execute(zio); mutex_enter(&zio->io_lock); @@ -2277,7 +2430,7 @@ zio_nowait(zio_t *zio) ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && - zio_unique_parent(zio) == NULL) { + list_is_empty(&zio->io_parent_list)) { zio_t *pio; /* @@ -2293,6 +2446,9 @@ zio_nowait(zio_t *zio) ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); + if (zio->io_type == ZIO_TYPE_WRITE) { + spa_select_allocator(zio); + } __zio_execute(zio); } @@ -2306,13 +2462,14 @@ static void zio_reexecute(void *arg) { zio_t *pio = arg; - zio_t *cio, *cio_next; + zio_t *cio, *cio_next, *gio; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); + mutex_enter(&pio->io_lock); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; @@ -2320,8 +2477,16 @@ zio_reexecute(void *arg) pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; - for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_state[w] = 0; + pio->io_state[ZIO_WAIT_READY] = (pio->io_stage >= ZIO_STAGE_READY) || + (pio->io_pipeline & ZIO_STAGE_READY) == 0; + pio->io_state[ZIO_WAIT_DONE] = (pio->io_stage >= ZIO_STAGE_DONE); + zio_link_t *zl = NULL; + while ((gio = zio_walk_parents(pio, &zl)) != NULL) { + for (int w = 0; w < ZIO_WAIT_TYPES; w++) { + gio->io_children[pio->io_child_type][w] += + !pio->io_state[w]; + } + } for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; @@ -2335,12 +2500,9 @@ zio_reexecute(void *arg) * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. 
*/ - zio_link_t *zl = NULL; - mutex_enter(&pio->io_lock); + zl = NULL; for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); - for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); mutex_enter(&pio->io_lock); @@ -2366,8 +2528,10 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); - cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " - "failure and has been suspended.\n", spa_name(spa)); + if (reason != ZIO_SUSPEND_MMP) { + cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable " + "I/O failure and has been suspended.\n", spa_name(spa)); + } (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0); @@ -2555,11 +2719,12 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, return (zio); } -/* ARGSUSED */ static zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { + (void) gn, (void) data, (void) offset; + zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio)); if (zio == NULL) { @@ -2569,11 +2734,11 @@ zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, return (zio); } -/* ARGSUSED */ static zio_t * zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { + (void) gn, (void) data, (void) offset; return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); } @@ -2652,7 +2817,7 @@ zio_gang_tree_assemble_done(zio_t *zio) blkptr_t *bp = zio->io_bp; ASSERT(gio == zio_unique_parent(zio)); - ASSERT(zio->io_child_count == 0); + ASSERT(list_is_empty(&zio->io_child_list)); if (zio->io_error) return; @@ -2751,6 +2916,12 @@ zio_gang_issue(zio_t *zio) } static void +zio_gang_inherit_allocator(zio_t *pio, zio_t *cio) +{ + cio->io_allocator = pio->io_allocator; +} + +static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); @@ -2768,7 +2939,7 @@ zio_write_gang_member_ready(zio_t *zio) ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); - ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); + VERIFY3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { @@ -2806,19 +2977,22 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) uint64_t resid = pio->io_size; uint64_t lsize; int copies = gio->io_prop.zp_copies; - int gbh_copies; zio_prop_t zp; int error; boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); /* - * encrypted blocks need DVA[2] free so encrypted gang headers can't - * have a third copy. + * If one copy was requested, store 2 copies of the GBH, so that we + * can still traverse all the data (e.g. to free or scrub) even if a + * block is damaged. Note that we can't store 3 copies of the GBH in + * all cases, e.g. with encryption, which uses DVA[2] for the IV+salt. 
*/ - gbh_copies = MIN(copies + 1, spa_max_replication(spa)); - if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP) - gbh_copies = SPA_DVAS_PER_BP - 1; + int gbh_copies = copies; + if (gbh_copies == 1) { + gbh_copies = MIN(2, spa_max_replication(spa)); + } + ASSERT(ZIO_HAS_ALLOCATOR(pio)); int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); @@ -2872,7 +3046,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; - bzero(gbh, SPA_GANGBLOCKSIZE); + memset(gbh, 0, SPA_GANGBLOCKSIZE); gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* @@ -2882,6 +3056,8 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + zio_gang_inherit_allocator(pio, zio); + /* * Create and nowait the gang children. */ @@ -2901,17 +3077,19 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zp.zp_nopwrite = B_FALSE; zp.zp_encrypt = gio->io_prop.zp_encrypt; zp.zp_byteorder = gio->io_prop.zp_byteorder; - bzero(zp.zp_salt, ZIO_DATA_SALT_LEN); - bzero(zp.zp_iv, ZIO_DATA_IV_LEN); - bzero(zp.zp_mac, ZIO_DATA_MAC_LEN); + memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN); + memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN); + memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN); zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], has_data ? abd_get_offset(pio->io_abd, pio->io_size - resid) : NULL, lsize, lsize, &zp, - zio_write_gang_member_ready, NULL, NULL, + zio_write_gang_member_ready, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + zio_gang_inherit_allocator(zio, cio); + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); @@ -2932,11 +3110,6 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - /* - * We didn't allocate this bp, so make sure it doesn't get unmarked. - */ - pio->io_flags &= ~ZIO_FLAG_FASTWRITE; - zio_nowait(zio); return (pio); @@ -2967,6 +3140,7 @@ zio_nop_write(zio_t *zio) blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; + ASSERT(BP_IS_HOLE(bp)); ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); @@ -3000,8 +3174,7 @@ zio_nop_write(zio_t *zio) ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); - ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, - sizeof (uint64_t)) == 0); + ASSERT3U(bp->blk_prop, ==, bp_orig->blk_prop); /* * If we're overwriting a block that is currently on an @@ -3009,11 +3182,13 @@ zio_nop_write(zio_t *zio) * allow a new block to be allocated on a concrete vdev. 
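The gang-header policy above now derives gbh_copies from the requested copies: a single-copy write still stores two copies of the gang header (when replication allows), so the gang tree stays traversable after losing one header. A minimal stand-alone mirror of that rule:

#include <stdint.h>

static int
illus_gbh_copies(int requested_copies, int spa_max_replication)
{
	if (requested_copies == 1)
		return (spa_max_replication < 2 ? spa_max_replication : 2);
	return (requested_copies);
}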
*/ spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER); - vdev_t *tvd = vdev_lookup_top(zio->io_spa, - DVA_GET_VDEV(&bp->blk_dva[0])); - if (tvd->vdev_ops == &vdev_indirect_ops) { - spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); - return (zio); + for (int d = 0; d < BP_GET_NDVAS(bp_orig); d++) { + vdev_t *tvd = vdev_lookup_top(zio->io_spa, + DVA_GET_VDEV(&bp_orig->blk_dva[d])); + if (tvd->vdev_ops == &vdev_indirect_ops) { + spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); + return (zio); + } } spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); @@ -3027,6 +3202,35 @@ zio_nop_write(zio_t *zio) /* * ========================================================================== + * Block Reference Table + * ========================================================================== + */ +static zio_t * +zio_brt_free(zio_t *zio) +{ + blkptr_t *bp; + + bp = zio->io_bp; + + if (BP_GET_LEVEL(bp) > 0 || + BP_IS_METADATA(bp) || + !brt_maybe_exists(zio->io_spa, bp)) { + return (zio); + } + + if (!brt_entry_decref(zio->io_spa, bp)) { + /* + * This isn't the last reference, so we cannot free + * the data yet. + */ + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + } + + return (zio); +} + +/* + * ========================================================================== * Dedup * ========================================================================== */ @@ -3332,14 +3536,14 @@ zio_ddt_write(zio_t *zio) else ddt_phys_addref(ddp); } else if (zio->io_bp_override) { - ASSERT(bp->blk_birth == txg); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, zp, - zio_ddt_child_write_ready, NULL, NULL, + zio_ddt_child_write_ready, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); @@ -3354,7 +3558,7 @@ zio_ddt_write(zio_t *zio) return (zio); } -ddt_entry_t *freedde; /* for debugging */ +static ddt_entry_t *freedde; /* for debugging */ static zio_t * zio_ddt_free(zio_t *zio) @@ -3398,6 +3602,7 @@ zio_io_to_allocate(spa_t *spa, int allocator) return (NULL); ASSERT(IO_IS_ALLOCATING(zio)); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); /* * Try to place a reservation for this zio. If we're unable to @@ -3434,21 +3639,12 @@ zio_dva_throttle(zio_t *zio) } ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); - zbookmark_phys_t *bm = &zio->io_bookmark; - /* - * We want to try to use as many allocators as possible to help improve - * performance, but we also want logically adjacent IOs to be physically - * adjacent to improve sequential read performance. We chunk each object - * into 2^20 block regions, and then hash based on the objset, object, - * level, and region to accomplish both of these goals. - */ - int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object, - bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; - zio->io_allocator = allocator; + int allocator = zio->io_allocator; zio->io_metaslab_class = mc; mutex_enter(&spa->spa_allocs[allocator].spaa_lock); avl_add(&spa->spa_allocs[allocator].spaa_tree, zio); @@ -3493,7 +3689,6 @@ zio_dva_allocate(zio_t *zio) ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); - flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? 
METASLAB_FASTWRITE : 0; if (zio->io_flags & ZIO_FLAG_NODATA) flags |= METASLAB_DONT_THROTTLE; if (zio->io_flags & ZIO_FLAG_GANG_CHILD) @@ -3523,6 +3718,7 @@ zio_dva_allocate(zio_t *zio) * sync write performance. If a log allocation fails, we will fall * back to spa_sync() which is abysmal for performance. */ + ASSERT(ZIO_HAS_ALLOCATOR(zio)); error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); @@ -3611,11 +3807,13 @@ zio_dva_claim(zio_t *zio) static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { - ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); - if (!BP_IS_HOLE(bp)) - metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); + if (!BP_IS_HOLE(bp)) { + metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp), + B_TRUE); + } if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { @@ -3653,7 +3851,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, * of, so we just hash the objset ID to pick the allocator to get * some parallelism. */ - int flags = METASLAB_FASTWRITE | METASLAB_ZIL; + int flags = METASLAB_ZIL; int allocator = (uint_t)cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, @@ -3755,7 +3953,7 @@ zio_vdev_io_start(zio_t *zio) * Note: the code can handle other kinds of writes, * but we don't expect them. */ - if (zio->io_vd->vdev_removing) { + if (zio->io_vd->vdev_noalloc) { ASSERT(zio->io_flags & (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); @@ -3857,8 +4055,15 @@ zio_vdev_io_start(zio_t *zio) zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { - if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) - return (zio); + if (zio_handle_device_injection(vd, zio, ENOSYS) != 0) { + /* + * "no-op" injections return success, but do no actual + * work. Just skip the remaining vdev stages. 
+ */ + zio_vdev_io_bypass(zio); + zio_interrupt(zio); + return (NULL); + } if ((zio = vdev_queue_io(zio)) == NULL) return (NULL); @@ -3887,17 +4092,17 @@ zio_vdev_io_done(zio_t *zio) } ASSERT(zio->io_type == ZIO_TYPE_READ || - zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM); + zio->io_type == ZIO_TYPE_WRITE || + zio->io_type == ZIO_TYPE_FLUSH || + zio->io_type == ZIO_TYPE_TRIM); if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; if (vd != NULL && vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { - vdev_queue_io_done(zio); - - if (zio->io_type == ZIO_TYPE_WRITE) - vdev_cache_write(zio); + if (zio->io_type != ZIO_TYPE_FLUSH) + vdev_queue_io_done(zio); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injections(vd, zio, @@ -3906,7 +4111,8 @@ zio_vdev_io_done(zio_t *zio) if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); - if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) { + if (zio->io_error && zio->io_type != ZIO_TYPE_FLUSH && + zio->io_type != ZIO_TYPE_TRIM) { if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { @@ -3917,7 +4123,7 @@ zio_vdev_io_done(zio_t *zio) ops->vdev_op_io_done(zio); - if (unexpected_error) + if (unexpected_error && vd->vdev_remove_wanted == B_FALSE) VERIFY(vdev_probe(vd, zio) == NULL); return (zio); @@ -3964,7 +4170,6 @@ zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); } -/*ARGSUSED*/ void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr) { @@ -4009,8 +4214,7 @@ zio_vdev_io_assess(zio_t *zio) ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; - zio->io_flags |= ZIO_FLAG_IO_RETRY | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; + zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); @@ -4043,20 +4247,12 @@ zio_vdev_io_assess(zio_t *zio) * boolean flag so that we don't bother with it in the future. 
*/ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && - zio->io_type == ZIO_TYPE_IOCTL && - zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL) + zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) vd->vdev_nowritecache = B_TRUE; if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - if (vd != NULL && vd->vdev_ops->vdev_op_leaf && - zio->io_physdone != NULL) { - ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); - ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); - zio->io_physdone(zio->io_logical); - } - return (zio); } @@ -4305,12 +4501,12 @@ zio_checksum_verify(zio_t *zio) zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - (void) zfs_ereport_start_checksum(zio->io_spa, - zio->io_vd, &zio->io_bookmark, zio, - zio->io_offset, zio->io_size, &info); mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_checksum_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); + (void) zfs_ereport_start_checksum(zio->io_spa, + zio->io_vd, &zio->io_bookmark, zio, + zio->io_offset, zio->io_size, &info); } } @@ -4364,22 +4560,24 @@ zio_ready(zio_t *zio) zio_t *pio, *pio_next; zio_link_t *zl = NULL; - if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, - ZIO_WAIT_READY)) { + if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | + ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { return (NULL); } if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); - ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || - (zio->io_flags & ZIO_FLAG_NOPWRITE)); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || + BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); } +#ifdef ZFS_DEBUG if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; +#endif if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; @@ -4388,6 +4586,7 @@ zio_ready(zio_t *zio) ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_metaslab_class != NULL); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); /* * We were unable to allocate anything, unreserve and @@ -4418,7 +4617,7 @@ zio_ready(zio_t *zio) } if (zio->io_flags & ZIO_FLAG_NODATA) { - if (BP_IS_GANG(bp)) { + if (bp != NULL && BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); @@ -4474,6 +4673,7 @@ zio_dva_throttle_done(zio_t *zio) } ASSERT(IO_IS_ALLOCATING(pio)); + ASSERT(ZIO_HAS_ALLOCATOR(pio)); ASSERT3P(zio, !=, zio->io_logical); ASSERT(zio->io_logical != NULL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); @@ -4536,6 +4736,7 @@ zio_done(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_bp != NULL); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, zio->io_allocator); @@ -4551,7 +4752,7 @@ zio_done(zio_t *zio) if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) { ASSERT(zio->io_bp->blk_pad[0] == 0); ASSERT(zio->io_bp->blk_pad[1] == 0); - ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy, + ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || (zio->io_bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) && @@ -4662,7 +4863,8 @@ zio_done(zio_t *zio) * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. 
*/ - spa_log_error(zio->io_spa, &zio->io_bookmark); + spa_log_error(zio->io_spa, &zio->io_bookmark, + BP_GET_LOGICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } @@ -4797,15 +4999,14 @@ zio_done(zio_t *zio) * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. */ - ASSERT(taskq_empty_ent(&zio->io_tqent)); - spa_taskq_dispatch_ent(zio->io_spa, + spa_taskq_dispatch(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, - zio_reexecute, zio, 0, &zio->io_tqent); + zio_reexecute, zio, B_FALSE); } return (NULL); } - ASSERT(zio->io_child_count == 0); + ASSERT(list_is_empty(&zio->io_child_list)); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); @@ -4820,12 +5021,6 @@ zio_done(zio_t *zio) zfs_ereport_free_checksum(zcr); } - if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && - !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) && - !(zio->io_flags & ZIO_FLAG_NOPWRITE)) { - metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); - } - /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as @@ -4878,6 +5073,7 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_encrypt, zio_checksum_generate, zio_nop_write, + zio_brt_free, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, @@ -4998,7 +5194,7 @@ zbookmark_subtree_completed(const dnode_phys_t *dnp, { zbookmark_phys_t mod_zb = *subtree_root; mod_zb.zb_blkid++; - ASSERT(last_block->zb_level == 0); + ASSERT0(last_block->zb_level); /* The objset_phys_t isn't before anything. */ if (dnp == NULL) @@ -5024,26 +5220,41 @@ zbookmark_subtree_completed(const dnode_phys_t *dnp, last_block) <= 0); } +/* + * This function is similar to zbookmark_subtree_completed(), but returns true + * if subtree_root is equal or ahead of last_block, i.e. still to be done. 
+ */ +boolean_t +zbookmark_subtree_tbd(const dnode_phys_t *dnp, + const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) +{ + ASSERT0(last_block->zb_level); + if (dnp == NULL) + return (B_FALSE); + return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, + 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, subtree_root, + last_block) >= 0); +} + EXPORT_SYMBOL(zio_type_name); EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW, "Max I/O completion time (milliseconds) before marking it as slow"); ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW, "Prioritize requeued I/O"); -ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, UINT, ZMOD_RW, "Defer frees starting in this pass"); -ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, UINT, ZMOD_RW, "Don't compress starting in this pass"); -ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, UINT, ZMOD_RW, "Rewrite new bps starting in this pass"); ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW, @@ -5051,4 +5262,3 @@ ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW, "Log all slow ZIOs, not just those with vdevs"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/zio_checksum.c b/sys/contrib/openzfs/module/zfs/zio_checksum.c index e6b5c9588939..ce6772a40c8b 100644 --- a/sys/contrib/openzfs/module/zfs/zio_checksum.c +++ b/sys/contrib/openzfs/module/zfs/zio_checksum.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -91,29 +91,29 @@ * invocation and passed to the checksum function. 
*/ -/*ARGSUSED*/ static void abd_checksum_off(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { + (void) abd, (void) size, (void) ctx_template; ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } -/*ARGSUSED*/ static void abd_fletcher_2_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { + (void) ctx_template; fletcher_init(zcp); (void) abd_iterate_func(abd, 0, size, fletcher_2_incremental_native, zcp); } -/*ARGSUSED*/ static void abd_fletcher_2_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { + (void) ctx_template; fletcher_init(zcp); (void) abd_iterate_func(abd, 0, size, fletcher_2_incremental_byteswap, zcp); @@ -127,11 +127,11 @@ abd_fletcher_4_impl(abd_t *abd, uint64_t size, zio_abd_checksum_data_t *acdp) fletcher_4_abd_ops.acf_fini(acdp); } -/*ARGSUSED*/ void abd_fletcher_4_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { + (void) ctx_template; fletcher_4_ctx_t ctx; zio_abd_checksum_data_t acd = { @@ -144,11 +144,11 @@ abd_fletcher_4_native(abd_t *abd, uint64_t size, } -/*ARGSUSED*/ void abd_fletcher_4_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { + (void) ctx_template; fletcher_4_ctx_t ctx; zio_abd_checksum_data_t acd = { @@ -165,10 +165,10 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{NULL, NULL}, NULL, NULL, 0, "on"}, {{abd_checksum_off, abd_checksum_off}, NULL, NULL, 0, "off"}, - {{abd_checksum_SHA256, abd_checksum_SHA256}, + {{abd_checksum_sha256, abd_checksum_sha256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"}, - {{abd_checksum_SHA256, abd_checksum_SHA256}, + {{abd_checksum_sha256, abd_checksum_sha256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"}, {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, @@ -177,14 +177,14 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { NULL, NULL, 0, "fletcher2"}, {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"}, - {{abd_checksum_SHA256, abd_checksum_SHA256}, + {{abd_checksum_sha256, abd_checksum_sha256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "sha256"}, {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"}, {{abd_checksum_off, abd_checksum_off}, NULL, NULL, 0, "noparity"}, - {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap}, + {{abd_checksum_sha512_native, abd_checksum_sha512_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "sha512"}, {{abd_checksum_skein_native, abd_checksum_skein_byteswap}, @@ -195,6 +195,10 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, + {{abd_checksum_blake3_native, abd_checksum_blake3_byteswap}, + abd_checksum_blake3_tmpl_init, abd_checksum_blake3_tmpl_free, + ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"}, }; /* @@ -207,6 +211,8 @@ zio_checksum_to_feature(enum zio_checksum cksum) VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0); switch (cksum) { + case ZIO_CHECKSUM_BLAKE3: + return (SPA_FEATURE_BLAKE3); case ZIO_CHECKSUM_SHA512: return (SPA_FEATURE_SHA512); case ZIO_CHECKSUM_SKEIN: @@ -266,7 +272,7 @@ static void zio_checksum_gang_verifier(zio_cksum_t *zcp, const 
blkptr_t *bp) { const dva_t *dva = BP_IDENTITY(bp); - uint64_t txg = BP_PHYSICAL_BIRTH(bp); + uint64_t txg = BP_GET_BIRTH(bp); ASSERT(BP_IS_GANG(bp)); @@ -351,17 +357,20 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, zio_eck_t eck; size_t eck_offset; - bzero(&saved, sizeof (zio_cksum_t)); + memset(&saved, 0, sizeof (zio_cksum_t)); if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t zilc; abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t)); - size = P2ROUNDUP_TYPED(zilc.zc_nused, ZIL_MIN_BLKSZ, - uint64_t); + uint64_t nused = P2ROUNDUP_TYPED(zilc.zc_nused, + ZIL_MIN_BLKSZ, uint64_t); + ASSERT3U(size, >=, nused); + size = nused; eck = zilc.zc_eck; eck_offset = offsetof(zil_chain_t, zc_eck); } else { + ASSERT3U(size, >=, sizeof (zio_eck_t)); eck_offset = size - sizeof (zio_eck_t); abd_copy_to_buf_off(&eck, abd, eck_offset, sizeof (zio_eck_t)); @@ -417,6 +426,9 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, zio_checksum_template_init(checksum, spa); + IMPLY(bp == NULL, ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED); + IMPLY(bp == NULL, checksum == ZIO_CHECKSUM_LABEL); + if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_cksum_t verifier; size_t eck_offset; @@ -439,12 +451,13 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, return (SET_ERROR(ECKSUM)); } - if (nused > size) { + nused = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); + if (size < nused) return (SET_ERROR(ECKSUM)); - } - - size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); + size = nused; } else { + if (size < sizeof (zio_eck_t)) + return (SET_ERROR(ECKSUM)); eck_offset = size - sizeof (zio_eck_t); abd_copy_to_buf_off(&eck, abd, eck_offset, sizeof (zio_eck_t)); @@ -506,8 +519,6 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, } if (info != NULL) { - info->zbc_expected = expected_cksum; - info->zbc_actual = actual_cksum; info->zbc_checksum_name = ci->ci_name; info->zbc_byteswapped = byteswap; info->zbc_injected = 0; diff --git a/sys/contrib/openzfs/module/zfs/zio_compress.c b/sys/contrib/openzfs/module/zfs/zio_compress.c index 1ff1e76d7f22..c8a10db7483b 100644 --- a/sys/contrib/openzfs/module/zfs/zio_compress.c +++ b/sys/contrib/openzfs/module/zfs/zio_compress.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -44,7 +44,7 @@ * If nonzero, every 1/X decompression attempts will fail, simulating * an undetected memory error. */ -unsigned long zio_decompress_fail_fraction = 0; +static unsigned long zio_decompress_fail_fraction = 0; /* * Compression vectors. 
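The zio_checksum_compute() and zio_checksum_error_impl() hunks above tighten the embedded-checksum path: a ZILOG2 block's zc_nused is rounded up to ZIL_MIN_BLKSZ and must fit in the supplied buffer, and any other embedded-checksum block must be large enough to hold the trailing zio_eck_t. Below is a minimal user-space sketch of that size validation; the constants, the P2ROUNDUP macro, and the validate_embedded() helper are simplified stand-ins for illustration, not the kernel interfaces.

/*
 * Sketch of the embedded-checksum size validation (simplified).
 */
#include <stdio.h>
#include <stdint.h>

#define	ZIL_MIN_BLKSZ	4096ULL		/* stand-in value */
#define	ECK_SIZE	40ULL		/* stand-in for sizeof (zio_eck_t) */

/* Round x up to a multiple of align (align must be a power of two). */
#define	P2ROUNDUP(x, align)	((((x) - 1) | ((align) - 1)) + 1)

static int
validate_embedded(uint64_t size, uint64_t nused, int is_zilog2)
{
	if (is_zilog2) {
		/* The used portion, rounded up, must fit in the buffer. */
		nused = P2ROUNDUP(nused, ZIL_MIN_BLKSZ);
		if (size < nused)
			return (-1);	/* ECKSUM in the kernel code */
	} else if (size < ECK_SIZE) {
		/* Too small to hold the embedded checksum trailer at all. */
		return (-1);
	}
	return (0);
}

int
main(void)
{
	printf("8K buffer, 5000 bytes used: %d\n",
	    validate_embedded(8192, 5000, 1));
	printf("4K buffer, 5000 bytes used: %d\n",
	    validate_embedded(4096, 5000, 1));
	printf("16-byte non-ZIL buffer:     %d\n",
	    validate_embedded(16, 0, 0));
	return (0);
}

Compiled with any C compiler, the sketch accepts the first case and rejects the other two, mirroring the ECKSUM returns added in the diff.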
@@ -66,7 +66,7 @@ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { {"gzip-9", 9, gzip_compress, gzip_decompress, NULL}, {"zle", 64, zle_compress, zle_decompress, NULL}, {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs, NULL}, - {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress, + {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress_wrap, zfs_zstd_decompress, zfs_zstd_decompress_level}, }; @@ -74,6 +74,7 @@ uint8_t zio_complevel_select(spa_t *spa, enum zio_compress compress, uint8_t child, uint8_t parent) { + (void) spa; uint8_t result; if (!ZIO_COMPRESS_HASLEVEL(compress)) @@ -110,10 +111,11 @@ zio_compress_select(spa_t *spa, enum zio_compress child, return (result); } -/*ARGSUSED*/ static int zio_compress_zeroed_cb(void *data, size_t len, void *private) { + (void) private; + uint64_t *end = (uint64_t *)((char *)data + len); for (uint64_t *word = (uint64_t *)data; word < end; word++) if (*word != 0) @@ -123,7 +125,7 @@ zio_compress_zeroed_cb(void *data, size_t len, void *private) } size_t -zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len, +zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len, uint8_t level) { size_t c_len, d_len; @@ -161,9 +163,12 @@ zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len, ASSERT3U(complevel, !=, ZIO_COMPLEVEL_INHERIT); } + if (*dst == NULL) + *dst = zio_buf_alloc(s_len); + /* No compression algorithms can read from ABDs directly */ void *tmp = abd_borrow_buf_copy(src, s_len); - c_len = ci->ci_compress(tmp, dst, s_len, d_len, complevel); + c_len = ci->ci_compress(tmp, *dst, s_len, d_len, complevel); abd_return_buf(src, tmp, s_len); if (c_len > d_len) diff --git a/sys/contrib/openzfs/module/zfs/zio_inject.c b/sys/contrib/openzfs/module/zfs/zio_inject.c index feaf41dc65e3..012a0e3c6c17 100644 --- a/sys/contrib/openzfs/module/zfs/zio_inject.c +++ b/sys/contrib/openzfs/module/zfs/zio_inject.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2024, Klara Inc. */ /* @@ -59,6 +60,7 @@ uint32_t zio_injection_enabled = 0; typedef struct inject_handler { int zi_id; spa_t *zi_spa; + char *zi_spa_name; /* ZINJECT_DELAY_IMPORT only */ zinject_record_t zi_record; uint64_t *zi_lanes; int zi_next_lane; @@ -148,7 +150,8 @@ zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva, zb->zb_level == record->zi_level && zb->zb_blkid >= record->zi_start && zb->zb_blkid <= record->zi_end && - (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) && + (record->zi_dvas == 0 || + (dva != ZI_NO_DVA && (record->zi_dvas & (1ULL << dva)))) && error == record->zi_error) { return (freq_triggered(record->zi_freq)); } @@ -161,7 +164,7 @@ zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva, * specified by tag. 
*/ void -zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type) +zio_handle_panic_injection(spa_t *spa, const char *tag, uint64_t type) { inject_handler_t *handler; @@ -341,15 +344,14 @@ zio_handle_label_injection(zio_t *zio, int error) return (ret); } -/*ARGSUSED*/ static int zio_inject_bitflip_cb(void *data, size_t len, void *private) { - zio_t *zio __maybe_unused = private; + zio_t *zio = private; uint8_t *buffer = data; uint_t byte = random_in_range(len); - ASSERT(zio->io_type == ZIO_TYPE_READ); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); /* flip a single random bit in an abd data buffer */ buffer[byte] ^= 1 << random_in_range(8); @@ -364,10 +366,10 @@ zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2) int ret = 0; /* - * We skip over faults in the labels unless it's during - * device open (i.e. zio == NULL). + * We skip over faults in the labels unless it's during device open + * (i.e. zio == NULL) or a device flush (offset is meaningless) */ - if (zio != NULL) { + if (zio != NULL && zio->io_type != ZIO_TYPE_FLUSH) { uint64_t offset = zio->io_offset; if (offset < VDEV_LABEL_START_SIZE || @@ -605,6 +607,12 @@ zio_handle_io_delay(zio_t *zio) if (vd->vdev_guid != handler->zi_record.zi_guid) continue; + /* also match on I/O type (e.g., -T read) */ + if (handler->zi_record.zi_iotype != ZIO_TYPES && + handler->zi_record.zi_iotype != zio->io_type) { + continue; + } + /* * Defensive; should never happen as the array allocation * occurs prior to inserting this handler on the list. @@ -699,6 +707,63 @@ zio_handle_io_delay(zio_t *zio) return (min_target); } +static void +zio_handle_pool_delay(spa_t *spa, hrtime_t elapsed, zinject_type_t command) +{ + inject_handler_t *handler; + hrtime_t delay = 0; + int id = 0; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); + handler != NULL && handler->zi_record.zi_cmd == command; + handler = list_next(&inject_handlers, handler)) { + ASSERT3P(handler->zi_spa_name, !=, NULL); + if (strcmp(spa_name(spa), handler->zi_spa_name) == 0) { + uint64_t pause = + SEC2NSEC(handler->zi_record.zi_duration); + if (pause > elapsed) { + delay = pause - elapsed; + } + id = handler->zi_id; + break; + } + } + + rw_exit(&inject_lock); + + if (delay) { + if (command == ZINJECT_DELAY_IMPORT) { + spa_import_progress_set_notes(spa, "injecting %llu " + "sec delay", (u_longlong_t)NSEC2SEC(delay)); + } + zfs_sleep_until(gethrtime() + delay); + } + if (id) { + /* all done with this one-shot handler */ + zio_clear_fault(id); + } +} + +/* + * For testing, inject a delay during an import + */ +void +zio_handle_import_delay(spa_t *spa, hrtime_t elapsed) +{ + zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_IMPORT); +} + +/* + * For testing, inject a delay during an export + */ +void +zio_handle_export_delay(spa_t *spa, hrtime_t elapsed) +{ + zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT); +} + static int zio_calculate_range(const char *pool, zinject_record_t *record) { @@ -756,6 +821,28 @@ zio_calculate_range(const char *pool, zinject_record_t *record) return (0); } +static boolean_t +zio_pool_handler_exists(const char *name, zinject_type_t command) +{ + boolean_t exists = B_FALSE; + + rw_enter(&inject_lock, RW_READER); + for (inject_handler_t *handler = list_head(&inject_handlers); + handler != NULL; handler = list_next(&inject_handlers, handler)) { + if (command != handler->zi_record.zi_cmd) + continue; + + const char *pool = (handler->zi_spa_name != NULL) ? 
+ handler->zi_spa_name : spa_name(handler->zi_spa); + if (strcmp(name, pool) == 0) { + exists = B_TRUE; + break; + } + } + rw_exit(&inject_lock); + + return (exists); +} /* * Create a new handler for the given record. We add it to the list, adding * a reference to the spa_t in the process. We increment zio_injection_enabled, @@ -806,16 +893,42 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) if (!(flags & ZINJECT_NULL)) { /* - * spa_inject_ref() will add an injection reference, which will - * prevent the pool from being removed from the namespace while - * still allowing it to be unloaded. + * Pool delays for import or export don't take an + * injection reference on the spa. Instead they + * rely on matching by name. */ - if ((spa = spa_inject_addref(name)) == NULL) - return (SET_ERROR(ENOENT)); + if (record->zi_cmd == ZINJECT_DELAY_IMPORT || + record->zi_cmd == ZINJECT_DELAY_EXPORT) { + if (record->zi_duration <= 0) + return (SET_ERROR(EINVAL)); + /* + * Only one import | export delay handler per pool. + */ + if (zio_pool_handler_exists(name, record->zi_cmd)) + return (SET_ERROR(EEXIST)); + + mutex_enter(&spa_namespace_lock); + boolean_t has_spa = spa_lookup(name) != NULL; + mutex_exit(&spa_namespace_lock); + + if (record->zi_cmd == ZINJECT_DELAY_IMPORT && has_spa) + return (SET_ERROR(EEXIST)); + if (record->zi_cmd == ZINJECT_DELAY_EXPORT && !has_spa) + return (SET_ERROR(ENOENT)); + spa = NULL; + } else { + /* + * spa_inject_ref() will add an injection reference, + * which will prevent the pool from being removed + * from the namespace while still allowing it to be + * unloaded. + */ + if ((spa = spa_inject_addref(name)) == NULL) + return (SET_ERROR(ENOENT)); + } handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); - - handler->zi_spa = spa; + handler->zi_spa = spa; /* note: can be NULL */ handler->zi_record = *record; if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { @@ -828,6 +941,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) handler->zi_next_lane = 0; } + if (handler->zi_spa == NULL) + handler->zi_spa_name = spa_strdup(name); + else + handler->zi_spa_name = NULL; + rw_enter(&inject_lock, RW_WRITER); /* @@ -887,7 +1005,11 @@ zio_inject_list_next(int *id, char *name, size_t buflen, if (handler) { *record = handler->zi_record; *id = handler->zi_id; - (void) strncpy(name, spa_name(handler->zi_spa), buflen); + ASSERT(handler->zi_spa || handler->zi_spa_name); + if (handler->zi_spa != NULL) + (void) strlcpy(name, spa_name(handler->zi_spa), buflen); + else + (void) strlcpy(name, handler->zi_spa_name, buflen); ret = 0; } else { ret = SET_ERROR(ENOENT); @@ -937,7 +1059,11 @@ zio_clear_fault(int id) ASSERT3P(handler->zi_lanes, ==, NULL); } - spa_inject_delref(handler->zi_spa); + if (handler->zi_spa_name != NULL) + spa_strfree(handler->zi_spa_name); + + if (handler->zi_spa != NULL) + spa_inject_delref(handler->zi_spa); kmem_free(handler, sizeof (inject_handler_t)); atomic_dec_32(&zio_injection_enabled); diff --git a/sys/contrib/openzfs/module/zfs/zle.c b/sys/contrib/openzfs/module/zfs/zle.c index 0decebb13ca7..1483a65af803 100644 --- a/sys/contrib/openzfs/module/zfs/zle.c +++ b/sys/contrib/openzfs/module/zfs/zle.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. 
* See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/zrlock.c b/sys/contrib/openzfs/module/zfs/zrlock.c index a4def6053622..0d50cc4712ca 100644 --- a/sys/contrib/openzfs/module/zfs/zrlock.c +++ b/sys/contrib/openzfs/module/zfs/zrlock.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -106,16 +106,16 @@ zrl_add_impl(zrlock_t *zrl, const char *zc) void zrl_remove(zrlock_t *zrl) { - uint32_t n; - #ifdef ZFS_DEBUG if (zrl->zr_owner == curthread) { zrl->zr_owner = NULL; zrl->zr_caller = NULL; } + int32_t n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount); + ASSERT3S(n, >=, 0); +#else + atomic_dec_32((uint32_t *)&zrl->zr_refcount); #endif - n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount); - ASSERT3S((int32_t)n, >=, 0); } int diff --git a/sys/contrib/openzfs/module/zfs/zthr.c b/sys/contrib/openzfs/module/zfs/zthr.c index 33fdda7b68d1..02b9f0805dd7 100644 --- a/sys/contrib/openzfs/module/zfs/zthr.c +++ b/sys/contrib/openzfs/module/zfs/zthr.c @@ -231,7 +231,7 @@ struct zthr { const char *zthr_name; }; -static void +static __attribute__((noreturn)) void zthr_procedure(void *arg) { zthr_t *t = arg; @@ -469,6 +469,12 @@ zthr_iscancelled(zthr_t *t) return (cancelled); } +boolean_t +zthr_iscurthread(zthr_t *t) +{ + return (t->zthr_thread == curthread); +} + /* * Wait for the zthr to finish its current function. Similar to * zthr_iscancelled, you can use zthr_has_waiters to have the zthr_func end diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index d50cce7d7357..5b6a3f5cb410 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -90,9 +90,8 @@ unsigned int zvol_inhibit_dev = 0; unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; struct hlist_head *zvol_htable; -list_t zvol_state_list; +static list_t zvol_state_list; krwlock_t zvol_state_lock; -const zvol_platform_ops_t *ops; typedef enum { ZVOL_ASYNC_REMOVE_MINORS, @@ -112,13 +111,10 @@ typedef struct { uint64_t zvol_name_hash(const char *name) { - int i; uint64_t crc = -1ULL; - const uint8_t *p = (const uint8_t *)name; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) { + for (const uint8_t *p = (const uint8_t *)name; *p != 0; p++) crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF]; - } return (crc); } @@ -139,8 +135,7 @@ zvol_find_by_name_hash(const char *name, uint64_t hash, int mode) hlist_for_each(p, ZVOL_HT_HEAD(hash)) { zv = hlist_entry(p, zvol_state_t, zv_hlink); mutex_enter(&zv->zv_state_lock); - if (zv->zv_hash == hash && - strncmp(zv->zv_name, name, MAXNAMELEN) == 0) { + if (zv->zv_hash == hash && strcmp(zv->zv_name, name) == 0) { /* * this is the right zvol, take the locks in the * right order @@ -155,8 +150,7 @@ zvol_find_by_name_hash(const char *name, uint64_t hash, int mode) * to hold zvol_state_lock */ ASSERT(zv->zv_hash == hash && - strncmp(zv->zv_name, name, MAXNAMELEN) - == 0); + strcmp(zv->zv_name, name) == 0); } rw_exit(&zvol_state_lock); return (zv); @@ -365,12 +359,46 @@ out: mutex_exit(&zv->zv_state_lock); if (error == 0 && zv != NULL) - ops->zv_update_volsize(zv, volsize); + zvol_os_update_volsize(zv, volsize); return (SET_ERROR(error)); } /* + * Update volthreading. + */ +int +zvol_set_volthreading(const char *name, boolean_t value) +{ + zvol_state_t *zv = zvol_find_by_name(name, RW_NONE); + if (zv == NULL) + return (ENOENT); + zv->zv_threading = value; + mutex_exit(&zv->zv_state_lock); + return (0); +} + +/* + * Update zvol ro property. + */ +int +zvol_set_ro(const char *name, boolean_t value) +{ + zvol_state_t *zv = zvol_find_by_name(name, RW_NONE); + if (zv == NULL) + return (-1); + if (value) { + zvol_os_set_disk_ro(zv, 1); + zv->zv_flags |= ZVOL_RDONLY; + } else { + zvol_os_set_disk_ro(zv, 0); + zv->zv_flags &= ~ZVOL_RDONLY; + } + mutex_exit(&zv->zv_state_lock); + return (0); +} + +/* * Sanity check volume block size. 
*/ int @@ -418,6 +446,8 @@ zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) lr_truncate_t *lr = arg2; uint64_t offset, length; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -430,7 +460,7 @@ zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) if (error != 0) { dmu_tx_abort(tx); } else { - zil_replaying(zv->zv_zilog, tx); + (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length); @@ -454,6 +484,8 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) dmu_tx_t *tx; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -476,7 +508,7 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, offset, length, data, tx); - zil_replaying(zv->zv_zilog, tx); + (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); } @@ -486,6 +518,7 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { + (void) arg1, (void) arg2, (void) byteswap; return (SET_ERROR(ENOTSUP)); } @@ -493,7 +526,7 @@ zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) * Callback vectors for replaying records. * Only TX_WRITE and TX_TRUNCATE are needed for zvol. */ -zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { +zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* no such transaction type */ zvol_replay_err, /* TX_CREATE */ zvol_replay_err, /* TX_MKDIR */ @@ -513,6 +546,10 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_MKDIR_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ zvol_replay_err, /* TX_WRITE2 */ + zvol_replay_err, /* TX_SETSAXATTR */ + zvol_replay_err, /* TX_RENAME_EXCHANGE */ + zvol_replay_err, /* TX_RENAME_WHITEOUT */ + zvol_replay_err, /* TX_CLONE_RANGE */ }; /* @@ -521,11 +558,11 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { * We store data in the log buffers if it's small enough. * Otherwise we will later flush the data out via dmu_sync(). */ -ssize_t zvol_immediate_write_sz = 32768; +static const ssize_t zvol_immediate_write_sz = 32768; void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, - uint64_t size, int sync) + uint64_t size, boolean_t commit) { uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; @@ -540,7 +577,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, else if (!spa_has_slogs(zilog->zl_spa) && size >= blocksize && blocksize > zvol_immediate_write_sz) write_state = WR_INDIRECT; - else if (sync) + else if (commit) write_state = WR_COPIED; else write_state = WR_NEED_COPY; @@ -575,7 +612,6 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, BP_ZERO(&lr->lr_blkptr); itx->itx_private = zv; - itx->itx_sync = sync; (void) zil_itx_assign(zilog, itx, tx); @@ -592,8 +628,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. 
*/ void -zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, - boolean_t sync) +zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len) { itx_t *itx; lr_truncate_t *lr; @@ -608,15 +643,14 @@ zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, lr->lr_offset = off; lr->lr_length = len; - itx->itx_sync = sync; zil_itx_assign(zilog, itx, tx); } -/* ARGSUSED */ static void zvol_get_done(zgd_t *zgd, int error) { + (void) error; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); @@ -640,10 +674,9 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, int error; ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); - zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; /* @@ -659,6 +692,7 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); } else { /* indirect write */ + ASSERT3P(zio, !=, NULL); /* * Have to lock the whole block to ensure when it's written out * and its checksum is being calculated that no one can change @@ -669,8 +703,8 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, offset = P2ALIGN_TYPED(offset, size, uint64_t); zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); - error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, - DMU_READ_NO_PREFETCH); + error = dmu_buf_hold_noread_by_dnode(zv->zv_dn, offset, zgd, + &db); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; @@ -746,15 +780,15 @@ zvol_setup_zv(zvol_state_t *zv) if (error) return (SET_ERROR(error)); - ops->zv_set_capacity(zv, volsize >> 9); + zvol_os_set_capacity(zv, volsize >> 9); zv->zv_volsize = volsize; if (ro || dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) { - ops->zv_set_disk_ro(zv, 1); + zvol_os_set_disk_ro(zv, 1); zv->zv_flags |= ZVOL_RDONLY; } else { - ops->zv_set_disk_ro(zv, 0); + zvol_os_set_disk_ro(zv, 0); zv->zv_flags &= ~ZVOL_RDONLY; } return (0); @@ -867,54 +901,26 @@ int zvol_first_open(zvol_state_t *zv, boolean_t readonly) { objset_t *os; - int error, locked = 0; - boolean_t ro; + int error; ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(mutex_owned(&spa_namespace_lock)); - /* - * In all other cases the spa_namespace_lock is taken before the - * bdev->bd_mutex lock. But in this case the Linux __blkdev_get() - * function calls fops->open() with the bdev->bd_mutex lock held. - * This deadlock can be easily observed with zvols used as vdevs. - * - * To avoid a potential lock inversion deadlock we preemptively - * try to take the spa_namespace_lock(). Normally it will not - * be contended and this is safe because spa_open_common() handles - * the case where the caller already holds the spa_namespace_lock. - * - * When it is contended we risk a lock inversion if we were to - * block waiting for the lock. Luckily, the __blkdev_get() - * function allows us to return -ERESTARTSYS which will result in - * bdev->bd_mutex being dropped, reacquired, and fops->open() being - * called again. This process can be repeated safely until both - * locks are acquired. 
- */ - if (!mutex_owned(&spa_namespace_lock)) { - locked = mutex_tryenter(&spa_namespace_lock); - if (!locked) - return (SET_ERROR(EINTR)); - } - - ro = (readonly || (strchr(zv->zv_name, '@') != NULL)); + boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL)); error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os); if (error) - goto out_mutex; + return (SET_ERROR(error)); zv->zv_objset = os; error = zvol_setup_zv(zv); - if (error) { dmu_objset_disown(os, 1, zv); zv->zv_objset = NULL; } -out_mutex: - if (locked) - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(error)); + return (error); } void @@ -951,7 +957,7 @@ zvol_prefetch_minors_impl(void *arg) job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (job->error == 0) { - dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(os, ZVOL_OBJ, ZIO_PRIORITY_SYNC_READ); dmu_objset_disown(os, B_TRUE, FTAG); } } @@ -1053,8 +1059,7 @@ zvol_add_clones(const char *dsname, list_t *minors_list) out: if (dd != NULL) dsl_dir_rele(dd, FTAG); - if (dp != NULL) - dsl_pool_rele(dp, FTAG); + dsl_pool_rele(dp, FTAG); } /* @@ -1102,7 +1107,7 @@ zvol_create_minors_cb(const char *dsname, void *arg) * traverse snapshots only, do not traverse children, * and skip the 'dsname' */ - error = dmu_objset_find(dsname, + (void) dmu_objset_find(dsname, zvol_create_snap_minor_cb, (void *)job, DS_FIND_SNAPSHOTS); } @@ -1146,7 +1151,7 @@ zvol_create_minors_recursive(const char *name) * taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need * any lock because all list operation is done on the current thread. * - * We will use this list to do zvol_create_minor_impl after prefetch + * We will use this list to do zvol_os_create_minor after prefetch * so we don't have to traverse using dmu_objset_find again. */ list_create(&minors_list, sizeof (minors_job_t), @@ -1160,7 +1165,7 @@ zvol_create_minors_recursive(const char *name) &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) - (void) ops->zv_create_minor(name); + (void) zvol_os_create_minor(name); } else { fstrans_cookie_t cookie = spl_fstrans_mark(); (void) dmu_objset_find(name, zvol_create_minors_cb, @@ -1171,13 +1176,12 @@ zvol_create_minors_recursive(const char *name) taskq_wait_outstanding(system_taskq, 0); /* - * Prefetch is completed, we can do zvol_create_minor_impl + * Prefetch is completed, we can do zvol_os_create_minor * sequentially. */ - while ((job = list_head(&minors_list)) != NULL) { - list_remove(&minors_list, job); + while ((job = list_remove_head(&minors_list)) != NULL) { if (!job->error) - (void) ops->zv_create_minor(job->name); + (void) zvol_os_create_minor(job->name); kmem_strfree(job->name); kmem_free(job, sizeof (minors_job_t)); } @@ -1207,9 +1211,9 @@ zvol_create_minor(const char *name) "snapdev", &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) - (void) ops->zv_create_minor(name); + (void) zvol_os_create_minor(name); } else { - (void) ops->zv_create_minor(name); + (void) zvol_os_create_minor(name); } } @@ -1220,7 +1224,7 @@ zvol_create_minor(const char *name) static void zvol_free_task(void *arg) { - ops->zv_free(arg); + zvol_os_free(arg); } void @@ -1265,7 +1269,7 @@ zvol_remove_minors_impl(const char *name) * Cleared while holding zvol_state_lock as a writer * which will prevent zvol_open() from opening it. 
*/ - ops->zv_clear_private(zv); + zvol_os_clear_private(zv); /* Drop zv_state_lock before zvol_free() */ mutex_exit(&zv->zv_state_lock); @@ -1282,10 +1286,8 @@ zvol_remove_minors_impl(const char *name) rw_exit(&zvol_state_lock); /* Drop zvol_state_lock before calling zvol_free() */ - while ((zv = list_head(&free_list)) != NULL) { - list_remove(&free_list, zv); - ops->zv_free(zv); - } + while ((zv = list_remove_head(&free_list)) != NULL) + zvol_os_free(zv); } /* Remove minor for this specific volume only */ @@ -1317,7 +1319,7 @@ zvol_remove_minor_impl(const char *name) } zvol_remove(zv); - ops->zv_clear_private(zv); + zvol_os_clear_private(zv); mutex_exit(&zv->zv_state_lock); break; } else { @@ -1329,7 +1331,7 @@ zvol_remove_minor_impl(const char *name) rw_exit(&zvol_state_lock); if (zv != NULL) - ops->zv_free(zv); + zvol_os_free(zv); } /* @@ -1339,13 +1341,12 @@ static void zvol_rename_minors_impl(const char *oldname, const char *newname) { zvol_state_t *zv, *zv_next; - int oldnamelen, newnamelen; + int oldnamelen; if (zvol_inhibit_dev) return; oldnamelen = strlen(oldname); - newnamelen = strlen(newname); rw_enter(&zvol_state_lock, RW_READER); @@ -1355,14 +1356,14 @@ zvol_rename_minors_impl(const char *oldname, const char *newname) mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, oldname) == 0) { - ops->zv_rename_minor(zv, newname); + zvol_os_rename_minor(zv, newname); } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && (zv->zv_name[oldnamelen] == '/' || zv->zv_name[oldnamelen] == '@')) { char *name = kmem_asprintf("%s%c%s", newname, zv->zv_name[oldnamelen], zv->zv_name + oldnamelen + 1); - ops->zv_rename_minor(zv, name); + zvol_os_rename_minor(zv, name); kmem_strfree(name); } @@ -1386,7 +1387,7 @@ zvol_set_snapdev_cb(const char *dsname, void *param) switch (arg->snapdev) { case ZFS_SNAPDEV_VISIBLE: - (void) ops->zv_create_minor(dsname); + (void) zvol_os_create_minor(dsname); break; case ZFS_SNAPDEV_HIDDEN: (void) zvol_remove_minor_impl(dsname); @@ -1443,14 +1444,14 @@ zvol_set_volmode_impl(char *name, uint64_t volmode) case ZFS_VOLMODE_GEOM: case ZFS_VOLMODE_DEV: (void) zvol_remove_minor_impl(name); - (void) ops->zv_create_minor(name); + (void) zvol_os_create_minor(name); break; case ZFS_VOLMODE_DEFAULT: (void) zvol_remove_minor_impl(name); if (zvol_volmode == ZFS_VOLMODE_NONE) break; else /* if zvol_volmode is invalid defaults to "geom" */ - (void) ops->zv_create_minor(name); + (void) zvol_os_create_minor(name); break; } spl_fstrans_unmark(cookie); @@ -1470,9 +1471,9 @@ zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2, task->op = op; task->value = value; - strlcpy(task->name1, name1, MAXNAMELEN); + strlcpy(task->name1, name1, sizeof (task->name1)); if (name2 != NULL) - strlcpy(task->name2, name2, MAXNAMELEN); + strlcpy(task->name2, name2, sizeof (task->name2)); return (task); } @@ -1516,7 +1517,7 @@ typedef struct zvol_set_prop_int_arg { const char *zsda_name; uint64_t zsda_value; zprop_source_t zsda_source; - dmu_tx_t *zsda_tx; + zfs_prop_t zsda_prop; } zvol_set_prop_int_arg_t; /* @@ -1524,7 +1525,7 @@ typedef struct zvol_set_prop_int_arg { * conditions are imposed. 
*/ static int -zvol_set_snapdev_check(void *arg, dmu_tx_t *tx) +zvol_set_common_check(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); @@ -1540,104 +1541,34 @@ zvol_set_snapdev_check(void *arg, dmu_tx_t *tx) return (error); } -/* ARGSUSED */ static int -zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +zvol_set_common_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { - char dsname[MAXNAMELEN]; + zvol_set_prop_int_arg_t *zsda = arg; + char dsname[ZFS_MAX_DATASET_NAME_LEN]; zvol_task_t *task; - uint64_t snapdev; + uint64_t prop; + const char *prop_name = zfs_prop_to_name(zsda->zsda_prop); dsl_dataset_name(ds, dsname); - if (dsl_prop_get_int_ds(ds, "snapdev", &snapdev) != 0) - return (0); - task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, snapdev); - if (task == NULL) - return (0); - - (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, - task, TQ_SLEEP); - return (0); -} - -/* - * Traverse all child datasets and apply snapdev appropriately. - * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel - * dataset and read the effective "snapdev" on every child in the callback - * function: this is because the value is not guaranteed to be the same in the - * whole dataset hierarchy. - */ -static void -zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx) -{ - zvol_set_prop_int_arg_t *zsda = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dir_t *dd; - dsl_dataset_t *ds; - int error; - VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); - zsda->zsda_tx = tx; + if (dsl_prop_get_int_ds(ds, prop_name, &prop) != 0) + return (0); - error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds); - if (error == 0) { - dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV), - zsda->zsda_source, sizeof (zsda->zsda_value), 1, - &zsda->zsda_value, zsda->zsda_tx); - dsl_dataset_rele(ds, FTAG); + switch (zsda->zsda_prop) { + case ZFS_PROP_VOLMODE: + task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, + NULL, prop); + break; + case ZFS_PROP_SNAPDEV: + task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, + NULL, prop); + break; + default: + task = NULL; + break; } - dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb, - zsda, DS_FIND_CHILDREN); - dsl_dir_rele(dd, FTAG); -} - -int -zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev) -{ - zvol_set_prop_int_arg_t zsda; - - zsda.zsda_name = ddname; - zsda.zsda_source = source; - zsda.zsda_value = snapdev; - - return (dsl_sync_task(ddname, zvol_set_snapdev_check, - zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); -} - -/* - * Sanity check the dataset for safe use by the sync task. No additional - * conditions are imposed. 
- */ -static int -zvol_set_volmode_check(void *arg, dmu_tx_t *tx) -{ - zvol_set_prop_int_arg_t *zsda = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dir_t *dd; - int error; - - error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL); - if (error != 0) - return (error); - - dsl_dir_rele(dd, FTAG); - - return (error); -} - -/* ARGSUSED */ -static int -zvol_set_volmode_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) -{ - char dsname[MAXNAMELEN]; - zvol_task_t *task; - uint64_t volmode; - - dsl_dataset_name(ds, dsname); - if (dsl_prop_get_int_ds(ds, "volmode", &volmode) != 0) - return (0); - task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, NULL, volmode); if (task == NULL) return (0); @@ -1647,14 +1578,14 @@ zvol_set_volmode_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) } /* - * Traverse all child datasets and apply volmode appropriately. + * Traverse all child datasets and apply the property appropriately. * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel - * dataset and read the effective "volmode" on every child in the callback + * dataset and read the effective "property" on every child in the callback * function: this is because the value is not guaranteed to be the same in the * whole dataset hierarchy. */ static void -zvol_set_volmode_sync(void *arg, dmu_tx_t *tx) +zvol_set_common_sync(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); @@ -1663,33 +1594,34 @@ zvol_set_volmode_sync(void *arg, dmu_tx_t *tx) int error; VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); - zsda->zsda_tx = tx; error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds); if (error == 0) { - dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_VOLMODE), + dsl_prop_set_sync_impl(ds, zfs_prop_to_name(zsda->zsda_prop), zsda->zsda_source, sizeof (zsda->zsda_value), 1, - &zsda->zsda_value, zsda->zsda_tx); + &zsda->zsda_value, tx); dsl_dataset_rele(ds, FTAG); } - dmu_objset_find_dp(dp, dd->dd_object, zvol_set_volmode_sync_cb, + dmu_objset_find_dp(dp, dd->dd_object, zvol_set_common_sync_cb, zsda, DS_FIND_CHILDREN); dsl_dir_rele(dd, FTAG); } int -zvol_set_volmode(const char *ddname, zprop_source_t source, uint64_t volmode) +zvol_set_common(const char *ddname, zfs_prop_t prop, zprop_source_t source, + uint64_t val) { zvol_set_prop_int_arg_t zsda; zsda.zsda_name = ddname; zsda.zsda_source = source; - zsda.zsda_value = volmode; + zsda.zsda_value = val; + zsda.zsda_prop = prop; - return (dsl_sync_task(ddname, zvol_set_volmode_check, - zvol_set_volmode_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); + return (dsl_sync_task(ddname, zvol_set_common_check, + zvol_set_common_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); } void @@ -1727,13 +1659,7 @@ boolean_t zvol_is_zvol(const char *name) { - return (ops->zv_is_zvol(name)); -} - -void -zvol_register_ops(const zvol_platform_ops_t *zvol_ops) -{ - ops = zvol_ops; + return (zvol_os_is_zvol(name)); } int |
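The final zvol.c hunks above fold zvol_set_snapdev() and zvol_set_volmode() into a single zvol_set_common() entry point: the sync-task argument now carries a zfs_prop_t, and the per-dataset callback picks the matching async task from it. A minimal sketch of that dispatch-by-property pattern follows; the enum, struct, and apply_prop() helper are hypothetical stand-ins used only to illustrate the shape of the change.

/*
 * Sketch of dispatching a per-dataset action from a property id.
 */
#include <stdio.h>
#include <stdint.h>

typedef enum { PROP_SNAPDEV, PROP_VOLMODE } vol_prop_t;

typedef struct {
	vol_prop_t	prop;	/* which property is being set */
	uint64_t	value;	/* effective value for this dataset */
} vol_prop_arg_t;

/* Per-dataset callback: one switch selects the task to dispatch. */
static void
apply_prop(const char *dsname, const vol_prop_arg_t *arg)
{
	switch (arg->prop) {
	case PROP_SNAPDEV:
		printf("%s: dispatch SET_SNAPDEV(%llu)\n", dsname,
		    (unsigned long long)arg->value);
		break;
	case PROP_VOLMODE:
		printf("%s: dispatch SET_VOLMODE(%llu)\n", dsname,
		    (unsigned long long)arg->value);
		break;
	default:
		break;	/* unknown property: nothing to dispatch */
	}
}

int
main(void)
{
	vol_prop_arg_t snapdev = { PROP_SNAPDEV, 1 };
	vol_prop_arg_t volmode = { PROP_VOLMODE, 2 };

	apply_prop("pool/vol", &snapdev);
	apply_prop("pool/vol@snap", &snapdev);
	apply_prop("pool/vol", &volmode);
	return (0);
}

One entry point and one callback replace the two near-duplicate check/sync/callback triples removed in the diff, which is the point of the consolidation.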