7 files changed, 387 insertions, 428 deletions
diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c
index 3d0f88b36336..7403f10d91b7 100644
--- a/sys/contrib/openzfs/module/zfs/dbuf.c
+++ b/sys/contrib/openzfs/module/zfs/dbuf.c
@@ -2557,12 +2557,13 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
 	/*
 	 * Due to our use of dn_nlevels below, this can only be called
-	 * in open context, unless we are operating on the MOS.
-	 * From syncing context, dn_nlevels may be different from the
-	 * dn_nlevels used when dbuf was dirtied.
+	 * in open context, unless we are operating on the MOS or it's
+	 * a special object. From syncing context, dn_nlevels may be
+	 * different from the dn_nlevels used when dbuf was dirtied.
 	 */
 	ASSERT(db->db_objset ==
 	    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
+	    DMU_OBJECT_IS_SPECIAL(db->db.db_object) ||
 	    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT0(db->db_level);
diff --git a/sys/contrib/openzfs/module/zfs/multilist.c b/sys/contrib/openzfs/module/zfs/multilist.c
index 7b85d19e19ee..46fb79269310 100644
--- a/sys/contrib/openzfs/module/zfs/multilist.c
+++ b/sys/contrib/openzfs/module/zfs/multilist.c
@@ -81,7 +81,7 @@ multilist_create_impl(multilist_t *ml, size_t size, size_t offset,
 	ml->ml_num_sublists = num;
 	ml->ml_index_func = index_func;
 
-	ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
+	ml->ml_sublists = vmem_zalloc(sizeof (multilist_sublist_t) *
 	    ml->ml_num_sublists, KM_SLEEP);
 
 	ASSERT3P(ml->ml_sublists, !=, NULL);
@@ -134,7 +134,7 @@ multilist_destroy(multilist_t *ml)
 	}
 
 	ASSERT3P(ml->ml_sublists, !=, NULL);
-	kmem_free(ml->ml_sublists,
+	vmem_free(ml->ml_sublists,
 	    sizeof (multilist_sublist_t) * ml->ml_num_sublists);
 
 	ml->ml_num_sublists = 0;
diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c
index 7d4d06659146..cf28955b0c50 100644
--- a/sys/contrib/openzfs/module/zfs/spa_config.c
+++ b/sys/contrib/openzfs/module/zfs/spa_config.c
@@ -48,18 +48,17 @@
 /*
  * Pool configuration repository.
  *
- * Pool configuration is stored as a packed nvlist on the filesystem.  By
- * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot
- * (when the ZFS module is loaded).  Pools can also have the 'cachefile'
- * property set that allows them to be stored in an alternate location until
- * the control of external software.
+ * Pool configuration is stored as a packed nvlist on the filesystem.  When
+ * pools are imported they are added to the /etc/zfs/zpool.cache file and
+ * removed from it when exported.  For each cache file, we have a single nvlist
+ * which holds all the configuration information.  Pools can also have the
+ * 'cachefile' property set which allows this config to be stored in an
+ * alternate location under the control of external software.
  *
- * For each cache file, we have a single nvlist which holds all the
- * configuration information.  When the module loads, we read this information
- * from /etc/zfs/zpool.cache and populate the SPA namespace.  This namespace is
- * maintained independently in spa.c.  Whenever the namespace is modified, or
- * the configuration of a pool is changed, we call spa_write_cachefile(), which
- * walks through all the active pools and writes the configuration to disk.
+ * The kernel independantly maintains an AVL tree of imported pools.  See the
+ * "SPA locking" comment in spa.c.  Whenever a pool configuration is modified
+ * we call spa_write_cachefile() which walks through all the active pools and
+ * writes the updated configuration to to /etc/zfs/zpool.cache file.
  */
 
 static uint64_t spa_config_generation = 1;
@@ -69,94 +68,6 @@ static uint64_t spa_config_generation = 1;
  * userland pools when doing testing.
  */
 char *spa_config_path = (char *)ZPOOL_CACHE;
-#ifdef _KERNEL
-static int zfs_autoimport_disable = B_TRUE;
-#endif
-
-/*
- * Called when the module is first loaded, this routine loads the configuration
- * file into the SPA namespace.  It does not actually open or load the pools; it
- * only populates the namespace.
- */
-void
-spa_config_load(void)
-{
-	void *buf = NULL;
-	nvlist_t *nvlist, *child;
-	nvpair_t *nvpair;
-	char *pathname;
-	zfs_file_t *fp;
-	zfs_file_attr_t zfa;
-	uint64_t fsize;
-	int err;
-
-#ifdef _KERNEL
-	if (zfs_autoimport_disable)
-		return;
-#endif
-
-	/*
-	 * Open the configuration file.
-	 */
-	pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-
-	(void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path);
-
-	err = zfs_file_open(pathname, O_RDONLY, 0, &fp);
-
-#ifdef __FreeBSD__
-	if (err)
-		err = zfs_file_open(ZPOOL_CACHE_BOOT, O_RDONLY, 0, &fp);
-#endif
-	kmem_free(pathname, MAXPATHLEN);
-
-	if (err)
-		return;
-
-	if (zfs_file_getattr(fp, &zfa))
-		goto out;
-
-	fsize = zfa.zfa_size;
-	buf = kmem_alloc(fsize, KM_SLEEP);
-
-	/*
-	 * Read the nvlist from the file.
-	 */
-	if (zfs_file_read(fp, buf, fsize, NULL) < 0)
-		goto out;
-
-	/*
-	 * Unpack the nvlist.
-	 */
-	if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
-		goto out;
-
-	/*
-	 * Iterate over all elements in the nvlist, creating a new spa_t for
-	 * each one with the specified configuration.
-	 */
-	mutex_enter(&spa_namespace_lock);
-	nvpair = NULL;
-	while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
-		if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
-			continue;
-
-		child = fnvpair_value_nvlist(nvpair);
-
-		if (spa_lookup(nvpair_name(nvpair)) != NULL)
-			continue;
-		(void) spa_add(nvpair_name(nvpair), child, NULL);
-	}
-	mutex_exit(&spa_namespace_lock);
-
-	nvlist_free(nvlist);
-
-out:
-	if (buf != NULL)
-		kmem_free(buf, fsize);
-
-	zfs_file_close(fp);
-}
 
 static int
 spa_config_remove(spa_config_dirent_t *dp)
@@ -623,7 +534,6 @@ spa_config_update(spa_t *spa, int what)
 		spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
 }
 
-EXPORT_SYMBOL(spa_config_load);
 EXPORT_SYMBOL(spa_all_configs);
 EXPORT_SYMBOL(spa_config_set);
 EXPORT_SYMBOL(spa_config_generate);
@@ -634,8 +544,3 @@ EXPORT_SYMBOL(spa_config_update);
 ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD,
 	"SPA config file (/etc/zfs/zpool.cache)");
 #endif
-
-#ifdef _KERNEL
-ZFS_MODULE_PARAM(zfs, zfs_, autoimport_disable, INT, ZMOD_RW,
-	"Disable pool import at module load");
-#endif
diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c
index cce772eae598..dceafbc27556 100644
--- a/sys/contrib/openzfs/module/zfs/spa_misc.c
+++ b/sys/contrib/openzfs/module/zfs/spa_misc.c
@@ -2548,13 +2548,6 @@ spa_name_compare(const void *a1, const void *a2)
 }
 
 void
-spa_boot_init(void *unused)
-{
-	(void) unused;
-	spa_config_load();
-}
-
-void
 spa_init(spa_mode_t mode)
 {
 	mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -2607,7 +2600,6 @@ spa_init(spa_mode_t mode)
 	chksum_init();
 	zpool_prop_init();
 	zpool_feature_init();
-	spa_config_load();
 	vdev_prop_init();
 	l2arc_start();
 	scan_init();
diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c
index 31b59c55f17b..0307df55aa21 100644
--- a/sys/contrib/openzfs/module/zfs/zil.c
+++ b/sys/contrib/openzfs/module/zfs/zil.c
@@ -819,34 +819,37 @@ zil_lwb_vdev_compare(const void *x1, const void *x2)
  * we choose them here and later make the block allocation match.
  */
 static lwb_t *
-zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
-    uint64_t txg, lwb_state_t state)
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, int min_sz, int sz,
+    boolean_t slog, uint64_t txg)
 {
 	lwb_t *lwb;
 
 	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+	lwb->lwb_flags = 0;
 	lwb->lwb_zilog = zilog;
 	if (bp) {
 		lwb->lwb_blk = *bp;
-		lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2);
+		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2)
+			lwb->lwb_flags |= LWB_FLAG_SLIM;
 		sz = BP_GET_LSIZE(bp);
+		lwb->lwb_min_sz = sz;
 	} else {
 		BP_ZERO(&lwb->lwb_blk);
-		lwb->lwb_slim = (spa_version(zilog->zl_spa) >=
-		    SPA_VERSION_SLIM_ZIL);
+		if (spa_version(zilog->zl_spa) >= SPA_VERSION_SLIM_ZIL)
+			lwb->lwb_flags |= LWB_FLAG_SLIM;
+		lwb->lwb_min_sz = min_sz;
 	}
-	lwb->lwb_slog = slog;
+	if (slog)
+		lwb->lwb_flags |= LWB_FLAG_SLOG;
 	lwb->lwb_error = 0;
-	if (lwb->lwb_slim) {
-		lwb->lwb_nmax = sz;
-		lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
-	} else {
-		lwb->lwb_nmax = sz - sizeof (zil_chain_t);
-		lwb->lwb_nused = lwb->lwb_nfilled = 0;
-	}
+	/*
+	 * Buffer allocation and capacity setup will be done in
+	 * zil_lwb_write_open() when the LWB is opened for ITX assignment.
+	 */
+	lwb->lwb_nmax = lwb->lwb_nused = lwb->lwb_nfilled = 0;
 	lwb->lwb_sz = sz;
-	lwb->lwb_state = state;
-	lwb->lwb_buf = zio_buf_alloc(sz);
+	lwb->lwb_buf = NULL;
+	lwb->lwb_state = LWB_STATE_NEW;
 	lwb->lwb_child_zio = NULL;
 	lwb->lwb_write_zio = NULL;
 	lwb->lwb_root_zio = NULL;
@@ -857,8 +860,6 @@ zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
 
 	mutex_enter(&zilog->zl_lock);
 	list_insert_tail(&zilog->zl_lwb_list, lwb);
-	if (state != LWB_STATE_NEW)
-		zilog->zl_last_lwb_opened = lwb;
 	mutex_exit(&zilog->zl_lock);
 
 	return (lwb);
@@ -878,7 +879,7 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
 	VERIFY(list_is_empty(&lwb->lwb_itxs));
 	VERIFY(list_is_empty(&lwb->lwb_waiters));
 	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
-	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
+	ASSERT(!MUTEX_HELD(&lwb->lwb_lock));
 
 	/*
 	 * Clear the zilog's field to indicate this lwb is no longer
@@ -1019,7 +1020,7 @@ zil_create(zilog_t *zilog)
 		}
 
 		error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk,
-		    ZIL_MIN_BLKSZ, &slog);
+		    ZIL_MIN_BLKSZ, ZIL_MIN_BLKSZ, &slog, B_TRUE);
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
 	}
@@ -1028,7 +1029,7 @@ zil_create(zilog_t *zilog)
 	 * Allocate a log write block (lwb) for the first log block.
 	 */
 	if (error == 0)
-		lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW);
+		lwb = zil_alloc_lwb(zilog, &blk, 0, 0, slog, txg);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
@@ -1324,10 +1325,12 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
  * zil_commit() is racing with spa_sync().
  */
 static void
-zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
+zil_commit_waiter_done(zil_commit_waiter_t *zcw, int err)
 {
 	mutex_enter(&zcw->zcw_lock);
 	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
+	zcw->zcw_lwb = NULL;
+	zcw->zcw_error = err;
 	zcw->zcw_done = B_TRUE;
 	cv_broadcast(&zcw->zcw_cv);
 	mutex_exit(&zcw->zcw_lock);
@@ -1389,7 +1392,7 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
 	if (zil_nocacheflush)
 		return;
 
-	mutex_enter(&lwb->lwb_vdev_lock);
+	mutex_enter(&lwb->lwb_lock);
 	for (i = 0; i < ndvas; i++) {
 		zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
 		if (avl_find(t, &zvsearch, &where) == NULL) {
@@ -1398,7 +1401,7 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
 			avl_insert(t, zv, where);
 		}
 	}
-	mutex_exit(&lwb->lwb_vdev_lock);
+	mutex_exit(&lwb->lwb_lock);
 }
 
 static void
@@ -1415,12 +1418,12 @@ zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
 
 	/*
 	 * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does
-	 * not need the protection of lwb_vdev_lock (it will only be modified
+	 * not need the protection of lwb_lock (it will only be modified
 	 * while holding zilog->zl_lock) as its writes and those of its
 	 * children have all completed.  The younger 'nlwb' may be waiting on
 	 * future writes to additional vdevs.
 	 */
-	mutex_enter(&nlwb->lwb_vdev_lock);
+	mutex_enter(&nlwb->lwb_lock);
 	/*
 	 * Tear down the 'lwb' vdev tree, ensuring that entries which do not
 	 * exist in 'nlwb' are moved to it, freeing any would-be duplicates.
@@ -1434,7 +1437,7 @@ zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
 			kmem_free(zv, sizeof (*zv));
 		}
 	}
-	mutex_exit(&nlwb->lwb_vdev_lock);
+	mutex_exit(&nlwb->lwb_lock);
 }
 
 void
@@ -1491,10 +1494,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
 		zil_itx_destroy(itx, 0);
 
 	while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
-		mutex_enter(&zcw->zcw_lock);
-
-		ASSERT3P(zcw->zcw_lwb, ==, lwb);
-		zcw->zcw_lwb = NULL;
 		/*
 		 * We expect any ZIO errors from child ZIOs to have been
 		 * propagated "up" to this specific LWB's root ZIO, in
@@ -1509,14 +1508,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
 		 * errors not being handled correctly here. See the
 		 * comment above the call to "zio_flush" for details.
 		 */
-
-		zcw->zcw_zio_error = zio->io_error;
-
-		ASSERT3B(zcw->zcw_done, ==, B_FALSE);
-		zcw->zcw_done = B_TRUE;
-		cv_broadcast(&zcw->zcw_cv);
-
-		mutex_exit(&zcw->zcw_lock);
+		zil_commit_waiter_done(zcw, zio->io_error);
 	}
 
 	uint64_t txg = lwb->lwb_issued_txg;
@@ -1588,7 +1580,7 @@ zil_lwb_write_done(zio_t *zio)
 	avl_tree_t *t = &lwb->lwb_vdev_tree;
 	void *cookie = NULL;
 	zil_vdev_node_t *zv;
-	lwb_t *nlwb;
+	lwb_t *nlwb = NULL;
 
 	ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
 
@@ -1608,9 +1600,11 @@ zil_lwb_write_done(zio_t *zio)
 	 * its write ZIO a parent this ZIO.  In such case we can not defer
 	 * our flushes or below may be a race between the done callbacks.
 	 */
-	nlwb = list_next(&zilog->zl_lwb_list, lwb);
-	if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED)
-		nlwb = NULL;
+	if (!(lwb->lwb_flags & LWB_FLAG_CRASHED)) {
+		nlwb = list_next(&zilog->zl_lwb_list, lwb);
+		if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED)
+			nlwb = NULL;
+	}
 	mutex_exit(&zilog->zl_lock);
 
 	if (avl_numnodes(t) == 0)
@@ -1624,12 +1618,17 @@ zil_lwb_write_done(zio_t *zio)
 	 * written out.
 	 *
 	 * Additionally, we don't perform any further error handling at
-	 * this point (e.g. setting "zcw_zio_error" appropriately), as
-	 * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus,
-	 * we expect any error seen here, to have been propagated to
-	 * that function).
+	 * this point (e.g. setting "zcw_error" appropriately), as we
+	 * expect that to occur in "zil_lwb_flush_vdevs_done" (thus, we
+	 * expect any error seen here, to have been propagated to that
+	 * function).
+	 *
+	 * Note that we treat a "crashed" LWB as though it was in error,
+	 * even if it did appear to succeed, because we've already
+	 * signaled error and cleaned up waiters and committers in
+	 * zil_crash(); we just want to clean up and get out of here.
 	 */
-	if (zio->io_error != 0) {
+	if (zio->io_error != 0 || (lwb->lwb_flags & LWB_FLAG_CRASHED)) {
 		while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
 			kmem_free(zv, sizeof (*zv));
 		return;
@@ -1742,10 +1741,26 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
 		return;
 	}
 
+	mutex_enter(&lwb->lwb_lock);
 	mutex_enter(&zilog->zl_lock);
 	lwb->lwb_state = LWB_STATE_OPENED;
 	zilog->zl_last_lwb_opened = lwb;
 	mutex_exit(&zilog->zl_lock);
+	mutex_exit(&lwb->lwb_lock);
+
+	/*
+	 * Allocate buffer and set up LWB capacities.
+	 */
+	ASSERT0P(lwb->lwb_buf);
+	ASSERT3U(lwb->lwb_sz, >, 0);
+	lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
+	if (lwb->lwb_flags & LWB_FLAG_SLIM) {
+		lwb->lwb_nmax = lwb->lwb_sz;
+		lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
+	} else {
+		lwb->lwb_nmax = lwb->lwb_sz - sizeof (zil_chain_t);
+		lwb->lwb_nused = lwb->lwb_nfilled = 0;
+	}
 }
 
 /*
@@ -1762,6 +1777,8 @@ static uint_t
 zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize)
 {
 	uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t);
+	uint_t waste = zil_max_waste_space(zilog);
+	waste = MAX(waste, zilog->zl_cur_max);
 
 	if (size <= md) {
 		/*
@@ -1772,9 +1789,10 @@ zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize)
 	} else if (size > 8 * md) {
 		/*
 		 * Big bursts use maximum blocks.  The first block size
-		 * is hard to predict, but it does not really matter.
+		 * is hard to predict, but we need at least enough space
+		 * to make reasonable progress.
 		 */
-		*minsize = 0;
+		*minsize = waste;
 		return (md);
 	}
 
@@ -1787,57 +1805,52 @@ zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize)
 	uint_t s = size;
 	uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t));
 	uint_t chunk = DIV_ROUND_UP(s, n);
-	uint_t waste = zil_max_waste_space(zilog);
-	waste = MAX(waste, zilog->zl_cur_max);
 	if (chunk <= md - waste) {
 		*minsize = MAX(s - (md - waste) * (n - 1), waste);
 		return (chunk);
 	} else {
-		*minsize = 0;
+		*minsize = waste;
 		return (md);
 	}
 }
 
 /*
  * Try to predict next block size based on previous history.  Make prediction
- * sufficient for 7 of 8 previous bursts.  Don't try to save if the saving is
- * less then 50%, extra writes may cost more, but we don't want single spike
- * to badly affect our predictions.
+ * sufficient for 7 of 8 previous bursts, but don't try to save if the saving
+ * is less then 50%.  Extra writes may cost more, but we don't want single
+ * spike to badly affect our predictions.
  */
-static uint_t
-zil_lwb_predict(zilog_t *zilog)
+static void
+zil_lwb_predict(zilog_t *zilog, uint64_t *min_predict, uint64_t *max_predict)
 {
-	uint_t m, o;
+	uint_t m1 = 0, m2 = 0, o;
 
-	/* If we are in the middle of a burst, take it into account also. */
-	if (zilog->zl_cur_size > 0) {
-		o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m);
-	} else {
+	/* If we are in the middle of a burst, take it as another data point. */
+	if (zilog->zl_cur_size > 0)
+		o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m1);
+	else
 		o = UINT_MAX;
-		m = 0;
-	}
 
-	/* Find minimum optimal size.  We don't need to go below that. */
-	for (int i = 0; i < ZIL_BURSTS; i++)
-		o = MIN(o, zilog->zl_prev_opt[i]);
-
-	/* Find two biggest minimal first block sizes above the optimal. */
-	uint_t m1 = MAX(m, o), m2 = o;
+	/* Find two largest minimal first block sizes. */
 	for (int i = 0; i < ZIL_BURSTS; i++) {
-		m = zilog->zl_prev_min[i];
-		if (m >= m1) {
+		uint_t cur = zilog->zl_prev_min[i];
+		if (cur >= m1) {
 			m2 = m1;
-			m1 = m;
-		} else if (m > m2) {
-			m2 = m;
+			m1 = cur;
+		} else if (cur > m2) {
+			m2 = cur;
 		}
 	}
 
-	/*
-	 * If second minimum size gives 50% saving -- use it.  It may cost us
-	 * one additional write later, but the space saving is just too big.
-	 */
-	return ((m1 < m2 * 2) ? m1 : m2);
+	/* Minimum should guarantee progress in most cases. */
+	*min_predict = (m1 < m2 * 2) ? m1 : m2;
+
+	/* Maximum doesn't need to go below the minimum optimal size. */
+	for (int i = 0; i < ZIL_BURSTS; i++)
+		o = MIN(o, zilog->zl_prev_opt[i]);
+	m1 = MAX(m1, o);
+	m2 = MAX(m2, o);
+	*max_predict = (m1 < m2 * 2) ? m1 : m2;
 }
 
 /*
@@ -1845,12 +1858,13 @@ zil_lwb_predict(zilog_t *zilog)
  * Has to be called under zl_issuer_lock to chain more lwbs.
  */
 static lwb_t *
-zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
+zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb)
 {
-	uint64_t blksz, plan, plan2;
+	uint64_t minbs, maxbs;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+	membar_producer();
 	lwb->lwb_state = LWB_STATE_CLOSED;
 
 	/*
@@ -1875,27 +1889,34 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
 		 * Try to predict what can it be and plan for the worst case.
 		 */
 		uint_t m;
-		plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m);
+		maxbs = zil_lwb_plan(zilog, zilog->zl_cur_left, &m);
+		minbs = m;
 		if (zilog->zl_parallel) {
-			plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left +
-			    zil_lwb_predict(zilog), &m);
-			if (plan < plan2)
-				plan = plan2;
+			uint64_t minp, maxp;
+			zil_lwb_predict(zilog, &minp, &maxp);
+			maxp = zil_lwb_plan(zilog, zilog->zl_cur_left + maxp,
+			    &m);
+			if (maxbs < maxp)
+				maxbs = maxp;
 		}
 	} else {
 		/*
 		 * The previous burst is done and we can only predict what
 		 * will come next.
 		 */
-		plan = zil_lwb_predict(zilog);
+		zil_lwb_predict(zilog, &minbs, &maxbs);
 	}
-	blksz = plan + sizeof (zil_chain_t);
-	blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t);
-	blksz = MIN(blksz, zilog->zl_max_block_size);
-	DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz,
-	    uint64_t, plan);
 
-	return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state));
+	minbs += sizeof (zil_chain_t);
+	maxbs += sizeof (zil_chain_t);
+	minbs = P2ROUNDUP_TYPED(minbs, ZIL_MIN_BLKSZ, uint64_t);
+	maxbs = P2ROUNDUP_TYPED(maxbs, ZIL_MIN_BLKSZ, uint64_t);
+	maxbs = MIN(maxbs, zilog->zl_max_block_size);
+	minbs = MIN(minbs, maxbs);
+	DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, minbs,
+	    uint64_t, maxbs);
+
+	return (zil_alloc_lwb(zilog, NULL, minbs, maxbs, 0, 0));
 }
 
 /*
@@ -1944,14 +1965,16 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
 	mutex_exit(&zilog->zl_lock);
 
 next_lwb:
-	if (lwb->lwb_slim)
+	if (lwb->lwb_flags & LWB_FLAG_SLIM)
 		zilc = (zil_chain_t *)lwb->lwb_buf;
 	else
 		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax);
-	int wsz = lwb->lwb_sz;
+	uint64_t alloc_size = BP_GET_LSIZE(&lwb->lwb_blk);
+	int wsz = alloc_size;
 	if (lwb->lwb_error == 0) {
 		abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz);
-		if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk)
+		if (!(lwb->lwb_flags & LWB_FLAG_SLOG) ||
+		    zilog->zl_cur_size <= zil_slog_bulk)
 			prio = ZIO_PRIORITY_SYNC_WRITE;
 		else
 			prio = ZIO_PRIORITY_ASYNC_WRITE;
@@ -1959,16 +1982,17 @@ next_lwb:
 		    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
 		    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
 		lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0,
-		    &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done,
+		    &lwb->lwb_blk, lwb_abd, alloc_size, zil_lwb_write_done,
 		    lwb, prio, ZIO_FLAG_CANFAIL, &zb);
 		zil_lwb_add_block(lwb, &lwb->lwb_blk);
 
-		if (lwb->lwb_slim) {
+		if (lwb->lwb_flags & LWB_FLAG_SLIM) {
 			/* For Slim ZIL only write what is used. */
 			wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ,
 			    int);
-			ASSERT3S(wsz, <=, lwb->lwb_sz);
-			zio_shrink(lwb->lwb_write_zio, wsz);
+			ASSERT3S(wsz, <=, alloc_size);
+			if (wsz < alloc_size)
+				zio_shrink(lwb->lwb_write_zio, wsz);
 			wsz = lwb->lwb_write_zio->io_size;
 		}
 		memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused);
@@ -2004,13 +2028,53 @@ next_lwb:
 	BP_ZERO(bp);
 	error = lwb->lwb_error;
 	if (error == 0) {
-		error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz,
-		    &slog);
+		/*
+		 * Allocation flexibility depends on LWB state:
+		 * if NEW: allow range allocation and larger sizes;
+		 * if OPENED: use fixed predetermined allocation size;
+		 * if CLOSED + Slim: allocate precisely for actual usage.
+		 */
+		boolean_t flexible = (nlwb->lwb_state == LWB_STATE_NEW);
+		if (flexible) {
+			/* We need to prevent opening till we update lwb_sz. */
+			mutex_enter(&nlwb->lwb_lock);
+			flexible = (nlwb->lwb_state == LWB_STATE_NEW);
+			if (!flexible)
+				mutex_exit(&nlwb->lwb_lock); /* We lost. */
+		}
+		boolean_t closed_slim = (nlwb->lwb_state == LWB_STATE_CLOSED &&
+		    (lwb->lwb_flags & LWB_FLAG_SLIM));
+
+		uint64_t min_size, max_size;
+		if (closed_slim) {
+			/* This transition is racy, but only one way. */
+			membar_consumer();
+			min_size = max_size = P2ROUNDUP_TYPED(nlwb->lwb_nused,
+			    ZIL_MIN_BLKSZ, uint64_t);
+		} else if (flexible) {
+			min_size = nlwb->lwb_min_sz;
+			max_size = nlwb->lwb_sz;
+		} else {
+			min_size = max_size = nlwb->lwb_sz;
+		}
+
+		error = zio_alloc_zil(spa, zilog->zl_os, txg, bp,
+		    min_size, max_size, &slog, flexible);
+		if (error == 0) {
+			if (closed_slim)
+				ASSERT3U(BP_GET_LSIZE(bp), ==, max_size);
+			else if (flexible)
+				nlwb->lwb_sz = BP_GET_LSIZE(bp);
+			else
+				ASSERT3U(BP_GET_LSIZE(bp), ==, nlwb->lwb_sz);
+		}
+		if (flexible)
+			mutex_exit(&nlwb->lwb_lock);
 	}
 	if (error == 0) {
 		ASSERT3U(BP_GET_BIRTH(bp), ==, txg);
-		BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 :
-		    ZIO_CHECKSUM_ZILOG);
+		BP_SET_CHECKSUM(bp, (nlwb->lwb_flags & LWB_FLAG_SLIM) ?
+		    ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
 		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
 		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
 	}
@@ -2039,14 +2103,15 @@ next_lwb:
 	if (nlwb) {
 		nlwb->lwb_blk = *bp;
 		nlwb->lwb_error = error;
-		nlwb->lwb_slog = slog;
+		if (slog)
+			nlwb->lwb_flags |= LWB_FLAG_SLOG;
 		nlwb->lwb_alloc_txg = txg;
 		if (nlwb->lwb_state != LWB_STATE_READY)
 			nlwb = NULL;
 	}
 	mutex_exit(&zilog->zl_lock);
 
-	if (lwb->lwb_slog) {
+	if (lwb->lwb_flags & LWB_FLAG_SLOG) {
 		ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes,
 		    lwb->lwb_nused);
@@ -2220,7 +2285,6 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs)
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT3P(lwb, !=, NULL);
-	ASSERT3P(lwb->lwb_buf, !=, NULL);
 
 	zil_lwb_write_open(zilog, lwb);
 
@@ -2262,9 +2326,10 @@ cont:
 	    (dlen % max_log_data == 0 ||
 	    lwb_sp < reclen + dlen % max_log_data))) {
 		list_insert_tail(ilwbs, lwb);
-		lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED);
+		lwb = zil_lwb_write_close(zilog, lwb);
 		if (lwb == NULL)
 			return (NULL);
+		zil_lwb_write_open(zilog, lwb);
 		lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
 	}
 
@@ -2554,7 +2619,7 @@ zil_itxg_clean(void *arg)
 		 * called) we will hit this case.
 		 */
 		if (itx->itx_lr.lrc_txtype == TX_COMMIT)
-			zil_commit_waiter_skip(itx->itx_private);
+			zil_commit_waiter_done(itx->itx_private, 0);
 
 		zil_itx_destroy(itx, 0);
 	}
@@ -2742,6 +2807,7 @@ zil_crash_clean(zilog_t *zilog, uint64_t synced_txg)
 		}
 
 		/* This LWB is from the past, so we can clean it up now. */
+		ASSERT(lwb->lwb_flags & LWB_FLAG_CRASHED);
 		list_remove(&zilog->zl_lwb_crash_list, lwb);
 		if (lwb->lwb_buf != NULL)
 			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
@@ -2981,7 +3047,7 @@ zil_prune_commit_list(zilog_t *zilog)
 			 * never any itx's for it to wait on), so it's
 			 * safe to skip this waiter and mark it done.
 			 */
-			zil_commit_waiter_skip(itx->itx_private);
+			zil_commit_waiter_done(itx->itx_private, 0);
 		} else {
 			zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
 		}
@@ -3212,15 +3278,21 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 		 * "next" lwb on-disk. When this happens, we must stall
 		 * the ZIL write pipeline; see the comment within
 		 * zil_commit_writer_stall() for more details.
+		 *
+		 * ESHUTDOWN has to be handled carefully here. If we get it,
+		 * then the pool suspended and zil_crash() was called, so we
+		 * need to stop trying and just get an error back to the
+		 * callers.
 		 */
 		int err = 0;
 		while ((lwb = list_remove_head(ilwbs)) != NULL) {
-			err = zil_lwb_write_issue(zilog, lwb);
-			if (err != 0)
-				break;
+			if (err == 0)
+				err = zil_lwb_write_issue(zilog, lwb);
 		}
-		if (err == 0)
+		if (err != ESHUTDOWN)
 			err = zil_commit_writer_stall(zilog);
+		if (err == ESHUTDOWN)
+			err = SET_ERROR(EIO);
 
 		/*
 		 * Additionally, we have to signal and mark the "nolwb"
@@ -3230,7 +3302,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 		 */
 		zil_commit_waiter_t *zcw;
 		while ((zcw = list_remove_head(&nolwb_waiters)) != NULL)
-			zil_commit_waiter_skip(zcw);
+			zil_commit_waiter_done(zcw, err);
 
 		/*
 		 * And finally, we have to destroy the itx's that
@@ -3238,7 +3310,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 		 * the itx's callback if one exists for the itx.
 		 */
 		while ((itx = list_remove_head(&nolwb_itxs)) != NULL)
-			zil_itx_destroy(itx, 0);
+			zil_itx_destroy(itx, err);
 	} else {
 		ASSERT(list_is_empty(&nolwb_waiters));
 		ASSERT3P(lwb, !=, NULL);
@@ -3292,17 +3364,17 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 		    (!zilog->zl_parallel || zilog->zl_suspend > 0)) {
 			zil_burst_done(zilog);
 			list_insert_tail(ilwbs, lwb);
-			lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
+			lwb = zil_lwb_write_close(zilog, lwb);
 			if (lwb == NULL) {
 				int err = 0;
 				while ((lwb =
 				    list_remove_head(ilwbs)) != NULL) {
-					err = zil_lwb_write_issue(zilog, lwb);
-					if (err != 0)
-						break;
+					if (err == 0)
+						err = zil_lwb_write_issue(
+						    zilog, lwb);
 				}
-				if (err == 0)
-					zil_commit_writer_stall(zilog);
+				if (err != ESHUTDOWN)
+					(void) zil_commit_writer_stall(zilog);
 			}
 		}
 	}
@@ -3470,7 +3542,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
 	 * hasn't been issued.
 	 */
 	zil_burst_done(zilog);
-	lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
+	lwb_t *nlwb = zil_lwb_write_close(zilog, lwb);
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
 
@@ -3546,7 +3618,7 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
 		 * commit itxs. When this occurs, the commit waiters linked
 		 * off of these commit itxs will not be committed to an
 		 * lwb.  Additionally, these commit waiters will not be
-		 * marked done until zil_commit_waiter_skip() is called via
+		 * marked done until zil_commit_waiter_done() is called via
 		 * zil_itxg_clean().
 		 *
 		 * Thus, it's possible for this commit waiter (i.e. the
@@ -3624,7 +3696,7 @@ zil_alloc_commit_waiter(void)
 	list_link_init(&zcw->zcw_node);
 	zcw->zcw_lwb = NULL;
 	zcw->zcw_done = B_FALSE;
-	zcw->zcw_zio_error = 0;
+	zcw->zcw_error = 0;
 
 	return (zcw);
 }
@@ -3728,6 +3800,9 @@ zil_crash(zilog_t *zilog)
 	 */
 	for (lwb_t *lwb = list_head(&zilog->zl_lwb_crash_list); lwb != NULL;
 	    lwb = list_next(&zilog->zl_lwb_crash_list, lwb)) {
+		ASSERT(!(lwb->lwb_flags & LWB_FLAG_CRASHED));
+		lwb->lwb_flags |= LWB_FLAG_CRASHED;
+
 		itx_t *itx;
 		while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL)
 			zil_itx_destroy(itx, EIO);
@@ -3736,7 +3811,7 @@ zil_crash(zilog_t *zilog)
 		while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
 			mutex_enter(&zcw->zcw_lock);
 			zcw->zcw_lwb = NULL;
-			zcw->zcw_zio_error = EIO;
+			zcw->zcw_error = EIO;
 			zcw->zcw_done = B_TRUE;
 			cv_broadcast(&zcw->zcw_cv);
 			mutex_exit(&zcw->zcw_lock);
@@ -4014,7 +4089,7 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid)
 	zil_commit_waiter(zilog, zcw);
 
 	int err = 0;
-	if (zcw->zcw_zio_error != 0) {
+	if (zcw->zcw_error != 0) {
 		/*
 		 * If there was an error writing out the ZIL blocks that
 		 * this thread is waiting on, then we fallback to
@@ -4149,7 +4224,7 @@ zil_lwb_cons(void *vbuf, void *unused, int kmflag)
 	    offsetof(zil_commit_waiter_t, zcw_node));
 	avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
 	    sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
-	mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&lwb->lwb_lock, NULL, MUTEX_DEFAULT, NULL);
 	return (0);
 }
 
@@ -4158,7 +4233,7 @@ zil_lwb_dest(void *vbuf, void *unused)
 {
 	(void) unused;
 	lwb_t *lwb = vbuf;
-	mutex_destroy(&lwb->lwb_vdev_lock);
+	mutex_destroy(&lwb->lwb_lock);
 	avl_destroy(&lwb->lwb_vdev_tree);
 	list_destroy(&lwb->lwb_waiters);
 	list_destroy(&lwb->lwb_itxs);
@@ -4381,7 +4456,7 @@ zil_close(zilog_t *zilog)
 	if (lwb != NULL) {
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 		ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW);
-		zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+		ASSERT0P(lwb->lwb_buf);
 		zil_free_lwb(zilog, lwb);
 	}
 	mutex_exit(&zilog->zl_lock);
@@ -4472,16 +4547,16 @@ zil_suspend(const char *osname, void **cookiep)
 			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
 		mutex_exit(&zilog->zl_lock);
 
-		if (cookiep == NULL)
+		if (zilog->zl_restart_txg > 0) {
+			/* ZIL crashed while we were waiting. */
+			zil_resume(os);
+			error = SET_ERROR(EBUSY);
+		} else if (cookiep == NULL)
 			zil_resume(os);
 		else
 			*cookiep = os;
 
-		if (zilog->zl_restart_txg > 0)
-			/* ZIL crashed while we were waiting. */
-			return (SET_ERROR(EBUSY));
-
-		return (0);
+		return (error);
 	}
 
 	/*
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
index 3f0ddb63249d..4cf8912d4269 100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -4434,12 +4434,15 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
  */
 int
 zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
-    uint64_t size, boolean_t *slog)
+    uint64_t min_size, uint64_t max_size, boolean_t *slog,
+    boolean_t allow_larger)
 {
 	int error;
 	zio_alloc_list_t io_alloc_list;
+	uint64_t alloc_size = 0;
 
 	ASSERT(txg > spa_syncing_txg(spa));
+	ASSERT3U(min_size, <=, max_size);
 
 	metaslab_trace_init(&io_alloc_list);
 
@@ -4448,7 +4451,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
 	 * Fill in the obvious ones before calling into metaslab_alloc().
 	 */
 	BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
-	BP_SET_PSIZE(new_bp, size);
+	BP_SET_PSIZE(new_bp, max_size);
 	BP_SET_LEVEL(new_bp, 0);
 
 	/*
@@ -4463,43 +4466,51 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
 	ZIOSTAT_BUMP(ziostat_total_allocations);
 
 	/* Try log class (dedicated slog devices) first */
-	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
-	    txg, NULL, flags, &io_alloc_list, allocator, NULL);
+	error = metaslab_alloc_range(spa, spa_log_class(spa), min_size,
+	    max_size, new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
+	    NULL, &alloc_size);
 	*slog = (error == 0);
 
 	/* Try special_embedded_log class (reserved on special vdevs) */
 	if (error != 0) {
-		error = metaslab_alloc(spa, spa_special_embedded_log_class(spa),
-		    size, new_bp, 1, txg, NULL, flags, &io_alloc_list,
-		    allocator, NULL);
+		error = metaslab_alloc_range(spa,
+		    spa_special_embedded_log_class(spa), min_size, max_size,
+		    new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
+		    NULL, &alloc_size);
 	}
 
 	/* Try special class (general special vdev allocation) */
 	if (error != 0) {
-		error = metaslab_alloc(spa, spa_special_class(spa), size,
-		    new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
-		    NULL);
+		error = metaslab_alloc_range(spa, spa_special_class(spa),
+		    min_size, max_size, new_bp, 1, txg, NULL, flags,
+		    &io_alloc_list, allocator, NULL, &alloc_size);
 	}
 
 	/* Try embedded_log class (reserved on normal vdevs) */
 	if (error != 0) {
-		error = metaslab_alloc(spa, spa_embedded_log_class(spa), size,
-		    new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
-		    NULL);
+		error = metaslab_alloc_range(spa, spa_embedded_log_class(spa),
+		    min_size, max_size, new_bp, 1, txg, NULL, flags,
+		    &io_alloc_list, allocator, NULL, &alloc_size);
 	}
 
 	/* Finally fall back to normal class */
 	if (error != 0) {
 		ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks);
-		error = metaslab_alloc(spa, spa_normal_class(spa), size,
-		    new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
-		    NULL);
+		error = metaslab_alloc_range(spa, spa_normal_class(spa),
+		    min_size, max_size, new_bp, 1, txg, NULL, flags,
+		    &io_alloc_list, allocator, NULL, &alloc_size);
 	}
 	metaslab_trace_fini(&io_alloc_list);
 
 	if (error == 0) {
-		BP_SET_LSIZE(new_bp, size);
-		BP_SET_PSIZE(new_bp, size);
+		if (!allow_larger)
+			alloc_size = MIN(alloc_size, max_size);
+		else if (max_size <= SPA_OLD_MAXBLOCKSIZE)
+			alloc_size = MIN(alloc_size, SPA_OLD_MAXBLOCKSIZE);
+		alloc_size = P2ALIGN_TYPED(alloc_size, ZIL_MIN_BLKSZ, uint64_t);
+
+		BP_SET_LSIZE(new_bp, alloc_size);
+		BP_SET_PSIZE(new_bp, alloc_size);
 		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
 		BP_SET_CHECKSUM(new_bp,
 		    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
@@ -4527,8 +4538,8 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
 		}
 	} else {
 		zfs_dbgmsg("%s: zil block allocation failure: "
-		    "size %llu, error %d", spa_name(spa), (u_longlong_t)size,
-		    error);
+		    "min_size %llu, max_size %llu, error %d", spa_name(spa),
+		    (u_longlong_t)min_size, (u_longlong_t)max_size, error);
 	}
 
 	return (error);
diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c
index 29f51e230a37..2fd3e1c37045 100644
--- a/sys/contrib/openzfs/module/zfs/zvol.c
+++ b/sys/contrib/openzfs/module/zfs/zvol.c
@@ -38,25 +38,36 @@
  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
- * Copyright (c) 2024, Klara, Inc.
+ * Copyright (c) 2024, 2025, Klara, Inc.
  */
 
 /*
  * Note on locking of zvol state structures.
  *
- * These structures are used to maintain internal state used to emulate block
- * devices on top of zvols. In particular, management of device minor number
- * operations - create, remove, rename, and set_snapdev - involves access to
- * these structures. The zvol_state_lock is primarily used to protect the
- * zvol_state_list. The zv->zv_state_lock is used to protect the contents
- * of the zvol_state_t structures, as well as to make sure that when the
- * time comes to remove the structure from the list, it is not in use, and
- * therefore, it can be taken off zvol_state_list and freed.
+ * zvol_state_t represents the connection between a single dataset
+ * (DMU_OST_ZVOL) and the device "minor" (some OS-specific representation of a
+ * "disk" or "device" or "volume", eg, a /dev/zdXX node, a GEOM object, etc).
  *
- * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol,
- * e.g. for the duration of receive and rollback operations. This lock can be
- * held for significant periods of time. Given that it is undesirable to hold
- * mutexes for long periods of time, the following lock ordering applies:
+ * The global zvol_state_lock is used to protect access to zvol_state_list and
+ * zvol_htable, which are the primary way to obtain a zvol_state_t from a name.
+ * It should not be used for anything not name-relateds, and you should avoid
+ * sleeping or waiting while its held. See zvol_find_by_name(), zvol_insert(),
+ * zvol_remove().
+ *
+ * The zv_state_lock is used to protect the contents of the associated
+ * zvol_state_t. Most of the zvol_state_t is dedicated to control and
+ * configuration; almost none of it is needed for data operations (that is,
+ * read, write, flush) so this lock is rarely taken during general IO. It
+ * should be released quickly; you should avoid sleeping or waiting while its
+ * held.
+ *
+ * zv_suspend_lock is used to suspend IO/data operations to a zvol. The read
+ * half should held for the duration of an IO operation. The write half should
+ * be taken when something to wait for IO to complete and the block further IO,
+ * eg for the duration of receive and rollback operations. This lock can be
+ * held for long periods of time.
+ *
+ * Thus, the following lock ordering appies.
  * - take zvol_state_lock if necessary, to protect zvol_state_list
  * - take zv_suspend_lock if necessary, by the code path in question
  * - take zv_state_lock to protect zvol_state_t
@@ -67,9 +78,8 @@
  * these operations are serialized per pool. Consequently, we can be certain
  * that for a given zvol, there is only one operation at a time in progress.
  * That is why one can be sure that first, zvol_state_t for a given zvol is
- * allocated and placed on zvol_state_list, and then other minor operations
- * for this zvol are going to proceed in the order of issue.
- *
+ * allocated and placed on zvol_state_list, and then other minor operations for
+ * this zvol are going to proceed in the order of issue.
  */
 
 #include <sys/dataset_kstats.h>
@@ -1570,184 +1580,156 @@ zvol_create_minors_impl(zvol_task_t *task)
 }
 
 /*
- * Remove minors for specified dataset including children and snapshots.
- */
-
-/*
- * Remove the minor for a given zvol. This will do it all:
- *  - flag the zvol for removal, so new requests are rejected
- *  - wait until outstanding requests are completed
- *  - remove it from lists
- *  - free it
- * It's also usable as a taskq task, and smells nice too.
+ * Remove minors for specified dataset and, optionally, its children and
+ * snapshots.
  */
 static void
-zvol_remove_minor_task(void *arg)
-{
-	zvol_state_t *zv = (zvol_state_t *)arg;
-
-	ASSERT(!RW_LOCK_HELD(&zvol_state_lock));
-	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
-
-	mutex_enter(&zv->zv_state_lock);
-	while (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) {
-		zv->zv_flags |= ZVOL_REMOVING;
-		cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock);
-	}
-	mutex_exit(&zv->zv_state_lock);
-
-	rw_enter(&zvol_state_lock, RW_WRITER);
-	mutex_enter(&zv->zv_state_lock);
-
-	zvol_remove(zv);
-	zvol_os_clear_private(zv);
-
-	mutex_exit(&zv->zv_state_lock);
-	rw_exit(&zvol_state_lock);
-
-	zvol_os_free(zv);
-}
-
-static void
-zvol_free_task(void *arg)
-{
-	zvol_os_free(arg);
-}
-
-static void
 zvol_remove_minors_impl(zvol_task_t *task)
 {
 	zvol_state_t *zv, *zv_next;
 	const char *name = task ? task->zt_name1 : NULL;
 	int namelen = ((name) ? strlen(name) : 0);
-	taskqid_t t;
-	list_t delay_list, free_list;
+	boolean_t children = task ? !!task->zt_value : B_TRUE;
 
 	if (zvol_inhibit_dev)
 		return;
 
-	list_create(&delay_list, sizeof (zvol_state_t),
-	    offsetof(zvol_state_t, zv_next));
-	list_create(&free_list, sizeof (zvol_state_t),
-	    offsetof(zvol_state_t, zv_next));
+	/*
+	 * We collect up zvols that we want to remove on a separate list, so
+	 * that we don't have to hold zvol_state_lock for the whole time.
+	 *
+	 * We can't remove them from the global lists until we're completely
+	 * done with them, because that would make them appear to ZFS-side ops
+	 * that they don't exist, and the name might be reused, which can't be
+	 * good.
+	 */
+	list_t remove_list;
+	list_create(&remove_list, sizeof (zvol_state_t),
+	    offsetof(zvol_state_t, zv_remove_node));
 
-	rw_enter(&zvol_state_lock, RW_WRITER);
+	rw_enter(&zvol_state_lock, RW_READER);
 
 	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
 		zv_next = list_next(&zvol_state_list, zv);
 
 		mutex_enter(&zv->zv_state_lock);
+		if (zv->zv_flags & ZVOL_REMOVING) {
+			/* Another thread is handling shutdown, skip it. */
+			mutex_exit(&zv->zv_state_lock);
+			continue;
+		}
+
+		/*
+		 * This zvol should be removed if:
+		 * - no name was offered (ie removing all at shutdown); or
+		 * - name matches exactly; or
+		 * - we were asked to remove children, and
+		 *   - the start of the name matches, and
+		 *   - there is a '/' immediately after the matched name; or
+		 *   - there is a '@' immediately after the matched name
+		 */
 		if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
-		    (strncmp(zv->zv_name, name, namelen) == 0 &&
+		    (children && strncmp(zv->zv_name, name, namelen) == 0 &&
 		    (zv->zv_name[namelen] == '/' ||
 		    zv->zv_name[namelen] == '@'))) {
-			/*
-			 * By holding zv_state_lock here, we guarantee that no
-			 * one is currently using this zv
-			 */
 
 			/*
-			 * If in use, try to throw everyone off and try again
-			 * later.
+			 * Matched, so mark it removal. We want to take the
+			 * write half of the suspend lock to make sure that
+			 * the zvol is not suspended, and give any data ops
+			 * chance to finish.
 			 */
-			if (zv->zv_open_count > 0 ||
-			    atomic_read(&zv->zv_suspend_ref)) {
-				zv->zv_flags |= ZVOL_REMOVING;
-				t = taskq_dispatch(
-				    zv->zv_objset->os_spa->spa_zvol_taskq,
-				    zvol_remove_minor_task, zv, TQ_SLEEP);
-				if (t == TASKQID_INVALID) {
-					/*
-					 * Couldn't create the task, so we'll
-					 * do it in place once the loop is
-					 * finished.
-					 */
-					list_insert_head(&delay_list, zv);
-				}
+			mutex_exit(&zv->zv_state_lock);
+			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+			mutex_enter(&zv->zv_state_lock);
+
+			if (zv->zv_flags & ZVOL_REMOVING) {
+				/* Another thread has taken it, let them. */
 				mutex_exit(&zv->zv_state_lock);
+				rw_exit(&zv->zv_suspend_lock);
 				continue;
 			}
 
-			zvol_remove(zv);
-
 			/*
-			 * Cleared while holding zvol_state_lock as a writer
-			 * which will prevent zvol_open() from opening it.
+			 * Mark it and unlock. New entries will see the flag
+			 * and return ENXIO.
 			 */
-			zvol_os_clear_private(zv);
-
-			/* Drop zv_state_lock before zvol_free() */
+			zv->zv_flags |= ZVOL_REMOVING;
 			mutex_exit(&zv->zv_state_lock);
+			rw_exit(&zv->zv_suspend_lock);
 
-			/* Try parallel zv_free, if failed do it in place */
-			t = taskq_dispatch(system_taskq, zvol_free_task, zv,
-			    TQ_SLEEP);
-			if (t == TASKQID_INVALID)
-				list_insert_head(&free_list, zv);
-		} else {
+			/* Put it on the list for the next stage. */
+			list_insert_head(&remove_list, zv);
+		} else
 			mutex_exit(&zv->zv_state_lock);
-		}
 	}
-	rw_exit(&zvol_state_lock);
 
-	/* Wait for zvols that we couldn't create a remove task for */
-	while ((zv = list_remove_head(&delay_list)) != NULL)
-		zvol_remove_minor_task(zv);
-
-	/* Free any that we couldn't free in parallel earlier */
-	while ((zv = list_remove_head(&free_list)) != NULL)
-		zvol_os_free(zv);
-}
-
-/* Remove minor for this specific volume only */
-static int
-zvol_remove_minor_impl(const char *name)
-{
-	zvol_state_t *zv = NULL, *zv_next;
-
-	if (zvol_inhibit_dev)
-		return (0);
+	rw_exit(&zvol_state_lock);
 
-	rw_enter(&zvol_state_lock, RW_WRITER);
+	/* Didn't match any, nothing to do! */
+	if (list_is_empty(&remove_list)) {
+		if (task)
+			task->zt_error = SET_ERROR(ENOENT);
+		return;
+	}
 
-	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
-		zv_next = list_next(&zvol_state_list, zv);
+	/* Actually shut them all down. */
+	for (zv = list_head(&remove_list); zv != NULL; zv = zv_next) {
+		zv_next = list_next(&remove_list, zv);
 
 		mutex_enter(&zv->zv_state_lock);
-		if (strcmp(zv->zv_name, name) == 0)
-			/* Found, leave the the loop with zv_lock held */
-			break;
-		mutex_exit(&zv->zv_state_lock);
-	}
-
-	if (zv == NULL) {
-		rw_exit(&zvol_state_lock);
-		return (SET_ERROR(ENOENT));
-	}
 
-	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+		/*
+		 * Still open or suspended, just wait. This can happen if, for
+		 * example, we managed to acquire zv_state_lock in the moments
+		 * where zvol_open() or zvol_release() are trading locks to
+		 * call zvol_first_open() or zvol_last_close().
+		 */
+		while (zv->zv_open_count > 0 ||
+		    atomic_read(&zv->zv_suspend_ref))
+			cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock);
 
-	if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) {
 		/*
-		 * In use, so try to throw everyone off, then wait
-		 * until finished.
+		 * No users, shut down the OS side. This may not remove the
+		 * minor from view immediately, depending on the kernel
+		 * specifics, but it will ensure that it is unusable and that
+		 * this zvol_state_t can never again be reached from an OS-side
+		 * operation.
 		 */
-		zv->zv_flags |= ZVOL_REMOVING;
+		zvol_os_remove_minor(zv);
 		mutex_exit(&zv->zv_state_lock);
+
+		/* Remove it from the name lookup lists */
+		rw_enter(&zvol_state_lock, RW_WRITER);
+		zvol_remove(zv);
 		rw_exit(&zvol_state_lock);
-		zvol_remove_minor_task(zv);
-		return (0);
 	}
 
-	zvol_remove(zv);
-	zvol_os_clear_private(zv);
+	/*
+	 * Our own references on remove_list is the last one, free them and
+	 * we're done.
+	 */
+	while ((zv = list_remove_head(&remove_list)) != NULL)
+		zvol_os_free(zv);
 
-	mutex_exit(&zv->zv_state_lock);
-	rw_exit(&zvol_state_lock);
+	list_destroy(&remove_list);
+}
 
-	zvol_os_free(zv);
+/* Remove minor for this specific volume only */
+static int
+zvol_remove_minor_impl(const char *name)
+{
+	if (zvol_inhibit_dev)
+		return (0);
 
-	return (0);
+	zvol_task_t task;
+	memset(&task, 0, sizeof (zvol_task_t));
+	strlcpy(task.zt_name1, name, sizeof (task.zt_name1));
+	task.zt_value = B_FALSE;
+
+	zvol_remove_minors_impl(&task);
+
+	return (task.zt_error);
 }
 
 /*
@@ -2067,6 +2049,7 @@ zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
 	task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
 	task->zt_op = ZVOL_ASYNC_REMOVE_MINORS;
 	strlcpy(task->zt_name1, name, sizeof (task->zt_name1));
+	task->zt_value = B_TRUE;
 	id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
 	if ((async == B_FALSE) && (id != TASKQID_INVALID))
 		taskq_wait_id(spa->spa_zvol_taskq, id);
@@ -2188,14 +2171,6 @@ zvol_fini_impl(void)
 
 	zvol_remove_minors_impl(NULL);
 
-	/*
-	 * The call to "zvol_remove_minors_impl" may dispatch entries to
-	 * the system_taskq, but it doesn't wait for those entries to
-	 * complete before it returns. Thus, we must wait for all of the
-	 * removals to finish, before we can continue.
-	 */
-	taskq_wait_outstanding(system_taskq, 0);
-
 	kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
 	list_destroy(&zvol_state_list);
 	rw_destroy(&zvol_state_lock);