aboutsummaryrefslogtreecommitdiff
path: root/sys/contrib/openzfs/module/zfs
diff options
context:
space:
mode:
Diffstat (limited to 'sys/contrib/openzfs/module/zfs')
-rw-r--r--sys/contrib/openzfs/module/zfs/dbuf.c7
-rw-r--r--sys/contrib/openzfs/module/zfs/multilist.c4
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_config.c115
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_misc.c8
-rw-r--r--sys/contrib/openzfs/module/zfs/zil.c349
-rw-r--r--sys/contrib/openzfs/module/zfs/zio.c51
-rw-r--r--sys/contrib/openzfs/module/zfs/zvol.c281
7 files changed, 387 insertions, 428 deletions
diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c
index 3d0f88b36336..7403f10d91b7 100644
--- a/sys/contrib/openzfs/module/zfs/dbuf.c
+++ b/sys/contrib/openzfs/module/zfs/dbuf.c
@@ -2557,12 +2557,13 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
/*
* Due to our use of dn_nlevels below, this can only be called
- * in open context, unless we are operating on the MOS.
- * From syncing context, dn_nlevels may be different from the
- * dn_nlevels used when dbuf was dirtied.
+ * in open context, unless we are operating on the MOS or it's
+ * a special object. From syncing context, dn_nlevels may be
+ * different from the dn_nlevels used when dbuf was dirtied.
*/
ASSERT(db->db_objset ==
dmu_objset_pool(db->db_objset)->dp_meta_objset ||
+ DMU_OBJECT_IS_SPECIAL(db->db.db_object) ||
txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT0(db->db_level);
diff --git a/sys/contrib/openzfs/module/zfs/multilist.c b/sys/contrib/openzfs/module/zfs/multilist.c
index 7b85d19e19ee..46fb79269310 100644
--- a/sys/contrib/openzfs/module/zfs/multilist.c
+++ b/sys/contrib/openzfs/module/zfs/multilist.c
@@ -81,7 +81,7 @@ multilist_create_impl(multilist_t *ml, size_t size, size_t offset,
ml->ml_num_sublists = num;
ml->ml_index_func = index_func;
- ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
+ ml->ml_sublists = vmem_zalloc(sizeof (multilist_sublist_t) *
ml->ml_num_sublists, KM_SLEEP);
ASSERT3P(ml->ml_sublists, !=, NULL);
@@ -134,7 +134,7 @@ multilist_destroy(multilist_t *ml)
}
ASSERT3P(ml->ml_sublists, !=, NULL);
- kmem_free(ml->ml_sublists,
+ vmem_free(ml->ml_sublists,
sizeof (multilist_sublist_t) * ml->ml_num_sublists);
ml->ml_num_sublists = 0;
diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c
index 7d4d06659146..cf28955b0c50 100644
--- a/sys/contrib/openzfs/module/zfs/spa_config.c
+++ b/sys/contrib/openzfs/module/zfs/spa_config.c
@@ -48,18 +48,17 @@
/*
* Pool configuration repository.
*
- * Pool configuration is stored as a packed nvlist on the filesystem. By
- * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot
- * (when the ZFS module is loaded). Pools can also have the 'cachefile'
- * property set that allows them to be stored in an alternate location until
- * the control of external software.
+ * Pool configuration is stored as a packed nvlist on the filesystem. When
+ * pools are imported they are added to the /etc/zfs/zpool.cache file and
+ * removed from it when exported. For each cache file, we have a single nvlist
+ * which holds all the configuration information. Pools can also have the
+ * 'cachefile' property set which allows this config to be stored in an
+ * alternate location under the control of external software.
*
- * For each cache file, we have a single nvlist which holds all the
- * configuration information. When the module loads, we read this information
- * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is
- * maintained independently in spa.c. Whenever the namespace is modified, or
- * the configuration of a pool is changed, we call spa_write_cachefile(), which
- * walks through all the active pools and writes the configuration to disk.
+ * The kernel independantly maintains an AVL tree of imported pools. See the
+ * "SPA locking" comment in spa.c. Whenever a pool configuration is modified
+ * we call spa_write_cachefile() which walks through all the active pools and
+ * writes the updated configuration to to /etc/zfs/zpool.cache file.
*/
static uint64_t spa_config_generation = 1;
@@ -69,94 +68,6 @@ static uint64_t spa_config_generation = 1;
* userland pools when doing testing.
*/
char *spa_config_path = (char *)ZPOOL_CACHE;
-#ifdef _KERNEL
-static int zfs_autoimport_disable = B_TRUE;
-#endif
-
-/*
- * Called when the module is first loaded, this routine loads the configuration
- * file into the SPA namespace. It does not actually open or load the pools; it
- * only populates the namespace.
- */
-void
-spa_config_load(void)
-{
- void *buf = NULL;
- nvlist_t *nvlist, *child;
- nvpair_t *nvpair;
- char *pathname;
- zfs_file_t *fp;
- zfs_file_attr_t zfa;
- uint64_t fsize;
- int err;
-
-#ifdef _KERNEL
- if (zfs_autoimport_disable)
- return;
-#endif
-
- /*
- * Open the configuration file.
- */
- pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-
- (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path);
-
- err = zfs_file_open(pathname, O_RDONLY, 0, &fp);
-
-#ifdef __FreeBSD__
- if (err)
- err = zfs_file_open(ZPOOL_CACHE_BOOT, O_RDONLY, 0, &fp);
-#endif
- kmem_free(pathname, MAXPATHLEN);
-
- if (err)
- return;
-
- if (zfs_file_getattr(fp, &zfa))
- goto out;
-
- fsize = zfa.zfa_size;
- buf = kmem_alloc(fsize, KM_SLEEP);
-
- /*
- * Read the nvlist from the file.
- */
- if (zfs_file_read(fp, buf, fsize, NULL) < 0)
- goto out;
-
- /*
- * Unpack the nvlist.
- */
- if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
- goto out;
-
- /*
- * Iterate over all elements in the nvlist, creating a new spa_t for
- * each one with the specified configuration.
- */
- mutex_enter(&spa_namespace_lock);
- nvpair = NULL;
- while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
- if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
- continue;
-
- child = fnvpair_value_nvlist(nvpair);
-
- if (spa_lookup(nvpair_name(nvpair)) != NULL)
- continue;
- (void) spa_add(nvpair_name(nvpair), child, NULL);
- }
- mutex_exit(&spa_namespace_lock);
-
- nvlist_free(nvlist);
-
-out:
- if (buf != NULL)
- kmem_free(buf, fsize);
-
- zfs_file_close(fp);
-}
static int
spa_config_remove(spa_config_dirent_t *dp)
@@ -623,7 +534,6 @@ spa_config_update(spa_t *spa, int what)
spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
}
-EXPORT_SYMBOL(spa_config_load);
EXPORT_SYMBOL(spa_all_configs);
EXPORT_SYMBOL(spa_config_set);
EXPORT_SYMBOL(spa_config_generate);
@@ -634,8 +544,3 @@ EXPORT_SYMBOL(spa_config_update);
ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD,
"SPA config file (/etc/zfs/zpool.cache)");
#endif
-
-#ifdef _KERNEL
-ZFS_MODULE_PARAM(zfs, zfs_, autoimport_disable, INT, ZMOD_RW,
- "Disable pool import at module load");
-#endif
diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c
index cce772eae598..dceafbc27556 100644
--- a/sys/contrib/openzfs/module/zfs/spa_misc.c
+++ b/sys/contrib/openzfs/module/zfs/spa_misc.c
@@ -2548,13 +2548,6 @@ spa_name_compare(const void *a1, const void *a2)
}
void
-spa_boot_init(void *unused)
-{
- (void) unused;
- spa_config_load();
-}
-
-void
spa_init(spa_mode_t mode)
{
mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -2607,7 +2600,6 @@ spa_init(spa_mode_t mode)
chksum_init();
zpool_prop_init();
zpool_feature_init();
- spa_config_load();
vdev_prop_init();
l2arc_start();
scan_init();
diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c
index 31b59c55f17b..0307df55aa21 100644
--- a/sys/contrib/openzfs/module/zfs/zil.c
+++ b/sys/contrib/openzfs/module/zfs/zil.c
@@ -819,34 +819,37 @@ zil_lwb_vdev_compare(const void *x1, const void *x2)
* we choose them here and later make the block allocation match.
*/
static lwb_t *
-zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
- uint64_t txg, lwb_state_t state)
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, int min_sz, int sz,
+ boolean_t slog, uint64_t txg)
{
lwb_t *lwb;
lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+ lwb->lwb_flags = 0;
lwb->lwb_zilog = zilog;
if (bp) {
lwb->lwb_blk = *bp;
- lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2);
+ if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2)
+ lwb->lwb_flags |= LWB_FLAG_SLIM;
sz = BP_GET_LSIZE(bp);
+ lwb->lwb_min_sz = sz;
} else {
BP_ZERO(&lwb->lwb_blk);
- lwb->lwb_slim = (spa_version(zilog->zl_spa) >=
- SPA_VERSION_SLIM_ZIL);
+ if (spa_version(zilog->zl_spa) >= SPA_VERSION_SLIM_ZIL)
+ lwb->lwb_flags |= LWB_FLAG_SLIM;
+ lwb->lwb_min_sz = min_sz;
}
- lwb->lwb_slog = slog;
+ if (slog)
+ lwb->lwb_flags |= LWB_FLAG_SLOG;
lwb->lwb_error = 0;
- if (lwb->lwb_slim) {
- lwb->lwb_nmax = sz;
- lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
- } else {
- lwb->lwb_nmax = sz - sizeof (zil_chain_t);
- lwb->lwb_nused = lwb->lwb_nfilled = 0;
- }
+ /*
+ * Buffer allocation and capacity setup will be done in
+ * zil_lwb_write_open() when the LWB is opened for ITX assignment.
+ */
+ lwb->lwb_nmax = lwb->lwb_nused = lwb->lwb_nfilled = 0;
lwb->lwb_sz = sz;
- lwb->lwb_state = state;
- lwb->lwb_buf = zio_buf_alloc(sz);
+ lwb->lwb_buf = NULL;
+ lwb->lwb_state = LWB_STATE_NEW;
lwb->lwb_child_zio = NULL;
lwb->lwb_write_zio = NULL;
lwb->lwb_root_zio = NULL;
@@ -857,8 +860,6 @@ zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
mutex_enter(&zilog->zl_lock);
list_insert_tail(&zilog->zl_lwb_list, lwb);
- if (state != LWB_STATE_NEW)
- zilog->zl_last_lwb_opened = lwb;
mutex_exit(&zilog->zl_lock);
return (lwb);
@@ -878,7 +879,7 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
VERIFY(list_is_empty(&lwb->lwb_itxs));
VERIFY(list_is_empty(&lwb->lwb_waiters));
ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
- ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
+ ASSERT(!MUTEX_HELD(&lwb->lwb_lock));
/*
* Clear the zilog's field to indicate this lwb is no longer
@@ -1019,7 +1020,7 @@ zil_create(zilog_t *zilog)
}
error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk,
- ZIL_MIN_BLKSZ, &slog);
+ ZIL_MIN_BLKSZ, ZIL_MIN_BLKSZ, &slog, B_TRUE);
if (error == 0)
zil_init_log_chain(zilog, &blk);
}
@@ -1028,7 +1029,7 @@ zil_create(zilog_t *zilog)
* Allocate a log write block (lwb) for the first log block.
*/
if (error == 0)
- lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW);
+ lwb = zil_alloc_lwb(zilog, &blk, 0, 0, slog, txg);
/*
* If we just allocated the first log block, commit our transaction
@@ -1324,10 +1325,12 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
* zil_commit() is racing with spa_sync().
*/
static void
-zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
+zil_commit_waiter_done(zil_commit_waiter_t *zcw, int err)
{
mutex_enter(&zcw->zcw_lock);
ASSERT3B(zcw->zcw_done, ==, B_FALSE);
+ zcw->zcw_lwb = NULL;
+ zcw->zcw_error = err;
zcw->zcw_done = B_TRUE;
cv_broadcast(&zcw->zcw_cv);
mutex_exit(&zcw->zcw_lock);
@@ -1389,7 +1392,7 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
if (zil_nocacheflush)
return;
- mutex_enter(&lwb->lwb_vdev_lock);
+ mutex_enter(&lwb->lwb_lock);
for (i = 0; i < ndvas; i++) {
zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
if (avl_find(t, &zvsearch, &where) == NULL) {
@@ -1398,7 +1401,7 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
avl_insert(t, zv, where);
}
}
- mutex_exit(&lwb->lwb_vdev_lock);
+ mutex_exit(&lwb->lwb_lock);
}
static void
@@ -1415,12 +1418,12 @@ zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
/*
* While 'lwb' is at a point in its lifetime where lwb_vdev_tree does
- * not need the protection of lwb_vdev_lock (it will only be modified
+ * not need the protection of lwb_lock (it will only be modified
* while holding zilog->zl_lock) as its writes and those of its
* children have all completed. The younger 'nlwb' may be waiting on
* future writes to additional vdevs.
*/
- mutex_enter(&nlwb->lwb_vdev_lock);
+ mutex_enter(&nlwb->lwb_lock);
/*
* Tear down the 'lwb' vdev tree, ensuring that entries which do not
* exist in 'nlwb' are moved to it, freeing any would-be duplicates.
@@ -1434,7 +1437,7 @@ zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
kmem_free(zv, sizeof (*zv));
}
}
- mutex_exit(&nlwb->lwb_vdev_lock);
+ mutex_exit(&nlwb->lwb_lock);
}
void
@@ -1491,10 +1494,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
zil_itx_destroy(itx, 0);
while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
- mutex_enter(&zcw->zcw_lock);
-
- ASSERT3P(zcw->zcw_lwb, ==, lwb);
- zcw->zcw_lwb = NULL;
/*
* We expect any ZIO errors from child ZIOs to have been
* propagated "up" to this specific LWB's root ZIO, in
@@ -1509,14 +1508,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
* errors not being handled correctly here. See the
* comment above the call to "zio_flush" for details.
*/
-
- zcw->zcw_zio_error = zio->io_error;
-
- ASSERT3B(zcw->zcw_done, ==, B_FALSE);
- zcw->zcw_done = B_TRUE;
- cv_broadcast(&zcw->zcw_cv);
-
- mutex_exit(&zcw->zcw_lock);
+ zil_commit_waiter_done(zcw, zio->io_error);
}
uint64_t txg = lwb->lwb_issued_txg;
@@ -1588,7 +1580,7 @@ zil_lwb_write_done(zio_t *zio)
avl_tree_t *t = &lwb->lwb_vdev_tree;
void *cookie = NULL;
zil_vdev_node_t *zv;
- lwb_t *nlwb;
+ lwb_t *nlwb = NULL;
ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
@@ -1608,9 +1600,11 @@ zil_lwb_write_done(zio_t *zio)
* its write ZIO a parent this ZIO. In such case we can not defer
* our flushes or below may be a race between the done callbacks.
*/
- nlwb = list_next(&zilog->zl_lwb_list, lwb);
- if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED)
- nlwb = NULL;
+ if (!(lwb->lwb_flags & LWB_FLAG_CRASHED)) {
+ nlwb = list_next(&zilog->zl_lwb_list, lwb);
+ if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED)
+ nlwb = NULL;
+ }
mutex_exit(&zilog->zl_lock);
if (avl_numnodes(t) == 0)
@@ -1624,12 +1618,17 @@ zil_lwb_write_done(zio_t *zio)
* written out.
*
* Additionally, we don't perform any further error handling at
- * this point (e.g. setting "zcw_zio_error" appropriately), as
- * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus,
- * we expect any error seen here, to have been propagated to
- * that function).
+ * this point (e.g. setting "zcw_error" appropriately), as we
+ * expect that to occur in "zil_lwb_flush_vdevs_done" (thus, we
+ * expect any error seen here, to have been propagated to that
+ * function).
+ *
+ * Note that we treat a "crashed" LWB as though it was in error,
+ * even if it did appear to succeed, because we've already
+ * signaled error and cleaned up waiters and committers in
+ * zil_crash(); we just want to clean up and get out of here.
*/
- if (zio->io_error != 0) {
+ if (zio->io_error != 0 || (lwb->lwb_flags & LWB_FLAG_CRASHED)) {
while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
kmem_free(zv, sizeof (*zv));
return;
@@ -1742,10 +1741,26 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
return;
}
+ mutex_enter(&lwb->lwb_lock);
mutex_enter(&zilog->zl_lock);
lwb->lwb_state = LWB_STATE_OPENED;
zilog->zl_last_lwb_opened = lwb;
mutex_exit(&zilog->zl_lock);
+ mutex_exit(&lwb->lwb_lock);
+
+ /*
+ * Allocate buffer and set up LWB capacities.
+ */
+ ASSERT0P(lwb->lwb_buf);
+ ASSERT3U(lwb->lwb_sz, >, 0);
+ lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
+ if (lwb->lwb_flags & LWB_FLAG_SLIM) {
+ lwb->lwb_nmax = lwb->lwb_sz;
+ lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
+ } else {
+ lwb->lwb_nmax = lwb->lwb_sz - sizeof (zil_chain_t);
+ lwb->lwb_nused = lwb->lwb_nfilled = 0;
+ }
}
/*
@@ -1762,6 +1777,8 @@ static uint_t
zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize)
{
uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t);
+ uint_t waste = zil_max_waste_space(zilog);
+ waste = MAX(waste, zilog->zl_cur_max);
if (size <= md) {
/*
@@ -1772,9 +1789,10 @@ zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize)
} else if (size > 8 * md) {
/*
* Big bursts use maximum blocks. The first block size
- * is hard to predict, but it does not really matter.
+ * is hard to predict, but we need at least enough space
+ * to make reasonable progress.
*/
- *minsize = 0;
+ *minsize = waste;
return (md);
}
@@ -1787,57 +1805,52 @@ zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize)
uint_t s = size;
uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t));
uint_t chunk = DIV_ROUND_UP(s, n);
- uint_t waste = zil_max_waste_space(zilog);
- waste = MAX(waste, zilog->zl_cur_max);
if (chunk <= md - waste) {
*minsize = MAX(s - (md - waste) * (n - 1), waste);
return (chunk);
} else {
- *minsize = 0;
+ *minsize = waste;
return (md);
}
}
/*
* Try to predict next block size based on previous history. Make prediction
- * sufficient for 7 of 8 previous bursts. Don't try to save if the saving is
- * less then 50%, extra writes may cost more, but we don't want single spike
- * to badly affect our predictions.
+ * sufficient for 7 of 8 previous bursts, but don't try to save if the saving
+ * is less then 50%. Extra writes may cost more, but we don't want single
+ * spike to badly affect our predictions.
*/
-static uint_t
-zil_lwb_predict(zilog_t *zilog)
+static void
+zil_lwb_predict(zilog_t *zilog, uint64_t *min_predict, uint64_t *max_predict)
{
- uint_t m, o;
+ uint_t m1 = 0, m2 = 0, o;
- /* If we are in the middle of a burst, take it into account also. */
- if (zilog->zl_cur_size > 0) {
- o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m);
- } else {
+ /* If we are in the middle of a burst, take it as another data point. */
+ if (zilog->zl_cur_size > 0)
+ o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m1);
+ else
o = UINT_MAX;
- m = 0;
- }
- /* Find minimum optimal size. We don't need to go below that. */
- for (int i = 0; i < ZIL_BURSTS; i++)
- o = MIN(o, zilog->zl_prev_opt[i]);
-
- /* Find two biggest minimal first block sizes above the optimal. */
- uint_t m1 = MAX(m, o), m2 = o;
+ /* Find two largest minimal first block sizes. */
for (int i = 0; i < ZIL_BURSTS; i++) {
- m = zilog->zl_prev_min[i];
- if (m >= m1) {
+ uint_t cur = zilog->zl_prev_min[i];
+ if (cur >= m1) {
m2 = m1;
- m1 = m;
- } else if (m > m2) {
- m2 = m;
+ m1 = cur;
+ } else if (cur > m2) {
+ m2 = cur;
}
}
- /*
- * If second minimum size gives 50% saving -- use it. It may cost us
- * one additional write later, but the space saving is just too big.
- */
- return ((m1 < m2 * 2) ? m1 : m2);
+ /* Minimum should guarantee progress in most cases. */
+ *min_predict = (m1 < m2 * 2) ? m1 : m2;
+
+ /* Maximum doesn't need to go below the minimum optimal size. */
+ for (int i = 0; i < ZIL_BURSTS; i++)
+ o = MIN(o, zilog->zl_prev_opt[i]);
+ m1 = MAX(m1, o);
+ m2 = MAX(m2, o);
+ *max_predict = (m1 < m2 * 2) ? m1 : m2;
}
/*
@@ -1845,12 +1858,13 @@ zil_lwb_predict(zilog_t *zilog)
* Has to be called under zl_issuer_lock to chain more lwbs.
*/
static lwb_t *
-zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
+zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb)
{
- uint64_t blksz, plan, plan2;
+ uint64_t minbs, maxbs;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+ membar_producer();
lwb->lwb_state = LWB_STATE_CLOSED;
/*
@@ -1875,27 +1889,34 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
* Try to predict what can it be and plan for the worst case.
*/
uint_t m;
- plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m);
+ maxbs = zil_lwb_plan(zilog, zilog->zl_cur_left, &m);
+ minbs = m;
if (zilog->zl_parallel) {
- plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left +
- zil_lwb_predict(zilog), &m);
- if (plan < plan2)
- plan = plan2;
+ uint64_t minp, maxp;
+ zil_lwb_predict(zilog, &minp, &maxp);
+ maxp = zil_lwb_plan(zilog, zilog->zl_cur_left + maxp,
+ &m);
+ if (maxbs < maxp)
+ maxbs = maxp;
}
} else {
/*
* The previous burst is done and we can only predict what
* will come next.
*/
- plan = zil_lwb_predict(zilog);
+ zil_lwb_predict(zilog, &minbs, &maxbs);
}
- blksz = plan + sizeof (zil_chain_t);
- blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t);
- blksz = MIN(blksz, zilog->zl_max_block_size);
- DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz,
- uint64_t, plan);
- return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state));
+ minbs += sizeof (zil_chain_t);
+ maxbs += sizeof (zil_chain_t);
+ minbs = P2ROUNDUP_TYPED(minbs, ZIL_MIN_BLKSZ, uint64_t);
+ maxbs = P2ROUNDUP_TYPED(maxbs, ZIL_MIN_BLKSZ, uint64_t);
+ maxbs = MIN(maxbs, zilog->zl_max_block_size);
+ minbs = MIN(minbs, maxbs);
+ DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, minbs,
+ uint64_t, maxbs);
+
+ return (zil_alloc_lwb(zilog, NULL, minbs, maxbs, 0, 0));
}
/*
@@ -1944,14 +1965,16 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
mutex_exit(&zilog->zl_lock);
next_lwb:
- if (lwb->lwb_slim)
+ if (lwb->lwb_flags & LWB_FLAG_SLIM)
zilc = (zil_chain_t *)lwb->lwb_buf;
else
zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax);
- int wsz = lwb->lwb_sz;
+ uint64_t alloc_size = BP_GET_LSIZE(&lwb->lwb_blk);
+ int wsz = alloc_size;
if (lwb->lwb_error == 0) {
abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz);
- if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk)
+ if (!(lwb->lwb_flags & LWB_FLAG_SLOG) ||
+ zilog->zl_cur_size <= zil_slog_bulk)
prio = ZIO_PRIORITY_SYNC_WRITE;
else
prio = ZIO_PRIORITY_ASYNC_WRITE;
@@ -1959,16 +1982,17 @@ next_lwb:
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0,
- &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done,
+ &lwb->lwb_blk, lwb_abd, alloc_size, zil_lwb_write_done,
lwb, prio, ZIO_FLAG_CANFAIL, &zb);
zil_lwb_add_block(lwb, &lwb->lwb_blk);
- if (lwb->lwb_slim) {
+ if (lwb->lwb_flags & LWB_FLAG_SLIM) {
/* For Slim ZIL only write what is used. */
wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ,
int);
- ASSERT3S(wsz, <=, lwb->lwb_sz);
- zio_shrink(lwb->lwb_write_zio, wsz);
+ ASSERT3S(wsz, <=, alloc_size);
+ if (wsz < alloc_size)
+ zio_shrink(lwb->lwb_write_zio, wsz);
wsz = lwb->lwb_write_zio->io_size;
}
memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused);
@@ -2004,13 +2028,53 @@ next_lwb:
BP_ZERO(bp);
error = lwb->lwb_error;
if (error == 0) {
- error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz,
- &slog);
+ /*
+ * Allocation flexibility depends on LWB state:
+ * if NEW: allow range allocation and larger sizes;
+ * if OPENED: use fixed predetermined allocation size;
+ * if CLOSED + Slim: allocate precisely for actual usage.
+ */
+ boolean_t flexible = (nlwb->lwb_state == LWB_STATE_NEW);
+ if (flexible) {
+ /* We need to prevent opening till we update lwb_sz. */
+ mutex_enter(&nlwb->lwb_lock);
+ flexible = (nlwb->lwb_state == LWB_STATE_NEW);
+ if (!flexible)
+ mutex_exit(&nlwb->lwb_lock); /* We lost. */
+ }
+ boolean_t closed_slim = (nlwb->lwb_state == LWB_STATE_CLOSED &&
+ (lwb->lwb_flags & LWB_FLAG_SLIM));
+
+ uint64_t min_size, max_size;
+ if (closed_slim) {
+ /* This transition is racy, but only one way. */
+ membar_consumer();
+ min_size = max_size = P2ROUNDUP_TYPED(nlwb->lwb_nused,
+ ZIL_MIN_BLKSZ, uint64_t);
+ } else if (flexible) {
+ min_size = nlwb->lwb_min_sz;
+ max_size = nlwb->lwb_sz;
+ } else {
+ min_size = max_size = nlwb->lwb_sz;
+ }
+
+ error = zio_alloc_zil(spa, zilog->zl_os, txg, bp,
+ min_size, max_size, &slog, flexible);
+ if (error == 0) {
+ if (closed_slim)
+ ASSERT3U(BP_GET_LSIZE(bp), ==, max_size);
+ else if (flexible)
+ nlwb->lwb_sz = BP_GET_LSIZE(bp);
+ else
+ ASSERT3U(BP_GET_LSIZE(bp), ==, nlwb->lwb_sz);
+ }
+ if (flexible)
+ mutex_exit(&nlwb->lwb_lock);
}
if (error == 0) {
ASSERT3U(BP_GET_BIRTH(bp), ==, txg);
- BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 :
- ZIO_CHECKSUM_ZILOG);
+ BP_SET_CHECKSUM(bp, (nlwb->lwb_flags & LWB_FLAG_SLIM) ?
+ ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
bp->blk_cksum = lwb->lwb_blk.blk_cksum;
bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
}
@@ -2039,14 +2103,15 @@ next_lwb:
if (nlwb) {
nlwb->lwb_blk = *bp;
nlwb->lwb_error = error;
- nlwb->lwb_slog = slog;
+ if (slog)
+ nlwb->lwb_flags |= LWB_FLAG_SLOG;
nlwb->lwb_alloc_txg = txg;
if (nlwb->lwb_state != LWB_STATE_READY)
nlwb = NULL;
}
mutex_exit(&zilog->zl_lock);
- if (lwb->lwb_slog) {
+ if (lwb->lwb_flags & LWB_FLAG_SLOG) {
ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes,
lwb->lwb_nused);
@@ -2220,7 +2285,6 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs)
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT3P(lwb, !=, NULL);
- ASSERT3P(lwb->lwb_buf, !=, NULL);
zil_lwb_write_open(zilog, lwb);
@@ -2262,9 +2326,10 @@ cont:
(dlen % max_log_data == 0 ||
lwb_sp < reclen + dlen % max_log_data))) {
list_insert_tail(ilwbs, lwb);
- lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED);
+ lwb = zil_lwb_write_close(zilog, lwb);
if (lwb == NULL)
return (NULL);
+ zil_lwb_write_open(zilog, lwb);
lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
}
@@ -2554,7 +2619,7 @@ zil_itxg_clean(void *arg)
* called) we will hit this case.
*/
if (itx->itx_lr.lrc_txtype == TX_COMMIT)
- zil_commit_waiter_skip(itx->itx_private);
+ zil_commit_waiter_done(itx->itx_private, 0);
zil_itx_destroy(itx, 0);
}
@@ -2742,6 +2807,7 @@ zil_crash_clean(zilog_t *zilog, uint64_t synced_txg)
}
/* This LWB is from the past, so we can clean it up now. */
+ ASSERT(lwb->lwb_flags & LWB_FLAG_CRASHED);
list_remove(&zilog->zl_lwb_crash_list, lwb);
if (lwb->lwb_buf != NULL)
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
@@ -2981,7 +3047,7 @@ zil_prune_commit_list(zilog_t *zilog)
* never any itx's for it to wait on), so it's
* safe to skip this waiter and mark it done.
*/
- zil_commit_waiter_skip(itx->itx_private);
+ zil_commit_waiter_done(itx->itx_private, 0);
} else {
zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
}
@@ -3212,15 +3278,21 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
* "next" lwb on-disk. When this happens, we must stall
* the ZIL write pipeline; see the comment within
* zil_commit_writer_stall() for more details.
+ *
+ * ESHUTDOWN has to be handled carefully here. If we get it,
+ * then the pool suspended and zil_crash() was called, so we
+ * need to stop trying and just get an error back to the
+ * callers.
*/
int err = 0;
while ((lwb = list_remove_head(ilwbs)) != NULL) {
- err = zil_lwb_write_issue(zilog, lwb);
- if (err != 0)
- break;
+ if (err == 0)
+ err = zil_lwb_write_issue(zilog, lwb);
}
- if (err == 0)
+ if (err != ESHUTDOWN)
err = zil_commit_writer_stall(zilog);
+ if (err == ESHUTDOWN)
+ err = SET_ERROR(EIO);
/*
* Additionally, we have to signal and mark the "nolwb"
@@ -3230,7 +3302,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
*/
zil_commit_waiter_t *zcw;
while ((zcw = list_remove_head(&nolwb_waiters)) != NULL)
- zil_commit_waiter_skip(zcw);
+ zil_commit_waiter_done(zcw, err);
/*
* And finally, we have to destroy the itx's that
@@ -3238,7 +3310,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
* the itx's callback if one exists for the itx.
*/
while ((itx = list_remove_head(&nolwb_itxs)) != NULL)
- zil_itx_destroy(itx, 0);
+ zil_itx_destroy(itx, err);
} else {
ASSERT(list_is_empty(&nolwb_waiters));
ASSERT3P(lwb, !=, NULL);
@@ -3292,17 +3364,17 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
(!zilog->zl_parallel || zilog->zl_suspend > 0)) {
zil_burst_done(zilog);
list_insert_tail(ilwbs, lwb);
- lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
+ lwb = zil_lwb_write_close(zilog, lwb);
if (lwb == NULL) {
int err = 0;
while ((lwb =
list_remove_head(ilwbs)) != NULL) {
- err = zil_lwb_write_issue(zilog, lwb);
- if (err != 0)
- break;
+ if (err == 0)
+ err = zil_lwb_write_issue(
+ zilog, lwb);
}
- if (err == 0)
- zil_commit_writer_stall(zilog);
+ if (err != ESHUTDOWN)
+ (void) zil_commit_writer_stall(zilog);
}
}
}
@@ -3470,7 +3542,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* hasn't been issued.
*/
zil_burst_done(zilog);
- lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
+ lwb_t *nlwb = zil_lwb_write_close(zilog, lwb);
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
@@ -3546,7 +3618,7 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
* commit itxs. When this occurs, the commit waiters linked
* off of these commit itxs will not be committed to an
* lwb. Additionally, these commit waiters will not be
- * marked done until zil_commit_waiter_skip() is called via
+ * marked done until zil_commit_waiter_done() is called via
* zil_itxg_clean().
*
* Thus, it's possible for this commit waiter (i.e. the
@@ -3624,7 +3696,7 @@ zil_alloc_commit_waiter(void)
list_link_init(&zcw->zcw_node);
zcw->zcw_lwb = NULL;
zcw->zcw_done = B_FALSE;
- zcw->zcw_zio_error = 0;
+ zcw->zcw_error = 0;
return (zcw);
}
@@ -3728,6 +3800,9 @@ zil_crash(zilog_t *zilog)
*/
for (lwb_t *lwb = list_head(&zilog->zl_lwb_crash_list); lwb != NULL;
lwb = list_next(&zilog->zl_lwb_crash_list, lwb)) {
+ ASSERT(!(lwb->lwb_flags & LWB_FLAG_CRASHED));
+ lwb->lwb_flags |= LWB_FLAG_CRASHED;
+
itx_t *itx;
while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL)
zil_itx_destroy(itx, EIO);
@@ -3736,7 +3811,7 @@ zil_crash(zilog_t *zilog)
while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
mutex_enter(&zcw->zcw_lock);
zcw->zcw_lwb = NULL;
- zcw->zcw_zio_error = EIO;
+ zcw->zcw_error = EIO;
zcw->zcw_done = B_TRUE;
cv_broadcast(&zcw->zcw_cv);
mutex_exit(&zcw->zcw_lock);
@@ -4014,7 +4089,7 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid)
zil_commit_waiter(zilog, zcw);
int err = 0;
- if (zcw->zcw_zio_error != 0) {
+ if (zcw->zcw_error != 0) {
/*
* If there was an error writing out the ZIL blocks that
* this thread is waiting on, then we fallback to
@@ -4149,7 +4224,7 @@ zil_lwb_cons(void *vbuf, void *unused, int kmflag)
offsetof(zil_commit_waiter_t, zcw_node));
avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
- mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&lwb->lwb_lock, NULL, MUTEX_DEFAULT, NULL);
return (0);
}
@@ -4158,7 +4233,7 @@ zil_lwb_dest(void *vbuf, void *unused)
{
(void) unused;
lwb_t *lwb = vbuf;
- mutex_destroy(&lwb->lwb_vdev_lock);
+ mutex_destroy(&lwb->lwb_lock);
avl_destroy(&lwb->lwb_vdev_tree);
list_destroy(&lwb->lwb_waiters);
list_destroy(&lwb->lwb_itxs);
@@ -4381,7 +4456,7 @@ zil_close(zilog_t *zilog)
if (lwb != NULL) {
ASSERT(list_is_empty(&zilog->zl_lwb_list));
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW);
- zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+ ASSERT0P(lwb->lwb_buf);
zil_free_lwb(zilog, lwb);
}
mutex_exit(&zilog->zl_lock);
@@ -4472,16 +4547,16 @@ zil_suspend(const char *osname, void **cookiep)
cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
mutex_exit(&zilog->zl_lock);
- if (cookiep == NULL)
+ if (zilog->zl_restart_txg > 0) {
+ /* ZIL crashed while we were waiting. */
+ zil_resume(os);
+ error = SET_ERROR(EBUSY);
+ } else if (cookiep == NULL)
zil_resume(os);
else
*cookiep = os;
- if (zilog->zl_restart_txg > 0)
- /* ZIL crashed while we were waiting. */
- return (SET_ERROR(EBUSY));
-
- return (0);
+ return (error);
}
/*
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
index 3f0ddb63249d..4cf8912d4269 100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -4434,12 +4434,15 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
*/
int
zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
- uint64_t size, boolean_t *slog)
+ uint64_t min_size, uint64_t max_size, boolean_t *slog,
+ boolean_t allow_larger)
{
int error;
zio_alloc_list_t io_alloc_list;
+ uint64_t alloc_size = 0;
ASSERT(txg > spa_syncing_txg(spa));
+ ASSERT3U(min_size, <=, max_size);
metaslab_trace_init(&io_alloc_list);
@@ -4448,7 +4451,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
* Fill in the obvious ones before calling into metaslab_alloc().
*/
BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
- BP_SET_PSIZE(new_bp, size);
+ BP_SET_PSIZE(new_bp, max_size);
BP_SET_LEVEL(new_bp, 0);
/*
@@ -4463,43 +4466,51 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
ZIOSTAT_BUMP(ziostat_total_allocations);
/* Try log class (dedicated slog devices) first */
- error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
- txg, NULL, flags, &io_alloc_list, allocator, NULL);
+ error = metaslab_alloc_range(spa, spa_log_class(spa), min_size,
+ max_size, new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
+ NULL, &alloc_size);
*slog = (error == 0);
/* Try special_embedded_log class (reserved on special vdevs) */
if (error != 0) {
- error = metaslab_alloc(spa, spa_special_embedded_log_class(spa),
- size, new_bp, 1, txg, NULL, flags, &io_alloc_list,
- allocator, NULL);
+ error = metaslab_alloc_range(spa,
+ spa_special_embedded_log_class(spa), min_size, max_size,
+ new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
+ NULL, &alloc_size);
}
/* Try special class (general special vdev allocation) */
if (error != 0) {
- error = metaslab_alloc(spa, spa_special_class(spa), size,
- new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
- NULL);
+ error = metaslab_alloc_range(spa, spa_special_class(spa),
+ min_size, max_size, new_bp, 1, txg, NULL, flags,
+ &io_alloc_list, allocator, NULL, &alloc_size);
}
/* Try embedded_log class (reserved on normal vdevs) */
if (error != 0) {
- error = metaslab_alloc(spa, spa_embedded_log_class(spa), size,
- new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
- NULL);
+ error = metaslab_alloc_range(spa, spa_embedded_log_class(spa),
+ min_size, max_size, new_bp, 1, txg, NULL, flags,
+ &io_alloc_list, allocator, NULL, &alloc_size);
}
/* Finally fall back to normal class */
if (error != 0) {
ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks);
- error = metaslab_alloc(spa, spa_normal_class(spa), size,
- new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
- NULL);
+ error = metaslab_alloc_range(spa, spa_normal_class(spa),
+ min_size, max_size, new_bp, 1, txg, NULL, flags,
+ &io_alloc_list, allocator, NULL, &alloc_size);
}
metaslab_trace_fini(&io_alloc_list);
if (error == 0) {
- BP_SET_LSIZE(new_bp, size);
- BP_SET_PSIZE(new_bp, size);
+ if (!allow_larger)
+ alloc_size = MIN(alloc_size, max_size);
+ else if (max_size <= SPA_OLD_MAXBLOCKSIZE)
+ alloc_size = MIN(alloc_size, SPA_OLD_MAXBLOCKSIZE);
+ alloc_size = P2ALIGN_TYPED(alloc_size, ZIL_MIN_BLKSZ, uint64_t);
+
+ BP_SET_LSIZE(new_bp, alloc_size);
+ BP_SET_PSIZE(new_bp, alloc_size);
BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
BP_SET_CHECKSUM(new_bp,
spa_version(spa) >= SPA_VERSION_SLIM_ZIL
@@ -4527,8 +4538,8 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
}
} else {
zfs_dbgmsg("%s: zil block allocation failure: "
- "size %llu, error %d", spa_name(spa), (u_longlong_t)size,
- error);
+ "min_size %llu, max_size %llu, error %d", spa_name(spa),
+ (u_longlong_t)min_size, (u_longlong_t)max_size, error);
}
return (error);
diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c
index 29f51e230a37..2fd3e1c37045 100644
--- a/sys/contrib/openzfs/module/zfs/zvol.c
+++ b/sys/contrib/openzfs/module/zfs/zvol.c
@@ -38,25 +38,36 @@
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
- * Copyright (c) 2024, Klara, Inc.
+ * Copyright (c) 2024, 2025, Klara, Inc.
*/
/*
* Note on locking of zvol state structures.
*
- * These structures are used to maintain internal state used to emulate block
- * devices on top of zvols. In particular, management of device minor number
- * operations - create, remove, rename, and set_snapdev - involves access to
- * these structures. The zvol_state_lock is primarily used to protect the
- * zvol_state_list. The zv->zv_state_lock is used to protect the contents
- * of the zvol_state_t structures, as well as to make sure that when the
- * time comes to remove the structure from the list, it is not in use, and
- * therefore, it can be taken off zvol_state_list and freed.
+ * zvol_state_t represents the connection between a single dataset
+ * (DMU_OST_ZVOL) and the device "minor" (some OS-specific representation of a
+ * "disk" or "device" or "volume", eg, a /dev/zdXX node, a GEOM object, etc).
*
- * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol,
- * e.g. for the duration of receive and rollback operations. This lock can be
- * held for significant periods of time. Given that it is undesirable to hold
- * mutexes for long periods of time, the following lock ordering applies:
+ * The global zvol_state_lock is used to protect access to zvol_state_list and
+ * zvol_htable, which are the primary way to obtain a zvol_state_t from a name.
+ * It should not be used for anything not name-relateds, and you should avoid
+ * sleeping or waiting while its held. See zvol_find_by_name(), zvol_insert(),
+ * zvol_remove().
+ *
+ * The zv_state_lock is used to protect the contents of the associated
+ * zvol_state_t. Most of the zvol_state_t is dedicated to control and
+ * configuration; almost none of it is needed for data operations (that is,
+ * read, write, flush) so this lock is rarely taken during general IO. It
+ * should be released quickly; you should avoid sleeping or waiting while its
+ * held.
+ *
+ * zv_suspend_lock is used to suspend IO/data operations to a zvol. The read
+ * half should held for the duration of an IO operation. The write half should
+ * be taken when something to wait for IO to complete and the block further IO,
+ * eg for the duration of receive and rollback operations. This lock can be
+ * held for long periods of time.
+ *
+ * Thus, the following lock ordering appies.
* - take zvol_state_lock if necessary, to protect zvol_state_list
* - take zv_suspend_lock if necessary, by the code path in question
* - take zv_state_lock to protect zvol_state_t
@@ -67,9 +78,8 @@
* these operations are serialized per pool. Consequently, we can be certain
* that for a given zvol, there is only one operation at a time in progress.
* That is why one can be sure that first, zvol_state_t for a given zvol is
- * allocated and placed on zvol_state_list, and then other minor operations
- * for this zvol are going to proceed in the order of issue.
- *
+ * allocated and placed on zvol_state_list, and then other minor operations for
+ * this zvol are going to proceed in the order of issue.
*/
#include <sys/dataset_kstats.h>
@@ -1570,184 +1580,156 @@ zvol_create_minors_impl(zvol_task_t *task)
}
/*
- * Remove minors for specified dataset including children and snapshots.
- */
-
-/*
- * Remove the minor for a given zvol. This will do it all:
- * - flag the zvol for removal, so new requests are rejected
- * - wait until outstanding requests are completed
- * - remove it from lists
- * - free it
- * It's also usable as a taskq task, and smells nice too.
+ * Remove minors for specified dataset and, optionally, its children and
+ * snapshots.
*/
static void
-zvol_remove_minor_task(void *arg)
-{
- zvol_state_t *zv = (zvol_state_t *)arg;
-
- ASSERT(!RW_LOCK_HELD(&zvol_state_lock));
- ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
-
- mutex_enter(&zv->zv_state_lock);
- while (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) {
- zv->zv_flags |= ZVOL_REMOVING;
- cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock);
- }
- mutex_exit(&zv->zv_state_lock);
-
- rw_enter(&zvol_state_lock, RW_WRITER);
- mutex_enter(&zv->zv_state_lock);
-
- zvol_remove(zv);
- zvol_os_clear_private(zv);
-
- mutex_exit(&zv->zv_state_lock);
- rw_exit(&zvol_state_lock);
-
- zvol_os_free(zv);
-}
-
-static void
-zvol_free_task(void *arg)
-{
- zvol_os_free(arg);
-}
-
-static void
zvol_remove_minors_impl(zvol_task_t *task)
{
zvol_state_t *zv, *zv_next;
const char *name = task ? task->zt_name1 : NULL;
int namelen = ((name) ? strlen(name) : 0);
- taskqid_t t;
- list_t delay_list, free_list;
+ boolean_t children = task ? !!task->zt_value : B_TRUE;
if (zvol_inhibit_dev)
return;
- list_create(&delay_list, sizeof (zvol_state_t),
- offsetof(zvol_state_t, zv_next));
- list_create(&free_list, sizeof (zvol_state_t),
- offsetof(zvol_state_t, zv_next));
+ /*
+ * We collect up zvols that we want to remove on a separate list, so
+ * that we don't have to hold zvol_state_lock for the whole time.
+ *
+ * We can't remove them from the global lists until we're completely
+ * done with them, because that would make them appear to ZFS-side ops
+ * that they don't exist, and the name might be reused, which can't be
+ * good.
+ */
+ list_t remove_list;
+ list_create(&remove_list, sizeof (zvol_state_t),
+ offsetof(zvol_state_t, zv_remove_node));
- rw_enter(&zvol_state_lock, RW_WRITER);
+ rw_enter(&zvol_state_lock, RW_READER);
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
mutex_enter(&zv->zv_state_lock);
+ if (zv->zv_flags & ZVOL_REMOVING) {
+ /* Another thread is handling shutdown, skip it. */
+ mutex_exit(&zv->zv_state_lock);
+ continue;
+ }
+
+ /*
+ * This zvol should be removed if:
+ * - no name was offered (ie removing all at shutdown); or
+ * - name matches exactly; or
+ * - we were asked to remove children, and
+ * - the start of the name matches, and
+ * - there is a '/' immediately after the matched name; or
+ * - there is a '@' immediately after the matched name
+ */
if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
- (strncmp(zv->zv_name, name, namelen) == 0 &&
+ (children && strncmp(zv->zv_name, name, namelen) == 0 &&
(zv->zv_name[namelen] == '/' ||
zv->zv_name[namelen] == '@'))) {
- /*
- * By holding zv_state_lock here, we guarantee that no
- * one is currently using this zv
- */
/*
- * If in use, try to throw everyone off and try again
- * later.
+ * Matched, so mark it removal. We want to take the
+ * write half of the suspend lock to make sure that
+ * the zvol is not suspended, and give any data ops
+ * chance to finish.
*/
- if (zv->zv_open_count > 0 ||
- atomic_read(&zv->zv_suspend_ref)) {
- zv->zv_flags |= ZVOL_REMOVING;
- t = taskq_dispatch(
- zv->zv_objset->os_spa->spa_zvol_taskq,
- zvol_remove_minor_task, zv, TQ_SLEEP);
- if (t == TASKQID_INVALID) {
- /*
- * Couldn't create the task, so we'll
- * do it in place once the loop is
- * finished.
- */
- list_insert_head(&delay_list, zv);
- }
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+ mutex_enter(&zv->zv_state_lock);
+
+ if (zv->zv_flags & ZVOL_REMOVING) {
+ /* Another thread has taken it, let them. */
mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);
continue;
}
- zvol_remove(zv);
-
/*
- * Cleared while holding zvol_state_lock as a writer
- * which will prevent zvol_open() from opening it.
+ * Mark it and unlock. New entries will see the flag
+ * and return ENXIO.
*/
- zvol_os_clear_private(zv);
-
- /* Drop zv_state_lock before zvol_free() */
+ zv->zv_flags |= ZVOL_REMOVING;
mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);
- /* Try parallel zv_free, if failed do it in place */
- t = taskq_dispatch(system_taskq, zvol_free_task, zv,
- TQ_SLEEP);
- if (t == TASKQID_INVALID)
- list_insert_head(&free_list, zv);
- } else {
+ /* Put it on the list for the next stage. */
+ list_insert_head(&remove_list, zv);
+ } else
mutex_exit(&zv->zv_state_lock);
- }
}
- rw_exit(&zvol_state_lock);
- /* Wait for zvols that we couldn't create a remove task for */
- while ((zv = list_remove_head(&delay_list)) != NULL)
- zvol_remove_minor_task(zv);
-
- /* Free any that we couldn't free in parallel earlier */
- while ((zv = list_remove_head(&free_list)) != NULL)
- zvol_os_free(zv);
-}
-
-/* Remove minor for this specific volume only */
-static int
-zvol_remove_minor_impl(const char *name)
-{
- zvol_state_t *zv = NULL, *zv_next;
-
- if (zvol_inhibit_dev)
- return (0);
+ rw_exit(&zvol_state_lock);
- rw_enter(&zvol_state_lock, RW_WRITER);
+ /* Didn't match any, nothing to do! */
+ if (list_is_empty(&remove_list)) {
+ if (task)
+ task->zt_error = SET_ERROR(ENOENT);
+ return;
+ }
- for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
- zv_next = list_next(&zvol_state_list, zv);
+ /* Actually shut them all down. */
+ for (zv = list_head(&remove_list); zv != NULL; zv = zv_next) {
+ zv_next = list_next(&remove_list, zv);
mutex_enter(&zv->zv_state_lock);
- if (strcmp(zv->zv_name, name) == 0)
- /* Found, leave the the loop with zv_lock held */
- break;
- mutex_exit(&zv->zv_state_lock);
- }
-
- if (zv == NULL) {
- rw_exit(&zvol_state_lock);
- return (SET_ERROR(ENOENT));
- }
- ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ /*
+ * Still open or suspended, just wait. This can happen if, for
+ * example, we managed to acquire zv_state_lock in the moments
+ * where zvol_open() or zvol_release() are trading locks to
+ * call zvol_first_open() or zvol_last_close().
+ */
+ while (zv->zv_open_count > 0 ||
+ atomic_read(&zv->zv_suspend_ref))
+ cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock);
- if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) {
/*
- * In use, so try to throw everyone off, then wait
- * until finished.
+ * No users, shut down the OS side. This may not remove the
+ * minor from view immediately, depending on the kernel
+ * specifics, but it will ensure that it is unusable and that
+ * this zvol_state_t can never again be reached from an OS-side
+ * operation.
*/
- zv->zv_flags |= ZVOL_REMOVING;
+ zvol_os_remove_minor(zv);
mutex_exit(&zv->zv_state_lock);
+
+ /* Remove it from the name lookup lists */
+ rw_enter(&zvol_state_lock, RW_WRITER);
+ zvol_remove(zv);
rw_exit(&zvol_state_lock);
- zvol_remove_minor_task(zv);
- return (0);
}
- zvol_remove(zv);
- zvol_os_clear_private(zv);
+ /*
+ * Our own references on remove_list is the last one, free them and
+ * we're done.
+ */
+ while ((zv = list_remove_head(&remove_list)) != NULL)
+ zvol_os_free(zv);
- mutex_exit(&zv->zv_state_lock);
- rw_exit(&zvol_state_lock);
+ list_destroy(&remove_list);
+}
- zvol_os_free(zv);
+/* Remove minor for this specific volume only */
+static int
+zvol_remove_minor_impl(const char *name)
+{
+ if (zvol_inhibit_dev)
+ return (0);
- return (0);
+ zvol_task_t task;
+ memset(&task, 0, sizeof (zvol_task_t));
+ strlcpy(task.zt_name1, name, sizeof (task.zt_name1));
+ task.zt_value = B_FALSE;
+
+ zvol_remove_minors_impl(&task);
+
+ return (task.zt_error);
}
/*
@@ -2067,6 +2049,7 @@ zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
task->zt_op = ZVOL_ASYNC_REMOVE_MINORS;
strlcpy(task->zt_name1, name, sizeof (task->zt_name1));
+ task->zt_value = B_TRUE;
id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
if ((async == B_FALSE) && (id != TASKQID_INVALID))
taskq_wait_id(spa->spa_zvol_taskq, id);
@@ -2188,14 +2171,6 @@ zvol_fini_impl(void)
zvol_remove_minors_impl(NULL);
- /*
- * The call to "zvol_remove_minors_impl" may dispatch entries to
- * the system_taskq, but it doesn't wait for those entries to
- * complete before it returns. Thus, we must wait for all of the
- * removals to finish, before we can continue.
- */
- taskq_wait_outstanding(system_taskq, 0);
-
kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
list_destroy(&zvol_state_list);
rw_destroy(&zvol_state_lock);