Diffstat (limited to 'sys/contrib/openzfs/module/zfs/zvol.c')
-rw-r--r--  sys/contrib/openzfs/module/zfs/zvol.c | 676
1 file changed, 354 insertions, 322 deletions
diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c
index 3568d4f43fcb..2fd3e1c37045 100644
--- a/sys/contrib/openzfs/module/zfs/zvol.c
+++ b/sys/contrib/openzfs/module/zfs/zvol.c
@@ -38,25 +38,36 @@
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
- * Copyright (c) 2024, Klara, Inc.
+ * Copyright (c) 2024, 2025, Klara, Inc.
*/
/*
* Note on locking of zvol state structures.
*
- * These structures are used to maintain internal state used to emulate block
- * devices on top of zvols. In particular, management of device minor number
- * operations - create, remove, rename, and set_snapdev - involves access to
- * these structures. The zvol_state_lock is primarily used to protect the
- * zvol_state_list. The zv->zv_state_lock is used to protect the contents
- * of the zvol_state_t structures, as well as to make sure that when the
- * time comes to remove the structure from the list, it is not in use, and
- * therefore, it can be taken off zvol_state_list and freed.
+ * zvol_state_t represents the connection between a single dataset
+ * (DMU_OST_ZVOL) and the device "minor" (some OS-specific representation of a
+ * "disk" or "device" or "volume", eg, a /dev/zdXX node, a GEOM object, etc).
*
- * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol,
- * e.g. for the duration of receive and rollback operations. This lock can be
- * held for significant periods of time. Given that it is undesirable to hold
- * mutexes for long periods of time, the following lock ordering applies:
+ * The global zvol_state_lock is used to protect access to zvol_state_list and
+ * zvol_htable, which are the primary way to obtain a zvol_state_t from a name.
+ * It should not be used for anything that is not name-related, and you should
+ * avoid sleeping or waiting while it's held. See zvol_find_by_name(),
+ * zvol_insert(), zvol_remove().
+ *
+ * The zv_state_lock is used to protect the contents of the associated
+ * zvol_state_t. Most of the zvol_state_t is dedicated to control and
+ * configuration; almost none of it is needed for data operations (that is,
+ * read, write, flush) so this lock is rarely taken during general IO. It
+ * should be released quickly; you should avoid sleeping or waiting while it's
+ * held.
+ *
+ * zv_suspend_lock is used to suspend IO/data operations to a zvol. The read
+ * half should be held for the duration of an IO operation. The write half
+ * should be taken when something needs to wait for IO to complete and then
+ * block further IO, e.g. for the duration of receive and rollback operations.
+ * This lock can be held for long periods of time.
+ *
+ * Thus, the following lock ordering applies.
* - take zvol_state_lock if necessary, to protect zvol_state_list
* - take zv_suspend_lock if necessary, by the code path in question
* - take zv_state_lock to protect zvol_state_t
@@ -67,9 +78,8 @@
* these operations are serialized per pool. Consequently, we can be certain
* that for a given zvol, there is only one operation at a time in progress.
* That is why one can be sure that first, zvol_state_t for a given zvol is
- * allocated and placed on zvol_state_list, and then other minor operations
- * for this zvol are going to proceed in the order of issue.
- *
+ * allocated and placed on zvol_state_list, and then other minor operations for
+ * this zvol are going to proceed in the order of issue.
*/
#include <sys/dataset_kstats.h>
@@ -102,6 +112,7 @@ extern int zfs_bclone_wait_dirty;
zv_taskq_t zvol_taskqs;
typedef enum {
+ ZVOL_ASYNC_CREATE_MINORS,
ZVOL_ASYNC_REMOVE_MINORS,
ZVOL_ASYNC_RENAME_MINORS,
ZVOL_ASYNC_SET_SNAPDEV,
@@ -110,10 +121,14 @@ typedef enum {
} zvol_async_op_t;
typedef struct {
- zvol_async_op_t op;
- char name1[MAXNAMELEN];
- char name2[MAXNAMELEN];
- uint64_t value;
+ zvol_async_op_t zt_op;
+ char zt_name1[MAXNAMELEN];
+ char zt_name2[MAXNAMELEN];
+ uint64_t zt_value;
+ uint32_t zt_total;
+ uint32_t zt_done;
+ int32_t zt_status;
+ int zt_error;
} zvol_task_t;
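For orientation, a sketch of how a task with the renamed zt_ fields is filled in and dispatched; this mirrors the zvol_remove_minors() caller that appears later in this diff (spa and name are assumed to be in scope):

	zvol_task_t *task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
	task->zt_op = ZVOL_ASYNC_REMOVE_MINORS;
	strlcpy(task->zt_name1, name, sizeof (task->zt_name1));
	task->zt_value = B_TRUE;	/* also remove children and snapshots */
	(void) taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);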
zv_request_task_t *
@@ -210,8 +225,8 @@ zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
int error;
uint64_t volblocksize, volsize;
- VERIFY(nvlist_lookup_uint64(nvprops,
- zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
+ VERIFY0(nvlist_lookup_uint64(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize));
if (nvlist_lookup_uint64(nvprops,
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
@@ -220,21 +235,20 @@ zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
* These properties must be removed from the list so the generic
* property setting step won't apply to them.
*/
- VERIFY(nvlist_remove_all(nvprops,
- zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
+ VERIFY0(nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE)));
(void) nvlist_remove_all(nvprops,
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
DMU_OT_NONE, 0, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
DMU_OT_NONE, 0, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
}
/*
@@ -249,7 +263,7 @@ zvol_get_stats(objset_t *os, nvlist_t *nv)
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
if (error)
- return (SET_ERROR(error));
+ return (error);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
@@ -262,7 +276,7 @@ zvol_get_stats(objset_t *os, nvlist_t *nv)
kmem_free(doi, sizeof (dmu_object_info_t));
- return (SET_ERROR(error));
+ return (error);
}
/*
@@ -300,7 +314,7 @@ zvol_update_volsize(uint64_t volsize, objset_t *os)
error = dmu_tx_assign(tx, DMU_TX_WAIT);
if (error) {
dmu_tx_abort(tx);
- return (SET_ERROR(error));
+ return (error);
}
txg = dmu_tx_get_txg(tx);
@@ -332,7 +346,7 @@ zvol_set_volsize(const char *name, uint64_t volsize)
error = dsl_prop_get_integer(name,
zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
if (error != 0)
- return (SET_ERROR(error));
+ return (error);
if (readonly)
return (SET_ERROR(EROFS));
@@ -348,7 +362,7 @@ zvol_set_volsize(const char *name, uint64_t volsize)
FTAG, &os)) != 0) {
if (zv != NULL)
mutex_exit(&zv->zv_state_lock);
- return (SET_ERROR(error));
+ return (error);
}
owned = B_TRUE;
if (zv != NULL)
@@ -385,7 +399,7 @@ out:
if (error == 0 && zv != NULL)
zvol_os_update_volsize(zv, volsize);
- return (SET_ERROR(error));
+ return (error);
}
/*
@@ -396,7 +410,7 @@ zvol_set_volthreading(const char *name, boolean_t value)
{
zvol_state_t *zv = zvol_find_by_name(name, RW_NONE);
if (zv == NULL)
- return (ENOENT);
+ return (SET_ERROR(ENOENT));
zv->zv_threading = value;
mutex_exit(&zv->zv_state_lock);
return (0);
@@ -445,8 +459,10 @@ zvol_check_volblocksize(const char *name, uint64_t volblocksize)
* We don't allow setting the property above 1MB,
* unless the tunable has been changed.
*/
- if (volblocksize > zfs_max_recordsize)
+ if (volblocksize > zfs_max_recordsize) {
+ spa_close(spa, FTAG);
return (SET_ERROR(EDOM));
+ }
spa_close(spa, FTAG);
}
@@ -613,7 +629,7 @@ zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst,
dmu_tx_t *tx;
blkptr_t *bps;
size_t maxblocks;
- int error = EINVAL;
+ int error = 0;
rw_enter(&zv_dst->zv_suspend_lock, RW_READER);
if (zv_dst->zv_zilog == NULL) {
@@ -639,23 +655,22 @@ zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst,
*/
if (!spa_feature_is_enabled(dmu_objset_spa(outos),
SPA_FEATURE_BLOCK_CLONING)) {
- error = EOPNOTSUPP;
+ error = SET_ERROR(EOPNOTSUPP);
goto out;
}
if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
- error = EXDEV;
+ error = SET_ERROR(EXDEV);
goto out;
}
if (inos->os_encrypted != outos->os_encrypted) {
- error = EXDEV;
+ error = SET_ERROR(EXDEV);
goto out;
}
if (zv_src->zv_volblocksize != zv_dst->zv_volblocksize) {
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
goto out;
}
if (inoff >= zv_src->zv_volsize || outoff >= zv_dst->zv_volsize) {
- error = 0;
goto out;
}
@@ -666,17 +681,15 @@ zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst,
len = zv_src->zv_volsize - inoff;
if (len > zv_dst->zv_volsize - outoff)
len = zv_dst->zv_volsize - outoff;
- if (len == 0) {
- error = 0;
+ if (len == 0)
goto out;
- }
/*
* No overlapping if we are cloning within the same file
*/
if (zv_src == zv_dst) {
if (inoff < outoff + len && outoff < inoff + len) {
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
goto out;
}
}
@@ -686,7 +699,7 @@ zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst,
*/
if ((inoff % zv_src->zv_volblocksize) != 0 ||
(outoff % zv_dst->zv_volblocksize) != 0) {
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
goto out;
}
@@ -694,7 +707,7 @@ zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst,
* Length must be multiple of block size
*/
if ((len % zv_src->zv_volblocksize) != 0) {
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
goto out;
}
@@ -766,13 +779,13 @@ zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst,
zfs_rangelock_exit(outlr);
zfs_rangelock_exit(inlr);
if (error == 0 && zv_dst->zv_objset->os_sync == ZFS_SYNC_ALWAYS) {
- zil_commit(zilog_dst, ZVOL_OBJ);
+ error = zil_commit(zilog_dst, ZVOL_OBJ);
}
out:
if (zv_src != zv_dst)
rw_exit(&zv_src->zv_suspend_lock);
rw_exit(&zv_dst->zv_suspend_lock);
- return (SET_ERROR(error));
+ return (error);
}
/*
@@ -859,13 +872,8 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
};
/*
- * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
- *
- * We store data in the log buffers if it's small enough.
- * Otherwise we will later flush the data out via dmu_sync().
+ * zvol_log_write() handles TX_WRITE transactions.
*/
-static const ssize_t zvol_immediate_write_sz = 32768;
-
void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
uint64_t size, boolean_t commit)
@@ -878,15 +886,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
if (zil_replaying(zilog, tx))
return;
- if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
- write_state = WR_INDIRECT;
- else if (!spa_has_slogs(zilog->zl_spa) &&
- size >= blocksize && blocksize > zvol_immediate_write_sz)
- write_state = WR_INDIRECT;
- else if (commit)
- write_state = WR_COPIED;
- else
- write_state = WR_NEED_COPY;
+ write_state = zil_write_state(zilog, size, blocksize, B_FALSE, commit);
while (size) {
itx_t *itx;
@@ -905,7 +905,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
if (wr_state == WR_COPIED &&
dmu_read_by_dnode(zv->zv_dn, offset, len, lr + 1,
DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING) != 0) {
- zil_itx_destroy(itx);
+ zil_itx_destroy(itx, 0);
itx = zil_itx_create(TX_WRITE, sizeof (*lr));
lr = (lr_write_t *)&itx->itx_lr;
wr_state = WR_NEED_COPY;
@@ -924,7 +924,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
itx->itx_private = zv;
- (void) zil_itx_assign(zilog, itx, tx);
+ zil_itx_assign(zilog, itx, tx);
offset += len;
size -= len;
@@ -1034,7 +1034,7 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
zvol_get_done(zgd, error);
- return (SET_ERROR(error));
+ return (error);
}
/*
@@ -1079,15 +1079,15 @@ zvol_setup_zv(zvol_state_t *zv)
error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
if (error)
- return (SET_ERROR(error));
+ return (error);
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
if (error)
- return (SET_ERROR(error));
+ return (error);
error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn);
if (error)
- return (SET_ERROR(error));
+ return (error);
zvol_os_set_capacity(zv, volsize >> 9);
zv->zv_volsize = volsize;
@@ -1129,7 +1129,7 @@ zvol_shutdown_zv(zvol_state_t *zv)
*/
if (zv->zv_flags & ZVOL_WRITTEN_TO)
txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
- (void) dmu_objset_evict_dbufs(zv->zv_objset);
+ dmu_objset_evict_dbufs(zv->zv_objset);
}
/*
@@ -1206,7 +1206,7 @@ zvol_resume(zvol_state_t *zv)
if (zv->zv_flags & ZVOL_REMOVING)
cv_broadcast(&zv->zv_removing_cv);
- return (SET_ERROR(error));
+ return (error);
}
int
@@ -1222,7 +1222,7 @@ zvol_first_open(zvol_state_t *zv, boolean_t readonly)
boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os);
if (error)
- return (SET_ERROR(error));
+ return (error);
zv->zv_objset = os;
@@ -1434,6 +1434,48 @@ zvol_create_minors_cb(const char *dsname, void *arg)
return (0);
}
+static void
+zvol_task_update_status(zvol_task_t *task, uint64_t total, uint64_t done,
+ int error)
+{
+
+ task->zt_total += total;
+ task->zt_done += done;
+ if (task->zt_total != task->zt_done) {
+ task->zt_status = -1;
+ if (error)
+ task->zt_error = error;
+ }
+}
+
+static void
+zvol_task_report_status(zvol_task_t *task)
+{
+#ifdef ZFS_DEBUG
+ static const char *const msg[] = {
+ "create",
+ "remove",
+ "rename",
+ "set snapdev",
+ "set volmode",
+ "unknown",
+ };
+
+ if (task->zt_status == 0)
+ return;
+
+ zvol_async_op_t op = MIN(task->zt_op, ZVOL_ASYNC_MAX);
+ if (task->zt_error) {
+ dprintf("The %s minors zvol task was not ok, last error %d\n",
+ msg[op], task->zt_error);
+ } else {
+ dprintf("The %s minors zvol task was not ok\n", msg[op]);
+ }
+#else
+ (void) task;
+#endif
+}
+
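For orientation, a hedged sketch of the intended usage pattern: each async op attempts some number of units of work, accumulates the outcome into the task via zvol_task_update_status(), and zvol_task_cb() (further down in this diff) reports it once via zvol_task_report_status(). The function below is hypothetical:

	static void
	zvol_example_async_op(zvol_task_t *task)
	{
		/* One unit of work; counted as done only if it succeeded. */
		int error = zvol_os_create_minor(task->zt_name1);
		zvol_task_update_status(task, 1, error == 0, error);
	}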
/*
* Create minors for the specified dataset, including children and snapshots.
* Pay attention to the 'snapdev' property and iterate over the snapshots
@@ -1451,14 +1493,27 @@ zvol_create_minors_cb(const char *dsname, void *arg)
* 'visible' (which also verifies that the parent is a zvol), and if so,
* a minor node for that snapshot is created.
*/
-void
-zvol_create_minors_recursive(const char *name)
+static void
+zvol_create_minors_impl(zvol_task_t *task)
{
+ const char *name = task->zt_name1;
list_t minors_list;
minors_job_t *job;
+ uint64_t snapdev;
+ int total = 0, done = 0, last_error, error;
- if (zvol_inhibit_dev)
+ /*
+ * Note: the dsl_pool_config_lock must not be held.
+ * Minor node creation needs to obtain the zvol_state_lock.
+ * zvol_open() obtains the zvol_state_lock and then the dsl pool
+ * config lock. Therefore, we can't have the config lock now if
+ * we are going to wait for the zvol_state_lock, because it
+ * would be a lock order inversion which could lead to deadlock.
+ */
+
+ if (zvol_inhibit_dev) {
return;
+ }
/*
* This is the list for prefetch jobs. Whenever we found a match
@@ -1474,13 +1529,16 @@ zvol_create_minors_recursive(const char *name)
if (strchr(name, '@') != NULL) {
- uint64_t snapdev;
-
- int error = dsl_prop_get_integer(name, "snapdev",
- &snapdev, NULL);
-
- if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
- (void) zvol_os_create_minor(name);
+ error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL);
+ if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) {
+ error = zvol_os_create_minor(name);
+ if (error == 0) {
+ done++;
+ } else {
+ last_error = error;
+ }
+ total++;
+ }
} else {
fstrans_cookie_t cookie = spl_fstrans_mark();
(void) dmu_objset_find(name, zvol_create_minors_cb,
@@ -1495,229 +1553,195 @@ zvol_create_minors_recursive(const char *name)
* sequentially.
*/
while ((job = list_remove_head(&minors_list)) != NULL) {
- if (!job->error)
- (void) zvol_os_create_minor(job->name);
+ if (!job->error) {
+ error = zvol_os_create_minor(job->name);
+ if (error == 0) {
+ done++;
+ } else {
+ last_error = error;
+ }
+ } else if (job->error == EINVAL) {
+ /*
+ * The objset with the name requested by the current job
+ * exists, but its type is not zvol. Just ignore this
+ * sort of error.
+ */
+ done++;
+ } else {
+ last_error = job->error;
+ }
+ total++;
kmem_strfree(job->name);
kmem_free(job, sizeof (minors_job_t));
}
list_destroy(&minors_list);
+ zvol_task_update_status(task, total, done, last_error);
}
-void
-zvol_create_minor(const char *name)
-{
- /*
- * Note: the dsl_pool_config_lock must not be held.
- * Minor node creation needs to obtain the zvol_state_lock.
- * zvol_open() obtains the zvol_state_lock and then the dsl pool
- * config lock. Therefore, we can't have the config lock now if
- * we are going to wait for the zvol_state_lock, because it
- * would be a lock order inversion which could lead to deadlock.
- */
-
- if (zvol_inhibit_dev)
- return;
-
- if (strchr(name, '@') != NULL) {
- uint64_t snapdev;
-
- int error = dsl_prop_get_integer(name,
- "snapdev", &snapdev, NULL);
-
- if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
- (void) zvol_os_create_minor(name);
- } else {
- (void) zvol_os_create_minor(name);
- }
-}
-
-/*
- * Remove minors for specified dataset including children and snapshots.
- */
-
/*
- * Remove the minor for a given zvol. This will do it all:
- * - flag the zvol for removal, so new requests are rejected
- * - wait until outstanding requests are completed
- * - remove it from lists
- * - free it
- * It's also usable as a taskq task, and smells nice too.
+ * Remove minors for specified dataset and, optionally, its children and
+ * snapshots.
*/
static void
-zvol_remove_minor_task(void *arg)
-{
- zvol_state_t *zv = (zvol_state_t *)arg;
-
- ASSERT(!RW_LOCK_HELD(&zvol_state_lock));
- ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
-
- mutex_enter(&zv->zv_state_lock);
- while (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) {
- zv->zv_flags |= ZVOL_REMOVING;
- cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock);
- }
- mutex_exit(&zv->zv_state_lock);
-
- rw_enter(&zvol_state_lock, RW_WRITER);
- mutex_enter(&zv->zv_state_lock);
-
- zvol_remove(zv);
- zvol_os_clear_private(zv);
-
- mutex_exit(&zv->zv_state_lock);
- rw_exit(&zvol_state_lock);
-
- zvol_os_free(zv);
-}
-
-static void
-zvol_free_task(void *arg)
-{
- zvol_os_free(arg);
-}
-
-void
-zvol_remove_minors_impl(const char *name)
+zvol_remove_minors_impl(zvol_task_t *task)
{
zvol_state_t *zv, *zv_next;
+ const char *name = task ? task->zt_name1 : NULL;
int namelen = ((name) ? strlen(name) : 0);
- taskqid_t t;
- list_t delay_list, free_list;
+ boolean_t children = task ? !!task->zt_value : B_TRUE;
if (zvol_inhibit_dev)
return;
- list_create(&delay_list, sizeof (zvol_state_t),
- offsetof(zvol_state_t, zv_next));
- list_create(&free_list, sizeof (zvol_state_t),
- offsetof(zvol_state_t, zv_next));
+ /*
+ * We collect the zvols that we want to remove on a separate list, so
+ * that we don't have to hold zvol_state_lock for the whole time.
+ *
+ * We can't remove them from the global lists until we're completely
+ * done with them, because that would make it appear to ZFS-side ops
+ * that they don't exist, and their names might be reused, which can't
+ * be good.
+ */
+ list_t remove_list;
+ list_create(&remove_list, sizeof (zvol_state_t),
+ offsetof(zvol_state_t, zv_remove_node));
- rw_enter(&zvol_state_lock, RW_WRITER);
+ rw_enter(&zvol_state_lock, RW_READER);
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
mutex_enter(&zv->zv_state_lock);
+ if (zv->zv_flags & ZVOL_REMOVING) {
+ /* Another thread is handling shutdown, skip it. */
+ mutex_exit(&zv->zv_state_lock);
+ continue;
+ }
+
+ /*
+ * This zvol should be removed if:
+ * - no name was offered (ie removing all at shutdown); or
+ * - name matches exactly; or
+ * - we were asked to remove children, and
+ * - the start of the name matches, and
+ * - there is a '/' immediately after the matched name; or
+ * - there is a '@' immediately after the matched name
+ */
if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
- (strncmp(zv->zv_name, name, namelen) == 0 &&
+ (children && strncmp(zv->zv_name, name, namelen) == 0 &&
(zv->zv_name[namelen] == '/' ||
zv->zv_name[namelen] == '@'))) {
- /*
- * By holding zv_state_lock here, we guarantee that no
- * one is currently using this zv
- */
/*
- * If in use, try to throw everyone off and try again
- * later.
+ * Matched, so mark it for removal. We want to take
+ * the write half of the suspend lock to make sure
+ * that the zvol is not suspended, and give any data
+ * ops a chance to finish.
*/
- if (zv->zv_open_count > 0 ||
- atomic_read(&zv->zv_suspend_ref)) {
- zv->zv_flags |= ZVOL_REMOVING;
- t = taskq_dispatch(
- zv->zv_objset->os_spa->spa_zvol_taskq,
- zvol_remove_minor_task, zv, TQ_SLEEP);
- if (t == TASKQID_INVALID) {
- /*
- * Couldn't create the task, so we'll
- * do it in place once the loop is
- * finished.
- */
- list_insert_head(&delay_list, zv);
- }
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+ mutex_enter(&zv->zv_state_lock);
+
+ if (zv->zv_flags & ZVOL_REMOVING) {
+ /* Another thread has taken it, let them. */
mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);
continue;
}
- zvol_remove(zv);
-
/*
- * Cleared while holding zvol_state_lock as a writer
- * which will prevent zvol_open() from opening it.
+ * Mark it and unlock. New entries will see the flag
+ * and return ENXIO.
*/
- zvol_os_clear_private(zv);
-
- /* Drop zv_state_lock before zvol_free() */
+ zv->zv_flags |= ZVOL_REMOVING;
mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);
- /* Try parallel zv_free, if failed do it in place */
- t = taskq_dispatch(system_taskq, zvol_free_task, zv,
- TQ_SLEEP);
- if (t == TASKQID_INVALID)
- list_insert_head(&free_list, zv);
- } else {
+ /* Put it on the list for the next stage. */
+ list_insert_head(&remove_list, zv);
+ } else
mutex_exit(&zv->zv_state_lock);
- }
}
- rw_exit(&zvol_state_lock);
-
- /* Wait for zvols that we couldn't create a remove task for */
- while ((zv = list_remove_head(&delay_list)) != NULL)
- zvol_remove_minor_task(zv);
- /* Free any that we couldn't free in parallel earlier */
- while ((zv = list_remove_head(&free_list)) != NULL)
- zvol_os_free(zv);
-}
-
-/* Remove minor for this specific volume only */
-static void
-zvol_remove_minor_impl(const char *name)
-{
- zvol_state_t *zv = NULL, *zv_next;
+ rw_exit(&zvol_state_lock);
- if (zvol_inhibit_dev)
+ /* Didn't match any, nothing to do! */
+ if (list_is_empty(&remove_list)) {
+ if (task)
+ task->zt_error = SET_ERROR(ENOENT);
return;
+ }
- rw_enter(&zvol_state_lock, RW_WRITER);
-
- for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
- zv_next = list_next(&zvol_state_list, zv);
+ /* Actually shut them all down. */
+ for (zv = list_head(&remove_list); zv != NULL; zv = zv_next) {
+ zv_next = list_next(&remove_list, zv);
mutex_enter(&zv->zv_state_lock);
- if (strcmp(zv->zv_name, name) == 0)
- /* Found, leave the the loop with zv_lock held */
- break;
- mutex_exit(&zv->zv_state_lock);
- }
- if (zv == NULL) {
- rw_exit(&zvol_state_lock);
- return;
- }
-
- ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ /*
+ * Still open or suspended, just wait. This can happen if, for
+ * example, we managed to acquire zv_state_lock in the window
+ * where zvol_open() or zvol_release() is trading locks to
+ * call zvol_first_open() or zvol_last_close().
+ */
+ while (zv->zv_open_count > 0 ||
+ atomic_read(&zv->zv_suspend_ref))
+ cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock);
- if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) {
/*
- * In use, so try to throw everyone off, then wait
- * until finished.
+ * No users, shut down the OS side. This may not remove the
+ * minor from view immediately, depending on the kernel
+ * specifics, but it will ensure that it is unusable and that
+ * this zvol_state_t can never again be reached from an OS-side
+ * operation.
*/
- zv->zv_flags |= ZVOL_REMOVING;
+ zvol_os_remove_minor(zv);
mutex_exit(&zv->zv_state_lock);
+
+ /* Remove it from the name lookup lists */
+ rw_enter(&zvol_state_lock, RW_WRITER);
+ zvol_remove(zv);
rw_exit(&zvol_state_lock);
- zvol_remove_minor_task(zv);
- return;
}
- zvol_remove(zv);
- zvol_os_clear_private(zv);
+ /*
+ * Our own references on remove_list are the last ones; free them
+ * and we're done.
+ */
+ while ((zv = list_remove_head(&remove_list)) != NULL)
+ zvol_os_free(zv);
- mutex_exit(&zv->zv_state_lock);
- rw_exit(&zvol_state_lock);
+ list_destroy(&remove_list);
+}
+
+/* Remove minor for this specific volume only */
+static int
+zvol_remove_minor_impl(const char *name)
+{
+ if (zvol_inhibit_dev)
+ return (0);
+
+ zvol_task_t task;
+ memset(&task, 0, sizeof (zvol_task_t));
+ strlcpy(task.zt_name1, name, sizeof (task.zt_name1));
+ task.zt_value = B_FALSE;
+
+ zvol_remove_minors_impl(&task);
- zvol_os_free(zv);
+ return (task.zt_error);
}
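The matching rule spelled out in the comment inside zvol_remove_minors_impl() can be read as the following predicate. This is a sketch for illustration only; no such helper is added by the change:

	/* Should this zvol be removed for a request (name, children)? */
	static boolean_t
	zvol_example_remove_match(const zvol_state_t *zv, const char *name,
	    boolean_t children)
	{
		size_t namelen;

		if (name == NULL)			/* removing all (shutdown) */
			return (B_TRUE);
		if (strcmp(zv->zv_name, name) == 0)	/* exact match */
			return (B_TRUE);
		namelen = strlen(name);
		return (children && strncmp(zv->zv_name, name, namelen) == 0 &&
		    (zv->zv_name[namelen] == '/' || zv->zv_name[namelen] == '@'));
	}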
/*
* Rename minors for specified dataset including children and snapshots.
*/
static void
-zvol_rename_minors_impl(const char *oldname, const char *newname)
+zvol_rename_minors_impl(zvol_task_t *task)
{
zvol_state_t *zv, *zv_next;
- int oldnamelen;
+ const char *oldname = task->zt_name1;
+ const char *newname = task->zt_name2;
+ int total = 0, done = 0, last_error, error, oldnamelen;
if (zvol_inhibit_dev)
return;
@@ -1732,24 +1756,31 @@ zvol_rename_minors_impl(const char *oldname, const char *newname)
mutex_enter(&zv->zv_state_lock);
if (strcmp(zv->zv_name, oldname) == 0) {
- zvol_os_rename_minor(zv, newname);
+ error = zvol_os_rename_minor(zv, newname);
} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
(zv->zv_name[oldnamelen] == '/' ||
zv->zv_name[oldnamelen] == '@')) {
char *name = kmem_asprintf("%s%c%s", newname,
zv->zv_name[oldnamelen],
zv->zv_name + oldnamelen + 1);
- zvol_os_rename_minor(zv, name);
+ error = zvol_os_rename_minor(zv, name);
kmem_strfree(name);
}
-
+ if (error) {
+ last_error = error;
+ } else {
+ done++;
+ }
+ total++;
mutex_exit(&zv->zv_state_lock);
}
rw_exit(&zvol_state_lock);
+ zvol_task_update_status(task, total, done, last_error);
}
typedef struct zvol_snapdev_cb_arg {
+ zvol_task_t *task;
uint64_t snapdev;
} zvol_snapdev_cb_arg_t;
@@ -1757,26 +1788,31 @@ static int
zvol_set_snapdev_cb(const char *dsname, void *param)
{
zvol_snapdev_cb_arg_t *arg = param;
+ int error = 0;
if (strchr(dsname, '@') == NULL)
return (0);
switch (arg->snapdev) {
case ZFS_SNAPDEV_VISIBLE:
- (void) zvol_os_create_minor(dsname);
+ error = zvol_os_create_minor(dsname);
break;
case ZFS_SNAPDEV_HIDDEN:
- (void) zvol_remove_minor_impl(dsname);
+ error = zvol_remove_minor_impl(dsname);
break;
}
+ zvol_task_update_status(arg->task, 1, error == 0, error);
return (0);
}
static void
-zvol_set_snapdev_impl(char *name, uint64_t snapdev)
+zvol_set_snapdev_impl(zvol_task_t *task)
{
- zvol_snapdev_cb_arg_t arg = {snapdev};
+ const char *name = task->zt_name1;
+ uint64_t snapdev = task->zt_value;
+
+ zvol_snapdev_cb_arg_t arg = {task, snapdev};
fstrans_cookie_t cookie = spl_fstrans_mark();
/*
* The zvol_set_snapdev_sync() sets snapdev appropriately
@@ -1787,11 +1823,14 @@ zvol_set_snapdev_impl(char *name, uint64_t snapdev)
}
static void
-zvol_set_volmode_impl(char *name, uint64_t volmode)
+zvol_set_volmode_impl(zvol_task_t *task)
{
+ const char *name = task->zt_name1;
+ uint64_t volmode = task->zt_value;
fstrans_cookie_t cookie;
uint64_t old_volmode;
zvol_state_t *zv;
+ int error;
if (strchr(name, '@') != NULL)
return;
@@ -1804,7 +1843,7 @@ zvol_set_volmode_impl(char *name, uint64_t volmode)
*/
zv = zvol_find_by_name(name, RW_NONE);
if (zv == NULL && volmode == ZFS_VOLMODE_NONE)
- return;
+ return;
if (zv != NULL) {
old_volmode = zv->zv_volmode;
mutex_exit(&zv->zv_state_lock);
@@ -1815,51 +1854,34 @@ zvol_set_volmode_impl(char *name, uint64_t volmode)
cookie = spl_fstrans_mark();
switch (volmode) {
case ZFS_VOLMODE_NONE:
- (void) zvol_remove_minor_impl(name);
+ error = zvol_remove_minor_impl(name);
break;
case ZFS_VOLMODE_GEOM:
case ZFS_VOLMODE_DEV:
- (void) zvol_remove_minor_impl(name);
- (void) zvol_os_create_minor(name);
+ error = zvol_remove_minor_impl(name);
+ /*
+ * The remove minor call above may not be needed if
+ * volmode was switched from the 'none' value. Ignore
+ * the error in this case.
+ */
+ if (error == ENOENT)
+ error = 0;
+ else if (error)
+ break;
+ error = zvol_os_create_minor(name);
break;
case ZFS_VOLMODE_DEFAULT:
- (void) zvol_remove_minor_impl(name);
+ error = zvol_remove_minor_impl(name);
if (zvol_volmode == ZFS_VOLMODE_NONE)
break;
else /* if zvol_volmode is invalid defaults to "geom" */
- (void) zvol_os_create_minor(name);
+ error = zvol_os_create_minor(name);
break;
}
+ zvol_task_update_status(task, 1, error == 0, error);
spl_fstrans_unmark(cookie);
}
-static zvol_task_t *
-zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
- uint64_t value)
-{
- zvol_task_t *task;
-
- /* Never allow tasks on hidden names. */
- if (name1[0] == '$')
- return (NULL);
-
- task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
- task->op = op;
- task->value = value;
-
- strlcpy(task->name1, name1, sizeof (task->name1));
- if (name2 != NULL)
- strlcpy(task->name2, name2, sizeof (task->name2));
-
- return (task);
-}
-
-static void
-zvol_task_free(zvol_task_t *task)
-{
- kmem_free(task, sizeof (zvol_task_t));
-}
-
/*
* The worker thread function performed asynchronously.
*/
@@ -1868,25 +1890,29 @@ zvol_task_cb(void *arg)
{
zvol_task_t *task = arg;
- switch (task->op) {
+ switch (task->zt_op) {
+ case ZVOL_ASYNC_CREATE_MINORS:
+ zvol_create_minors_impl(task);
+ break;
case ZVOL_ASYNC_REMOVE_MINORS:
- zvol_remove_minors_impl(task->name1);
+ zvol_remove_minors_impl(task);
break;
case ZVOL_ASYNC_RENAME_MINORS:
- zvol_rename_minors_impl(task->name1, task->name2);
+ zvol_rename_minors_impl(task);
break;
case ZVOL_ASYNC_SET_SNAPDEV:
- zvol_set_snapdev_impl(task->name1, task->value);
+ zvol_set_snapdev_impl(task);
break;
case ZVOL_ASYNC_SET_VOLMODE:
- zvol_set_volmode_impl(task->name1, task->value);
+ zvol_set_volmode_impl(task);
break;
default:
VERIFY(0);
break;
}
- zvol_task_free(task);
+ zvol_task_report_status(task);
+ kmem_free(task, sizeof (zvol_task_t));
}
typedef struct zvol_set_prop_int_arg {
@@ -1931,23 +1957,17 @@ zvol_set_common_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
if (dsl_prop_get_int_ds(ds, prop_name, &prop) != 0)
return (0);
- switch (zsda->zsda_prop) {
- case ZFS_PROP_VOLMODE:
- task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname,
- NULL, prop);
- break;
- case ZFS_PROP_SNAPDEV:
- task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname,
- NULL, prop);
- break;
- default:
- task = NULL;
- break;
- }
-
- if (task == NULL)
+ task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
+ if (zsda->zsda_prop == ZFS_PROP_VOLMODE) {
+ task->zt_op = ZVOL_ASYNC_SET_VOLMODE;
+ } else if (zsda->zsda_prop == ZFS_PROP_SNAPDEV) {
+ task->zt_op = ZVOL_ASYNC_SET_SNAPDEV;
+ } else {
+ kmem_free(task, sizeof (zvol_task_t));
return (0);
-
+ }
+ task->zt_value = prop;
+ strlcpy(task->zt_name1, dsname, sizeof (task->zt_name1));
(void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
task, TQ_SLEEP);
return (0);
@@ -2001,15 +2021,35 @@ zvol_set_common(const char *ddname, zfs_prop_t prop, zprop_source_t source,
}
void
-zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
+zvol_create_minors(const char *name)
{
+ spa_t *spa;
zvol_task_t *task;
taskqid_t id;
- task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL);
- if (task == NULL)
+ if (spa_open(name, &spa, FTAG) != 0)
return;
+ task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
+ task->zt_op = ZVOL_ASYNC_CREATE_MINORS;
+ strlcpy(task->zt_name1, name, sizeof (task->zt_name1));
+ id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
+ if (id != TASKQID_INVALID)
+ taskq_wait_id(spa->spa_zvol_taskq, id);
+
+ spa_close(spa, FTAG);
+}
+
+void
+zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
+{
+ zvol_task_t *task;
+ taskqid_t id;
+
+ task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
+ task->zt_op = ZVOL_ASYNC_REMOVE_MINORS;
+ strlcpy(task->zt_name1, name, sizeof (task->zt_name1));
+ task->zt_value = B_TRUE;
id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
if ((async == B_FALSE) && (id != TASKQID_INVALID))
taskq_wait_id(spa->spa_zvol_taskq, id);
@@ -2022,10 +2062,10 @@ zvol_rename_minors(spa_t *spa, const char *name1, const char *name2,
zvol_task_t *task;
taskqid_t id;
- task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL);
- if (task == NULL)
- return;
-
+ task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
+ task->zt_op = ZVOL_ASYNC_RENAME_MINORS;
+ strlcpy(task->zt_name1, name1, sizeof (task->zt_name1));
+ strlcpy(task->zt_name2, name2, sizeof (task->zt_name2));
id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
if ((async == B_FALSE) && (id != TASKQID_INVALID))
taskq_wait_id(spa->spa_zvol_taskq, id);
@@ -2131,20 +2171,12 @@ zvol_fini_impl(void)
zvol_remove_minors_impl(NULL);
- /*
- * The call to "zvol_remove_minors_impl" may dispatch entries to
- * the system_taskq, but it doesn't wait for those entries to
- * complete before it returns. Thus, we must wait for all of the
- * removals to finish, before we can continue.
- */
- taskq_wait_outstanding(system_taskq, 0);
-
kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
list_destroy(&zvol_state_list);
rw_destroy(&zvol_state_lock);
if (ztqs->tqs_taskq == NULL) {
- ASSERT3U(ztqs->tqs_cnt, ==, 0);
+ ASSERT0(ztqs->tqs_cnt);
} else {
for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);