Diffstat (limited to 'sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c')
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c  432
1 file changed, 175 insertions, 257 deletions
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index c8a04539258f..967a018640e1 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
- * Copyright (c) 2024, Klara, Inc.
+ * Copyright (c) 2024, 2025, Klara, Inc.
*/
#include <sys/dataset_kstats.h>
@@ -51,21 +51,12 @@ static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
struct request *rq, boolean_t force_sync);
static unsigned int zvol_major = ZVOL_MAJOR;
-static unsigned int zvol_request_sync = 0;
-static unsigned int zvol_prefetch_bytes = (128 * 1024);
static unsigned long zvol_max_discard_blocks = 16384;
-/*
- * Switch taskq at multiple of 512 MB offset. This can be set to a lower value
- * to utilize more threads for small files but may affect prefetch hits.
- */
-#define ZVOL_TASKQ_OFFSET_SHIFT 29
-
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static unsigned int zvol_open_timeout_ms = 1000;
#endif
-static unsigned int zvol_threads = 0;
static unsigned int zvol_blk_mq_threads = 0;
static unsigned int zvol_blk_mq_actual_threads;
static boolean_t zvol_use_blk_mq = B_FALSE;
@@ -82,8 +73,6 @@ static boolean_t zvol_use_blk_mq = B_FALSE;
*/
static unsigned int zvol_blk_mq_blocks_per_thread = 8;
-static unsigned int zvol_num_taskqs = 0;
-
#ifndef BLKDEV_DEFAULT_RQ
/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
#define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
@@ -95,8 +84,9 @@ static unsigned int zvol_num_taskqs = 0;
static inline void
zvol_end_io(struct bio *bio, struct request *rq, int error)
{
+ ASSERT3U(error, >=, 0);
if (bio) {
- bio->bi_status = errno_to_bi_status(-error);
+ bio->bi_status = errno_to_bi_status(error);
bio_endio(bio);
} else {
blk_mq_end_request(rq, errno_to_bi_status(error));
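
For context, this hunk settles the sign convention at the completion boundary: zvol_end_io() now asserts a non-negative error, i.e. callers pass the positive errno that ZFS uses internally, and errno_to_bi_status() does the translation for both the bio and blk-mq paths. A minimal sketch of a conforming (hypothetical) call site:

static void
example_complete(struct bio *bio, struct request *rq)
{
	/* Positive errno inside OpenZFS; 0 means success. */
	int error = SET_ERROR(EIO);

	/* No negation at the call site any more. */
	zvol_end_io(bio, rq, error);
}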
@@ -117,45 +107,8 @@ struct zvol_state_os {
boolean_t use_blk_mq;
};
-typedef struct zv_taskq {
- uint_t tqs_cnt;
- taskq_t **tqs_taskq;
-} zv_taskq_t;
-static zv_taskq_t zvol_taskqs;
static struct ida zvol_ida;
-typedef struct zv_request_stack {
- zvol_state_t *zv;
- struct bio *bio;
- struct request *rq;
-} zv_request_t;
-
-typedef struct zv_work {
- struct request *rq;
- struct work_struct work;
-} zv_work_t;
-
-typedef struct zv_request_task {
- zv_request_t zvr;
- taskq_ent_t ent;
-} zv_request_task_t;
-
-static zv_request_task_t *
-zv_request_task_create(zv_request_t zvr)
-{
- zv_request_task_t *task;
- task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
- taskq_init_ent(&task->ent);
- task->zvr = zvr;
- return (task);
-}
-
-static void
-zv_request_task_free(zv_request_task_t *task)
-{
- kmem_free(task, sizeof (*task));
-}
-
/*
* This is called when a new block multiqueue request comes in. A request
* contains one or more BIOs.
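
As a small illustration of that comment (assumed stock kernel API, not part of this patch): a blk-mq request chains its BIOs through bi_next, and the __rq_for_each_bio() iterator from the block headers walks them.

static void
example_count_bios(struct request *rq)
{
	struct bio *bio;
	unsigned int n = 0;

	/* Walk the bio chain hanging off this request. */
	__rq_for_each_bio(bio, rq)
		n++;

	pr_debug("request %p carries %u bio(s)\n", rq, n);
}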
@@ -256,8 +209,14 @@ zvol_write(zv_request_t *zvr)
disk = zv->zv_zso->zvo_disk;
/* bio marked as FLUSH need to flush before write */
- if (io_is_flush(bio, rq))
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (io_is_flush(bio, rq)) {
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (error != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ zvol_end_io(bio, rq, -error);
+ return;
+ }
+ }
/* Some requests are just for flush and nothing else. */
if (io_size(bio, rq) == 0) {
@@ -305,7 +264,8 @@ zvol_write(zv_request_t *zvr)
dmu_tx_abort(tx);
break;
}
- error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
+ error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
+ DMU_READ_PREFETCH);
if (error == 0) {
zvol_log_write(zv, tx, off, bytes, sync);
}
@@ -320,8 +280,8 @@ zvol_write(zv_request_t *zvr)
dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
task_io_account_write(nwritten);
- if (sync)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (error == 0 && sync)
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
rw_exit(&zv->zv_suspend_lock);
@@ -329,7 +289,7 @@ zvol_write(zv_request_t *zvr)
blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
}
- zvol_end_io(bio, rq, -error);
+ zvol_end_io(bio, rq, error);
}
static void
@@ -408,7 +368,7 @@ zvol_discard(zv_request_t *zvr)
zfs_rangelock_exit(lr);
if (error == 0 && sync)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
unlock:
rw_exit(&zv->zv_suspend_lock);
@@ -418,7 +378,7 @@ unlock:
start_time);
}
- zvol_end_io(bio, rq, -error);
+ zvol_end_io(bio, rq, error);
}
static void
@@ -475,7 +435,8 @@ zvol_read(zv_request_t *zvr)
if (bytes > volsize - uio.uio_loffset)
bytes = volsize - uio.uio_loffset;
- error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
+ error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
+ DMU_READ_PREFETCH);
if (error) {
/* convert checksum errors into IO errors */
if (error == ECKSUM)
@@ -495,7 +456,7 @@ zvol_read(zv_request_t *zvr)
blk_generic_end_io_acct(q, disk, READ, bio, start_time);
}
- zvol_end_io(bio, rq, -error);
+ zvol_end_io(bio, rq, error);
}
static void
@@ -523,10 +484,31 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
fstrans_cookie_t cookie = spl_fstrans_mark();
uint64_t offset = io_offset(bio, rq);
uint64_t size = io_size(bio, rq);
- int rw = io_data_dir(bio, rq);
+ int rw;
+
+ if (rq != NULL) {
+ /*
+ * Flush & trim requests go down the zvol_write codepath. Or
+ * more specifically:
+ *
+ * If request is a write, or if it's op_is_sync() and not a
+ * read, or if it's a flush, or if it's a discard, then send the
+ * request down the write path.
+ */
+ if (op_is_write(rq->cmd_flags) ||
+ (op_is_sync(rq->cmd_flags) && req_op(rq) != REQ_OP_READ) ||
+ req_op(rq) == REQ_OP_FLUSH ||
+ op_is_discard(rq->cmd_flags)) {
+ rw = WRITE;
+ } else {
+ rw = READ;
+ }
+ } else {
+ rw = bio_data_dir(bio);
+ }
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
- zvol_end_io(bio, rq, -SET_ERROR(ENXIO));
+ zvol_end_io(bio, rq, SET_ERROR(ENXIO));
goto out;
}
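
Restated as a standalone predicate (hypothetical helper, mirroring the logic above): any request that mutates data or imposes ordering is routed to the write path.

static inline boolean_t
example_rq_is_write_path(struct request *rq)
{
	/*
	 * Writes, sync non-reads, flushes and discards all go down
	 * the zvol_write codepath.
	 */
	return (op_is_write(rq->cmd_flags) ||
	    (op_is_sync(rq->cmd_flags) && req_op(rq) != REQ_OP_READ) ||
	    req_op(rq) == REQ_OP_FLUSH ||
	    op_is_discard(rq->cmd_flags));
}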
@@ -545,7 +527,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
(long long unsigned)offset,
(long unsigned)size);
- zvol_end_io(bio, rq, -SET_ERROR(EIO));
+ zvol_end_io(bio, rq, SET_ERROR(EIO));
goto out;
}
@@ -558,8 +540,8 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
#ifdef HAVE_BLK_MQ_RQ_HCTX
blk_mq_hw_queue = rq->mq_hctx->queue_num;
#else
- blk_mq_hw_queue =
- rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
+ blk_mq_hw_queue = rq->q->queue_hw_ctx[
+ rq->q->mq_map[raw_smp_processor_id()]]->queue_num;
#endif
taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
blk_mq_hw_queue);
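
The cityhash3() value keys a taskq so a given (zvol, offset window, hw queue) triple stays on one queue. A sketch of how such a hash is presumably consumed (the taskq array itself moved out of this file in this change, so the helper below is hypothetical):

static taskq_t *
example_pick_taskq(taskq_t **tqs, uint_t cnt, uint64_t taskq_hash)
{
	/*
	 * Sticky mapping: equal hashes always land on the same taskq,
	 * which preserves per-extent ordering and cache locality.
	 */
	return (tqs[taskq_hash % cnt]);
}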
@@ -567,7 +549,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
if (rw == WRITE) {
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
- zvol_end_io(bio, rq, -SET_ERROR(EROFS));
+ zvol_end_io(bio, rq, SET_ERROR(EROFS));
goto out;
}
@@ -718,28 +700,19 @@ zvol_open(struct block_device *bdev, fmode_t flag)
retry:
#endif
- rw_enter(&zvol_state_lock, RW_READER);
- /*
- * Obtain a copy of private_data under the zvol_state_lock to make
- * sure that either the result of zvol free code path setting
- * disk->private_data to NULL is observed, or zvol_os_free()
- * is not called on this zv because of the positive zv_open_count.
- */
+
#ifdef HAVE_BLK_MODE_T
- zv = disk->private_data;
+ zv = atomic_load_ptr(&disk->private_data);
#else
- zv = bdev->bd_disk->private_data;
+ zv = atomic_load_ptr(&bdev->bd_disk->private_data);
#endif
if (zv == NULL) {
- rw_exit(&zvol_state_lock);
return (-SET_ERROR(ENXIO));
}
mutex_enter(&zv->zv_state_lock);
-
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
mutex_exit(&zv->zv_state_lock);
- rw_exit(&zvol_state_lock);
return (-SET_ERROR(ENXIO));
}
@@ -751,8 +724,28 @@ retry:
if (zv->zv_open_count == 0) {
if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
mutex_exit(&zv->zv_state_lock);
+
+ /*
+ * Removal may happen while the locks are down, so
+ * we can't trust zv any longer; we have to start over.
+ */
+#ifdef HAVE_BLK_MODE_T
+ zv = atomic_load_ptr(&disk->private_data);
+#else
+ zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+#endif
+ if (zv == NULL)
+ return (-SET_ERROR(ENXIO));
+
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);
+ return (-SET_ERROR(ENXIO));
+ }
+
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
@@ -763,7 +756,6 @@ retry:
drop_suspend = B_TRUE;
}
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
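
The pattern above condenses to: whenever both locks were dropped, private_data must be re-read and ZVOL_REMOVING re-checked, since the zvol may have been torn down in the window. A hypothetical helper capturing just that shape:

static zvol_state_t *
example_revalidate(struct gendisk *disk)
{
	zvol_state_t *zv = atomic_load_ptr(&disk->private_data);

	if (zv == NULL)
		return (NULL);		/* already torn down */

	rw_enter(&zv->zv_suspend_lock, RW_READER);
	mutex_enter(&zv->zv_state_lock);

	if (zv->zv_flags & ZVOL_REMOVING) {
		mutex_exit(&zv->zv_state_lock);
		rw_exit(&zv->zv_suspend_lock);
		return (NULL);		/* removal began while unlocked */
	}

	return (zv);	/* both locks held on success */
}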
@@ -860,11 +852,11 @@ zvol_release(struct gendisk *disk, fmode_t unused)
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
(void) unused;
#endif
- zvol_state_t *zv;
boolean_t drop_suspend = B_TRUE;
- rw_enter(&zvol_state_lock, RW_READER);
- zv = disk->private_data;
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
+ if (zv == NULL)
+ return;
mutex_enter(&zv->zv_state_lock);
ASSERT3U(zv->zv_open_count, >, 0);
@@ -878,6 +870,15 @@ zvol_release(struct gendisk *disk, fmode_t unused)
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ /*
+ * Unlike in zvol_open(), we don't check if removal
+ * started here, because we might be one of the openers
+ * that needs to be thrown out! If we're the last, we
+ * need to call zvol_last_close() below to finish
+ * cleanup. So, no special treatment for us.
+ */
+
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 1) {
rw_exit(&zv->zv_suspend_lock);
@@ -887,7 +888,6 @@ zvol_release(struct gendisk *disk, fmode_t unused)
} else {
drop_suspend = B_FALSE;
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -907,9 +907,10 @@ static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
- zvol_state_t *zv = bdev->bd_disk->private_data;
int error = 0;
+ zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+ ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
switch (cmd) {
@@ -932,16 +933,18 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode,
case BLKZNAME:
mutex_enter(&zv->zv_state_lock);
- error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
+ error = -copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
mutex_exit(&zv->zv_state_lock);
+ if (error)
+ error = SET_ERROR(error);
break;
default:
- error = -ENOTTY;
+ error = SET_ERROR(ENOTTY);
break;
}
- return (SET_ERROR(error));
+ return (-error);
}
#ifdef CONFIG_COMPAT
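
The ioctl hunk illustrates the convention this patch pushes throughout the file: errors stay positive (and SET_ERROR()-tagged) inside ZFS and are negated exactly once, at the return into the Linux block layer. A minimal sketch (hypothetical function):

static int
example_ioctl(unsigned int cmd)
{
	int error = 0;

	if (cmd == 0xdead)			/* placeholder command */
		error = SET_ERROR(ENOTTY);	/* positive inside ZFS */

	return (-error);	/* negative at the kernel boundary */
}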
@@ -960,9 +963,8 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
unsigned int mask = 0;
- rw_enter(&zvol_state_lock, RW_READER);
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
- zvol_state_t *zv = disk->private_data;
if (zv != NULL) {
mutex_enter(&zv->zv_state_lock);
mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
@@ -970,17 +972,14 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing)
mutex_exit(&zv->zv_state_lock);
}
- rw_exit(&zvol_state_lock);
-
return (mask);
}
static int
zvol_revalidate_disk(struct gendisk *disk)
{
- rw_enter(&zvol_state_lock, RW_READER);
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
- zvol_state_t *zv = disk->private_data;
if (zv != NULL) {
mutex_enter(&zv->zv_state_lock);
set_capacity(zv->zv_zso->zvo_disk,
@@ -988,8 +987,6 @@ zvol_revalidate_disk(struct gendisk *disk)
mutex_exit(&zv->zv_state_lock);
}
- rw_exit(&zvol_state_lock);
-
return (0);
}
@@ -1008,16 +1005,6 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
return (0);
}
-void
-zvol_os_clear_private(zvol_state_t *zv)
-{
- /*
- * Cleared while holding zvol_state_lock as a writer
- * which will prevent zvol_open() from opening it.
- */
- zv->zv_zso->zvo_disk->private_data = NULL;
-}
-
/*
* Provide a simple virtual geometry for legacy compatibility. For devices
* smaller than 1 MiB a small head and sector count is used to allow very
@@ -1027,9 +1014,10 @@ zvol_os_clear_private(zvol_state_t *zv)
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
- zvol_state_t *zv = bdev->bd_disk->private_data;
sector_t sectors;
+ zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+ ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
sectors = get_capacity(zv->zv_zso->zvo_disk);
@@ -1348,27 +1336,30 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
* Allocate memory for a new zvol_state_t and setup the required
* request queue and generic disk structures for the block device.
*/
-static zvol_state_t *
-zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
+static int
+zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize,
+ zvol_state_t **zvp)
{
zvol_state_t *zv;
struct zvol_state_os *zso;
uint64_t volmode;
int ret;
- if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
- return (NULL);
+ ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL);
+ if (ret)
+ return (ret);
if (volmode == ZFS_VOLMODE_DEFAULT)
volmode = zvol_volmode;
if (volmode == ZFS_VOLMODE_NONE)
- return (NULL);
+ return (0);
zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
zv->zv_zso = zso;
zv->zv_volmode = volmode;
+ zv->zv_volsize = volsize;
zv->zv_volblocksize = volblocksize;
list_link_init(&zv->zv_next);
@@ -1397,13 +1388,15 @@ zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
*/
if (zv->zv_zso->use_blk_mq) {
ret = zvol_alloc_blk_mq(zv, &limits);
+ if (ret != 0)
+ goto out_kmem;
zso->zvo_disk->fops = &zvol_ops_blk_mq;
} else {
ret = zvol_alloc_non_blk_mq(zso, &limits);
+ if (ret != 0)
+ goto out_kmem;
zso->zvo_disk->fops = &zvol_ops;
}
- if (ret != 0)
- goto out_kmem;
/* Limit read-ahead to a single page to prevent over-prefetching. */
blk_queue_set_read_ahead(zso->zvo_queue, 1);
@@ -1440,61 +1433,79 @@ zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
ZVOL_DEV_NAME, (dev & MINORMASK));
- return (zv);
+ *zvp = zv;
+ return (ret);
out_kmem:
kmem_free(zso, sizeof (struct zvol_state_os));
kmem_free(zv, sizeof (zvol_state_t));
- return (NULL);
+ return (ret);
}
-/*
- * Cleanup then free a zvol_state_t which was created by zvol_alloc().
- * At this time, the structure is not opened by anyone, is taken off
- * the zvol_state_list, and has its private data set to NULL.
- * The zvol_state_lock is dropped.
- *
- * This function may take many milliseconds to complete (e.g. we've seen
- * it take over 256ms), due to the calls to "blk_cleanup_queue" and
- * "del_gendisk". Thus, consumers need to be careful to account for this
- * latency when calling this function.
- */
void
-zvol_os_free(zvol_state_t *zv)
+zvol_os_remove_minor(zvol_state_t *zv)
{
-
- ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
- ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
ASSERT0(zv->zv_open_count);
- ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);
+ ASSERT0(atomic_read(&zv->zv_suspend_ref));
+ ASSERT(zv->zv_flags & ZVOL_REMOVING);
- rw_destroy(&zv->zv_suspend_lock);
- zfs_rangelock_fini(&zv->zv_rangelock);
+ struct zvol_state_os *zso = zv->zv_zso;
+ zv->zv_zso = NULL;
+
+ /* Clearing private_data will make new callers return immediately. */
+ atomic_store_ptr(&zso->zvo_disk->private_data, NULL);
+
+ /*
+ * Drop the state lock before calling del_gendisk(). There may be
+ * callers waiting to acquire it, but del_gendisk() will block until
+ * they exit, which would deadlock.
+ */
+ mutex_exit(&zv->zv_state_lock);
- del_gendisk(zv->zv_zso->zvo_disk);
+ del_gendisk(zso->zvo_disk);
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
#if defined(HAVE_BLK_CLEANUP_DISK)
- blk_cleanup_disk(zv->zv_zso->zvo_disk);
+ blk_cleanup_disk(zso->zvo_disk);
#else
- put_disk(zv->zv_zso->zvo_disk);
+ put_disk(zso->zvo_disk);
#endif
#else
- blk_cleanup_queue(zv->zv_zso->zvo_queue);
- put_disk(zv->zv_zso->zvo_disk);
+ blk_cleanup_queue(zso->zvo_queue);
+ put_disk(zso->zvo_disk);
#endif
- if (zv->zv_zso->use_blk_mq)
- blk_mq_free_tag_set(&zv->zv_zso->tag_set);
+ if (zso->use_blk_mq)
+ blk_mq_free_tag_set(&zso->tag_set);
- ida_simple_remove(&zvol_ida,
- MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
+ ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);
+
+ kmem_free(zso, sizeof (struct zvol_state_os));
+
+ mutex_enter(&zv->zv_state_lock);
+}
+
+void
+zvol_os_free(zvol_state_t *zv)
+{
+
+ ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT0(zv->zv_open_count);
+ ASSERT0P(zv->zv_zso);
+
+ ASSERT0P(zv->zv_objset);
+ ASSERT0P(zv->zv_zilog);
+ ASSERT0P(zv->zv_dn);
+
+ rw_destroy(&zv->zv_suspend_lock);
+ zfs_rangelock_fini(&zv->zv_rangelock);
cv_destroy(&zv->zv_removing_cv);
mutex_destroy(&zv->zv_state_lock);
dataset_kstats_destroy(&zv->zv_kstat);
- kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
kmem_free(zv, sizeof (zvol_state_t));
}
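
From the assertions above, the intended teardown order appears to be: zvol_os_remove_minor() is entered and exited with zv_state_lock held (dropping it internally around del_gendisk()), and only afterwards does the caller hand the stripped zvol_state_t to zvol_os_free() with no locks held. A hypothetical caller sketch (the real one lives in the common zvol code):

static void
example_teardown(zvol_state_t *zv)
{
	mutex_enter(&zv->zv_state_lock);
	zvol_os_remove_minor(zv);	/* zv_zso freed; gendisk gone */
	mutex_exit(&zv->zv_state_lock);

	zvol_os_free(zv);		/* expects zv_zso == NULL */
}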
@@ -1514,7 +1525,9 @@ __zvol_os_add_disk(struct gendisk *disk)
{
int error = 0;
#ifdef HAVE_ADD_DISK_RET
- error = add_disk(disk);
+ error = -add_disk(disk);
+ if (error)
+ error = SET_ERROR(error);
#else
add_disk(disk);
#endif
@@ -1606,7 +1619,7 @@ zvol_os_add_disk(struct gendisk *disk)
int
zvol_os_create_minor(const char *name)
{
- zvol_state_t *zv;
+ zvol_state_t *zv = NULL;
objset_t *os;
dmu_object_info_t *doi;
uint64_t volsize;
@@ -1655,18 +1668,16 @@ zvol_os_create_minor(const char *name)
if (error)
goto out_dmu_objset_disown;
- zv = zvol_alloc(MKDEV(zvol_major, minor), name,
- doi->doi_data_block_size);
- if (zv == NULL) {
- error = SET_ERROR(EAGAIN);
+ error = zvol_alloc(MKDEV(zvol_major, minor), name,
+ volsize, doi->doi_data_block_size, &zv);
+ if (error || zv == NULL)
goto out_dmu_objset_disown;
- }
+
zv->zv_hash = hash;
if (dmu_objset_is_snapshot(os))
zv->zv_flags |= ZVOL_RDONLY;
- zv->zv_volsize = volsize;
zv->zv_objset = os;
/* Default */
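
The call site above encodes the new zvol_alloc() contract: a nonzero return is a hard failure, while a zero return with *zvp left NULL means volmode=none, i.e. no device should be created and the caller succeeds without a minor. A caller-side sketch (hypothetical wrapper) under that reading:

static int
example_create(dev_t dev, const char *name, uint64_t volsize,
    uint64_t volblocksize)
{
	zvol_state_t *zv = NULL;
	int error;

	error = zvol_alloc(dev, name, volsize, volblocksize, &zv);
	if (error != 0)
		return (error);		/* hard failure */
	if (zv == NULL)
		return (0);		/* volmode=none: nothing to do */

	/* ... continue minor setup with zv ... */
	return (0);
}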
@@ -1691,11 +1702,11 @@ zvol_os_create_minor(const char *name)
blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif
- ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+ ASSERT0P(zv->zv_kstat.dk_kstats);
error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
if (error)
goto out_dmu_objset_disown;
- ASSERT3P(zv->zv_zilog, ==, NULL);
+ ASSERT0P(zv->zv_zilog);
zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
if (spa_writeable(dmu_objset_spa(os))) {
if (zil_replay_disable)
@@ -1733,7 +1744,7 @@ out_doi:
* zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
* directly as well.
*/
- if (error == 0) {
+ if (error == 0 && zv) {
rw_enter(&zvol_state_lock, RW_WRITER);
zvol_insert(zv);
rw_exit(&zvol_state_lock);
@@ -1745,7 +1756,7 @@ out_doi:
return (error);
}
-void
+int
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
@@ -1772,6 +1783,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
set_disk_ro(zv->zv_zso->zvo_disk, readonly);
dataset_kstats_rename(&zv->zv_kstat, newname);
+
+ return (0);
}
void
@@ -1793,61 +1806,16 @@ zvol_init(void)
{
int error;
- /*
- * zvol_threads is the module param the user passes in.
- *
- * zvol_actual_threads is what we use internally, since the user can
- * pass zvol_thread = 0 to mean "use all the CPUs" (the default).
- */
- static unsigned int zvol_actual_threads;
-
- if (zvol_threads == 0) {
- /*
- * See dde9380a1 for why 32 was chosen here. This should
- * probably be refined to be some multiple of the number
- * of CPUs.
- */
- zvol_actual_threads = MAX(num_online_cpus(), 32);
- } else {
- zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
+ error = zvol_init_impl();
+ if (error) {
+ printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error);
+ return (error);
}
- /*
- * Use atleast 32 zvol_threads but for many core system,
- * prefer 6 threads per taskq, but no more taskqs
- * than threads in them on large systems.
- *
- * taskq total
- * cpus taskqs threads threads
- * ------- ------- ------- -------
- * 1 1 32 32
- * 2 1 32 32
- * 4 1 32 32
- * 8 2 16 32
- * 16 3 11 33
- * 32 5 7 35
- * 64 8 8 64
- * 128 11 12 132
- * 256 16 16 256
- */
- zv_taskq_t *ztqs = &zvol_taskqs;
- uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
- if (num_tqs == 0) {
- num_tqs = 1 + num_online_cpus() / 6;
- while (num_tqs * num_tqs > zvol_actual_threads)
- num_tqs--;
- }
- uint_t per_tq_thread = zvol_actual_threads / num_tqs;
- if (per_tq_thread * num_tqs < zvol_actual_threads)
- per_tq_thread++;
- ztqs->tqs_cnt = num_tqs;
- ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
- error = register_blkdev(zvol_major, ZVOL_DRIVER);
+ error = -register_blkdev(zvol_major, ZVOL_DRIVER);
if (error) {
- kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
- ztqs->tqs_taskq = NULL;
printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
- return (error);
+ return (SET_ERROR(error));
}
if (zvol_blk_mq_queue_depth == 0) {
@@ -1864,25 +1832,6 @@ zvol_init(void)
1024);
}
- for (uint_t i = 0; i < num_tqs; i++) {
- char name[32];
- (void) snprintf(name, sizeof (name), "%s_tq-%u",
- ZVOL_DRIVER, i);
- ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
- maxclsyspri, per_tq_thread, INT_MAX,
- TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
- if (ztqs->tqs_taskq[i] == NULL) {
- for (int j = i - 1; j >= 0; j--)
- taskq_destroy(ztqs->tqs_taskq[j]);
- unregister_blkdev(zvol_major, ZVOL_DRIVER);
- kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
- sizeof (taskq_t *));
- ztqs->tqs_taskq = NULL;
- return (-ENOMEM);
- }
- }
-
- zvol_init_impl();
ida_init(&zvol_ida);
return (0);
}
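
For orientation, the resulting ordering, assembled from this hunk and the zvol_fini() hunk below (assumed pairing; error paths elided):

/*
 * zvol_init():  zvol_init_impl() -> register_blkdev() -> ida_init()
 * zvol_fini():  unregister_blkdev() -> zvol_fini_impl() -> ida_destroy()
 *
 * i.e. the block major is released before the common state it serves.
 */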
@@ -1890,50 +1839,19 @@ zvol_init(void)
void
zvol_fini(void)
{
- zv_taskq_t *ztqs = &zvol_taskqs;
- zvol_fini_impl();
unregister_blkdev(zvol_major, ZVOL_DRIVER);
- if (ztqs->tqs_taskq == NULL) {
- ASSERT3U(ztqs->tqs_cnt, ==, 0);
- } else {
- for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
- ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
- taskq_destroy(ztqs->tqs_taskq[i]);
- }
- kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
- sizeof (taskq_t *));
- ztqs->tqs_taskq = NULL;
- }
+ zvol_fini_impl();
ida_destroy(&zvol_ida);
}
-module_param(zvol_inhibit_dev, uint, 0644);
-MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
-
module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
-module_param(zvol_threads, uint, 0444);
-MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set"
- "to 0 to use all active CPUs");
-
-module_param(zvol_request_sync, uint, 0644);
-MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
-
module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
-module_param(zvol_num_taskqs, uint, 0444);
-MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");
-
-module_param(zvol_prefetch_bytes, uint, 0644);
-MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
-
-module_param(zvol_volmode, uint, 0644);
-MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
-
module_param(zvol_blk_mq_queue_depth, uint, 0644);
MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");