Diffstat (limited to 'sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c')
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c  432
1 file changed, 175 insertions, 257 deletions
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index c8a04539258f..967a018640e1 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
- * Copyright (c) 2024, Klara, Inc.
+ * Copyright (c) 2024, 2025, Klara, Inc.
*/
#include <sys/dataset_kstats.h>
@@ -51,21 +51,12 @@ static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
struct request *rq, boolean_t force_sync);
static unsigned int zvol_major = ZVOL_MAJOR;
-static unsigned int zvol_request_sync = 0;
-static unsigned int zvol_prefetch_bytes = (128 * 1024);
static unsigned long zvol_max_discard_blocks = 16384;
-/*
- * Switch taskq at multiple of 512 MB offset. This can be set to a lower value
- * to utilize more threads for small files but may affect prefetch hits.
- */
-#define ZVOL_TASKQ_OFFSET_SHIFT 29
-
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static unsigned int zvol_open_timeout_ms = 1000;
#endif
-static unsigned int zvol_threads = 0;
static unsigned int zvol_blk_mq_threads = 0;
static unsigned int zvol_blk_mq_actual_threads;
static boolean_t zvol_use_blk_mq = B_FALSE;
@@ -82,8 +73,6 @@ static boolean_t zvol_use_blk_mq = B_FALSE;
*/
static unsigned int zvol_blk_mq_blocks_per_thread = 8;
-static unsigned int zvol_num_taskqs = 0;
-
#ifndef BLKDEV_DEFAULT_RQ
/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
#define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
@@ -95,8 +84,9 @@ static unsigned int zvol_num_taskqs = 0;
static inline void
zvol_end_io(struct bio *bio, struct request *rq, int error)
{
+ ASSERT3U(error, >=, 0);
if (bio) {
- bio->bi_status = errno_to_bi_status(-error);
+ bio->bi_status = errno_to_bi_status(error);
bio_endio(bio);
} else {
blk_mq_end_request(rq, errno_to_bi_status(error));
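
For context, this hunk settles the sign convention at the completion boundary: zvol_end_io() now asserts a non-negative error, i.e. callers pass the positive errno that ZFS uses internally, and errno_to_bi_status() does the translation for both the bio and blk-mq paths. A minimal sketch of a conforming (hypothetical) call site:

static void
example_complete(struct bio *bio, struct request *rq)
{
	/* Positive errno inside OpenZFS; 0 means success. */
	int error = SET_ERROR(EIO);

	/* No negation at the call site any more. */
	zvol_end_io(bio, rq, error);
}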
@@ -117,45 +107,8 @@ struct zvol_state_os {
boolean_t use_blk_mq;
};
-typedef struct zv_taskq {
- uint_t tqs_cnt;
- taskq_t **tqs_taskq;
-} zv_taskq_t;
-static zv_taskq_t zvol_taskqs;
static struct ida zvol_ida;
-typedef struct zv_request_stack {
- zvol_state_t *zv;
- struct bio *bio;
- struct request *rq;
-} zv_request_t;
-
-typedef struct zv_work {
- struct request *rq;
- struct work_struct work;
-} zv_work_t;
-
-typedef struct zv_request_task {
- zv_request_t zvr;
- taskq_ent_t ent;
-} zv_request_task_t;
-
-static zv_request_task_t *
-zv_request_task_create(zv_request_t zvr)
-{
- zv_request_task_t *task;
- task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
- taskq_init_ent(&task->ent);
- task->zvr = zvr;
- return (task);
-}
-
-static void
-zv_request_task_free(zv_request_task_t *task)
-{
- kmem_free(task, sizeof (*task));
-}
-
/*
* This is called when a new block multiqueue request comes in. A request
* contains one or more BIOs.
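
As a small illustration of that comment (assumed stock kernel API, not part of this patch): a blk-mq request chains its BIOs through bi_next, and the __rq_for_each_bio() iterator from the block headers walks them.

static void
example_count_bios(struct request *rq)
{
	struct bio *bio;
	unsigned int n = 0;

	/* Walk the bio chain hanging off this request. */
	__rq_for_each_bio(bio, rq)
		n++;

	pr_debug("request %p carries %u bio(s)\n", rq, n);
}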
@@ -256,8 +209,14 @@ zvol_write(zv_request_t *zvr)
disk = zv->zv_zso->zvo_disk;
/* bio marked as FLUSH need to flush before write */
- if (io_is_flush(bio, rq))
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (io_is_flush(bio, rq)) {
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (error != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ zvol_end_io(bio, rq, -error);
+ return;
+ }
+ }
/* Some requests are just for flush and nothing else. */
if (io_size(bio, rq) == 0) {
@@ -305,7 +264,8 @@ zvol_write(zv_request_t *zvr)
dmu_tx_abort(tx);
break;
}
- error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
+ error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
+ DMU_READ_PREFETCH);
if (error == 0) {
zvol_log_write(zv, tx, off, bytes, sync);
}
@@ -320,8 +280,8 @@ zvol_write(zv_request_t *zvr)
dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
task_io_account_write(nwritten);
- if (sync)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (error == 0 && sync)
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
rw_exit(&zv->zv_suspend_lock);
@@ -329,7 +289,7 @@ zvol_write(zv_request_t *zvr)
blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
}
- zvol_end_io(bio, rq, -error);
+ zvol_end_io(bio, rq, error);
}
static void
@@ -408,7 +368,7 @@ zvol_discard(zv_request_t *zvr)
zfs_rangelock_exit(lr);
if (error == 0 && sync)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
unlock:
rw_exit(&zv->zv_suspend_lock);
@@ -418,7 +378,7 @@ unlock:
start_time);
}
- zvol_end_io(bio, rq, -error);
+ zvol_end_io(bio, rq, error);
}
static void
@@ -475,7 +435,8 @@ zvol_read(zv_request_t *zvr)
if (bytes > volsize - uio.uio_loffset)
bytes = volsize - uio.uio_loffset;
- error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
+ error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
+ DMU_READ_PREFETCH);
if (error) {
/* convert checksum errors into IO errors */
if (error == ECKSUM)
@@ -495,7 +456,7 @@ zvol_read(zv_request_t *zvr)
blk_generic_end_io_acct(q, disk, READ, bio, start_time);
}
- zvol_end_io(bio, rq, -error);
+ zvol_end_io(bio, rq, error);
}
static void
@@ -523,10 +484,31 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
fstrans_cookie_t cookie = spl_fstrans_mark();
uint64_t offset = io_offset(bio, rq);
uint64_t size = io_size(bio, rq);
- int rw = io_data_dir(bio, rq);
+ int rw;
+
+ if (rq != NULL) {
+ /*
+ * Flush & trim requests go down the zvol_write codepath. Or
+ * more specifically:
+ *
+ * If request is a write, or if it's op_is_sync() and not a
+ * read, or if it's a flush, or if it's a discard, then send the
+ * request down the write path.
+ */
+ if (op_is_write(rq->cmd_flags) ||
+ (op_is_sync(rq->cmd_flags) && req_op(rq) != REQ_OP_READ) ||
+ req_op(rq) == REQ_OP_FLUSH ||
+ op_is_discard(rq->cmd_flags)) {
+ rw = WRITE;
+ } else {
+ rw = READ;
+ }
+ } else {
+ rw = bio_data_dir(bio);
+ }
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
- zvol_end_io(bio, rq, -SET_ERROR(ENXIO));
+ zvol_end_io(bio, rq, SET_ERROR(ENXIO));
goto out;
}
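
Restated as a standalone predicate (hypothetical helper, mirroring the logic above): any request that mutates data or imposes ordering is routed to the write path.

static inline boolean_t
example_rq_is_write_path(struct request *rq)
{
	/*
	 * Writes, sync non-reads, flushes and discards all go down
	 * the zvol_write codepath.
	 */
	return (op_is_write(rq->cmd_flags) ||
	    (op_is_sync(rq->cmd_flags) && req_op(rq) != REQ_OP_READ) ||
	    req_op(rq) == REQ_OP_FLUSH ||
	    op_is_discard(rq->cmd_flags));
}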
@@ -545,7 +527,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
(long long unsigned)offset,
(long unsigned)size);
- zvol_end_io(bio, rq, -SET_ERROR(EIO));
+ zvol_end_io(bio, rq, SET_ERROR(EIO));
goto out;
}
@@ -558,8 +540,8 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
#ifdef HAVE_BLK_MQ_RQ_HCTX
blk_mq_hw_queue = rq->mq_hctx->queue_num;
#else
- blk_mq_hw_queue =
- rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
+ blk_mq_hw_queue = rq->q->queue_hw_ctx[
+ rq->q->mq_map[raw_smp_processor_id()]]->queue_num;
#endif
taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
blk_mq_hw_queue);
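
The cityhash3() value keys a taskq so a given (zvol, offset window, hw queue) triple stays on one queue. A sketch of how such a hash is presumably consumed (the taskq array itself moved out of this file in this change, so the helper below is hypothetical):

static taskq_t *
example_pick_taskq(taskq_t **tqs, uint_t cnt, uint64_t taskq_hash)
{
	/*
	 * Sticky mapping: equal hashes always land on the same taskq,
	 * which preserves per-extent ordering and cache locality.
	 */
	return (tqs[taskq_hash % cnt]);
}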
@@ -567,7 +549,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
if (rw == WRITE) {
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
- zvol_end_io(bio, rq, -SET_ERROR(EROFS));
+ zvol_end_io(bio, rq, SET_ERROR(EROFS));
goto out;
}
@@ -718,28 +700,19 @@ zvol_open(struct block_device *bdev, fmode_t flag)
retry:
#endif
- rw_enter(&zvol_state_lock, RW_READER);
- /*
- * Obtain a copy of private_data under the zvol_state_lock to make
- * sure that either the result of zvol free code path setting
- * disk->private_data to NULL is observed, or zvol_os_free()
- * is not called on this zv because of the positive zv_open_count.
- */
+
#ifdef HAVE_BLK_MODE_T
- zv = disk->private_data;
+ zv = atomic_load_ptr(&disk->private_data);
#else
- zv = bdev->bd_disk->private_data;
+ zv = atomic_load_ptr(&bdev->bd_disk->private_data);
#endif
if (zv == NULL) {
- rw_exit(&zvol_state_lock);
return (-SET_ERROR(ENXIO));
}
mutex_enter(&zv->zv_state_lock);
-
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
mutex_exit(&zv->zv_state_lock);
- rw_exit(&zvol_state_lock);
return (-SET_ERROR(ENXIO));
}
@@ -751,8 +724,28 @@ retry:
if (zv->zv_open_count == 0) {
if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
mutex_exit(&zv->zv_state_lock);
+
+ /*
+ * Removal may happen while the locks are down, so
+ * we can't trust zv any longer; we have to start over.
+ */
+#ifdef HAVE_BLK_MODE_T
+ zv = atomic_load_ptr(&disk->private_data);
+#else
+ zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+#endif
+ if (zv == NULL)
+ return (-SET_ERROR(ENXIO));
+
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);
+ return (-SET_ERROR(ENXIO));
+ }
+
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
@@ -763,7 +756,6 @@ retry:
drop_suspend = B_TRUE;
}
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
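
The pattern above condenses to: whenever both locks were dropped, private_data must be re-read and ZVOL_REMOVING re-checked, since the zvol may have been torn down in the window. A hypothetical helper capturing just that shape:

static zvol_state_t *
example_revalidate(struct gendisk *disk)
{
	zvol_state_t *zv = atomic_load_ptr(&disk->private_data);

	if (zv == NULL)
		return (NULL);		/* already torn down */

	rw_enter(&zv->zv_suspend_lock, RW_READER);
	mutex_enter(&zv->zv_state_lock);

	if (zv->zv_flags & ZVOL_REMOVING) {
		mutex_exit(&zv->zv_state_lock);
		rw_exit(&zv->zv_suspend_lock);
		return (NULL);		/* removal began while unlocked */
	}

	return (zv);	/* both locks held on success */
}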
@@ -860,11 +852,11 @@ zvol_release(struct gendisk *disk, fmode_t unused)
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
(void) unused;
#endif
- zvol_state_t *zv;
boolean_t drop_suspend = B_TRUE;
- rw_enter(&zvol_state_lock, RW_READER);
- zv = disk->private_data;
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
+ if (zv == NULL)
+ return;
mutex_enter(&zv->zv_state_lock);
ASSERT3U(zv->zv_open_count, >, 0);
@@ -878,6 +870,15 @@ zvol_release(struct gendisk *disk, fmode_t unused)
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ /*
+ * Unlike in zvol_open(), we don't check if removal
+ * started here, because we might be one of the openers
+ * that needs to be thrown out! If we're the last, we
+ * need to call zvol_last_close() below to finish
+ * cleanup. So, no special treatment for us.
+ */
+
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 1) {
rw_exit(&zv->zv_suspend_lock);
@@ -887,7 +888,6 @@ zvol_release(struct gendisk *disk, fmode_t unused)
} else {
drop_suspend = B_FALSE;
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -907,9 +907,10 @@ static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
- zvol_state_t *zv = bdev->bd_disk->private_data;
int error = 0;
+ zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+ ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
switch (cmd) {
@@ -932,16 +933,18 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode,
case BLKZNAME:
mutex_enter(&zv->zv_state_lock);
- error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
+ error = -copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
mutex_exit(&zv->zv_state_lock);
+ if (error)
+ error = SET_ERROR(error);
break;
default:
- error = -ENOTTY;
+ error = SET_ERROR(ENOTTY);
break;
}
- return (SET_ERROR(error));
+ return (-error);
}
#ifdef CONFIG_COMPAT
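
The ioctl hunk illustrates the convention this patch pushes throughout the file: errors stay positive (and SET_ERROR()-tagged) inside ZFS and are negated exactly once, at the return into the Linux block layer. A minimal sketch (hypothetical function):

static int
example_ioctl(unsigned int cmd)
{
	int error = 0;

	if (cmd == 0xdead)			/* placeholder command */
		error = SET_ERROR(ENOTTY);	/* positive inside ZFS */

	return (-error);	/* negative at the kernel boundary */
}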
@@ -960,9 +963,8 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
unsigned int mask = 0;
- rw_enter(&zvol_state_lock, RW_READER);
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
- zvol_state_t *zv = disk->private_data;
if (zv != NULL) {
mutex_enter(&zv->zv_state_lock);
mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
@@ -970,17 +972,14 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing)
mutex_exit(&zv->zv_state_lock);
}
- rw_exit(&zvol_state_lock);
-
return (mask);
}
static int
zvol_revalidate_disk(struct gendisk *disk)
{
- rw_enter(&zvol_state_lock, RW_READER);
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
- zvol_state_t *zv = disk->private_data;
if (zv != NULL) {
mutex_enter(&zv->zv_state_lock);
set_capacity(zv->zv_zso->zvo_disk,
@@ -988,8 +987,6 @@ zvol_revalidate_disk(struct gendisk *disk)
mutex_exit(&zv->zv_state_lock);
}
- rw_exit(&zvol_state_lock);
-
return (0);
}
@@ -1008,16 +1005,6 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
return (0);
}
-void
-zvol_os_clear_private(zvol_state_t *zv)
-{
- /*
- * Cleared while holding zvol_state_lock as a writer
- * which will prevent zvol_open() from opening it.
- */
- zv->zv_zso->zvo_disk->private_data = NULL;
-}
-
/*
* Provide a simple virtual geometry for legacy compatibility. For devices
* smaller than 1 MiB a small head and sector count is used to allow very
@@ -1027,9 +1014,10 @@ zvol_os_clear_private(zvol_state_t *zv)
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
- zvol_state_t *zv = bdev->bd_disk->private_data;
sector_t sectors;
+ zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+ ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
sectors = get_capacity(zv->zv_zso->zvo_disk);
@@ -1348,27 +1336,30 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
* Allocate memory for a new zvol_state_t and setup the required
* request queue and generic disk structures for the block device.
*/
-static zvol_state_t *
-zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
+static int
+zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize,
+ zvol_state_t **zvp)
{
zvol_state_t *zv;
struct zvol_state_os *zso;
uint64_t volmode;
int ret;
- if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
- return (NULL);
+ ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL);
+ if (ret)
+ return (ret);
if (volmode == ZFS_VOLMODE_DEFAULT)
volmode = zvol_volmode;
if (volmode == ZFS_VOLMODE_NONE)
- return (NULL);
+ return (0);
zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
zv->zv_zso = zso;
zv->zv_volmode = volmode;
+ zv->zv_volsize = volsize;
zv->zv_volblocksize = volblocksize;
list_link_init(&zv->zv_next);
@@ -1397,13 +1388,15 @@ zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
*/
if (zv->zv_zso->use_blk_mq) {
ret = zvol_alloc_blk_mq(zv, &limits);
+ if (ret != 0)
+ goto out_kmem;
zso->zvo_disk->fops = &zvol_ops_blk_mq;
} else {
ret = zvol_alloc_non_blk_mq(zso, &limits);
+ if (ret != 0)
+ goto out_kmem;
zso->zvo_disk->fops = &zvol_ops;
}
- if (ret != 0)
- goto out_kmem;
/* Limit read-ahead to a single page to prevent over-prefetching. */
blk_queue_set_read_ahead(zso->zvo_queue, 1);
@@ -1440,61 +1433,79 @@ zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
ZVOL_DEV_NAME, (dev & MINORMASK));
- return (zv);
+ *zvp = zv;
+ return (ret);
out_kmem:
kmem_free(zso, sizeof (struct zvol_state_os));
kmem_free(zv, sizeof (zvol_state_t));
- return (NULL);
+ return (ret);
}
-/*
- * Cleanup then free a zvol_state_t which was created by zvol_alloc().
- * At this time, the structure is not opened by anyone, is taken off
- * the zvol_state_list, and has its private data set to NULL.
- * The zvol_state_lock is dropped.
- *
- * This function may take many milliseconds to complete (e.g. we've seen
- * it take over 256ms), due to the calls to "blk_cleanup_queue" and
- * "del_gendisk". Thus, consumers need to be careful to account for this
- * latency when calling this function.
- */
void
-zvol_os_free(zvol_state_t *zv)
+zvol_os_remove_minor(zvol_state_t *zv)
{
-
- ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
- ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
ASSERT0(zv->zv_open_count);
- ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);
+ ASSERT0(atomic_read(&zv->zv_suspend_ref));
+ ASSERT(zv->zv_flags & ZVOL_REMOVING);
- rw_destroy(&zv->zv_suspend_lock);
- zfs_rangelock_fini(&zv->zv_rangelock);
+ struct zvol_state_os *zso = zv->zv_zso;
+ zv->zv_zso = NULL;
+
+ /* Clearing private_data will make new callers return immediately. */
+ atomic_store_ptr(&zso->zvo_disk->private_data, NULL);
+
+ /*
+ * Drop the state lock before calling del_gendisk(). There may be
+ * callers waiting to acquire it, but del_gendisk() will block until
+ * they exit, which would deadlock.
+ */
+ mutex_exit(&zv->zv_state_lock);
- del_gendisk(zv->zv_zso->zvo_disk);
+ del_gendisk(zso->zvo_disk);
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
#if defined(HAVE_BLK_CLEANUP_DISK)
- blk_cleanup_disk(zv->zv_zso->zvo_disk);
+ blk_cleanup_disk(zso->zvo_disk);
#else
- put_disk(zv->zv_zso->zvo_disk);
+ put_disk(zso->zvo_disk);
#endif
#else
- blk_cleanup_queue(zv->zv_zso->zvo_queue);
- put_disk(zv->zv_zso->zvo_disk);
+ blk_cleanup_queue(zso->zvo_queue);
+ put_disk(zso->zvo_disk);
#endif
- if (zv->zv_zso->use_blk_mq)
- blk_mq_free_tag_set(&zv->zv_zso->tag_set);
+ if (zso->use_blk_mq)
+ blk_mq_free_tag_set(&zso->tag_set);
- ida_simple_remove(&zvol_ida,
- MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
+ ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);
+
+ kmem_free(zso, sizeof (struct zvol_state_os));
+
+ mutex_enter(&zv->zv_state_lock);
+}
+
+void
+zvol_os_free(zvol_state_t *zv)
+{
+
+ ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT0(zv->zv_open_count);
+ ASSERT0P(zv->zv_zso);
+
+ ASSERT0P(zv->zv_objset);
+ ASSERT0P(zv->zv_zilog);
+ ASSERT0P(zv->zv_dn);
+
+ rw_destroy(&zv->zv_suspend_lock);
+ zfs_rangelock_fini(&zv->zv_rangelock);
cv_destroy(&zv->zv_removing_cv);
mutex_destroy(&zv->zv_state_lock);
dataset_kstats_destroy(&zv->zv_kstat);
- kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
kmem_free(zv, sizeof (zvol_state_t));
}
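
From the assertions above, the intended teardown order appears to be: zvol_os_remove_minor() is entered and exited with zv_state_lock held (dropping it internally around del_gendisk()), and only afterwards does the caller hand the stripped zvol_state_t to zvol_os_free() with no locks held. A hypothetical caller sketch (the real one lives in the common zvol code):

static void
example_teardown(zvol_state_t *zv)
{
	mutex_enter(&zv->zv_state_lock);
	zvol_os_remove_minor(zv);	/* zv_zso freed; gendisk gone */
	mutex_exit(&zv->zv_state_lock);

	zvol_os_free(zv);		/* expects zv_zso == NULL */
}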
@@ -1514,7 +1525,9 @@ __zvol_os_add_disk(struct gendisk *disk)
{
int error = 0;
#ifdef HAVE_ADD_DISK_RET
- error = add_disk(disk);
+ error = -add_disk(disk);
+ if (error)
+ error = SET_ERROR(error);
#else
add_disk(disk);
#endif
@@ -1606,7 +1619,7 @@ zvol_os_add_disk(struct gendisk *disk)
int
zvol_os_create_minor(const char *name)
{
- zvol_state_t *zv;
+ zvol_state_t *zv = NULL;
objset_t *os;
dmu_object_info_t *doi;
uint64_t volsize;
@@ -1655,18 +1668,16 @@ zvol_os_create_minor(const char *name)
if (error)
goto out_dmu_objset_disown;
- zv = zvol_alloc(MKDEV(zvol_major, minor), name,
- doi->doi_data_block_size);
- if (zv == NULL) {
- error = SET_ERROR(EAGAIN);
+ error = zvol_alloc(MKDEV(zvol_major, minor), name,
+ volsize, doi->doi_data_block_size, &zv);
+ if (error || zv == NULL)
goto out_dmu_objset_disown;
- }
+
zv->zv_hash = hash;
if (dmu_objset_is_snapshot(os))
zv->zv_flags |= ZVOL_RDONLY;
- zv->zv_volsize = volsize;
zv->zv_objset = os;
/* Default */
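
The call site above encodes the new zvol_alloc() contract: a nonzero return is a hard failure, while a zero return with *zvp left NULL means volmode=none, i.e. no device should be created and the caller succeeds without a minor. A caller-side sketch (hypothetical wrapper) under that reading:

static int
example_create(dev_t dev, const char *name, uint64_t volsize,
    uint64_t volblocksize)
{
	zvol_state_t *zv = NULL;
	int error;

	error = zvol_alloc(dev, name, volsize, volblocksize, &zv);
	if (error != 0)
		return (error);		/* hard failure */
	if (zv == NULL)
		return (0);		/* volmode=none: nothing to do */

	/* ... continue minor setup with zv ... */
	return (0);
}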
@@ -1691,11 +1702,11 @@ zvol_os_create_minor(const char *name)
blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif
- ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+ ASSERT0P(zv->zv_kstat.dk_kstats);
error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
if (error)
goto out_dmu_objset_disown;
- ASSERT3P(zv->zv_zilog, ==, NULL);
+ ASSERT0P(zv->zv_zilog);
zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
if (spa_writeable(dmu_objset_spa(os))) {
if (zil_replay_disable)
@@ -1733,7 +1744,7 @@ out_doi:
* zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
* directly as well.
*/
- if (error == 0) {
+ if (error == 0 && zv) {
rw_enter(&zvol_state_lock, RW_WRITER);
zvol_insert(zv);
rw_exit(&zvol_state_lock);
@@ -1745,7 +1756,7 @@ out_doi:
return (error);
}
-void
+int
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
@@ -1772,6 +1783,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
set_disk_ro(zv->zv_zso->zvo_disk, readonly);
dataset_kstats_rename(&zv->zv_kstat, newname);
+
+ return (0);
}
void
@@ -1793,61 +1806,16 @@ zvol_init(void)
{
int error;
- /*
- * zvol_threads is the module param the user passes in.
- *
- * zvol_actual_threads is what we use internally, since the user can
- * pass zvol_thread = 0 to mean "use all the CPUs" (the default).
- */
- static unsigned int zvol_actual_threads;
-
- if (zvol_threads == 0) {
- /*
- * See dde9380a1 for why 32 was chosen here. This should
- * probably be refined to be some multiple of the number
- * of CPUs.
- */
- zvol_actual_threads = MAX(num_online_cpus(), 32);
- } else {
- zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
+ error = zvol_init_impl();
+ if (error) {
+ printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error);
+ return (error);
}
- /*
- * Use atleast 32 zvol_threads but for many core system,
- * prefer 6 threads per taskq, but no more taskqs
- * than threads in them on large systems.
- *
- * taskq total
- * cpus taskqs threads threads
- * ------- ------- ------- -------
- * 1 1 32 32
- * 2 1 32 32
- * 4 1 32 32
- * 8 2 16 32
- * 16 3 11 33
- * 32 5 7 35
- * 64 8 8 64
- * 128 11 12 132
- * 256 16 16 256
- */
- zv_taskq_t *ztqs = &zvol_taskqs;
- uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
- if (num_tqs == 0) {
- num_tqs = 1 + num_online_cpus() / 6;
- while (num_tqs * num_tqs > zvol_actual_threads)
- num_tqs--;
- }
- uint_t per_tq_thread = zvol_actual_threads / num_tqs;
- if (per_tq_thread * num_tqs < zvol_actual_threads)
- per_tq_thread++;
- ztqs->tqs_cnt = num_tqs;
- ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
- error = register_blkdev(zvol_major, ZVOL_DRIVER);
+ error = -register_blkdev(zvol_major, ZVOL_DRIVER);
if (error) {
- kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
- ztqs->tqs_taskq = NULL;
printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
- return (error);
+ return (SET_ERROR(error));
}
if (zvol_blk_mq_queue_depth == 0) {
@@ -1864,25 +1832,6 @@ zvol_init(void)
1024);
}
- for (uint_t i = 0; i < num_tqs; i++) {
- char name[32];
- (void) snprintf(name, sizeof (name), "%s_tq-%u",
- ZVOL_DRIVER, i);
- ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
- maxclsyspri, per_tq_thread, INT_MAX,
- TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
- if (ztqs->tqs_taskq[i] == NULL) {
- for (int j = i - 1; j >= 0; j--)
- taskq_destroy(ztqs->tqs_taskq[j]);
- unregister_blkdev(zvol_major, ZVOL_DRIVER);
- kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
- sizeof (taskq_t *));
- ztqs->tqs_taskq = NULL;
- return (-ENOMEM);
- }
- }
-
- zvol_init_impl();
ida_init(&zvol_ida);
return (0);
}
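
For orientation, the resulting ordering, assembled from this hunk and the zvol_fini() hunk below (assumed pairing; error paths elided):

/*
 * zvol_init():  zvol_init_impl() -> register_blkdev() -> ida_init()
 * zvol_fini():  unregister_blkdev() -> zvol_fini_impl() -> ida_destroy()
 *
 * i.e. the block major is released before the common state it serves.
 */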
@@ -1890,50 +1839,19 @@ zvol_init(void)
void
zvol_fini(void)
{
- zv_taskq_t *ztqs = &zvol_taskqs;
- zvol_fini_impl();
unregister_blkdev(zvol_major, ZVOL_DRIVER);
- if (ztqs->tqs_taskq == NULL) {
- ASSERT3U(ztqs->tqs_cnt, ==, 0);
- } else {
- for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
- ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
- taskq_destroy(ztqs->tqs_taskq[i]);
- }
- kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
- sizeof (taskq_t *));
- ztqs->tqs_taskq = NULL;
- }
+ zvol_fini_impl();
ida_destroy(&zvol_ida);
}
-module_param(zvol_inhibit_dev, uint, 0644);
-MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
-
module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
-module_param(zvol_threads, uint, 0444);
-MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set"
- "to 0 to use all active CPUs");
-
-module_param(zvol_request_sync, uint, 0644);
-MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
-
module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
-module_param(zvol_num_taskqs, uint, 0444);
-MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");
-
-module_param(zvol_prefetch_bytes, uint, 0644);
-MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
-
-module_param(zvol_volmode, uint, 0644);
-MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
-
module_param(zvol_blk_mq_queue_depth, uint, 0644);
MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");