aboutsummaryrefslogtreecommitdiff
path: root/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c')
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c208
1 files changed, 120 insertions, 88 deletions
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index a7431cc4da9d..bac166fcd89e 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
- * Copyright (c) 2024, Klara, Inc.
+ * Copyright (c) 2024, 2025, Klara, Inc.
*/
#include <sys/dataset_kstats.h>
@@ -84,8 +84,9 @@ static unsigned int zvol_blk_mq_blocks_per_thread = 8;
static inline void
zvol_end_io(struct bio *bio, struct request *rq, int error)
{
+ ASSERT3U(error, >=, 0);
if (bio) {
- bio->bi_status = errno_to_bi_status(-error);
+ bio->bi_status = errno_to_bi_status(error);
bio_endio(bio);
} else {
blk_mq_end_request(rq, errno_to_bi_status(error));
@@ -208,8 +209,14 @@ zvol_write(zv_request_t *zvr)
disk = zv->zv_zso->zvo_disk;
/* bio marked as FLUSH need to flush before write */
- if (io_is_flush(bio, rq))
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (io_is_flush(bio, rq)) {
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (error != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ zvol_end_io(bio, rq, -error);
+ return;
+ }
+ }
/* Some requests are just for flush and nothing else. */
if (io_size(bio, rq) == 0) {
@@ -273,8 +280,8 @@ zvol_write(zv_request_t *zvr)
dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
task_io_account_write(nwritten);
- if (sync)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (error == 0 && sync)
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
rw_exit(&zv->zv_suspend_lock);
@@ -282,7 +289,7 @@ zvol_write(zv_request_t *zvr)
blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
}
- zvol_end_io(bio, rq, -error);
+ zvol_end_io(bio, rq, error);
}
static void
@@ -361,7 +368,7 @@ zvol_discard(zv_request_t *zvr)
zfs_rangelock_exit(lr);
if (error == 0 && sync)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
unlock:
rw_exit(&zv->zv_suspend_lock);
@@ -371,7 +378,7 @@ unlock:
start_time);
}
- zvol_end_io(bio, rq, -error);
+ zvol_end_io(bio, rq, error);
}
static void
@@ -449,7 +456,7 @@ zvol_read(zv_request_t *zvr)
blk_generic_end_io_acct(q, disk, READ, bio, start_time);
}
- zvol_end_io(bio, rq, -error);
+ zvol_end_io(bio, rq, error);
}
static void
@@ -480,7 +487,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
int rw = io_data_dir(bio, rq);
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
- zvol_end_io(bio, rq, -SET_ERROR(ENXIO));
+ zvol_end_io(bio, rq, SET_ERROR(ENXIO));
goto out;
}
@@ -499,7 +506,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
(long long unsigned)offset,
(long unsigned)size);
- zvol_end_io(bio, rq, -SET_ERROR(EIO));
+ zvol_end_io(bio, rq, SET_ERROR(EIO));
goto out;
}
@@ -512,8 +519,8 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
#ifdef HAVE_BLK_MQ_RQ_HCTX
blk_mq_hw_queue = rq->mq_hctx->queue_num;
#else
- blk_mq_hw_queue =
- rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
+ blk_mq_hw_queue = rq->q->queue_hw_ctx[
+ rq->q->mq_map[raw_smp_processor_id()]]->queue_num;
#endif
taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
blk_mq_hw_queue);
@@ -521,7 +528,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
if (rw == WRITE) {
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
- zvol_end_io(bio, rq, -SET_ERROR(EROFS));
+ zvol_end_io(bio, rq, SET_ERROR(EROFS));
goto out;
}
@@ -672,28 +679,19 @@ zvol_open(struct block_device *bdev, fmode_t flag)
retry:
#endif
- rw_enter(&zvol_state_lock, RW_READER);
- /*
- * Obtain a copy of private_data under the zvol_state_lock to make
- * sure that either the result of zvol free code path setting
- * disk->private_data to NULL is observed, or zvol_os_free()
- * is not called on this zv because of the positive zv_open_count.
- */
+
#ifdef HAVE_BLK_MODE_T
- zv = disk->private_data;
+ zv = atomic_load_ptr(&disk->private_data);
#else
- zv = bdev->bd_disk->private_data;
+ zv = atomic_load_ptr(&bdev->bd_disk->private_data);
#endif
if (zv == NULL) {
- rw_exit(&zvol_state_lock);
return (-SET_ERROR(ENXIO));
}
mutex_enter(&zv->zv_state_lock);
-
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
mutex_exit(&zv->zv_state_lock);
- rw_exit(&zvol_state_lock);
return (-SET_ERROR(ENXIO));
}
@@ -705,8 +703,28 @@ retry:
if (zv->zv_open_count == 0) {
if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
mutex_exit(&zv->zv_state_lock);
+
+ /*
+ * Removal may happen while the locks are down, so
+ * we can't trust zv any longer; we have to start over.
+ */
+#ifdef HAVE_BLK_MODE_T
+ zv = atomic_load_ptr(&disk->private_data);
+#else
+ zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+#endif
+ if (zv == NULL)
+ return (-SET_ERROR(ENXIO));
+
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);
+ return (-SET_ERROR(ENXIO));
+ }
+
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
@@ -717,7 +735,6 @@ retry:
drop_suspend = B_TRUE;
}
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -814,11 +831,11 @@ zvol_release(struct gendisk *disk, fmode_t unused)
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
(void) unused;
#endif
- zvol_state_t *zv;
boolean_t drop_suspend = B_TRUE;
- rw_enter(&zvol_state_lock, RW_READER);
- zv = disk->private_data;
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
+ if (zv == NULL)
+ return;
mutex_enter(&zv->zv_state_lock);
ASSERT3U(zv->zv_open_count, >, 0);
@@ -832,6 +849,15 @@ zvol_release(struct gendisk *disk, fmode_t unused)
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ /*
+ * Unlike in zvol_open(), we don't check if removal
+ * started here, because we might be one of the openers
+ * that needs to be thrown out! If we're the last, we
+ * need to call zvol_last_close() below to finish
+ * cleanup. So, no special treatment for us.
+ */
+
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 1) {
rw_exit(&zv->zv_suspend_lock);
@@ -841,7 +867,6 @@ zvol_release(struct gendisk *disk, fmode_t unused)
} else {
drop_suspend = B_FALSE;
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -861,9 +886,10 @@ static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
- zvol_state_t *zv = bdev->bd_disk->private_data;
int error = 0;
+ zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+ ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
switch (cmd) {
@@ -886,16 +912,18 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode,
case BLKZNAME:
mutex_enter(&zv->zv_state_lock);
- error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
+ error = -copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
mutex_exit(&zv->zv_state_lock);
+ if (error)
+ error = SET_ERROR(error);
break;
default:
- error = -ENOTTY;
+ error = SET_ERROR(ENOTTY);
break;
}
- return (SET_ERROR(error));
+ return (-error);
}
#ifdef CONFIG_COMPAT
@@ -914,9 +942,8 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
unsigned int mask = 0;
- rw_enter(&zvol_state_lock, RW_READER);
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
- zvol_state_t *zv = disk->private_data;
if (zv != NULL) {
mutex_enter(&zv->zv_state_lock);
mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
@@ -924,17 +951,14 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing)
mutex_exit(&zv->zv_state_lock);
}
- rw_exit(&zvol_state_lock);
-
return (mask);
}
static int
zvol_revalidate_disk(struct gendisk *disk)
{
- rw_enter(&zvol_state_lock, RW_READER);
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
- zvol_state_t *zv = disk->private_data;
if (zv != NULL) {
mutex_enter(&zv->zv_state_lock);
set_capacity(zv->zv_zso->zvo_disk,
@@ -942,8 +966,6 @@ zvol_revalidate_disk(struct gendisk *disk)
mutex_exit(&zv->zv_state_lock);
}
- rw_exit(&zvol_state_lock);
-
return (0);
}
@@ -962,16 +984,6 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
return (0);
}
-void
-zvol_os_clear_private(zvol_state_t *zv)
-{
- /*
- * Cleared while holding zvol_state_lock as a writer
- * which will prevent zvol_open() from opening it.
- */
- zv->zv_zso->zvo_disk->private_data = NULL;
-}
-
/*
* Provide a simple virtual geometry for legacy compatibility. For devices
* smaller than 1 MiB a small head and sector count is used to allow very
@@ -981,9 +993,10 @@ zvol_os_clear_private(zvol_state_t *zv)
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
- zvol_state_t *zv = bdev->bd_disk->private_data;
sector_t sectors;
+ zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+ ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
sectors = get_capacity(zv->zv_zso->zvo_disk);
@@ -1408,53 +1421,70 @@ out_kmem:
return (ret);
}
-/*
- * Cleanup then free a zvol_state_t which was created by zvol_alloc().
- * At this time, the structure is not opened by anyone, is taken off
- * the zvol_state_list, and has its private data set to NULL.
- * The zvol_state_lock is dropped.
- *
- * This function may take many milliseconds to complete (e.g. we've seen
- * it take over 256ms), due to the calls to "blk_cleanup_queue" and
- * "del_gendisk". Thus, consumers need to be careful to account for this
- * latency when calling this function.
- */
void
-zvol_os_free(zvol_state_t *zv)
+zvol_os_remove_minor(zvol_state_t *zv)
{
-
- ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
- ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
ASSERT0(zv->zv_open_count);
- ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);
+ ASSERT0(atomic_read(&zv->zv_suspend_ref));
+ ASSERT(zv->zv_flags & ZVOL_REMOVING);
- rw_destroy(&zv->zv_suspend_lock);
- zfs_rangelock_fini(&zv->zv_rangelock);
+ struct zvol_state_os *zso = zv->zv_zso;
+ zv->zv_zso = NULL;
+
+ /* Clearing private_data will make new callers return immediately. */
+ atomic_store_ptr(&zso->zvo_disk->private_data, NULL);
- del_gendisk(zv->zv_zso->zvo_disk);
+ /*
+ * Drop the state lock before calling del_gendisk(). There may be
+ * callers waiting to acquire it, but del_gendisk() will block until
+ * they exit, which would deadlock.
+ */
+ mutex_exit(&zv->zv_state_lock);
+
+ del_gendisk(zso->zvo_disk);
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
#if defined(HAVE_BLK_CLEANUP_DISK)
- blk_cleanup_disk(zv->zv_zso->zvo_disk);
+ blk_cleanup_disk(zso->zvo_disk);
#else
- put_disk(zv->zv_zso->zvo_disk);
+ put_disk(zso->zvo_disk);
#endif
#else
- blk_cleanup_queue(zv->zv_zso->zvo_queue);
- put_disk(zv->zv_zso->zvo_disk);
+ blk_cleanup_queue(zso->zvo_queue);
+ put_disk(zso->zvo_disk);
#endif
- if (zv->zv_zso->use_blk_mq)
- blk_mq_free_tag_set(&zv->zv_zso->tag_set);
+ if (zso->use_blk_mq)
+ blk_mq_free_tag_set(&zso->tag_set);
- ida_simple_remove(&zvol_ida,
- MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
+ ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);
+
+ kmem_free(zso, sizeof (struct zvol_state_os));
+
+ mutex_enter(&zv->zv_state_lock);
+}
+
+void
+zvol_os_free(zvol_state_t *zv)
+{
+
+ ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT0(zv->zv_open_count);
+ ASSERT0P(zv->zv_zso);
+
+ ASSERT0P(zv->zv_objset);
+ ASSERT0P(zv->zv_zilog);
+ ASSERT0P(zv->zv_dn);
+
+ rw_destroy(&zv->zv_suspend_lock);
+ zfs_rangelock_fini(&zv->zv_rangelock);
cv_destroy(&zv->zv_removing_cv);
mutex_destroy(&zv->zv_state_lock);
dataset_kstats_destroy(&zv->zv_kstat);
- kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
kmem_free(zv, sizeof (zvol_state_t));
}
@@ -1474,7 +1504,9 @@ __zvol_os_add_disk(struct gendisk *disk)
{
int error = 0;
#ifdef HAVE_ADD_DISK_RET
- error = add_disk(disk);
+ error = -add_disk(disk);
+ if (error)
+ error = SET_ERROR(error);
#else
add_disk(disk);
#endif
@@ -1649,11 +1681,11 @@ zvol_os_create_minor(const char *name)
blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif
- ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+ ASSERT0P(zv->zv_kstat.dk_kstats);
error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
if (error)
goto out_dmu_objset_disown;
- ASSERT3P(zv->zv_zilog, ==, NULL);
+ ASSERT0P(zv->zv_zilog);
zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
if (spa_writeable(dmu_objset_spa(os))) {
if (zil_replay_disable)
@@ -1759,10 +1791,10 @@ zvol_init(void)
return (error);
}
- error = register_blkdev(zvol_major, ZVOL_DRIVER);
+ error = -register_blkdev(zvol_major, ZVOL_DRIVER);
if (error) {
printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
- return (error);
+ return (SET_ERROR(error));
}
if (zvol_blk_mq_queue_depth == 0) {