diff options
Diffstat (limited to 'sys/contrib/openzfs/module/os')
5 files changed, 225 insertions, 197 deletions
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c index c114db14a916..b218c0da8125 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c @@ -112,7 +112,6 @@ static int zfs__fini(void); static void zfs_shutdown(void *, int); static eventhandler_tag zfs_shutdown_event_tag; -static eventhandler_tag zfs_mountroot_event_tag; #define ZFS_MIN_KSTACK_PAGES 4 @@ -311,9 +310,6 @@ zfs_modevent(module_t mod, int type, void *unused __unused) zfs_shutdown_event_tag = EVENTHANDLER_REGISTER( shutdown_post_sync, zfs_shutdown, NULL, SHUTDOWN_PRI_FIRST); - zfs_mountroot_event_tag = EVENTHANDLER_REGISTER( - mountroot, spa_boot_init, NULL, - SI_ORDER_ANY); } return (err); case MOD_UNLOAD: @@ -322,9 +318,6 @@ zfs_modevent(module_t mod, int type, void *unused __unused) if (zfs_shutdown_event_tag != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, zfs_shutdown_event_tag); - if (zfs_mountroot_event_tag != NULL) - EVENTHANDLER_DEREGISTER(mountroot, - zfs_mountroot_event_tag); } return (err); case MOD_SHUTDOWN: diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c index 61d0bb26d1e5..a222c5de4a2a 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c @@ -494,7 +494,7 @@ zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) vap->va_uid = 0; vap->va_gid = 0; - vap->va_rdev = 0; + vap->va_rdev = NODEV; /* * We are a purely virtual object, so we have no * blocksize or allocated blocks. @@ -674,6 +674,7 @@ zfsctl_root_readdir(struct vop_readdir_args *ap) zfs_uio_t uio; int *eofp = ap->a_eofflag; off_t dots_offset; + ssize_t orig_resid; int error; zfs_uio_init(&uio, ap->a_uio); @@ -688,16 +689,16 @@ zfsctl_root_readdir(struct vop_readdir_args *ap) * count to return is 0. */ if (zfs_uio_offset(&uio) == 3 * sizeof (entry)) { + if (eofp != NULL) + *eofp = 1; return (0); } + orig_resid = zfs_uio_resid(&uio); error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, &uio, &dots_offset); - if (error != 0) { - if (error == ENAMETOOLONG) /* ran out of destination space */ - error = 0; - return (error); - } + if (error != 0) + goto err; if (zfs_uio_offset(&uio) != dots_offset) return (SET_ERROR(EINVAL)); @@ -710,8 +711,11 @@ zfsctl_root_readdir(struct vop_readdir_args *ap) entry.d_reclen = sizeof (entry); error = vfs_read_dirent(ap, &entry, zfs_uio_offset(&uio)); if (error != 0) { - if (error == ENAMETOOLONG) - error = 0; +err: + if (error == ENAMETOOLONG) { + error = orig_resid == zfs_uio_resid(&uio) ? + EINVAL : 0; + } return (SET_ERROR(error)); } if (eofp != NULL) @@ -1056,17 +1060,21 @@ zfsctl_snapdir_readdir(struct vop_readdir_args *ap) zfs_uio_t uio; int *eofp = ap->a_eofflag; off_t dots_offset; + ssize_t orig_resid; int error; zfs_uio_init(&uio, ap->a_uio); + orig_resid = zfs_uio_resid(&uio); ASSERT3S(vp->v_type, ==, VDIR); error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, &uio, &dots_offset); if (error != 0) { - if (error == ENAMETOOLONG) /* ran out of destination space */ - error = 0; + if (error == ENAMETOOLONG) { /* ran out of destination space */ + error = orig_resid == zfs_uio_resid(&uio) ? + EINVAL : 0; + } return (error); } @@ -1084,9 +1092,13 @@ zfsctl_snapdir_readdir(struct vop_readdir_args *ap) dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); if (error != 0) { if (error == ENOENT) { - if (eofp != NULL) - *eofp = 1; - error = 0; + if (orig_resid == zfs_uio_resid(&uio)) { + error = EINVAL; + } else { + error = 0; + if (eofp != NULL) + *eofp = 1; + } } zfs_exit(zfsvfs, FTAG); return (error); @@ -1099,8 +1111,10 @@ zfsctl_snapdir_readdir(struct vop_readdir_args *ap) entry.d_reclen = sizeof (entry); error = vfs_read_dirent(ap, &entry, zfs_uio_offset(&uio)); if (error != 0) { - if (error == ENAMETOOLONG) - error = 0; + if (error == ENAMETOOLONG) { + error = orig_resid == zfs_uio_resid(&uio) ? + EINVAL : 0; + } zfs_exit(zfsvfs, FTAG); return (SET_ERROR(error)); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c index 1813c411b013..174141a5deab 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c @@ -1695,6 +1695,7 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, objset_t *os; caddr_t outbuf; size_t bufsize; + ssize_t orig_resid; zap_cursor_t zc; zap_attribute_t *zap; uint_t bytes_wanted; @@ -1735,7 +1736,7 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, /* * Quit if directory has been removed (posix) */ - if ((*eofp = zp->z_unlinked) != 0) { + if ((*eofp = (zp->z_unlinked != 0)) != 0) { zfs_exit(zfsvfs, FTAG); return (0); } @@ -1743,6 +1744,7 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, error = 0; os = zfsvfs->z_os; offset = zfs_uio_offset(uio); + orig_resid = zfs_uio_resid(uio); prefetch = zp->z_zn_prefetch; zap = zap_attribute_long_alloc(); @@ -1922,7 +1924,7 @@ update: kmem_free(outbuf, bufsize); if (error == ENOENT) - error = 0; + error = orig_resid == zfs_uio_resid(uio) ? EINVAL : 0; ZFS_ACCESSTIME_STAMP(zfsvfs, zp); @@ -2013,7 +2015,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) if (vp->v_type == VBLK || vp->v_type == VCHR) vap->va_rdev = zfs_cmpldev(rdev); else - vap->va_rdev = 0; + vap->va_rdev = NODEV; vap->va_gen = zp->z_gen; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ vap->va_filerev = zp->z_seq; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c index 265dfd55fc4d..0dd2ecd7fd8d 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c @@ -31,7 +31,7 @@ * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, 2025, Klara, Inc. */ /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */ @@ -196,7 +196,6 @@ DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); static int zvol_geom_open(struct g_provider *pp, int flag, int count); static int zvol_geom_close(struct g_provider *pp, int flag, int count); -static void zvol_geom_destroy(zvol_state_t *zv); static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); static void zvol_geom_bio_start(struct bio *bp); static int zvol_geom_bio_getattr(struct bio *bp); @@ -226,25 +225,14 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) } retry: - rw_enter(&zvol_state_lock, ZVOL_RW_READER); - /* - * Obtain a copy of private under zvol_state_lock to make sure either - * the result of zvol free code setting private to NULL is observed, - * or the zv is protected from being freed because of the positive - * zv_open_count. - */ - zv = pp->private; - if (zv == NULL) { - rw_exit(&zvol_state_lock); - err = SET_ERROR(ENXIO); - goto out_locked; - } + zv = atomic_load_ptr(&pp->private); + if (zv == NULL) + return (SET_ERROR(ENXIO)); mutex_enter(&zv->zv_state_lock); if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) { - rw_exit(&zvol_state_lock); err = SET_ERROR(ENXIO); - goto out_zv_locked; + goto out_locked; } ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); @@ -257,8 +245,24 @@ retry: drop_suspend = B_TRUE; if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); + + /* + * Removal may happen while the locks are down, so + * we can't trust zv any longer; we have to start over. + */ + zv = atomic_load_ptr(&pp->private); + if (zv == NULL) + return (SET_ERROR(ENXIO)); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); + + if (zv->zv_zso->zso_dying || + zv->zv_flags & ZVOL_REMOVING) { + err = SET_ERROR(ENXIO); + goto out_locked; + } + /* Check to see if zv_suspend_lock is needed. */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); @@ -266,7 +270,6 @@ retry: } } } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -294,7 +297,7 @@ retry: if (drop_namespace) mutex_exit(&spa_namespace_lock); if (err) - goto out_zv_locked; + goto out_locked; pp->mediasize = zv->zv_volsize; pp->stripeoffset = 0; pp->stripesize = zv->zv_volblocksize; @@ -329,9 +332,8 @@ out_opened: zvol_last_close(zv); wakeup(zv); } -out_zv_locked: - mutex_exit(&zv->zv_state_lock); out_locked: + mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (err); @@ -345,12 +347,9 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) boolean_t drop_suspend = B_TRUE; int new_open_count; - rw_enter(&zvol_state_lock, ZVOL_RW_READER); - zv = pp->private; - if (zv == NULL) { - rw_exit(&zvol_state_lock); + zv = atomic_load_ptr(&pp->private); + if (zv == NULL) return (SET_ERROR(ENXIO)); - } mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_EXCL) { @@ -377,6 +376,15 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); + + /* + * Unlike in zvol_geom_open(), we don't check if + * removal started here, because we might be one of the + * openers that needs to be thrown out! If we're the + * last, we need to call zvol_last_close() below to + * finish cleanup. So, no special treatment for us. + */ + /* Check to see if zv_suspend_lock is needed. */ new_open_count = zv->zv_open_count - count; if (new_open_count != 0) { @@ -387,7 +395,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) } else { drop_suspend = B_FALSE; } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -408,20 +415,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) return (0); } -static void -zvol_geom_destroy(zvol_state_t *zv) -{ - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp = zsg->zsg_provider; - - ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); - - g_topology_assert(); - - zsg->zsg_provider = NULL; - g_wither_geom(pp->geom, ENXIO); -} - void zvol_wait_close(zvol_state_t *zv) { @@ -454,7 +447,7 @@ zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).", pp->name, acr, acw, ace)); - if (pp->private == NULL) { + if (atomic_load_ptr(&pp->private) == NULL) { if (acr <= 0 && acw <= 0 && ace <= 0) return (0); return (pp->error); @@ -921,25 +914,14 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) boolean_t drop_suspend = B_FALSE; retry: - rw_enter(&zvol_state_lock, ZVOL_RW_READER); - /* - * Obtain a copy of si_drv2 under zvol_state_lock to make sure either - * the result of zvol free code setting si_drv2 to NULL is observed, - * or the zv is protected from being freed because of the positive - * zv_open_count. - */ - zv = dev->si_drv2; - if (zv == NULL) { - rw_exit(&zvol_state_lock); - err = SET_ERROR(ENXIO); - goto out_locked; - } + zv = atomic_load_ptr(&dev->si_drv2); + if (zv == NULL) + return (SET_ERROR(ENXIO)); mutex_enter(&zv->zv_state_lock); - if (zv->zv_zso->zso_dying) { - rw_exit(&zvol_state_lock); + if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) { err = SET_ERROR(ENXIO); - goto out_zv_locked; + goto out_locked; } ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); @@ -954,6 +936,13 @@ retry: mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); + + if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { + /* Removal started while locks were down. */ + err = SET_ERROR(ENXIO); + goto out_locked; + } + /* Check to see if zv_suspend_lock is needed. */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); @@ -961,7 +950,6 @@ retry: } } } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -989,7 +977,7 @@ retry: if (drop_namespace) mutex_exit(&spa_namespace_lock); if (err) - goto out_zv_locked; + goto out_locked; } ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -1016,9 +1004,8 @@ out_opened: zvol_last_close(zv); wakeup(zv); } -out_zv_locked: - mutex_exit(&zv->zv_state_lock); out_locked: + mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (err); @@ -1030,12 +1017,9 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; - rw_enter(&zvol_state_lock, ZVOL_RW_READER); - zv = dev->si_drv2; - if (zv == NULL) { - rw_exit(&zvol_state_lock); + zv = atomic_load_ptr(&dev->si_drv2); + if (zv == NULL) return (SET_ERROR(ENXIO)); - } mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_EXCL) { @@ -1060,6 +1044,15 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); + + /* + * Unlike in zvol_cdev_open(), we don't check if + * removal started here, because we might be one of the + * openers that needs to be thrown out! If we're the + * last, we need to call zvol_last_close() below to + * finish cleanup. So, no special treatment for us. + */ + /* Check to see if zv_suspend_lock is needed. */ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); @@ -1069,7 +1062,6 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) } else { drop_suspend = B_FALSE; } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -1101,7 +1093,8 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, int error; boolean_t sync; - zv = dev->si_drv2; + zv = atomic_load_ptr(&dev->si_drv2); + ASSERT3P(zv, !=, NULL); error = 0; KASSERT(zv->zv_open_count > 0, @@ -1162,6 +1155,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, *(off_t *)data = 0; break; case DIOCGATTR: { + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); spa_t *spa = dmu_objset_spa(zv->zv_objset); struct diocgattr_arg *arg = (struct diocgattr_arg *)data; uint64_t refd, avail, usedobjs, availobjs; @@ -1186,6 +1180,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, arg->value.off = refd / DEV_BSIZE; } else error = SET_ERROR(ENOIOCTL); + rw_exit(&zv->zv_suspend_lock); break; } case FIOSEEKHOLE: @@ -1196,10 +1191,12 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, hole = (cmd == FIOSEEKHOLE); noff = *off; + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX, RL_READER); error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); zfs_rangelock_exit(lr); + rw_exit(&zv->zv_suspend_lock); *off = noff; break; } @@ -1400,42 +1397,65 @@ zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize, * Remove minor node for the specified volume. */ void -zvol_os_free(zvol_state_t *zv) +zvol_os_remove_minor(zvol_state_t *zv) { - ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); - ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT0(zv->zv_open_count); + ASSERT0(atomic_read(&zv->zv_suspend_ref)); + ASSERT(zv->zv_flags & ZVOL_REMOVING); - ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); - - rw_destroy(&zv->zv_suspend_lock); - zfs_rangelock_fini(&zv->zv_rangelock); + struct zvol_state_os *zso = zv->zv_zso; + zv->zv_zso = NULL; if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp __maybe_unused = zsg->zsg_provider; - - ASSERT0P(pp->private); + struct zvol_state_geom *zsg = &zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + atomic_store_ptr(&pp->private, NULL); + mutex_exit(&zv->zv_state_lock); g_topology_lock(); - zvol_geom_destroy(zv); + g_wither_geom(pp->geom, ENXIO); g_topology_unlock(); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct zvol_state_dev *zsd = &zso->zso_dev; struct cdev *dev = zsd->zsd_cdev; + if (dev != NULL) + atomic_store_ptr(&dev->si_drv2, NULL); + mutex_exit(&zv->zv_state_lock); + if (dev != NULL) { - ASSERT0P(dev->si_drv2); destroy_dev(dev); knlist_clear(&zsd->zsd_selinfo.si_note, 0); knlist_destroy(&zsd->zsd_selinfo.si_note); } } + kmem_free(zso, sizeof (struct zvol_state_os)); + + mutex_enter(&zv->zv_state_lock); +} + +void +zvol_os_free(zvol_state_t *zv) +{ + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT0(zv->zv_open_count); + ASSERT0P(zv->zv_zso); + + ASSERT0P(zv->zv_objset); + ASSERT0P(zv->zv_zilog); + ASSERT0P(zv->zv_dn); + + ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); + + rw_destroy(&zv->zv_suspend_lock); + zfs_rangelock_fini(&zv->zv_rangelock); + mutex_destroy(&zv->zv_state_lock); cv_destroy(&zv->zv_removing_cv); dataset_kstats_destroy(&zv->zv_kstat); - kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); zvol_minors--; } @@ -1538,28 +1558,6 @@ out_doi: return (error); } -void -zvol_os_clear_private(zvol_state_t *zv) -{ - ASSERT(RW_LOCK_HELD(&zvol_state_lock)); - if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp = zsg->zsg_provider; - - if (pp->private == NULL) /* already cleared */ - return; - - pp->private = NULL; - ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); - } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; - struct cdev *dev = zsd->zsd_cdev; - - if (dev != NULL) - dev->si_drv2 = NULL; - } -} - int zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index a73acdad34ae..bac166fcd89e 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2024, Rob Norris <robn@despairlabs.com> - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, 2025, Klara, Inc. */ #include <sys/dataset_kstats.h> @@ -679,28 +679,19 @@ zvol_open(struct block_device *bdev, fmode_t flag) retry: #endif - rw_enter(&zvol_state_lock, RW_READER); - /* - * Obtain a copy of private_data under the zvol_state_lock to make - * sure that either the result of zvol free code path setting - * disk->private_data to NULL is observed, or zvol_os_free() - * is not called on this zv because of the positive zv_open_count. - */ + #ifdef HAVE_BLK_MODE_T - zv = disk->private_data; + zv = atomic_load_ptr(&disk->private_data); #else - zv = bdev->bd_disk->private_data; + zv = atomic_load_ptr(&bdev->bd_disk->private_data); #endif if (zv == NULL) { - rw_exit(&zvol_state_lock); return (-SET_ERROR(ENXIO)); } mutex_enter(&zv->zv_state_lock); - if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { mutex_exit(&zv->zv_state_lock); - rw_exit(&zvol_state_lock); return (-SET_ERROR(ENXIO)); } @@ -712,8 +703,28 @@ retry: if (zv->zv_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); + + /* + * Removal may happen while the locks are down, so + * we can't trust zv any longer; we have to start over. + */ +#ifdef HAVE_BLK_MODE_T + zv = atomic_load_ptr(&disk->private_data); +#else + zv = atomic_load_ptr(&bdev->bd_disk->private_data); +#endif + if (zv == NULL) + return (-SET_ERROR(ENXIO)); + rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); + + if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + return (-SET_ERROR(ENXIO)); + } + /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); @@ -724,7 +735,6 @@ retry: drop_suspend = B_TRUE; } } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -821,11 +831,11 @@ zvol_release(struct gendisk *disk, fmode_t unused) #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) (void) unused; #endif - zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; - rw_enter(&zvol_state_lock, RW_READER); - zv = disk->private_data; + zvol_state_t *zv = atomic_load_ptr(&disk->private_data); + if (zv == NULL) + return; mutex_enter(&zv->zv_state_lock); ASSERT3U(zv->zv_open_count, >, 0); @@ -839,6 +849,15 @@ zvol_release(struct gendisk *disk, fmode_t unused) mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); + + /* + * Unlike in zvol_open(), we don't check if removal + * started here, because we might be one of the openers + * that needs to be thrown out! If we're the last, we + * need to call zvol_last_close() below to finish + * cleanup. So, no special treatment for us. + */ + /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); @@ -848,7 +867,6 @@ zvol_release(struct gendisk *disk, fmode_t unused) } else { drop_suspend = B_FALSE; } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -868,9 +886,10 @@ static int zvol_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - zvol_state_t *zv = bdev->bd_disk->private_data; int error = 0; + zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data); + ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); switch (cmd) { @@ -923,9 +942,8 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing) { unsigned int mask = 0; - rw_enter(&zvol_state_lock, RW_READER); + zvol_state_t *zv = atomic_load_ptr(&disk->private_data); - zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; @@ -933,17 +951,14 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing) mutex_exit(&zv->zv_state_lock); } - rw_exit(&zvol_state_lock); - return (mask); } static int zvol_revalidate_disk(struct gendisk *disk) { - rw_enter(&zvol_state_lock, RW_READER); + zvol_state_t *zv = atomic_load_ptr(&disk->private_data); - zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); set_capacity(zv->zv_zso->zvo_disk, @@ -951,8 +966,6 @@ zvol_revalidate_disk(struct gendisk *disk) mutex_exit(&zv->zv_state_lock); } - rw_exit(&zvol_state_lock); - return (0); } @@ -971,16 +984,6 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) return (0); } -void -zvol_os_clear_private(zvol_state_t *zv) -{ - /* - * Cleared while holding zvol_state_lock as a writer - * which will prevent zvol_open() from opening it. - */ - zv->zv_zso->zvo_disk->private_data = NULL; -} - /* * Provide a simple virtual geometry for legacy compatibility. For devices * smaller than 1 MiB a small head and sector count is used to allow very @@ -990,9 +993,10 @@ zvol_os_clear_private(zvol_state_t *zv) static int zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) { - zvol_state_t *zv = bdev->bd_disk->private_data; sector_t sectors; + zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data); + ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); sectors = get_capacity(zv->zv_zso->zvo_disk); @@ -1417,53 +1421,70 @@ out_kmem: return (ret); } -/* - * Cleanup then free a zvol_state_t which was created by zvol_alloc(). - * At this time, the structure is not opened by anyone, is taken off - * the zvol_state_list, and has its private data set to NULL. - * The zvol_state_lock is dropped. - * - * This function may take many milliseconds to complete (e.g. we've seen - * it take over 256ms), due to the calls to "blk_cleanup_queue" and - * "del_gendisk". Thus, consumers need to be careful to account for this - * latency when calling this function. - */ void -zvol_os_free(zvol_state_t *zv) +zvol_os_remove_minor(zvol_state_t *zv) { - - ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); - ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT0(zv->zv_open_count); - ASSERT0P(zv->zv_zso->zvo_disk->private_data); + ASSERT0(atomic_read(&zv->zv_suspend_ref)); + ASSERT(zv->zv_flags & ZVOL_REMOVING); - rw_destroy(&zv->zv_suspend_lock); - zfs_rangelock_fini(&zv->zv_rangelock); + struct zvol_state_os *zso = zv->zv_zso; + zv->zv_zso = NULL; + + /* Clearing private_data will make new callers return immediately. */ + atomic_store_ptr(&zso->zvo_disk->private_data, NULL); + + /* + * Drop the state lock before calling del_gendisk(). There may be + * callers waiting to acquire it, but del_gendisk() will block until + * they exit, which would deadlock. + */ + mutex_exit(&zv->zv_state_lock); - del_gendisk(zv->zv_zso->zvo_disk); + del_gendisk(zso->zvo_disk); #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) #if defined(HAVE_BLK_CLEANUP_DISK) - blk_cleanup_disk(zv->zv_zso->zvo_disk); + blk_cleanup_disk(zso->zvo_disk); #else - put_disk(zv->zv_zso->zvo_disk); + put_disk(zso->zvo_disk); #endif #else - blk_cleanup_queue(zv->zv_zso->zvo_queue); - put_disk(zv->zv_zso->zvo_disk); + blk_cleanup_queue(zso->zvo_queue); + put_disk(zso->zvo_disk); #endif - if (zv->zv_zso->use_blk_mq) - blk_mq_free_tag_set(&zv->zv_zso->tag_set); + if (zso->use_blk_mq) + blk_mq_free_tag_set(&zso->tag_set); + + ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS); - ida_simple_remove(&zvol_ida, - MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); + kmem_free(zso, sizeof (struct zvol_state_os)); + + mutex_enter(&zv->zv_state_lock); +} + +void +zvol_os_free(zvol_state_t *zv) +{ + + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT0(zv->zv_open_count); + ASSERT0P(zv->zv_zso); + + ASSERT0P(zv->zv_objset); + ASSERT0P(zv->zv_zilog); + ASSERT0P(zv->zv_dn); + + rw_destroy(&zv->zv_suspend_lock); + zfs_rangelock_fini(&zv->zv_rangelock); cv_destroy(&zv->zv_removing_cv); mutex_destroy(&zv->zv_state_lock); dataset_kstats_destroy(&zv->zv_kstat); - kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); } |