aboutsummaryrefslogtreecommitdiff
path: root/sys/contrib/openzfs/module/os
diff options
context:
space:
mode:
Diffstat (limited to 'sys/contrib/openzfs/module/os')
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c7
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c244
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c2
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c4
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c3
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c27
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c6
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c210
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c6
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c3
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c6
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c55
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c85
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c153
14 files changed, 369 insertions, 442 deletions
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c
index c114db14a916..b218c0da8125 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c
@@ -112,7 +112,6 @@ static int zfs__fini(void);
static void zfs_shutdown(void *, int);
static eventhandler_tag zfs_shutdown_event_tag;
-static eventhandler_tag zfs_mountroot_event_tag;
#define ZFS_MIN_KSTACK_PAGES 4
@@ -311,9 +310,6 @@ zfs_modevent(module_t mod, int type, void *unused __unused)
zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
shutdown_post_sync, zfs_shutdown, NULL,
SHUTDOWN_PRI_FIRST);
- zfs_mountroot_event_tag = EVENTHANDLER_REGISTER(
- mountroot, spa_boot_init, NULL,
- SI_ORDER_ANY);
}
return (err);
case MOD_UNLOAD:
@@ -322,9 +318,6 @@ zfs_modevent(module_t mod, int type, void *unused __unused)
if (zfs_shutdown_event_tag != NULL)
EVENTHANDLER_DEREGISTER(shutdown_post_sync,
zfs_shutdown_event_tag);
- if (zfs_mountroot_event_tag != NULL)
- EVENTHANDLER_DEREGISTER(mountroot,
- zfs_mountroot_event_tag);
}
return (err);
case MOD_SHUTDOWN:
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
index ace2360c032d..393bfaa65ff5 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
@@ -188,11 +188,6 @@ param_set_arc_max(SYSCTL_HANDLER_ARGS)
return (0);
}
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max,
- CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
- NULL, 0, param_set_arc_max, "LU",
- "Maximum ARC size in bytes (LEGACY)");
-
int
param_set_arc_min(SYSCTL_HANDLER_ARGS)
{
@@ -217,11 +212,6 @@ param_set_arc_min(SYSCTL_HANDLER_ARGS)
return (0);
}
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min,
- CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
- NULL, 0, param_set_arc_min, "LU",
- "Minimum ARC size in bytes (LEGACY)");
-
extern uint_t zfs_arc_free_target;
int
@@ -245,16 +235,6 @@ param_set_arc_free_target(SYSCTL_HANDLER_ARGS)
return (0);
}
-/*
- * NOTE: This sysctl is CTLFLAG_RW not CTLFLAG_RWTUN due to its dependency on
- * pagedaemon initialization.
- */
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
- CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
- NULL, 0, param_set_arc_free_target, "IU",
- "Desired number of free pages below which ARC triggers reclaim"
- " (LEGACY)");
-
int
param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
{
@@ -273,187 +253,6 @@ param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
return (0);
}
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift,
- CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
- NULL, 0, param_set_arc_no_grow_shift, "I",
- "log2(fraction of ARC which must be free to allow growing) (LEGACY)");
-
-extern uint64_t l2arc_write_max;
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max,
- CTLFLAG_RWTUN, &l2arc_write_max, 0,
- "Max write bytes per interval (LEGACY)");
-
-extern uint64_t l2arc_write_boost;
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost,
- CTLFLAG_RWTUN, &l2arc_write_boost, 0,
- "Extra write bytes during device warmup (LEGACY)");
-
-extern uint64_t l2arc_headroom;
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom,
- CTLFLAG_RWTUN, &l2arc_headroom, 0,
- "Number of max device writes to precache (LEGACY)");
-
-extern uint64_t l2arc_headroom_boost;
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom_boost,
- CTLFLAG_RWTUN, &l2arc_headroom_boost, 0,
- "Compressed l2arc_headroom multiplier (LEGACY)");
-
-extern uint64_t l2arc_feed_secs;
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs,
- CTLFLAG_RWTUN, &l2arc_feed_secs, 0,
- "Seconds between L2ARC writing (LEGACY)");
-
-extern uint64_t l2arc_feed_min_ms;
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms,
- CTLFLAG_RWTUN, &l2arc_feed_min_ms, 0,
- "Min feed interval in milliseconds (LEGACY)");
-
-extern int l2arc_noprefetch;
-
-SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch,
- CTLFLAG_RWTUN, &l2arc_noprefetch, 0,
- "Skip caching prefetched buffers (LEGACY)");
-
-extern int l2arc_feed_again;
-
-SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again,
- CTLFLAG_RWTUN, &l2arc_feed_again, 0,
- "Turbo L2ARC warmup (LEGACY)");
-
-extern int l2arc_norw;
-
-SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw,
- CTLFLAG_RWTUN, &l2arc_norw, 0,
- "No reads during writes (LEGACY)");
-
-static int
-param_get_arc_state_size(SYSCTL_HANDLER_ARGS)
-{
- arc_state_t *state = (arc_state_t *)arg1;
- int64_t val;
-
- val = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]) +
- zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]);
- return (sysctl_handle_64(oidp, &val, 0, req));
-}
-
-extern arc_state_t ARC_anon;
-
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, anon_size,
- CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
- &ARC_anon, 0, param_get_arc_state_size, "Q",
- "size of anonymous state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
- &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
- "size of evictable metadata in anonymous state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
- &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
- "size of evictable data in anonymous state");
-
-extern arc_state_t ARC_mru;
-
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_size,
- CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
- &ARC_mru, 0, param_get_arc_state_size, "Q",
- "size of mru state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
- &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
- "size of evictable metadata in mru state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
- &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
- "size of evictable data in mru state");
-
-extern arc_state_t ARC_mru_ghost;
-
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_ghost_size,
- CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
- &ARC_mru_ghost, 0, param_get_arc_state_size, "Q",
- "size of mru ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
- &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
- "size of evictable metadata in mru ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
- &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
- "size of evictable data in mru ghost state");
-
-extern arc_state_t ARC_mfu;
-
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_size,
- CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
- &ARC_mfu, 0, param_get_arc_state_size, "Q",
- "size of mfu state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
- &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
- "size of evictable metadata in mfu state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
- &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
- "size of evictable data in mfu state");
-
-extern arc_state_t ARC_mfu_ghost;
-
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_ghost_size,
- CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
- &ARC_mfu_ghost, 0, param_get_arc_state_size, "Q",
- "size of mfu ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
- &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
- "size of evictable metadata in mfu ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
- &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
- "size of evictable data in mfu ghost state");
-
-extern arc_state_t ARC_uncached;
-
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, uncached_size,
- CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
- &ARC_uncached, 0, param_get_arc_state_size, "Q",
- "size of uncached state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD,
- &ARC_uncached.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
- "size of evictable metadata in uncached state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD,
- &ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
- "size of evictable data in uncached state");
-
-extern arc_state_t ARC_l2c_only;
-
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, l2c_only_size,
- CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
- &ARC_l2c_only, 0, param_get_arc_state_size, "Q",
- "size of l2c_only state");
-
-/* dbuf.c */
-
-/* dmu.c */
-
-/* dmu_zfetch.c */
-
-SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)");
-
-extern uint32_t zfetch_max_distance;
-
-SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance,
- CTLFLAG_RWTUN, &zfetch_max_distance, 0,
- "Max bytes to prefetch per stream (LEGACY)");
-
-extern uint32_t zfetch_max_idistance;
-
-SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance,
- CTLFLAG_RWTUN, &zfetch_max_idistance, 0,
- "Max bytes to prefetch indirects for per stream (LEGACY)");
-
-/* dsl_pool.c */
-
-/* dnode.c */
-
-/* dsl_scan.c */
-
/* metaslab.c */
int
@@ -514,19 +313,6 @@ SYSCTL_UINT(_vfs_zfs, OID_AUTO, condense_pct,
"Condense on-disk spacemap when it is more than this many percents"
" of in-memory counterpart");
-extern uint_t zfs_remove_max_segment;
-
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, remove_max_segment,
- CTLFLAG_RWTUN, &zfs_remove_max_segment, 0,
- "Largest contiguous segment ZFS will attempt to allocate when removing"
- " a device");
-
-extern int zfs_removal_suspend_progress;
-
-SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress,
- CTLFLAG_RWTUN, &zfs_removal_suspend_progress, 0,
- "Ensures certain actions can happen while in the middle of a removal");
-
/*
* Minimum size which forces the dynamic allocator to change
* it's allocation strategy. Once the space map cannot satisfy
@@ -749,12 +535,6 @@ param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS)
return (0);
}
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
- CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
- &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift),
- param_set_min_auto_ashift, "IU",
- "Min ashift used when creating new top-level vdev. (LEGACY)");
-
int
param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS)
{
@@ -774,13 +554,6 @@ param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS)
return (0);
}
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
- CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
- &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift),
- param_set_max_auto_ashift, "IU",
- "Max ashift used when optimizing for logical -> physical sector size on"
- " new top-level vdevs. (LEGACY)");
-
/*
* Since the DTL space map of a vdev is not expected to have a lot of
* entries, we default its block size to 4K.
@@ -802,23 +575,6 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz,
CTLFLAG_RDTUN, &zfs_vdev_standard_sm_blksz, 0,
"Block size for standard space map. Power of 2 greater than 4096.");
-extern int vdev_validate_skip;
-
-SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip,
- CTLFLAG_RDTUN, &vdev_validate_skip, 0,
- "Enable to bypass vdev_validate().");
-
-/* vdev_mirror.c */
-
-/* vdev_queue.c */
-
-extern uint_t zfs_vdev_max_active;
-
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight,
- CTLFLAG_RWTUN, &zfs_vdev_max_active, 0,
- "The maximum number of I/Os of all types active for each device."
- " (LEGACY)");
-
/* zio.c */
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata,
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
index b15a3e6e38c0..cb5787269db2 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
@@ -1175,7 +1175,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
int count = 0;
zfs_acl_phys_t acl_phys;
- if (zp->z_zfsvfs->z_replay == B_FALSE) {
+ if (ZTOV(zp) != NULL && zp->z_zfsvfs->z_replay == B_FALSE) {
ASSERT_VOP_IN_SEQC(ZTOV(zp));
}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
index 61d0bb26d1e5..4de48e013ec4 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
@@ -494,7 +494,7 @@ zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
vap->va_uid = 0;
vap->va_gid = 0;
- vap->va_rdev = 0;
+ vap->va_rdev = NODEV;
/*
* We are a purely virtual object, so we have no
* blocksize or allocated blocks.
@@ -688,6 +688,8 @@ zfsctl_root_readdir(struct vop_readdir_args *ap)
* count to return is 0.
*/
if (zfs_uio_offset(&uio) == 3 * sizeof (entry)) {
+ if (eofp != NULL)
+ *eofp = 1;
return (0);
}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
index 21e5f7938f9f..ca13569a1235 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
@@ -164,8 +164,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
int
zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
- ssize_t *resid)
+ uint8_t ashift, ssize_t *resid)
{
+ (void) ashift;
return (zfs_file_write_impl(fp, buf, count, &off, resid));
}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
index 1813c411b013..411225786089 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -61,6 +61,7 @@
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
@@ -388,7 +389,9 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
error = vn_lock(vp, LK_EXCLUSIVE);
if (error)
return (error);
+ vn_seqc_write_begin(vp);
error = zfs_ioctl_setxattr(vp, fsx, cred);
+ vn_seqc_write_end(vp);
VOP_UNLOCK(vp);
return (error);
}
@@ -1735,7 +1738,7 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
/*
* Quit if directory has been removed (posix)
*/
- if ((*eofp = zp->z_unlinked) != 0) {
+ if ((*eofp = (zp->z_unlinked != 0)) != 0) {
zfs_exit(zfsvfs, FTAG);
return (0);
}
@@ -2013,7 +2016,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
if (vp->v_type == VBLK || vp->v_type == VCHR)
vap->va_rdev = zfs_cmpldev(rdev);
else
- vap->va_rdev = 0;
+ vap->va_rdev = NODEV;
vap->va_gen = zp->z_gen;
vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
vap->va_filerev = zp->z_seq;
@@ -2203,6 +2206,7 @@ zfs_setattr_dir(znode_t *dzp)
if (err)
break;
+ vn_seqc_write_begin(ZTOV(zp));
mutex_enter(&dzp->z_lock);
if (zp->z_uid != dzp->z_uid) {
@@ -2252,6 +2256,7 @@ sa_add_projid_err:
dmu_tx_abort(tx);
}
tx = NULL;
+ vn_seqc_write_end(ZTOV(zp));
if (err != 0 && err != ENOENT)
break;
@@ -5727,6 +5732,9 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
{
ulong_t val;
int error;
+#ifdef _PC_CLONE_BLKSIZE
+ zfsvfs_t *zfsvfs;
+#endif
error = zfs_pathconf(ap->a_vp, ap->a_name, &val,
curthread->td_ucred, NULL);
@@ -5773,6 +5781,21 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
*ap->a_retval = 1;
return (0);
#endif
+#ifdef _PC_CLONE_BLKSIZE
+ case _PC_CLONE_BLKSIZE:
+ zfsvfs = (zfsvfs_t *)ap->a_vp->v_mount->mnt_data;
+ if (zfs_bclone_enabled &&
+ spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os),
+ SPA_FEATURE_BLOCK_CLONING))
+ *ap->a_retval = dsl_dataset_feature_is_active(
+ zfsvfs->z_os->os_dsl_dataset,
+ SPA_FEATURE_LARGE_BLOCKS) ?
+ SPA_MAXBLOCKSIZE :
+ SPA_OLD_MAXBLOCKSIZE;
+ else
+ *ap->a_retval = 0;
+ return (0);
+#endif
default:
return (vop_stdpathconf(ap));
}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
index 7cd0a153577c..649022ab5bcb 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
@@ -817,6 +817,10 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
(*zpp)->z_dnodesize = dnodesize;
(*zpp)->z_projid = projid;
+ vnode_t *vp = ZTOV(*zpp);
+ if (!(flag & IS_ROOT_NODE))
+ vn_seqc_write_begin(vp);
+
if (vap->va_mask & AT_XVATTR)
zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
@@ -825,7 +829,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
}
if (!(flag & IS_ROOT_NODE)) {
- vnode_t *vp = ZTOV(*zpp);
+ vn_seqc_write_end(vp);
vp->v_vflag |= VV_FORCEINSMQ;
int err = insmntque(vp, zfsvfs->z_vfs);
vp->v_vflag &= ~VV_FORCEINSMQ;
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
index 265dfd55fc4d..0dd2ecd7fd8d 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
@@ -31,7 +31,7 @@
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
- * Copyright (c) 2024, Klara, Inc.
+ * Copyright (c) 2024, 2025, Klara, Inc.
*/
/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
@@ -196,7 +196,6 @@ DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
-static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
@@ -226,25 +225,14 @@ zvol_geom_open(struct g_provider *pp, int flag, int count)
}
retry:
- rw_enter(&zvol_state_lock, ZVOL_RW_READER);
- /*
- * Obtain a copy of private under zvol_state_lock to make sure either
- * the result of zvol free code setting private to NULL is observed,
- * or the zv is protected from being freed because of the positive
- * zv_open_count.
- */
- zv = pp->private;
- if (zv == NULL) {
- rw_exit(&zvol_state_lock);
- err = SET_ERROR(ENXIO);
- goto out_locked;
- }
+ zv = atomic_load_ptr(&pp->private);
+ if (zv == NULL)
+ return (SET_ERROR(ENXIO));
mutex_enter(&zv->zv_state_lock);
if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
- rw_exit(&zvol_state_lock);
err = SET_ERROR(ENXIO);
- goto out_zv_locked;
+ goto out_locked;
}
ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
@@ -257,8 +245,24 @@ retry:
drop_suspend = B_TRUE;
if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
mutex_exit(&zv->zv_state_lock);
+
+ /*
+ * Removal may happen while the locks are down, so
+ * we can't trust zv any longer; we have to start over.
+ */
+ zv = atomic_load_ptr(&pp->private);
+ if (zv == NULL)
+ return (SET_ERROR(ENXIO));
+
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ if (zv->zv_zso->zso_dying ||
+ zv->zv_flags & ZVOL_REMOVING) {
+ err = SET_ERROR(ENXIO);
+ goto out_locked;
+ }
+
/* Check to see if zv_suspend_lock is needed. */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
@@ -266,7 +270,6 @@ retry:
}
}
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -294,7 +297,7 @@ retry:
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
if (err)
- goto out_zv_locked;
+ goto out_locked;
pp->mediasize = zv->zv_volsize;
pp->stripeoffset = 0;
pp->stripesize = zv->zv_volblocksize;
@@ -329,9 +332,8 @@ out_opened:
zvol_last_close(zv);
wakeup(zv);
}
-out_zv_locked:
- mutex_exit(&zv->zv_state_lock);
out_locked:
+ mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
return (err);
@@ -345,12 +347,9 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
boolean_t drop_suspend = B_TRUE;
int new_open_count;
- rw_enter(&zvol_state_lock, ZVOL_RW_READER);
- zv = pp->private;
- if (zv == NULL) {
- rw_exit(&zvol_state_lock);
+ zv = atomic_load_ptr(&pp->private);
+ if (zv == NULL)
return (SET_ERROR(ENXIO));
- }
mutex_enter(&zv->zv_state_lock);
if (zv->zv_flags & ZVOL_EXCL) {
@@ -377,6 +376,15 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ /*
+ * Unlike in zvol_geom_open(), we don't check if
+ * removal started here, because we might be one of the
+ * openers that needs to be thrown out! If we're the
+ * last, we need to call zvol_last_close() below to
+ * finish cleanup. So, no special treatment for us.
+ */
+
/* Check to see if zv_suspend_lock is needed. */
new_open_count = zv->zv_open_count - count;
if (new_open_count != 0) {
@@ -387,7 +395,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
} else {
drop_suspend = B_FALSE;
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -408,20 +415,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
return (0);
}
-static void
-zvol_geom_destroy(zvol_state_t *zv)
-{
- struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
- struct g_provider *pp = zsg->zsg_provider;
-
- ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
-
- g_topology_assert();
-
- zsg->zsg_provider = NULL;
- g_wither_geom(pp->geom, ENXIO);
-}
-
void
zvol_wait_close(zvol_state_t *zv)
{
@@ -454,7 +447,7 @@ zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
pp->name, acr, acw, ace));
- if (pp->private == NULL) {
+ if (atomic_load_ptr(&pp->private) == NULL) {
if (acr <= 0 && acw <= 0 && ace <= 0)
return (0);
return (pp->error);
@@ -921,25 +914,14 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
boolean_t drop_suspend = B_FALSE;
retry:
- rw_enter(&zvol_state_lock, ZVOL_RW_READER);
- /*
- * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
- * the result of zvol free code setting si_drv2 to NULL is observed,
- * or the zv is protected from being freed because of the positive
- * zv_open_count.
- */
- zv = dev->si_drv2;
- if (zv == NULL) {
- rw_exit(&zvol_state_lock);
- err = SET_ERROR(ENXIO);
- goto out_locked;
- }
+ zv = atomic_load_ptr(&dev->si_drv2);
+ if (zv == NULL)
+ return (SET_ERROR(ENXIO));
mutex_enter(&zv->zv_state_lock);
- if (zv->zv_zso->zso_dying) {
- rw_exit(&zvol_state_lock);
+ if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
err = SET_ERROR(ENXIO);
- goto out_zv_locked;
+ goto out_locked;
}
ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
@@ -954,6 +936,13 @@ retry:
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
+ /* Removal started while locks were down. */
+ err = SET_ERROR(ENXIO);
+ goto out_locked;
+ }
+
/* Check to see if zv_suspend_lock is needed. */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
@@ -961,7 +950,6 @@ retry:
}
}
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -989,7 +977,7 @@ retry:
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
if (err)
- goto out_zv_locked;
+ goto out_locked;
}
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -1016,9 +1004,8 @@ out_opened:
zvol_last_close(zv);
wakeup(zv);
}
-out_zv_locked:
- mutex_exit(&zv->zv_state_lock);
out_locked:
+ mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
return (err);
@@ -1030,12 +1017,9 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
zvol_state_t *zv;
boolean_t drop_suspend = B_TRUE;
- rw_enter(&zvol_state_lock, ZVOL_RW_READER);
- zv = dev->si_drv2;
- if (zv == NULL) {
- rw_exit(&zvol_state_lock);
+ zv = atomic_load_ptr(&dev->si_drv2);
+ if (zv == NULL)
return (SET_ERROR(ENXIO));
- }
mutex_enter(&zv->zv_state_lock);
if (zv->zv_flags & ZVOL_EXCL) {
@@ -1060,6 +1044,15 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ /*
+ * Unlike in zvol_cdev_open(), we don't check if
+ * removal started here, because we might be one of the
+ * openers that needs to be thrown out! If we're the
+ * last, we need to call zvol_last_close() below to
+ * finish cleanup. So, no special treatment for us.
+ */
+
/* Check to see if zv_suspend_lock is needed. */
if (zv->zv_open_count != 1) {
rw_exit(&zv->zv_suspend_lock);
@@ -1069,7 +1062,6 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
} else {
drop_suspend = B_FALSE;
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -1101,7 +1093,8 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
int error;
boolean_t sync;
- zv = dev->si_drv2;
+ zv = atomic_load_ptr(&dev->si_drv2);
+ ASSERT3P(zv, !=, NULL);
error = 0;
KASSERT(zv->zv_open_count > 0,
@@ -1162,6 +1155,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
*(off_t *)data = 0;
break;
case DIOCGATTR: {
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
spa_t *spa = dmu_objset_spa(zv->zv_objset);
struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
uint64_t refd, avail, usedobjs, availobjs;
@@ -1186,6 +1180,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
arg->value.off = refd / DEV_BSIZE;
} else
error = SET_ERROR(ENOIOCTL);
+ rw_exit(&zv->zv_suspend_lock);
break;
}
case FIOSEEKHOLE:
@@ -1196,10 +1191,12 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
hole = (cmd == FIOSEEKHOLE);
noff = *off;
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
RL_READER);
error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
zfs_rangelock_exit(lr);
+ rw_exit(&zv->zv_suspend_lock);
*off = noff;
break;
}
@@ -1400,42 +1397,65 @@ zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize,
* Remove minor node for the specified volume.
*/
void
-zvol_os_free(zvol_state_t *zv)
+zvol_os_remove_minor(zvol_state_t *zv)
{
- ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
- ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
ASSERT0(zv->zv_open_count);
+ ASSERT0(atomic_read(&zv->zv_suspend_ref));
+ ASSERT(zv->zv_flags & ZVOL_REMOVING);
- ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
-
- rw_destroy(&zv->zv_suspend_lock);
- zfs_rangelock_fini(&zv->zv_rangelock);
+ struct zvol_state_os *zso = zv->zv_zso;
+ zv->zv_zso = NULL;
if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
- struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
- struct g_provider *pp __maybe_unused = zsg->zsg_provider;
-
- ASSERT0P(pp->private);
+ struct zvol_state_geom *zsg = &zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+ atomic_store_ptr(&pp->private, NULL);
+ mutex_exit(&zv->zv_state_lock);
g_topology_lock();
- zvol_geom_destroy(zv);
+ g_wither_geom(pp->geom, ENXIO);
g_topology_unlock();
} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
- struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+ struct zvol_state_dev *zsd = &zso->zso_dev;
struct cdev *dev = zsd->zsd_cdev;
+ if (dev != NULL)
+ atomic_store_ptr(&dev->si_drv2, NULL);
+ mutex_exit(&zv->zv_state_lock);
+
if (dev != NULL) {
- ASSERT0P(dev->si_drv2);
destroy_dev(dev);
knlist_clear(&zsd->zsd_selinfo.si_note, 0);
knlist_destroy(&zsd->zsd_selinfo.si_note);
}
}
+ kmem_free(zso, sizeof (struct zvol_state_os));
+
+ mutex_enter(&zv->zv_state_lock);
+}
+
+void
+zvol_os_free(zvol_state_t *zv)
+{
+ ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT0(zv->zv_open_count);
+ ASSERT0P(zv->zv_zso);
+
+ ASSERT0P(zv->zv_objset);
+ ASSERT0P(zv->zv_zilog);
+ ASSERT0P(zv->zv_dn);
+
+ ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
+
+ rw_destroy(&zv->zv_suspend_lock);
+ zfs_rangelock_fini(&zv->zv_rangelock);
+
mutex_destroy(&zv->zv_state_lock);
cv_destroy(&zv->zv_removing_cv);
dataset_kstats_destroy(&zv->zv_kstat);
- kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
kmem_free(zv, sizeof (zvol_state_t));
zvol_minors--;
}
@@ -1538,28 +1558,6 @@ out_doi:
return (error);
}
-void
-zvol_os_clear_private(zvol_state_t *zv)
-{
- ASSERT(RW_LOCK_HELD(&zvol_state_lock));
- if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
- struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
- struct g_provider *pp = zsg->zsg_provider;
-
- if (pp->private == NULL) /* already cleared */
- return;
-
- pp->private = NULL;
- ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
- } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
- struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
- struct cdev *dev = zsd->zsd_cdev;
-
- if (dev != NULL)
- dev->si_drv2 = NULL;
- }
-}
-
int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
index 830fad7fe793..1bd3500e9f66 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
@@ -471,13 +471,17 @@ vdev_disk_close(vdev_t *v)
if (v->vdev_reopening || vd == NULL)
return;
+ rw_enter(&vd->vd_lock, RW_WRITER);
+
if (vd->vd_bdh != NULL)
vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
zfs_vdev_holder);
+ v->vdev_tsd = NULL;
+
+ rw_exit(&vd->vd_lock);
rw_destroy(&vd->vd_lock);
kmem_free(vd, sizeof (vdev_disk_t));
- v->vdev_tsd = NULL;
}
/*
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
index c729947369c2..3fdcdbac6f68 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
@@ -115,8 +115,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
*/
int
zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
- ssize_t *resid)
+ uint8_t ashift, ssize_t *resid)
{
+ (void) ashift;
ssize_t rc;
rc = kernel_write(fp, buf, count, &off);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
index cd606e667bff..8a7d14ab6119 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
@@ -1556,6 +1556,12 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
sb->s_xattr = zpl_xattr_handlers;
sb->s_export_op = &zpl_export_operations;
+#ifdef HAVE_SET_DEFAULT_D_OP
+ set_default_d_op(sb, &zpl_dentry_operations);
+#else
+ sb->s_d_op = &zpl_dentry_operations;
+#endif
+
/* Set features for file system. */
zfs_set_fuid_feature(zfsvfs);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
index 48dae79a2373..81ac26cb0c93 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
@@ -202,7 +202,7 @@ zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags)
return (!!dentry->d_inode);
}
-static dentry_operations_t zpl_dops_snapdirs = {
+static const struct dentry_operations zpl_dops_snapdirs = {
/*
* Auto mounting of snapshots is only supported for 2.6.37 and
* newer kernels. Prior to this kernel the ops->follow_link()
@@ -215,6 +215,51 @@ static dentry_operations_t zpl_dops_snapdirs = {
.d_revalidate = zpl_snapdir_revalidate,
};
+/*
+ * For the .zfs control directory to work properly we must be able to override
+ * the default operations table and register custom .d_automount and
+ * .d_revalidate callbacks.
+ */
+static void
+set_snapdir_dentry_ops(struct dentry *dentry, unsigned int extraflags) {
+ static const unsigned int op_flags =
+ DCACHE_OP_HASH | DCACHE_OP_COMPARE |
+ DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE |
+ DCACHE_OP_PRUNE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_REAL;
+
+#ifdef HAVE_D_SET_D_OP
+ /*
+ * d_set_d_op() will set the DCACHE_OP_ flags according to what it
+ * finds in the passed dentry_operations, so we don't have to.
+ *
+ * We clear the flags and the old op table before calling d_set_d_op()
+ * because issues a warning when the dentry operations table is already
+ * set.
+ */
+ dentry->d_op = NULL;
+ dentry->d_flags &= ~op_flags;
+ d_set_d_op(dentry, &zpl_dops_snapdirs);
+ dentry->d_flags |= extraflags;
+#else
+ /*
+ * Since 6.17 there's no exported way to modify dentry ops, so we have
+ * to reach in and do it ourselves. This should be safe for our very
+ * narrow use case, which is to create or splice in an entry to give
+ * access to a snapshot.
+ *
+ * We need to set the op flags directly. We hardcode
+ * DCACHE_OP_REVALIDATE because that's the only operation we have; if
+ * we ever extend zpl_dops_snapdirs we will need to update the op flags
+ * to match.
+ */
+ spin_lock(&dentry->d_lock);
+ dentry->d_op = &zpl_dops_snapdirs;
+ dentry->d_flags &= ~op_flags;
+ dentry->d_flags |= DCACHE_OP_REVALIDATE | extraflags;
+ spin_unlock(&dentry->d_lock);
+#endif
+}
+
static struct dentry *
zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
unsigned int flags)
@@ -236,10 +281,7 @@ zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
return (ERR_PTR(error));
ASSERT(error == 0 || ip == NULL);
- d_clear_d_op(dentry);
- d_set_d_op(dentry, &zpl_dops_snapdirs);
- dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
-
+ set_snapdir_dentry_ops(dentry, DCACHE_NEED_AUTOMOUNT);
return (d_splice_alias(ip, dentry));
}
@@ -373,8 +415,7 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0);
if (error == 0) {
- d_clear_d_op(dentry);
- d_set_d_op(dentry, &zpl_dops_snapdirs);
+ set_snapdir_dentry_ops(dentry, 0);
d_instantiate(dentry, ip);
}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
index 53819628627d..444948d03cb3 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2011, Lawrence Livermore National Security, LLC.
* Copyright (c) 2023, Datto Inc. All rights reserved.
+ * Copyright (c) 2025, Klara, Inc.
*/
@@ -33,6 +34,20 @@
#include <linux/iversion.h>
#include <linux/version.h>
+/*
+ * What to do when the last reference to an inode is released. If 0, the kernel
+ * will cache it on the superblock. If 1, the inode will be freed immediately.
+ * See zpl_drop_inode().
+ */
+int zfs_delete_inode = 0;
+
+/*
+ * What to do when the last reference to a dentry is released. If 0, the kernel
+ * will cache it until the entry (file) is destroyed. If 1, the dentry will be
+ * marked for cleanup, at which time its inode reference will be released. See
+ * zpl_dentry_delete().
+ */
+int zfs_delete_dentry = 0;
static struct inode *
zpl_inode_alloc(struct super_block *sb)
@@ -77,11 +92,36 @@ zpl_dirty_inode(struct inode *ip, int flags)
}
/*
- * When ->drop_inode() is called its return value indicates if the
- * inode should be evicted from the inode cache. If the inode is
- * unhashed and has no links the default policy is to evict it
- * immediately.
+ * ->drop_inode() is called when the last reference to an inode is released.
+ * Its return value indicates if the inode should be destroyed immediately, or
+ * cached on the superblock structure.
+ *
+ * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns
+ * "destroy immediately" if the inode is unhashed and has no links (roughly: no
+ * longer exists on disk). On datasets with millions of rarely-accessed files,
+ * this can cause a large amount of memory to be "pinned" by cached inodes,
+ * which in turn pin their associated dnodes and dbufs, until the kernel starts
+ * reporting memory pressure and requests OpenZFS release some memory (see
+ * zfs_prune()).
+ *
+ * When set to 1, we call generic_delete_node(), which always returns "destroy
+ * immediately", resulting in inodes being destroyed immediately, releasing
+ * their associated dnodes and dbufs to the dbuf cached and the ARC to be
+ * evicted as normal.
*
+ * Note that the "last reference" doesn't always mean the last _userspace_
+ * reference; the dentry cache also holds a reference, so "busy" inodes will
+ * still be kept alive that way (subject to dcache tuning).
+ */
+static int
+zpl_drop_inode(struct inode *ip)
+{
+ if (zfs_delete_inode)
+ return (generic_delete_inode(ip));
+ return (generic_drop_inode(ip));
+}
+
+/*
* The ->evict_inode() callback must minimally truncate the inode pages,
* and call clear_inode(). For 2.6.35 and later kernels this will
* simply update the inode state, with the sync occurring before the
@@ -470,6 +510,7 @@ const struct super_operations zpl_super_operations = {
.destroy_inode = zpl_inode_destroy,
.dirty_inode = zpl_dirty_inode,
.write_inode = NULL,
+ .drop_inode = zpl_drop_inode,
.evict_inode = zpl_evict_inode,
.put_super = zpl_put_super,
.sync_fs = zpl_sync_fs,
@@ -480,6 +521,35 @@ const struct super_operations zpl_super_operations = {
.show_stats = NULL,
};
+/*
+ * ->d_delete() is called when the last reference to a dentry is released. Its
+ * return value indicates if the dentry should be destroyed immediately, or
+ * retained in the dentry cache.
+ *
+ * By default (zfs_delete_dentry=0) the kernel will always cache unused
+ * entries. Each dentry holds an inode reference, so cached dentries can hold
+ * the final inode reference indefinitely, leading to the inode and its related
+ * data being pinned (see zpl_drop_inode()).
+ *
+ * When set to 1, we signal that the dentry should be destroyed immediately and
+ * never cached. This reduces memory usage, at the cost of higher overheads to
+ * lookup a file, as the inode and its underlying data (dnode/dbuf) need to be
+ * reloaded and reinflated.
+ *
+ * Note that userspace does not have direct control over dentry references and
+ * reclaim; rather, this is part of the kernel's caching and reclaim subsystems
+ * (eg vm.vfs_cache_pressure).
+ */
+static int
+zpl_dentry_delete(const struct dentry *dentry)
+{
+ return (zfs_delete_dentry ? 1 : 0);
+}
+
+const struct dentry_operations zpl_dentry_operations = {
+ .d_delete = zpl_dentry_delete,
+};
+
struct file_system_type zpl_fs_type = {
.owner = THIS_MODULE,
.name = ZFS_DRIVER,
@@ -491,3 +561,10 @@ struct file_system_type zpl_fs_type = {
.mount = zpl_mount,
.kill_sb = zpl_kill_sb,
};
+
+ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW,
+ "Delete inodes as soon as the last reference is released.");
+
+ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW,
+ "Delete dentries from dentry cache as soon as the last reference is "
+ "released.");
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index a73acdad34ae..bac166fcd89e 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
- * Copyright (c) 2024, Klara, Inc.
+ * Copyright (c) 2024, 2025, Klara, Inc.
*/
#include <sys/dataset_kstats.h>
@@ -679,28 +679,19 @@ zvol_open(struct block_device *bdev, fmode_t flag)
retry:
#endif
- rw_enter(&zvol_state_lock, RW_READER);
- /*
- * Obtain a copy of private_data under the zvol_state_lock to make
- * sure that either the result of zvol free code path setting
- * disk->private_data to NULL is observed, or zvol_os_free()
- * is not called on this zv because of the positive zv_open_count.
- */
+
#ifdef HAVE_BLK_MODE_T
- zv = disk->private_data;
+ zv = atomic_load_ptr(&disk->private_data);
#else
- zv = bdev->bd_disk->private_data;
+ zv = atomic_load_ptr(&bdev->bd_disk->private_data);
#endif
if (zv == NULL) {
- rw_exit(&zvol_state_lock);
return (-SET_ERROR(ENXIO));
}
mutex_enter(&zv->zv_state_lock);
-
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
mutex_exit(&zv->zv_state_lock);
- rw_exit(&zvol_state_lock);
return (-SET_ERROR(ENXIO));
}
@@ -712,8 +703,28 @@ retry:
if (zv->zv_open_count == 0) {
if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
mutex_exit(&zv->zv_state_lock);
+
+ /*
+ * Removal may happen while the locks are down, so
+ * we can't trust zv any longer; we have to start over.
+ */
+#ifdef HAVE_BLK_MODE_T
+ zv = atomic_load_ptr(&disk->private_data);
+#else
+ zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+#endif
+ if (zv == NULL)
+ return (-SET_ERROR(ENXIO));
+
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);
+ return (-SET_ERROR(ENXIO));
+ }
+
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
@@ -724,7 +735,6 @@ retry:
drop_suspend = B_TRUE;
}
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -821,11 +831,11 @@ zvol_release(struct gendisk *disk, fmode_t unused)
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
(void) unused;
#endif
- zvol_state_t *zv;
boolean_t drop_suspend = B_TRUE;
- rw_enter(&zvol_state_lock, RW_READER);
- zv = disk->private_data;
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
+ if (zv == NULL)
+ return;
mutex_enter(&zv->zv_state_lock);
ASSERT3U(zv->zv_open_count, >, 0);
@@ -839,6 +849,15 @@ zvol_release(struct gendisk *disk, fmode_t unused)
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ /*
+ * Unlike in zvol_open(), we don't check if removal
+ * started here, because we might be one of the openers
+ * that needs to be thrown out! If we're the last, we
+ * need to call zvol_last_close() below to finish
+ * cleanup. So, no special treatment for us.
+ */
+
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 1) {
rw_exit(&zv->zv_suspend_lock);
@@ -848,7 +867,6 @@ zvol_release(struct gendisk *disk, fmode_t unused)
} else {
drop_suspend = B_FALSE;
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -868,9 +886,10 @@ static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
- zvol_state_t *zv = bdev->bd_disk->private_data;
int error = 0;
+ zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+ ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
switch (cmd) {
@@ -923,9 +942,8 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
unsigned int mask = 0;
- rw_enter(&zvol_state_lock, RW_READER);
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
- zvol_state_t *zv = disk->private_data;
if (zv != NULL) {
mutex_enter(&zv->zv_state_lock);
mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
@@ -933,17 +951,14 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing)
mutex_exit(&zv->zv_state_lock);
}
- rw_exit(&zvol_state_lock);
-
return (mask);
}
static int
zvol_revalidate_disk(struct gendisk *disk)
{
- rw_enter(&zvol_state_lock, RW_READER);
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
- zvol_state_t *zv = disk->private_data;
if (zv != NULL) {
mutex_enter(&zv->zv_state_lock);
set_capacity(zv->zv_zso->zvo_disk,
@@ -951,8 +966,6 @@ zvol_revalidate_disk(struct gendisk *disk)
mutex_exit(&zv->zv_state_lock);
}
- rw_exit(&zvol_state_lock);
-
return (0);
}
@@ -971,16 +984,6 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
return (0);
}
-void
-zvol_os_clear_private(zvol_state_t *zv)
-{
- /*
- * Cleared while holding zvol_state_lock as a writer
- * which will prevent zvol_open() from opening it.
- */
- zv->zv_zso->zvo_disk->private_data = NULL;
-}
-
/*
* Provide a simple virtual geometry for legacy compatibility. For devices
* smaller than 1 MiB a small head and sector count is used to allow very
@@ -990,9 +993,10 @@ zvol_os_clear_private(zvol_state_t *zv)
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
- zvol_state_t *zv = bdev->bd_disk->private_data;
sector_t sectors;
+ zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+ ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
sectors = get_capacity(zv->zv_zso->zvo_disk);
@@ -1417,53 +1421,70 @@ out_kmem:
return (ret);
}
-/*
- * Cleanup then free a zvol_state_t which was created by zvol_alloc().
- * At this time, the structure is not opened by anyone, is taken off
- * the zvol_state_list, and has its private data set to NULL.
- * The zvol_state_lock is dropped.
- *
- * This function may take many milliseconds to complete (e.g. we've seen
- * it take over 256ms), due to the calls to "blk_cleanup_queue" and
- * "del_gendisk". Thus, consumers need to be careful to account for this
- * latency when calling this function.
- */
void
-zvol_os_free(zvol_state_t *zv)
+zvol_os_remove_minor(zvol_state_t *zv)
{
-
- ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
- ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
ASSERT0(zv->zv_open_count);
- ASSERT0P(zv->zv_zso->zvo_disk->private_data);
+ ASSERT0(atomic_read(&zv->zv_suspend_ref));
+ ASSERT(zv->zv_flags & ZVOL_REMOVING);
- rw_destroy(&zv->zv_suspend_lock);
- zfs_rangelock_fini(&zv->zv_rangelock);
+ struct zvol_state_os *zso = zv->zv_zso;
+ zv->zv_zso = NULL;
+
+ /* Clearing private_data will make new callers return immediately. */
+ atomic_store_ptr(&zso->zvo_disk->private_data, NULL);
+
+ /*
+ * Drop the state lock before calling del_gendisk(). There may be
+ * callers waiting to acquire it, but del_gendisk() will block until
+ * they exit, which would deadlock.
+ */
+ mutex_exit(&zv->zv_state_lock);
- del_gendisk(zv->zv_zso->zvo_disk);
+ del_gendisk(zso->zvo_disk);
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
#if defined(HAVE_BLK_CLEANUP_DISK)
- blk_cleanup_disk(zv->zv_zso->zvo_disk);
+ blk_cleanup_disk(zso->zvo_disk);
#else
- put_disk(zv->zv_zso->zvo_disk);
+ put_disk(zso->zvo_disk);
#endif
#else
- blk_cleanup_queue(zv->zv_zso->zvo_queue);
- put_disk(zv->zv_zso->zvo_disk);
+ blk_cleanup_queue(zso->zvo_queue);
+ put_disk(zso->zvo_disk);
#endif
- if (zv->zv_zso->use_blk_mq)
- blk_mq_free_tag_set(&zv->zv_zso->tag_set);
+ if (zso->use_blk_mq)
+ blk_mq_free_tag_set(&zso->tag_set);
+
+ ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);
- ida_simple_remove(&zvol_ida,
- MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
+ kmem_free(zso, sizeof (struct zvol_state_os));
+
+ mutex_enter(&zv->zv_state_lock);
+}
+
+void
+zvol_os_free(zvol_state_t *zv)
+{
+
+ ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT0(zv->zv_open_count);
+ ASSERT0P(zv->zv_zso);
+
+ ASSERT0P(zv->zv_objset);
+ ASSERT0P(zv->zv_zilog);
+ ASSERT0P(zv->zv_dn);
+
+ rw_destroy(&zv->zv_suspend_lock);
+ zfs_rangelock_fini(&zv->zv_rangelock);
cv_destroy(&zv->zv_removing_cv);
mutex_destroy(&zv->zv_state_lock);
dataset_kstats_destroy(&zv->zv_kstat);
- kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
kmem_free(zv, sizeof (zvol_state_t));
}