diff options
Diffstat (limited to 'sys/contrib/openzfs/module/os/linux')
12 files changed, 492 insertions, 138 deletions
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c index 092f090d934b..00ff789265c6 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c @@ -32,7 +32,6 @@ #include <sys/taskq.h> #include <sys/kmem.h> #include <sys/tsd.h> -#include <sys/trace_spl.h> #include <sys/time.h> #include <sys/atomic.h> #include <sys/kstat.h> @@ -325,7 +324,6 @@ task_expire_impl(taskq_ent_t *t) } t->tqent_birth = jiffies; - DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t); /* * The priority list must be maintained in strict task id order @@ -713,9 +711,7 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) t->tqent_taskq = tq; t->tqent_timer.function = NULL; t->tqent_timer.expires = 0; - t->tqent_birth = jiffies; - DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t); ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); @@ -840,9 +836,7 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, t->tqent_func = func; t->tqent_arg = arg; t->tqent_taskq = tq; - t->tqent_birth = jiffies; - DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t); spin_unlock(&t->tqent_lock); @@ -1054,11 +1048,6 @@ taskq_thread(void *args) * A TQENT_FLAG_PREALLOC task may be reused or freed * during the task function call. Store tqent_id and * tqent_flags here. - * - * Also use an on stack taskq_ent_t for tqt_task - * assignment in this case; we want to make sure - * to duplicate all fields, so the values are - * correct when it's accessed via DTRACE_PROBE*. */ tqt->tqt_id = t->tqent_id; tqt->tqt_flags = t->tqent_flags; @@ -1074,13 +1063,10 @@ taskq_thread(void *args) spin_unlock_irqrestore(&tq->tq_lock, flags); TQSTAT_INC(tq, threads_active); - DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t); /* Perform the requested task */ t->tqent_func(t->tqent_arg); - DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t); - TQSTAT_DEC(tq, threads_active); if ((t->tqent_flags & TQENT_LIST_MASK) == TQENT_LIST_PENDING) diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c index 45c2999a4bb1..b2eae5d00b10 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c @@ -25,6 +25,10 @@ * SUCH DAMAGE. */ +/* + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> + */ + #include <sys/types.h> #include <sys/sysmacros.h> #include <sys/kmem.h> @@ -56,6 +60,19 @@ typedef struct zone_dataset { } zone_dataset_t; #ifdef CONFIG_USER_NS + +/* + * Linux 6.18 moved the generic namespace type away from ns->ops->type onto + * ns_common itself. + */ +#ifdef HAVE_NS_COMMON_TYPE +#define ns_is_newuser(ns) \ + ((ns)->ns_type == CLONE_NEWUSER) +#else +#define ns_is_newuser(ns) \ + ((ns)->ops != NULL && (ns)->ops->type == CLONE_NEWUSER) +#endif + /* * Returns: * - 0 on success @@ -84,7 +101,7 @@ user_ns_get(int fd, struct user_namespace **userns) goto done; } ns = get_proc_ns(file_inode(nsfile)); - if (ns->ops->type != CLONE_NEWUSER) { + if (!ns_is_newuser(ns)) { error = ENOTTY; goto done; } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c index 8a8316f63c48..18f2426fbbfc 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c @@ -23,6 +23,7 @@ * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved. * Copyright (c) 2023, 2024, Klara Inc. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> */ /* @@ -1109,6 +1110,14 @@ abd_return_buf_copy(abd_t *abd, void *buf, size_t n) #define ABD_ITER_PAGE_SIZE(page) (PAGESIZE) #endif +#ifndef nth_page +/* + * Since 6.18 nth_page() no longer exists, and is no longer required to iterate + * within a single SG entry, so we replace it with a simple addition. + */ +#define nth_page(p, n) ((p)+(n)) +#endif + void abd_iter_page(struct abd_iter *aiter) { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c index 830fad7fe793..1bd3500e9f66 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c @@ -471,13 +471,17 @@ vdev_disk_close(vdev_t *v) if (v->vdev_reopening || vd == NULL) return; + rw_enter(&vd->vd_lock, RW_WRITER); + if (vd->vd_bdh != NULL) vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), zfs_vdev_holder); + v->vdev_tsd = NULL; + + rw_exit(&vd->vd_lock); rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); - v->vdev_tsd = NULL; } /* diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c index daa4b5776837..4c929a4642b1 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c @@ -1447,7 +1447,8 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) if (aclnode->z_ace_count == 0) continue; dmu_write(zfsvfs->z_os, aoid, off, - aclnode->z_size, aclnode->z_acldata, tx); + aclnode->z_size, aclnode->z_acldata, tx, + DMU_READ_NO_PREFETCH); off += aclnode->z_size; } } else { @@ -2524,7 +2525,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, * Also note: DOS R/O is ignored for directories. */ if ((v4_mode & WRITE_MASK_DATA) && - S_ISDIR(ZTOI(zp)->i_mode) && + !S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_READONLY)) { return (SET_ERROR(EPERM)); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c index c729947369c2..3fdcdbac6f68 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c @@ -115,8 +115,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) */ int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off, - ssize_t *resid) + uint8_t ashift, ssize_t *resid) { + (void) ashift; ssize_t rc; rc = kernel_write(fp, buf, count, &off); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c index cd606e667bff..8a7d14ab6119 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -1556,6 +1556,12 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) sb->s_xattr = zpl_xattr_handlers; sb->s_export_op = &zpl_export_operations; +#ifdef HAVE_SET_DEFAULT_D_OP + set_default_d_op(sb, &zpl_dentry_operations); +#else + sb->s_d_op = &zpl_dentry_operations; +#endif + /* Set features for file system. */ zfs_set_fuid_feature(zfsvfs); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index 6106726651a3..02465adf36d5 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -2033,10 +2033,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) goto out3; } - if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { - err = SET_ERROR(EPERM); - goto out3; - } + /* ZFS_READONLY will be handled in zfs_zaccess() */ /* * Verify timestamps doesn't overflow 32 bits. @@ -3895,7 +3892,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, va = kmap(pp); ASSERT3U(pglen, <=, PAGE_SIZE); - dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); + dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx, + DMU_READ_PREFETCH); kunmap(pp); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c index 48dae79a2373..81ac26cb0c93 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c @@ -202,7 +202,7 @@ zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags) return (!!dentry->d_inode); } -static dentry_operations_t zpl_dops_snapdirs = { +static const struct dentry_operations zpl_dops_snapdirs = { /* * Auto mounting of snapshots is only supported for 2.6.37 and * newer kernels. Prior to this kernel the ops->follow_link() @@ -215,6 +215,51 @@ static dentry_operations_t zpl_dops_snapdirs = { .d_revalidate = zpl_snapdir_revalidate, }; +/* + * For the .zfs control directory to work properly we must be able to override + * the default operations table and register custom .d_automount and + * .d_revalidate callbacks. + */ +static void +set_snapdir_dentry_ops(struct dentry *dentry, unsigned int extraflags) { + static const unsigned int op_flags = + DCACHE_OP_HASH | DCACHE_OP_COMPARE | + DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE | + DCACHE_OP_PRUNE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_REAL; + +#ifdef HAVE_D_SET_D_OP + /* + * d_set_d_op() will set the DCACHE_OP_ flags according to what it + * finds in the passed dentry_operations, so we don't have to. + * + * We clear the flags and the old op table before calling d_set_d_op() + * because issues a warning when the dentry operations table is already + * set. + */ + dentry->d_op = NULL; + dentry->d_flags &= ~op_flags; + d_set_d_op(dentry, &zpl_dops_snapdirs); + dentry->d_flags |= extraflags; +#else + /* + * Since 6.17 there's no exported way to modify dentry ops, so we have + * to reach in and do it ourselves. This should be safe for our very + * narrow use case, which is to create or splice in an entry to give + * access to a snapshot. + * + * We need to set the op flags directly. We hardcode + * DCACHE_OP_REVALIDATE because that's the only operation we have; if + * we ever extend zpl_dops_snapdirs we will need to update the op flags + * to match. + */ + spin_lock(&dentry->d_lock); + dentry->d_op = &zpl_dops_snapdirs; + dentry->d_flags &= ~op_flags; + dentry->d_flags |= DCACHE_OP_REVALIDATE | extraflags; + spin_unlock(&dentry->d_lock); +#endif +} + static struct dentry * zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) @@ -236,10 +281,7 @@ zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, return (ERR_PTR(error)); ASSERT(error == 0 || ip == NULL); - d_clear_d_op(dentry); - d_set_d_op(dentry, &zpl_dops_snapdirs); - dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; - + set_snapdir_dentry_ops(dentry, DCACHE_NEED_AUTOMOUNT); return (d_splice_alias(ip, dentry)); } @@ -373,8 +415,7 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0); if (error == 0) { - d_clear_d_op(dentry); - d_set_d_op(dentry, &zpl_dops_snapdirs); + set_snapdir_dentry_ops(dentry, 0); d_instantiate(dentry, ip); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index d07317b0d910..f7691c02d163 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -23,6 +23,7 @@ * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright (c) 2025, Klara, Inc. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> */ @@ -478,6 +479,7 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) return (ret); } +#ifdef HAVE_WRITE_CACHE_PAGES #ifdef HAVE_WRITEPAGE_T_FOLIO static int zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data) @@ -499,6 +501,78 @@ zpl_write_cache_pages(struct address_space *mapping, #endif return (result); } +#else +static inline int +zpl_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, void *data) +{ + pgoff_t start = wbc->range_start >> PAGE_SHIFT; + pgoff_t end = wbc->range_end >> PAGE_SHIFT; + + struct folio_batch fbatch; + folio_batch_init(&fbatch); + + /* + * This atomically (-ish) tags all DIRTY pages in the range with + * TOWRITE, allowing users to continue dirtying or undirtying pages + * while we get on with writeback, without us treading on each other. + */ + tag_pages_for_writeback(mapping, start, end); + + int err = 0; + unsigned int npages; + + /* + * Grab references to the TOWRITE pages just flagged. This may not get + * all of them, so we do it in a loop until there are none left. + */ + while ((npages = filemap_get_folios_tag(mapping, &start, end, + PAGECACHE_TAG_TOWRITE, &fbatch)) != 0) { + + /* Loop over each page and write it out. */ + struct folio *folio; + while ((folio = folio_batch_next(&fbatch)) != NULL) { + folio_lock(folio); + + /* + * If the folio has been remapped, or is no longer + * dirty, then there's nothing to do. + */ + if (folio->mapping != mapping || + !folio_test_dirty(folio)) { + folio_unlock(folio); + continue; + } + + /* + * If writeback is already in progress, wait for it to + * finish. We continue after this even if the page + * ends up clean; zfs_putpage() will skip it if no + * further work is required. + */ + while (folio_test_writeback(folio)) + folio_wait_bit(folio, PG_writeback); + + /* + * Write it out and collect any error. zfs_putpage() + * will clear the TOWRITE and DIRTY flags, and return + * with the page unlocked. + */ + int ferr = zpl_putpage(&folio->page, wbc, data); + if (err == 0 && ferr != 0) + err = ferr; + + /* Housekeeping for the caller. */ + wbc->nr_to_write -= folio_nr_pages(folio); + } + + /* Release any remaining references on the batch. */ + folio_batch_release(&fbatch); + } + + return (err); +} +#endif static int zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) @@ -737,28 +811,44 @@ zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice) return (error); } -#define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL) -#define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL) +#define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL) +#define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | FS_PROJINHERIT_FL) + + +static struct { + uint64_t zfs_flag; + uint32_t fs_flag; + uint32_t xflag; +} flags_lookup[] = { + {ZFS_IMMUTABLE, FS_IMMUTABLE_FL, FS_XFLAG_IMMUTABLE}, + {ZFS_APPENDONLY, FS_APPEND_FL, FS_XFLAG_APPEND}, + {ZFS_NODUMP, FS_NODUMP_FL, FS_XFLAG_NODUMP}, + {ZFS_PROJINHERIT, FS_PROJINHERIT_FL, FS_XFLAG_PROJINHERIT} +}; static uint32_t __zpl_ioctl_getflags(struct inode *ip) { uint64_t zfs_flags = ITOZ(ip)->z_pflags; uint32_t ioctl_flags = 0; + for (int i = 0; i < ARRAY_SIZE(flags_lookup); i++) + if (zfs_flags & flags_lookup[i].zfs_flag) + ioctl_flags |= flags_lookup[i].fs_flag; - if (zfs_flags & ZFS_IMMUTABLE) - ioctl_flags |= FS_IMMUTABLE_FL; - - if (zfs_flags & ZFS_APPENDONLY) - ioctl_flags |= FS_APPEND_FL; + return (ioctl_flags); +} - if (zfs_flags & ZFS_NODUMP) - ioctl_flags |= FS_NODUMP_FL; +static uint32_t +__zpl_ioctl_getxflags(struct inode *ip) +{ + uint64_t zfs_flags = ITOZ(ip)->z_pflags; + uint32_t ioctl_flags = 0; - if (zfs_flags & ZFS_PROJINHERIT) - ioctl_flags |= ZFS_PROJINHERIT_FL; + for (int i = 0; i < ARRAY_SIZE(flags_lookup); i++) + if (zfs_flags & flags_lookup[i].zfs_flag) + ioctl_flags |= flags_lookup[i].xflag; - return (ioctl_flags & ZFS_FL_USER_VISIBLE); + return (ioctl_flags); } /* @@ -772,6 +862,7 @@ zpl_ioctl_getflags(struct file *filp, void __user *arg) int err; flags = __zpl_ioctl_getflags(file_inode(filp)); + flags = flags & ZFS_FL_USER_VISIBLE; err = copy_to_user(arg, &flags, sizeof (flags)); return (err); @@ -795,7 +886,7 @@ __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva) xoptattr_t *xoap; if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | - ZFS_PROJINHERIT_FL)) + FS_PROJINHERIT_FL)) return (-EOPNOTSUPP); if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE) @@ -826,7 +917,51 @@ __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva) xoap->xoa_appendonly); FLAG_CHANGE(FS_NODUMP_FL, ZFS_NODUMP, XAT_NODUMP, xoap->xoa_nodump); - FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT, + FLAG_CHANGE(FS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT, + xoap->xoa_projinherit); + +#undef FLAG_CHANGE + + return (0); +} + +static int +__zpl_ioctl_setxflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva) +{ + uint64_t zfs_flags = ITOZ(ip)->z_pflags; + xoptattr_t *xoap; + + if (ioctl_flags & ~(FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND | + FS_XFLAG_NODUMP | FS_XFLAG_PROJINHERIT)) + return (-EOPNOTSUPP); + + if ((fchange(ioctl_flags, zfs_flags, FS_XFLAG_IMMUTABLE, + ZFS_IMMUTABLE) || + fchange(ioctl_flags, zfs_flags, FS_XFLAG_APPEND, ZFS_APPENDONLY)) && + !capable(CAP_LINUX_IMMUTABLE)) + return (-EPERM); + + if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip)) + return (-EACCES); + + xva_init(xva); + xoap = xva_getxoptattr(xva); + +#define FLAG_CHANGE(iflag, zflag, xflag, xfield) do { \ + if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) || \ + ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) { \ + XVA_SET_REQ(xva, (xflag)); \ + (xfield) = ((ioctl_flags & (iflag)) != 0); \ + } \ +} while (0) + + FLAG_CHANGE(FS_XFLAG_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, + xoap->xoa_immutable); + FLAG_CHANGE(FS_XFLAG_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, + xoap->xoa_appendonly); + FLAG_CHANGE(FS_XFLAG_NODUMP, ZFS_NODUMP, XAT_NODUMP, + xoap->xoa_nodump); + FLAG_CHANGE(FS_XFLAG_PROJINHERIT, ZFS_PROJINHERIT, XAT_PROJINHERIT, xoap->xoa_projinherit); #undef FLAG_CHANGE @@ -867,7 +1002,7 @@ zpl_ioctl_getxattr(struct file *filp, void __user *arg) struct inode *ip = file_inode(filp); int err; - fsx.fsx_xflags = __zpl_ioctl_getflags(ip); + fsx.fsx_xflags = __zpl_ioctl_getxflags(ip); fsx.fsx_projid = ITOZ(ip)->z_projid; err = copy_to_user(arg, &fsx, sizeof (fsx)); @@ -891,7 +1026,7 @@ zpl_ioctl_setxattr(struct file *filp, void __user *arg) if (!zpl_is_valid_projid(fsx.fsx_projid)) return (-EINVAL); - err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva); + err = __zpl_ioctl_setxflags(ip, fsx.fsx_xflags, &xva); if (err) return (err); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index 53819628627d..347b352506e5 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c @@ -22,6 +22,8 @@ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2023, Datto Inc. All rights reserved. + * Copyright (c) 2025, Klara, Inc. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> */ @@ -32,7 +34,22 @@ #include <sys/zpl.h> #include <linux/iversion.h> #include <linux/version.h> +#include <linux/vfs_compat.h> +/* + * What to do when the last reference to an inode is released. If 0, the kernel + * will cache it on the superblock. If 1, the inode will be freed immediately. + * See zpl_drop_inode(). + */ +int zfs_delete_inode = 0; + +/* + * What to do when the last reference to a dentry is released. If 0, the kernel + * will cache it until the entry (file) is destroyed. If 1, the dentry will be + * marked for cleanup, at which time its inode reference will be released. See + * zpl_dentry_delete(). + */ +int zfs_delete_dentry = 0; static struct inode * zpl_inode_alloc(struct super_block *sb) @@ -77,11 +94,36 @@ zpl_dirty_inode(struct inode *ip, int flags) } /* - * When ->drop_inode() is called its return value indicates if the - * inode should be evicted from the inode cache. If the inode is - * unhashed and has no links the default policy is to evict it - * immediately. + * ->drop_inode() is called when the last reference to an inode is released. + * Its return value indicates if the inode should be destroyed immediately, or + * cached on the superblock structure. + * + * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns + * "destroy immediately" if the inode is unhashed and has no links (roughly: no + * longer exists on disk). On datasets with millions of rarely-accessed files, + * this can cause a large amount of memory to be "pinned" by cached inodes, + * which in turn pin their associated dnodes and dbufs, until the kernel starts + * reporting memory pressure and requests OpenZFS release some memory (see + * zfs_prune()). + * + * When set to 1, we call generic_delete_inode(), which always returns "destroy + * immediately", resulting in inodes being destroyed immediately, releasing + * their associated dnodes and dbufs to the dbuf cached and the ARC to be + * evicted as normal. * + * Note that the "last reference" doesn't always mean the last _userspace_ + * reference; the dentry cache also holds a reference, so "busy" inodes will + * still be kept alive that way (subject to dcache tuning). + */ +static int +zpl_drop_inode(struct inode *ip) +{ + if (zfs_delete_inode) + return (generic_delete_inode(ip)); + return (generic_drop_inode(ip)); +} + +/* * The ->evict_inode() callback must minimally truncate the inode pages, * and call clear_inode(). For 2.6.35 and later kernels this will * simply update the inode state, with the sync occurring before the @@ -470,6 +512,7 @@ const struct super_operations zpl_super_operations = { .destroy_inode = zpl_inode_destroy, .dirty_inode = zpl_dirty_inode, .write_inode = NULL, + .drop_inode = zpl_drop_inode, .evict_inode = zpl_evict_inode, .put_super = zpl_put_super, .sync_fs = zpl_sync_fs, @@ -480,6 +523,35 @@ const struct super_operations zpl_super_operations = { .show_stats = NULL, }; +/* + * ->d_delete() is called when the last reference to a dentry is released. Its + * return value indicates if the dentry should be destroyed immediately, or + * retained in the dentry cache. + * + * By default (zfs_delete_dentry=0) the kernel will always cache unused + * entries. Each dentry holds an inode reference, so cached dentries can hold + * the final inode reference indefinitely, leading to the inode and its related + * data being pinned (see zpl_drop_inode()). + * + * When set to 1, we signal that the dentry should be destroyed immediately and + * never cached. This reduces memory usage, at the cost of higher overheads to + * lookup a file, as the inode and its underlying data (dnode/dbuf) need to be + * reloaded and reinflated. + * + * Note that userspace does not have direct control over dentry references and + * reclaim; rather, this is part of the kernel's caching and reclaim subsystems + * (eg vm.vfs_cache_pressure). + */ +static int +zpl_dentry_delete(const struct dentry *dentry) +{ + return (zfs_delete_dentry ? 1 : 0); +} + +const struct dentry_operations zpl_dentry_operations = { + .d_delete = zpl_dentry_delete, +}; + struct file_system_type zpl_fs_type = { .owner = THIS_MODULE, .name = ZFS_DRIVER, @@ -491,3 +563,10 @@ struct file_system_type zpl_fs_type = { .mount = zpl_mount, .kill_sb = zpl_kill_sb, }; + +ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW, + "Delete inodes as soon as the last reference is released."); + +ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW, + "Delete dentries from dentry cache as soon as the last reference is " + "released."); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index a73acdad34ae..89f9bc555fcf 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -21,8 +21,8 @@ */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. - * Copyright (c) 2024, Rob Norris <robn@despairlabs.com> - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, 2025, Rob Norris <robn@despairlabs.com> + * Copyright (c) 2024, 2025, Klara, Inc. */ #include <sys/dataset_kstats.h> @@ -337,16 +337,14 @@ zvol_discard(zv_request_t *zvr) } /* - * Align the request to volume block boundaries when a secure erase is - * not required. This will prevent dnode_free_range() from zeroing out - * the unaligned parts which is slow (read-modify-write) and useless - * since we are not freeing any space by doing so. + * Align the request to volume block boundaries. This will prevent + * dnode_free_range() from zeroing out the unaligned parts which is + * slow (read-modify-write) and useless since we are not freeing any + * space by doing so. */ - if (!io_is_secure_erase(bio, rq)) { - start = P2ROUNDUP(start, zv->zv_volblocksize); - end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); - size = end - start; - } + start = P2ROUNDUP(start, zv->zv_volblocksize); + end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); + size = end - start; if (start >= end) goto unlock; @@ -467,6 +465,24 @@ zvol_read_task(void *arg) zv_request_task_free(task); } +/* + * Note: + * + * The kernel uses different enum names for the IO opcode, depending on the + * kernel version ('req_opf', 'req_op'). To sidestep this, use macros rather + * than inline functions for these checks. + */ +/* Should this IO go down the zvol write path? */ +#define ZVOL_OP_IS_WRITE(op) \ + (op == REQ_OP_WRITE || \ + op == REQ_OP_FLUSH || \ + op == REQ_OP_DISCARD) + +/* Is this IO type supported by zvols? */ +#define ZVOL_OP_IS_SUPPORTED(op) (op == REQ_OP_READ || ZVOL_OP_IS_WRITE(op)) + +/* Get the IO opcode */ +#define ZVOL_OP(bio, rq) (bio != NULL ? bio_op(bio) : req_op(rq)) /* * Process a BIO or request @@ -484,7 +500,33 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, fstrans_cookie_t cookie = spl_fstrans_mark(); uint64_t offset = io_offset(bio, rq); uint64_t size = io_size(bio, rq); - int rw = io_data_dir(bio, rq); + int rw; + + if (unlikely(!ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)))) { + zfs_dbgmsg("Unsupported zvol %s, op=%d, flags=0x%x", + rq != NULL ? "request" : "BIO", + ZVOL_OP(bio, rq), + rq != NULL ? rq->cmd_flags : bio->bi_opf); + ASSERT(ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq))); + zvol_end_io(bio, rq, SET_ERROR(ENOTSUPP)); + goto out; + } + + if (ZVOL_OP_IS_WRITE(ZVOL_OP(bio, rq))) { + rw = WRITE; + } else { + rw = READ; + } + + /* + * Sanity check + * + * If we're a BIO, check our rw matches the kernel's + * bio_data_dir(bio) rw. We need to check because we support fewer + * IO operations, and want to verify that what we think are reads and + * writes from those operations match what the kernel thinks. + */ + ASSERT(rq != NULL || rw == bio_data_dir(bio)); if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { zvol_end_io(bio, rq, SET_ERROR(ENXIO)); @@ -589,7 +631,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, * interfaces lack this functionality (they block waiting for * the i/o to complete). */ - if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { + if (io_is_discard(bio, rq)) { if (force_sync) { zvol_discard(&zvr); } else { @@ -679,28 +721,19 @@ zvol_open(struct block_device *bdev, fmode_t flag) retry: #endif - rw_enter(&zvol_state_lock, RW_READER); - /* - * Obtain a copy of private_data under the zvol_state_lock to make - * sure that either the result of zvol free code path setting - * disk->private_data to NULL is observed, or zvol_os_free() - * is not called on this zv because of the positive zv_open_count. - */ + #ifdef HAVE_BLK_MODE_T - zv = disk->private_data; + zv = atomic_load_ptr(&disk->private_data); #else - zv = bdev->bd_disk->private_data; + zv = atomic_load_ptr(&bdev->bd_disk->private_data); #endif if (zv == NULL) { - rw_exit(&zvol_state_lock); return (-SET_ERROR(ENXIO)); } mutex_enter(&zv->zv_state_lock); - if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { mutex_exit(&zv->zv_state_lock); - rw_exit(&zvol_state_lock); return (-SET_ERROR(ENXIO)); } @@ -712,8 +745,28 @@ retry: if (zv->zv_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); + + /* + * Removal may happen while the locks are down, so + * we can't trust zv any longer; we have to start over. + */ +#ifdef HAVE_BLK_MODE_T + zv = atomic_load_ptr(&disk->private_data); +#else + zv = atomic_load_ptr(&bdev->bd_disk->private_data); +#endif + if (zv == NULL) + return (-SET_ERROR(ENXIO)); + rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); + + if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + return (-SET_ERROR(ENXIO)); + } + /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); @@ -724,7 +777,6 @@ retry: drop_suspend = B_TRUE; } } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -757,8 +809,8 @@ retry: * the kernel so the only option is to return the error for * the caller to handle it. */ - if (!mutex_owned(&spa_namespace_lock)) { - if (!mutex_tryenter(&spa_namespace_lock)) { + if (!spa_namespace_held()) { + if (!spa_namespace_tryenter(FTAG)) { mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; @@ -782,7 +834,7 @@ retry: error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); if (drop_namespace) - mutex_exit(&spa_namespace_lock); + spa_namespace_exit(FTAG); } if (error == 0) { @@ -821,11 +873,11 @@ zvol_release(struct gendisk *disk, fmode_t unused) #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) (void) unused; #endif - zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; - rw_enter(&zvol_state_lock, RW_READER); - zv = disk->private_data; + zvol_state_t *zv = atomic_load_ptr(&disk->private_data); + if (zv == NULL) + return; mutex_enter(&zv->zv_state_lock); ASSERT3U(zv->zv_open_count, >, 0); @@ -839,6 +891,15 @@ zvol_release(struct gendisk *disk, fmode_t unused) mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); + + /* + * Unlike in zvol_open(), we don't check if removal + * started here, because we might be one of the openers + * that needs to be thrown out! If we're the last, we + * need to call zvol_last_close() below to finish + * cleanup. So, no special treatment for us. + */ + /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); @@ -848,7 +909,6 @@ zvol_release(struct gendisk *disk, fmode_t unused) } else { drop_suspend = B_FALSE; } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -868,9 +928,10 @@ static int zvol_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - zvol_state_t *zv = bdev->bd_disk->private_data; int error = 0; + zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data); + ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); switch (cmd) { @@ -923,9 +984,8 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing) { unsigned int mask = 0; - rw_enter(&zvol_state_lock, RW_READER); + zvol_state_t *zv = atomic_load_ptr(&disk->private_data); - zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; @@ -933,17 +993,14 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing) mutex_exit(&zv->zv_state_lock); } - rw_exit(&zvol_state_lock); - return (mask); } static int zvol_revalidate_disk(struct gendisk *disk) { - rw_enter(&zvol_state_lock, RW_READER); + zvol_state_t *zv = atomic_load_ptr(&disk->private_data); - zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); set_capacity(zv->zv_zso->zvo_disk, @@ -951,8 +1008,6 @@ zvol_revalidate_disk(struct gendisk *disk) mutex_exit(&zv->zv_state_lock); } - rw_exit(&zvol_state_lock); - return (0); } @@ -971,28 +1026,19 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) return (0); } -void -zvol_os_clear_private(zvol_state_t *zv) -{ - /* - * Cleared while holding zvol_state_lock as a writer - * which will prevent zvol_open() from opening it. - */ - zv->zv_zso->zvo_disk->private_data = NULL; -} - /* * Provide a simple virtual geometry for legacy compatibility. For devices * smaller than 1 MiB a small head and sector count is used to allow very * tiny devices. For devices over 1 Mib a standard head and sector count * is used to keep the cylinders count reasonable. */ -static int -zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static inline int +zvol_getgeo_impl(struct gendisk *disk, struct hd_geometry *geo) { - zvol_state_t *zv = bdev->bd_disk->private_data; + zvol_state_t *zv = atomic_load_ptr(&disk->private_data); sector_t sectors; + ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); sectors = get_capacity(zv->zv_zso->zvo_disk); @@ -1011,6 +1057,20 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) return (0); } +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK +static int +zvol_getgeo(struct gendisk *disk, struct hd_geometry *geo) +{ + return (zvol_getgeo_impl(disk, geo)); +} +#else +static int +zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + return (zvol_getgeo_impl(bdev->bd_disk, geo)); +} +#endif + /* * Why have two separate block_device_operations structs? * @@ -1417,53 +1477,70 @@ out_kmem: return (ret); } -/* - * Cleanup then free a zvol_state_t which was created by zvol_alloc(). - * At this time, the structure is not opened by anyone, is taken off - * the zvol_state_list, and has its private data set to NULL. - * The zvol_state_lock is dropped. - * - * This function may take many milliseconds to complete (e.g. we've seen - * it take over 256ms), due to the calls to "blk_cleanup_queue" and - * "del_gendisk". Thus, consumers need to be careful to account for this - * latency when calling this function. - */ void -zvol_os_free(zvol_state_t *zv) +zvol_os_remove_minor(zvol_state_t *zv) { - - ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); - ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT0(zv->zv_open_count); - ASSERT0P(zv->zv_zso->zvo_disk->private_data); + ASSERT0(atomic_read(&zv->zv_suspend_ref)); + ASSERT(zv->zv_flags & ZVOL_REMOVING); - rw_destroy(&zv->zv_suspend_lock); - zfs_rangelock_fini(&zv->zv_rangelock); + struct zvol_state_os *zso = zv->zv_zso; + zv->zv_zso = NULL; - del_gendisk(zv->zv_zso->zvo_disk); + /* Clearing private_data will make new callers return immediately. */ + atomic_store_ptr(&zso->zvo_disk->private_data, NULL); + + /* + * Drop the state lock before calling del_gendisk(). There may be + * callers waiting to acquire it, but del_gendisk() will block until + * they exit, which would deadlock. + */ + mutex_exit(&zv->zv_state_lock); + + del_gendisk(zso->zvo_disk); #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) #if defined(HAVE_BLK_CLEANUP_DISK) - blk_cleanup_disk(zv->zv_zso->zvo_disk); + blk_cleanup_disk(zso->zvo_disk); #else - put_disk(zv->zv_zso->zvo_disk); + put_disk(zso->zvo_disk); #endif #else - blk_cleanup_queue(zv->zv_zso->zvo_queue); - put_disk(zv->zv_zso->zvo_disk); + blk_cleanup_queue(zso->zvo_queue); + put_disk(zso->zvo_disk); #endif - if (zv->zv_zso->use_blk_mq) - blk_mq_free_tag_set(&zv->zv_zso->tag_set); + if (zso->use_blk_mq) + blk_mq_free_tag_set(&zso->tag_set); + + ida_free(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS); + + kmem_free(zso, sizeof (struct zvol_state_os)); + + mutex_enter(&zv->zv_state_lock); +} + +void +zvol_os_free(zvol_state_t *zv) +{ + + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT0(zv->zv_open_count); + ASSERT0P(zv->zv_zso); + + ASSERT0P(zv->zv_objset); + ASSERT0P(zv->zv_zilog); + ASSERT0P(zv->zv_dn); - ida_simple_remove(&zvol_ida, - MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); + rw_destroy(&zv->zv_suspend_lock); + zfs_rangelock_fini(&zv->zv_rangelock); cv_destroy(&zv->zv_removing_cv); mutex_destroy(&zv->zv_state_lock); dataset_kstats_destroy(&zv->zv_kstat); - kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); } @@ -1592,7 +1669,7 @@ zvol_os_create_minor(const char *name) if (zvol_inhibit_dev) return (0); - idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); + idx = ida_alloc(&zvol_ida, kmem_flags_convert(KM_SLEEP)); if (idx < 0) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; @@ -1600,7 +1677,7 @@ zvol_os_create_minor(const char *name) /* too many partitions can cause an overflow */ zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", name, minor, MINOR(minor)); - ida_simple_remove(&zvol_ida, idx); + ida_free(&zvol_ida, idx); return (SET_ERROR(EINVAL)); } @@ -1608,7 +1685,7 @@ zvol_os_create_minor(const char *name) if (zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock)); mutex_exit(&zv->zv_state_lock); - ida_simple_remove(&zvol_ida, idx); + ida_free(&zvol_ida, idx); return (SET_ERROR(EEXIST)); } @@ -1708,7 +1785,7 @@ out_doi: rw_exit(&zvol_state_lock); error = zvol_os_add_disk(zv->zv_zso->zvo_disk); } else { - ida_simple_remove(&zvol_ida, idx); + ida_free(&zvol_ida, idx); } return (error); |
