Diffstat (limited to 'sys/contrib/openzfs/module/os')
19 files changed, 712 insertions, 261 deletions
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
index f9125a067cd1..3f360d167b17 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
@@ -101,6 +101,15 @@ spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
     va_end(ap);
 }
 
+/*
+ * Check if the current thread is a memory reclaim thread.
+ * Returns true if curproc is pageproc (FreeBSD's page daemon).
+ */
+int
+current_is_reclaim_thread(void)
+{
+    return (curproc == pageproc);
+}
+
 SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY,
     opensolaris_utsname_init, NULL);
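For context, a minimal sketch of how a consumer might use current_is_reclaim_thread() (its Linux counterpart, built on current_is_kswapd(), appears in spl-thread.c below). The caller here is hypothetical and not part of this change; only the predicate itself is:

/*
 * Hypothetical caller (illustration only): avoid a sleeping allocation
 * when the page daemon itself is calling into ZFS, since blocking here
 * could stall reclaim.
 */
static void *
example_alloc_for_reclaim(size_t len)
{
    int flags = current_is_reclaim_thread() ? KM_NOSLEEP : KM_SLEEP;
    return (kmem_alloc(len, flags));
}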
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
index 733c2bd07ebb..9d5f025423a1 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
@@ -43,6 +43,7 @@
 const int zfs_vm_pagerret_bad = VM_PAGER_BAD;
 const int zfs_vm_pagerret_error = VM_PAGER_ERROR;
 const int zfs_vm_pagerret_ok = VM_PAGER_OK;
+const int zfs_vm_pagerret_pend = VM_PAGER_PEND;
 const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC;
 const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL;
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
index 334264f6da2f..5c5adc6cc12b 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
@@ -2357,10 +2357,42 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr,
      * In FreeBSD, we don't care about permissions of individual ADS.
      * Note that not checking them is not just an optimization - without
      * this shortcut, EA operations may bogusly fail with EACCES.
+     *
+     * If this is a named attribute lookup, do the checks.
      */
+#if __FreeBSD_version >= 1500040
+    if ((zp->z_pflags & ZFS_XATTR) && (flags & V_NAMEDATTR) == 0)
+#else
     if (zp->z_pflags & ZFS_XATTR)
+#endif
         return (0);
 
+    /*
+     * If this is a named attribute directory, validate against the
+     * base file.
+     */
+    if (is_attr) {
+        if ((error = zfs_zget(ZTOZSB(zp),
+            zp->z_xattr_parent, &xzp)) != 0) {
+            return (error);
+        }
+
+        check_zp = xzp;
+
+        /*
+         * Fix up mode to map to xattr perms.
+         */
+        if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+            mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+            mode |= ACE_WRITE_NAMED_ATTRS;
+        }
+
+        if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+            mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+            mode |= ACE_READ_NAMED_ATTRS;
+        }
+    }
+
     owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
 
     /*
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
index 493ac9f69ad4..0456552ed07e 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
@@ -1209,6 +1209,8 @@ zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
     zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 }
 
+extern int zfs_xattr_compat;
+
 static int
 zfs_domount(vfs_t *vfsp, char *osname)
 {
@@ -1289,6 +1291,16 @@ zfs_domount(vfs_t *vfsp, char *osname)
         goto out;
     }
 
+#if __FreeBSD_version >= 1500040
+    /*
+     * Named attributes can only work if the xattr property is set to
+     * on/dir and not sa.  Also, zfs_xattr_compat must be set.
+     */
+    if ((zfsvfs->z_flags & ZSB_XATTR) != 0 && !zfsvfs->z_xattr_sa &&
+        zfs_xattr_compat)
+        vfsp->mnt_flag |= MNT_NAMEDATTR;
+#endif
+
     vfs_mountedfrom(vfsp, osname);
 
     if (!zfsvfs->z_issnap)
@@ -1812,6 +1824,14 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
         err = vn_lock(*vpp, flags);
         if (err != 0)
             vrele(*vpp);
+#if __FreeBSD_version >= 1500040
+        else if ((zp->z_pflags & ZFS_XATTR) != 0) {
+            if ((*vpp)->v_type == VDIR)
+                vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
+            else
+                vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR);
+        }
+#endif
     }
     if (err != 0)
         *vpp = NULL;
@@ -1964,9 +1984,17 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
     *vpp = ZTOV(zp);
     zfs_exit(zfsvfs, FTAG);
     err = vn_lock(*vpp, flags);
-    if (err == 0)
+    if (err == 0) {
         vnode_create_vobject(*vpp, zp->z_size, curthread);
-    else
+#if __FreeBSD_version >= 1500040
+        if ((zp->z_pflags & ZFS_XATTR) != 0) {
+            if ((*vpp)->v_type == VDIR)
+                vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
+            else
+                vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR);
+        }
+#endif
+    } else
         *vpp = NULL;
     return (err);
 }
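Once a dataset is mounted with MNT_NAMEDATTR set, userspace can probe for the feature; a small sketch using pathconf(2), assuming the _PC_NAMEDATTR_ENABLED and _PC_HAS_NAMEDATTR names added to zfs_freebsd_pathconf() later in this diff (FreeBSD 15 only):

#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
    const char *path = argc > 1 ? argv[1] : ".";

    /* 1 if the mount supports named attributes, 0 otherwise. */
    long enabled = pathconf(path, _PC_NAMEDATTR_ENABLED);
    /* 1 if this particular file already has named attributes. */
    long present = pathconf(path, _PC_HAS_NAMEDATTR);

    printf("named attrs enabled: %ld, present on file: %ld\n",
        enabled, present);
    return (0);
}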
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
index 8a5006c488f3..c4270d8b5d5c 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -25,6 +25,7 @@
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2025, Klara, Inc.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
@@ -114,6 +115,8 @@ typedef uint64_t cookie_t;
 typedef ulong_t cookie_t;
 #endif
 
+static int zfs_check_attrname(const char *name);
+
 /*
  * Programming rules.
  *
@@ -813,7 +816,12 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp,
         /*
          * Do we have permission to get into attribute directory?
          */
-        error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr, NULL);
+        if (flags & LOOKUP_NAMED_ATTR)
+            error = zfs_zaccess(zp, ACE_EXECUTE, V_NAMEDATTR,
+                B_FALSE, cr, NULL);
+        else
+            error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr,
+                NULL);
         if (error) {
             vrele(ZTOV(zp));
         }
@@ -4299,6 +4307,33 @@ zfs_freebsd_getpages(struct vop_getpages_args *ap)
         ap->a_rahead));
 }
 
+typedef struct {
+    uint_t      pca_npages;
+    vm_page_t   pca_pages[];
+} putpage_commit_arg_t;
+
+static void
+zfs_putpage_commit_cb(void *arg)
+{
+    putpage_commit_arg_t *pca = arg;
+    vm_object_t object = pca->pca_pages[0]->object;
+
+    zfs_vmobject_wlock(object);
+
+    for (uint_t i = 0; i < pca->pca_npages; i++) {
+        vm_page_t pp = pca->pca_pages[i];
+        vm_page_undirty(pp);
+        vm_page_sunbusy(pp);
+    }
+
+    vm_object_pip_wakeupn(object, pca->pca_npages);
+
+    zfs_vmobject_wunlock(object);
+
+    kmem_free(pca,
+        offsetof(putpage_commit_arg_t, pca_pages[pca->pca_npages]));
+}
+
 static int
 zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
     int *rtvals)
@@ -4400,10 +4435,12 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
     }
 
     if (zp->z_blksz < PAGE_SIZE) {
-        for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
-            tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
+        vm_ooffset_t woff = off;
+        size_t wlen = len;
+        for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) {
+            tocopy = MIN(PAGE_SIZE, wlen);
             va = zfs_map_page(ma[i], &sf);
-            dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
+            dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va, tx);
             zfs_unmap_page(sf);
         }
     } else {
@@ -4424,19 +4461,48 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
         zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
         err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
         ASSERT0(err);
-        /*
-         * XXX we should be passing a callback to undirty
-         * but that would make the locking messier
-         */
-        zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
-            len, commit, B_FALSE, NULL, NULL);
 
-        zfs_vmobject_wlock(object);
-        for (i = 0; i < ncount; i++) {
-            rtvals[i] = zfs_vm_pagerret_ok;
-            vm_page_undirty(ma[i]);
+        if (commit) {
+            /*
+             * Caller requested that we commit immediately. We set
+             * a callback on the log entry, to be called once it's
+             * on disk after the call to zil_commit() below. The
+             * pages will be undirtied and unbusied there.
+             */
+            putpage_commit_arg_t *pca = kmem_alloc(
+                offsetof(putpage_commit_arg_t, pca_pages[ncount]),
+                KM_SLEEP);
+            pca->pca_npages = ncount;
+            memcpy(pca->pca_pages, ma, sizeof (vm_page_t) * ncount);
+
+            zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
+                B_TRUE, B_FALSE, zfs_putpage_commit_cb, pca);
+
+            for (i = 0; i < ncount; i++)
+                rtvals[i] = zfs_vm_pagerret_pend;
+        } else {
+            /*
+             * Caller just wants the page written back somewhere,
+             * but doesn't need it committed yet. We've already
+             * written it back to the DMU, so we just need to put
+             * it on the async log, then undirty the page and
+             * return.
+             *
+             * We cannot use a callback here, because it would keep
+             * the page busy (locked) until it is eventually
+             * written down at txg sync.
+             */
+            zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
+                B_FALSE, B_FALSE, NULL, NULL);
+
+            zfs_vmobject_wlock(object);
+            for (i = 0; i < ncount; i++) {
+                rtvals[i] = zfs_vm_pagerret_ok;
+                vm_page_undirty(ma[i]);
+            }
+            zfs_vmobject_wunlock(object);
         }
-        zfs_vmobject_wunlock(object);
+
         VM_CNT_INC(v_vnodeout);
         VM_CNT_ADD(v_vnodepgsout, ncount);
     }
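putpage_commit_arg_t above sizes a header plus flexible array member with one allocation, using the offsetof idiom. A standalone userspace illustration of the same pattern (the names are invented for the example; offsetof with a runtime index is a GCC/Clang-supported usage, the same one the kernel code relies on):

#include <stddef.h>
#include <stdlib.h>
#include <string.h>

typedef struct {
    unsigned npages;
    void *pages[];      /* flexible array member */
} commit_arg_t;

static commit_arg_t *
commit_arg_alloc(void **pages, unsigned npages)
{
    /* One allocation covers the header and npages pointers. */
    commit_arg_t *ca = malloc(offsetof(commit_arg_t, pages[npages]));
    if (ca == NULL)
        return (NULL);
    ca->npages = npages;
    memcpy(ca->pages, pages, sizeof (void *) * npages);
    return (ca);
}

The matching free uses the same offsetof expression, which is exactly how zfs_putpage_commit_cb() releases the argument once the log entry is on disk.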
@@ -4707,8 +4773,16 @@ zfs_freebsd_access(struct vop_access_args *ap)
      * ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND,
      */
     accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
-    if (accmode != 0)
-        error = zfs_access(zp, accmode, 0, ap->a_cred);
+    if (accmode != 0) {
+#if __FreeBSD_version >= 1500040
+        /* For named attributes, do the checks. */
+        if ((vn_irflag_read(vp) & VIRF_NAMEDATTR) != 0)
+            error = zfs_access(zp, accmode, V_NAMEDATTR,
+                ap->a_cred);
+        else
+#endif
+            error = zfs_access(zp, accmode, 0, ap->a_cred);
+    }
 
     /*
      * VADMIN has to be handled by vaccess().
@@ -4741,6 +4815,190 @@ struct vop_lookup_args {
 };
 #endif
 
+#if __FreeBSD_version >= 1500040
+static int
+zfs_lookup_nameddir(struct vnode *dvp, struct componentname *cnp,
+    struct vnode **vpp)
+{
+    struct vnode *xvp;
+    int error, flags;
+
+    *vpp = NULL;
+    flags = LOOKUP_XATTR | LOOKUP_NAMED_ATTR;
+    if ((cnp->cn_flags & CREATENAMED) != 0)
+        flags |= CREATE_XATTR_DIR;
+    error = zfs_lookup(dvp, NULL, &xvp, NULL, 0, cnp->cn_cred, flags,
+        B_FALSE);
+    if (error == 0) {
+        if ((cnp->cn_flags & LOCKLEAF) != 0)
+            error = vn_lock(xvp, cnp->cn_lkflags);
+        if (error == 0) {
+            vn_irflag_set_cond(xvp, VIRF_NAMEDDIR);
+            *vpp = xvp;
+        } else {
+            vrele(xvp);
+        }
+    }
+    return (error);
+}
+
+static ssize_t
+zfs_readdir_named(struct vnode *vp, char *buf, ssize_t blen, off_t *offp,
+    int *eofflagp, struct ucred *cred, struct thread *td)
+{
+    struct uio io;
+    struct iovec iv;
+    zfs_uio_t uio;
+    int error;
+
+    io.uio_offset = *offp;
+    io.uio_segflg = UIO_SYSSPACE;
+    io.uio_rw = UIO_READ;
+    io.uio_td = td;
+    iv.iov_base = buf;
+    iv.iov_len = blen;
+    io.uio_iov = &iv;
+    io.uio_iovcnt = 1;
+    io.uio_resid = blen;
+    zfs_uio_init(&uio, &io);
+    error = zfs_readdir(vp, &uio, cred, eofflagp, NULL, NULL);
+    if (error != 0)
+        return (-1);
+    *offp = io.uio_offset;
+    return (blen - io.uio_resid);
+}
+
+static bool
+zfs_has_namedattr(struct vnode *vp, struct ucred *cred)
+{
+    struct componentname cn;
+    struct vnode *xvp;
+    struct dirent *dp;
+    off_t offs;
+    ssize_t rsize;
+    char *buf, *cp, *endcp;
+    int eofflag, error;
+    bool ret;
+
+    MNT_ILOCK(vp->v_mount);
+    if ((vp->v_mount->mnt_flag & MNT_NAMEDATTR) == 0) {
+        MNT_IUNLOCK(vp->v_mount);
+        return (false);
+    }
+    MNT_IUNLOCK(vp->v_mount);
+
+    /* Now see if a named attribute directory exists. */
+    cn.cn_flags = LOCKLEAF;
+    cn.cn_lkflags = LK_SHARED;
+    cn.cn_cred = cred;
+    error = zfs_lookup_nameddir(vp, &cn, &xvp);
+    if (error != 0)
+        return (false);
+
+    /* It exists, so see if there is any entry other than "." and "..". */
+    buf = malloc(DEV_BSIZE, M_TEMP, M_WAITOK);
+    ret = false;
+    offs = 0;
+    do {
+        rsize = zfs_readdir_named(xvp, buf, DEV_BSIZE, &offs, &eofflag,
+            cred, curthread);
+        if (rsize <= 0)
+            break;
+        cp = buf;
+        endcp = &buf[rsize];
+        while (cp < endcp) {
+            dp = (struct dirent *)cp;
+            if (dp->d_fileno != 0 && (dp->d_type == DT_REG ||
+                dp->d_type == DT_UNKNOWN) &&
+                !ZFS_XA_NS_PREFIX_FORBIDDEN(dp->d_name) &&
+                ((dp->d_namlen == 1 && dp->d_name[0] != '.') ||
+                (dp->d_namlen == 2 && (dp->d_name[0] != '.' ||
+                dp->d_name[1] != '.')) || dp->d_namlen > 2)) {
+                ret = true;
+                break;
+            }
+            cp += dp->d_reclen;
+        }
+    } while (!ret && rsize > 0 && eofflag == 0);
+    vput(xvp);
+    free(buf, M_TEMP);
+    return (ret);
+}
+
+static int
+zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
+{
+    struct componentname *cnp = ap->a_cnp;
+    char nm[NAME_MAX + 1];
+    int error;
+    struct vnode **vpp = ap->a_vpp, *dvp = ap->a_dvp, *xvp;
+    bool is_nameddir, needs_nameddir, opennamed = false;
+
+    /*
+     * These variables are used to handle the named attribute cases:
+     * opennamed - Is true when this is a call from open with O_NAMEDATTR
+     *     specified and it is the last component.
+     * is_nameddir - Is true when the directory is a named attribute dir.
+     * needs_nameddir - Is set when the lookup needs to look for/create
+     *     a named attribute directory.  It is only set when is_nameddir
+     *     is false and opennamed is true.
+     * xvp - Is the directory that the lookup needs to be done in.
+     *     Usually dvp, unless needs_nameddir is true where it is the
+     *     result of the first non-named directory lookup.
+     * Note that name caching must be disabled for named attribute
+     * handling.
+     */
+    needs_nameddir = false;
+    xvp = dvp;
+    opennamed = (cnp->cn_flags & (OPENNAMED | ISLASTCN)) ==
+        (OPENNAMED | ISLASTCN);
+    is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0;
+    if (is_nameddir && (cnp->cn_flags & ISLASTCN) == 0)
+        return (ENOATTR);
+    if (opennamed && !is_nameddir && (cnp->cn_flags & ISDOTDOT) != 0)
+        return (ENOATTR);
+    if (opennamed || is_nameddir)
+        cnp->cn_flags &= ~MAKEENTRY;
+    if (opennamed && !is_nameddir)
+        needs_nameddir = true;
+    ASSERT3U(cnp->cn_namelen, <, sizeof (nm));
+    error = 0;
+    *vpp = NULL;
+    if (needs_nameddir) {
+        if (VOP_ISLOCKED(dvp) != LK_EXCLUSIVE)
+            vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+        error = zfs_lookup_nameddir(dvp, cnp, &xvp);
+        if (error == 0)
+            is_nameddir = true;
+    }
+    if (error == 0) {
+        if (!needs_nameddir || cnp->cn_namelen != 1 ||
+            *cnp->cn_nameptr != '.') {
+            strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1,
+                sizeof (nm)));
+            error = zfs_lookup(xvp, nm, vpp, cnp, cnp->cn_nameiop,
+                cnp->cn_cred, 0, cached);
+            if (is_nameddir && error == 0 &&
+                (cnp->cn_namelen != 1 || *cnp->cn_nameptr != '.') &&
+                (cnp->cn_flags & ISDOTDOT) == 0) {
+                if ((*vpp)->v_type == VDIR)
+                    vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
+                else
+                    vn_irflag_set_cond(*vpp,
+                        VIRF_NAMEDATTR);
+            }
+            if (needs_nameddir && xvp != *vpp)
+                vput(xvp);
+        } else {
+            /*
+             * Lookup of "." when a named attribute dir is needed.
+             */
+            *vpp = xvp;
+        }
+    }
+    return (error);
+}
+#else
 static int
 zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
 {
@@ -4753,6 +5011,7 @@ zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
     return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
         cnp->cn_cred, 0, cached));
 }
+#endif
 
 static int
 zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
@@ -4775,7 +5034,11 @@ zfs_cache_lookup(struct vop_lookup_args *ap)
     zfsvfs_t *zfsvfs;
 
     zfsvfs = ap->a_dvp->v_mount->mnt_data;
+#if __FreeBSD_version >= 1500040
+    if (zfsvfs->z_use_namecache && (ap->a_cnp->cn_flags & OPENNAMED) == 0)
+#else
     if (zfsvfs->z_use_namecache)
+#endif
         return (vfs_cache_lookup(ap));
     else
         return (zfs_freebsd_lookup(ap, B_FALSE));
@@ -4798,6 +5061,11 @@ zfs_freebsd_create(struct vop_create_args *ap)
     vattr_t *vap = ap->a_vap;
     znode_t *zp = NULL;
     int rc, mode;
+    struct vnode *dvp = ap->a_dvp;
+#if __FreeBSD_version >= 1500040
+    struct vnode *xvp;
+    bool is_nameddir;
+#endif
 
 #if __FreeBSD_version < 1400068
     ASSERT(cnp->cn_flags & SAVENAME);
@@ -4808,10 +5076,36 @@ zfs_freebsd_create(struct vop_create_args *ap)
     zfsvfs = ap->a_dvp->v_mount->mnt_data;
     *ap->a_vpp = NULL;
 
-    rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, 0, mode,
-        &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL);
+    rc = 0;
+#if __FreeBSD_version >= 1500040
+    xvp = NULL;
+    is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0;
+    if (!is_nameddir && (cnp->cn_flags & OPENNAMED) != 0) {
+        /* Needs a named attribute directory. */
+        rc = zfs_lookup_nameddir(dvp, cnp, &xvp);
+        if (rc == 0) {
+            dvp = xvp;
+            is_nameddir = true;
+        }
+    }
+    if (is_nameddir && rc == 0)
+        rc = zfs_check_attrname(cnp->cn_nameptr);
+#endif
+    if (rc == 0)
+        rc = zfs_create(VTOZ(dvp), cnp->cn_nameptr, vap, 0, mode,
+            &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL);
+#if __FreeBSD_version >= 1500040
+    if (xvp != NULL)
+        vput(xvp);
+#endif
 
-    if (rc == 0)
+    if (rc == 0) {
         *ap->a_vpp = ZTOV(zp);
+#if __FreeBSD_version >= 1500040
+        if (is_nameddir)
+            vn_irflag_set_cond(*ap->a_vpp, VIRF_NAMEDATTR);
+#endif
+    }
     if (zfsvfs->z_use_namecache && rc == 0 &&
         (cnp->cn_flags & MAKEENTRY) != 0)
         cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
@@ -4830,13 +5124,21 @@ struct vop_remove_args {
 static int
 zfs_freebsd_remove(struct vop_remove_args *ap)
 {
+    int error = 0;
+
 #if __FreeBSD_version < 1400068
     ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 #endif
 
-    return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
-        ap->a_cnp->cn_cred));
+#if __FreeBSD_version >= 1500040
+    if ((vn_irflag_read(ap->a_dvp) & VIRF_NAMEDDIR) != 0)
+        error = zfs_check_attrname(ap->a_cnp->cn_nameptr);
+#endif
+
+    if (error == 0)
+        error = zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
+            ap->a_cnp->cn_cred);
+    return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
@@ -4994,6 +5296,11 @@ zfs_freebsd_getattr(struct vop_getattr_args *ap)
 #undef FLAG_CHECK
     *vap = xvap.xva_vattr;
     vap->va_flags = fflags;
+
+#if __FreeBSD_version >= 1500040
+    if ((vn_irflag_read(ap->a_vp) & (VIRF_NAMEDDIR | VIRF_NAMEDATTR)) != 0)
+        vap->va_bsdflags |= SFBSD_NAMEDATTR;
+#endif
     return (0);
 }
 
@@ -5136,15 +5443,24 @@ zfs_freebsd_rename(struct vop_rename_args *ap)
     vnode_t *fvp = ap->a_fvp;
     vnode_t *tdvp = ap->a_tdvp;
     vnode_t *tvp = ap->a_tvp;
-    int error;
+    int error = 0;
 
 #if __FreeBSD_version < 1400068
     ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
     ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
 #endif
 
-    error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
-        ap->a_tcnp, ap->a_fcnp->cn_cred);
+#if __FreeBSD_version >= 1500040
+    if ((vn_irflag_read(fdvp) & VIRF_NAMEDDIR) != 0) {
+        error = zfs_check_attrname(ap->a_fcnp->cn_nameptr);
+        if (error == 0)
+            error = zfs_check_attrname(ap->a_tcnp->cn_nameptr);
+    }
+#endif
+
+    if (error == 0)
+        error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
+            ap->a_tcnp, ap->a_fcnp->cn_cred);
 
     vrele(fdvp);
     vrele(fvp);
@@ -5398,12 +5714,33 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
             return (0);
         }
         return (EINVAL);
+#if __FreeBSD_version >= 1500040
+    case _PC_NAMEDATTR_ENABLED:
+        MNT_ILOCK(ap->a_vp->v_mount);
+        if ((ap->a_vp->v_mount->mnt_flag & MNT_NAMEDATTR) != 0)
+            *ap->a_retval = 1;
+        else
+            *ap->a_retval = 0;
+        MNT_IUNLOCK(ap->a_vp->v_mount);
+        return (0);
+    case _PC_HAS_NAMEDATTR:
+        if (zfs_has_namedattr(ap->a_vp, curthread->td_ucred))
+            *ap->a_retval = 1;
+        else
+            *ap->a_retval = 0;
+        return (0);
+#endif
+#ifdef _PC_HAS_HIDDENSYSTEM
+    case _PC_HAS_HIDDENSYSTEM:
+        *ap->a_retval = 1;
+        return (0);
+#endif
     default:
         return (vop_stdpathconf(ap));
     }
 }
 
-static int zfs_xattr_compat = 1;
+int zfs_xattr_compat = 1;
 
 static int
 zfs_check_attrname(const char *name)
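Both the create and lookup paths above reject names that zfs_check_attrname() disallows, and zfs_has_namedattr() applies a similar filter when scanning the attribute directory. A standalone sketch of that kind of name filtering (the reserved-prefix string here is an invented example, not the module's actual ZFS_XA_NS_PREFIX_FORBIDDEN table):

#include <stdbool.h>
#include <string.h>

/*
 * Illustration only: a visible named attribute is neither "." nor
 * "..", and does not live in a reserved xattr namespace.
 */
static bool
is_visible_attrname(const char *name)
{
    if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
        return (false);
    /* Hypothetical reserved namespace prefix, for demonstration. */
    if (strncmp(name, "reserved:", 9) == 0)
        return (false);
    return (true);
}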
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
index 9bad1e13d7cc..775f54a65f7d 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
@@ -150,8 +150,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
     zp->z_xattr_cached = NULL;
     zp->z_xattr_parent = 0;
     zp->z_vnode = NULL;
-    zp->z_sync_writes_cnt = 0;
-    zp->z_async_writes_cnt = 0;
 
     return (0);
 }
@@ -172,9 +170,6 @@ zfs_znode_cache_destructor(void *buf, void *arg)
 
     ASSERT3P(zp->z_acl_cached, ==, NULL);
     ASSERT3P(zp->z_xattr_cached, ==, NULL);
-
-    ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
-    ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
 }
 
@@ -456,8 +451,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
     zp->z_blksz = blksz;
     zp->z_seq = 0x7A4653;
     zp->z_sync_cnt = 0;
-    zp->z_sync_writes_cnt = 0;
-    zp->z_async_writes_cnt = 0;
     atomic_store_ptr(&zp->z_cached_symlink, NULL);
 
     zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
index 212ef560db07..72a7c4ea082a 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
@@ -1248,9 +1248,11 @@ zvol_os_is_zvol(const char *device)
     return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
 }
 
-void
+int
 zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
 {
+    int error = 0;
+
     ASSERT(RW_LOCK_HELD(&zvol_state_lock));
     ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
@@ -1304,14 +1306,94 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
         args.mda_gid = GID_OPERATOR;
         args.mda_mode = 0640;
         args.mda_si_drv2 = zv;
-        if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
-            == 0) {
+        error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname);
+        if (error == 0) {
             dev->si_iosize_max = maxphys;
             zsd->zsd_cdev = dev;
         }
     }
     strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
     dataset_kstats_rename(&zv->zv_kstat, newname);
+
+    return (error);
+}
+
+/*
+ * Allocate memory for a new zvol_state_t and setup the required
+ * request queue and generic disk structures for the block device.
+ */
+static int
+zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize,
+    zvol_state_t **zvp)
+{
+    zvol_state_t *zv;
+    uint64_t volmode;
+    int error;
+
+    error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE),
+        &volmode, NULL);
+    if (error)
+        return (error);
+
+    if (volmode == ZFS_VOLMODE_DEFAULT)
+        volmode = zvol_volmode;
+
+    if (volmode == ZFS_VOLMODE_NONE)
+        return (0);
+
+    zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
+    mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
+    cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
+    zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
+    zv->zv_volmode = volmode;
+    zv->zv_volsize = volsize;
+    zv->zv_volblocksize = volblocksize;
+    if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+        struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+        struct g_provider *pp;
+        struct g_geom *gp;
+
+        g_topology_lock();
+        gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+        gp->start = zvol_geom_bio_start;
+        gp->access = zvol_geom_access;
+        pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
+        pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+        pp->sectorsize = DEV_BSIZE;
+        pp->mediasize = 0;
+        pp->private = zv;
+
+        zsg->zsg_provider = pp;
+    } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+        struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+        struct cdev *dev;
+        struct make_dev_args args;
+
+        make_dev_args_init(&args);
+        args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+        args.mda_devsw = &zvol_cdevsw;
+        args.mda_cr = NULL;
+        args.mda_uid = UID_ROOT;
+        args.mda_gid = GID_OPERATOR;
+        args.mda_mode = 0640;
+        args.mda_si_drv2 = zv;
+        error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
+        if (error) {
+            kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+            kmem_free(zv, sizeof (zvol_state_t));
+            return (error);
+        }
+
+        dev->si_iosize_max = maxphys;
+        zsd->zsd_cdev = dev;
+        knlist_init_sx(&zsd->zsd_selinfo.si_note, &zv->zv_state_lock);
+    }
+    (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
+    rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
+    zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
+
+    *zvp = zv;
+    return (error);
 }
 
 /*
@@ -1364,11 +1446,11 @@ zvol_os_free(zvol_state_t *zv)
 int
 zvol_os_create_minor(const char *name)
 {
-    zvol_state_t *zv;
+    zvol_state_t *zv = NULL;
     objset_t *os;
     dmu_object_info_t *doi;
     uint64_t volsize;
-    uint64_t volmode, hash, len;
+    uint64_t hash, len;
     int error;
     bool replayed_zil = B_FALSE;
 
@@ -1400,67 +1482,15 @@ zvol_os_create_minor(const char *name)
     if (error)
         goto out_dmu_objset_disown;
 
-    error = dsl_prop_get_integer(name,
-        zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
-    if (error || volmode == ZFS_VOLMODE_DEFAULT)
-        volmode = zvol_volmode;
-    error = 0;
+    error = zvol_alloc(name, volsize, doi->doi_data_block_size, &zv);
+    if (error || zv == NULL)
+        goto out_dmu_objset_disown;
 
-    /*
-     * zvol_alloc equivalent ...
-     */
-    zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
     zv->zv_hash = hash;
-    mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
-    cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
-    zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
-    zv->zv_volmode = volmode;
-    if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
-        struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
-        struct g_provider *pp;
-        struct g_geom *gp;
-
-        g_topology_lock();
-        gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
-        gp->start = zvol_geom_bio_start;
-        gp->access = zvol_geom_access;
-        pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
-        pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
-        pp->sectorsize = DEV_BSIZE;
-        pp->mediasize = 0;
-        pp->private = zv;
-
-        zsg->zsg_provider = pp;
-    } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
-        struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
-        struct cdev *dev;
-        struct make_dev_args args;
-
-        make_dev_args_init(&args);
-        args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
-        args.mda_devsw = &zvol_cdevsw;
-        args.mda_cr = NULL;
-        args.mda_uid = UID_ROOT;
-        args.mda_gid = GID_OPERATOR;
-        args.mda_mode = 0640;
-        args.mda_si_drv2 = zv;
-        if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
-            == 0) {
-            dev->si_iosize_max = maxphys;
-            zsd->zsd_cdev = dev;
-            knlist_init_sx(&zsd->zsd_selinfo.si_note,
-                &zv->zv_state_lock);
-        }
-    }
-    (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
-    rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
-    zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
 
     if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
         zv->zv_flags |= ZVOL_RDONLY;
 
-    zv->zv_volblocksize = doi->doi_data_block_size;
-    zv->zv_volsize = volsize;
     zv->zv_objset = os;
 
     ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
@@ -1490,13 +1520,14 @@ zvol_os_create_minor(const char *name)
 
 out_dmu_objset_disown:
     dmu_objset_disown(os, B_TRUE, FTAG);
-    if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
+    if (error == 0 && zv && zv->zv_volmode == ZFS_VOLMODE_GEOM) {
         g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0);
+        /* geom was locked inside zvol_alloc() */
         g_topology_unlock();
     }
 out_doi:
     kmem_free(doi, sizeof (dmu_object_info_t));
-    if (error == 0) {
+    if (error == 0 && zv) {
         rw_enter(&zvol_state_lock, RW_WRITER);
         zvol_insert(zv);
         zvol_minors++;
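The refactored zvol_alloc() now follows a tri-state contract: nonzero errno on failure, zero with *zvp left NULL when no device is wanted (volmode=none), and zero with *zvp set on success; the Linux port below adopts the same shape. A generic userspace sketch of this out-parameter pattern (all names invented):

#include <errno.h>
#include <stdlib.h>

struct device { int id; };

/*
 * Returns nonzero errno on failure; 0 with *devp == NULL when no
 * device is wanted; 0 with *devp set on success.
 */
static int
device_alloc(int wanted, struct device **devp)
{
    *devp = NULL;
    if (wanted < 0)
        return (EINVAL);    /* hard error */
    if (wanted == 0)
        return (0);         /* nothing to do, not an error */
    *devp = calloc(1, sizeof (**devp));
    return (*devp == NULL ? ENOMEM : 0);
}

Callers then check error || dev == NULL before touching the result, exactly as zvol_os_create_minor() does above.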
- */ - zv = kmem_zalloc(sizeof (*zv), KM_SLEEP); zv->zv_hash = hash; - mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); - zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); - zv->zv_volmode = volmode; - if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp; - struct g_geom *gp; - - g_topology_lock(); - gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); - gp->start = zvol_geom_bio_start; - gp->access = zvol_geom_access; - pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); - pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; - pp->sectorsize = DEV_BSIZE; - pp->mediasize = 0; - pp->private = zv; - - zsg->zsg_provider = pp; - } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; - struct cdev *dev; - struct make_dev_args args; - - make_dev_args_init(&args); - args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; - args.mda_devsw = &zvol_cdevsw; - args.mda_cr = NULL; - args.mda_uid = UID_ROOT; - args.mda_gid = GID_OPERATOR; - args.mda_mode = 0640; - args.mda_si_drv2 = zv; - if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name) - == 0) { - dev->si_iosize_max = maxphys; - zsd->zsd_cdev = dev; - knlist_init_sx(&zsd->zsd_selinfo.si_note, - &zv->zv_state_lock); - } - } - (void) strlcpy(zv->zv_name, name, MAXPATHLEN); - rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); - zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) zv->zv_flags |= ZVOL_RDONLY; - zv->zv_volblocksize = doi->doi_data_block_size; - zv->zv_volsize = volsize; zv->zv_objset = os; ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); @@ -1490,13 +1520,14 @@ zvol_os_create_minor(const char *name) out_dmu_objset_disown: dmu_objset_disown(os, B_TRUE, FTAG); - if (error == 0 && volmode == ZFS_VOLMODE_GEOM) { + if (error == 0 && zv && zv->zv_volmode == ZFS_VOLMODE_GEOM) { g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0); + /* geom was locked inside zvol_alloc() function */ g_topology_unlock(); } out_doi: kmem_free(doi, sizeof (dmu_object_info_t)); - if (error == 0) { + if (error == 0 && zv) { rw_enter(&zvol_state_lock, RW_WRITER); zvol_insert(zv); zvol_minors++; diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c index 337a4bcf76a0..9fe008cef868 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c @@ -302,13 +302,8 @@ spl_kmem_free_impl(const void *buf, size_t size) #ifdef DEBUG_KMEM /* Shim layer memory accounting */ -#ifdef HAVE_ATOMIC64_T atomic64_t kmem_alloc_used = ATOMIC64_INIT(0); -unsigned long long kmem_alloc_max = 0; -#else /* HAVE_ATOMIC64_T */ -atomic_t kmem_alloc_used = ATOMIC_INIT(0); -unsigned long long kmem_alloc_max = 0; -#endif /* HAVE_ATOMIC64_T */ +uint64_t kmem_alloc_max = 0; EXPORT_SYMBOL(kmem_alloc_used); EXPORT_SYMBOL(kmem_alloc_max); @@ -320,9 +315,9 @@ spl_kmem_alloc_debug(size_t size, int flags, int node) ptr = spl_kmem_alloc_impl(size, flags, node); if (ptr) { - kmem_alloc_used_add(size); - if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) - kmem_alloc_max = kmem_alloc_used_read(); + atomic64_add(size, &kmem_alloc_used); + if (unlikely(atomic64_read(&kmem_alloc_used) > kmem_alloc_max)) + kmem_alloc_max = atomic64_read(&kmem_alloc_used); } return (ptr); @@ -331,7 +326,7 @@ spl_kmem_alloc_debug(size_t 
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
index 4ed0deedd5b9..8cdd5fc5cfe5 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
@@ -82,11 +82,7 @@ proc_domemused(CONST_CTL_TABLE *table, int write,
     if (write) {
         *ppos += *lenp;
     } else {
-#ifdef HAVE_ATOMIC64_T
         val = atomic64_read((atomic64_t *)table->data);
-#else
-        val = atomic_read((atomic_t *)table->data);
-#endif /* HAVE_ATOMIC64_T */
         rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
     }
 
@@ -315,18 +311,14 @@ static struct ctl_table spl_kmem_table[] = {
     {
         .procname       = "kmem_used",
         .data           = &kmem_alloc_used,
-#ifdef HAVE_ATOMIC64_T
         .maxlen         = sizeof (atomic64_t),
-#else
-        .maxlen         = sizeof (atomic_t),
-#endif /* HAVE_ATOMIC64_T */
         .mode           = 0444,
         .proc_handler   = &proc_domemused,
     },
     {
         .procname       = "kmem_max",
         .data           = &kmem_alloc_max,
-        .maxlen         = sizeof (unsigned long),
+        .maxlen         = sizeof (uint64_t),
         .extra1         = &table_min,
        .extra2         = &table_max,
         .mode           = 0444,
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
index 1398483a3ac8..f42f455222de 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
@@ -28,6 +28,7 @@
 #include <sys/kmem.h>
 #include <sys/tsd.h>
 #include <sys/string.h>
+#include <sys/misc.h>
 
 /*
  * Thread interfaces
@@ -197,3 +198,14 @@ issig(void)
 }
 
 EXPORT_SYMBOL(issig);
+
+/*
+ * Check if the current thread is a memory reclaim thread.
+ * Returns true if current thread is kswapd.
+ */
+int
+current_is_reclaim_thread(void)
+{
+    return (current_is_kswapd());
+}
+EXPORT_SYMBOL(current_is_reclaim_thread);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
index e1140b31a97a..248c9b7a6d3b 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
@@ -256,10 +256,6 @@ abd_unmark_zfs_page(struct page *page)
 
 #ifndef CONFIG_HIGHMEM
 
-#ifndef __GFP_RECLAIM
-#define __GFP_RECLAIM       __GFP_WAIT
-#endif
-
 /*
  * The goal is to minimize fragmentation by preferentially populating ABDs
  * with higher order compound pages from a single zone.  Allocation size is
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
index 84b25cb2c5ac..6552a933ce0a 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
@@ -511,8 +511,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
     zp->z_pflags = 0;
     zp->z_mode = 0;
     zp->z_sync_cnt = 0;
-    zp->z_sync_writes_cnt = 0;
-    zp->z_async_writes_cnt = 0;
     ip->i_generation = 0;
     ip->i_ino = id;
     ip->i_mode = (S_IFDIR | S_IRWXUGO);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
index d193eb80dca2..c729947369c2 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
@@ -260,24 +260,12 @@ zfs_file_fsync(zfs_file_t *filp, int flags)
 {
     int datasync = 0;
     int error;
-    int fstrans;
 
     if (flags & O_DSYNC)
         datasync = 1;
 
-    /*
-     * May enter XFS which generates a warning when PF_FSTRANS is set.
-     * To avoid this the flag is cleared over vfs_sync() and then reset.
-     */
-    fstrans = __spl_pf_fstrans_check();
-    if (fstrans)
-        current->flags &= ~(__SPL_PF_FSTRANS);
-
     error = -vfs_fsync(filp, datasync);
-    if (fstrans)
-        current->flags |= __SPL_PF_FSTRANS;
-
     return (error);
 }
 
@@ -292,14 +280,6 @@ int
 zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len)
 {
     /*
-     * May enter XFS which generates a warning when PF_FSTRANS is set.
-     * To avoid this the flag is cleared over vfs_sync() and then reset.
-     */
-    int fstrans = __spl_pf_fstrans_check();
-    if (fstrans)
-        current->flags &= ~(__SPL_PF_FSTRANS);
-
-    /*
      * When supported by the underlying file system preferentially
      * use the fallocate() callback to preallocate the space.
      */
@@ -308,9 +288,6 @@ zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len)
         error = -fp->f_op->fallocate(fp,
             FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len);
 
-    if (fstrans)
-        current->flags |= __SPL_PF_FSTRANS;
-
     if (error)
         return (SET_ERROR(error));
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
index a3837f784668..396faef8f646 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
@@ -1217,6 +1217,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
 }
 
 /*
+ * Dentry and inode caches referenced by a task in non-root memcg are
+ * not going to be scanned by the kernel-provided shrinker.  So, if the
+ * kernel prunes nothing, fall back to this manual walk to free dnodes.
+ * To avoid scanning the same znodes multiple times they are always rotated
+ * to the end of the z_all_znodes list.  New znodes are inserted at the
+ * end of the list so we're always scanning the oldest znodes first.
+ */
+static int
+zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
+{
+    znode_t **zp_array, *zp;
+    int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *));
+    int objects = 0;
+    int i = 0, j = 0;
+
+    zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
+
+    mutex_enter(&zfsvfs->z_znodes_lock);
+    while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
+
+        if ((i++ > nr_to_scan) || (j >= max_array))
+            break;
+
+        ASSERT(list_link_active(&zp->z_link_node));
+        list_remove(&zfsvfs->z_all_znodes, zp);
+        list_insert_tail(&zfsvfs->z_all_znodes, zp);
+
+        /* Skip active znodes and .zfs entries */
+        if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir)
+            continue;
+
+        if (igrab(ZTOI(zp)) == NULL)
+            continue;
+
+        zp_array[j] = zp;
+        j++;
+    }
+    mutex_exit(&zfsvfs->z_znodes_lock);
+
+    for (i = 0; i < j; i++) {
+        zp = zp_array[i];
+
+        ASSERT3P(zp, !=, NULL);
+        d_prune_aliases(ZTOI(zp));
+
+        if (atomic_read(&ZTOI(zp)->i_count) == 1)
+            objects++;
+
+        zrele(zp);
+    }
+
+    vmem_free(zp_array, max_array * sizeof (znode_t *));
+
+    return (objects);
+}
+
+/*
  * The ARC has requested that the filesystem drop entries from the dentry
  * and inode caches.  This can occur when the ARC needs to free meta data
  * blocks but can't because they are all pinned by entries in these caches.
@@ -1267,6 +1324,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
     *objects = (*shrinker->scan_objects)(shrinker, &sc);
 #endif
 
+    /*
+     * Fall back to zfs_prune_aliases if the kernel's shrinker did nothing
+     * due to dentry and inode caches being referenced by a task running
+     * in non-root memcg.
+     */
+    if (*objects == 0)
+        *objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
+
     zfs_exit(zfsvfs, FTAG);
 
     dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
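The rotate-to-tail trick in zfs_prune_aliases() keeps a partial scan from revisiting the same entries on the next call. A standalone illustration of the pattern with sys/queue.h (names invented):

#include <sys/queue.h>
#include <stdio.h>

struct node {
    int id;
    TAILQ_ENTRY(node) link;
};
TAILQ_HEAD(nodelist, node);

/* Visit up to 'budget' nodes, rotating each to the tail as we go. */
static void
scan_some(struct nodelist *list, int budget)
{
    struct node *n;

    while (budget-- > 0 && (n = TAILQ_FIRST(list)) != NULL) {
        TAILQ_REMOVE(list, n, link);
        TAILQ_INSERT_TAIL(list, n, link);
        printf("visited %d\n", n->id);
    }
}

int
main(void)
{
    struct nodelist list = TAILQ_HEAD_INITIALIZER(list);
    struct node nodes[4] = {{.id = 0}, {.id = 1}, {.id = 2}, {.id = 3}};

    for (int i = 0; i < 4; i++)
        TAILQ_INSERT_TAIL(&list, &nodes[i], link);

    scan_some(&list, 2);    /* visits 0 and 1; they move behind 2 and 3 */
    scan_some(&list, 2);    /* visits 2 and 3, not 0 and 1 again */
    return (0);
}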
- */ - if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { - zil_commit(zfsvfs->z_log, zp->z_id); - } - if (PageWriteback(pp)) #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT folio_wait_bit(page_folio(pp), PG_writeback); @@ -3861,8 +3842,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, * was in fact not skipped and should not be counted as if it were. */ wbc->pages_skipped--; - if (!for_sync) - atomic_inc_32(&zp->z_async_writes_cnt); set_page_writeback(pp); unlock_page(pp); @@ -3881,8 +3860,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, #endif ClearPageError(pp); end_page_writeback(pp); - if (!for_sync) - atomic_dec_32(&zp->z_async_writes_cnt); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (err); @@ -3908,35 +3885,61 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); - boolean_t commit = B_FALSE; - if (wbc->sync_mode != WB_SYNC_NONE) { - /* - * Note that this is rarely called under writepages(), because - * writepages() normally handles the entire commit for - * performance reasons. - */ - commit = B_TRUE; - } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) { - /* - * If the caller does not intend to wait synchronously - * for this page writeback to complete and there are active - * synchronous calls on this file, do a commit so that - * the latter don't accidentally end up waiting for - * our writeback to complete. Refer to the comment in - * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details. - */ - commit = B_TRUE; - } + /* + * A note about for_sync vs wbc->sync_mode. + * + * for_sync indicates that this is a syncing writeback, that is, kernel + * caller expects the data to be durably stored before being notified. + * Often, but not always, the call was triggered by a userspace syncing + * op (eg fsync(), msync(MS_SYNC)). For our purposes, for_sync==TRUE + * means that that page should remain "locked" (in the writeback state) + * until it is definitely on disk (ie zil_commit() or spa_sync()). + * Otherwise, we can unlock and return as soon as it is on the + * in-memory ZIL. + * + * wbc->sync_mode has similar meaning. wbc is passed from the kernel to + * zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE + * indicates this a regular async writeback (eg a cache eviction) and + * so does not need a durability guarantee, while WB_SYNC_ALL indicates + * a syncing op that must be waited on (by convention, we test for + * !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer durability over + * performance should there ever be a new mode that we have not yet + * added support for). + * + * So, why a separate for_sync field? This is because zpl_writepages() + * calls zfs_putpage() multiple times for a single "logical" operation. + * It wants all the individual pages to be for_sync==TRUE ie only + * unlocked once durably stored, but it only wants one call to + * zil_commit() at the very end, once all the pages are synced. So, + * it repurposes sync_mode slightly to indicate who issue and wait for + * the IO: for NONE, the caller to zfs_putpage() will do it, while for + * ALL, zfs_putpage should do it. 
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c
index 54e60b4820f6..7683eeb3cf9f 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c
@@ -126,8 +126,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
     zp->z_acl_cached = NULL;
     zp->z_xattr_cached = NULL;
     zp->z_xattr_parent = 0;
-    zp->z_sync_writes_cnt = 0;
-    zp->z_async_writes_cnt = 0;
 
     return (0);
 }
@@ -149,9 +147,6 @@ zfs_znode_cache_destructor(void *buf, void *arg)
     ASSERT3P(zp->z_dirlocks, ==, NULL);
     ASSERT3P(zp->z_acl_cached, ==, NULL);
     ASSERT3P(zp->z_xattr_cached, ==, NULL);
-
-    ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
-    ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
 }
 
 static int
@@ -371,6 +366,12 @@ zfs_inode_alloc(struct super_block *sb, struct inode **ip)
     return (0);
 }
 
+void
+zfs_inode_free(struct inode *ip)
+{
+    kmem_cache_free(znode_cache, ITOZ(ip));
+}
+
 /*
  * Called in multiple places when an inode should be destroyed.
  */
@@ -395,8 +396,15 @@ zfs_inode_destroy(struct inode *ip)
         nvlist_free(zp->z_xattr_cached);
         zp->z_xattr_cached = NULL;
     }
-
-    kmem_cache_free(znode_cache, zp);
+#ifndef HAVE_SOPS_FREE_INODE
+    /*
+     * The inode needs to be freed in an RCU callback.  If we have
+     * super_operations->free_inode, the Linux kernel will do call_rcu
+     * for us.  But if we don't have it, since call_rcu is a GPL-only
+     * symbol, we can only free synchronously and accept the risk.
+     */
+    zfs_inode_free(ip);
+#endif
 }
 
 static void
@@ -535,8 +543,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
     zp->z_blksz = blksz;
     zp->z_seq = 0x7A4653;
     zp->z_sync_cnt = 0;
-    zp->z_sync_writes_cnt = 0;
-    zp->z_async_writes_cnt = 0;
 
     zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
index 1a82c13e1523..ef7bd7352084 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
@@ -111,52 +111,11 @@ zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
     struct inode *inode = filp->f_mapping->host;
     znode_t *zp = ITOZ(inode);
-    zfsvfs_t *zfsvfs = ITOZSB(inode);
     cred_t *cr = CRED();
     int error;
     fstrans_cookie_t cookie;
 
-    /*
-     * The variables z_sync_writes_cnt and z_async_writes_cnt work in
-     * tandem so that sync writes can detect if there are any non-sync
-     * writes going on and vice-versa.
The "vice-versa" part to this logic - * is located in zfs_putpage() where non-sync writes check if there are - * any ongoing sync writes. If any sync and non-sync writes overlap, - * we do a commit to complete the non-sync writes since the latter can - * potentially take several seconds to complete and thus block sync - * writes in the upcoming call to filemap_write_and_wait_range(). - */ - atomic_inc_32(&zp->z_sync_writes_cnt); - /* - * If the following check does not detect an overlapping non-sync write - * (say because it's just about to start), then it is guaranteed that - * the non-sync write will detect this sync write. This is because we - * always increment z_sync_writes_cnt / z_async_writes_cnt before doing - * the check on z_async_writes_cnt / z_sync_writes_cnt here and in - * zfs_putpage() respectively. - */ - if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { - if ((error = zpl_enter(zfsvfs, FTAG)) != 0) { - atomic_dec_32(&zp->z_sync_writes_cnt); - return (error); - } - zil_commit(zfsvfs->z_log, zp->z_id); - zpl_exit(zfsvfs, FTAG); - } - error = filemap_write_and_wait_range(inode->i_mapping, start, end); - - /* - * The sync write is not complete yet but we decrement - * z_sync_writes_cnt since zfs_fsync() increments and decrements - * it internally. If a non-sync write starts just after the decrement - * operation but before we call zfs_fsync(), it may not detect this - * overlapping sync write but it does not matter since we have already - * gone past filemap_write_and_wait_range() and we won't block due to - * the non-sync write. - */ - atomic_dec_32(&zp->z_sync_writes_cnt); - if (error) return (error); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index a682bfd33c38..94dcdd0b887d 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c @@ -45,6 +45,15 @@ zpl_inode_alloc(struct super_block *sb) return (ip); } +#ifdef HAVE_SOPS_FREE_INODE +static void +zpl_inode_free(struct inode *ip) +{ + ASSERT(atomic_read(&ip->i_count) == 0); + zfs_inode_free(ip); +} +#endif + static void zpl_inode_destroy(struct inode *ip) { @@ -455,6 +464,9 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg) const struct super_operations zpl_super_operations = { .alloc_inode = zpl_inode_alloc, +#ifdef HAVE_SOPS_FREE_INODE + .free_inode = zpl_inode_free, +#endif .destroy_inode = zpl_inode_destroy, .dirty_inode = zpl_dirty_inode, .write_inode = NULL, diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index 57a9711e9027..a7431cc4da9d 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -1302,27 +1302,30 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) * Allocate memory for a new zvol_state_t and setup the required * request queue and generic disk structures for the block device. 
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index 57a9711e9027..a7431cc4da9d 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -1302,27 +1302,30 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
  * Allocate memory for a new zvol_state_t and setup the required
  * request queue and generic disk structures for the block device.
  */
-static zvol_state_t *
-zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
+static int
+zvol_alloc(dev_t dev, const char *name, uint64_t volsize,
+    uint64_t volblocksize, zvol_state_t **zvp)
 {
     zvol_state_t *zv;
     struct zvol_state_os *zso;
     uint64_t volmode;
     int ret;
 
-    if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
-        return (NULL);
+    ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL);
+    if (ret)
+        return (ret);
 
     if (volmode == ZFS_VOLMODE_DEFAULT)
         volmode = zvol_volmode;
 
     if (volmode == ZFS_VOLMODE_NONE)
-        return (NULL);
+        return (0);
 
     zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
     zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
     zv->zv_zso = zso;
     zv->zv_volmode = volmode;
+    zv->zv_volsize = volsize;
     zv->zv_volblocksize = volblocksize;
     list_link_init(&zv->zv_next);
@@ -1396,12 +1399,13 @@ zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
     snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
         ZVOL_DEV_NAME, (dev & MINORMASK));
 
-    return (zv);
+    *zvp = zv;
+    return (ret);
 
 out_kmem:
     kmem_free(zso, sizeof (struct zvol_state_os));
     kmem_free(zv, sizeof (zvol_state_t));
-    return (NULL);
+    return (ret);
 }
 
 /*
@@ -1562,7 +1566,7 @@ zvol_os_add_disk(struct gendisk *disk)
 int
 zvol_os_create_minor(const char *name)
 {
-    zvol_state_t *zv;
+    zvol_state_t *zv = NULL;
     objset_t *os;
     dmu_object_info_t *doi;
     uint64_t volsize;
@@ -1611,18 +1615,16 @@ zvol_os_create_minor(const char *name)
     if (error)
         goto out_dmu_objset_disown;
 
-    zv = zvol_alloc(MKDEV(zvol_major, minor), name,
-        doi->doi_data_block_size);
-    if (zv == NULL) {
-        error = SET_ERROR(EAGAIN);
+    error = zvol_alloc(MKDEV(zvol_major, minor), name,
+        volsize, doi->doi_data_block_size, &zv);
+    if (error || zv == NULL)
         goto out_dmu_objset_disown;
-    }
+
     zv->zv_hash = hash;
 
     if (dmu_objset_is_snapshot(os))
         zv->zv_flags |= ZVOL_RDONLY;
 
-    zv->zv_volsize = volsize;
     zv->zv_objset = os;
 
     /* Default */
@@ -1689,7 +1691,7 @@ out_doi:
      * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
      * directly as well.
      */
-    if (error == 0) {
+    if (error == 0 && zv) {
         rw_enter(&zvol_state_lock, RW_WRITER);
         zvol_insert(zv);
         rw_exit(&zvol_state_lock);
@@ -1701,7 +1703,7 @@ out_doi:
     return (error);
 }
 
-void
+int
 zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
 {
     int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
@@ -1728,6 +1730,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
     set_disk_ro(zv->zv_zso->zvo_disk, readonly);
 
     dataset_kstats_rename(&zv->zv_kstat, newname);
+
+    return (0);
 }
 
 void