Diffstat (limited to 'sys/contrib/openzfs/module/os')
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c | 9
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c | 1
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c | 32
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c | 32
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c | 387
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c | 7
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c | 155
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c | 22
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-proc.c | 10
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-thread.c | 12
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/abd_os.c | 4
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c | 2
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c | 23
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c | 65
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c | 99
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c | 24
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c | 41
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c | 12
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c | 36
19 files changed, 712 insertions(+), 261 deletions(-)
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
index f9125a067cd1..3f360d167b17 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
@@ -101,6 +101,15 @@ spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
va_end(ap);
}
+/*
+ * Check if the current thread is a memory reclaim thread.
+ * Returns true if curproc is pageproc (FreeBSD's page daemon).
+ */
+int
+current_is_reclaim_thread(void)
+{
+ return (curproc == pageproc);
+}
SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY,
opensolaris_utsname_init, NULL);
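current_is_reclaim_thread() gives portable ZFS code a single predicate for "am I running inside the OS memory-reclaim thread?" (pageproc here; kswapd in the Linux SPL change further down). A minimal caller sketch, assuming a hypothetical helper example_alloc() that is not part of this diff:

static void *
example_alloc(size_t size)
{
	/*
	 * Sleeping for memory while running as the reclaim thread
	 * could deadlock reclaim itself, so use a non-sleeping
	 * allocation there and let the caller cope with NULL.
	 */
	if (current_is_reclaim_thread())
		return (kmem_alloc(size, KM_NOSLEEP));
	return (kmem_alloc(size, KM_SLEEP));
}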
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
index 733c2bd07ebb..9d5f025423a1 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
@@ -43,6 +43,7 @@
const int zfs_vm_pagerret_bad = VM_PAGER_BAD;
const int zfs_vm_pagerret_error = VM_PAGER_ERROR;
const int zfs_vm_pagerret_ok = VM_PAGER_OK;
+const int zfs_vm_pagerret_pend = VM_PAGER_PEND;
const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC;
const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL;
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
index 334264f6da2f..5c5adc6cc12b 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
@@ -2357,10 +2357,42 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr,
* In FreeBSD, we don't care about permissions of individual ADS.
* Note that not checking them is not just an optimization - without
* this shortcut, EA operations may bogusly fail with EACCES.
+ *
+ * For a named attribute access (V_NAMEDATTR set), however, fall
+ * through and perform the checks.
*/
+#if __FreeBSD_version >= 1500040
+ if ((zp->z_pflags & ZFS_XATTR) && (flags & V_NAMEDATTR) == 0)
+#else
if (zp->z_pflags & ZFS_XATTR)
+#endif
return (0);
+ /*
+ * If this is a named attribute directory, then validate against the base file.
+ */
+ if (is_attr) {
+ if ((error = zfs_zget(ZTOZSB(zp),
+ zp->z_xattr_parent, &xzp)) != 0) {
+ return (error);
+ }
+
+ check_zp = xzp;
+
+ /*
+ * fixup mode to map to xattr perms
+ */
+
+ if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+ mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ mode |= ACE_WRITE_NAMED_ATTRS;
+ }
+
+ if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+ mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+ mode |= ACE_READ_NAMED_ATTRS;
+ }
+ }
+
owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
/*
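The fixup above is easier to follow with concrete bits. A standalone illustration, with mask values copied from the NFSv4 ACE access mask (verify against the ACL headers before reuse; this program is purely illustrative):

#include <stdio.h>

#define ACE_READ_DATA		0x00000001
#define ACE_WRITE_DATA		0x00000002
#define ACE_APPEND_DATA		0x00000004
#define ACE_READ_NAMED_ATTRS	0x00000008
#define ACE_WRITE_NAMED_ATTRS	0x00000010
#define ACE_EXECUTE		0x00000020

int
main(void)
{
	int mode = ACE_READ_DATA | ACE_WRITE_DATA;

	/* Same remap as zfs_zaccess() performs for named attributes. */
	if (mode & (ACE_WRITE_DATA | ACE_APPEND_DATA)) {
		mode &= ~(ACE_WRITE_DATA | ACE_APPEND_DATA);
		mode |= ACE_WRITE_NAMED_ATTRS;
	}
	if (mode & (ACE_READ_DATA | ACE_EXECUTE)) {
		mode &= ~(ACE_READ_DATA | ACE_EXECUTE);
		mode |= ACE_READ_NAMED_ATTRS;
	}
	/* Prints 0x18: ACE_WRITE_NAMED_ATTRS | ACE_READ_NAMED_ATTRS. */
	printf("0x%x\n", mode);
	return (0);
}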
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
index 493ac9f69ad4..0456552ed07e 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
@@ -1209,6 +1209,8 @@ zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
}
+extern int zfs_xattr_compat;
+
static int
zfs_domount(vfs_t *vfsp, char *osname)
{
@@ -1289,6 +1291,16 @@ zfs_domount(vfs_t *vfsp, char *osname)
goto out;
}
+#if __FreeBSD_version >= 1500040
+ /*
+ * Named attributes can only work if the xattr property is set to
+ * on/dir and not sa. Also, zfs_xattr_compat must be set.
+ */
+ if ((zfsvfs->z_flags & ZSB_XATTR) != 0 && !zfsvfs->z_xattr_sa &&
+ zfs_xattr_compat)
+ vfsp->mnt_flag |= MNT_NAMEDATTR;
+#endif
+
vfs_mountedfrom(vfsp, osname);
if (!zfsvfs->z_issnap)
@@ -1812,6 +1824,14 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
err = vn_lock(*vpp, flags);
if (err != 0)
vrele(*vpp);
+#if __FreeBSD_version >= 1500040
+ else if ((zp->z_pflags & ZFS_XATTR) != 0) {
+ if ((*vpp)->v_type == VDIR)
+ vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
+ else
+ vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR);
+ }
+#endif
}
if (err != 0)
*vpp = NULL;
@@ -1964,9 +1984,17 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
*vpp = ZTOV(zp);
zfs_exit(zfsvfs, FTAG);
err = vn_lock(*vpp, flags);
- if (err == 0)
+ if (err == 0) {
vnode_create_vobject(*vpp, zp->z_size, curthread);
- else
+#if __FreeBSD_version >= 1500040
+ if ((zp->z_pflags & ZFS_XATTR) != 0) {
+ if ((*vpp)->v_type == VDIR)
+ vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
+ else
+ vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR);
+ }
+#endif
+ } else
*vpp = NULL;
return (err);
}
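With MNT_NAMEDATTR set at mount time, userspace can probe support through the pathconf(2) names this patch adds to zfs_freebsd_pathconf() below. A hedged sketch for a FreeBSD 15 system ("/tank/file" is a placeholder path):

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/* 1 if the mount has named attributes enabled, else 0. */
	long enabled = pathconf("/tank/file", _PC_NAMEDATTR_ENABLED);
	/* 1 if this particular file already has named attributes. */
	long has = pathconf("/tank/file", _PC_HAS_NAMEDATTR);

	printf("enabled=%ld has=%ld\n", enabled, has);
	return (enabled == 1 ? 0 : 1);
}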
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
index 8a5006c488f3..c4270d8b5d5c 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -25,6 +25,7 @@
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2025, Klara, Inc.
*/
/* Portions Copyright 2007 Jeremy Teo */
@@ -114,6 +115,8 @@ typedef uint64_t cookie_t;
typedef ulong_t cookie_t;
#endif
+static int zfs_check_attrname(const char *name);
+
/*
* Programming rules.
*
@@ -813,7 +816,12 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp,
/*
* Do we have permission to get into attribute directory?
*/
- error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr, NULL);
+ if (flags & LOOKUP_NAMED_ATTR)
+ error = zfs_zaccess(zp, ACE_EXECUTE, V_NAMEDATTR,
+ B_FALSE, cr, NULL);
+ else
+ error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr,
+ NULL);
if (error) {
vrele(ZTOV(zp));
}
@@ -4299,6 +4307,33 @@ zfs_freebsd_getpages(struct vop_getpages_args *ap)
ap->a_rahead));
}
+typedef struct {
+ uint_t pca_npages;
+ vm_page_t pca_pages[];
+} putpage_commit_arg_t;
+
+static void
+zfs_putpage_commit_cb(void *arg)
+{
+ putpage_commit_arg_t *pca = arg;
+ vm_object_t object = pca->pca_pages[0]->object;
+
+ zfs_vmobject_wlock(object);
+
+ for (uint_t i = 0; i < pca->pca_npages; i++) {
+ vm_page_t pp = pca->pca_pages[i];
+ vm_page_undirty(pp);
+ vm_page_sunbusy(pp);
+ }
+
+ vm_object_pip_wakeupn(object, pca->pca_npages);
+
+ zfs_vmobject_wunlock(object);
+
+ kmem_free(pca,
+ offsetof(putpage_commit_arg_t, pca_pages[pca->pca_npages]));
+}
+
static int
zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
int *rtvals)
@@ -4400,10 +4435,12 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
}
if (zp->z_blksz < PAGE_SIZE) {
- for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
- tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
+ vm_ooffset_t woff = off;
+ size_t wlen = len;
+ for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) {
+ tocopy = MIN(PAGE_SIZE, wlen);
va = zfs_map_page(ma[i], &sf);
- dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
+ dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va, tx);
zfs_unmap_page(sf);
}
} else {
@@ -4424,19 +4461,48 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
ASSERT0(err);
- /*
- * XXX we should be passing a callback to undirty
- * but that would make the locking messier
- */
- zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
- len, commit, B_FALSE, NULL, NULL);
- zfs_vmobject_wlock(object);
- for (i = 0; i < ncount; i++) {
- rtvals[i] = zfs_vm_pagerret_ok;
- vm_page_undirty(ma[i]);
+ if (commit) {
+ /*
+ * Caller requested that we commit immediately. We set
+ * a callback on the log entry, to be called once its
+ * on disk after the call to zil_commit() below. The
+ * pages will be undirtied and unbusied there.
+ */
+ putpage_commit_arg_t *pca = kmem_alloc(
+ offsetof(putpage_commit_arg_t, pca_pages[ncount]),
+ KM_SLEEP);
+ pca->pca_npages = ncount;
+ memcpy(pca->pca_pages, ma, sizeof (vm_page_t) * ncount);
+
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
+ B_TRUE, B_FALSE, zfs_putpage_commit_cb, pca);
+
+ for (i = 0; i < ncount; i++)
+ rtvals[i] = zfs_vm_pagerret_pend;
+ } else {
+ /*
+ * Caller just wants the page written back somewhere,
+ * but doesn't need it committed yet. We've already
+ * written it back to the DMU, so we just need to put
+ * it on the async log, then undirty the page and
+ * return.
+ *
+ * We cannot use a callback here, because it would keep
+ * the page busy (locked) until it is eventually
+ * written down at txg sync.
+ */
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
+ B_FALSE, B_FALSE, NULL, NULL);
+
+ zfs_vmobject_wlock(object);
+ for (i = 0; i < ncount; i++) {
+ rtvals[i] = zfs_vm_pagerret_ok;
+ vm_page_undirty(ma[i]);
+ }
+ zfs_vmobject_wunlock(object);
}
- zfs_vmobject_wunlock(object);
+
VM_CNT_INC(v_vnodeout);
VM_CNT_ADD(v_vnodepgsout, ncount);
}
@@ -4707,8 +4773,16 @@ zfs_freebsd_access(struct vop_access_args *ap)
* ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND,
*/
accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
- if (accmode != 0)
- error = zfs_access(zp, accmode, 0, ap->a_cred);
+ if (accmode != 0) {
+#if __FreeBSD_version >= 1500040
+ /* For named attributes, do the checks. */
+ if ((vn_irflag_read(vp) & VIRF_NAMEDATTR) != 0)
+ error = zfs_access(zp, accmode, V_NAMEDATTR,
+ ap->a_cred);
+ else
+#endif
+ error = zfs_access(zp, accmode, 0, ap->a_cred);
+ }
/*
* VADMIN has to be handled by vaccess().
@@ -4741,6 +4815,190 @@ struct vop_lookup_args {
};
#endif
+#if __FreeBSD_version >= 1500040
+static int
+zfs_lookup_nameddir(struct vnode *dvp, struct componentname *cnp,
+ struct vnode **vpp)
+{
+ struct vnode *xvp;
+ int error, flags;
+
+ *vpp = NULL;
+ flags = LOOKUP_XATTR | LOOKUP_NAMED_ATTR;
+ if ((cnp->cn_flags & CREATENAMED) != 0)
+ flags |= CREATE_XATTR_DIR;
+ error = zfs_lookup(dvp, NULL, &xvp, NULL, 0, cnp->cn_cred, flags,
+ B_FALSE);
+ if (error == 0) {
+ if ((cnp->cn_flags & LOCKLEAF) != 0)
+ error = vn_lock(xvp, cnp->cn_lkflags);
+ if (error == 0) {
+ vn_irflag_set_cond(xvp, VIRF_NAMEDDIR);
+ *vpp = xvp;
+ } else {
+ vrele(xvp);
+ }
+ }
+ return (error);
+}
+
+static ssize_t
+zfs_readdir_named(struct vnode *vp, char *buf, ssize_t blen, off_t *offp,
+ int *eofflagp, struct ucred *cred, struct thread *td)
+{
+ struct uio io;
+ struct iovec iv;
+ zfs_uio_t uio;
+ int error;
+
+ io.uio_offset = *offp;
+ io.uio_segflg = UIO_SYSSPACE;
+ io.uio_rw = UIO_READ;
+ io.uio_td = td;
+ iv.iov_base = buf;
+ iv.iov_len = blen;
+ io.uio_iov = &iv;
+ io.uio_iovcnt = 1;
+ io.uio_resid = blen;
+ zfs_uio_init(&uio, &io);
+ error = zfs_readdir(vp, &uio, cred, eofflagp, NULL, NULL);
+ if (error != 0)
+ return (-1);
+ *offp = io.uio_offset;
+ return (blen - io.uio_resid);
+}
+
+static bool
+zfs_has_namedattr(struct vnode *vp, struct ucred *cred)
+{
+ struct componentname cn;
+ struct vnode *xvp;
+ struct dirent *dp;
+ off_t offs;
+ ssize_t rsize;
+ char *buf, *cp, *endcp;
+ int eofflag, error;
+ bool ret;
+
+ MNT_ILOCK(vp->v_mount);
+ if ((vp->v_mount->mnt_flag & MNT_NAMEDATTR) == 0) {
+ MNT_IUNLOCK(vp->v_mount);
+ return (false);
+ }
+ MNT_IUNLOCK(vp->v_mount);
+
+ /* Now see if a named attribute directory exists. */
+ cn.cn_flags = LOCKLEAF;
+ cn.cn_lkflags = LK_SHARED;
+ cn.cn_cred = cred;
+ error = zfs_lookup_nameddir(vp, &cn, &xvp);
+ if (error != 0)
+ return (false);
+
+ /* It exists, so see if there is any entry other than "." and "..". */
+ buf = malloc(DEV_BSIZE, M_TEMP, M_WAITOK);
+ ret = false;
+ offs = 0;
+ do {
+ rsize = zfs_readdir_named(xvp, buf, DEV_BSIZE, &offs, &eofflag,
+ cred, curthread);
+ if (rsize <= 0)
+ break;
+ cp = buf;
+ endcp = &buf[rsize];
+ while (cp < endcp) {
+ dp = (struct dirent *)cp;
+ if (dp->d_fileno != 0 && (dp->d_type == DT_REG ||
+ dp->d_type == DT_UNKNOWN) &&
+ !ZFS_XA_NS_PREFIX_FORBIDDEN(dp->d_name) &&
+ ((dp->d_namlen == 1 && dp->d_name[0] != '.') ||
+ (dp->d_namlen == 2 && (dp->d_name[0] != '.' ||
+ dp->d_name[1] != '.')) || dp->d_namlen > 2)) {
+ ret = true;
+ break;
+ }
+ cp += dp->d_reclen;
+ }
+ } while (!ret && rsize > 0 && eofflag == 0);
+ vput(xvp);
+ free(buf, M_TEMP);
+ return (ret);
+}
+
+static int
+zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
+{
+ struct componentname *cnp = ap->a_cnp;
+ char nm[NAME_MAX + 1];
+ int error;
+ struct vnode **vpp = ap->a_vpp, *dvp = ap->a_dvp, *xvp;
+ bool is_nameddir, needs_nameddir, opennamed = false;
+
+ /*
+ * These variables are used to handle the named attribute cases:
+ * opennamed - Is true when this is a call from open with O_NAMEDATTR
+ * specified and it is the last component.
+ * is_nameddir - Is true when the directory is a named attribute dir.
+ * needs_nameddir - Is set when the lookup needs to look for/create
+ * a named attribute directory. It is only set when is_nameddir
+ * is false and opennamed is true.
+ * xvp - Is the directory that the lookup needs to be done in.
+ * Usually dvp, unless needs_nameddir is true, in which case it is the
+ * result of the first non-named directory lookup.
+ * Note that name caching must be disabled for named attribute
+ * handling.
+ */
+ needs_nameddir = false;
+ xvp = dvp;
+ opennamed = (cnp->cn_flags & (OPENNAMED | ISLASTCN)) ==
+ (OPENNAMED | ISLASTCN);
+ is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0;
+ if (is_nameddir && (cnp->cn_flags & ISLASTCN) == 0)
+ return (ENOATTR);
+ if (opennamed && !is_nameddir && (cnp->cn_flags & ISDOTDOT) != 0)
+ return (ENOATTR);
+ if (opennamed || is_nameddir)
+ cnp->cn_flags &= ~MAKEENTRY;
+ if (opennamed && !is_nameddir)
+ needs_nameddir = true;
+ ASSERT3U(cnp->cn_namelen, <, sizeof (nm));
+ error = 0;
+ *vpp = NULL;
+ if (needs_nameddir) {
+ if (VOP_ISLOCKED(dvp) != LK_EXCLUSIVE)
+ vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+ error = zfs_lookup_nameddir(dvp, cnp, &xvp);
+ if (error == 0)
+ is_nameddir = true;
+ }
+ if (error == 0) {
+ if (!needs_nameddir || cnp->cn_namelen != 1 ||
+ *cnp->cn_nameptr != '.') {
+ strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1,
+ sizeof (nm)));
+ error = zfs_lookup(xvp, nm, vpp, cnp, cnp->cn_nameiop,
+ cnp->cn_cred, 0, cached);
+ if (is_nameddir && error == 0 &&
+ (cnp->cn_namelen != 1 || *cnp->cn_nameptr != '.') &&
+ (cnp->cn_flags & ISDOTDOT) == 0) {
+ if ((*vpp)->v_type == VDIR)
+ vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
+ else
+ vn_irflag_set_cond(*vpp,
+ VIRF_NAMEDATTR);
+ }
+ if (needs_nameddir && xvp != *vpp)
+ vput(xvp);
+ } else {
+ /*
+ * Lookup of "." when a named attribute dir is needed.
+ */
+ *vpp = xvp;
+ }
+ }
+ return (error);
+}
+#else
static int
zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
{
@@ -4753,6 +5011,7 @@ zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
cnp->cn_cred, 0, cached));
}
+#endif
static int
zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
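A concrete trace may help tie the pieces together. Opening the attribute "comment" of /tank/file with O_NAMEDATTR drives the new zfs_freebsd_lookup() roughly as follows (flow reconstructed from the code above, not quoted from the patch):

/*
 * 1. The final component "comment" arrives with OPENNAMED|ISLASTCN;
 *    dvp is the base file's vnode, so is_nameddir is false,
 *    needs_nameddir becomes true, and MAKEENTRY is cleared (named
 *    attribute lookups bypass the name cache).
 * 2. zfs_lookup_nameddir() resolves (or, with CREATENAMED, creates)
 *    the file's extended attribute directory into xvp and marks it
 *    VIRF_NAMEDDIR.
 * 3. zfs_lookup() then runs inside xvp for "comment"; a non-directory
 *    result is marked VIRF_NAMEDATTR, and xvp is vput() afterwards
 *    unless it is the vnode being returned.
 */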
@@ -4775,7 +5034,11 @@ zfs_cache_lookup(struct vop_lookup_args *ap)
zfsvfs_t *zfsvfs;
zfsvfs = ap->a_dvp->v_mount->mnt_data;
+#if __FreeBSD_version >= 1500040
+ if (zfsvfs->z_use_namecache && (ap->a_cnp->cn_flags & OPENNAMED) == 0)
+#else
if (zfsvfs->z_use_namecache)
+#endif
return (vfs_cache_lookup(ap));
else
return (zfs_freebsd_lookup(ap, B_FALSE));
@@ -4798,6 +5061,11 @@ zfs_freebsd_create(struct vop_create_args *ap)
vattr_t *vap = ap->a_vap;
znode_t *zp = NULL;
int rc, mode;
+ struct vnode *dvp = ap->a_dvp;
+#if __FreeBSD_version >= 1500040
+ struct vnode *xvp;
+ bool is_nameddir;
+#endif
#if __FreeBSD_version < 1400068
ASSERT(cnp->cn_flags & SAVENAME);
@@ -4808,10 +5076,36 @@ zfs_freebsd_create(struct vop_create_args *ap)
zfsvfs = ap->a_dvp->v_mount->mnt_data;
*ap->a_vpp = NULL;
- rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, 0, mode,
- &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL);
+ rc = 0;
+#if __FreeBSD_version >= 1500040
+ xvp = NULL;
+ is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0;
+ if (!is_nameddir && (cnp->cn_flags & OPENNAMED) != 0) {
+ /* Needs a named attribute directory. */
+ rc = zfs_lookup_nameddir(dvp, cnp, &xvp);
+ if (rc == 0) {
+ dvp = xvp;
+ is_nameddir = true;
+ }
+ }
+ if (is_nameddir && rc == 0)
+ rc = zfs_check_attrname(cnp->cn_nameptr);
+#endif
+
if (rc == 0)
+ rc = zfs_create(VTOZ(dvp), cnp->cn_nameptr, vap, 0, mode,
+ &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL);
+#if __FreeBSD_version >= 1500040
+ if (xvp != NULL)
+ vput(xvp);
+#endif
+ if (rc == 0) {
*ap->a_vpp = ZTOV(zp);
+#if __FreeBSD_version >= 1500040
+ if (is_nameddir)
+ vn_irflag_set_cond(*ap->a_vpp, VIRF_NAMEDATTR);
+#endif
+ }
if (zfsvfs->z_use_namecache &&
rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
@@ -4830,13 +5124,21 @@ struct vop_remove_args {
static int
zfs_freebsd_remove(struct vop_remove_args *ap)
{
+ int error = 0;
#if __FreeBSD_version < 1400068
ASSERT(ap->a_cnp->cn_flags & SAVENAME);
#endif
- return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
- ap->a_cnp->cn_cred));
+#if __FreeBSD_version >= 1500040
+ if ((vn_irflag_read(ap->a_dvp) & VIRF_NAMEDDIR) != 0)
+ error = zfs_check_attrname(ap->a_cnp->cn_nameptr);
+#endif
+
+ if (error == 0)
+ error = zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
+ ap->a_cnp->cn_cred);
+ return (error);
}
#ifndef _SYS_SYSPROTO_H_
@@ -4994,6 +5296,11 @@ zfs_freebsd_getattr(struct vop_getattr_args *ap)
#undef FLAG_CHECK
*vap = xvap.xva_vattr;
vap->va_flags = fflags;
+
+#if __FreeBSD_version >= 1500040
+ if ((vn_irflag_read(ap->a_vp) & (VIRF_NAMEDDIR | VIRF_NAMEDATTR)) != 0)
+ vap->va_bsdflags |= SFBSD_NAMEDATTR;
+#endif
return (0);
}
@@ -5136,15 +5443,24 @@ zfs_freebsd_rename(struct vop_rename_args *ap)
vnode_t *fvp = ap->a_fvp;
vnode_t *tdvp = ap->a_tdvp;
vnode_t *tvp = ap->a_tvp;
- int error;
+ int error = 0;
#if __FreeBSD_version < 1400068
ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
#endif
- error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
- ap->a_tcnp, ap->a_fcnp->cn_cred);
+#if __FreeBSD_version >= 1500040
+ if ((vn_irflag_read(fdvp) & VIRF_NAMEDDIR) != 0) {
+ error = zfs_check_attrname(ap->a_fcnp->cn_nameptr);
+ if (error == 0)
+ error = zfs_check_attrname(ap->a_tcnp->cn_nameptr);
+ }
+#endif
+
+ if (error == 0)
+ error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
+ ap->a_tcnp, ap->a_fcnp->cn_cred);
vrele(fdvp);
vrele(fvp);
@@ -5398,12 +5714,33 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
return (0);
}
return (EINVAL);
+#if __FreeBSD_version >= 1500040
+ case _PC_NAMEDATTR_ENABLED:
+ MNT_ILOCK(ap->a_vp->v_mount);
+ if ((ap->a_vp->v_mount->mnt_flag & MNT_NAMEDATTR) != 0)
+ *ap->a_retval = 1;
+ else
+ *ap->a_retval = 0;
+ MNT_IUNLOCK(ap->a_vp->v_mount);
+ return (0);
+ case _PC_HAS_NAMEDATTR:
+ if (zfs_has_namedattr(ap->a_vp, curthread->td_ucred))
+ *ap->a_retval = 1;
+ else
+ *ap->a_retval = 0;
+ return (0);
+#endif
+#ifdef _PC_HAS_HIDDENSYSTEM
+ case _PC_HAS_HIDDENSYSTEM:
+ *ap->a_retval = 1;
+ return (0);
+#endif
default:
return (vop_stdpathconf(ap));
}
}
-static int zfs_xattr_compat = 1;
+int zfs_xattr_compat = 1;
static int
zfs_check_attrname(const char *name)
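For completeness, a hedged sketch of opening a named attribute from userspace on FreeBSD 15. It assumes the openat(2) convention implied by the OPENNAMED handling above, in which the descriptor names the base file and the path names the attribute; "comment" and "/tank/file" are placeholders, so consult open(2) on a 15.x system before relying on this:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int basefd, attrfd;

	basefd = open("/tank/file", O_RDONLY);
	if (basefd == -1) {
		perror("open");
		return (1);
	}
	/* Create-or-open the named attribute "comment" of the file. */
	attrfd = openat(basefd, "comment",
	    O_NAMEDATTR | O_CREAT | O_RDWR, 0644);
	if (attrfd == -1)
		perror("openat");
	else
		close(attrfd);
	close(basefd);
	return (0);
}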
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
index 9bad1e13d7cc..775f54a65f7d 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
@@ -150,8 +150,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
zp->z_xattr_cached = NULL;
zp->z_xattr_parent = 0;
zp->z_vnode = NULL;
- zp->z_sync_writes_cnt = 0;
- zp->z_async_writes_cnt = 0;
return (0);
}
@@ -172,9 +170,6 @@ zfs_znode_cache_destructor(void *buf, void *arg)
ASSERT3P(zp->z_acl_cached, ==, NULL);
ASSERT3P(zp->z_xattr_cached, ==, NULL);
-
- ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
- ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
}
@@ -456,8 +451,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
zp->z_blksz = blksz;
zp->z_seq = 0x7A4653;
zp->z_sync_cnt = 0;
- zp->z_sync_writes_cnt = 0;
- zp->z_async_writes_cnt = 0;
atomic_store_ptr(&zp->z_cached_symlink, NULL);
zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
index 212ef560db07..72a7c4ea082a 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
@@ -1248,9 +1248,11 @@ zvol_os_is_zvol(const char *device)
return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}
-void
+int
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
+ int error = 0;
+
ASSERT(RW_LOCK_HELD(&zvol_state_lock));
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -1304,14 +1306,94 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
args.mda_gid = GID_OPERATOR;
args.mda_mode = 0640;
args.mda_si_drv2 = zv;
- if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
- == 0) {
+ error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname);
+ if (error == 0) {
dev->si_iosize_max = maxphys;
zsd->zsd_cdev = dev;
}
}
strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
dataset_kstats_rename(&zv->zv_kstat, newname);
+
+ return (error);
+}
+
+/*
+ * Allocate memory for a new zvol_state_t and setup the required
+ * request queue and generic disk structures for the block device.
+ */
+static int
+zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize,
+ zvol_state_t **zvp)
+{
+ zvol_state_t *zv;
+ uint64_t volmode;
+ int error;
+
+ error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE),
+ &volmode, NULL);
+ if (error)
+ return (error);
+
+ if (volmode == ZFS_VOLMODE_DEFAULT)
+ volmode = zvol_volmode;
+
+ if (volmode == ZFS_VOLMODE_NONE)
+ return (0);
+
+ zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
+ mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
+ zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
+ zv->zv_volmode = volmode;
+ zv->zv_volsize = volsize;
+ zv->zv_volblocksize = volblocksize;
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp;
+ struct g_geom *gp;
+
+ g_topology_lock();
+ gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+ gp->start = zvol_geom_bio_start;
+ gp->access = zvol_geom_access;
+ pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
+ pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+ pp->sectorsize = DEV_BSIZE;
+ pp->mediasize = 0;
+ pp->private = zv;
+
+ zsg->zsg_provider = pp;
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+ struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+ struct cdev *dev;
+ struct make_dev_args args;
+
+ make_dev_args_init(&args);
+ args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+ args.mda_devsw = &zvol_cdevsw;
+ args.mda_cr = NULL;
+ args.mda_uid = UID_ROOT;
+ args.mda_gid = GID_OPERATOR;
+ args.mda_mode = 0640;
+ args.mda_si_drv2 = zv;
+ error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
+ if (error) {
+ kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+ kmem_free(zv, sizeof (zvol_state_t));
+ return (error);
+ }
+
+ dev->si_iosize_max = maxphys;
+ zsd->zsd_cdev = dev;
+ knlist_init_sx(&zsd->zsd_selinfo.si_note, &zv->zv_state_lock);
+ }
+ (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
+ rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
+ zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
+
+ *zvp = zv;
+ return (error);
}
/*
@@ -1364,11 +1446,11 @@ zvol_os_free(zvol_state_t *zv)
int
zvol_os_create_minor(const char *name)
{
- zvol_state_t *zv;
+ zvol_state_t *zv = NULL;
objset_t *os;
dmu_object_info_t *doi;
uint64_t volsize;
- uint64_t volmode, hash, len;
+ uint64_t hash, len;
int error;
bool replayed_zil = B_FALSE;
@@ -1400,67 +1482,15 @@ zvol_os_create_minor(const char *name)
if (error)
goto out_dmu_objset_disown;
- error = dsl_prop_get_integer(name,
- zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
- if (error || volmode == ZFS_VOLMODE_DEFAULT)
- volmode = zvol_volmode;
- error = 0;
+ error = zvol_alloc(name, volsize, doi->doi_data_block_size, &zv);
+ if (error || zv == NULL)
+ goto out_dmu_objset_disown;
- /*
- * zvol_alloc equivalent ...
- */
- zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
zv->zv_hash = hash;
- mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
- zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
- zv->zv_volmode = volmode;
- if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
- struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
- struct g_provider *pp;
- struct g_geom *gp;
-
- g_topology_lock();
- gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
- gp->start = zvol_geom_bio_start;
- gp->access = zvol_geom_access;
- pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
- pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
- pp->sectorsize = DEV_BSIZE;
- pp->mediasize = 0;
- pp->private = zv;
-
- zsg->zsg_provider = pp;
- } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
- struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
- struct cdev *dev;
- struct make_dev_args args;
-
- make_dev_args_init(&args);
- args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
- args.mda_devsw = &zvol_cdevsw;
- args.mda_cr = NULL;
- args.mda_uid = UID_ROOT;
- args.mda_gid = GID_OPERATOR;
- args.mda_mode = 0640;
- args.mda_si_drv2 = zv;
- if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
- == 0) {
- dev->si_iosize_max = maxphys;
- zsd->zsd_cdev = dev;
- knlist_init_sx(&zsd->zsd_selinfo.si_note,
- &zv->zv_state_lock);
- }
- }
- (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
- rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
- zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
zv->zv_flags |= ZVOL_RDONLY;
- zv->zv_volblocksize = doi->doi_data_block_size;
- zv->zv_volsize = volsize;
zv->zv_objset = os;
ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
@@ -1490,13 +1520,14 @@ zvol_os_create_minor(const char *name)
out_dmu_objset_disown:
dmu_objset_disown(os, B_TRUE, FTAG);
- if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
+ if (error == 0 && zv && zv->zv_volmode == ZFS_VOLMODE_GEOM) {
g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0);
+ /* The GEOM topology was locked inside zvol_alloc(). */
g_topology_unlock();
}
out_doi:
kmem_free(doi, sizeof (dmu_object_info_t));
- if (error == 0) {
+ if (error == 0 && zv) {
rw_enter(&zvol_state_lock, RW_WRITER);
zvol_insert(zv);
zvol_minors++;
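The reworked zvol_alloc() reports three distinct outcomes through its return value and *zvp, which is why zvol_os_create_minor() now tests both. The caller-side contract, as a sketch:

	zvol_state_t *zv = NULL;
	int error;

	error = zvol_alloc(name, volsize, volblocksize, &zv);
	if (error != 0)
		goto out;	/* hard failure (volmode lookup, make_dev_s) */
	if (zv == NULL)
		goto out;	/* volmode=none: no minor wanted, not an error */
	/*
	 * Success; note that for ZFS_VOLMODE_GEOM the GEOM topology is
	 * still locked and the caller must g_topology_unlock() later.
	 */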
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
index 337a4bcf76a0..9fe008cef868 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
@@ -302,13 +302,8 @@ spl_kmem_free_impl(const void *buf, size_t size)
#ifdef DEBUG_KMEM
/* Shim layer memory accounting */
-#ifdef HAVE_ATOMIC64_T
atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
-unsigned long long kmem_alloc_max = 0;
-#else /* HAVE_ATOMIC64_T */
-atomic_t kmem_alloc_used = ATOMIC_INIT(0);
-unsigned long long kmem_alloc_max = 0;
-#endif /* HAVE_ATOMIC64_T */
+uint64_t kmem_alloc_max = 0;
EXPORT_SYMBOL(kmem_alloc_used);
EXPORT_SYMBOL(kmem_alloc_max);
@@ -320,9 +315,9 @@ spl_kmem_alloc_debug(size_t size, int flags, int node)
ptr = spl_kmem_alloc_impl(size, flags, node);
if (ptr) {
- kmem_alloc_used_add(size);
- if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
- kmem_alloc_max = kmem_alloc_used_read();
+ atomic64_add(size, &kmem_alloc_used);
+ if (unlikely(atomic64_read(&kmem_alloc_used) > kmem_alloc_max))
+ kmem_alloc_max = atomic64_read(&kmem_alloc_used);
}
return (ptr);
@@ -331,7 +326,7 @@ spl_kmem_alloc_debug(size_t size, int flags, int node)
inline void
spl_kmem_free_debug(const void *ptr, size_t size)
{
- kmem_alloc_used_sub(size);
+ atomic64_sub(size, &kmem_alloc_used);
spl_kmem_free_impl(ptr, size);
}
@@ -595,7 +590,7 @@ spl_kmem_init(void)
{
#ifdef DEBUG_KMEM
- kmem_alloc_used_set(0);
+ atomic64_set(&kmem_alloc_used, 0);
@@ -617,9 +612,10 @@ spl_kmem_fini(void)
* at that address to aid in debugging. Performance is not
* a serious concern here since it is module unload time.
*/
- if (kmem_alloc_used_read() != 0)
+ if (atomic64_read(&kmem_alloc_used) != 0)
printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
- (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);
+ (unsigned long)atomic64_read(&kmem_alloc_used),
+ kmem_alloc_max);
#ifdef DEBUG_KMEM_TRACKING
spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
index 4ed0deedd5b9..8cdd5fc5cfe5 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
@@ -82,11 +82,7 @@ proc_domemused(CONST_CTL_TABLE *table, int write,
if (write) {
*ppos += *lenp;
} else {
-#ifdef HAVE_ATOMIC64_T
val = atomic64_read((atomic64_t *)table->data);
-#else
- val = atomic_read((atomic_t *)table->data);
-#endif /* HAVE_ATOMIC64_T */
rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
}
@@ -315,18 +311,14 @@ static struct ctl_table spl_kmem_table[] = {
{
.procname = "kmem_used",
.data = &kmem_alloc_used,
-#ifdef HAVE_ATOMIC64_T
.maxlen = sizeof (atomic64_t),
-#else
- .maxlen = sizeof (atomic_t),
-#endif /* HAVE_ATOMIC64_T */
.mode = 0444,
.proc_handler = &proc_domemused,
},
{
.procname = "kmem_max",
.data = &kmem_alloc_max,
- .maxlen = sizeof (unsigned long),
+ .maxlen = sizeof (uint64_t),
.extra1 = &table_min,
.extra2 = &table_max,
.mode = 0444,
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
index 1398483a3ac8..f42f455222de 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
@@ -28,6 +28,7 @@
#include <sys/kmem.h>
#include <sys/tsd.h>
#include <sys/string.h>
+#include <sys/misc.h>
/*
* Thread interfaces
@@ -197,3 +198,14 @@ issig(void)
}
EXPORT_SYMBOL(issig);
+
+/*
+ * Check if the current thread is a memory reclaim thread.
+ * Returns true if current thread is kswapd.
+ */
+int
+current_is_reclaim_thread(void)
+{
+ return (current_is_kswapd());
+}
+EXPORT_SYMBOL(current_is_reclaim_thread);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
index e1140b31a97a..248c9b7a6d3b 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
@@ -256,10 +256,6 @@ abd_unmark_zfs_page(struct page *page)
#ifndef CONFIG_HIGHMEM
-#ifndef __GFP_RECLAIM
-#define __GFP_RECLAIM __GFP_WAIT
-#endif
-
/*
* The goal is to minimize fragmentation by preferentially populating ABDs
* with higher order compound pages from a single zone. Allocation size is
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
index 84b25cb2c5ac..6552a933ce0a 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
@@ -511,8 +511,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
zp->z_pflags = 0;
zp->z_mode = 0;
zp->z_sync_cnt = 0;
- zp->z_sync_writes_cnt = 0;
- zp->z_async_writes_cnt = 0;
ip->i_generation = 0;
ip->i_ino = id;
ip->i_mode = (S_IFDIR | S_IRWXUGO);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
index d193eb80dca2..c729947369c2 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
@@ -260,24 +260,12 @@ zfs_file_fsync(zfs_file_t *filp, int flags)
{
int datasync = 0;
int error;
- int fstrans;
if (flags & O_DSYNC)
datasync = 1;
- /*
- * May enter XFS which generates a warning when PF_FSTRANS is set.
- * To avoid this the flag is cleared over vfs_sync() and then reset.
- */
- fstrans = __spl_pf_fstrans_check();
- if (fstrans)
- current->flags &= ~(__SPL_PF_FSTRANS);
-
error = -vfs_fsync(filp, datasync);
- if (fstrans)
- current->flags |= __SPL_PF_FSTRANS;
-
return (error);
}
@@ -292,14 +280,6 @@ int
zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len)
{
/*
- * May enter XFS which generates a warning when PF_FSTRANS is set.
- * To avoid this the flag is cleared over vfs_sync() and then reset.
- */
- int fstrans = __spl_pf_fstrans_check();
- if (fstrans)
- current->flags &= ~(__SPL_PF_FSTRANS);
-
- /*
* When supported by the underlying file system preferentially
* use the fallocate() callback to preallocate the space.
*/
@@ -308,9 +288,6 @@ zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len)
error = -fp->f_op->fallocate(fp,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len);
- if (fstrans)
- current->flags |= __SPL_PF_FSTRANS;
-
if (error)
return (SET_ERROR(error));
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
index a3837f784668..396faef8f646 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
@@ -1217,6 +1217,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
}
/*
+ * Dentry and inode caches referenced by a task in non-root memcg are
+ * not going to be scanned by the kernel-provided shrinker. So, if
+ * the kernel prunes nothing, fall back to this manual walk to free dnodes.
+ * To avoid scanning the same znodes multiple times they are always rotated
+ * to the end of the z_all_znodes list. New znodes are inserted at the
+ * end of the list so we're always scanning the oldest znodes first.
+ */
+static int
+zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
+{
+ znode_t **zp_array, *zp;
+ int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *));
+ int objects = 0;
+ int i = 0, j = 0;
+
+ zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
+
+ if ((i++ > nr_to_scan) || (j >= max_array))
+ break;
+
+ ASSERT(list_link_active(&zp->z_link_node));
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+
+ /* Skip active znodes and .zfs entries */
+ if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir)
+ continue;
+
+ if (igrab(ZTOI(zp)) == NULL)
+ continue;
+
+ zp_array[j] = zp;
+ j++;
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ for (i = 0; i < j; i++) {
+ zp = zp_array[i];
+
+ ASSERT3P(zp, !=, NULL);
+ d_prune_aliases(ZTOI(zp));
+
+ if (atomic_read(&ZTOI(zp)->i_count) == 1)
+ objects++;
+
+ zrele(zp);
+ }
+
+ vmem_free(zp_array, max_array * sizeof (znode_t *));
+
+ return (objects);
+}
+
+/*
* The ARC has requested that the filesystem drop entries from the dentry
* and inode caches. This can occur when the ARC needs to free meta data
* blocks but can't because they are all pinned by entries in these caches.
@@ -1267,6 +1324,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
*objects = (*shrinker->scan_objects)(shrinker, &sc);
#endif
+ /*
+ * Fall back to zfs_prune_aliases if the kernel's shrinker did nothing
+ * due to dentry and inode caches being referenced by a task running
+ * in non-root memcg.
+ */
+ if (*objects == 0)
+ *objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
+
zfs_exit(zfsvfs, FTAG);
dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
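The rotate-to-tail step in zfs_prune_aliases() is what keeps repeated scans from revisiting the same znodes: every examined entry moves to the list tail, so the next pass resumes at the next-oldest one. The same pattern in isolation (generic userspace C built on sys/queue.h; illustrative only):

#include <sys/queue.h>
#include <stddef.h>

struct entry {
	TAILQ_ENTRY(entry) link;
	int busy;
};
TAILQ_HEAD(qhead, entry);

/* Visit the oldest entry, rotating it to the tail either way. */
static struct entry *
scan_one(struct qhead *q)
{
	struct entry *e = TAILQ_FIRST(q);

	if (e == NULL)
		return (NULL);
	TAILQ_REMOVE(q, e, link);
	TAILQ_INSERT_TAIL(q, e, link);
	/* Skip busy entries, like the MUTEX_HELD/z_is_ctldir test above. */
	return (e->busy ? NULL : e);
}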
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
index ed9721dade76..6a2fc5ad7935 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
@@ -25,6 +25,7 @@
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2015 by Chunwei Chen. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2025, Klara, Inc.
*/
/* Portions Copyright 2007 Jeremy Teo */
@@ -3691,7 +3692,7 @@ top:
}
static void
-zfs_putpage_sync_commit_cb(void *arg)
+zfs_putpage_commit_cb(void *arg)
{
struct page *pp = arg;
@@ -3699,17 +3700,6 @@ zfs_putpage_sync_commit_cb(void *arg)
end_page_writeback(pp);
}
-static void
-zfs_putpage_async_commit_cb(void *arg)
-{
- struct page *pp = arg;
- znode_t *zp = ITOZ(pp->mapping->host);
-
- ClearPageError(pp);
- end_page_writeback(pp);
- atomic_dec_32(&zp->z_async_writes_cnt);
-}
-
/*
* Push a page out to disk, once the page is on stable storage the
* registered commit callback will be run as notification of completion.
@@ -3827,15 +3817,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
zfs_rangelock_exit(lr);
if (wbc->sync_mode != WB_SYNC_NONE) {
- /*
- * Speed up any non-sync page writebacks since
- * they may take several seconds to complete.
- * Refer to the comment in zpl_fsync() for details.
- */
- if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
- zil_commit(zfsvfs->z_log, zp->z_id);
- }
-
if (PageWriteback(pp))
#ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
folio_wait_bit(page_folio(pp), PG_writeback);
@@ -3861,8 +3842,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
* was in fact not skipped and should not be counted as if it were.
*/
wbc->pages_skipped--;
- if (!for_sync)
- atomic_inc_32(&zp->z_async_writes_cnt);
set_page_writeback(pp);
unlock_page(pp);
@@ -3881,8 +3860,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
#endif
ClearPageError(pp);
end_page_writeback(pp);
- if (!for_sync)
- atomic_dec_32(&zp->z_async_writes_cnt);
zfs_rangelock_exit(lr);
zfs_exit(zfsvfs, FTAG);
return (err);
@@ -3908,35 +3885,61 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
- boolean_t commit = B_FALSE;
- if (wbc->sync_mode != WB_SYNC_NONE) {
- /*
- * Note that this is rarely called under writepages(), because
- * writepages() normally handles the entire commit for
- * performance reasons.
- */
- commit = B_TRUE;
- } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
- /*
- * If the caller does not intend to wait synchronously
- * for this page writeback to complete and there are active
- * synchronous calls on this file, do a commit so that
- * the latter don't accidentally end up waiting for
- * our writeback to complete. Refer to the comment in
- * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
- */
- commit = B_TRUE;
- }
+ /*
+ * A note about for_sync vs wbc->sync_mode.
+ *
+ * for_sync indicates that this is a syncing writeback, that is, the kernel
+ * caller expects the data to be durably stored before being notified.
+ * Often, but not always, the call was triggered by a userspace syncing
+ * op (eg fsync(), msync(MS_SYNC)). For our purposes, for_sync==TRUE
+ * means that the page should remain "locked" (in the writeback state)
+ * until it is definitely on disk (ie zil_commit() or spa_sync()).
+ * Otherwise, we can unlock and return as soon as it is on the
+ * in-memory ZIL.
+ *
+ * wbc->sync_mode has similar meaning. wbc is passed from the kernel to
+ * zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE
+ * indicates this a regular async writeback (eg a cache eviction) and
+ * so does not need a durability guarantee, while WB_SYNC_ALL indicates
+ * a syncing op that must be waited on (by convention, we test for
+ * !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer durability over
+ * performance should there ever be a new mode that we have not yet
+ * added support for).
+ *
+ * So, why a separate for_sync field? This is because zpl_writepages()
+ * calls zfs_putpage() multiple times for a single "logical" operation.
+ * It wants all the individual pages to be for_sync==TRUE ie only
+ * unlocked once durably stored, but it only wants one call to
+ * zil_commit() at the very end, once all the pages are synced. So,
+ * it repurposes sync_mode slightly to indicate who issues and waits for
+ * the IO: for NONE, the caller of zfs_putpage() will do it, while for
+ * ALL, zfs_putpage should do it.
+ *
+ * Summary:
+ * for_sync: 0=unlock immediately; 1=unlock once on disk
+ * sync_mode: NONE=caller will commit; ALL=we will commit
+ */
+ boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE);
+
+ /*
+ * We use for_sync as the "commit" arg to zfs_log_write() (arg 7)
+ * because it is a policy flag that indicates "someone will call
+ * zil_commit() soon". for_sync=TRUE means exactly that; the only
+ * question is whether it will be us, or zpl_writepages().
+ */
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, for_sync,
+ B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp);
- zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
- B_FALSE, for_sync ? zfs_putpage_sync_commit_cb :
- zfs_putpage_async_commit_cb, pp);
+ if (!for_sync) {
+ ClearPageError(pp);
+ end_page_writeback(pp);
+ }
dmu_tx_commit(tx);
zfs_rangelock_exit(lr);
- if (commit)
+ if (need_commit)
zil_commit(zfsvfs->z_log, zp->z_id);
dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
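The four combinations described in the comment above collapse into one table (derived from the code; whether every row occurs in practice depends on the kernel caller):

/*
 *  for_sync  sync_mode      page unlocked        zil_commit() issued by
 *  --------  -------------  -------------------  ----------------------
 *     0      WB_SYNC_NONE   immediately          nobody yet (async ZIL)
 *     0      !WB_SYNC_NONE  immediately          zfs_putpage()
 *     1      WB_SYNC_NONE   commit callback      zpl_writepages(), later
 *     1      !WB_SYNC_NONE  commit callback      zfs_putpage()
 */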
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c
index 54e60b4820f6..7683eeb3cf9f 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c
@@ -126,8 +126,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
zp->z_acl_cached = NULL;
zp->z_xattr_cached = NULL;
zp->z_xattr_parent = 0;
- zp->z_sync_writes_cnt = 0;
- zp->z_async_writes_cnt = 0;
return (0);
}
@@ -149,9 +147,6 @@ zfs_znode_cache_destructor(void *buf, void *arg)
ASSERT3P(zp->z_dirlocks, ==, NULL);
ASSERT3P(zp->z_acl_cached, ==, NULL);
ASSERT3P(zp->z_xattr_cached, ==, NULL);
-
- ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
- ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
}
static int
@@ -371,6 +366,12 @@ zfs_inode_alloc(struct super_block *sb, struct inode **ip)
return (0);
}
+void
+zfs_inode_free(struct inode *ip)
+{
+ kmem_cache_free(znode_cache, ITOZ(ip));
+}
+
/*
* Called in multiple places when an inode should be destroyed.
*/
@@ -395,8 +396,15 @@ zfs_inode_destroy(struct inode *ip)
nvlist_free(zp->z_xattr_cached);
zp->z_xattr_cached = NULL;
}
-
- kmem_cache_free(znode_cache, zp);
+#ifndef HAVE_SOPS_FREE_INODE
+ /*
+ * The inode needs to be freed in an RCU callback. If we have
+ * super_operations->free_inode, the Linux kernel will do call_rcu
+ * for us. But if we don't have it, since call_rcu is a GPL-only
+ * symbol, we can only free synchronously and accept the risk.
+ */
+ zfs_inode_free(ip);
+#endif
}
static void
@@ -535,8 +543,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
zp->z_blksz = blksz;
zp->z_seq = 0x7A4653;
zp->z_sync_cnt = 0;
- zp->z_sync_writes_cnt = 0;
- zp->z_async_writes_cnt = 0;
zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
index 1a82c13e1523..ef7bd7352084 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
@@ -111,52 +111,11 @@ zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
struct inode *inode = filp->f_mapping->host;
znode_t *zp = ITOZ(inode);
- zfsvfs_t *zfsvfs = ITOZSB(inode);
cred_t *cr = CRED();
int error;
fstrans_cookie_t cookie;
- /*
- * The variables z_sync_writes_cnt and z_async_writes_cnt work in
- * tandem so that sync writes can detect if there are any non-sync
- * writes going on and vice-versa. The "vice-versa" part to this logic
- * is located in zfs_putpage() where non-sync writes check if there are
- * any ongoing sync writes. If any sync and non-sync writes overlap,
- * we do a commit to complete the non-sync writes since the latter can
- * potentially take several seconds to complete and thus block sync
- * writes in the upcoming call to filemap_write_and_wait_range().
- */
- atomic_inc_32(&zp->z_sync_writes_cnt);
- /*
- * If the following check does not detect an overlapping non-sync write
- * (say because it's just about to start), then it is guaranteed that
- * the non-sync write will detect this sync write. This is because we
- * always increment z_sync_writes_cnt / z_async_writes_cnt before doing
- * the check on z_async_writes_cnt / z_sync_writes_cnt here and in
- * zfs_putpage() respectively.
- */
- if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
- if ((error = zpl_enter(zfsvfs, FTAG)) != 0) {
- atomic_dec_32(&zp->z_sync_writes_cnt);
- return (error);
- }
- zil_commit(zfsvfs->z_log, zp->z_id);
- zpl_exit(zfsvfs, FTAG);
- }
-
error = filemap_write_and_wait_range(inode->i_mapping, start, end);
-
- /*
- * The sync write is not complete yet but we decrement
- * z_sync_writes_cnt since zfs_fsync() increments and decrements
- * it internally. If a non-sync write starts just after the decrement
- * operation but before we call zfs_fsync(), it may not detect this
- * overlapping sync write but it does not matter since we have already
- * gone past filemap_write_and_wait_range() and we won't block due to
- * the non-sync write.
- */
- atomic_dec_32(&zp->z_sync_writes_cnt);
-
if (error)
return (error);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
index a682bfd33c38..94dcdd0b887d 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
@@ -45,6 +45,15 @@ zpl_inode_alloc(struct super_block *sb)
return (ip);
}
+#ifdef HAVE_SOPS_FREE_INODE
+static void
+zpl_inode_free(struct inode *ip)
+{
+ ASSERT(atomic_read(&ip->i_count) == 0);
+ zfs_inode_free(ip);
+}
+#endif
+
static void
zpl_inode_destroy(struct inode *ip)
{
@@ -455,6 +464,9 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg)
const struct super_operations zpl_super_operations = {
.alloc_inode = zpl_inode_alloc,
+#ifdef HAVE_SOPS_FREE_INODE
+ .free_inode = zpl_inode_free,
+#endif
.destroy_inode = zpl_inode_destroy,
.dirty_inode = zpl_dirty_inode,
.write_inode = NULL,
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index 57a9711e9027..a7431cc4da9d 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -1302,27 +1302,30 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
* Allocate memory for a new zvol_state_t and setup the required
* request queue and generic disk structures for the block device.
*/
-static zvol_state_t *
-zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
+static int
+zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize,
+ zvol_state_t **zvp)
{
zvol_state_t *zv;
struct zvol_state_os *zso;
uint64_t volmode;
int ret;
- if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
- return (NULL);
+ ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL);
+ if (ret)
+ return (ret);
if (volmode == ZFS_VOLMODE_DEFAULT)
volmode = zvol_volmode;
if (volmode == ZFS_VOLMODE_NONE)
- return (NULL);
+ return (0);
zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
zv->zv_zso = zso;
zv->zv_volmode = volmode;
+ zv->zv_volsize = volsize;
zv->zv_volblocksize = volblocksize;
list_link_init(&zv->zv_next);
@@ -1396,12 +1399,13 @@ zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
ZVOL_DEV_NAME, (dev & MINORMASK));
- return (zv);
+ *zvp = zv;
+ return (ret);
out_kmem:
kmem_free(zso, sizeof (struct zvol_state_os));
kmem_free(zv, sizeof (zvol_state_t));
- return (NULL);
+ return (ret);
}
/*
@@ -1562,7 +1566,7 @@ zvol_os_add_disk(struct gendisk *disk)
int
zvol_os_create_minor(const char *name)
{
- zvol_state_t *zv;
+ zvol_state_t *zv = NULL;
objset_t *os;
dmu_object_info_t *doi;
uint64_t volsize;
@@ -1611,18 +1615,16 @@ zvol_os_create_minor(const char *name)
if (error)
goto out_dmu_objset_disown;
- zv = zvol_alloc(MKDEV(zvol_major, minor), name,
- doi->doi_data_block_size);
- if (zv == NULL) {
- error = SET_ERROR(EAGAIN);
+ error = zvol_alloc(MKDEV(zvol_major, minor), name,
+ volsize, doi->doi_data_block_size, &zv);
+ if (error || zv == NULL)
goto out_dmu_objset_disown;
- }
+
zv->zv_hash = hash;
if (dmu_objset_is_snapshot(os))
zv->zv_flags |= ZVOL_RDONLY;
- zv->zv_volsize = volsize;
zv->zv_objset = os;
/* Default */
@@ -1689,7 +1691,7 @@ out_doi:
* zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
* directly as well.
*/
- if (error == 0) {
+ if (error == 0 && zv) {
rw_enter(&zvol_state_lock, RW_WRITER);
zvol_insert(zv);
rw_exit(&zvol_state_lock);
@@ -1701,7 +1703,7 @@ out_doi:
return (error);
}
-void
+int
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
@@ -1728,6 +1730,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
set_disk_ro(zv->zv_zso->zvo_disk, readonly);
dataset_kstats_rename(&zv->zv_kstat, newname);
+
+ return (0);
}
void