path: root/sys/contrib/openzfs/module/os/linux/zfs
Diffstat (limited to 'sys/contrib/openzfs/module/os/linux/zfs')
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/abd_os.c        |  19
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c     |  12
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c       |  11
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c    |  10
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c       |  24
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c   |  26
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c     |   2
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c    |  98
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c  | 217
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c  |  82
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c    |  55
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c      | 261
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c     |   2
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c     | 101
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c     |   4
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c       | 342
16 files changed, 866 insertions, 400 deletions
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
index e1140b31a97a..18f2426fbbfc 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
@@ -23,6 +23,7 @@
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
*/
/*
@@ -256,10 +257,6 @@ abd_unmark_zfs_page(struct page *page)
#ifndef CONFIG_HIGHMEM
-#ifndef __GFP_RECLAIM
-#define __GFP_RECLAIM __GFP_WAIT
-#endif
-
/*
* The goal is to minimize fragmentation by preferentially populating ABDs
* with higher order compound pages from a single zone. Allocation size is
@@ -867,9 +864,9 @@ abd_iter_advance(struct abd_iter *aiter, size_t amount)
* Ensure that last chunk is not in use. abd_iterate_*() must clear
* this state (directly or abd_iter_unmap()) before advancing.
*/
- ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0P(aiter->iter_mapaddr);
ASSERT0(aiter->iter_mapsize);
- ASSERT3P(aiter->iter_page, ==, NULL);
+ ASSERT0P(aiter->iter_page);
ASSERT0(aiter->iter_page_doff);
ASSERT0(aiter->iter_page_dsize);
@@ -901,7 +898,7 @@ abd_iter_map(struct abd_iter *aiter)
void *paddr;
size_t offset = 0;
- ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0P(aiter->iter_mapaddr);
ASSERT0(aiter->iter_mapsize);
/* There's nothing left to iterate over, so do nothing */
@@ -1113,6 +1110,14 @@ abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
#define ABD_ITER_PAGE_SIZE(page) (PAGESIZE)
#endif
+#ifndef nth_page
+/*
+ * Since 6.18, nth_page() no longer exists and is no longer required to iterate
+ * within a single SG entry, so we replace it with a simple addition.
+ */
+#define nth_page(p, n) ((p)+(n))
+#endif
+
void
abd_iter_page(struct abd_iter *aiter)
{
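Many hunks in this change convert ASSERT3P(ptr, ==, NULL) / ASSERT(x == 0) style checks to the shorter ASSERT0P()/ASSERT0() forms, and VERIFY(... == 0) to VERIFY0(). As a minimal sketch of the intended semantics only, using plain assert() as a stand-in for the real OpenZFS debug macros (the actual definitions also print the failing expression and value):

/* Sketch only; not the actual OpenZFS definitions. */
#include <assert.h>
#include <stddef.h>

#define ASSERT0(x)   assert((x) == 0)     /* integer expression must be zero */
#define ASSERT0P(p)  assert((p) == NULL)  /* pointer must be NULL */
#define VERIFY0(x)   assert((x) == 0)     /* VERIFY* remains active in non-debug builds */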
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
index 154ca22d9513..1bd3500e9f66 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
@@ -471,13 +471,17 @@ vdev_disk_close(vdev_t *v)
if (v->vdev_reopening || vd == NULL)
return;
+ rw_enter(&vd->vd_lock, RW_WRITER);
+
if (vd->vd_bdh != NULL)
vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
zfs_vdev_holder);
+ v->vdev_tsd = NULL;
+
+ rw_exit(&vd->vd_lock);
rw_destroy(&vd->vd_lock);
kmem_free(vd, sizeof (vdev_disk_t));
- v->vdev_tsd = NULL;
}
/*
@@ -552,7 +556,7 @@ vdev_bio_associate_blkg(struct bio *bio)
#endif
ASSERT3P(q, !=, NULL);
- ASSERT3P(bio->bi_blkg, ==, NULL);
+ ASSERT0P(bio->bi_blkg);
if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
bio->bi_blkg = q->root_blkg;
@@ -574,7 +578,7 @@ vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
bio->bi_bdev = bdev;
ASSERT3P(q, !=, NULL);
- ASSERT3P(bio->bi_blkg, ==, NULL);
+ ASSERT0P(bio->bi_blkg);
if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
bio->bi_blkg = q->root_blkg;
@@ -806,7 +810,7 @@ vbio_completion(struct bio *bio)
* here; instead we stash vbio on the zio and take care of it in the
* done callback.
*/
- ASSERT3P(zio->io_bio, ==, NULL);
+ ASSERT0P(zio->io_bio);
zio->io_bio = vbio;
zio_delay_interrupt(zio);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
index 1b169122f25b..4c929a4642b1 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
@@ -1447,7 +1447,8 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
if (aclnode->z_ace_count == 0)
continue;
dmu_write(zfsvfs->z_os, aoid, off,
- aclnode->z_size, aclnode->z_acldata, tx);
+ aclnode->z_size, aclnode->z_acldata, tx,
+ DMU_READ_NO_PREFETCH);
off += aclnode->z_size;
}
} else {
@@ -1900,7 +1901,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
if (!(flag & IS_ROOT_NODE) &&
(dzp->z_pflags & ZFS_INHERIT_ACE) &&
!(dzp->z_pflags & ZFS_XATTR)) {
- VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE,
+ VERIFY0(zfs_acl_node_read(dzp, B_TRUE,
&paclp, B_FALSE));
acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
vap->va_mode, paclp, acl_ids->z_mode, &need_chmod);
@@ -2204,8 +2205,8 @@ top:
}
error = zfs_aclset_common(zp, aclp, cr, tx);
- ASSERT(error == 0);
- ASSERT(zp->z_acl_cached == NULL);
+ ASSERT0(error);
+ ASSERT0P(zp->z_acl_cached);
zp->z_acl_cached = aclp;
if (fuid_dirtied)
@@ -2524,7 +2525,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
* Also note: DOS R/O is ignored for directories.
*/
if ((v4_mode & WRITE_MASK_DATA) &&
- S_ISDIR(ZTOI(zp)->i_mode) &&
+ !S_ISDIR(ZTOI(zp)->i_mode) &&
(zp->z_pflags & ZFS_READONLY)) {
return (SET_ERROR(EPERM));
}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
index 84b25cb2c5ac..fb4de50480a3 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
@@ -494,9 +494,9 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
if (!creation)
now = current_time(ip);
zp = ITOZ(ip);
- ASSERT3P(zp->z_dirlocks, ==, NULL);
- ASSERT3P(zp->z_acl_cached, ==, NULL);
- ASSERT3P(zp->z_xattr_cached, ==, NULL);
+ ASSERT0P(zp->z_dirlocks);
+ ASSERT0P(zp->z_acl_cached);
+ ASSERT0P(zp->z_xattr_cached);
zp->z_id = id;
zp->z_unlinked = B_FALSE;
zp->z_atime_dirty = B_FALSE;
@@ -511,8 +511,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
zp->z_pflags = 0;
zp->z_mode = 0;
zp->z_sync_cnt = 0;
- zp->z_sync_writes_cnt = 0;
- zp->z_async_writes_cnt = 0;
ip->i_generation = 0;
ip->i_ino = id;
ip->i_mode = (S_IFDIR | S_IRWXUGO);
@@ -592,7 +590,7 @@ zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id,
int
zfsctl_create(zfsvfs_t *zfsvfs)
{
- ASSERT(zfsvfs->z_ctldir == NULL);
+ ASSERT0P(zfsvfs->z_ctldir);
zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT,
&zpl_fops_root, &zpl_ops_root, 0);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
index 2f935bb3fc8c..e8de536606e2 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
@@ -463,7 +463,7 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
zfsvfs_t *zfsvfs = ZTOZSB(zp);
ASSERT(zp->z_unlinked);
- ASSERT(ZTOI(zp)->i_nlink == 0);
+ ASSERT0(ZTOI(zp)->i_nlink);
VERIFY3U(0, ==,
zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
@@ -662,8 +662,8 @@ zfs_rmnode(znode_t *zp)
uint64_t links;
int error;
- ASSERT(ZTOI(zp)->i_nlink == 0);
- ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0);
+ ASSERT0(ZTOI(zp)->i_nlink);
+ ASSERT0(atomic_read(&ZTOI(zp)->i_count));
/*
* If this is an attribute directory, purge its contents.
@@ -710,7 +710,7 @@ zfs_rmnode(znode_t *zp)
&xattr_obj, sizeof (xattr_obj));
if (error == 0 && xattr_obj) {
error = zfs_zget(zfsvfs, xattr_obj, &xzp);
- ASSERT(error == 0);
+ ASSERT0(error);
}
acl_obj = zfs_external_acl(zp);
@@ -744,12 +744,12 @@ zfs_rmnode(znode_t *zp)
}
if (xzp) {
- ASSERT(error == 0);
+ ASSERT0(error);
mutex_enter(&xzp->z_lock);
xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
clear_nlink(ZTOI(xzp)); /* no more links to it */
links = 0;
- VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ VERIFY0(sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
&links, sizeof (links), tx));
mutex_exit(&xzp->z_lock);
zfs_unlinked_add(xzp, tx);
@@ -872,7 +872,7 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
ctime);
}
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
mutex_exit(&zp->z_lock);
@@ -894,7 +894,7 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
&dzp->z_pflags, sizeof (dzp->z_pflags));
zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
mutex_exit(&dzp->z_lock);
return (0);
@@ -986,7 +986,7 @@ zfs_drop_nlink_locked(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp)
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
NULL, &links, sizeof (links));
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- ASSERT3U(error, ==, 0);
+ ASSERT0(error);
if (unlinkedp != NULL)
*unlinkedp = unlinked;
@@ -1058,7 +1058,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
/* The only error is !zfs_dirempty() and we checked earlier. */
error = zfs_drop_nlink_locked(zp, tx, &unlinked);
- ASSERT3U(error, ==, 0);
+ ASSERT0(error);
mutex_exit(&zp->z_lock);
} else {
error = zfs_dropname(dl, zp, dzp, tx, flag);
@@ -1083,7 +1083,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
mutex_exit(&dzp->z_lock);
if (unlinkedp != NULL)
@@ -1167,7 +1167,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr)
ASSERT(error == 0 && parent == zp->z_id);
#endif
- VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+ VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
sizeof (xzp->z_id), tx));
if (!zp->z_unlinked)
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
index d193eb80dca2..3fdcdbac6f68 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
@@ -115,8 +115,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
*/
int
zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
- ssize_t *resid)
+ uint8_t ashift, ssize_t *resid)
{
+ (void) ashift;
ssize_t rc;
rc = kernel_write(fp, buf, count, &off);
@@ -260,24 +261,12 @@ zfs_file_fsync(zfs_file_t *filp, int flags)
{
int datasync = 0;
int error;
- int fstrans;
if (flags & O_DSYNC)
datasync = 1;
- /*
- * May enter XFS which generates a warning when PF_FSTRANS is set.
- * To avoid this the flag is cleared over vfs_sync() and then reset.
- */
- fstrans = __spl_pf_fstrans_check();
- if (fstrans)
- current->flags &= ~(__SPL_PF_FSTRANS);
-
error = -vfs_fsync(filp, datasync);
- if (fstrans)
- current->flags |= __SPL_PF_FSTRANS;
-
return (error);
}
@@ -292,14 +281,6 @@ int
zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len)
{
/*
- * May enter XFS which generates a warning when PF_FSTRANS is set.
- * To avoid this the flag is cleared over vfs_sync() and then reset.
- */
- int fstrans = __spl_pf_fstrans_check();
- if (fstrans)
- current->flags &= ~(__SPL_PF_FSTRANS);
-
- /*
* When supported by the underlying file system preferentially
* use the fallocate() callback to preallocate the space.
*/
@@ -308,9 +289,6 @@ zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len)
error = -fp->f_op->fallocate(fp,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len);
- if (fstrans)
- current->flags |= __SPL_PF_FSTRANS;
-
if (error)
return (SET_ERROR(error));
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c
index 1c187d7b9cab..895d80b2d79e 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c
@@ -223,7 +223,7 @@ zfs_kobj_add(zfs_mod_kobj_t *zkobj, struct kobject *parent, const char *name)
{
/* zko_default_group.attrs must be NULL terminated */
ASSERT(zkobj->zko_default_group.attrs != NULL);
- ASSERT(zkobj->zko_default_group.attrs[zkobj->zko_attr_count] == NULL);
+ ASSERT0P(zkobj->zko_default_group.attrs[zkobj->zko_attr_count]);
kobject_init(&zkobj->zko_kobj, &zkobj->zko_kobj_type);
return (kobject_add(&zkobj->zko_kobj, parent, name));
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
index a3837f784668..8a7d14ab6119 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
@@ -279,19 +279,14 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr)
return (err);
/*
- * If the pool is suspended, just return an error. This is to help
- * with shutting down with pools suspended, as we don't want to block
- * in that case.
+ * Sync any pending writes, but do not block if the pool is suspended.
+ * This is to help with shutting down with pools suspended, as we don't
+ * want to block in that case.
*/
- if (spa_suspended(zfsvfs->z_os->os_spa)) {
- zfs_exit(zfsvfs, FTAG);
- return (SET_ERROR(EIO));
- }
-
- zil_commit(zfsvfs->z_log, 0);
+ err = zil_commit_flags(zfsvfs->z_log, 0, ZIL_COMMIT_NOW);
zfs_exit(zfsvfs, FTAG);
- return (0);
+ return (err);
}
static void
@@ -883,7 +878,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
* operations out since we closed the ZIL.
*/
if (mounting) {
- ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
+ ASSERT0P(zfsvfs->z_kstat.dk_kstats);
error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
if (error)
return (error);
@@ -1217,6 +1212,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
}
/*
+ * Dentry and inode caches referenced by a task in non-root memcg are
+ * not going to be scanned by the kernel-provided shrinker. So, if
+ * the kernel prunes nothing, fall back to this manual walk to free dnodes.
+ * To avoid scanning the same znodes multiple times they are always rotated
+ * to the end of the z_all_znodes list. New znodes are inserted at the
+ * end of the list so we're always scanning the oldest znodes first.
+ */
+static int
+zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
+{
+ znode_t **zp_array, *zp;
+ int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *));
+ int objects = 0;
+ int i = 0, j = 0;
+
+ zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
+
+ if ((i++ > nr_to_scan) || (j >= max_array))
+ break;
+
+ ASSERT(list_link_active(&zp->z_link_node));
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+
+ /* Skip active znodes and .zfs entries */
+ if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir)
+ continue;
+
+ if (igrab(ZTOI(zp)) == NULL)
+ continue;
+
+ zp_array[j] = zp;
+ j++;
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ for (i = 0; i < j; i++) {
+ zp = zp_array[i];
+
+ ASSERT3P(zp, !=, NULL);
+ d_prune_aliases(ZTOI(zp));
+
+ if (atomic_read(&ZTOI(zp)->i_count) == 1)
+ objects++;
+
+ zrele(zp);
+ }
+
+ vmem_free(zp_array, max_array * sizeof (znode_t *));
+
+ return (objects);
+}
+
+/*
* The ARC has requested that the filesystem drop entries from the dentry
* and inode caches. This can occur when the ARC needs to free meta data
* blocks but can't because they are all pinned by entries in these caches.
@@ -1267,6 +1319,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
*objects = (*shrinker->scan_objects)(shrinker, &sc);
#endif
+ /*
+	 * Fall back to zfs_prune_aliases if the kernel's shrinker did nothing
+ * due to dentry and inode caches being referenced by a task running
+ * in non-root memcg.
+ */
+ if (*objects == 0)
+ *objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
+
zfs_exit(zfsvfs, FTAG);
dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
@@ -1496,6 +1556,12 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
sb->s_xattr = zpl_xattr_handlers;
sb->s_export_op = &zpl_export_operations;
+#ifdef HAVE_SET_DEFAULT_D_OP
+ set_default_d_op(sb, &zpl_dentry_operations);
+#else
+ sb->s_d_op = &zpl_dentry_operations;
+#endif
+
/* Set features for file system. */
zfs_set_fuid_feature(zfsvfs);
@@ -1611,7 +1677,7 @@ zfs_umount(struct super_block *sb)
if (zfsvfs->z_arc_prune != NULL)
arc_remove_prune_callback(zfsvfs->z_arc_prune);
- VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
+ VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE));
os = zfsvfs->z_os;
/*
@@ -1737,8 +1803,8 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
ASSERT(*ipp != NULL);
if (object == ZFSCTL_INO_SNAPDIR) {
- VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp,
- 0, kcred, NULL, NULL) == 0);
+ VERIFY0(zfsctl_root_lookup(*ipp, "snapshot", ipp,
+ 0, kcred, NULL, NULL));
} else {
/*
* Must have an existing ref, so igrab()
@@ -1840,7 +1906,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
goto bail;
ds->ds_dir->dd_activity_cancelled = B_FALSE;
- VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+ VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE));
zfs_set_fuid_feature(zfsvfs);
zfsvfs->z_rollback_time = jiffies;
@@ -2013,7 +2079,7 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
ASSERT0(error);
- VERIFY(0 == sa_set_sa_object(os, sa_obj));
+ VERIFY0(sa_set_sa_object(os, sa_obj));
sa_register_update_callback(os, zfs_sa_upgrade);
}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
index ed9721dade76..02465adf36d5 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
@@ -25,6 +25,7 @@
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2015 by Chunwei Chen. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2025, Klara, Inc.
*/
/* Portions Copyright 2007 Jeremy Teo */
@@ -840,8 +841,8 @@ out:
*zpp = zp;
}
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -1202,8 +1203,8 @@ out:
zfs_zrele_async(xzp);
}
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -1391,14 +1392,15 @@ out:
zfs_dirent_unlock(dl);
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
if (error != 0) {
zrele(zp);
} else {
zfs_znode_update_vfs(dzp);
zfs_znode_update_vfs(zp);
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
+
}
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -1527,8 +1529,8 @@ out:
zfs_znode_update_vfs(zp);
zrele(zp);
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -2031,10 +2033,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
goto out3;
}
- if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
- err = SET_ERROR(EPERM);
- goto out3;
- }
+ /* ZFS_READONLY will be handled in zfs_zaccess() */
/*
* Verify timestamps doesn't overflow 32 bits.
@@ -2482,10 +2481,10 @@ top:
new_mode = zp->z_mode;
}
err = zfs_acl_chown_setattr(zp);
- ASSERT(err == 0);
+ ASSERT0(err);
if (attrzp) {
err = zfs_acl_chown_setattr(attrzp);
- ASSERT(err == 0);
+ ASSERT0(err);
}
}
@@ -2599,7 +2598,7 @@ out:
if (err == 0 && xattr_count > 0) {
err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
xattr_count, tx);
- ASSERT(err2 == 0);
+ ASSERT0(err2);
}
if (aclp)
@@ -2629,8 +2628,8 @@ out:
}
out2:
- if (os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (err == 0 && os->os_sync == ZFS_SYNC_ALWAYS)
+ err = zil_commit(zilog, 0);
out3:
kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
@@ -3156,7 +3155,7 @@ top:
* zfs_link_create() to add back the same entry, but with a new
* dnode (szp), should not fail.
*/
- ASSERT3P(tzp, ==, NULL);
+ ASSERT0P(tzp);
goto commit_link_tzp;
}
@@ -3234,8 +3233,8 @@ out:
zfs_dirent_unlock(sdl);
zfs_dirent_unlock(tdl);
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -3435,7 +3434,7 @@ top:
*zpp = zp;
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ error = zil_commit(zilog, 0);
} else {
zrele(zp);
}
@@ -3653,8 +3652,8 @@ top:
* operation are sync safe.
*/
if (is_tmpfile) {
- VERIFY(zap_remove_int(zfsvfs->z_os,
- zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
+ VERIFY0(zap_remove_int(zfsvfs->z_os,
+ zfsvfs->z_unlinkedobj, szp->z_id, tx));
} else {
if (flags & FIGNORECASE)
txtype |= TX_CI;
@@ -3669,18 +3668,20 @@ top:
zfs_dirent_unlock(dl);
- if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
- txg_wait_flag_t wait_flags =
- spa_get_failmode(dmu_objset_spa(zfsvfs->z_os)) ==
- ZIO_FAILURE_MODE_CONTINUE ? TXG_WAIT_SUSPEND : 0;
- error = txg_wait_synced_flags(dmu_objset_pool(zfsvfs->z_os),
- txg, wait_flags);
- if (error != 0) {
- ASSERT3U(error, ==, ESHUTDOWN);
- error = SET_ERROR(EIO);
+ if (error == 0) {
+ if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
+
+ if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
+ txg_wait_flag_t wait_flags =
+ spa_get_failmode(dmu_objset_spa(zfsvfs->z_os)) ==
+ ZIO_FAILURE_MODE_CONTINUE ? TXG_WAIT_SUSPEND : 0;
+ error = txg_wait_synced_flags(
+ dmu_objset_pool(zfsvfs->z_os), txg, wait_flags);
+ if (error != 0) {
+ ASSERT3U(error, ==, ESHUTDOWN);
+ error = SET_ERROR(EIO);
+ }
}
}
@@ -3690,24 +3691,39 @@ top:
return (error);
}
-static void
-zfs_putpage_sync_commit_cb(void *arg)
+/* Finish page writeback. */
+static inline void
+zfs_page_writeback_done(struct page *pp, int err)
{
- struct page *pp = arg;
+ if (err != 0) {
+ /*
+ * Writeback failed. Re-dirty the page. It was undirtied before
+ * the IO was issued (in zfs_putpage() or write_cache_pages()).
+ * The kernel only considers writeback for dirty pages; if we
+ * don't do this, it is eligible for eviction without being
+ * written out, which we definitely don't want.
+ */
+#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
+ filemap_dirty_folio(page_mapping(pp), page_folio(pp));
+#else
+ __set_page_dirty_nobuffers(pp);
+#endif
+ }
ClearPageError(pp);
end_page_writeback(pp);
}
+/*
+ * ZIL callback for page writeback. Passed to zfs_log_write() in zfs_putpage()
+ * for syncing writes. Called when the ZIL itx has been written to the log or
+ * the whole txg syncs, or if the ZIL crashes or the pool suspends. Any failure
+ * is passed as `err`.
+ */
static void
-zfs_putpage_async_commit_cb(void *arg)
+zfs_putpage_commit_cb(void *arg, int err)
{
- struct page *pp = arg;
- znode_t *zp = ITOZ(pp->mapping->host);
-
- ClearPageError(pp);
- end_page_writeback(pp);
- atomic_dec_32(&zp->z_async_writes_cnt);
+ zfs_page_writeback_done(arg, err);
}
/*
@@ -3827,15 +3843,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
zfs_rangelock_exit(lr);
if (wbc->sync_mode != WB_SYNC_NONE) {
- /*
- * Speed up any non-sync page writebacks since
- * they may take several seconds to complete.
- * Refer to the comment in zpl_fsync() for details.
- */
- if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
- zil_commit(zfsvfs->z_log, zp->z_id);
- }
-
if (PageWriteback(pp))
#ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
folio_wait_bit(page_folio(pp), PG_writeback);
@@ -3861,8 +3868,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
* was in fact not skipped and should not be counted as if it were.
*/
wbc->pages_skipped--;
- if (!for_sync)
- atomic_inc_32(&zp->z_async_writes_cnt);
set_page_writeback(pp);
unlock_page(pp);
@@ -3874,23 +3879,21 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
err = dmu_tx_assign(tx, DMU_TX_WAIT);
if (err != 0) {
dmu_tx_abort(tx);
-#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
- filemap_dirty_folio(page_mapping(pp), page_folio(pp));
-#else
- __set_page_dirty_nobuffers(pp);
-#endif
- ClearPageError(pp);
- end_page_writeback(pp);
- if (!for_sync)
- atomic_dec_32(&zp->z_async_writes_cnt);
+ zfs_page_writeback_done(pp, err);
zfs_rangelock_exit(lr);
zfs_exit(zfsvfs, FTAG);
- return (err);
+
+ /*
+ * Don't return error for an async writeback; we've re-dirtied
+ * the page so it will be tried again some other time.
+ */
+ return (for_sync ? err : 0);
}
va = kmap(pp);
ASSERT3U(pglen, <=, PAGE_SIZE);
- dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
+ dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx,
+ DMU_READ_PREFETCH);
kunmap(pp);
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
@@ -3908,36 +3911,70 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
- boolean_t commit = B_FALSE;
- if (wbc->sync_mode != WB_SYNC_NONE) {
- /*
- * Note that this is rarely called under writepages(), because
- * writepages() normally handles the entire commit for
- * performance reasons.
- */
- commit = B_TRUE;
- } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
+ /*
+ * A note about for_sync vs wbc->sync_mode.
+ *
+ * for_sync indicates that this is a syncing writeback, that is, kernel
+ * caller expects the data to be durably stored before being notified.
+ * Often, but not always, the call was triggered by a userspace syncing
+ * op (eg fsync(), msync(MS_SYNC)). For our purposes, for_sync==TRUE
+	 * means that the page should remain "locked" (in the writeback state)
+ * until it is definitely on disk (ie zil_commit() or spa_sync()).
+ * Otherwise, we can unlock and return as soon as it is on the
+ * in-memory ZIL.
+ *
+ * wbc->sync_mode has similar meaning. wbc is passed from the kernel to
+ * zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE
+ * indicates this a regular async writeback (eg a cache eviction) and
+ * so does not need a durability guarantee, while WB_SYNC_ALL indicates
+ * a syncing op that must be waited on (by convention, we test for
+ * !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer durability over
+ * performance should there ever be a new mode that we have not yet
+ * added support for).
+ *
+ * So, why a separate for_sync field? This is because zpl_writepages()
+ * calls zfs_putpage() multiple times for a single "logical" operation.
+ * It wants all the individual pages to be for_sync==TRUE ie only
+ * unlocked once durably stored, but it only wants one call to
+ * zil_commit() at the very end, once all the pages are synced. So,
+	 * it repurposes sync_mode slightly to indicate who issues and waits for
+	 * the IO: for NONE, the caller of zfs_putpage() will do it, while for
+ * ALL, zfs_putpage should do it.
+ *
+ * Summary:
+ * for_sync: 0=unlock immediately; 1=unlock once on disk
+ * sync_mode: NONE=caller will commit; ALL=we will commit
+ */
+ boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE);
+
+ /*
+ * We use for_sync as the "commit" arg to zfs_log_write() (arg 7)
+ * because it is a policy flag that indicates "someone will call
+ * zil_commit() soon". for_sync=TRUE means exactly that; the only
+ * question is whether it will be us, or zpl_writepages().
+ */
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, for_sync,
+ B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp);
+
+ if (!for_sync) {
/*
- * If the caller does not intend to wait synchronously
- * for this page writeback to complete and there are active
- * synchronous calls on this file, do a commit so that
- * the latter don't accidentally end up waiting for
- * our writeback to complete. Refer to the comment in
- * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
+ * Async writeback is logged and written to the DMU, so page
+ * can now be unlocked.
*/
- commit = B_TRUE;
+ zfs_page_writeback_done(pp, 0);
}
- zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
- B_FALSE, for_sync ? zfs_putpage_sync_commit_cb :
- zfs_putpage_async_commit_cb, pp);
-
dmu_tx_commit(tx);
zfs_rangelock_exit(lr);
- if (commit)
- zil_commit(zfsvfs->z_log, zp->z_id);
+ if (need_commit) {
+ err = zil_commit_flags(zfsvfs->z_log, zp->z_id, ZIL_COMMIT_NOW);
+ if (err != 0) {
+ zfs_exit(zfsvfs, FTAG);
+ return (err);
+ }
+ }
dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c
index 54e60b4820f6..bcaabeb32b8a 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c
@@ -126,8 +126,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
zp->z_acl_cached = NULL;
zp->z_xattr_cached = NULL;
zp->z_xattr_parent = 0;
- zp->z_sync_writes_cnt = 0;
- zp->z_async_writes_cnt = 0;
return (0);
}
@@ -146,12 +144,9 @@ zfs_znode_cache_destructor(void *buf, void *arg)
rw_destroy(&zp->z_xattr_lock);
zfs_rangelock_fini(&zp->z_rangelock);
- ASSERT3P(zp->z_dirlocks, ==, NULL);
- ASSERT3P(zp->z_acl_cached, ==, NULL);
- ASSERT3P(zp->z_xattr_cached, ==, NULL);
-
- ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
- ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
+ ASSERT0P(zp->z_dirlocks);
+ ASSERT0P(zp->z_acl_cached);
+ ASSERT0P(zp->z_xattr_cached);
}
static int
@@ -183,13 +178,13 @@ zfs_znode_init(void)
* backed by kmalloc() when on the Linux slab in order that any
* wait_on_bit() operations on the related inode operate properly.
*/
- ASSERT(znode_cache == NULL);
+ ASSERT0P(znode_cache);
znode_cache = kmem_cache_create("zfs_znode_cache",
sizeof (znode_t), 0, zfs_znode_cache_constructor,
zfs_znode_cache_destructor, NULL, NULL, NULL,
KMC_SLAB | KMC_RECLAIMABLE);
- ASSERT(znode_hold_cache == NULL);
+ ASSERT0P(znode_hold_cache);
znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
@@ -332,10 +327,10 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
mutex_enter(&zp->z_lock);
- ASSERT(zp->z_sa_hdl == NULL);
- ASSERT(zp->z_acl_cached == NULL);
+ ASSERT0P(zp->z_sa_hdl);
+ ASSERT0P(zp->z_acl_cached);
if (sa_hdl == NULL) {
- VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
+ VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, zp,
SA_HDL_SHARED, &zp->z_sa_hdl));
} else {
zp->z_sa_hdl = sa_hdl;
@@ -371,6 +366,12 @@ zfs_inode_alloc(struct super_block *sb, struct inode **ip)
return (0);
}
+void
+zfs_inode_free(struct inode *ip)
+{
+ kmem_cache_free(znode_cache, ITOZ(ip));
+}
+
/*
* Called in multiple places when an inode should be destroyed.
*/
@@ -395,8 +396,15 @@ zfs_inode_destroy(struct inode *ip)
nvlist_free(zp->z_xattr_cached);
zp->z_xattr_cached = NULL;
}
-
- kmem_cache_free(znode_cache, zp);
+#ifndef HAVE_SOPS_FREE_INODE
+ /*
+	 * The inode needs to be freed in an RCU callback. If we have
+	 * super_operations->free_inode, the Linux kernel will do call_rcu
+	 * for us. But if we don't have it, since call_rcu is a GPL-only
+	 * symbol, we can only free synchronously and accept the risk.
+ */
+ zfs_inode_free(ip);
+#endif
}
static void
@@ -522,9 +530,9 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
return (NULL);
zp = ITOZ(ip);
- ASSERT(zp->z_dirlocks == NULL);
- ASSERT3P(zp->z_acl_cached, ==, NULL);
- ASSERT3P(zp->z_xattr_cached, ==, NULL);
+ ASSERT0P(zp->z_dirlocks);
+ ASSERT0P(zp->z_acl_cached);
+ ASSERT0P(zp->z_xattr_cached);
zp->z_unlinked = B_FALSE;
zp->z_atime_dirty = B_FALSE;
zp->z_is_ctldir = B_FALSE;
@@ -535,8 +543,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
zp->z_blksz = blksz;
zp->z_seq = 0x7A4653;
zp->z_sync_cnt = 0;
- zp->z_sync_writes_cnt = 0;
- zp->z_async_writes_cnt = 0;
zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
@@ -605,7 +611,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
* processing so do not hash unlinked znodes.
*/
if (links > 0)
- VERIFY3S(insert_inode_locked(ip), ==, 0);
+ VERIFY0(insert_inode_locked(ip));
mutex_enter(&zfsvfs->z_znodes_lock);
list_insert_tail(&zfsvfs->z_all_znodes, zp);
@@ -805,7 +811,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
}
/* Now add in all of the "SA" attributes */
- VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
+ VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
&sa_hdl));
/*
@@ -895,7 +901,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
acl_ids->z_fuid, acl_ids->z_fgid);
}
- VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
+ VERIFY0(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx));
if (!(flag & IS_ROOT_NODE)) {
/*
@@ -1194,7 +1200,7 @@ zfs_rezget(znode_t *zp)
}
rw_exit(&zp->z_xattr_lock);
- ASSERT(zp->z_sa_hdl == NULL);
+ ASSERT0P(zp->z_sa_hdl);
err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
if (err) {
zfs_znode_hold_exit(zfsvfs, zh);
@@ -1308,9 +1314,9 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
zh = zfs_znode_hold_enter(zfsvfs, obj);
if (acl_obj) {
VERIFY(!zp->z_is_sa);
- VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+ VERIFY0(dmu_object_free(os, acl_obj, tx));
}
- VERIFY(0 == dmu_object_free(os, obj, tx));
+ VERIFY0(dmu_object_free(os, obj, tx));
zfs_znode_dmu_fini(zp);
zfs_znode_hold_exit(zfsvfs, zh);
}
@@ -1530,7 +1536,7 @@ zfs_extend(znode_t *zp, uint64_t end)
zp->z_size = end;
- VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
+ VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
&zp->z_size, sizeof (zp->z_size), tx));
zfs_rangelock_exit(lr);
@@ -1720,7 +1726,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
NULL, &zp->z_pflags, 8);
}
- VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
+ VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
dmu_tx_commit(tx);
zfs_rangelock_exit(lr);
@@ -1787,7 +1793,7 @@ log:
NULL, &zp->z_pflags, 8);
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
@@ -1834,7 +1840,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
moid = MASTER_NODE_OBJ;
error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
DMU_OT_NONE, 0, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
/*
* Set starting attributes.
@@ -1847,7 +1853,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
const char *name;
ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
- VERIFY(nvpair_value_uint64(elem, &val) == 0);
+ VERIFY0(nvpair_value_uint64(elem, &val));
name = nvpair_name(elem);
if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
if (val < version)
@@ -1855,7 +1861,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
} else {
error = zap_update(os, moid, name, 8, 1, &val, tx);
}
- ASSERT(error == 0);
+ ASSERT0(error);
if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
norm = val;
else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
@@ -1863,7 +1869,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
}
ASSERT(version != 0);
error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
/*
* Create zap object used for SA attribute registration
@@ -1873,7 +1879,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
DMU_OT_NONE, 0, tx);
error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
} else {
sa_obj = 0;
}
@@ -1883,7 +1889,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
/*
* Create root znode. Create minimal znode/inode/zfsvfs/sb
@@ -1916,7 +1922,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
&zfsvfs->z_attr_table);
- ASSERT(error == 0);
+ ASSERT0(error);
/*
* Fold case on file systems that are always or sometimes case
@@ -1940,12 +1946,12 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
}
- VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
+ VERIFY0(zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
cr, NULL, &acl_ids, zfs_init_idmap));
zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
ASSERT3P(zp, ==, rootzp);
error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
zfs_acl_ids_free(&acl_ids);
atomic_set(&ZTOI(rootzp)->i_count, 0);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
index 48dae79a2373..81ac26cb0c93 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
@@ -202,7 +202,7 @@ zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags)
return (!!dentry->d_inode);
}
-static dentry_operations_t zpl_dops_snapdirs = {
+static const struct dentry_operations zpl_dops_snapdirs = {
/*
* Auto mounting of snapshots is only supported for 2.6.37 and
* newer kernels. Prior to this kernel the ops->follow_link()
@@ -215,6 +215,51 @@ static dentry_operations_t zpl_dops_snapdirs = {
.d_revalidate = zpl_snapdir_revalidate,
};
+/*
+ * For the .zfs control directory to work properly we must be able to override
+ * the default operations table and register custom .d_automount and
+ * .d_revalidate callbacks.
+ */
+static void
+set_snapdir_dentry_ops(struct dentry *dentry, unsigned int extraflags) {
+ static const unsigned int op_flags =
+ DCACHE_OP_HASH | DCACHE_OP_COMPARE |
+ DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE |
+ DCACHE_OP_PRUNE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_REAL;
+
+#ifdef HAVE_D_SET_D_OP
+ /*
+ * d_set_d_op() will set the DCACHE_OP_ flags according to what it
+ * finds in the passed dentry_operations, so we don't have to.
+ *
+ * We clear the flags and the old op table before calling d_set_d_op()
+ * because issues a warning when the dentry operations table is already
+	 * because it issues a warning when the dentry operations table is
+	 * already set.
+ dentry->d_op = NULL;
+ dentry->d_flags &= ~op_flags;
+ d_set_d_op(dentry, &zpl_dops_snapdirs);
+ dentry->d_flags |= extraflags;
+#else
+ /*
+ * Since 6.17 there's no exported way to modify dentry ops, so we have
+ * to reach in and do it ourselves. This should be safe for our very
+ * narrow use case, which is to create or splice in an entry to give
+ * access to a snapshot.
+ *
+ * We need to set the op flags directly. We hardcode
+ * DCACHE_OP_REVALIDATE because that's the only operation we have; if
+ * we ever extend zpl_dops_snapdirs we will need to update the op flags
+ * to match.
+ */
+ spin_lock(&dentry->d_lock);
+ dentry->d_op = &zpl_dops_snapdirs;
+ dentry->d_flags &= ~op_flags;
+ dentry->d_flags |= DCACHE_OP_REVALIDATE | extraflags;
+ spin_unlock(&dentry->d_lock);
+#endif
+}
+
static struct dentry *
zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
unsigned int flags)
@@ -236,10 +281,7 @@ zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
return (ERR_PTR(error));
ASSERT(error == 0 || ip == NULL);
- d_clear_d_op(dentry);
- d_set_d_op(dentry, &zpl_dops_snapdirs);
- dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
-
+ set_snapdir_dentry_ops(dentry, DCACHE_NEED_AUTOMOUNT);
return (d_splice_alias(ip, dentry));
}
@@ -373,8 +415,7 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0);
if (error == 0) {
- d_clear_d_op(dentry);
- d_set_d_op(dentry, &zpl_dops_snapdirs);
+ set_snapdir_dentry_ops(dentry, 0);
d_instantiate(dentry, ip);
}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
index 1a82c13e1523..f7691c02d163 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
@@ -22,6 +22,8 @@
/*
* Copyright (c) 2011, Lawrence Livermore National Security, LLC.
* Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2025, Klara, Inc.
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
*/
@@ -106,60 +108,52 @@ zpl_iterate(struct file *filp, struct dir_context *ctx)
return (error);
}
+static inline int
+zpl_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc, void *data);
+
static int
zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
struct inode *inode = filp->f_mapping->host;
znode_t *zp = ITOZ(inode);
- zfsvfs_t *zfsvfs = ITOZSB(inode);
cred_t *cr = CRED();
int error;
fstrans_cookie_t cookie;
/*
- * The variables z_sync_writes_cnt and z_async_writes_cnt work in
- * tandem so that sync writes can detect if there are any non-sync
- * writes going on and vice-versa. The "vice-versa" part to this logic
- * is located in zfs_putpage() where non-sync writes check if there are
- * any ongoing sync writes. If any sync and non-sync writes overlap,
- * we do a commit to complete the non-sync writes since the latter can
- * potentially take several seconds to complete and thus block sync
- * writes in the upcoming call to filemap_write_and_wait_range().
+ * Force dirty pages in the range out to the DMU and the log, ready
+ * for zil_commit() to write down.
+ *
+ * We call write_cache_pages() directly to ensure that zpl_putpage() is
+ * called with the flags we need. We need WB_SYNC_NONE to avoid a call
+ * to zil_commit() (since we're doing this as a kind of pre-sync); but
+ * we do need for_sync so that the pages remain in writeback until
+ * they're on disk, and so that we get an error if the DMU write fails.
*/
- atomic_inc_32(&zp->z_sync_writes_cnt);
- /*
- * If the following check does not detect an overlapping non-sync write
- * (say because it's just about to start), then it is guaranteed that
- * the non-sync write will detect this sync write. This is because we
- * always increment z_sync_writes_cnt / z_async_writes_cnt before doing
- * the check on z_async_writes_cnt / z_sync_writes_cnt here and in
- * zfs_putpage() respectively.
- */
- if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
- if ((error = zpl_enter(zfsvfs, FTAG)) != 0) {
- atomic_dec_32(&zp->z_sync_writes_cnt);
+ if (filemap_range_has_page(inode->i_mapping, start, end)) {
+ int for_sync = 1;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = LONG_MAX,
+ .range_start = start,
+ .range_end = end,
+ };
+ error =
+ zpl_write_cache_pages(inode->i_mapping, &wbc, &for_sync);
+ if (error != 0) {
+ /*
+ * Unclear what state things are in. zfs_putpage() will
+ * ensure the pages remain dirty if they haven't been
+ * written down to the DMU, but because there may be
+ * nothing logged, we can't assume that zfs_sync() ->
+ * zil_commit() will give us a useful error. It's
+ * safest if we just error out here.
+ */
return (error);
}
- zil_commit(zfsvfs->z_log, zp->z_id);
- zpl_exit(zfsvfs, FTAG);
}
- error = filemap_write_and_wait_range(inode->i_mapping, start, end);
-
- /*
- * The sync write is not complete yet but we decrement
- * z_sync_writes_cnt since zfs_fsync() increments and decrements
- * it internally. If a non-sync write starts just after the decrement
- * operation but before we call zfs_fsync(), it may not detect this
- * overlapping sync write but it does not matter since we have already
- * gone past filemap_write_and_wait_range() and we won't block due to
- * the non-sync write.
- */
- atomic_dec_32(&zp->z_sync_writes_cnt);
-
- if (error)
- return (error);
-
crhold(cr);
cookie = spl_fstrans_mark();
error = -zfs_fsync(zp, datasync, cr);
@@ -485,6 +479,7 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
return (ret);
}
+#ifdef HAVE_WRITE_CACHE_PAGES
#ifdef HAVE_WRITEPAGE_T_FOLIO
static int
zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
@@ -506,6 +501,78 @@ zpl_write_cache_pages(struct address_space *mapping,
#endif
return (result);
}
+#else
+static inline int
+zpl_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc, void *data)
+{
+ pgoff_t start = wbc->range_start >> PAGE_SHIFT;
+ pgoff_t end = wbc->range_end >> PAGE_SHIFT;
+
+ struct folio_batch fbatch;
+ folio_batch_init(&fbatch);
+
+ /*
+ * This atomically (-ish) tags all DIRTY pages in the range with
+ * TOWRITE, allowing users to continue dirtying or undirtying pages
+ * while we get on with writeback, without us treading on each other.
+ */
+ tag_pages_for_writeback(mapping, start, end);
+
+ int err = 0;
+ unsigned int npages;
+
+ /*
+ * Grab references to the TOWRITE pages just flagged. This may not get
+ * all of them, so we do it in a loop until there are none left.
+ */
+ while ((npages = filemap_get_folios_tag(mapping, &start, end,
+ PAGECACHE_TAG_TOWRITE, &fbatch)) != 0) {
+
+ /* Loop over each page and write it out. */
+ struct folio *folio;
+ while ((folio = folio_batch_next(&fbatch)) != NULL) {
+ folio_lock(folio);
+
+ /*
+ * If the folio has been remapped, or is no longer
+ * dirty, then there's nothing to do.
+ */
+ if (folio->mapping != mapping ||
+ !folio_test_dirty(folio)) {
+ folio_unlock(folio);
+ continue;
+ }
+
+ /*
+ * If writeback is already in progress, wait for it to
+ * finish. We continue after this even if the page
+ * ends up clean; zfs_putpage() will skip it if no
+ * further work is required.
+ */
+ while (folio_test_writeback(folio))
+ folio_wait_bit(folio, PG_writeback);
+
+ /*
+ * Write it out and collect any error. zfs_putpage()
+ * will clear the TOWRITE and DIRTY flags, and return
+ * with the page unlocked.
+ */
+ int ferr = zpl_putpage(&folio->page, wbc, data);
+ if (err == 0 && ferr != 0)
+ err = ferr;
+
+ /* Housekeeping for the caller. */
+ wbc->nr_to_write -= folio_nr_pages(folio);
+ }
+
+ /* Release any remaining references on the batch. */
+ folio_batch_release(&fbatch);
+ }
+
+ return (err);
+}
+#endif
static int
zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
@@ -535,11 +602,30 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
if (sync_mode != wbc->sync_mode) {
if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
return (result);
- if (zfsvfs->z_log != NULL)
- zil_commit(zfsvfs->z_log, zp->z_id);
+
+ if (zfsvfs->z_log != NULL) {
+ /*
+ * We don't want to block here if the pool suspends,
+ * because this is not a syncing op by itself, but
+ * might be part of one that the caller will
+ * coordinate.
+ */
+ result = -zil_commit_flags(zfsvfs->z_log, zp->z_id,
+ ZIL_COMMIT_NOW);
+ }
+
zpl_exit(zfsvfs, FTAG);
/*
+ * If zil_commit_flags() failed, it's unclear what state things
+ * are currently in. putpage() has written back out what it can
+ * to the DMU, but it may not be on disk. We have little choice
+ * but to escape.
+ */
+ if (result != 0)
+ return (result);
+
+ /*
* We need to call write_cache_pages() again (we can't just
* return after the commit) because the previous call in
* non-SYNC mode does not guarantee that we got all the dirty
@@ -725,28 +811,44 @@ zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
return (error);
}
-#define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
-#define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)
+#define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL)
+#define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | FS_PROJINHERIT_FL)
+
+
+static struct {
+ uint64_t zfs_flag;
+ uint32_t fs_flag;
+ uint32_t xflag;
+} flags_lookup[] = {
+ {ZFS_IMMUTABLE, FS_IMMUTABLE_FL, FS_XFLAG_IMMUTABLE},
+ {ZFS_APPENDONLY, FS_APPEND_FL, FS_XFLAG_APPEND},
+ {ZFS_NODUMP, FS_NODUMP_FL, FS_XFLAG_NODUMP},
+ {ZFS_PROJINHERIT, FS_PROJINHERIT_FL, FS_XFLAG_PROJINHERIT}
+};
static uint32_t
__zpl_ioctl_getflags(struct inode *ip)
{
uint64_t zfs_flags = ITOZ(ip)->z_pflags;
uint32_t ioctl_flags = 0;
+ for (int i = 0; i < ARRAY_SIZE(flags_lookup); i++)
+ if (zfs_flags & flags_lookup[i].zfs_flag)
+ ioctl_flags |= flags_lookup[i].fs_flag;
- if (zfs_flags & ZFS_IMMUTABLE)
- ioctl_flags |= FS_IMMUTABLE_FL;
-
- if (zfs_flags & ZFS_APPENDONLY)
- ioctl_flags |= FS_APPEND_FL;
+ return (ioctl_flags);
+}
- if (zfs_flags & ZFS_NODUMP)
- ioctl_flags |= FS_NODUMP_FL;
+static uint32_t
+__zpl_ioctl_getxflags(struct inode *ip)
+{
+ uint64_t zfs_flags = ITOZ(ip)->z_pflags;
+ uint32_t ioctl_flags = 0;
- if (zfs_flags & ZFS_PROJINHERIT)
- ioctl_flags |= ZFS_PROJINHERIT_FL;
+ for (int i = 0; i < ARRAY_SIZE(flags_lookup); i++)
+ if (zfs_flags & flags_lookup[i].zfs_flag)
+ ioctl_flags |= flags_lookup[i].xflag;
- return (ioctl_flags & ZFS_FL_USER_VISIBLE);
+ return (ioctl_flags);
}
/*
@@ -760,6 +862,7 @@ zpl_ioctl_getflags(struct file *filp, void __user *arg)
int err;
flags = __zpl_ioctl_getflags(file_inode(filp));
+ flags = flags & ZFS_FL_USER_VISIBLE;
err = copy_to_user(arg, &flags, sizeof (flags));
return (err);
@@ -783,7 +886,7 @@ __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
xoptattr_t *xoap;
if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
- ZFS_PROJINHERIT_FL))
+ FS_PROJINHERIT_FL))
return (-EOPNOTSUPP);
if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
@@ -814,7 +917,51 @@ __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
xoap->xoa_appendonly);
FLAG_CHANGE(FS_NODUMP_FL, ZFS_NODUMP, XAT_NODUMP,
xoap->xoa_nodump);
- FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT,
+ FLAG_CHANGE(FS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT,
+ xoap->xoa_projinherit);
+
+#undef FLAG_CHANGE
+
+ return (0);
+}
+
+static int
+__zpl_ioctl_setxflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
+{
+ uint64_t zfs_flags = ITOZ(ip)->z_pflags;
+ xoptattr_t *xoap;
+
+ if (ioctl_flags & ~(FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND |
+ FS_XFLAG_NODUMP | FS_XFLAG_PROJINHERIT))
+ return (-EOPNOTSUPP);
+
+ if ((fchange(ioctl_flags, zfs_flags, FS_XFLAG_IMMUTABLE,
+ ZFS_IMMUTABLE) ||
+ fchange(ioctl_flags, zfs_flags, FS_XFLAG_APPEND, ZFS_APPENDONLY)) &&
+ !capable(CAP_LINUX_IMMUTABLE))
+ return (-EPERM);
+
+ if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
+ return (-EACCES);
+
+ xva_init(xva);
+ xoap = xva_getxoptattr(xva);
+
+#define FLAG_CHANGE(iflag, zflag, xflag, xfield) do { \
+ if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) || \
+ ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) { \
+ XVA_SET_REQ(xva, (xflag)); \
+ (xfield) = ((ioctl_flags & (iflag)) != 0); \
+ } \
+} while (0)
+
+ FLAG_CHANGE(FS_XFLAG_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
+ xoap->xoa_immutable);
+ FLAG_CHANGE(FS_XFLAG_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
+ xoap->xoa_appendonly);
+ FLAG_CHANGE(FS_XFLAG_NODUMP, ZFS_NODUMP, XAT_NODUMP,
+ xoap->xoa_nodump);
+ FLAG_CHANGE(FS_XFLAG_PROJINHERIT, ZFS_PROJINHERIT, XAT_PROJINHERIT,
xoap->xoa_projinherit);
#undef FLAG_CHANGE
@@ -855,7 +1002,7 @@ zpl_ioctl_getxattr(struct file *filp, void __user *arg)
struct inode *ip = file_inode(filp);
int err;
- fsx.fsx_xflags = __zpl_ioctl_getflags(ip);
+ fsx.fsx_xflags = __zpl_ioctl_getxflags(ip);
fsx.fsx_projid = ITOZ(ip)->z_projid;
err = copy_to_user(arg, &fsx, sizeof (fsx));
@@ -879,7 +1026,7 @@ zpl_ioctl_setxattr(struct file *filp, void __user *arg)
if (!zpl_is_valid_projid(fsx.fsx_projid))
return (-EINVAL);
- err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva);
+ err = __zpl_ioctl_setxflags(ip, fsx.fsx_xflags, &xva);
if (err)
return (err);
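The new __zpl_ioctl_setxflags() above reuses the existing fchange() helper to decide whether a protected flag (immutable or append-only) is actually being toggled; that helper is not part of this diff. A rough sketch of the shape implied by the call sites, offered as an assumption rather than the verbatim OpenZFS definition:

/*
 * Sketch only: true when flag bit b1 in word f1 and flag bit b2 in word f2
 * disagree, i.e. the caller is asking to flip that attribute.
 */
static inline boolean_t
fchange(uint64_t f1, uint64_t f2, uint64_t b1, uint64_t b2)
{
	return ((!(f1 & b1) && (f2 & b2)) || ((f1 & b1) && !(f2 & b2)));
}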
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
index f9f6406f8b47..f97662d052c7 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
@@ -247,7 +247,7 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
* and fifos, but we want to know if this behavior ever changes.
*/
if (S_ISSOCK(mode) || S_ISFIFO(mode))
- ASSERT(rdev == 0);
+ ASSERT0(rdev);
crhold(cr);
vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
index a682bfd33c38..347b352506e5 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
@@ -22,6 +22,8 @@
/*
* Copyright (c) 2011, Lawrence Livermore National Security, LLC.
* Copyright (c) 2023, Datto Inc. All rights reserved.
+ * Copyright (c) 2025, Klara, Inc.
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
*/
@@ -32,7 +34,22 @@
#include <sys/zpl.h>
#include <linux/iversion.h>
#include <linux/version.h>
+#include <linux/vfs_compat.h>
+/*
+ * What to do when the last reference to an inode is released. If 0, the kernel
+ * will cache it on the superblock. If 1, the inode will be freed immediately.
+ * See zpl_drop_inode().
+ */
+int zfs_delete_inode = 0;
+
+/*
+ * What to do when the last reference to a dentry is released. If 0, the kernel
+ * will cache it until the entry (file) is destroyed. If 1, the dentry will be
+ * marked for cleanup, at which time its inode reference will be released. See
+ * zpl_dentry_delete().
+ */
+int zfs_delete_dentry = 0;
static struct inode *
zpl_inode_alloc(struct super_block *sb)
@@ -45,10 +62,19 @@ zpl_inode_alloc(struct super_block *sb)
return (ip);
}
+#ifdef HAVE_SOPS_FREE_INODE
+static void
+zpl_inode_free(struct inode *ip)
+{
+ ASSERT0(atomic_read(&ip->i_count));
+ zfs_inode_free(ip);
+}
+#endif
+
static void
zpl_inode_destroy(struct inode *ip)
{
- ASSERT(atomic_read(&ip->i_count) == 0);
+ ASSERT0(atomic_read(&ip->i_count));
zfs_inode_destroy(ip);
}
@@ -68,11 +94,36 @@ zpl_dirty_inode(struct inode *ip, int flags)
}
/*
- * When ->drop_inode() is called its return value indicates if the
- * inode should be evicted from the inode cache. If the inode is
- * unhashed and has no links the default policy is to evict it
- * immediately.
+ * ->drop_inode() is called when the last reference to an inode is released.
+ * Its return value indicates if the inode should be destroyed immediately, or
+ * cached on the superblock structure.
+ *
+ * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns
+ * "destroy immediately" if the inode is unhashed and has no links (roughly: no
+ * longer exists on disk). On datasets with millions of rarely-accessed files,
+ * this can cause a large amount of memory to be "pinned" by cached inodes,
+ * which in turn pin their associated dnodes and dbufs, until the kernel starts
+ * reporting memory pressure and requests OpenZFS release some memory (see
+ * zfs_prune()).
+ *
+ * When set to 1, we call generic_delete_inode(), which always returns "destroy
+ * immediately", resulting in inodes being destroyed immediately, releasing
+ * their associated dnodes and dbufs to the dbuf cache and the ARC to be
+ * evicted as normal.
*
+ * Note that the "last reference" doesn't always mean the last _userspace_
+ * reference; the dentry cache also holds a reference, so "busy" inodes will
+ * still be kept alive that way (subject to dcache tuning).
+ */
+static int
+zpl_drop_inode(struct inode *ip)
+{
+ if (zfs_delete_inode)
+ return (generic_delete_inode(ip));
+ return (generic_drop_inode(ip));
+}
+
+/*
* The ->evict_inode() callback must minimally truncate the inode pages,
* and call clear_inode(). For 2.6.35 and later kernels this will
* simply update the inode state, with the sync occurring before the
@@ -455,9 +506,13 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg)
const struct super_operations zpl_super_operations = {
.alloc_inode = zpl_inode_alloc,
+#ifdef HAVE_SOPS_FREE_INODE
+ .free_inode = zpl_inode_free,
+#endif
.destroy_inode = zpl_inode_destroy,
.dirty_inode = zpl_dirty_inode,
.write_inode = NULL,
+ .drop_inode = zpl_drop_inode,
.evict_inode = zpl_evict_inode,
.put_super = zpl_put_super,
.sync_fs = zpl_sync_fs,
@@ -468,6 +523,35 @@ const struct super_operations zpl_super_operations = {
.show_stats = NULL,
};
+/*
+ * ->d_delete() is called when the last reference to a dentry is released. Its
+ * return value indicates if the dentry should be destroyed immediately, or
+ * retained in the dentry cache.
+ *
+ * By default (zfs_delete_dentry=0) the kernel will always cache unused
+ * dentries. Each dentry holds an inode reference, so cached dentries can hold
+ * the final inode reference indefinitely, leading to the inode and its related
+ * data being pinned (see zpl_drop_inode()).
+ *
+ * When set to 1, we signal that the dentry should be destroyed immediately and
+ * never cached. This reduces memory usage, at the cost of higher overhead when
+ * looking up a file, as the inode and its underlying data (dnode/dbuf) need to
+ * be reloaded and reinflated.
+ *
+ * Note that userspace does not have direct control over dentry references and
+ * reclaim; rather, this is part of the kernel's caching and reclaim subsystems
+ * (eg vm.vfs_cache_pressure).
+ */
+static int
+zpl_dentry_delete(const struct dentry *dentry)
+{
+ return (zfs_delete_dentry ? 1 : 0);
+}
+
+const struct dentry_operations zpl_dentry_operations = {
+ .d_delete = zpl_dentry_delete,
+};
+
struct file_system_type zpl_fs_type = {
.owner = THIS_MODULE,
.name = ZFS_DRIVER,
@@ -479,3 +563,10 @@ struct file_system_type zpl_fs_type = {
.mount = zpl_mount,
.kill_sb = zpl_kill_sb,
};
+
+ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW,
+ "Delete inodes as soon as the last reference is released.");
+
+ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW,
+ "Delete dentries from dentry cache as soon as the last reference is "
+ "released.");
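Usage note (illustrative, not part of the patch): since both parameters are declared ZMOD_RW, they should be adjustable at runtime in the usual way for OpenZFS module parameters, i.e. via /sys/module/zfs/parameters/zfs_delete_inode and /sys/module/zfs/parameters/zfs_delete_dentry, or as zfs module options at load time. Setting both to 1 trades dentry/inode cache hits on repeat lookups for immediate release of the underlying dnodes and dbufs, as described in zpl_drop_inode() and zpl_dentry_delete() above.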
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
index a098197e7448..d93282db815a 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
@@ -1494,7 +1494,7 @@ zpl_posix_acl_free(void *arg)
acl_rel_head = NULL;
if (cmpxchg(&acl_rel_tail, &a->next,
&acl_rel_head) == &a->next) {
- ASSERT3P(a->next, ==, NULL);
+ ASSERT0P(a->next);
a->next = freelist;
freelist = a;
break;
@@ -1544,7 +1544,7 @@ zpl_posix_acl_release_impl(struct posix_acl *acl)
a->time = ddi_get_lbolt();
/* atomically points tail to us and get the previous tail */
prev = xchg(&acl_rel_tail, &a->next);
- ASSERT3P(*prev, ==, NULL);
+ ASSERT0P(*prev);
*prev = a;
/* if it was empty before, schedule the free task */
if (prev == &acl_rel_head)
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index 57a9711e9027..89f9bc555fcf 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -21,8 +21,8 @@
*/
/*
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
- * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
- * Copyright (c) 2024, Klara, Inc.
+ * Copyright (c) 2024, 2025, Rob Norris <robn@despairlabs.com>
+ * Copyright (c) 2024, 2025, Klara, Inc.
*/
#include <sys/dataset_kstats.h>
@@ -84,8 +84,9 @@ static unsigned int zvol_blk_mq_blocks_per_thread = 8;
static inline void
zvol_end_io(struct bio *bio, struct request *rq, int error)
{
+ ASSERT3U(error, >=, 0);
if (bio) {
- bio->bi_status = errno_to_bi_status(-error);
+ bio->bi_status = errno_to_bi_status(error);
bio_endio(bio);
} else {
blk_mq_end_request(rq, errno_to_bi_status(error));
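Context for the sign convention (not part of the patch): zvol_end_io() now takes a non-negative ZFS errno and does the errno_to_bi_status() conversion itself, which is why the ASSERT3U(error, >=, 0) was added and why the call sites later in this diff switch from passing -error (or -SET_ERROR(...)) to passing the positive value directly.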
@@ -208,8 +209,14 @@ zvol_write(zv_request_t *zvr)
disk = zv->zv_zso->zvo_disk;
/* bio marked as FLUSH need to flush before write */
- if (io_is_flush(bio, rq))
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (io_is_flush(bio, rq)) {
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (error != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ zvol_end_io(bio, rq, -error);
+ return;
+ }
+ }
/* Some requests are just for flush and nothing else. */
if (io_size(bio, rq) == 0) {
@@ -273,8 +280,8 @@ zvol_write(zv_request_t *zvr)
dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
task_io_account_write(nwritten);
- if (sync)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (error == 0 && sync)
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
rw_exit(&zv->zv_suspend_lock);
@@ -282,7 +289,7 @@ zvol_write(zv_request_t *zvr)
blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
}
- zvol_end_io(bio, rq, -error);
+ zvol_end_io(bio, rq, error);
}
static void
@@ -330,16 +337,14 @@ zvol_discard(zv_request_t *zvr)
}
/*
- * Align the request to volume block boundaries when a secure erase is
- * not required. This will prevent dnode_free_range() from zeroing out
- * the unaligned parts which is slow (read-modify-write) and useless
- * since we are not freeing any space by doing so.
+ * Align the request to volume block boundaries. This will prevent
+ * dnode_free_range() from zeroing out the unaligned parts which is
+ * slow (read-modify-write) and useless since we are not freeing any
+ * space by doing so.
*/
- if (!io_is_secure_erase(bio, rq)) {
- start = P2ROUNDUP(start, zv->zv_volblocksize);
- end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
- size = end - start;
- }
+ start = P2ROUNDUP(start, zv->zv_volblocksize);
+ end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
+ size = end - start;
if (start >= end)
goto unlock;
@@ -361,7 +366,7 @@ zvol_discard(zv_request_t *zvr)
zfs_rangelock_exit(lr);
if (error == 0 && sync)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
unlock:
rw_exit(&zv->zv_suspend_lock);
@@ -371,7 +376,7 @@ unlock:
start_time);
}
- zvol_end_io(bio, rq, -error);
+ zvol_end_io(bio, rq, error);
}
static void
@@ -449,7 +454,7 @@ zvol_read(zv_request_t *zvr)
blk_generic_end_io_acct(q, disk, READ, bio, start_time);
}
- zvol_end_io(bio, rq, -error);
+ zvol_end_io(bio, rq, error);
}
static void
@@ -460,6 +465,24 @@ zvol_read_task(void *arg)
zv_request_task_free(task);
}
+/*
+ * Note:
+ *
+ * The kernel uses different enum names for the IO opcode, depending on the
+ * kernel version ('req_opf', 'req_op'). To sidestep this, use macros rather
+ * than inline functions for these checks.
+ */
+/* Should this IO go down the zvol write path? */
+#define ZVOL_OP_IS_WRITE(op) \
+ (op == REQ_OP_WRITE || \
+ op == REQ_OP_FLUSH || \
+ op == REQ_OP_DISCARD)
+
+/* Is this IO type supported by zvols? */
+#define ZVOL_OP_IS_SUPPORTED(op) (op == REQ_OP_READ || ZVOL_OP_IS_WRITE(op))
+
+/* Get the IO opcode */
+#define ZVOL_OP(bio, rq) (bio != NULL ? bio_op(bio) : req_op(rq))
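For illustration only (not part of the patch), the same classification written out as a switch; REQ_OP_FLUSH and REQ_OP_DISCARD are grouped with writes so flush-only and discard requests take the zvol write path, where zvol_write() handles the io_size() == 0 flush case and zvol_discard() handles the trim. The helper name is invented for this sketch.

static int
example_classify_op(unsigned int op, int *rw)
{
	switch (op) {
	case REQ_OP_READ:
		*rw = READ;
		return (0);
	case REQ_OP_WRITE:
	case REQ_OP_FLUSH:
	case REQ_OP_DISCARD:
		*rw = WRITE;
		return (0);
	default:
		/* anything else is rejected, matching the check below */
		return (SET_ERROR(ENOTSUPP));
	}
}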
/*
* Process a BIO or request
@@ -477,10 +500,36 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
fstrans_cookie_t cookie = spl_fstrans_mark();
uint64_t offset = io_offset(bio, rq);
uint64_t size = io_size(bio, rq);
- int rw = io_data_dir(bio, rq);
+ int rw;
+
+ if (unlikely(!ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)))) {
+ zfs_dbgmsg("Unsupported zvol %s, op=%d, flags=0x%x",
+ rq != NULL ? "request" : "BIO",
+ ZVOL_OP(bio, rq),
+ rq != NULL ? rq->cmd_flags : bio->bi_opf);
+ ASSERT(ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)));
+ zvol_end_io(bio, rq, SET_ERROR(ENOTSUPP));
+ goto out;
+ }
+
+ if (ZVOL_OP_IS_WRITE(ZVOL_OP(bio, rq))) {
+ rw = WRITE;
+ } else {
+ rw = READ;
+ }
+
+ /*
+ * Sanity check
+ *
+	 * If we're a BIO, check that our rw matches the kernel's
+ * bio_data_dir(bio) rw. We need to check because we support fewer
+ * IO operations, and want to verify that what we think are reads and
+ * writes from those operations match what the kernel thinks.
+ */
+ ASSERT(rq != NULL || rw == bio_data_dir(bio));
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
- zvol_end_io(bio, rq, -SET_ERROR(ENXIO));
+ zvol_end_io(bio, rq, SET_ERROR(ENXIO));
goto out;
}
@@ -499,7 +548,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
(long long unsigned)offset,
(long unsigned)size);
- zvol_end_io(bio, rq, -SET_ERROR(EIO));
+ zvol_end_io(bio, rq, SET_ERROR(EIO));
goto out;
}
@@ -512,8 +561,8 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
#ifdef HAVE_BLK_MQ_RQ_HCTX
blk_mq_hw_queue = rq->mq_hctx->queue_num;
#else
- blk_mq_hw_queue =
- rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
+ blk_mq_hw_queue = rq->q->queue_hw_ctx[
+ rq->q->mq_map[raw_smp_processor_id()]]->queue_num;
#endif
taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
blk_mq_hw_queue);
@@ -521,7 +570,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
if (rw == WRITE) {
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
- zvol_end_io(bio, rq, -SET_ERROR(EROFS));
+ zvol_end_io(bio, rq, SET_ERROR(EROFS));
goto out;
}
@@ -582,7 +631,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
* interfaces lack this functionality (they block waiting for
* the i/o to complete).
*/
- if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
+ if (io_is_discard(bio, rq)) {
if (force_sync) {
zvol_discard(&zvr);
} else {
@@ -672,28 +721,19 @@ zvol_open(struct block_device *bdev, fmode_t flag)
retry:
#endif
- rw_enter(&zvol_state_lock, RW_READER);
- /*
- * Obtain a copy of private_data under the zvol_state_lock to make
- * sure that either the result of zvol free code path setting
- * disk->private_data to NULL is observed, or zvol_os_free()
- * is not called on this zv because of the positive zv_open_count.
- */
+
#ifdef HAVE_BLK_MODE_T
- zv = disk->private_data;
+ zv = atomic_load_ptr(&disk->private_data);
#else
- zv = bdev->bd_disk->private_data;
+ zv = atomic_load_ptr(&bdev->bd_disk->private_data);
#endif
if (zv == NULL) {
- rw_exit(&zvol_state_lock);
return (-SET_ERROR(ENXIO));
}
mutex_enter(&zv->zv_state_lock);
-
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
mutex_exit(&zv->zv_state_lock);
- rw_exit(&zvol_state_lock);
return (-SET_ERROR(ENXIO));
}
@@ -705,8 +745,28 @@ retry:
if (zv->zv_open_count == 0) {
if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
mutex_exit(&zv->zv_state_lock);
+
+ /*
+ * Removal may happen while the locks are down, so
+ * we can't trust zv any longer; we have to start over.
+ */
+#ifdef HAVE_BLK_MODE_T
+ zv = atomic_load_ptr(&disk->private_data);
+#else
+ zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+#endif
+ if (zv == NULL)
+ return (-SET_ERROR(ENXIO));
+
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);
+ return (-SET_ERROR(ENXIO));
+ }
+
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
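One connecting detail (not part of the patch): the re-read is needed because zvol_os_remove_minor(), later in this diff, clears disk->private_data under zv_state_lock before calling del_gendisk(); once the locks were dropped above, the zv pointer obtained earlier may belong to a zvol already being torn down, so only a fresh atomic_load_ptr() that still returns non-NULL is safe to use.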
@@ -717,7 +777,6 @@ retry:
drop_suspend = B_TRUE;
}
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -750,8 +809,8 @@ retry:
* the kernel so the only option is to return the error for
* the caller to handle it.
*/
- if (!mutex_owned(&spa_namespace_lock)) {
- if (!mutex_tryenter(&spa_namespace_lock)) {
+ if (!spa_namespace_held()) {
+ if (!spa_namespace_tryenter(FTAG)) {
mutex_exit(&zv->zv_state_lock);
rw_exit(&zv->zv_suspend_lock);
drop_suspend = B_FALSE;
@@ -775,7 +834,7 @@ retry:
error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));
if (drop_namespace)
- mutex_exit(&spa_namespace_lock);
+ spa_namespace_exit(FTAG);
}
if (error == 0) {
@@ -814,11 +873,11 @@ zvol_release(struct gendisk *disk, fmode_t unused)
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
(void) unused;
#endif
- zvol_state_t *zv;
boolean_t drop_suspend = B_TRUE;
- rw_enter(&zvol_state_lock, RW_READER);
- zv = disk->private_data;
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
+ if (zv == NULL)
+ return;
mutex_enter(&zv->zv_state_lock);
ASSERT3U(zv->zv_open_count, >, 0);
@@ -832,6 +891,15 @@ zvol_release(struct gendisk *disk, fmode_t unused)
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ /*
+ * Unlike in zvol_open(), we don't check if removal
+ * started here, because we might be one of the openers
+ * that needs to be thrown out! If we're the last, we
+ * need to call zvol_last_close() below to finish
+ * cleanup. So, no special treatment for us.
+ */
+
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 1) {
rw_exit(&zv->zv_suspend_lock);
@@ -841,7 +909,6 @@ zvol_release(struct gendisk *disk, fmode_t unused)
} else {
drop_suspend = B_FALSE;
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -861,9 +928,10 @@ static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
- zvol_state_t *zv = bdev->bd_disk->private_data;
int error = 0;
+ zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+ ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
switch (cmd) {
@@ -886,16 +954,18 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode,
case BLKZNAME:
mutex_enter(&zv->zv_state_lock);
- error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
+ error = -copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
mutex_exit(&zv->zv_state_lock);
+ if (error)
+ error = SET_ERROR(error);
break;
default:
- error = -ENOTTY;
+ error = SET_ERROR(ENOTTY);
break;
}
- return (SET_ERROR(error));
+ return (-error);
}
#ifdef CONFIG_COMPAT
@@ -914,9 +984,8 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
unsigned int mask = 0;
- rw_enter(&zvol_state_lock, RW_READER);
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
- zvol_state_t *zv = disk->private_data;
if (zv != NULL) {
mutex_enter(&zv->zv_state_lock);
mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
@@ -924,17 +993,14 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing)
mutex_exit(&zv->zv_state_lock);
}
- rw_exit(&zvol_state_lock);
-
return (mask);
}
static int
zvol_revalidate_disk(struct gendisk *disk)
{
- rw_enter(&zvol_state_lock, RW_READER);
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
- zvol_state_t *zv = disk->private_data;
if (zv != NULL) {
mutex_enter(&zv->zv_state_lock);
set_capacity(zv->zv_zso->zvo_disk,
@@ -942,8 +1008,6 @@ zvol_revalidate_disk(struct gendisk *disk)
mutex_exit(&zv->zv_state_lock);
}
- rw_exit(&zvol_state_lock);
-
return (0);
}
@@ -962,28 +1026,19 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
return (0);
}
-void
-zvol_os_clear_private(zvol_state_t *zv)
-{
- /*
- * Cleared while holding zvol_state_lock as a writer
- * which will prevent zvol_open() from opening it.
- */
- zv->zv_zso->zvo_disk->private_data = NULL;
-}
-
/*
* Provide a simple virtual geometry for legacy compatibility. For devices
* smaller than 1 MiB a small head and sector count is used to allow very
* tiny devices. For devices over 1 MiB a standard head and sector count
* is used to keep the cylinders count reasonable.
*/
-static int
-zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static inline int
+zvol_getgeo_impl(struct gendisk *disk, struct hd_geometry *geo)
{
- zvol_state_t *zv = bdev->bd_disk->private_data;
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
sector_t sectors;
+ ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
sectors = get_capacity(zv->zv_zso->zvo_disk);
@@ -1002,6 +1057,20 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return (0);
}
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK
+static int
+zvol_getgeo(struct gendisk *disk, struct hd_geometry *geo)
+{
+ return (zvol_getgeo_impl(disk, geo));
+}
+#else
+static int
+zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ return (zvol_getgeo_impl(bdev->bd_disk, geo));
+}
+#endif
+
/*
* Why have two separate block_device_operations structs?
*
@@ -1302,27 +1371,30 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
* Allocate memory for a new zvol_state_t and setup the required
* request queue and generic disk structures for the block device.
*/
-static zvol_state_t *
-zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
+static int
+zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize,
+ zvol_state_t **zvp)
{
zvol_state_t *zv;
struct zvol_state_os *zso;
uint64_t volmode;
int ret;
- if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
- return (NULL);
+ ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL);
+ if (ret)
+ return (ret);
if (volmode == ZFS_VOLMODE_DEFAULT)
volmode = zvol_volmode;
if (volmode == ZFS_VOLMODE_NONE)
- return (NULL);
+ return (0);
zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
zv->zv_zso = zso;
zv->zv_volmode = volmode;
+ zv->zv_volsize = volsize;
zv->zv_volblocksize = volblocksize;
list_link_init(&zv->zv_next);
@@ -1396,61 +1468,79 @@ zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
ZVOL_DEV_NAME, (dev & MINORMASK));
- return (zv);
+ *zvp = zv;
+ return (ret);
out_kmem:
kmem_free(zso, sizeof (struct zvol_state_os));
kmem_free(zv, sizeof (zvol_state_t));
- return (NULL);
+ return (ret);
}
-/*
- * Cleanup then free a zvol_state_t which was created by zvol_alloc().
- * At this time, the structure is not opened by anyone, is taken off
- * the zvol_state_list, and has its private data set to NULL.
- * The zvol_state_lock is dropped.
- *
- * This function may take many milliseconds to complete (e.g. we've seen
- * it take over 256ms), due to the calls to "blk_cleanup_queue" and
- * "del_gendisk". Thus, consumers need to be careful to account for this
- * latency when calling this function.
- */
void
-zvol_os_free(zvol_state_t *zv)
+zvol_os_remove_minor(zvol_state_t *zv)
{
-
- ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
- ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
ASSERT0(zv->zv_open_count);
- ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);
+ ASSERT0(atomic_read(&zv->zv_suspend_ref));
+ ASSERT(zv->zv_flags & ZVOL_REMOVING);
- rw_destroy(&zv->zv_suspend_lock);
- zfs_rangelock_fini(&zv->zv_rangelock);
+ struct zvol_state_os *zso = zv->zv_zso;
+ zv->zv_zso = NULL;
- del_gendisk(zv->zv_zso->zvo_disk);
+ /* Clearing private_data will make new callers return immediately. */
+ atomic_store_ptr(&zso->zvo_disk->private_data, NULL);
+
+ /*
+ * Drop the state lock before calling del_gendisk(). There may be
+ * callers waiting to acquire it, but del_gendisk() will block until
+ * they exit, which would deadlock.
+ */
+ mutex_exit(&zv->zv_state_lock);
+
+ del_gendisk(zso->zvo_disk);
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
#if defined(HAVE_BLK_CLEANUP_DISK)
- blk_cleanup_disk(zv->zv_zso->zvo_disk);
+ blk_cleanup_disk(zso->zvo_disk);
#else
- put_disk(zv->zv_zso->zvo_disk);
+ put_disk(zso->zvo_disk);
#endif
#else
- blk_cleanup_queue(zv->zv_zso->zvo_queue);
- put_disk(zv->zv_zso->zvo_disk);
+ blk_cleanup_queue(zso->zvo_queue);
+ put_disk(zso->zvo_disk);
#endif
- if (zv->zv_zso->use_blk_mq)
- blk_mq_free_tag_set(&zv->zv_zso->tag_set);
+ if (zso->use_blk_mq)
+ blk_mq_free_tag_set(&zso->tag_set);
- ida_simple_remove(&zvol_ida,
- MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
+ ida_free(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);
+
+ kmem_free(zso, sizeof (struct zvol_state_os));
+
+ mutex_enter(&zv->zv_state_lock);
+}
+
+void
+zvol_os_free(zvol_state_t *zv)
+{
+
+ ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT0(zv->zv_open_count);
+ ASSERT0P(zv->zv_zso);
+
+ ASSERT0P(zv->zv_objset);
+ ASSERT0P(zv->zv_zilog);
+ ASSERT0P(zv->zv_dn);
+
+ rw_destroy(&zv->zv_suspend_lock);
+ zfs_rangelock_fini(&zv->zv_rangelock);
cv_destroy(&zv->zv_removing_cv);
mutex_destroy(&zv->zv_state_lock);
dataset_kstats_destroy(&zv->zv_kstat);
- kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
kmem_free(zv, sizeof (zvol_state_t));
}
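Summary of the teardown split (not part of the patch): zvol_os_remove_minor() now performs the slow device teardown (del_gendisk(), queue/disk release, blk-mq tag set, minor number) and frees the zvol_state_os, with private_data cleared first so racing opens fail fast with ENXIO, while zvol_os_free() is left to destroy the remaining locks and kstats and free the zvol_state_t itself, as the new ASSERT0P() checks above reflect.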
@@ -1470,7 +1560,9 @@ __zvol_os_add_disk(struct gendisk *disk)
{
int error = 0;
#ifdef HAVE_ADD_DISK_RET
- error = add_disk(disk);
+ error = -add_disk(disk);
+ if (error)
+ error = SET_ERROR(error);
#else
add_disk(disk);
#endif
@@ -1562,7 +1654,7 @@ zvol_os_add_disk(struct gendisk *disk)
int
zvol_os_create_minor(const char *name)
{
- zvol_state_t *zv;
+ zvol_state_t *zv = NULL;
objset_t *os;
dmu_object_info_t *doi;
uint64_t volsize;
@@ -1577,7 +1669,7 @@ zvol_os_create_minor(const char *name)
if (zvol_inhibit_dev)
return (0);
- idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
+ idx = ida_alloc(&zvol_ida, kmem_flags_convert(KM_SLEEP));
if (idx < 0)
return (SET_ERROR(-idx));
minor = idx << ZVOL_MINOR_BITS;
@@ -1585,7 +1677,7 @@ zvol_os_create_minor(const char *name)
/* too many partitions can cause an overflow */
zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
name, minor, MINOR(minor));
- ida_simple_remove(&zvol_ida, idx);
+ ida_free(&zvol_ida, idx);
return (SET_ERROR(EINVAL));
}
@@ -1593,7 +1685,7 @@ zvol_os_create_minor(const char *name)
if (zv) {
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
mutex_exit(&zv->zv_state_lock);
- ida_simple_remove(&zvol_ida, idx);
+ ida_free(&zvol_ida, idx);
return (SET_ERROR(EEXIST));
}
@@ -1611,18 +1703,16 @@ zvol_os_create_minor(const char *name)
if (error)
goto out_dmu_objset_disown;
- zv = zvol_alloc(MKDEV(zvol_major, minor), name,
- doi->doi_data_block_size);
- if (zv == NULL) {
- error = SET_ERROR(EAGAIN);
+ error = zvol_alloc(MKDEV(zvol_major, minor), name,
+ volsize, doi->doi_data_block_size, &zv);
+ if (error || zv == NULL)
goto out_dmu_objset_disown;
- }
+
zv->zv_hash = hash;
if (dmu_objset_is_snapshot(os))
zv->zv_flags |= ZVOL_RDONLY;
- zv->zv_volsize = volsize;
zv->zv_objset = os;
/* Default */
@@ -1647,11 +1737,11 @@ zvol_os_create_minor(const char *name)
blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif
- ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+ ASSERT0P(zv->zv_kstat.dk_kstats);
error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
if (error)
goto out_dmu_objset_disown;
- ASSERT3P(zv->zv_zilog, ==, NULL);
+ ASSERT0P(zv->zv_zilog);
zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
if (spa_writeable(dmu_objset_spa(os))) {
if (zil_replay_disable)
@@ -1689,19 +1779,19 @@ out_doi:
* zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
* directly as well.
*/
- if (error == 0) {
+ if (error == 0 && zv) {
rw_enter(&zvol_state_lock, RW_WRITER);
zvol_insert(zv);
rw_exit(&zvol_state_lock);
error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
} else {
- ida_simple_remove(&zvol_ida, idx);
+ ida_free(&zvol_ida, idx);
}
return (error);
}
-void
+int
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
@@ -1728,6 +1818,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
set_disk_ro(zv->zv_zso->zvo_disk, readonly);
dataset_kstats_rename(&zv->zv_kstat, newname);
+
+ return (0);
}
void
@@ -1755,10 +1847,10 @@ zvol_init(void)
return (error);
}
- error = register_blkdev(zvol_major, ZVOL_DRIVER);
+ error = -register_blkdev(zvol_major, ZVOL_DRIVER);
if (error) {
printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
- return (error);
+ return (SET_ERROR(error));
}
if (zvol_blk_mq_queue_depth == 0) {