8 files changed, 255 insertions, 30 deletions
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
index 8a8316f63c48..18f2426fbbfc 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
@@ -23,6 +23,7 @@
  * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
  * Copyright (c) 2019 by Delphix. All rights reserved.
  * Copyright (c) 2023, 2024, Klara Inc.
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
  */
 
 /*
@@ -1109,6 +1110,14 @@ abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
 #define	ABD_ITER_PAGE_SIZE(page)	(PAGESIZE)
 #endif
 
+#ifndef nth_page
+/*
+ * Since 6.18 nth_page() no longer exists, and is no longer required to iterate
+ * within a single SG entry, so we replace it with a simple addition.
+ */
+#define	nth_page(p, n)	((p)+(n))
+#endif
+
 void
 abd_iter_page(struct abd_iter *aiter)
 {
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
index 830fad7fe793..1bd3500e9f66 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
@@ -471,13 +471,17 @@ vdev_disk_close(vdev_t *v)
 	if (v->vdev_reopening || vd == NULL)
 		return;
 
+	rw_enter(&vd->vd_lock, RW_WRITER);
+
 	if (vd->vd_bdh != NULL)
 		vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
 		    zfs_vdev_holder);
 
+	v->vdev_tsd = NULL;
+
+	rw_exit(&vd->vd_lock);
 	rw_destroy(&vd->vd_lock);
 	kmem_free(vd, sizeof (vdev_disk_t));
-	v->vdev_tsd = NULL;
 }
 
 /*
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
index daa4b5776837..934d74a112fd 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
@@ -2524,7 +2524,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
 	 * Also note: DOS R/O is ignored for directories.
 	 */
 	if ((v4_mode & WRITE_MASK_DATA) &&
-	    S_ISDIR(ZTOI(zp)->i_mode) &&
+	    !S_ISDIR(ZTOI(zp)->i_mode) &&
 	    (zp->z_pflags & ZFS_READONLY)) {
 		return (SET_ERROR(EPERM));
 	}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
index cd606e667bff..8a7d14ab6119 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
@@ -1556,6 +1556,12 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
 	sb->s_xattr = zpl_xattr_handlers;
 	sb->s_export_op = &zpl_export_operations;
 
+#ifdef HAVE_SET_DEFAULT_D_OP
+	set_default_d_op(sb, &zpl_dentry_operations);
+#else
+	sb->s_d_op = &zpl_dentry_operations;
+#endif
+
 	/* Set features for file system. */
 	zfs_set_fuid_feature(zfsvfs);
 
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
index 6106726651a3..e845ad69ad78 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
@@ -2033,10 +2033,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
 		goto out3;
 	}
 
-	if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
-		err = SET_ERROR(EPERM);
-		goto out3;
-	}
+	/* ZFS_READONLY will be handled in zfs_zaccess() */
 
 	/*
 	 * Verify timestamps doesn't overflow 32 bits.
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
index d07317b0d910..02965ac8cbee 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
@@ -23,6 +23,7 @@
  * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  * Copyright (c) 2025, Klara, Inc.
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
  */
 
 
@@ -478,6 +479,7 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
 	return (ret);
 }
 
+#ifdef HAVE_WRITE_CACHE_PAGES
 #ifdef HAVE_WRITEPAGE_T_FOLIO
 static int
 zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
@@ -499,6 +501,78 @@ zpl_write_cache_pages(struct address_space *mapping,
 #endif
 	return (result);
 }
+#else
+static inline int
+zpl_write_cache_pages(struct address_space *mapping,
+    struct writeback_control *wbc, void *data)
+{
+	pgoff_t start = wbc->range_start >> PAGE_SHIFT;
+	pgoff_t end = wbc->range_end >> PAGE_SHIFT;
+
+	struct folio_batch fbatch;
+	folio_batch_init(&fbatch);
+
+	/*
+	 * This atomically (-ish) tags all DIRTY pages in the range with
+	 * TOWRITE, allowing users to continue dirtying or undirtying pages
+	 * while we get on with writeback, without us treading on each other.
+	 */
+	tag_pages_for_writeback(mapping, start, end);
+
+	int err = 0;
+	unsigned int npages;
+
+	/*
+	 * Grab references to the TOWRITE pages just flagged. This may not get
+	 * all of them, so we do it in a loop until there are none left.
+	 */
+	while ((npages = filemap_get_folios_tag(mapping, &start, end,
+	    PAGECACHE_TAG_TOWRITE, &fbatch)) != 0) {
+
+		/* Loop over each page and write it out. */
+		struct folio *folio;
+		while ((folio = folio_batch_next(&fbatch)) != NULL) {
+			folio_lock(folio);
+
+			/*
+			 * If the folio has been remapped, or is no longer
+			 * dirty, then there's nothing to do.
+			 */
+			if (folio->mapping != mapping ||
+			    !folio_test_dirty(folio)) {
+				folio_unlock(folio);
+				continue;
+			}
+
+			/*
+			 * If writeback is already in progress, wait for it to
+			 * finish. We continue after this even if the page
+			 * ends up clean; zfs_putpage() will skip it if no
+			 * further work is required.
+			 */
+			while (folio_test_writeback(folio))
+				folio_wait_bit(folio, PG_writeback);
+
+			/*
+			 * Write it out and collect any error. zfs_putpage()
+			 * will clear the TOWRITE and DIRTY flags, and return
+			 * with the page unlocked.
+			 */
+			int ferr = zpl_putpage(&folio->page, wbc, data);
+			if (err == 0 && ferr != 0)
+				err = ferr;
+
+			/* Housekeeping for the caller. */
+			wbc->nr_to_write -= folio_nr_pages(folio);
+		}
+
+		/* Release any remaining references on the batch. */
+		folio_batch_release(&fbatch);
+	}
+
+	return (err);
+}
+#endif
 
 static int
 zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
index 53819628627d..347b352506e5 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
@@ -22,6 +22,8 @@
 /*
  * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
  * Copyright (c) 2023, Datto Inc. All rights reserved.
+ * Copyright (c) 2025, Klara, Inc.
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
  */
 
 
@@ -32,7 +34,22 @@
 #include <sys/zpl.h>
 #include <linux/iversion.h>
 #include <linux/version.h>
+#include <linux/vfs_compat.h>
 
+/*
+ * What to do when the last reference to an inode is released. If 0, the kernel
+ * will cache it on the superblock. If 1, the inode will be freed immediately.
+ * See zpl_drop_inode().
+ */
+int zfs_delete_inode = 0;
+
+/*
+ * What to do when the last reference to a dentry is released. If 0, the kernel
+ * will cache it until the entry (file) is destroyed. If 1, the dentry will be
+ * marked for cleanup, at which time its inode reference will be released. See
+ * zpl_dentry_delete().
+ */
+int zfs_delete_dentry = 0;
 
 static struct inode *
 zpl_inode_alloc(struct super_block *sb)
@@ -77,11 +94,36 @@ zpl_dirty_inode(struct inode *ip, int flags)
 }
 
 /*
- * When ->drop_inode() is called its return value indicates if the
- * inode should be evicted from the inode cache.  If the inode is
- * unhashed and has no links the default policy is to evict it
- * immediately.
+ * ->drop_inode() is called when the last reference to an inode is released.
+ * Its return value indicates if the inode should be destroyed immediately, or
+ * cached on the superblock structure.
+ *
+ * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns
+ * "destroy immediately" if the inode is unhashed and has no links (roughly: no
+ * longer exists on disk). On datasets with millions of rarely-accessed files,
+ * this can cause a large amount of memory to be "pinned" by cached inodes,
+ * which in turn pin their associated dnodes and dbufs, until the kernel starts
+ * reporting memory pressure and requests OpenZFS release some memory (see
+ * zfs_prune()).
+ *
+ * When set to 1, we call generic_delete_inode(), which always returns "destroy
+ * immediately", resulting in inodes being destroyed immediately, releasing
+ * their associated dnodes and dbufs to the dbuf cached and the ARC to be
+ * evicted as normal.
  *
+ * Note that the "last reference" doesn't always mean the last _userspace_
+ * reference; the dentry cache also holds a reference, so "busy" inodes will
+ * still be kept alive that way (subject to dcache tuning).
+ */
+static int
+zpl_drop_inode(struct inode *ip)
+{
+	if (zfs_delete_inode)
+		return (generic_delete_inode(ip));
+	return (generic_drop_inode(ip));
+}
+
+/*
  * The ->evict_inode() callback must minimally truncate the inode pages,
  * and call clear_inode().  For 2.6.35 and later kernels this will
  * simply update the inode state, with the sync occurring before the
@@ -470,6 +512,7 @@ const struct super_operations zpl_super_operations = {
 	.destroy_inode		= zpl_inode_destroy,
 	.dirty_inode		= zpl_dirty_inode,
 	.write_inode		= NULL,
+	.drop_inode		= zpl_drop_inode,
 	.evict_inode		= zpl_evict_inode,
 	.put_super		= zpl_put_super,
 	.sync_fs		= zpl_sync_fs,
@@ -480,6 +523,35 @@ const struct super_operations zpl_super_operations = {
 	.show_stats		= NULL,
 };
 
+/*
+ * ->d_delete() is called when the last reference to a dentry is released. Its
+ *  return value indicates if the dentry should be destroyed immediately, or
+ *  retained in the dentry cache.
+ *
+ * By default (zfs_delete_dentry=0) the kernel will always cache unused
+ * entries.  Each dentry holds an inode reference, so cached dentries can hold
+ * the final inode reference indefinitely, leading to the inode and its related
+ * data being pinned (see zpl_drop_inode()).
+ *
+ * When set to 1, we signal that the dentry should be destroyed immediately and
+ * never cached. This reduces memory usage, at the cost of higher overheads to
+ * lookup a file, as the inode and its underlying data (dnode/dbuf) need to be
+ * reloaded and reinflated.
+ *
+ * Note that userspace does not have direct control over dentry references and
+ * reclaim; rather, this is part of the kernel's caching and reclaim subsystems
+ * (eg vm.vfs_cache_pressure).
+ */
+static int
+zpl_dentry_delete(const struct dentry *dentry)
+{
+	return (zfs_delete_dentry ? 1 : 0);
+}
+
+const struct dentry_operations zpl_dentry_operations = {
+	.d_delete = zpl_dentry_delete,
+};
+
 struct file_system_type zpl_fs_type = {
 	.owner			= THIS_MODULE,
 	.name			= ZFS_DRIVER,
@@ -491,3 +563,10 @@ struct file_system_type zpl_fs_type = {
 	.mount			= zpl_mount,
 	.kill_sb		= zpl_kill_sb,
 };
+
+ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW,
+	"Delete inodes as soon as the last reference is released.");
+
+ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW,
+	"Delete dentries from dentry cache as soon as the last reference is "
+	"released.");
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index bac166fcd89e..fe939150b641 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -21,7 +21,7 @@
  */
 /*
  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
- * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
+ * Copyright (c) 2024, 2025, Rob Norris <robn@despairlabs.com>
  * Copyright (c) 2024, 2025, Klara, Inc.
  */
 
@@ -337,16 +337,14 @@ zvol_discard(zv_request_t *zvr)
 	}
 
 	/*
-	 * Align the request to volume block boundaries when a secure erase is
-	 * not required.  This will prevent dnode_free_range() from zeroing out
-	 * the unaligned parts which is slow (read-modify-write) and useless
-	 * since we are not freeing any space by doing so.
+	 * Align the request to volume block boundaries. This will prevent
+	 * dnode_free_range() from zeroing out the unaligned parts which is
+	 * slow (read-modify-write) and useless since we are not freeing any
+	 * space by doing so.
 	 */
-	if (!io_is_secure_erase(bio, rq)) {
-		start = P2ROUNDUP(start, zv->zv_volblocksize);
-		end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
-		size = end - start;
-	}
+	start = P2ROUNDUP(start, zv->zv_volblocksize);
+	end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
+	size = end - start;
 
 	if (start >= end)
 		goto unlock;
@@ -467,6 +465,24 @@ zvol_read_task(void *arg)
 	zv_request_task_free(task);
 }
 
+/*
+ * Note:
+ *
+ * The kernel uses different enum names for the IO opcode, depending on the
+ * kernel version ('req_opf', 'req_op').  To sidestep this, use macros rather
+ * than inline functions for these checks.
+ */
+/* Should this IO go down the zvol write path? */
+#define	ZVOL_OP_IS_WRITE(op) \
+	(op == REQ_OP_WRITE || \
+	op == REQ_OP_FLUSH || \
+	op == REQ_OP_DISCARD)
+
+/* Is this IO type supported by zvols? */
+#define	ZVOL_OP_IS_SUPPORTED(op) (op == REQ_OP_READ || ZVOL_OP_IS_WRITE(op))
+
+/* Get the IO opcode */
+#define	ZVOL_OP(bio, rq) (bio != NULL ? bio_op(bio) : req_op(rq))
 
 /*
  * Process a BIO or request
@@ -484,7 +500,33 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	uint64_t offset = io_offset(bio, rq);
 	uint64_t size = io_size(bio, rq);
-	int rw = io_data_dir(bio, rq);
+	int rw;
+
+	if (unlikely(!ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)))) {
+		zfs_dbgmsg("Unsupported zvol %s, op=%d, flags=0x%x",
+		    rq != NULL ? "request" : "BIO",
+		    ZVOL_OP(bio, rq),
+		    rq != NULL ? rq->cmd_flags : bio->bi_opf);
+		ASSERT(ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)));
+		zvol_end_io(bio, rq, SET_ERROR(ENOTSUPP));
+		goto out;
+	}
+
+	if (ZVOL_OP_IS_WRITE(ZVOL_OP(bio, rq))) {
+		rw = WRITE;
+	} else {
+		rw = READ;
+	}
+
+	/*
+	 * Sanity check
+	 *
+	 * If we're a BIO, check our rw matches the kernel's
+	 * bio_data_dir(bio) rw.  We need to check because we support fewer
+	 * IO operations, and want to verify that what we think are reads and
+	 * writes from those operations match what the kernel thinks.
+	 */
+	ASSERT(rq != NULL || rw == bio_data_dir(bio));
 
 	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
 		zvol_end_io(bio, rq, SET_ERROR(ENXIO));
@@ -589,7 +631,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 		 * interfaces lack this functionality (they block waiting for
 		 * the i/o to complete).
 		 */
-		if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
+		if (io_is_discard(bio, rq)) {
 			if (force_sync) {
 				zvol_discard(&zvr);
 			} else {
@@ -990,12 +1032,12 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
  * tiny devices.  For devices over 1 Mib a standard head and sector count
  * is used to keep the cylinders count reasonable.
  */
-static int
-zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static inline int
+zvol_getgeo_impl(struct gendisk *disk, struct hd_geometry *geo)
 {
+	zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
 	sector_t sectors;
 
-	zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
 	ASSERT3P(zv, !=, NULL);
 	ASSERT3U(zv->zv_open_count, >, 0);
 
@@ -1015,6 +1057,20 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return (0);
 }
 
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK
+static int
+zvol_getgeo(struct gendisk *disk, struct hd_geometry *geo)
+{
+	return (zvol_getgeo_impl(disk, geo));
+}
+#else
+static int
+zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	return (zvol_getgeo_impl(bdev->bd_disk, geo));
+}
+#endif
+
 /*
  * Why have two separate block_device_operations structs?
  *
@@ -1458,7 +1514,7 @@ zvol_os_remove_minor(zvol_state_t *zv)
 	if (zso->use_blk_mq)
 		blk_mq_free_tag_set(&zso->tag_set);
 
-	ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);
+	ida_free(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);
 
 	kmem_free(zso, sizeof (struct zvol_state_os));
 
@@ -1613,7 +1669,7 @@ zvol_os_create_minor(const char *name)
 	if (zvol_inhibit_dev)
 		return (0);
 
-	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
+	idx = ida_alloc(&zvol_ida, kmem_flags_convert(KM_SLEEP));
 	if (idx < 0)
 		return (SET_ERROR(-idx));
 	minor = idx << ZVOL_MINOR_BITS;
@@ -1621,7 +1677,7 @@ zvol_os_create_minor(const char *name)
 		/* too many partitions can cause an overflow */
 		zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
 		    name, minor, MINOR(minor));
-		ida_simple_remove(&zvol_ida, idx);
+		ida_free(&zvol_ida, idx);
 		return (SET_ERROR(EINVAL));
 	}
 
@@ -1629,7 +1685,7 @@ zvol_os_create_minor(const char *name)
 	if (zv) {
 		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 		mutex_exit(&zv->zv_state_lock);
-		ida_simple_remove(&zvol_ida, idx);
+		ida_free(&zvol_ida, idx);
 		return (SET_ERROR(EEXIST));
 	}
 
@@ -1729,7 +1785,7 @@ out_doi:
 		rw_exit(&zvol_state_lock);
 		error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
 	} else {
-		ida_simple_remove(&zvol_ida, idx);
+		ida_free(&zvol_ida, idx);
 	}
 
 	return (error);