43 files changed, 1677 insertions, 708 deletions
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c
index 6d198fad5203..ae6e36d988c2 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c
@@ -160,7 +160,7 @@ kmem_cache_create(const char *name, size_t bufsize, size_t align,
 {
 	kmem_cache_t *cache;
 
-	ASSERT3P(vmp, ==, NULL);
+	ASSERT0P(vmp);
 
 	cache = kmem_alloc(sizeof (*cache), KM_SLEEP);
 	strlcpy(cache->kc_name, name, sizeof (cache->kc_name));
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
index f9125a067cd1..3f360d167b17 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
@@ -101,6 +101,15 @@ spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
 	va_end(ap);
 }
 
+/*
+ * Check if the current thread is a memory reclaim thread.
+ * Returns true if curproc is pageproc (FreeBSD's page daemon).
+ */
+int
+current_is_reclaim_thread(void)
+{
+	return (curproc == pageproc);
+}
 
 SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY,
     opensolaris_utsname_init, NULL);
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c
index 9da633c2b1be..3c2d39b20c09 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c
@@ -256,7 +256,7 @@ sysevent_worker(void *arg __unused)
 	 * free `ze`, so just inline the free() here -- events have already
 	 * been drained.
 	 */
-	VERIFY3P(ze->ze_zevent, ==, NULL);
+	VERIFY0P(ze->ze_zevent);
 	kmem_free(ze, sizeof (zfs_zevent_t));
 
 	kthread_exit();
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
index 733c2bd07ebb..9d5f025423a1 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
@@ -43,6 +43,7 @@
 const int zfs_vm_pagerret_bad = VM_PAGER_BAD;
 const int zfs_vm_pagerret_error = VM_PAGER_ERROR;
 const int zfs_vm_pagerret_ok = VM_PAGER_OK;
+const int zfs_vm_pagerret_pend = VM_PAGER_PEND;
 const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC;
 const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL;
 
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
index fbf67f6a14a8..4bf487cdc469 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
@@ -507,7 +507,7 @@ abd_iter_at_end(struct abd_iter *aiter)
 void
 abd_iter_advance(struct abd_iter *aiter, size_t amount)
 {
-	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0P(aiter->iter_mapaddr);
 	ASSERT0(aiter->iter_mapsize);
 
 	/* There's nothing left to advance to, so do nothing */
@@ -526,7 +526,7 @@ abd_iter_map(struct abd_iter *aiter)
 {
 	void *paddr;
 
-	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0P(aiter->iter_mapaddr);
 	ASSERT0(aiter->iter_mapsize);
 
 	/* There's nothing left to iterate over, so do nothing */
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c
index 364bbfc60abd..26cc7981bfcd 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c
@@ -156,7 +156,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
 	if (dbp[0]->db_offset != 0 || numbufs > 1) {
 		for (i = 0; i < numbufs; i++) {
 			ASSERT(ISP2(dbp[i]->db_size));
-			ASSERT3U((dbp[i]->db_offset % dbp[i]->db_size), ==, 0);
+			ASSERT0((dbp[i]->db_offset % dbp[i]->db_size));
 			ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
 		}
 	}
@@ -175,7 +175,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
 			vm_page_sunbusy(m);
 			break;
 		}
-		ASSERT3U(m->dirty, ==, 0);
+		ASSERT0(m->dirty);
 		ASSERT(!pmap_page_is_write_mapped(m));
 
 		ASSERT3U(db->db_size, >, PAGE_SIZE);
@@ -201,7 +201,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
 			if (m != bogus_page) {
 				vm_page_assert_xbusied(m);
 				ASSERT(vm_page_none_valid(m));
-				ASSERT3U(m->dirty, ==, 0);
+				ASSERT0(m->dirty);
 				ASSERT(!pmap_page_is_write_mapped(m));
 				va = zfs_map_page(m, &sf);
 			}
@@ -295,7 +295,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
 			vm_page_sunbusy(m);
 			break;
 		}
-		ASSERT3U(m->dirty, ==, 0);
+		ASSERT0(m->dirty);
 		ASSERT(!pmap_page_is_write_mapped(m));
 
 		ASSERT3U(db->db_size, >, PAGE_SIZE);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c
index c114db14a916..b218c0da8125 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c
@@ -112,7 +112,6 @@ static int zfs__fini(void);
 static void zfs_shutdown(void *, int);
 
 static eventhandler_tag zfs_shutdown_event_tag;
-static eventhandler_tag zfs_mountroot_event_tag;
 
 #define	ZFS_MIN_KSTACK_PAGES 4
 
@@ -311,9 +310,6 @@ zfs_modevent(module_t mod, int type, void *unused __unused)
 			zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
 			    shutdown_post_sync, zfs_shutdown, NULL,
 			    SHUTDOWN_PRI_FIRST);
-			zfs_mountroot_event_tag = EVENTHANDLER_REGISTER(
-			    mountroot, spa_boot_init, NULL,
-			    SI_ORDER_ANY);
 		}
 		return (err);
 	case MOD_UNLOAD:
@@ -322,9 +318,6 @@ zfs_modevent(module_t mod, int type, void *unused __unused)
 			if (zfs_shutdown_event_tag != NULL)
 				EVENTHANDLER_DEREGISTER(shutdown_post_sync,
 				    zfs_shutdown_event_tag);
-			if (zfs_mountroot_event_tag != NULL)
-				EVENTHANDLER_DEREGISTER(mountroot,
-				    zfs_mountroot_event_tag);
 		}
 		return (err);
 	case MOD_SHUTDOWN:
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
index ace2360c032d..ebc2c0eeb6d2 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
@@ -163,6 +163,13 @@ param_set_arc_int(SYSCTL_HANDLER_ARGS)
 	return (0);
 }
 
+static void
+warn_deprecated_sysctl(const char *old, const char *new)
+{
+	printf("WARNING: sysctl vfs.zfs.%s is deprecated. Use vfs.zfs.%s instead.\n",
+	    old, new);
+}
+
 int
 param_set_arc_max(SYSCTL_HANDLER_ARGS)
 {
@@ -185,12 +192,15 @@ param_set_arc_max(SYSCTL_HANDLER_ARGS)
 	if (val != 0)
 		zfs_arc_max = arc_c_max;
 
+	if (arg2 != 0)
+		warn_deprecated_sysctl("arc_max", "arc.max");
+
 	return (0);
 }
 
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max,
 	CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
-	NULL, 0, param_set_arc_max, "LU",
+	NULL, 1, param_set_arc_max, "LU",
 	"Maximum ARC size in bytes (LEGACY)");
 
 int
@@ -214,12 +224,15 @@ param_set_arc_min(SYSCTL_HANDLER_ARGS)
 	if (val != 0)
 		zfs_arc_min = arc_c_min;
 
+	if (arg2 != 0)
+		warn_deprecated_sysctl("arc_min", "arc.min");
+
 	return (0);
 }
 
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min,
 	CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
-	NULL, 0, param_set_arc_min, "LU",
+	NULL, 1, param_set_arc_min, "LU",
 	"Minimum ARC size in bytes (LEGACY)");
 
 extern uint_t zfs_arc_free_target;
@@ -242,6 +255,9 @@ param_set_arc_free_target(SYSCTL_HANDLER_ARGS)
 
 	zfs_arc_free_target = val;
 
+	if (arg2 != 0)
+		warn_deprecated_sysctl("arc_free_target", "arc.free_target");
+
 	return (0);
 }
 
@@ -251,7 +267,7 @@ param_set_arc_free_target(SYSCTL_HANDLER_ARGS)
  */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
 	CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
-	NULL, 0, param_set_arc_free_target, "IU",
+	NULL, 1, param_set_arc_free_target, "IU",
 	"Desired number of free pages below which ARC triggers reclaim"
 	" (LEGACY)");
 
@@ -270,12 +286,15 @@ param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
 
 	arc_no_grow_shift = val;
 
+	if (arg2 != 0)
+		warn_deprecated_sysctl("arc_no_grow_shift", "arc.no_grow_shift");
+
 	return (0);
 }
 
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift,
 	CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
-	NULL, 0, param_set_arc_no_grow_shift, "I",
+	NULL, 1, param_set_arc_no_grow_shift, "I",
 	"log2(fraction of ARC which must be free to allow growing) (LEGACY)");
 
 extern uint64_t l2arc_write_max;
@@ -746,12 +765,15 @@ param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS)
 
 	zfs_vdev_min_auto_ashift = val;
 
+	if (arg2 != 0)
+		warn_deprecated_sysctl("min_auto_ashift",
+		    "vdev.min_auto_ashift");
+
 	return (0);
 }
 
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
-	CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
-	&zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift),
+	CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1,
 	param_set_min_auto_ashift, "IU",
 	"Min ashift used when creating new top-level vdev. (LEGACY)");
 
@@ -771,12 +793,15 @@ param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS)
 
 	zfs_vdev_max_auto_ashift = val;
 
+	if (arg2 != 0)
+		warn_deprecated_sysctl("max_auto_ashift",
+		    "vdev.max_auto_ashift");
+
 	return (0);
 }
 
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
-	CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
-	&zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift),
+	CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1,
 	param_set_max_auto_ashift, "IU",
 	"Max ashift used when optimizing for logical -> physical sector size on"
 	" new top-level vdevs. (LEGACY)");
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
index c8ab7cc7cf8e..bbd1dafc69be 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
@@ -1236,7 +1236,7 @@ vdev_geom_io_done(zio_t *zio)
 	struct bio *bp = zio->io_bio;
 
 	if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
-		ASSERT3P(bp, ==, NULL);
+		ASSERT0P(bp);
 		return;
 	}
 
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
index 334264f6da2f..cb5787269db2 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
@@ -1175,7 +1175,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
 	int			count = 0;
 	zfs_acl_phys_t		acl_phys;
 
-	if (zp->z_zfsvfs->z_replay == B_FALSE) {
+	if (ZTOV(zp) != NULL && zp->z_zfsvfs->z_replay == B_FALSE) {
 		ASSERT_VOP_IN_SEQC(ZTOV(zp));
 	}
 
@@ -1632,7 +1632,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
 		if (zfsvfs->z_replay == B_FALSE)
 			ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
 	} else
-		ASSERT3P(dzp->z_vnode, ==, NULL);
+		ASSERT0P(dzp->z_vnode);
 	memset(acl_ids, 0, sizeof (zfs_acl_ids_t));
 	acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
 
@@ -2014,7 +2014,7 @@ top:
 
 	error = zfs_aclset_common(zp, aclp, cr, tx);
 	ASSERT0(error);
-	ASSERT3P(zp->z_acl_cached, ==, NULL);
+	ASSERT0P(zp->z_acl_cached);
 	zp->z_acl_cached = aclp;
 
 	if (fuid_dirtied)
@@ -2357,10 +2357,42 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr,
 	 * In FreeBSD, we don't care about permissions of individual ADS.
 	 * Note that not checking them is not just an optimization - without
 	 * this shortcut, EA operations may bogusly fail with EACCES.
+	 *
+	 * If this is a named attribute lookup, do the checks.
 	 */
+#if __FreeBSD_version >= 1500040
+	if ((zp->z_pflags & ZFS_XATTR) && (flags & V_NAMEDATTR) == 0)
+#else
 	if (zp->z_pflags & ZFS_XATTR)
+#endif
 		return (0);
 
+	/*
+	 * If a named attribute directory then validate against base file
+	 */
+	if (is_attr) {
+		if ((error = zfs_zget(ZTOZSB(zp),
+		    zp->z_xattr_parent, &xzp)) != 0) {
+			return (error);
+		}
+
+		check_zp = xzp;
+
+		/*
+		 * fixup mode to map to xattr perms
+		 */
+
+		if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+			mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+			mode |= ACE_WRITE_NAMED_ATTRS;
+		}
+
+		if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+			mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+			mode |= ACE_READ_NAMED_ATTRS;
+		}
+	}
+
 	owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
 
 	/*
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
index 8d0ff9b25e30..d0a9c662e6f0 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
@@ -357,7 +357,7 @@ zfsctl_create(zfsvfs_t *zfsvfs)
 	vnode_t *rvp;
 	uint64_t crtime[2];
 
-	ASSERT3P(zfsvfs->z_ctldir, ==, NULL);
+	ASSERT0P(zfsvfs->z_ctldir);
 
 	snapdir = sfs_alloc_node(sizeof (*snapdir), "snapshot", ZFSCTL_INO_ROOT,
 	    ZFSCTL_INO_SNAPDIR);
@@ -494,7 +494,7 @@ zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
 
 	vap->va_uid = 0;
 	vap->va_gid = 0;
-	vap->va_rdev = 0;
+	vap->va_rdev = NODEV;
 	/*
 	 * We are a purely virtual object, so we have no
 	 * blocksize or allocated blocks.
@@ -688,6 +688,8 @@ zfsctl_root_readdir(struct vop_readdir_args *ap)
 	 * count to return is 0.
 	 */
 	if (zfs_uio_offset(&uio) == 3 * sizeof (entry)) {
+		if (eofp != NULL)
+			*eofp = 1;
 		return (0);
 	}
 
@@ -760,8 +762,7 @@ zfsctl_common_pathconf(struct vop_pathconf_args *ap)
 		return (0);
 
 	case _PC_MIN_HOLE_SIZE:
-		*ap->a_retval = (int)SPA_MINBLOCKSIZE;
-		return (0);
+		return (EINVAL);
 
 	case _PC_ACL_EXTENDED:
 		*ap->a_retval = 0;
@@ -1367,7 +1368,7 @@ zfsctl_snapshot_unmount(const char *snapname, int flags __unused)
 
 	int err = getzfsvfs(snapname, &zfsvfs);
 	if (err != 0) {
-		ASSERT3P(zfsvfs, ==, NULL);
+		ASSERT0P(zfsvfs);
 		return (0);
 	}
 	vfsp = zfsvfs->z_vfs;
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c
index 191df832d726..75ba2ea0cb9e 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c
@@ -273,7 +273,7 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 
 	ASSERT(zp->z_unlinked);
-	ASSERT3U(zp->z_links, ==, 0);
+	ASSERT0(zp->z_links);
 
 	VERIFY0(zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
 
@@ -437,7 +437,7 @@ zfs_rmnode(znode_t *zp)
 	uint64_t	count;
 	int		error;
 
-	ASSERT3U(zp->z_links, ==, 0);
+	ASSERT0(zp->z_links);
 	if (zfsvfs->z_replay == B_FALSE)
 		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
 
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
index 21e5f7938f9f..ca13569a1235 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
@@ -164,8 +164,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
 
 int
 zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
-    ssize_t *resid)
+    uint8_t ashift, ssize_t *resid)
 {
+	(void) ashift;
 	return (zfs_file_write_impl(fp, buf, count, &off, resid));
 }
 
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
index 493ac9f69ad4..79b784288911 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
@@ -455,8 +455,13 @@ zfs_sync(vfs_t *vfsp, int waitfor)
 			return (0);
 		}
 
-		if (zfsvfs->z_log != NULL)
-			zil_commit(zfsvfs->z_log, 0);
+		if (zfsvfs->z_log != NULL) {
+			error = zil_commit(zfsvfs->z_log, 0);
+			if (error != 0) {
+				zfs_exit(zfsvfs, FTAG);
+				return (error);
+			}
+		}
 
 		zfs_exit(zfsvfs, FTAG);
 	} else {
@@ -1091,7 +1096,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 	if (mounting) {
 		boolean_t readonly;
 
-		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
+		ASSERT0P(zfsvfs->z_kstat.dk_kstats);
 		error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
 		if (error)
 			return (error);
@@ -1209,6 +1214,8 @@ zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 }
 
+extern int zfs_xattr_compat;
+
 static int
 zfs_domount(vfs_t *vfsp, char *osname)
 {
@@ -1289,6 +1296,16 @@ zfs_domount(vfs_t *vfsp, char *osname)
 			goto out;
 	}
 
+#if __FreeBSD_version >= 1500040
+	/*
+	 * Named attributes can only work if the xattr property is set to
+	 * on/dir and not sa.  Also, zfs_xattr_compat must be set.
+	 */
+	if ((zfsvfs->z_flags & ZSB_XATTR) != 0 && !zfsvfs->z_xattr_sa &&
+	    zfs_xattr_compat)
+		vfsp->mnt_flag |= MNT_NAMEDATTR;
+#endif
+
 	vfs_mountedfrom(vfsp, osname);
 
 	if (!zfsvfs->z_issnap)
@@ -1812,6 +1829,14 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
 		err = vn_lock(*vpp, flags);
 		if (err != 0)
 			vrele(*vpp);
+#if __FreeBSD_version >= 1500040
+		else if ((zp->z_pflags & ZFS_XATTR) != 0) {
+			if ((*vpp)->v_type == VDIR)
+				vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
+			else
+				vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR);
+		}
+#endif
 	}
 	if (err != 0)
 		*vpp = NULL;
@@ -1964,9 +1989,17 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
 	*vpp = ZTOV(zp);
 	zfs_exit(zfsvfs, FTAG);
 	err = vn_lock(*vpp, flags);
-	if (err == 0)
+	if (err == 0) {
 		vnode_create_vobject(*vpp, zp->z_size, curthread);
-	else
+#if __FreeBSD_version >= 1500040
+		if ((zp->z_pflags & ZFS_XATTR) != 0) {
+			if ((*vpp)->v_type == VDIR)
+				vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
+			else
+				vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR);
+		}
+#endif
+	} else
 		*vpp = NULL;
 	return (err);
 }
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
index 8a5006c488f3..f34a2fd37a77 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -25,6 +25,7 @@
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2025, Klara, Inc.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
@@ -60,6 +61,7 @@
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/dbuf.h>
@@ -114,6 +116,8 @@ typedef uint64_t cookie_t;
 typedef ulong_t cookie_t;
 #endif
 
+static int zfs_check_attrname(const char *name);
+
 /*
  * Programming rules.
  *
@@ -385,7 +389,9 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
 		error = vn_lock(vp, LK_EXCLUSIVE);
 		if (error)
 			return (error);
+		vn_seqc_write_begin(vp);
 		error = zfs_ioctl_setxattr(vp, fsx, cred);
+		vn_seqc_write_end(vp);
 		VOP_UNLOCK(vp);
 		return (error);
 	}
@@ -813,7 +819,12 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp,
 		/*
 		 * Do we have permission to get into attribute directory?
 		 */
-		error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr, NULL);
+		if (flags & LOOKUP_NAMED_ATTR)
+			error = zfs_zaccess(zp, ACE_EXECUTE, V_NAMEDATTR,
+			    B_FALSE, cr, NULL);
+		else
+			error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr,
+			    NULL);
 		if (error) {
 			vrele(ZTOV(zp));
 		}
@@ -1093,7 +1104,7 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode,
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
-	ASSERT3P(zp, ==, NULL);
+	ASSERT0P(zp);
 
 	/*
 	 * Create a new file object and update the directory
@@ -1185,8 +1196,8 @@ out:
 		*zpp = zp;
 	}
 
-	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-		zil_commit(zilog, 0);
+	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		error = zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
@@ -1315,9 +1326,8 @@ out:
 	if (xzp)
 		vrele(ZTOV(xzp));
 
-	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-		zil_commit(zilog, 0);
-
+	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		error = zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
@@ -1474,7 +1484,7 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
-	ASSERT3P(zp, ==, NULL);
+	ASSERT0P(zp);
 
 	if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
 	    mnt_ns))) {
@@ -1548,8 +1558,8 @@ out:
 
 	getnewvnode_drop_reserve();
 
-	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-		zil_commit(zilog, 0);
+	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		error = zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
@@ -1629,8 +1639,8 @@ zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
 	if (zfsvfs->z_use_namecache)
 		cache_vop_rmdir(dvp, vp);
 out:
-	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-		zil_commit(zilog, 0);
+	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		error = zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
@@ -1728,7 +1738,7 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
 	/*
 	 * Quit if directory has been removed (posix)
 	 */
-	if ((*eofp = zp->z_unlinked) != 0) {
+	if ((*eofp = (zp->z_unlinked != 0)) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (0);
 	}
@@ -2006,7 +2016,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		vap->va_rdev = zfs_cmpldev(rdev);
 	else
-		vap->va_rdev = 0;
+		vap->va_rdev = NODEV;
 	vap->va_gen = zp->z_gen;
 	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
 	vap->va_filerev = zp->z_seq;
@@ -2196,6 +2206,7 @@ zfs_setattr_dir(znode_t *dzp)
 		if (err)
 			break;
 
+		vn_seqc_write_begin(ZTOV(zp));
 		mutex_enter(&dzp->z_lock);
 
 		if (zp->z_uid != dzp->z_uid) {
@@ -2245,6 +2256,7 @@ sa_add_projid_err:
 			dmu_tx_abort(tx);
 		}
 		tx = NULL;
+		vn_seqc_write_end(ZTOV(zp));
 		if (err != 0 && err != ENOENT)
 			break;
 
@@ -3001,8 +3013,8 @@ out:
 	}
 
 out2:
-	if (os->os_sync == ZFS_SYNC_ALWAYS)
-		zil_commit(zilog, 0);
+	if (err == 0 && os->os_sync == ZFS_SYNC_ALWAYS)
+		err = zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (err);
@@ -3531,7 +3543,7 @@ out_seq:
 
 out:
 	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-		zil_commit(zilog, 0);
+		error = zil_commit(zilog, 0);
 	zfs_exit(zfsvfs, FTAG);
 
 	return (error);
@@ -3723,7 +3735,7 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
 		*zpp = zp;
 
 		if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-			zil_commit(zilog, 0);
+			error = zil_commit(zilog, 0);
 	}
 
 	zfs_exit(zfsvfs, FTAG);
@@ -3913,8 +3925,8 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr,
 		vnevent_link(ZTOV(szp), ct);
 	}
 
-	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-		zil_commit(zilog, 0);
+	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		error = zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
@@ -4104,6 +4116,7 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
 {
 	znode_t *zp;
 	zfsvfs_t *zfsvfs;
+	uint_t blksize, iosize;
 	int error;
 
 	switch (cmd) {
@@ -4115,8 +4128,20 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
 		*valp = 64;
 		return (0);
 	case _PC_MIN_HOLE_SIZE:
-		*valp = (int)SPA_MINBLOCKSIZE;
-		return (0);
+		iosize = vp->v_mount->mnt_stat.f_iosize;
+		if (vp->v_type == VREG) {
+			zp = VTOZ(vp);
+			blksize = zp->z_blksz;
+			if (zp->z_size <= blksize)
+				blksize = MAX(blksize, iosize);
+			*valp = (int)blksize;
+			return (0);
+		}
+		if (vp->v_type == VDIR) {
+			*valp = (int)iosize;
+			return (0);
+		}
+		return (EINVAL);
 	case _PC_ACL_EXTENDED:
 #if 0		/* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */
 		zp = VTOZ(vp);
@@ -4198,8 +4223,20 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
 
 			zfs_vmobject_wlock(object);
 			(void) vm_page_grab_pages(object, OFF_TO_IDX(start),
-			    VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_ZERO,
+			    VM_ALLOC_NORMAL | VM_ALLOC_WAITOK,
 			    ma, count);
+			if (!vm_page_all_valid(ma[count - 1])) {
+				/*
+				 * Later in this function, we copy DMU data to
+				 * invalid pages only. The last page may not be
+				 * entirely filled though, if the file does not
+				 * end on a page boundary. Therefore, we zero
+				 * that last page here to make sure it does not
+				 * contain garbage after the end of file.
+				 */
+				ASSERT(vm_page_none_valid(ma[count - 1]));
+				vm_page_zero_invalid(ma[count - 1], FALSE);
+			}
 			zfs_vmobject_wunlock(object);
 		}
 		if (blksz == zp->z_blksz)
@@ -4299,6 +4336,43 @@ zfs_freebsd_getpages(struct vop_getpages_args *ap)
 	    ap->a_rahead));
 }
 
+typedef struct {
+	uint_t		pca_npages;
+	vm_page_t	pca_pages[];
+} putpage_commit_arg_t;
+
+static void
+zfs_putpage_commit_cb(void *arg, int err)
+{
+	putpage_commit_arg_t *pca = arg;
+	vm_object_t object = pca->pca_pages[0]->object;
+
+	zfs_vmobject_wlock(object);
+
+	for (uint_t i = 0; i < pca->pca_npages; i++) {
+		vm_page_t pp = pca->pca_pages[i];
+
+		if (err == 0) {
+			/*
+			 * Writeback succeeded, so undirty the page. If it
+			 * fails, we leave it in the same state it was. That's
+			 * most likely dirty, so it will get tried again some
+			 * other time.
+			 */
+			vm_page_undirty(pp);
+		}
+
+		vm_page_sunbusy(pp);
+	}
+
+	vm_object_pip_wakeupn(object, pca->pca_npages);
+
+	zfs_vmobject_wunlock(object);
+
+	kmem_free(pca,
+	    offsetof(putpage_commit_arg_t, pca_pages[pca->pca_npages]));
+}
+
 static int
 zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
     int *rtvals)
@@ -4400,10 +4474,12 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
 	}
 
 	if (zp->z_blksz < PAGE_SIZE) {
-		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
-			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
+		vm_ooffset_t woff = off;
+		size_t wlen = len;
+		for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) {
+			tocopy = MIN(PAGE_SIZE, wlen);
 			va = zfs_map_page(ma[i], &sf);
-			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
+			dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va, tx);
 			zfs_unmap_page(sf);
 		}
 	} else {
@@ -4424,19 +4500,48 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
 		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		ASSERT0(err);
-		/*
-		 * XXX we should be passing a callback to undirty
-		 * but that would make the locking messier
-		 */
-		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
-		    len, commit, B_FALSE, NULL, NULL);
 
-		zfs_vmobject_wlock(object);
-		for (i = 0; i < ncount; i++) {
-			rtvals[i] = zfs_vm_pagerret_ok;
-			vm_page_undirty(ma[i]);
+		if (commit) {
+			/*
+			 * Caller requested that we commit immediately. We set
+			 * a callback on the log entry, to be called once its
+			 * on disk after the call to zil_commit() below. The
+			 * pages will be undirtied and unbusied there.
+			 */
+			putpage_commit_arg_t *pca = kmem_alloc(
+			    offsetof(putpage_commit_arg_t, pca_pages[ncount]),
+			    KM_SLEEP);
+			pca->pca_npages = ncount;
+			memcpy(pca->pca_pages, ma, sizeof (vm_page_t) * ncount);
+
+			zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
+			    B_TRUE, B_FALSE, zfs_putpage_commit_cb, pca);
+
+			for (i = 0; i < ncount; i++)
+				rtvals[i] = zfs_vm_pagerret_pend;
+		} else {
+			/*
+			 * Caller just wants the page written back somewhere,
+			 * but doesn't need it committed yet. We've already
+			 * written it back to the DMU, so we just need to put
+			 * it on the async log, then undirty the page and
+			 * return.
+			 *
+			 * We cannot use a callback here, because it would keep
+			 * the page busy (locked) until it is eventually
+			 * written down at txg sync.
+			 */
+			zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
+			    B_FALSE, B_FALSE, NULL, NULL);
+
+			zfs_vmobject_wlock(object);
+			for (i = 0; i < ncount; i++) {
+				rtvals[i] = zfs_vm_pagerret_ok;
+				vm_page_undirty(ma[i]);
+			}
+			zfs_vmobject_wunlock(object);
 		}
-		zfs_vmobject_wunlock(object);
+
 		VM_CNT_INC(v_vnodeout);
 		VM_CNT_ADD(v_vnodepgsout, ncount);
 	}
@@ -4444,8 +4549,13 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
 
 out:
 	zfs_rangelock_exit(lr);
-	if (commit)
-		zil_commit(zfsvfs->z_log, zp->z_id);
+	if (commit) {
+		err = zil_commit(zfsvfs->z_log, zp->z_id);
+		if (err != 0) {
+			zfs_exit(zfsvfs, FTAG);
+			return (err);
+		}
+	}
 
 	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, len);
 
@@ -4707,8 +4817,16 @@ zfs_freebsd_access(struct vop_access_args *ap)
 	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
 	 */
 	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
-	if (accmode != 0)
-		error = zfs_access(zp, accmode, 0, ap->a_cred);
+	if (accmode != 0) {
+#if __FreeBSD_version >= 1500040
+		/* For named attributes, do the checks. */
+		if ((vn_irflag_read(vp) & VIRF_NAMEDATTR) != 0)
+			error = zfs_access(zp, accmode, V_NAMEDATTR,
+			    ap->a_cred);
+		else
+#endif
+			error = zfs_access(zp, accmode, 0, ap->a_cred);
+	}
 
 	/*
 	 * VADMIN has to be handled by vaccess().
@@ -4741,6 +4859,190 @@ struct vop_lookup_args {
 };
 #endif
 
+#if __FreeBSD_version >= 1500040
+static int
+zfs_lookup_nameddir(struct vnode *dvp, struct componentname *cnp,
+    struct vnode **vpp)
+{
+	struct vnode *xvp;
+	int error, flags;
+
+	*vpp = NULL;
+	flags = LOOKUP_XATTR | LOOKUP_NAMED_ATTR;
+	if ((cnp->cn_flags & CREATENAMED) != 0)
+		flags |= CREATE_XATTR_DIR;
+	error = zfs_lookup(dvp, NULL, &xvp, NULL, 0, cnp->cn_cred, flags,
+	    B_FALSE);
+	if (error == 0) {
+		if ((cnp->cn_flags & LOCKLEAF) != 0)
+			error = vn_lock(xvp, cnp->cn_lkflags);
+		if (error == 0) {
+			vn_irflag_set_cond(xvp, VIRF_NAMEDDIR);
+			*vpp = xvp;
+		} else {
+			vrele(xvp);
+		}
+	}
+	return (error);
+}
+
+static ssize_t
+zfs_readdir_named(struct vnode *vp, char *buf, ssize_t blen, off_t *offp,
+    int *eofflagp, struct ucred *cred, struct thread *td)
+{
+	struct uio io;
+	struct iovec iv;
+	zfs_uio_t uio;
+	int error;
+
+	io.uio_offset = *offp;
+	io.uio_segflg = UIO_SYSSPACE;
+	io.uio_rw = UIO_READ;
+	io.uio_td = td;
+	iv.iov_base = buf;
+	iv.iov_len = blen;
+	io.uio_iov = &iv;
+	io.uio_iovcnt = 1;
+	io.uio_resid = blen;
+	zfs_uio_init(&uio, &io);
+	error = zfs_readdir(vp, &uio, cred, eofflagp, NULL, NULL);
+	if (error != 0)
+		return (-1);
+	*offp = io.uio_offset;
+	return (blen - io.uio_resid);
+}
+
+static bool
+zfs_has_namedattr(struct vnode *vp, struct ucred *cred)
+{
+	struct componentname cn;
+	struct vnode *xvp;
+	struct dirent *dp;
+	off_t offs;
+	ssize_t rsize;
+	char *buf, *cp, *endcp;
+	int eofflag, error;
+	bool ret;
+
+	MNT_ILOCK(vp->v_mount);
+	if ((vp->v_mount->mnt_flag & MNT_NAMEDATTR) == 0) {
+		MNT_IUNLOCK(vp->v_mount);
+		return (false);
+	}
+	MNT_IUNLOCK(vp->v_mount);
+
+	/* Now see if a named attribute directory exists. */
+	cn.cn_flags = LOCKLEAF;
+	cn.cn_lkflags = LK_SHARED;
+	cn.cn_cred = cred;
+	error = zfs_lookup_nameddir(vp, &cn, &xvp);
+	if (error != 0)
+		return (false);
+
+	/* It exists, so see if there is any entry other than "." and "..". */
+	buf = malloc(DEV_BSIZE, M_TEMP, M_WAITOK);
+	ret = false;
+	offs = 0;
+	do {
+		rsize = zfs_readdir_named(xvp, buf, DEV_BSIZE, &offs, &eofflag,
+		    cred, curthread);
+		if (rsize <= 0)
+			break;
+		cp = buf;
+		endcp = &buf[rsize];
+		while (cp < endcp) {
+			dp = (struct dirent *)cp;
+			if (dp->d_fileno != 0 && (dp->d_type == DT_REG ||
+			    dp->d_type == DT_UNKNOWN) &&
+			    !ZFS_XA_NS_PREFIX_FORBIDDEN(dp->d_name) &&
+			    ((dp->d_namlen == 1 && dp->d_name[0] != '.') ||
+			    (dp->d_namlen == 2 && (dp->d_name[0] != '.' ||
+			    dp->d_name[1] != '.')) || dp->d_namlen > 2)) {
+				ret = true;
+				break;
+			}
+			cp += dp->d_reclen;
+		}
+	} while (!ret && rsize > 0 && eofflag == 0);
+	vput(xvp);
+	free(buf, M_TEMP);
+	return (ret);
+}
+
+static int
+zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
+{
+	struct componentname *cnp = ap->a_cnp;
+	char nm[NAME_MAX + 1];
+	int error;
+	struct vnode **vpp = ap->a_vpp, *dvp = ap->a_dvp, *xvp;
+	bool is_nameddir, needs_nameddir, opennamed = false;
+
+	/*
+	 * These variables are used to handle the named attribute cases:
+	 * opennamed - Is true when this is a call from open with O_NAMEDATTR
+	 *    specified and it is the last component.
+	 * is_nameddir - Is true when the directory is a named attribute dir.
+	 * needs_nameddir - Is set when the lookup needs to look for/create
+	 *    a named attribute directory.  It is only set when is_nameddir
+	 *    is_nameddir is false and opennamed is true.
+	 * xvp - Is the directory that the lookup needs to be done in.
+	 *    Usually dvp, unless needs_nameddir is true where it is the
+	 *    result of the first non-named directory lookup.
+	 * Note that name caching must be disabled for named attribute
+	 * handling.
+	 */
+	needs_nameddir = false;
+	xvp = dvp;
+	opennamed = (cnp->cn_flags & (OPENNAMED | ISLASTCN)) ==
+	    (OPENNAMED | ISLASTCN);
+	is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0;
+	if (is_nameddir && (cnp->cn_flags & ISLASTCN) == 0)
+		return (ENOATTR);
+	if (opennamed && !is_nameddir && (cnp->cn_flags & ISDOTDOT) != 0)
+		return (ENOATTR);
+	if (opennamed || is_nameddir)
+		cnp->cn_flags &= ~MAKEENTRY;
+	if (opennamed && !is_nameddir)
+		needs_nameddir = true;
+	ASSERT3U(cnp->cn_namelen, <, sizeof (nm));
+	error = 0;
+	*vpp = NULL;
+	if (needs_nameddir) {
+		if (VOP_ISLOCKED(dvp) != LK_EXCLUSIVE)
+			vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+		error = zfs_lookup_nameddir(dvp, cnp, &xvp);
+		if (error == 0)
+			is_nameddir = true;
+	}
+	if (error == 0) {
+		if (!needs_nameddir || cnp->cn_namelen != 1 ||
+		    *cnp->cn_nameptr != '.') {
+			strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1,
+			    sizeof (nm)));
+			error = zfs_lookup(xvp, nm, vpp, cnp, cnp->cn_nameiop,
+			    cnp->cn_cred, 0, cached);
+			if (is_nameddir && error == 0 &&
+			    (cnp->cn_namelen != 1 || *cnp->cn_nameptr != '.') &&
+			    (cnp->cn_flags & ISDOTDOT) == 0) {
+				if ((*vpp)->v_type == VDIR)
+					vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
+				else
+					vn_irflag_set_cond(*vpp,
+					    VIRF_NAMEDATTR);
+			}
+			if (needs_nameddir && xvp != *vpp)
+				vput(xvp);
+		} else {
+			/*
+			 * Lookup of "." when a named attribute dir is needed.
+			 */
+			*vpp = xvp;
+		}
+	}
+	return (error);
+}
+#else
 static int
 zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
 {
@@ -4753,6 +5055,7 @@ zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
 	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
 	    cnp->cn_cred, 0, cached));
 }
+#endif
 
 static int
 zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
@@ -4775,7 +5078,11 @@ zfs_cache_lookup(struct vop_lookup_args *ap)
 	zfsvfs_t *zfsvfs;
 
 	zfsvfs = ap->a_dvp->v_mount->mnt_data;
+#if __FreeBSD_version >= 1500040
+	if (zfsvfs->z_use_namecache && (ap->a_cnp->cn_flags & OPENNAMED) == 0)
+#else
 	if (zfsvfs->z_use_namecache)
+#endif
 		return (vfs_cache_lookup(ap));
 	else
 		return (zfs_freebsd_lookup(ap, B_FALSE));
@@ -4798,6 +5105,11 @@ zfs_freebsd_create(struct vop_create_args *ap)
 	vattr_t *vap = ap->a_vap;
 	znode_t *zp = NULL;
 	int rc, mode;
+	struct vnode *dvp = ap->a_dvp;
+#if __FreeBSD_version >= 1500040
+	struct vnode *xvp;
+	bool is_nameddir;
+#endif
 
 #if __FreeBSD_version < 1400068
 	ASSERT(cnp->cn_flags & SAVENAME);
@@ -4808,10 +5120,36 @@ zfs_freebsd_create(struct vop_create_args *ap)
 	zfsvfs = ap->a_dvp->v_mount->mnt_data;
 	*ap->a_vpp = NULL;
 
-	rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, 0, mode,
-	    &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL);
+	rc = 0;
+#if __FreeBSD_version >= 1500040
+	xvp = NULL;
+	is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0;
+	if (!is_nameddir && (cnp->cn_flags & OPENNAMED) != 0) {
+		/* Needs a named attribute directory. */
+		rc = zfs_lookup_nameddir(dvp, cnp, &xvp);
+		if (rc == 0) {
+			dvp = xvp;
+			is_nameddir = true;
+		}
+	}
+	if (is_nameddir && rc == 0)
+		rc = zfs_check_attrname(cnp->cn_nameptr);
+#endif
+
 	if (rc == 0)
+		rc = zfs_create(VTOZ(dvp), cnp->cn_nameptr, vap, 0, mode,
+		    &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL);
+#if __FreeBSD_version >= 1500040
+	if (xvp != NULL)
+		vput(xvp);
+#endif
+	if (rc == 0) {
 		*ap->a_vpp = ZTOV(zp);
+#if __FreeBSD_version >= 1500040
+		if (is_nameddir)
+			vn_irflag_set_cond(*ap->a_vpp, VIRF_NAMEDATTR);
+#endif
+	}
 	if (zfsvfs->z_use_namecache &&
 	    rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
@@ -4830,13 +5168,21 @@ struct vop_remove_args {
 static int
 zfs_freebsd_remove(struct vop_remove_args *ap)
 {
+	int error = 0;
 
 #if __FreeBSD_version < 1400068
 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 #endif
 
-	return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
-	    ap->a_cnp->cn_cred));
+#if __FreeBSD_version >= 1500040
+	if ((vn_irflag_read(ap->a_dvp) & VIRF_NAMEDDIR) != 0)
+		error = zfs_check_attrname(ap->a_cnp->cn_nameptr);
+#endif
+
+	if (error == 0)
+		error = zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
+		    ap->a_cnp->cn_cred);
+	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
@@ -4921,8 +5267,32 @@ struct vop_fsync_args {
 static int
 zfs_freebsd_fsync(struct vop_fsync_args *ap)
 {
+	vnode_t *vp = ap->a_vp;
+	int err = 0;
 
-	return (zfs_fsync(VTOZ(ap->a_vp), 0, ap->a_td->td_ucred));
+	/*
+	 * Push any dirty mmap()'d data out to the DMU and ZIL, ready for
+	 * zil_commit() to be called in zfs_fsync().
+	 */
+	if (vm_object_mightbedirty(vp->v_object)) {
+		zfs_vmobject_wlock(vp->v_object);
+		if (!vm_object_page_clean(vp->v_object, 0, 0, 0))
+			err = SET_ERROR(EIO);
+		zfs_vmobject_wunlock(vp->v_object);
+		if (err) {
+			/*
+			 * Unclear what state things are in. zfs_putpages()
+			 * will ensure the pages remain dirty if they haven't
+			 * been written down to the DMU, but because there may
+			 * be nothing logged, we can't assume that zfs_sync()
+			 * -> zil_commit() will give us a useful error. It's
+			 *  safest if we just error out here.
+			 */
+			return (err);
+		}
+	}
+
+	return (zfs_fsync(VTOZ(vp), 0, ap->a_td->td_ucred));
 }
 
 #ifndef _SYS_SYSPROTO_H_
@@ -4994,6 +5364,11 @@ zfs_freebsd_getattr(struct vop_getattr_args *ap)
 #undef	FLAG_CHECK
 	*vap = xvap.xva_vattr;
 	vap->va_flags = fflags;
+
+#if __FreeBSD_version >= 1500040
+	if ((vn_irflag_read(ap->a_vp) & (VIRF_NAMEDDIR | VIRF_NAMEDATTR)) != 0)
+		vap->va_bsdflags |= SFBSD_NAMEDATTR;
+#endif
 	return (0);
 }
 
@@ -5136,15 +5511,24 @@ zfs_freebsd_rename(struct vop_rename_args *ap)
 	vnode_t *fvp = ap->a_fvp;
 	vnode_t *tdvp = ap->a_tdvp;
 	vnode_t *tvp = ap->a_tvp;
-	int error;
+	int error = 0;
 
 #if __FreeBSD_version < 1400068
 	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
 	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
 #endif
 
-	error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
-	    ap->a_tcnp, ap->a_fcnp->cn_cred);
+#if __FreeBSD_version >= 1500040
+	if ((vn_irflag_read(fdvp) & VIRF_NAMEDDIR) != 0) {
+		error = zfs_check_attrname(ap->a_fcnp->cn_nameptr);
+		if (error == 0)
+			error = zfs_check_attrname(ap->a_tcnp->cn_nameptr);
+	}
+#endif
+
+	if (error == 0)
+		error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
+		    ap->a_tcnp, ap->a_fcnp->cn_cred);
 
 	vrele(fdvp);
 	vrele(fvp);
@@ -5373,6 +5757,9 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
 {
 	ulong_t val;
 	int error;
+#ifdef _PC_CLONE_BLKSIZE
+	zfsvfs_t *zfsvfs;
+#endif
 
 	error = zfs_pathconf(ap->a_vp, ap->a_name, &val,
 	    curthread->td_ucred, NULL);
@@ -5398,12 +5785,48 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
 			return (0);
 		}
 		return (EINVAL);
+#if __FreeBSD_version >= 1500040
+	case _PC_NAMEDATTR_ENABLED:
+		MNT_ILOCK(ap->a_vp->v_mount);
+		if ((ap->a_vp->v_mount->mnt_flag & MNT_NAMEDATTR) != 0)
+			*ap->a_retval = 1;
+		else
+			*ap->a_retval = 0;
+		MNT_IUNLOCK(ap->a_vp->v_mount);
+		return (0);
+	case _PC_HAS_NAMEDATTR:
+		if (zfs_has_namedattr(ap->a_vp, curthread->td_ucred))
+			*ap->a_retval = 1;
+		else
+			*ap->a_retval = 0;
+		return (0);
+#endif
+#ifdef _PC_HAS_HIDDENSYSTEM
+	case _PC_HAS_HIDDENSYSTEM:
+		*ap->a_retval = 1;
+		return (0);
+#endif
+#ifdef _PC_CLONE_BLKSIZE
+	case _PC_CLONE_BLKSIZE:
+		zfsvfs = (zfsvfs_t *)ap->a_vp->v_mount->mnt_data;
+		if (zfs_bclone_enabled &&
+		    spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os),
+		    SPA_FEATURE_BLOCK_CLONING))
+			*ap->a_retval = dsl_dataset_feature_is_active(
+			    zfsvfs->z_os->os_dsl_dataset,
+			    SPA_FEATURE_LARGE_BLOCKS) ?
+			    SPA_MAXBLOCKSIZE :
+			    SPA_OLD_MAXBLOCKSIZE;
+		else
+			*ap->a_retval = 0;
+		return (0);
+#endif
 	default:
 		return (vop_stdpathconf(ap));
 	}
 }
 
-static int zfs_xattr_compat = 1;
+int zfs_xattr_compat = 1;
 
 static int
 zfs_check_attrname(const char *name)
@@ -6436,9 +6859,11 @@ zfs_deallocate(struct vop_deallocate_args *ap)
 	if (error == 0) {
 		if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS ||
 		    (ap->a_ioflag & IO_SYNC) != 0)
-			zil_commit(zilog, zp->z_id);
-		*ap->a_offset = off + len;
-		*ap->a_len = 0;
+			error = zil_commit(zilog, zp->z_id);
+		if (error == 0) {
+			*ap->a_offset = off + len;
+			*ap->a_len = 0;
+		}
 	}
 
 	zfs_exit(zfsvfs, FTAG);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
index 9bad1e13d7cc..649022ab5bcb 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
@@ -150,8 +150,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 	zp->z_xattr_cached = NULL;
 	zp->z_xattr_parent = 0;
 	zp->z_vnode = NULL;
-	zp->z_sync_writes_cnt = 0;
-	zp->z_async_writes_cnt = 0;
 
 	return (0);
 }
@@ -163,18 +161,15 @@ zfs_znode_cache_destructor(void *buf, void *arg)
 	znode_t *zp = buf;
 
 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
-	ASSERT3P(zp->z_vnode, ==, NULL);
+	ASSERT0P(zp->z_vnode);
 	ASSERT(!list_link_active(&zp->z_link_node));
 	mutex_destroy(&zp->z_lock);
 	mutex_destroy(&zp->z_acl_lock);
 	rw_destroy(&zp->z_xattr_lock);
 	zfs_rangelock_fini(&zp->z_rangelock);
 
-	ASSERT3P(zp->z_acl_cached, ==, NULL);
-	ASSERT3P(zp->z_xattr_cached, ==, NULL);
-
-	ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
-	ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
+	ASSERT0P(zp->z_acl_cached);
+	ASSERT0P(zp->z_xattr_cached);
 }
 
 
@@ -200,7 +195,7 @@ zfs_znode_init(void)
 	/*
 	 * Initialize zcache
 	 */
-	ASSERT3P(znode_uma_zone, ==, NULL);
+	ASSERT0P(znode_uma_zone);
 	znode_uma_zone = uma_zcreate("zfs_znode_cache",
 	    sizeof (znode_t), zfs_znode_cache_constructor_smr,
 	    zfs_znode_cache_destructor_smr, NULL, NULL, 0, 0);
@@ -229,7 +224,7 @@ zfs_znode_init(void)
 	/*
 	 * Initialize zcache
 	 */
-	ASSERT3P(znode_cache, ==, NULL);
+	ASSERT0P(znode_cache);
 	znode_cache = kmem_cache_create("zfs_znode_cache",
 	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
 	    zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_RECLAIMABLE);
@@ -358,8 +353,8 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
 
-	ASSERT3P(zp->z_sa_hdl, ==, NULL);
-	ASSERT3P(zp->z_acl_cached, ==, NULL);
+	ASSERT0P(zp->z_sa_hdl);
+	ASSERT0P(zp->z_acl_cached);
 	if (sa_hdl == NULL) {
 		VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, zp,
 		    SA_HDL_SHARED, &zp->z_sa_hdl));
@@ -456,8 +451,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 	zp->z_blksz = blksz;
 	zp->z_seq = 0x7A4653;
 	zp->z_sync_cnt = 0;
-	zp->z_sync_writes_cnt = 0;
-	zp->z_async_writes_cnt = 0;
 	atomic_store_ptr(&zp->z_cached_symlink, NULL);
 
 	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
@@ -824,6 +817,10 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
 	(*zpp)->z_dnodesize = dnodesize;
 	(*zpp)->z_projid = projid;
 
+	vnode_t *vp = ZTOV(*zpp);
+	if (!(flag & IS_ROOT_NODE))
+		vn_seqc_write_begin(vp);
+
 	if (vap->va_mask & AT_XVATTR)
 		zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
 
@@ -832,7 +829,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
 		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
 	}
 	if (!(flag & IS_ROOT_NODE)) {
-		vnode_t *vp = ZTOV(*zpp);
+		vn_seqc_write_end(vp);
 		vp->v_vflag |= VV_FORCEINSMQ;
 		int err = insmntque(vp, zfsvfs->z_vfs);
 		vp->v_vflag &= ~VV_FORCEINSMQ;
@@ -1134,7 +1131,7 @@ zfs_rezget(znode_t *zp)
 	}
 	rw_exit(&zp->z_xattr_lock);
 
-	ASSERT3P(zp->z_sa_hdl, ==, NULL);
+	ASSERT0P(zp->z_sa_hdl);
 	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
 	if (err) {
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
@@ -1305,7 +1302,7 @@ zfs_znode_free(znode_t *zp)
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	char *symlink;
 
-	ASSERT3P(zp->z_sa_hdl, ==, NULL);
+	ASSERT0P(zp->z_sa_hdl);
 	zp->z_vnode = NULL;
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	POINTER_INVALIDATE(&zp->z_zfsvfs);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c
index 91cf38016e00..8562c42b3220 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c
@@ -437,6 +437,7 @@ zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv,
 
 	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
 
+	memset(&cuio_s, 0, sizeof (cuio_s));
 	zfs_uio_init(&cuio, &cuio_s);
 
 	keydata_len = zio_crypt_table[crypt].ci_keylen;
@@ -519,6 +520,7 @@ zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version,
 	keydata_len = zio_crypt_table[crypt].ci_keylen;
 	rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
 
+	memset(&cuio_s, 0, sizeof (cuio_s));
 	zfs_uio_init(&cuio, &cuio_s);
 
 	/*
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
index 212ef560db07..3ddbfcb97184 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
@@ -31,7 +31,7 @@
  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
- * Copyright (c) 2024, Klara, Inc.
+ * Copyright (c) 2024, 2025, Klara, Inc.
  */
 
 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
@@ -183,6 +183,7 @@ static struct filterops zvol_filterops_vnode = {
 	.f_isfd = 1,
 	.f_detach = zvol_filter_detach,
 	.f_event = zvol_filter_vnode,
+	.f_copy = knote_triv_copy,
 };
 
 extern uint_t zfs_geom_probe_vdev_key;
@@ -196,7 +197,6 @@ DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
 
 static int zvol_geom_open(struct g_provider *pp, int flag, int count);
 static int zvol_geom_close(struct g_provider *pp, int flag, int count);
-static void zvol_geom_destroy(zvol_state_t *zv);
 static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
 static void zvol_geom_bio_start(struct bio *bp);
 static int zvol_geom_bio_getattr(struct bio *bp);
@@ -226,25 +226,14 @@ zvol_geom_open(struct g_provider *pp, int flag, int count)
 	}
 
 retry:
-	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
-	/*
-	 * Obtain a copy of private under zvol_state_lock to make sure either
-	 * the result of zvol free code setting private to NULL is observed,
-	 * or the zv is protected from being freed because of the positive
-	 * zv_open_count.
-	 */
-	zv = pp->private;
-	if (zv == NULL) {
-		rw_exit(&zvol_state_lock);
-		err = SET_ERROR(ENXIO);
-		goto out_locked;
-	}
+	zv = atomic_load_ptr(&pp->private);
+	if (zv == NULL)
+		return (SET_ERROR(ENXIO));
 
 	mutex_enter(&zv->zv_state_lock);
 	if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
-		rw_exit(&zvol_state_lock);
 		err = SET_ERROR(ENXIO);
-		goto out_zv_locked;
+		goto out_locked;
 	}
 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
 
@@ -257,8 +246,24 @@ retry:
 		drop_suspend = B_TRUE;
 		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
 			mutex_exit(&zv->zv_state_lock);
+
+			/*
+			 * Removal may happen while the locks are down, so
+			 * we can't trust zv any longer; we have to start over.
+			 */
+			zv = atomic_load_ptr(&pp->private);
+			if (zv == NULL)
+				return (SET_ERROR(ENXIO));
+
 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
 			mutex_enter(&zv->zv_state_lock);
+
+			if (zv->zv_zso->zso_dying ||
+			    zv->zv_flags & ZVOL_REMOVING) {
+				err = SET_ERROR(ENXIO);
+				goto out_locked;
+			}
+
 			/* Check to see if zv_suspend_lock is needed. */
 			if (zv->zv_open_count != 0) {
 				rw_exit(&zv->zv_suspend_lock);
@@ -266,7 +271,6 @@ retry:
 			}
 		}
 	}
-	rw_exit(&zvol_state_lock);
 
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
@@ -294,7 +298,7 @@ retry:
 		if (drop_namespace)
 			mutex_exit(&spa_namespace_lock);
 		if (err)
-			goto out_zv_locked;
+			goto out_locked;
 		pp->mediasize = zv->zv_volsize;
 		pp->stripeoffset = 0;
 		pp->stripesize = zv->zv_volblocksize;
@@ -329,9 +333,8 @@ out_opened:
 		zvol_last_close(zv);
 		wakeup(zv);
 	}
-out_zv_locked:
-	mutex_exit(&zv->zv_state_lock);
 out_locked:
+	mutex_exit(&zv->zv_state_lock);
 	if (drop_suspend)
 		rw_exit(&zv->zv_suspend_lock);
 	return (err);
@@ -345,12 +348,9 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
 	boolean_t drop_suspend = B_TRUE;
 	int new_open_count;
 
-	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
-	zv = pp->private;
-	if (zv == NULL) {
-		rw_exit(&zvol_state_lock);
+	zv = atomic_load_ptr(&pp->private);
+	if (zv == NULL)
 		return (SET_ERROR(ENXIO));
-	}
 
 	mutex_enter(&zv->zv_state_lock);
 	if (zv->zv_flags & ZVOL_EXCL) {
@@ -377,6 +377,15 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
 			mutex_exit(&zv->zv_state_lock);
 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
 			mutex_enter(&zv->zv_state_lock);
+
+			/*
+			 * Unlike in zvol_geom_open(), we don't check if
+			 * removal started here, because we might be one of the
+			 * openers that needs to be thrown out! If we're the
+			 * last, we need to call zvol_last_close() below to
+			 * finish cleanup. So, no special treatment for us.
+			 */
+
 			/* Check to see if zv_suspend_lock is needed. */
 			new_open_count = zv->zv_open_count - count;
 			if (new_open_count != 0) {
@@ -387,7 +396,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
 	} else {
 		drop_suspend = B_FALSE;
 	}
-	rw_exit(&zvol_state_lock);
 
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
@@ -408,20 +416,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
 	return (0);
 }
 
-static void
-zvol_geom_destroy(zvol_state_t *zv)
-{
-	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
-	struct g_provider *pp = zsg->zsg_provider;
-
-	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
-
-	g_topology_assert();
-
-	zsg->zsg_provider = NULL;
-	g_wither_geom(pp->geom, ENXIO);
-}
-
 void
 zvol_wait_close(zvol_state_t *zv)
 {
@@ -454,7 +448,7 @@ zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
 	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
 	    pp->name, acr, acw, ace));
 
-	if (pp->private == NULL) {
+	if (atomic_load_ptr(&pp->private) == NULL) {
 		if (acr <= 0 && acw <= 0 && ace <= 0)
 			return (0);
 		return (pp->error);
@@ -727,9 +721,9 @@ unlock:
 		break;
 	}
 
-	if (commit) {
+	if (error == 0 && commit) {
 commit:
-		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+		error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
 	}
 resume:
 	rw_exit(&zv->zv_suspend_lock);
@@ -906,8 +900,8 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
 	zfs_rangelock_exit(lr);
 	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
 	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
-	if (commit)
-		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+	if (error == 0 && commit)
+		error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
 	rw_exit(&zv->zv_suspend_lock);
 
 	return (error);
@@ -921,25 +915,14 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
 	boolean_t drop_suspend = B_FALSE;
 
 retry:
-	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
-	/*
-	 * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
-	 * the result of zvol free code setting si_drv2 to NULL is observed,
-	 * or the zv is protected from being freed because of the positive
-	 * zv_open_count.
-	 */
-	zv = dev->si_drv2;
-	if (zv == NULL) {
-		rw_exit(&zvol_state_lock);
-		err = SET_ERROR(ENXIO);
-		goto out_locked;
-	}
+	zv = atomic_load_ptr(&dev->si_drv2);
+	if (zv == NULL)
+		return (SET_ERROR(ENXIO));
 
 	mutex_enter(&zv->zv_state_lock);
-	if (zv->zv_zso->zso_dying) {
-		rw_exit(&zvol_state_lock);
+	if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
 		err = SET_ERROR(ENXIO);
-		goto out_zv_locked;
+		goto out_locked;
 	}
 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
 
@@ -954,6 +937,13 @@ retry:
 			mutex_exit(&zv->zv_state_lock);
 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
 			mutex_enter(&zv->zv_state_lock);
+
+			if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
+				/* Removal started while locks were down. */
+				err = SET_ERROR(ENXIO);
+				goto out_locked;
+			}
+
 			/* Check to see if zv_suspend_lock is needed. */
 			if (zv->zv_open_count != 0) {
 				rw_exit(&zv->zv_suspend_lock);
@@ -961,7 +951,6 @@ retry:
 			}
 		}
 	}
-	rw_exit(&zvol_state_lock);
 
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
@@ -989,7 +978,7 @@ retry:
 		if (drop_namespace)
 			mutex_exit(&spa_namespace_lock);
 		if (err)
-			goto out_zv_locked;
+			goto out_locked;
 	}
 
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -1016,9 +1005,8 @@ out_opened:
 		zvol_last_close(zv);
 		wakeup(zv);
 	}
-out_zv_locked:
-	mutex_exit(&zv->zv_state_lock);
 out_locked:
+	mutex_exit(&zv->zv_state_lock);
 	if (drop_suspend)
 		rw_exit(&zv->zv_suspend_lock);
 	return (err);
@@ -1030,12 +1018,9 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
 	zvol_state_t *zv;
 	boolean_t drop_suspend = B_TRUE;
 
-	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
-	zv = dev->si_drv2;
-	if (zv == NULL) {
-		rw_exit(&zvol_state_lock);
+	zv = atomic_load_ptr(&dev->si_drv2);
+	if (zv == NULL)
 		return (SET_ERROR(ENXIO));
-	}
 
 	mutex_enter(&zv->zv_state_lock);
 	if (zv->zv_flags & ZVOL_EXCL) {
@@ -1060,6 +1045,15 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
 			mutex_exit(&zv->zv_state_lock);
 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
 			mutex_enter(&zv->zv_state_lock);
+
+			/*
+			 * Unlike in zvol_cdev_open(), we don't check if
+			 * removal started here, because we might be one of the
+			 * openers that needs to be thrown out! If we're the
+			 * last, we need to call zvol_last_close() below to
+			 * finish cleanup. So, no special treatment for us.
+			 */
+
 			/* Check to see if zv_suspend_lock is needed. */
 			if (zv->zv_open_count != 1) {
 				rw_exit(&zv->zv_suspend_lock);
@@ -1069,7 +1063,6 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
 	} else {
 		drop_suspend = B_FALSE;
 	}
-	rw_exit(&zvol_state_lock);
 
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
@@ -1101,7 +1094,8 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
 	int error;
 	boolean_t sync;
 
-	zv = dev->si_drv2;
+	zv = atomic_load_ptr(&dev->si_drv2);
+	ASSERT3P(zv, !=, NULL);
 
 	error = 0;
 	KASSERT(zv->zv_open_count > 0,
@@ -1117,7 +1111,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
 	case DIOCGFLUSH:
 		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
 		if (zv->zv_zilog != NULL)
-			zil_commit(zv->zv_zilog, ZVOL_OBJ);
+			error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
 		rw_exit(&zv->zv_suspend_lock);
 		break;
 	case DIOCGDELETE:
@@ -1152,7 +1146,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
 		}
 		zfs_rangelock_exit(lr);
 		if (sync)
-			zil_commit(zv->zv_zilog, ZVOL_OBJ);
+			error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
 		rw_exit(&zv->zv_suspend_lock);
 		break;
 	case DIOCGSTRIPESIZE:
@@ -1162,6 +1156,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
 		*(off_t *)data = 0;
 		break;
 	case DIOCGATTR: {
+		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
 		spa_t *spa = dmu_objset_spa(zv->zv_objset);
 		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
 		uint64_t refd, avail, usedobjs, availobjs;
@@ -1186,6 +1181,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
 			arg->value.off = refd / DEV_BSIZE;
 		} else
 			error = SET_ERROR(ENOIOCTL);
+		rw_exit(&zv->zv_suspend_lock);
 		break;
 	}
 	case FIOSEEKHOLE:
@@ -1196,10 +1192,12 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
 
 		hole = (cmd == FIOSEEKHOLE);
 		noff = *off;
+		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
 		lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
 		    RL_READER);
 		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
 		zfs_rangelock_exit(lr);
+		rw_exit(&zv->zv_suspend_lock);
 		*off = noff;
 		break;
 	}
@@ -1248,9 +1246,11 @@ zvol_os_is_zvol(const char *device)
 	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
 }
 
-void
+int
 zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
 {
+	int error = 0;
+
 	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
@@ -1304,56 +1304,159 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
 		args.mda_gid = GID_OPERATOR;
 		args.mda_mode = 0640;
 		args.mda_si_drv2 = zv;
-		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
-		    == 0) {
+		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname);
+		if (error == 0) {
 			dev->si_iosize_max = maxphys;
 			zsd->zsd_cdev = dev;
 		}
 	}
 	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
 	dataset_kstats_rename(&zv->zv_kstat, newname);
+
+	return (error);
 }
 
 /*
- * Remove minor node for the specified volume.
+ * Allocate memory for a new zvol_state_t and setup the required
+ * request queue and generic disk structures for the block device.
  */
-void
-zvol_os_free(zvol_state_t *zv)
+static int
+zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize,
+    zvol_state_t **zvp)
 {
-	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
-	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
-	ASSERT0(zv->zv_open_count);
+	zvol_state_t *zv;
+	uint64_t volmode;
+	int error;
 
-	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
+	error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE),
+	    &volmode, NULL);
+	if (error)
+		return (error);
 
-	rw_destroy(&zv->zv_suspend_lock);
-	zfs_rangelock_fini(&zv->zv_rangelock);
+	if (volmode == ZFS_VOLMODE_DEFAULT)
+		volmode = zvol_volmode;
 
+	if (volmode == ZFS_VOLMODE_NONE)
+		return (0);
+
+	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
+	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
+	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
+	zv->zv_volmode = volmode;
+	zv->zv_volsize = volsize;
+	zv->zv_volblocksize = volblocksize;
 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
 		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
-		struct g_provider *pp __maybe_unused = zsg->zsg_provider;
+		struct g_provider *pp;
+		struct g_geom *gp;
+
+		g_topology_lock();
+		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+		gp->start = zvol_geom_bio_start;
+		gp->access = zvol_geom_access;
+		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
+		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+		pp->sectorsize = DEV_BSIZE;
+		pp->mediasize = 0;
+		pp->private = zv;
+
+		zsg->zsg_provider = pp;
+	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+		struct cdev *dev;
+		struct make_dev_args args;
+
+		make_dev_args_init(&args);
+		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+		args.mda_devsw = &zvol_cdevsw;
+		args.mda_cr = NULL;
+		args.mda_uid = UID_ROOT;
+		args.mda_gid = GID_OPERATOR;
+		args.mda_mode = 0640;
+		args.mda_si_drv2 = zv;
+		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
+		if (error) {
+			kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+			kmem_free(zv, sizeof (zvol_state_t));
+			return (error);
+		}
+
+		dev->si_iosize_max = maxphys;
+		zsd->zsd_cdev = dev;
+		knlist_init_sx(&zsd->zsd_selinfo.si_note, &zv->zv_state_lock);
+	}
+	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
+	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
+	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
+
+	*zvp = zv;
+	return (error);
+}
 
-		ASSERT3P(pp->private, ==, NULL);
+/*
+ * Remove minor node for the specified volume.
+ */
+void
+zvol_os_remove_minor(zvol_state_t *zv)
+{
+	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+	ASSERT0(zv->zv_open_count);
+	ASSERT0(atomic_read(&zv->zv_suspend_ref));
+	ASSERT(zv->zv_flags & ZVOL_REMOVING);
+
+	struct zvol_state_os *zso = zv->zv_zso;
+	zv->zv_zso = NULL;
+
+	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+		struct zvol_state_geom *zsg = &zso->zso_geom;
+		struct g_provider *pp = zsg->zsg_provider;
+		atomic_store_ptr(&pp->private, NULL);
+		mutex_exit(&zv->zv_state_lock);
 
 		g_topology_lock();
-		zvol_geom_destroy(zv);
+		g_wither_geom(pp->geom, ENXIO);
 		g_topology_unlock();
 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
-		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+		struct zvol_state_dev *zsd = &zso->zso_dev;
 		struct cdev *dev = zsd->zsd_cdev;
 
+		if (dev != NULL)
+			atomic_store_ptr(&dev->si_drv2, NULL);
+		mutex_exit(&zv->zv_state_lock);
+
 		if (dev != NULL) {
-			ASSERT3P(dev->si_drv2, ==, NULL);
 			destroy_dev(dev);
 			knlist_clear(&zsd->zsd_selinfo.si_note, 0);
 			knlist_destroy(&zsd->zsd_selinfo.si_note);
 		}
 	}
 
+	kmem_free(zso, sizeof (struct zvol_state_os));
+
+	mutex_enter(&zv->zv_state_lock);
+}
+
+void
+zvol_os_free(zvol_state_t *zv)
+{
+	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+	ASSERT0(zv->zv_open_count);
+	ASSERT0P(zv->zv_zso);
+
+	ASSERT0P(zv->zv_objset);
+	ASSERT0P(zv->zv_zilog);
+	ASSERT0P(zv->zv_dn);
+
+	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
+
+	rw_destroy(&zv->zv_suspend_lock);
+	zfs_rangelock_fini(&zv->zv_rangelock);
+
 	mutex_destroy(&zv->zv_state_lock);
 	cv_destroy(&zv->zv_removing_cv);
 	dataset_kstats_destroy(&zv->zv_kstat);
-	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
 	kmem_free(zv, sizeof (zvol_state_t));
 	zvol_minors--;
 }
@@ -1364,11 +1467,11 @@ zvol_os_free(zvol_state_t *zv)
 int
 zvol_os_create_minor(const char *name)
 {
-	zvol_state_t *zv;
+	zvol_state_t *zv = NULL;
 	objset_t *os;
 	dmu_object_info_t *doi;
 	uint64_t volsize;
-	uint64_t volmode, hash, len;
+	uint64_t hash, len;
 	int error;
 	bool replayed_zil = B_FALSE;
 
@@ -1400,74 +1503,22 @@ zvol_os_create_minor(const char *name)
 	if (error)
 		goto out_dmu_objset_disown;
 
-	error = dsl_prop_get_integer(name,
-	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
-	if (error || volmode == ZFS_VOLMODE_DEFAULT)
-		volmode = zvol_volmode;
-	error = 0;
+	error = zvol_alloc(name, volsize, doi->doi_data_block_size, &zv);
+	if (error || zv == NULL)
+		goto out_dmu_objset_disown;
 
-	/*
-	 * zvol_alloc equivalent ...
-	 */
-	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
 	zv->zv_hash = hash;
-	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
-	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
-	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
-	zv->zv_volmode = volmode;
-	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
-		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
-		struct g_provider *pp;
-		struct g_geom *gp;
-
-		g_topology_lock();
-		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
-		gp->start = zvol_geom_bio_start;
-		gp->access = zvol_geom_access;
-		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
-		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
-		pp->sectorsize = DEV_BSIZE;
-		pp->mediasize = 0;
-		pp->private = zv;
-
-		zsg->zsg_provider = pp;
-	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
-		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
-		struct cdev *dev;
-		struct make_dev_args args;
-
-		make_dev_args_init(&args);
-		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
-		args.mda_devsw = &zvol_cdevsw;
-		args.mda_cr = NULL;
-		args.mda_uid = UID_ROOT;
-		args.mda_gid = GID_OPERATOR;
-		args.mda_mode = 0640;
-		args.mda_si_drv2 = zv;
-		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
-		    == 0) {
-			dev->si_iosize_max = maxphys;
-			zsd->zsd_cdev = dev;
-			knlist_init_sx(&zsd->zsd_selinfo.si_note,
-			    &zv->zv_state_lock);
-		}
-	}
-	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
-	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
-	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
 
 	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
 		zv->zv_flags |= ZVOL_RDONLY;
 
-	zv->zv_volblocksize = doi->doi_data_block_size;
-	zv->zv_volsize = volsize;
 	zv->zv_objset = os;
 
-	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+	ASSERT0P(zv->zv_kstat.dk_kstats);
 	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
 	if (error)
 		goto out_dmu_objset_disown;
-	ASSERT3P(zv->zv_zilog, ==, NULL);
+	ASSERT0P(zv->zv_zilog);
 	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
 	if (spa_writeable(dmu_objset_spa(os))) {
 		if (zil_replay_disable)
@@ -1490,13 +1541,14 @@ zvol_os_create_minor(const char *name)
 out_dmu_objset_disown:
 	dmu_objset_disown(os, B_TRUE, FTAG);
 
-	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
+	if (error == 0 && zv && zv->zv_volmode == ZFS_VOLMODE_GEOM) {
 		g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0);
+		/* geom was locked inside zvol_alloc() function */
 		g_topology_unlock();
 	}
 out_doi:
 	kmem_free(doi, sizeof (dmu_object_info_t));
-	if (error == 0) {
+	if (error == 0 && zv) {
 		rw_enter(&zvol_state_lock, RW_WRITER);
 		zvol_insert(zv);
 		zvol_minors++;
@@ -1507,28 +1559,6 @@ out_doi:
 	return (error);
 }
 
-void
-zvol_os_clear_private(zvol_state_t *zv)
-{
-	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
-	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
-		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
-		struct g_provider *pp = zsg->zsg_provider;
-
-		if (pp->private == NULL) /* already cleared */
-			return;
-
-		pp->private = NULL;
-		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
-	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
-		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
-		struct cdev *dev = zsd->zsd_cdev;
-
-		if (dev != NULL)
-			dev->si_drv2 = NULL;
-	}
-}
-
 int
 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
 {
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c
index ce9c9e39e60c..aac5f2ebbfd2 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c
@@ -66,9 +66,9 @@ void
 __cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg)
 {
 	ASSERT(cvp);
-	ASSERT(name == NULL);
+	ASSERT0P(name);
 	ASSERT(type == CV_DEFAULT);
-	ASSERT(arg == NULL);
+	ASSERT0P(arg);
 
 	cvp->cv_magic = CV_MAGIC;
 	init_waitqueue_head(&cvp->cv_event);
@@ -83,7 +83,7 @@ static int
 cv_destroy_wakeup(kcondvar_t *cvp)
 {
 	if (!atomic_read(&cvp->cv_waiters) && !atomic_read(&cvp->cv_refs)) {
-		ASSERT(cvp->cv_mutex == NULL);
+		ASSERT0P(cvp->cv_mutex);
 		ASSERT(!waitqueue_active(&cvp->cv_event));
 		return (1);
 	}
@@ -104,7 +104,7 @@ __cv_destroy(kcondvar_t *cvp)
 	while (cv_destroy_wakeup(cvp) == 0)
 		wait_event_timeout(cvp->cv_destroy, cv_destroy_wakeup(cvp), 1);
 
-	ASSERT3P(cvp->cv_mutex, ==, NULL);
+	ASSERT0P(cvp->cv_mutex);
 	ASSERT3S(atomic_read(&cvp->cv_refs), ==, 0);
 	ASSERT3S(atomic_read(&cvp->cv_waiters), ==, 0);
 	ASSERT3S(waitqueue_active(&cvp->cv_event), ==, 0);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
index f37699b4347e..89ca4a648b2f 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
@@ -709,7 +709,7 @@ zone_get_hostid(void *zone)
 {
 	uint32_t hostid;
 
-	ASSERT3P(zone, ==, NULL);
+	ASSERT0P(zone);
 
 	if (spl_hostid != 0)
 		return ((uint32_t)(spl_hostid & HW_HOSTID_MASK));
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
index fab80289b278..22e4ed169d03 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
@@ -296,7 +296,7 @@ spl_slab_free(spl_kmem_slab_t *sks,
 	spl_kmem_cache_t *skc;
 
 	ASSERT(sks->sks_magic == SKS_MAGIC);
-	ASSERT(sks->sks_ref == 0);
+	ASSERT0(sks->sks_ref);
 
 	skc = sks->sks_cache;
 	ASSERT(skc->skc_magic == SKC_MAGIC);
@@ -598,7 +598,7 @@ static void
 spl_magazine_free(spl_kmem_magazine_t *skm)
 {
 	ASSERT(skm->skm_magic == SKM_MAGIC);
-	ASSERT(skm->skm_avail == 0);
+	ASSERT0(skm->skm_avail);
 	kfree(skm);
 }
 
@@ -610,7 +610,7 @@ spl_magazine_create(spl_kmem_cache_t *skc)
 {
 	int i = 0;
 
-	ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+	ASSERT0((skc->skc_flags & KMC_SLAB));
 
 	skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
 	    num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
@@ -640,7 +640,7 @@ spl_magazine_destroy(spl_kmem_cache_t *skc)
 	spl_kmem_magazine_t *skm;
 	int i = 0;
 
-	ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+	ASSERT0((skc->skc_flags & KMC_SLAB));
 
 	for_each_possible_cpu(i) {
 		skm = skc->skc_mag[i];
@@ -679,8 +679,8 @@ spl_kmem_cache_create(const char *name, size_t size, size_t align,
 	/*
 	 * Unsupported flags
 	 */
-	ASSERT(vmp == NULL);
-	ASSERT(reclaim == NULL);
+	ASSERT0P(vmp);
+	ASSERT0P(reclaim);
 
 	might_sleep();
 
@@ -863,11 +863,11 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
 	 * Validate there are no objects in use and free all the
 	 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
 	 */
-	ASSERT3U(skc->skc_slab_alloc, ==, 0);
-	ASSERT3U(skc->skc_obj_alloc, ==, 0);
-	ASSERT3U(skc->skc_slab_total, ==, 0);
-	ASSERT3U(skc->skc_obj_total, ==, 0);
-	ASSERT3U(skc->skc_obj_emergency, ==, 0);
+	ASSERT0(skc->skc_slab_alloc);
+	ASSERT0(skc->skc_obj_alloc);
+	ASSERT0(skc->skc_slab_total);
+	ASSERT0(skc->skc_obj_total);
+	ASSERT0(skc->skc_obj_emergency);
 	ASSERT(list_empty(&skc->skc_complete_list));
 
 	ASSERT3U(percpu_counter_sum(&skc->skc_linux_alloc), ==, 0);
@@ -986,7 +986,7 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
 
 	ASSERT0(flags & ~KM_PUBLIC_MASK);
 	ASSERT(skc->skc_magic == SKC_MAGIC);
-	ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+	ASSERT0((skc->skc_flags & KMC_SLAB));
 
 	*obj = NULL;
 
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
index 337a4bcf76a0..9fe008cef868 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
@@ -302,13 +302,8 @@ spl_kmem_free_impl(const void *buf, size_t size)
 #ifdef DEBUG_KMEM
 
 /* Shim layer memory accounting */
-#ifdef HAVE_ATOMIC64_T
 atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
-unsigned long long kmem_alloc_max = 0;
-#else  /* HAVE_ATOMIC64_T */
-atomic_t kmem_alloc_used = ATOMIC_INIT(0);
-unsigned long long kmem_alloc_max = 0;
-#endif /* HAVE_ATOMIC64_T */
+uint64_t kmem_alloc_max = 0;
 
 EXPORT_SYMBOL(kmem_alloc_used);
 EXPORT_SYMBOL(kmem_alloc_max);
@@ -320,9 +315,9 @@ spl_kmem_alloc_debug(size_t size, int flags, int node)
 
 	ptr = spl_kmem_alloc_impl(size, flags, node);
 	if (ptr) {
-		kmem_alloc_used_add(size);
-		if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
-			kmem_alloc_max = kmem_alloc_used_read();
+		atomic64_add(size, &kmem_alloc_used);
+		if (unlikely(atomic64_read(&kmem_alloc_used) > kmem_alloc_max))
+			kmem_alloc_max = atomic64_read(&kmem_alloc_used);
 	}
 
 	return (ptr);
@@ -331,7 +326,7 @@ spl_kmem_alloc_debug(size_t size, int flags, int node)
 inline void
 spl_kmem_free_debug(const void *ptr, size_t size)
 {
-	kmem_alloc_used_sub(size);
+	atomic64_sub(size, &kmem_alloc_used);
 	spl_kmem_free_impl(ptr, size);
 }
 
@@ -595,7 +590,7 @@ spl_kmem_init(void)
 {
 
 #ifdef DEBUG_KMEM
-	kmem_alloc_used_set(0);
+	atomic64_set(&kmem_alloc_used, 0);
 
 
 
@@ -617,9 +612,10 @@ spl_kmem_fini(void)
 	 * at that address to aid in debugging.  Performance is not
 	 * a serious concern here since it is module unload time.
 	 */
-	if (kmem_alloc_used_read() != 0)
+	if (atomic64_read(&kmem_alloc_used) != 0)
 		printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
-		    (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);
+		    (unsigned long)atomic64_read(&kmem_alloc_used),
+		    kmem_alloc_max);
 
 #ifdef DEBUG_KMEM_TRACKING
 	spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c
index 48f70b00c96b..02c5b42bc4a0 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c
@@ -541,7 +541,7 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name,
 	kstat_t *ksp;
 
 	ASSERT(ks_module);
-	ASSERT(ks_instance == 0);
+	ASSERT0(ks_instance);
 	ASSERT(ks_name);
 
 	if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO))
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
index 4ed0deedd5b9..8cdd5fc5cfe5 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
@@ -82,11 +82,7 @@ proc_domemused(CONST_CTL_TABLE *table, int write,
 	if (write) {
 		*ppos += *lenp;
 	} else {
-#ifdef HAVE_ATOMIC64_T
 		val = atomic64_read((atomic64_t *)table->data);
-#else
-		val = atomic_read((atomic_t *)table->data);
-#endif /* HAVE_ATOMIC64_T */
 		rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
 	}
 
@@ -315,18 +311,14 @@ static struct ctl_table spl_kmem_table[] = {
 	{
 		.procname	= "kmem_used",
 		.data		= &kmem_alloc_used,
-#ifdef HAVE_ATOMIC64_T
 		.maxlen		= sizeof (atomic64_t),
-#else
-		.maxlen		= sizeof (atomic_t),
-#endif /* HAVE_ATOMIC64_T */
 		.mode		= 0444,
 		.proc_handler	= &proc_domemused,
 	},
 	{
 		.procname	= "kmem_max",
 		.data		= &kmem_alloc_max,
-		.maxlen		= sizeof (unsigned long),
+		.maxlen		= sizeof (uint64_t),
 		.extra1		= &table_min,
 		.extra2		= &table_max,
 		.mode		= 0444,
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
index 1398483a3ac8..8f5c73b13df5 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
@@ -28,6 +28,7 @@
 #include <sys/kmem.h>
 #include <sys/tsd.h>
 #include <sys/string.h>
+#include <sys/misc.h>
 
 /*
  * Thread interfaces
@@ -79,7 +80,7 @@ __thread_create(caddr_t stk, size_t  stksize, thread_func_t func,
 
 	/* Option pp is simply ignored */
 	/* Variable stack size unsupported */
-	ASSERT(stk == NULL);
+	ASSERT0P(stk);
 
 	tp = kmem_alloc(sizeof (thread_priv_t), KM_PUSHPAGE);
 	if (tp == NULL)
@@ -197,3 +198,14 @@ issig(void)
 }
 
 EXPORT_SYMBOL(issig);
+
+/*
+ * Check if the current thread is a memory reclaim thread.
+ * Returns true if current thread is kswapd.
+ */
+int
+current_is_reclaim_thread(void)
+{
+	return (current_is_kswapd());
+}
+EXPORT_SYMBOL(current_is_reclaim_thread);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c b/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c
index 34a61bef7d4f..2e8cedf0dc87 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c
@@ -161,7 +161,7 @@ tsd_hash_add(tsd_hash_table_t *table, uint_t key, pid_t pid, void *value)
 	ulong_t hash;
 	int rc = 0;
 
-	ASSERT3P(tsd_hash_search(table, key, pid), ==, NULL);
+	ASSERT0P(tsd_hash_search(table, key, pid));
 
 	/* New entry allocate structure, set value, and add to hash */
 	entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c
index 45c2999a4bb1..b2eae5d00b10 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c
@@ -25,6 +25,10 @@
  * SUCH DAMAGE.
  */
 
+/*
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
+ */
+
 #include <sys/types.h>
 #include <sys/sysmacros.h>
 #include <sys/kmem.h>
@@ -56,6 +60,19 @@ typedef struct zone_dataset {
 } zone_dataset_t;
 
 #ifdef CONFIG_USER_NS
+
+/*
+ * Linux 6.18 moved the generic namespace type away from ns->ops->type onto
+ * ns_common itself.
+ */
+#ifdef HAVE_NS_COMMON_TYPE
+#define	ns_is_newuser(ns)	\
+	((ns)->ns_type == CLONE_NEWUSER)
+#else
+#define	ns_is_newuser(ns)	\
+	((ns)->ops != NULL && (ns)->ops->type == CLONE_NEWUSER)
+#endif
+
 /*
  * Returns:
  * - 0 on success
@@ -84,7 +101,7 @@ user_ns_get(int fd, struct user_namespace **userns)
 		goto done;
 	}
 	ns = get_proc_ns(file_inode(nsfile));
-	if (ns->ops->type != CLONE_NEWUSER) {
+	if (!ns_is_newuser(ns)) {
 		error = ENOTTY;
 		goto done;
 	}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
index e1140b31a97a..18f2426fbbfc 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
@@ -23,6 +23,7 @@
  * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
  * Copyright (c) 2019 by Delphix. All rights reserved.
  * Copyright (c) 2023, 2024, Klara Inc.
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
  */
 
 /*
@@ -256,10 +257,6 @@ abd_unmark_zfs_page(struct page *page)
 
 #ifndef CONFIG_HIGHMEM
 
-#ifndef __GFP_RECLAIM
-#define	__GFP_RECLAIM		__GFP_WAIT
-#endif
-
 /*
  * The goal is to minimize fragmentation by preferentially populating ABDs
  * with higher order compound pages from a single zone.  Allocation size is
@@ -867,9 +864,9 @@ abd_iter_advance(struct abd_iter *aiter, size_t amount)
 	 * Ensure that last chunk is not in use. abd_iterate_*() must clear
 	 * this state (directly or abd_iter_unmap()) before advancing.
 	 */
-	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0P(aiter->iter_mapaddr);
 	ASSERT0(aiter->iter_mapsize);
-	ASSERT3P(aiter->iter_page, ==, NULL);
+	ASSERT0P(aiter->iter_page);
 	ASSERT0(aiter->iter_page_doff);
 	ASSERT0(aiter->iter_page_dsize);
 
@@ -901,7 +898,7 @@ abd_iter_map(struct abd_iter *aiter)
 	void *paddr;
 	size_t offset = 0;
 
-	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0P(aiter->iter_mapaddr);
 	ASSERT0(aiter->iter_mapsize);
 
 	/* There's nothing left to iterate over, so do nothing */
@@ -1113,6 +1110,14 @@ abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
 #define	ABD_ITER_PAGE_SIZE(page)	(PAGESIZE)
 #endif
 
+#ifndef nth_page
+/*
+ * Since 6.18 nth_page() no longer exists, and is no longer required to iterate
+ * within a single SG entry, so we replace it with a simple addition.
+ */
+#define	nth_page(p, n)	((p)+(n))
+#endif
+
 void
 abd_iter_page(struct abd_iter *aiter)
 {
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
index 154ca22d9513..1bd3500e9f66 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
@@ -471,13 +471,17 @@ vdev_disk_close(vdev_t *v)
 	if (v->vdev_reopening || vd == NULL)
 		return;
 
+	rw_enter(&vd->vd_lock, RW_WRITER);
+
 	if (vd->vd_bdh != NULL)
 		vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
 		    zfs_vdev_holder);
 
+	v->vdev_tsd = NULL;
+
+	rw_exit(&vd->vd_lock);
 	rw_destroy(&vd->vd_lock);
 	kmem_free(vd, sizeof (vdev_disk_t));
-	v->vdev_tsd = NULL;
 }
 
 /*
@@ -552,7 +556,7 @@ vdev_bio_associate_blkg(struct bio *bio)
 #endif
 
 	ASSERT3P(q, !=, NULL);
-	ASSERT3P(bio->bi_blkg, ==, NULL);
+	ASSERT0P(bio->bi_blkg);
 
 	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
 		bio->bi_blkg = q->root_blkg;
@@ -574,7 +578,7 @@ vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
 	bio->bi_bdev = bdev;
 
 	ASSERT3P(q, !=, NULL);
-	ASSERT3P(bio->bi_blkg, ==, NULL);
+	ASSERT0P(bio->bi_blkg);
 
 	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
 		bio->bi_blkg = q->root_blkg;
@@ -806,7 +810,7 @@ vbio_completion(struct bio *bio)
 	 * here; instead we stash vbio on the zio and take care of it in the
 	 * done callback.
 	 */
-	ASSERT3P(zio->io_bio, ==, NULL);
+	ASSERT0P(zio->io_bio);
 	zio->io_bio = vbio;
 
 	zio_delay_interrupt(zio);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
index 1b169122f25b..934d74a112fd 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
@@ -1900,7 +1900,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
 		if (!(flag & IS_ROOT_NODE) &&
 		    (dzp->z_pflags & ZFS_INHERIT_ACE) &&
 		    !(dzp->z_pflags & ZFS_XATTR)) {
-			VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE,
+			VERIFY0(zfs_acl_node_read(dzp, B_TRUE,
 			    &paclp, B_FALSE));
 			acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
 			    vap->va_mode, paclp, acl_ids->z_mode, &need_chmod);
@@ -2204,8 +2204,8 @@ top:
 	}
 
 	error = zfs_aclset_common(zp, aclp, cr, tx);
-	ASSERT(error == 0);
-	ASSERT(zp->z_acl_cached == NULL);
+	ASSERT0(error);
+	ASSERT0P(zp->z_acl_cached);
 	zp->z_acl_cached = aclp;
 
 	if (fuid_dirtied)
@@ -2524,7 +2524,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
 	 * Also note: DOS R/O is ignored for directories.
 	 */
 	if ((v4_mode & WRITE_MASK_DATA) &&
-	    S_ISDIR(ZTOI(zp)->i_mode) &&
+	    !S_ISDIR(ZTOI(zp)->i_mode) &&
 	    (zp->z_pflags & ZFS_READONLY)) {
 		return (SET_ERROR(EPERM));
 	}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
index 84b25cb2c5ac..fb4de50480a3 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
@@ -494,9 +494,9 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
 	if (!creation)
 		now = current_time(ip);
 	zp = ITOZ(ip);
-	ASSERT3P(zp->z_dirlocks, ==, NULL);
-	ASSERT3P(zp->z_acl_cached, ==, NULL);
-	ASSERT3P(zp->z_xattr_cached, ==, NULL);
+	ASSERT0P(zp->z_dirlocks);
+	ASSERT0P(zp->z_acl_cached);
+	ASSERT0P(zp->z_xattr_cached);
 	zp->z_id = id;
 	zp->z_unlinked = B_FALSE;
 	zp->z_atime_dirty = B_FALSE;
@@ -511,8 +511,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
 	zp->z_pflags = 0;
 	zp->z_mode = 0;
 	zp->z_sync_cnt = 0;
-	zp->z_sync_writes_cnt = 0;
-	zp->z_async_writes_cnt = 0;
 	ip->i_generation = 0;
 	ip->i_ino = id;
 	ip->i_mode = (S_IFDIR | S_IRWXUGO);
@@ -592,7 +590,7 @@ zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id,
 int
 zfsctl_create(zfsvfs_t *zfsvfs)
 {
-	ASSERT(zfsvfs->z_ctldir == NULL);
+	ASSERT0P(zfsvfs->z_ctldir);
 
 	zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT,
 	    &zpl_fops_root, &zpl_ops_root, 0);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
index 2f935bb3fc8c..e8de536606e2 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
@@ -463,7 +463,7 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 
 	ASSERT(zp->z_unlinked);
-	ASSERT(ZTOI(zp)->i_nlink == 0);
+	ASSERT0(ZTOI(zp)->i_nlink);
 
 	VERIFY3U(0, ==,
 	    zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
@@ -662,8 +662,8 @@ zfs_rmnode(znode_t *zp)
 	uint64_t	links;
 	int		error;
 
-	ASSERT(ZTOI(zp)->i_nlink == 0);
-	ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0);
+	ASSERT0(ZTOI(zp)->i_nlink);
+	ASSERT0(atomic_read(&ZTOI(zp)->i_count));
 
 	/*
 	 * If this is an attribute directory, purge its contents.
@@ -710,7 +710,7 @@ zfs_rmnode(znode_t *zp)
 	    &xattr_obj, sizeof (xattr_obj));
 	if (error == 0 && xattr_obj) {
 		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
-		ASSERT(error == 0);
+		ASSERT0(error);
 	}
 
 	acl_obj = zfs_external_acl(zp);
@@ -744,12 +744,12 @@ zfs_rmnode(znode_t *zp)
 	}
 
 	if (xzp) {
-		ASSERT(error == 0);
+		ASSERT0(error);
 		mutex_enter(&xzp->z_lock);
 		xzp->z_unlinked = B_TRUE;	/* mark xzp for deletion */
 		clear_nlink(ZTOI(xzp));		/* no more links to it */
 		links = 0;
-		VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+		VERIFY0(sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
 		    &links, sizeof (links), tx));
 		mutex_exit(&xzp->z_lock);
 		zfs_unlinked_add(xzp, tx);
@@ -872,7 +872,7 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
 		    ctime);
 	}
 	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
-	ASSERT(error == 0);
+	ASSERT0(error);
 
 	mutex_exit(&zp->z_lock);
 
@@ -894,7 +894,7 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
 	    &dzp->z_pflags, sizeof (dzp->z_pflags));
 	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
 	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
-	ASSERT(error == 0);
+	ASSERT0(error);
 	mutex_exit(&dzp->z_lock);
 
 	return (0);
@@ -986,7 +986,7 @@ zfs_drop_nlink_locked(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp)
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
 	    NULL, &links, sizeof (links));
 	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
-	ASSERT3U(error, ==, 0);
+	ASSERT0(error);
 
 	if (unlinkedp != NULL)
 		*unlinkedp = unlinked;
@@ -1058,7 +1058,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
 
 		/* The only error is !zfs_dirempty() and we checked earlier. */
 		error = zfs_drop_nlink_locked(zp, tx, &unlinked);
-		ASSERT3U(error, ==, 0);
+		ASSERT0(error);
 		mutex_exit(&zp->z_lock);
 	} else {
 		error = zfs_dropname(dl, zp, dzp, tx, flag);
@@ -1083,7 +1083,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
 	    NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
 	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
 	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
-	ASSERT(error == 0);
+	ASSERT0(error);
 	mutex_exit(&dzp->z_lock);
 
 	if (unlinkedp != NULL)
@@ -1167,7 +1167,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr)
 	ASSERT(error == 0 && parent == zp->z_id);
 #endif
 
-	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+	VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
 	    sizeof (xzp->z_id), tx));
 
 	if (!zp->z_unlinked)
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
index d193eb80dca2..3fdcdbac6f68 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
@@ -115,8 +115,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
  */
 int
 zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
-    ssize_t *resid)
+    uint8_t ashift, ssize_t *resid)
 {
+	(void) ashift;
 	ssize_t rc;
 
 	rc  = kernel_write(fp, buf, count, &off);
@@ -260,24 +261,12 @@ zfs_file_fsync(zfs_file_t *filp, int flags)
 {
 	int datasync = 0;
 	int error;
-	int fstrans;
 
 	if (flags & O_DSYNC)
 		datasync = 1;
 
-	/*
-	 * May enter XFS which generates a warning when PF_FSTRANS is set.
-	 * To avoid this the flag is cleared over vfs_sync() and then reset.
-	 */
-	fstrans = __spl_pf_fstrans_check();
-	if (fstrans)
-		current->flags &= ~(__SPL_PF_FSTRANS);
-
 	error = -vfs_fsync(filp, datasync);
 
-	if (fstrans)
-		current->flags |= __SPL_PF_FSTRANS;
-
 	return (error);
 }
 
@@ -292,14 +281,6 @@ int
 zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len)
 {
 	/*
-	 * May enter XFS which generates a warning when PF_FSTRANS is set.
-	 * To avoid this the flag is cleared over vfs_sync() and then reset.
-	 */
-	int fstrans = __spl_pf_fstrans_check();
-	if (fstrans)
-		current->flags &= ~(__SPL_PF_FSTRANS);
-
-	/*
 	 * When supported by the underlying file system preferentially
 	 * use the fallocate() callback to preallocate the space.
 	 */
@@ -308,9 +289,6 @@ zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len)
 		error = -fp->f_op->fallocate(fp,
 		    FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len);
 
-	if (fstrans)
-		current->flags |= __SPL_PF_FSTRANS;
-
 	if (error)
 		return (SET_ERROR(error));
 
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c
index 1c187d7b9cab..895d80b2d79e 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c
@@ -223,7 +223,7 @@ zfs_kobj_add(zfs_mod_kobj_t *zkobj, struct kobject *parent, const char *name)
 {
 	/* zko_default_group.attrs must be NULL terminated */
 	ASSERT(zkobj->zko_default_group.attrs != NULL);
-	ASSERT(zkobj->zko_default_group.attrs[zkobj->zko_attr_count] == NULL);
+	ASSERT0P(zkobj->zko_default_group.attrs[zkobj->zko_attr_count]);
 
 	kobject_init(&zkobj->zko_kobj, &zkobj->zko_kobj_type);
 	return (kobject_add(&zkobj->zko_kobj, parent, name));
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
index a3837f784668..8a7d14ab6119 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
@@ -279,19 +279,14 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr)
 		return (err);
 
 	/*
-	 * If the pool is suspended, just return an error. This is to help
-	 * with shutting down with pools suspended, as we don't want to block
-	 * in that case.
+	 * Sync any pending writes, but do not block if the pool is suspended.
+	 * This is to help with shutting down with pools suspended, as we don't
+	 * want to block in that case.
 	 */
-	if (spa_suspended(zfsvfs->z_os->os_spa)) {
-		zfs_exit(zfsvfs, FTAG);
-		return (SET_ERROR(EIO));
-	}
-
-	zil_commit(zfsvfs->z_log, 0);
+	err = zil_commit_flags(zfsvfs->z_log, 0, ZIL_COMMIT_NOW);
 	zfs_exit(zfsvfs, FTAG);
 
-	return (0);
+	return (err);
 }
 
 static void
@@ -883,7 +878,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 	 * operations out since we closed the ZIL.
 	 */
 	if (mounting) {
-		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
+		ASSERT0P(zfsvfs->z_kstat.dk_kstats);
 		error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
 		if (error)
 			return (error);
@@ -1217,6 +1212,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
 }
 
 /*
+ * Dentry and inode caches referenced by a task in non-root memcg are
+ * not going to be scanned by the kernel-provided shrinker. So, if
+ * kernel prunes nothing, fall back to this manual walk to free dnodes.
+ * To avoid scanning the same znodes multiple times they are always rotated
+ * to the end of the z_all_znodes list. New znodes are inserted at the
+ * end of the list so we're always scanning the oldest znodes first.
+ */
+static int
+zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
+{
+	znode_t **zp_array, *zp;
+	int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *));
+	int objects = 0;
+	int i = 0, j = 0;
+
+	zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
+
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
+
+		if ((i++ > nr_to_scan) || (j >= max_array))
+			break;
+
+		ASSERT(list_link_active(&zp->z_link_node));
+		list_remove(&zfsvfs->z_all_znodes, zp);
+		list_insert_tail(&zfsvfs->z_all_znodes, zp);
+
+		/* Skip active znodes and .zfs entries */
+		if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir)
+			continue;
+
+		if (igrab(ZTOI(zp)) == NULL)
+			continue;
+
+		zp_array[j] = zp;
+		j++;
+	}
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	for (i = 0; i < j; i++) {
+		zp = zp_array[i];
+
+		ASSERT3P(zp, !=, NULL);
+		d_prune_aliases(ZTOI(zp));
+
+		if (atomic_read(&ZTOI(zp)->i_count) == 1)
+			objects++;
+
+		zrele(zp);
+	}
+
+	vmem_free(zp_array, max_array * sizeof (znode_t *));
+
+	return (objects);
+}
+
+/*
  * The ARC has requested that the filesystem drop entries from the dentry
  * and inode caches.  This can occur when the ARC needs to free meta data
  * blocks but can't because they are all pinned by entries in these caches.
@@ -1267,6 +1319,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
 	*objects = (*shrinker->scan_objects)(shrinker, &sc);
 #endif
 
+	/*
+	 * Fall back to zfs_prune_aliases if kernel's shrinker did nothing
+	 * due to dentry and inode caches being referenced by a task running
+	 * in non-root memcg.
+	 */
+	if (*objects == 0)
+		*objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
+
 	zfs_exit(zfsvfs, FTAG);
 
 	dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
@@ -1496,6 +1556,12 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
 	sb->s_xattr = zpl_xattr_handlers;
 	sb->s_export_op = &zpl_export_operations;
 
+#ifdef HAVE_SET_DEFAULT_D_OP
+	set_default_d_op(sb, &zpl_dentry_operations);
+#else
+	sb->s_d_op = &zpl_dentry_operations;
+#endif
+
 	/* Set features for file system. */
 	zfs_set_fuid_feature(zfsvfs);
 
@@ -1611,7 +1677,7 @@ zfs_umount(struct super_block *sb)
 
 	if (zfsvfs->z_arc_prune != NULL)
 		arc_remove_prune_callback(zfsvfs->z_arc_prune);
-	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
+	VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE));
 	os = zfsvfs->z_os;
 
 	/*
@@ -1737,8 +1803,8 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
 		ASSERT(*ipp != NULL);
 
 		if (object == ZFSCTL_INO_SNAPDIR) {
-			VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp,
-			    0, kcred, NULL, NULL) == 0);
+			VERIFY0(zfsctl_root_lookup(*ipp, "snapshot", ipp,
+			    0, kcred, NULL, NULL));
 		} else {
 			/*
 			 * Must have an existing ref, so igrab()
@@ -1840,7 +1906,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
 		goto bail;
 
 	ds->ds_dir->dd_activity_cancelled = B_FALSE;
-	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+	VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE));
 
 	zfs_set_fuid_feature(zfsvfs);
 	zfsvfs->z_rollback_time = jiffies;
@@ -2013,7 +2079,7 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
 		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
 		ASSERT0(error);
 
-		VERIFY(0 == sa_set_sa_object(os, sa_obj));
+		VERIFY0(sa_set_sa_object(os, sa_obj));
 		sa_register_update_callback(os, zfs_sa_upgrade);
 	}
 
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
index ed9721dade76..e845ad69ad78 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
@@ -25,6 +25,7 @@
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2025, Klara, Inc.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
@@ -840,8 +841,8 @@ out:
 		*zpp = zp;
 	}
 
-	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-		zil_commit(zilog, 0);
+	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		error = zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
@@ -1202,8 +1203,8 @@ out:
 		zfs_zrele_async(xzp);
 	}
 
-	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-		zil_commit(zilog, 0);
+	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		error = zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
@@ -1391,14 +1392,15 @@ out:
 
 	zfs_dirent_unlock(dl);
 
-	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-		zil_commit(zilog, 0);
-
 	if (error != 0) {
 		zrele(zp);
 	} else {
 		zfs_znode_update_vfs(dzp);
 		zfs_znode_update_vfs(zp);
+
+		if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+			error = zil_commit(zilog, 0);
+
 	}
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
@@ -1527,8 +1529,8 @@ out:
 	zfs_znode_update_vfs(zp);
 	zrele(zp);
 
-	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-		zil_commit(zilog, 0);
+	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		error = zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
@@ -2031,10 +2033,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
 		goto out3;
 	}
 
-	if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
-		err = SET_ERROR(EPERM);
-		goto out3;
-	}
+	/* ZFS_READONLY will be handled in zfs_zaccess() */
 
 	/*
 	 * Verify timestamps doesn't overflow 32 bits.
@@ -2482,10 +2481,10 @@ top:
 			new_mode = zp->z_mode;
 		}
 		err = zfs_acl_chown_setattr(zp);
-		ASSERT(err == 0);
+		ASSERT0(err);
 		if (attrzp) {
 			err = zfs_acl_chown_setattr(attrzp);
-			ASSERT(err == 0);
+			ASSERT0(err);
 		}
 	}
 
@@ -2599,7 +2598,7 @@ out:
 	if (err == 0 && xattr_count > 0) {
 		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
 		    xattr_count, tx);
-		ASSERT(err2 == 0);
+		ASSERT0(err2);
 	}
 
 	if (aclp)
@@ -2629,8 +2628,8 @@ out:
 	}
 
 out2:
-	if (os->os_sync == ZFS_SYNC_ALWAYS)
-		zil_commit(zilog, 0);
+	if (err == 0 && os->os_sync == ZFS_SYNC_ALWAYS)
+		err = zil_commit(zilog, 0);
 
 out3:
 	kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
@@ -3156,7 +3155,7 @@ top:
 		 * zfs_link_create() to add back the same entry, but with a new
 		 * dnode (szp), should not fail.
 		 */
-		ASSERT3P(tzp, ==, NULL);
+		ASSERT0P(tzp);
 		goto commit_link_tzp;
 	}
 
@@ -3234,8 +3233,8 @@ out:
 	zfs_dirent_unlock(sdl);
 	zfs_dirent_unlock(tdl);
 
-	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-		zil_commit(zilog, 0);
+	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		error = zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
@@ -3435,7 +3434,7 @@ top:
 		*zpp = zp;
 
 		if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-			zil_commit(zilog, 0);
+			error = zil_commit(zilog, 0);
 	} else {
 		zrele(zp);
 	}
@@ -3653,8 +3652,8 @@ top:
 		 * operation are sync safe.
 		 */
 		if (is_tmpfile) {
-			VERIFY(zap_remove_int(zfsvfs->z_os,
-			    zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
+			VERIFY0(zap_remove_int(zfsvfs->z_os,
+			    zfsvfs->z_unlinkedobj, szp->z_id, tx));
 		} else {
 			if (flags & FIGNORECASE)
 				txtype |= TX_CI;
@@ -3669,18 +3668,20 @@ top:
 
 	zfs_dirent_unlock(dl);
 
-	if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-		zil_commit(zilog, 0);
-
-	if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
-		txg_wait_flag_t wait_flags =
-		    spa_get_failmode(dmu_objset_spa(zfsvfs->z_os)) ==
-		    ZIO_FAILURE_MODE_CONTINUE ? TXG_WAIT_SUSPEND : 0;
-		error = txg_wait_synced_flags(dmu_objset_pool(zfsvfs->z_os),
-		    txg, wait_flags);
-		if (error != 0) {
-			ASSERT3U(error, ==, ESHUTDOWN);
-			error = SET_ERROR(EIO);
+	if (error == 0) {
+		if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+			error = zil_commit(zilog, 0);
+
+		if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
+			txg_wait_flag_t wait_flags =
+			    spa_get_failmode(dmu_objset_spa(zfsvfs->z_os)) ==
+			    ZIO_FAILURE_MODE_CONTINUE ? TXG_WAIT_SUSPEND : 0;
+			error = txg_wait_synced_flags(
+			    dmu_objset_pool(zfsvfs->z_os), txg, wait_flags);
+			if (error != 0) {
+				ASSERT3U(error, ==, ESHUTDOWN);
+				error = SET_ERROR(EIO);
+			}
 		}
 	}
 
@@ -3690,24 +3691,39 @@ top:
 	return (error);
 }
 
-static void
-zfs_putpage_sync_commit_cb(void *arg)
+/* Finish page writeback. */
+static inline void
+zfs_page_writeback_done(struct page *pp, int err)
 {
-	struct page *pp = arg;
+	if (err != 0) {
+		/*
+		 * Writeback failed. Re-dirty the page. It was undirtied before
+		 * the IO was issued (in zfs_putpage() or write_cache_pages()).
+		 * The kernel only considers writeback for dirty pages; if we
+		 * don't do this, it is eligible for eviction without being
+		 * written out, which we definitely don't want.
+		 */
+#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
+		filemap_dirty_folio(page_mapping(pp), page_folio(pp));
+#else
+		__set_page_dirty_nobuffers(pp);
+#endif
+	}
 
 	ClearPageError(pp);
 	end_page_writeback(pp);
 }
 
+/*
+ * ZIL callback for page writeback. Passes to zfs_log_write() in zfs_putpage()
+ * for syncing writes. Called when the ZIL itx has been written to the log or
+ * the whole txg syncs, or if the ZIL crashes or the pool suspends. Any failure
+ * is passed as `err`.
+ */
 static void
-zfs_putpage_async_commit_cb(void *arg)
+zfs_putpage_commit_cb(void *arg, int err)
 {
-	struct page *pp = arg;
-	znode_t *zp = ITOZ(pp->mapping->host);
-
-	ClearPageError(pp);
-	end_page_writeback(pp);
-	atomic_dec_32(&zp->z_async_writes_cnt);
+	zfs_page_writeback_done(arg, err);
 }
 
 /*
@@ -3827,15 +3843,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 		zfs_rangelock_exit(lr);
 
 		if (wbc->sync_mode != WB_SYNC_NONE) {
-			/*
-			 * Speed up any non-sync page writebacks since
-			 * they may take several seconds to complete.
-			 * Refer to the comment in zpl_fsync() for details.
-			 */
-			if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
-				zil_commit(zfsvfs->z_log, zp->z_id);
-			}
-
 			if (PageWriteback(pp))
 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
 				folio_wait_bit(page_folio(pp), PG_writeback);
@@ -3861,8 +3868,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 	 * was in fact not skipped and should not be counted as if it were.
 	 */
 	wbc->pages_skipped--;
-	if (!for_sync)
-		atomic_inc_32(&zp->z_async_writes_cnt);
 	set_page_writeback(pp);
 	unlock_page(pp);
 
@@ -3874,18 +3879,15 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 	err = dmu_tx_assign(tx, DMU_TX_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
-#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
-		filemap_dirty_folio(page_mapping(pp), page_folio(pp));
-#else
-		__set_page_dirty_nobuffers(pp);
-#endif
-		ClearPageError(pp);
-		end_page_writeback(pp);
-		if (!for_sync)
-			atomic_dec_32(&zp->z_async_writes_cnt);
+		zfs_page_writeback_done(pp, err);
 		zfs_rangelock_exit(lr);
 		zfs_exit(zfsvfs, FTAG);
-		return (err);
+
+		/*
+		 * Don't return error for an async writeback; we've re-dirtied
+		 * the page so it will be tried again some other time.
+		 */
+		return (for_sync ? err : 0);
 	}
 
 	va = kmap(pp);
@@ -3908,36 +3910,70 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 
 	err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
 
-	boolean_t commit = B_FALSE;
-	if (wbc->sync_mode != WB_SYNC_NONE) {
-		/*
-		 * Note that this is rarely called under writepages(), because
-		 * writepages() normally handles the entire commit for
-		 * performance reasons.
-		 */
-		commit = B_TRUE;
-	} else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
+	/*
+	 * A note about for_sync vs wbc->sync_mode.
+	 *
+	 * for_sync indicates that this is a syncing writeback, that is, kernel
+	 * caller expects the data to be durably stored before being notified.
+	 * Often, but not always, the call was triggered by a userspace syncing
+	 * op (eg fsync(), msync(MS_SYNC)). For our purposes, for_sync==TRUE
+	 * means that that page should remain "locked" (in the writeback state)
+	 * until it is definitely on disk (ie zil_commit() or spa_sync()).
+	 * Otherwise, we can unlock and return as soon as it is on the
+	 * in-memory ZIL.
+	 *
+	 * wbc->sync_mode has similar meaning. wbc is passed from the kernel to
+	 * zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE
+	 * indicates this a regular async writeback (eg a cache eviction) and
+	 * so does not need a durability guarantee, while WB_SYNC_ALL indicates
+	 * a syncing op that must be waited on (by convention, we test for
+	 * !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer durability over
+	 * performance should there ever be a new mode that we have not yet
+	 * added support for).
+	 *
+	 * So, why a separate for_sync field? This is because zpl_writepages()
+	 * calls zfs_putpage() multiple times for a single "logical" operation.
+	 * It wants all the individual pages to be for_sync==TRUE ie only
+	 * unlocked once durably stored, but it only wants one call to
+	 * zil_commit() at the very end, once all the pages are synced. So,
+	 * it repurposes sync_mode slightly to indicate who issue and wait for
+	 * the IO: for NONE, the caller to zfs_putpage() will do it, while for
+	 * ALL, zfs_putpage should do it.
+	 *
+	 * Summary:
+	 *   for_sync:  0=unlock immediately; 1=unlock once on disk
+	 *   sync_mode: NONE=caller will commit; ALL=we will commit
+	 */
+	boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE);
+
+	/*
+	 * We use for_sync as the "commit" arg to zfs_log_write() (arg 7)
+	 * because it is a policy flag that indicates "someone will call
+	 * zil_commit() soon". for_sync=TRUE means exactly that; the only
+	 * question is whether it will be us, or zpl_writepages().
+	 */
+	zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, for_sync,
+	    B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp);
+
+	if (!for_sync) {
 		/*
-		 * If the caller does not intend to wait synchronously
-		 * for this page writeback to complete and there are active
-		 * synchronous calls on this file, do a commit so that
-		 * the latter don't accidentally end up waiting for
-		 * our writeback to complete. Refer to the comment in
-		 * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
+		 * Async writeback is logged and written to the DMU, so page
+		 * can now be unlocked.
 		 */
-		commit = B_TRUE;
+		zfs_page_writeback_done(pp, 0);
 	}
 
-	zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
-	    B_FALSE, for_sync ? zfs_putpage_sync_commit_cb :
-	    zfs_putpage_async_commit_cb, pp);
-
 	dmu_tx_commit(tx);
 
 	zfs_rangelock_exit(lr);
 
-	if (commit)
-		zil_commit(zfsvfs->z_log, zp->z_id);
+	if (need_commit) {
+		err = zil_commit_flags(zfsvfs->z_log, zp->z_id, ZIL_COMMIT_NOW);
+		if (err != 0) {
+			zfs_exit(zfsvfs, FTAG);
+			return (err);
+		}
+	}
 
 	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
 
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c
index 54e60b4820f6..bcaabeb32b8a 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c
@@ -126,8 +126,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 	zp->z_acl_cached = NULL;
 	zp->z_xattr_cached = NULL;
 	zp->z_xattr_parent = 0;
-	zp->z_sync_writes_cnt = 0;
-	zp->z_async_writes_cnt = 0;
 
 	return (0);
 }
@@ -146,12 +144,9 @@ zfs_znode_cache_destructor(void *buf, void *arg)
 	rw_destroy(&zp->z_xattr_lock);
 	zfs_rangelock_fini(&zp->z_rangelock);
 
-	ASSERT3P(zp->z_dirlocks, ==, NULL);
-	ASSERT3P(zp->z_acl_cached, ==, NULL);
-	ASSERT3P(zp->z_xattr_cached, ==, NULL);
-
-	ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
-	ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
+	ASSERT0P(zp->z_dirlocks);
+	ASSERT0P(zp->z_acl_cached);
+	ASSERT0P(zp->z_xattr_cached);
 }
 
 static int
@@ -183,13 +178,13 @@ zfs_znode_init(void)
 	 * backed by kmalloc() when on the Linux slab in order that any
 	 * wait_on_bit() operations on the related inode operate properly.
 	 */
-	ASSERT(znode_cache == NULL);
+	ASSERT0P(znode_cache);
 	znode_cache = kmem_cache_create("zfs_znode_cache",
 	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
 	    zfs_znode_cache_destructor, NULL, NULL, NULL,
 	    KMC_SLAB | KMC_RECLAIMABLE);
 
-	ASSERT(znode_hold_cache == NULL);
+	ASSERT0P(znode_hold_cache);
 	znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
 	    sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
 	    zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
@@ -332,10 +327,10 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
 
 	mutex_enter(&zp->z_lock);
 
-	ASSERT(zp->z_sa_hdl == NULL);
-	ASSERT(zp->z_acl_cached == NULL);
+	ASSERT0P(zp->z_sa_hdl);
+	ASSERT0P(zp->z_acl_cached);
 	if (sa_hdl == NULL) {
-		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
+		VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, zp,
 		    SA_HDL_SHARED, &zp->z_sa_hdl));
 	} else {
 		zp->z_sa_hdl = sa_hdl;
@@ -371,6 +366,12 @@ zfs_inode_alloc(struct super_block *sb, struct inode **ip)
 	return (0);
 }
 
+void
+zfs_inode_free(struct inode *ip)
+{
+	kmem_cache_free(znode_cache, ITOZ(ip));
+}
+
 /*
  * Called in multiple places when an inode should be destroyed.
  */
@@ -395,8 +396,15 @@ zfs_inode_destroy(struct inode *ip)
 		nvlist_free(zp->z_xattr_cached);
 		zp->z_xattr_cached = NULL;
 	}
-
-	kmem_cache_free(znode_cache, zp);
+#ifndef HAVE_SOPS_FREE_INODE
+	/*
+	 * inode needs to be freed in RCU callback.  If we have
+	 * super_operations->free_inode, Linux kernel will do call_rcu
+	 * for us.  But if we don't have it, since call_rcu is GPL-only
+	 * symbol, we can only free synchronously and accept the risk.
+	 */
+	zfs_inode_free(ip);
+#endif
 }
 
 static void
@@ -522,9 +530,9 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 		return (NULL);
 
 	zp = ITOZ(ip);
-	ASSERT(zp->z_dirlocks == NULL);
-	ASSERT3P(zp->z_acl_cached, ==, NULL);
-	ASSERT3P(zp->z_xattr_cached, ==, NULL);
+	ASSERT0P(zp->z_dirlocks);
+	ASSERT0P(zp->z_acl_cached);
+	ASSERT0P(zp->z_xattr_cached);
 	zp->z_unlinked = B_FALSE;
 	zp->z_atime_dirty = B_FALSE;
 	zp->z_is_ctldir = B_FALSE;
@@ -535,8 +543,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 	zp->z_blksz = blksz;
 	zp->z_seq = 0x7A4653;
 	zp->z_sync_cnt = 0;
-	zp->z_sync_writes_cnt = 0;
-	zp->z_async_writes_cnt = 0;
 
 	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
 
@@ -605,7 +611,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 	 * processing so do not hash unlinked znodes.
 	 */
 	if (links > 0)
-		VERIFY3S(insert_inode_locked(ip), ==, 0);
+		VERIFY0(insert_inode_locked(ip));
 
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
@@ -805,7 +811,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
 	}
 
 	/* Now add in all of the "SA" attributes */
-	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
+	VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
 	    &sa_hdl));
 
 	/*
@@ -895,7 +901,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
 		    acl_ids->z_fuid, acl_ids->z_fgid);
 	}
 
-	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
+	VERIFY0(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx));
 
 	if (!(flag & IS_ROOT_NODE)) {
 		/*
@@ -1194,7 +1200,7 @@ zfs_rezget(znode_t *zp)
 	}
 	rw_exit(&zp->z_xattr_lock);
 
-	ASSERT(zp->z_sa_hdl == NULL);
+	ASSERT0P(zp->z_sa_hdl);
 	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
 	if (err) {
 		zfs_znode_hold_exit(zfsvfs, zh);
@@ -1308,9 +1314,9 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
 	zh = zfs_znode_hold_enter(zfsvfs, obj);
 	if (acl_obj) {
 		VERIFY(!zp->z_is_sa);
-		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+		VERIFY0(dmu_object_free(os, acl_obj, tx));
 	}
-	VERIFY(0 == dmu_object_free(os, obj, tx));
+	VERIFY0(dmu_object_free(os, obj, tx));
 	zfs_znode_dmu_fini(zp);
 	zfs_znode_hold_exit(zfsvfs, zh);
 }
@@ -1530,7 +1536,7 @@ zfs_extend(znode_t *zp, uint64_t end)
 
 	zp->z_size = end;
 
-	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
+	VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
 	    &zp->z_size, sizeof (zp->z_size), tx));
 
 	zfs_rangelock_exit(lr);
@@ -1720,7 +1726,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
 		    NULL, &zp->z_pflags, 8);
 	}
-	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
+	VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
 
 	dmu_tx_commit(tx);
 	zfs_rangelock_exit(lr);
@@ -1787,7 +1793,7 @@ log:
 	    NULL, &zp->z_pflags, 8);
 	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
 	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
-	ASSERT(error == 0);
+	ASSERT0(error);
 
 	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
 
@@ -1834,7 +1840,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 	moid = MASTER_NODE_OBJ;
 	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
 	    DMU_OT_NONE, 0, tx);
-	ASSERT(error == 0);
+	ASSERT0(error);
 
 	/*
 	 * Set starting attributes.
@@ -1847,7 +1853,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 		const char *name;
 
 		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
-		VERIFY(nvpair_value_uint64(elem, &val) == 0);
+		VERIFY0(nvpair_value_uint64(elem, &val));
 		name = nvpair_name(elem);
 		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
 			if (val < version)
@@ -1855,7 +1861,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 		} else {
 			error = zap_update(os, moid, name, 8, 1, &val, tx);
 		}
-		ASSERT(error == 0);
+		ASSERT0(error);
 		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
 			norm = val;
 		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
@@ -1863,7 +1869,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 	}
 	ASSERT(version != 0);
 	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
-	ASSERT(error == 0);
+	ASSERT0(error);
 
 	/*
 	 * Create zap object used for SA attribute registration
@@ -1873,7 +1879,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
 		    DMU_OT_NONE, 0, tx);
 		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
-		ASSERT(error == 0);
+		ASSERT0(error);
 	} else {
 		sa_obj = 0;
 	}
@@ -1883,7 +1889,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
 
 	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
-	ASSERT(error == 0);
+	ASSERT0(error);
 
 	/*
 	 * Create root znode.  Create minimal znode/inode/zfsvfs/sb
@@ -1916,7 +1922,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
 	    &zfsvfs->z_attr_table);
 
-	ASSERT(error == 0);
+	ASSERT0(error);
 
 	/*
 	 * Fold case on file systems that are always or sometimes case
@@ -1940,12 +1946,12 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 		mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
 	}
 
-	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
+	VERIFY0(zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
 	    cr, NULL, &acl_ids, zfs_init_idmap));
 	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
 	ASSERT3P(zp, ==, rootzp);
 	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
-	ASSERT(error == 0);
+	ASSERT0(error);
 	zfs_acl_ids_free(&acl_ids);
 
 	atomic_set(&ZTOI(rootzp)->i_count, 0);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
index 48dae79a2373..81ac26cb0c93 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
@@ -202,7 +202,7 @@ zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags)
 	return (!!dentry->d_inode);
 }
 
-static dentry_operations_t zpl_dops_snapdirs = {
+static const struct dentry_operations zpl_dops_snapdirs = {
 /*
  * Auto mounting of snapshots is only supported for 2.6.37 and
  * newer kernels.  Prior to this kernel the ops->follow_link()
@@ -215,6 +215,51 @@ static dentry_operations_t zpl_dops_snapdirs = {
 	.d_revalidate	= zpl_snapdir_revalidate,
 };
 
+/*
+ * For the .zfs control directory to work properly we must be able to override
+ * the default operations table and register custom .d_automount and
+ * .d_revalidate callbacks.
+ */
+static void
+set_snapdir_dentry_ops(struct dentry *dentry, unsigned int extraflags) {
+	static const unsigned int op_flags =
+	    DCACHE_OP_HASH | DCACHE_OP_COMPARE |
+	    DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE |
+	    DCACHE_OP_PRUNE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_REAL;
+
+#ifdef HAVE_D_SET_D_OP
+	/*
+	 * d_set_d_op() will set the DCACHE_OP_ flags according to what it
+	 * finds in the passed dentry_operations, so we don't have to.
+	 *
+	 * We clear the flags and the old op table before calling d_set_d_op()
+	 * because issues a warning when the dentry operations table is already
+	 * set.
+	 */
+	dentry->d_op = NULL;
+	dentry->d_flags &= ~op_flags;
+	d_set_d_op(dentry, &zpl_dops_snapdirs);
+	dentry->d_flags |= extraflags;
+#else
+	/*
+	 * Since 6.17 there's no exported way to modify dentry ops, so we have
+	 * to reach in and do it ourselves. This should be safe for our very
+	 * narrow use case, which is to create or splice in an entry to give
+	 * access to a snapshot.
+	 *
+	 * We need to set the op flags directly. We hardcode
+	 * DCACHE_OP_REVALIDATE because that's the only operation we have; if
+	 * we ever extend zpl_dops_snapdirs we will need to update the op flags
+	 * to match.
+	 */
+	spin_lock(&dentry->d_lock);
+	dentry->d_op = &zpl_dops_snapdirs;
+	dentry->d_flags &= ~op_flags;
+	dentry->d_flags |= DCACHE_OP_REVALIDATE | extraflags;
+	spin_unlock(&dentry->d_lock);
+#endif
+}
+
 static struct dentry *
 zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
     unsigned int flags)
@@ -236,10 +281,7 @@ zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
 		return (ERR_PTR(error));
 
 	ASSERT(error == 0 || ip == NULL);
-	d_clear_d_op(dentry);
-	d_set_d_op(dentry, &zpl_dops_snapdirs);
-	dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
-
+	set_snapdir_dentry_ops(dentry, DCACHE_NEED_AUTOMOUNT);
 	return (d_splice_alias(ip, dentry));
 }
 
@@ -373,8 +415,7 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
 
 	error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0);
 	if (error == 0) {
-		d_clear_d_op(dentry);
-		d_set_d_op(dentry, &zpl_dops_snapdirs);
+		set_snapdir_dentry_ops(dentry, 0);
 		d_instantiate(dentry, ip);
 	}
 
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
index 1a82c13e1523..02965ac8cbee 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
@@ -22,6 +22,8 @@
 /*
  * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2025, Klara, Inc.
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
  */
 
 
@@ -106,60 +108,52 @@ zpl_iterate(struct file *filp, struct dir_context *ctx)
 	return (error);
 }
 
+static inline int
+zpl_write_cache_pages(struct address_space *mapping,
+    struct writeback_control *wbc, void *data);
+
 static int
 zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = filp->f_mapping->host;
 	znode_t *zp = ITOZ(inode);
-	zfsvfs_t *zfsvfs = ITOZSB(inode);
 	cred_t *cr = CRED();
 	int error;
 	fstrans_cookie_t cookie;
 
 	/*
-	 * The variables z_sync_writes_cnt and z_async_writes_cnt work in
-	 * tandem so that sync writes can detect if there are any non-sync
-	 * writes going on and vice-versa. The "vice-versa" part to this logic
-	 * is located in zfs_putpage() where non-sync writes check if there are
-	 * any ongoing sync writes. If any sync and non-sync writes overlap,
-	 * we do a commit to complete the non-sync writes since the latter can
-	 * potentially take several seconds to complete and thus block sync
-	 * writes in the upcoming call to filemap_write_and_wait_range().
-	 */
-	atomic_inc_32(&zp->z_sync_writes_cnt);
-	/*
-	 * If the following check does not detect an overlapping non-sync write
-	 * (say because it's just about to start), then it is guaranteed that
-	 * the non-sync write will detect this sync write. This is because we
-	 * always increment z_sync_writes_cnt / z_async_writes_cnt before doing
-	 * the check on z_async_writes_cnt / z_sync_writes_cnt here and in
-	 * zfs_putpage() respectively.
+	 * Force dirty pages in the range out to the DMU and the log, ready
+	 * for zil_commit() to write down.
+	 *
+	 * We call write_cache_pages() directly to ensure that zpl_putpage() is
+	 * called with the flags we need. We need WB_SYNC_NONE to avoid a call
+	 * to zil_commit() (since we're doing this as a kind of pre-sync); but
+	 * we do need for_sync so that the pages remain in writeback until
+	 * they're on disk, and so that we get an error if the DMU write fails.
 	 */
-	if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
-		if ((error = zpl_enter(zfsvfs, FTAG)) != 0) {
-			atomic_dec_32(&zp->z_sync_writes_cnt);
+	if (filemap_range_has_page(inode->i_mapping, start, end)) {
+		int for_sync = 1;
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_NONE,
+			.nr_to_write = LONG_MAX,
+			.range_start = start,
+			.range_end = end,
+		};
+		error =
+		    zpl_write_cache_pages(inode->i_mapping, &wbc, &for_sync);
+		if (error != 0) {
+			/*
+			 * Unclear what state things are in. zfs_putpage() will
+			 * ensure the pages remain dirty if they haven't been
+			 * written down to the DMU, but because there may be
+			 * nothing logged, we can't assume that zfs_sync() ->
+			 * zil_commit() will give us a useful error. It's
+			 * safest if we just error out here.
+			 */
 			return (error);
 		}
-		zil_commit(zfsvfs->z_log, zp->z_id);
-		zpl_exit(zfsvfs, FTAG);
 	}
 
-	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
-
-	/*
-	 * The sync write is not complete yet but we decrement
-	 * z_sync_writes_cnt since zfs_fsync() increments and decrements
-	 * it internally. If a non-sync write starts just after the decrement
-	 * operation but before we call zfs_fsync(), it may not detect this
-	 * overlapping sync write but it does not matter since we have already
-	 * gone past filemap_write_and_wait_range() and we won't block due to
-	 * the non-sync write.
-	 */
-	atomic_dec_32(&zp->z_sync_writes_cnt);
-
-	if (error)
-		return (error);
-
 	crhold(cr);
 	cookie = spl_fstrans_mark();
 	error = -zfs_fsync(zp, datasync, cr);
@@ -485,6 +479,7 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
 	return (ret);
 }
 
+#ifdef HAVE_WRITE_CACHE_PAGES
 #ifdef HAVE_WRITEPAGE_T_FOLIO
 static int
 zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
@@ -506,6 +501,78 @@ zpl_write_cache_pages(struct address_space *mapping,
 #endif
 	return (result);
 }
+#else
+static inline int
+zpl_write_cache_pages(struct address_space *mapping,
+    struct writeback_control *wbc, void *data)
+{
+	pgoff_t start = wbc->range_start >> PAGE_SHIFT;
+	pgoff_t end = wbc->range_end >> PAGE_SHIFT;
+
+	struct folio_batch fbatch;
+	folio_batch_init(&fbatch);
+
+	/*
+	 * This atomically (-ish) tags all DIRTY pages in the range with
+	 * TOWRITE, allowing users to continue dirtying or undirtying pages
+	 * while we get on with writeback, without us treading on each other.
+	 */
+	tag_pages_for_writeback(mapping, start, end);
+
+	int err = 0;
+	unsigned int npages;
+
+	/*
+	 * Grab references to the TOWRITE pages just flagged. This may not get
+	 * all of them, so we do it in a loop until there are none left.
+	 */
+	while ((npages = filemap_get_folios_tag(mapping, &start, end,
+	    PAGECACHE_TAG_TOWRITE, &fbatch)) != 0) {
+
+		/* Loop over each page and write it out. */
+		struct folio *folio;
+		while ((folio = folio_batch_next(&fbatch)) != NULL) {
+			folio_lock(folio);
+
+			/*
+			 * If the folio has been remapped, or is no longer
+			 * dirty, then there's nothing to do.
+			 */
+			if (folio->mapping != mapping ||
+			    !folio_test_dirty(folio)) {
+				folio_unlock(folio);
+				continue;
+			}
+
+			/*
+			 * If writeback is already in progress, wait for it to
+			 * finish. We continue after this even if the page
+			 * ends up clean; zfs_putpage() will skip it if no
+			 * further work is required.
+			 */
+			while (folio_test_writeback(folio))
+				folio_wait_bit(folio, PG_writeback);
+
+			/*
+			 * Write it out and collect any error. zfs_putpage()
+			 * will clear the TOWRITE and DIRTY flags, and return
+			 * with the page unlocked.
+			 */
+			int ferr = zpl_putpage(&folio->page, wbc, data);
+			if (err == 0 && ferr != 0)
+				err = ferr;
+
+			/* Housekeeping for the caller. */
+			wbc->nr_to_write -= folio_nr_pages(folio);
+		}
+
+		/* Release any remaining references on the batch. */
+		folio_batch_release(&fbatch);
+	}
+
+	return (err);
+}
+#endif
 
 static int
 zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
@@ -535,11 +602,30 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	if (sync_mode != wbc->sync_mode) {
 		if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 			return (result);
-		if (zfsvfs->z_log != NULL)
-			zil_commit(zfsvfs->z_log, zp->z_id);
+
+		if (zfsvfs->z_log != NULL) {
+			/*
+			 * We don't want to block here if the pool suspends,
+			 * because this is not a syncing op by itself, but
+			 * might be part of one that the caller will
+			 * coordinate.
+			 */
+			result = -zil_commit_flags(zfsvfs->z_log, zp->z_id,
+			    ZIL_COMMIT_NOW);
+		}
+
 		zpl_exit(zfsvfs, FTAG);
 
 		/*
+		 * If zil_commit_flags() failed, it's unclear what state things
+		 * are currently in. putpage() has written back out what it can
+		 * to the DMU, but it may not be on disk. We have little choice
+		 * but to escape.
+		 */
+		if (result != 0)
+			return (result);
+
+		/*
 		 * We need to call write_cache_pages() again (we can't just
 		 * return after the commit) because the previous call in
 		 * non-SYNC mode does not guarantee that we got all the dirty
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
index f9f6406f8b47..f97662d052c7 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
@@ -247,7 +247,7 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 	 * and fifos, but we want to know if this behavior ever changes.
 	 */
 	if (S_ISSOCK(mode) || S_ISFIFO(mode))
-		ASSERT(rdev == 0);
+		ASSERT0(rdev);
 
 	crhold(cr);
 	vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
index a682bfd33c38..347b352506e5 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
@@ -22,6 +22,8 @@
 /*
  * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
  * Copyright (c) 2023, Datto Inc. All rights reserved.
+ * Copyright (c) 2025, Klara, Inc.
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
  */
 
 
@@ -32,7 +34,22 @@
 #include <sys/zpl.h>
 #include <linux/iversion.h>
 #include <linux/version.h>
+#include <linux/vfs_compat.h>
 
+/*
+ * What to do when the last reference to an inode is released. If 0, the kernel
+ * will cache it on the superblock. If 1, the inode will be freed immediately.
+ * See zpl_drop_inode().
+ */
+int zfs_delete_inode = 0;
+
+/*
+ * What to do when the last reference to a dentry is released. If 0, the kernel
+ * will cache it until the entry (file) is destroyed. If 1, the dentry will be
+ * marked for cleanup, at which time its inode reference will be released. See
+ * zpl_dentry_delete().
+ */
+int zfs_delete_dentry = 0;
 
 static struct inode *
 zpl_inode_alloc(struct super_block *sb)
@@ -45,10 +62,19 @@ zpl_inode_alloc(struct super_block *sb)
 	return (ip);
 }
 
+#ifdef HAVE_SOPS_FREE_INODE
+static void
+zpl_inode_free(struct inode *ip)
+{
+	ASSERT0(atomic_read(&ip->i_count));
+	zfs_inode_free(ip);
+}
+#endif
+
 static void
 zpl_inode_destroy(struct inode *ip)
 {
-	ASSERT(atomic_read(&ip->i_count) == 0);
+	ASSERT0(atomic_read(&ip->i_count));
 	zfs_inode_destroy(ip);
 }
 
@@ -68,11 +94,36 @@ zpl_dirty_inode(struct inode *ip, int flags)
 }
 
 /*
- * When ->drop_inode() is called its return value indicates if the
- * inode should be evicted from the inode cache.  If the inode is
- * unhashed and has no links the default policy is to evict it
- * immediately.
+ * ->drop_inode() is called when the last reference to an inode is released.
+ * Its return value indicates if the inode should be destroyed immediately, or
+ * cached on the superblock structure.
+ *
+ * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns
+ * "destroy immediately" if the inode is unhashed and has no links (roughly: no
+ * longer exists on disk). On datasets with millions of rarely-accessed files,
+ * this can cause a large amount of memory to be "pinned" by cached inodes,
+ * which in turn pin their associated dnodes and dbufs, until the kernel starts
+ * reporting memory pressure and requests OpenZFS release some memory (see
+ * zfs_prune()).
+ *
+ * When set to 1, we call generic_delete_inode(), which always returns "destroy
+ * immediately", resulting in inodes being destroyed immediately, releasing
+ * their associated dnodes and dbufs to the dbuf cached and the ARC to be
+ * evicted as normal.
  *
+ * Note that the "last reference" doesn't always mean the last _userspace_
+ * reference; the dentry cache also holds a reference, so "busy" inodes will
+ * still be kept alive that way (subject to dcache tuning).
+ */
+static int
+zpl_drop_inode(struct inode *ip)
+{
+	if (zfs_delete_inode)
+		return (generic_delete_inode(ip));
+	return (generic_drop_inode(ip));
+}
+
+/*
  * The ->evict_inode() callback must minimally truncate the inode pages,
  * and call clear_inode().  For 2.6.35 and later kernels this will
  * simply update the inode state, with the sync occurring before the
@@ -455,9 +506,13 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg)
 
 const struct super_operations zpl_super_operations = {
 	.alloc_inode		= zpl_inode_alloc,
+#ifdef HAVE_SOPS_FREE_INODE
+	.free_inode		= zpl_inode_free,
+#endif
 	.destroy_inode		= zpl_inode_destroy,
 	.dirty_inode		= zpl_dirty_inode,
 	.write_inode		= NULL,
+	.drop_inode		= zpl_drop_inode,
 	.evict_inode		= zpl_evict_inode,
 	.put_super		= zpl_put_super,
 	.sync_fs		= zpl_sync_fs,
@@ -468,6 +523,35 @@ const struct super_operations zpl_super_operations = {
 	.show_stats		= NULL,
 };
 
+/*
+ * ->d_delete() is called when the last reference to a dentry is released. Its
+ *  return value indicates if the dentry should be destroyed immediately, or
+ *  retained in the dentry cache.
+ *
+ * By default (zfs_delete_dentry=0) the kernel will always cache unused
+ * entries.  Each dentry holds an inode reference, so cached dentries can hold
+ * the final inode reference indefinitely, leading to the inode and its related
+ * data being pinned (see zpl_drop_inode()).
+ *
+ * When set to 1, we signal that the dentry should be destroyed immediately and
+ * never cached. This reduces memory usage, at the cost of higher overheads to
+ * lookup a file, as the inode and its underlying data (dnode/dbuf) need to be
+ * reloaded and reinflated.
+ *
+ * Note that userspace does not have direct control over dentry references and
+ * reclaim; rather, this is part of the kernel's caching and reclaim subsystems
+ * (eg vm.vfs_cache_pressure).
+ */
+static int
+zpl_dentry_delete(const struct dentry *dentry)
+{
+	return (zfs_delete_dentry ? 1 : 0);
+}
+
+const struct dentry_operations zpl_dentry_operations = {
+	.d_delete = zpl_dentry_delete,
+};
+
 struct file_system_type zpl_fs_type = {
 	.owner			= THIS_MODULE,
 	.name			= ZFS_DRIVER,
@@ -479,3 +563,10 @@ struct file_system_type zpl_fs_type = {
 	.mount			= zpl_mount,
 	.kill_sb		= zpl_kill_sb,
 };
+
+ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW,
+	"Delete inodes as soon as the last reference is released.");
+
+ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW,
+	"Delete dentries from dentry cache as soon as the last reference is "
+	"released.");
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
index a098197e7448..d93282db815a 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
@@ -1494,7 +1494,7 @@ zpl_posix_acl_free(void *arg)
 				acl_rel_head = NULL;
 				if (cmpxchg(&acl_rel_tail, &a->next,
 				    &acl_rel_head) == &a->next) {
-					ASSERT3P(a->next, ==, NULL);
+					ASSERT0P(a->next);
 					a->next = freelist;
 					freelist = a;
 					break;
@@ -1544,7 +1544,7 @@ zpl_posix_acl_release_impl(struct posix_acl *acl)
 	a->time = ddi_get_lbolt();
 	/* atomically points tail to us and get the previous tail */
 	prev = xchg(&acl_rel_tail, &a->next);
-	ASSERT3P(*prev, ==, NULL);
+	ASSERT0P(*prev);
 	*prev = a;
 	/* if it was empty before, schedule the free task */
 	if (prev == &acl_rel_head)
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index 57a9711e9027..fe939150b641 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -21,8 +21,8 @@
  */
 /*
  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
- * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
- * Copyright (c) 2024, Klara, Inc.
+ * Copyright (c) 2024, 2025, Rob Norris <robn@despairlabs.com>
+ * Copyright (c) 2024, 2025, Klara, Inc.
  */
 
 #include <sys/dataset_kstats.h>
@@ -84,8 +84,9 @@ static unsigned int zvol_blk_mq_blocks_per_thread = 8;
 static inline void
 zvol_end_io(struct bio *bio, struct request *rq, int error)
 {
+	ASSERT3U(error, >=, 0);
 	if (bio) {
-		bio->bi_status = errno_to_bi_status(-error);
+		bio->bi_status = errno_to_bi_status(error);
 		bio_endio(bio);
 	} else {
 		blk_mq_end_request(rq, errno_to_bi_status(error));
@@ -208,8 +209,14 @@ zvol_write(zv_request_t *zvr)
 	disk = zv->zv_zso->zvo_disk;
 
 	/* bio marked as FLUSH need to flush before write */
-	if (io_is_flush(bio, rq))
-		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+	if (io_is_flush(bio, rq)) {
+		error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
+		if (error != 0) {
+			rw_exit(&zv->zv_suspend_lock);
+			zvol_end_io(bio, rq, -error);
+			return;
+		}
+	}
 
 	/* Some requests are just for flush and nothing else. */
 	if (io_size(bio, rq) == 0) {
@@ -273,8 +280,8 @@ zvol_write(zv_request_t *zvr)
 	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
 	task_io_account_write(nwritten);
 
-	if (sync)
-		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+	if (error == 0 && sync)
+		error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
 	rw_exit(&zv->zv_suspend_lock);
 
@@ -282,7 +289,7 @@ zvol_write(zv_request_t *zvr)
 		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
 	}
 
-	zvol_end_io(bio, rq, -error);
+	zvol_end_io(bio, rq, error);
 }
 
 static void
@@ -330,16 +337,14 @@ zvol_discard(zv_request_t *zvr)
 	}
 
 	/*
-	 * Align the request to volume block boundaries when a secure erase is
-	 * not required.  This will prevent dnode_free_range() from zeroing out
-	 * the unaligned parts which is slow (read-modify-write) and useless
-	 * since we are not freeing any space by doing so.
+	 * Align the request to volume block boundaries. This will prevent
+	 * dnode_free_range() from zeroing out the unaligned parts which is
+	 * slow (read-modify-write) and useless since we are not freeing any
+	 * space by doing so.
 	 */
-	if (!io_is_secure_erase(bio, rq)) {
-		start = P2ROUNDUP(start, zv->zv_volblocksize);
-		end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
-		size = end - start;
-	}
+	start = P2ROUNDUP(start, zv->zv_volblocksize);
+	end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
+	size = end - start;
 
 	if (start >= end)
 		goto unlock;
@@ -361,7 +366,7 @@ zvol_discard(zv_request_t *zvr)
 	zfs_rangelock_exit(lr);
 
 	if (error == 0 && sync)
-		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+		error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
 unlock:
 	rw_exit(&zv->zv_suspend_lock);
@@ -371,7 +376,7 @@ unlock:
 		    start_time);
 	}
 
-	zvol_end_io(bio, rq, -error);
+	zvol_end_io(bio, rq, error);
 }
 
 static void
@@ -449,7 +454,7 @@ zvol_read(zv_request_t *zvr)
 		blk_generic_end_io_acct(q, disk, READ, bio, start_time);
 	}
 
-	zvol_end_io(bio, rq, -error);
+	zvol_end_io(bio, rq, error);
 }
 
 static void
@@ -460,6 +465,24 @@ zvol_read_task(void *arg)
 	zv_request_task_free(task);
 }
 
+/*
+ * Note:
+ *
+ * The kernel uses different enum names for the IO opcode, depending on the
+ * kernel version ('req_opf', 'req_op').  To sidestep this, use macros rather
+ * than inline functions for these checks.
+ */
+/* Should this IO go down the zvol write path? */
+#define	ZVOL_OP_IS_WRITE(op) \
+	(op == REQ_OP_WRITE || \
+	op == REQ_OP_FLUSH || \
+	op == REQ_OP_DISCARD)
+
+/* Is this IO type supported by zvols? */
+#define	ZVOL_OP_IS_SUPPORTED(op) (op == REQ_OP_READ || ZVOL_OP_IS_WRITE(op))
+
+/* Get the IO opcode */
+#define	ZVOL_OP(bio, rq) (bio != NULL ? bio_op(bio) : req_op(rq))
 
 /*
  * Process a BIO or request
@@ -477,10 +500,36 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	uint64_t offset = io_offset(bio, rq);
 	uint64_t size = io_size(bio, rq);
-	int rw = io_data_dir(bio, rq);
+	int rw;
+
+	if (unlikely(!ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)))) {
+		zfs_dbgmsg("Unsupported zvol %s, op=%d, flags=0x%x",
+		    rq != NULL ? "request" : "BIO",
+		    ZVOL_OP(bio, rq),
+		    rq != NULL ? rq->cmd_flags : bio->bi_opf);
+		ASSERT(ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)));
+		zvol_end_io(bio, rq, SET_ERROR(ENOTSUPP));
+		goto out;
+	}
+
+	if (ZVOL_OP_IS_WRITE(ZVOL_OP(bio, rq))) {
+		rw = WRITE;
+	} else {
+		rw = READ;
+	}
+
+	/*
+	 * Sanity check
+	 *
+	 * If we're a BIO, check our rw matches the kernel's
+	 * bio_data_dir(bio) rw.  We need to check because we support fewer
+	 * IO operations, and want to verify that what we think are reads and
+	 * writes from those operations match what the kernel thinks.
+	 */
+	ASSERT(rq != NULL || rw == bio_data_dir(bio));
 
 	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
-		zvol_end_io(bio, rq, -SET_ERROR(ENXIO));
+		zvol_end_io(bio, rq, SET_ERROR(ENXIO));
 		goto out;
 	}
 
@@ -499,7 +548,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 		    (long long unsigned)offset,
 		    (long unsigned)size);
 
-		zvol_end_io(bio, rq, -SET_ERROR(EIO));
+		zvol_end_io(bio, rq, SET_ERROR(EIO));
 		goto out;
 	}
 
@@ -512,8 +561,8 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 #ifdef HAVE_BLK_MQ_RQ_HCTX
 		blk_mq_hw_queue = rq->mq_hctx->queue_num;
 #else
-		blk_mq_hw_queue =
-		    rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
+		blk_mq_hw_queue = rq->q->queue_hw_ctx[
+		    rq->q->mq_map[raw_smp_processor_id()]]->queue_num;
 #endif
 	taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
 	    blk_mq_hw_queue);
@@ -521,7 +570,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 
 	if (rw == WRITE) {
 		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
-			zvol_end_io(bio, rq, -SET_ERROR(EROFS));
+			zvol_end_io(bio, rq, SET_ERROR(EROFS));
 			goto out;
 		}
 
@@ -582,7 +631,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 		 * interfaces lack this functionality (they block waiting for
 		 * the i/o to complete).
 		 */
-		if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
+		if (io_is_discard(bio, rq)) {
 			if (force_sync) {
 				zvol_discard(&zvr);
 			} else {
@@ -672,28 +721,19 @@ zvol_open(struct block_device *bdev, fmode_t flag)
 
 retry:
 #endif
-	rw_enter(&zvol_state_lock, RW_READER);
-	/*
-	 * Obtain a copy of private_data under the zvol_state_lock to make
-	 * sure that either the result of zvol free code path setting
-	 * disk->private_data to NULL is observed, or zvol_os_free()
-	 * is not called on this zv because of the positive zv_open_count.
-	 */
+
 #ifdef HAVE_BLK_MODE_T
-	zv = disk->private_data;
+	zv = atomic_load_ptr(&disk->private_data);
 #else
-	zv = bdev->bd_disk->private_data;
+	zv = atomic_load_ptr(&bdev->bd_disk->private_data);
 #endif
 	if (zv == NULL) {
-		rw_exit(&zvol_state_lock);
 		return (-SET_ERROR(ENXIO));
 	}
 
 	mutex_enter(&zv->zv_state_lock);
-
 	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
 		mutex_exit(&zv->zv_state_lock);
-		rw_exit(&zvol_state_lock);
 		return (-SET_ERROR(ENXIO));
 	}
 
@@ -705,8 +745,28 @@ retry:
 	if (zv->zv_open_count == 0) {
 		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
 			mutex_exit(&zv->zv_state_lock);
+
+			/*
+			 * Removal may happen while the locks are down, so
+			 * we can't trust zv any longer; we have to start over.
+			 */
+#ifdef HAVE_BLK_MODE_T
+			zv = atomic_load_ptr(&disk->private_data);
+#else
+			zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+#endif
+			if (zv == NULL)
+				return (-SET_ERROR(ENXIO));
+
 			rw_enter(&zv->zv_suspend_lock, RW_READER);
 			mutex_enter(&zv->zv_state_lock);
+
+			if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
+				mutex_exit(&zv->zv_state_lock);
+				rw_exit(&zv->zv_suspend_lock);
+				return (-SET_ERROR(ENXIO));
+			}
+
 			/* check to see if zv_suspend_lock is needed */
 			if (zv->zv_open_count != 0) {
 				rw_exit(&zv->zv_suspend_lock);
@@ -717,7 +777,6 @@ retry:
 			drop_suspend = B_TRUE;
 		}
 	}
-	rw_exit(&zvol_state_lock);
 
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
@@ -814,11 +873,11 @@ zvol_release(struct gendisk *disk, fmode_t unused)
 #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
 	(void) unused;
 #endif
-	zvol_state_t *zv;
 	boolean_t drop_suspend = B_TRUE;
 
-	rw_enter(&zvol_state_lock, RW_READER);
-	zv = disk->private_data;
+	zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
+	if (zv == NULL)
+		return;
 
 	mutex_enter(&zv->zv_state_lock);
 	ASSERT3U(zv->zv_open_count, >, 0);
@@ -832,6 +891,15 @@ zvol_release(struct gendisk *disk, fmode_t unused)
 			mutex_exit(&zv->zv_state_lock);
 			rw_enter(&zv->zv_suspend_lock, RW_READER);
 			mutex_enter(&zv->zv_state_lock);
+
+			/*
+			 * Unlike in zvol_open(), we don't check if removal
+			 * started here, because we might be one of the openers
+			 * that needs to be thrown out! If we're the last, we
+			 * need to call zvol_last_close() below to finish
+			 * cleanup. So, no special treatment for us.
+			 */
+
 			/* check to see if zv_suspend_lock is needed */
 			if (zv->zv_open_count != 1) {
 				rw_exit(&zv->zv_suspend_lock);
@@ -841,7 +909,6 @@ zvol_release(struct gendisk *disk, fmode_t unused)
 	} else {
 		drop_suspend = B_FALSE;
 	}
-	rw_exit(&zvol_state_lock);
 
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
@@ -861,9 +928,10 @@ static int
 zvol_ioctl(struct block_device *bdev, fmode_t mode,
     unsigned int cmd, unsigned long arg)
 {
-	zvol_state_t *zv = bdev->bd_disk->private_data;
 	int error = 0;
 
+	zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+	ASSERT3P(zv, !=, NULL);
 	ASSERT3U(zv->zv_open_count, >, 0);
 
 	switch (cmd) {
@@ -886,16 +954,18 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode,
 
 	case BLKZNAME:
 		mutex_enter(&zv->zv_state_lock);
-		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
+		error = -copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
 		mutex_exit(&zv->zv_state_lock);
+		if (error)
+			error = SET_ERROR(error);
 		break;
 
 	default:
-		error = -ENOTTY;
+		error = SET_ERROR(ENOTTY);
 		break;
 	}
 
-	return (SET_ERROR(error));
+	return (-error);
 }
 
 #ifdef CONFIG_COMPAT
@@ -914,9 +984,8 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing)
 {
 	unsigned int mask = 0;
 
-	rw_enter(&zvol_state_lock, RW_READER);
+	zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
 
-	zvol_state_t *zv = disk->private_data;
 	if (zv != NULL) {
 		mutex_enter(&zv->zv_state_lock);
 		mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
@@ -924,17 +993,14 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing)
 		mutex_exit(&zv->zv_state_lock);
 	}
 
-	rw_exit(&zvol_state_lock);
-
 	return (mask);
 }
 
 static int
 zvol_revalidate_disk(struct gendisk *disk)
 {
-	rw_enter(&zvol_state_lock, RW_READER);
+	zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
 
-	zvol_state_t *zv = disk->private_data;
 	if (zv != NULL) {
 		mutex_enter(&zv->zv_state_lock);
 		set_capacity(zv->zv_zso->zvo_disk,
@@ -942,8 +1008,6 @@ zvol_revalidate_disk(struct gendisk *disk)
 		mutex_exit(&zv->zv_state_lock);
 	}
 
-	rw_exit(&zvol_state_lock);
-
 	return (0);
 }
 
@@ -962,28 +1026,19 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
 	return (0);
 }
 
-void
-zvol_os_clear_private(zvol_state_t *zv)
-{
-	/*
-	 * Cleared while holding zvol_state_lock as a writer
-	 * which will prevent zvol_open() from opening it.
-	 */
-	zv->zv_zso->zvo_disk->private_data = NULL;
-}
-
 /*
  * Provide a simple virtual geometry for legacy compatibility.  For devices
  * smaller than 1 MiB a small head and sector count is used to allow very
  * tiny devices.  For devices over 1 Mib a standard head and sector count
  * is used to keep the cylinders count reasonable.
  */
-static int
-zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static inline int
+zvol_getgeo_impl(struct gendisk *disk, struct hd_geometry *geo)
 {
-	zvol_state_t *zv = bdev->bd_disk->private_data;
+	zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
 	sector_t sectors;
 
+	ASSERT3P(zv, !=, NULL);
 	ASSERT3U(zv->zv_open_count, >, 0);
 
 	sectors = get_capacity(zv->zv_zso->zvo_disk);
@@ -1002,6 +1057,20 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return (0);
 }
 
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK
+static int
+zvol_getgeo(struct gendisk *disk, struct hd_geometry *geo)
+{
+	return (zvol_getgeo_impl(disk, geo));
+}
+#else
+static int
+zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	return (zvol_getgeo_impl(bdev->bd_disk, geo));
+}
+#endif
+
 /*
  * Why have two separate block_device_operations structs?
  *
@@ -1302,27 +1371,30 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
  * Allocate memory for a new zvol_state_t and setup the required
  * request queue and generic disk structures for the block device.
  */
-static zvol_state_t *
-zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
+static int
+zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize,
+    zvol_state_t **zvp)
 {
 	zvol_state_t *zv;
 	struct zvol_state_os *zso;
 	uint64_t volmode;
 	int ret;
 
-	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
-		return (NULL);
+	ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL);
+	if (ret)
+		return (ret);
 
 	if (volmode == ZFS_VOLMODE_DEFAULT)
 		volmode = zvol_volmode;
 
 	if (volmode == ZFS_VOLMODE_NONE)
-		return (NULL);
+		return (0);
 
 	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
 	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
 	zv->zv_zso = zso;
 	zv->zv_volmode = volmode;
+	zv->zv_volsize = volsize;
 	zv->zv_volblocksize = volblocksize;
 
 	list_link_init(&zv->zv_next);
@@ -1396,61 +1468,79 @@ zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
 	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
 	    ZVOL_DEV_NAME, (dev & MINORMASK));
 
-	return (zv);
+	*zvp = zv;
+	return (ret);
 
 out_kmem:
 	kmem_free(zso, sizeof (struct zvol_state_os));
 	kmem_free(zv, sizeof (zvol_state_t));
-	return (NULL);
+	return (ret);
 }
 
-/*
- * Cleanup then free a zvol_state_t which was created by zvol_alloc().
- * At this time, the structure is not opened by anyone, is taken off
- * the zvol_state_list, and has its private data set to NULL.
- * The zvol_state_lock is dropped.
- *
- * This function may take many milliseconds to complete (e.g. we've seen
- * it take over 256ms), due to the calls to "blk_cleanup_queue" and
- * "del_gendisk". Thus, consumers need to be careful to account for this
- * latency when calling this function.
- */
 void
-zvol_os_free(zvol_state_t *zv)
+zvol_os_remove_minor(zvol_state_t *zv)
 {
-
-	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
-	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 	ASSERT0(zv->zv_open_count);
-	ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);
+	ASSERT0(atomic_read(&zv->zv_suspend_ref));
+	ASSERT(zv->zv_flags & ZVOL_REMOVING);
 
-	rw_destroy(&zv->zv_suspend_lock);
-	zfs_rangelock_fini(&zv->zv_rangelock);
+	struct zvol_state_os *zso = zv->zv_zso;
+	zv->zv_zso = NULL;
 
-	del_gendisk(zv->zv_zso->zvo_disk);
+	/* Clearing private_data will make new callers return immediately. */
+	atomic_store_ptr(&zso->zvo_disk->private_data, NULL);
+
+	/*
+	 * Drop the state lock before calling del_gendisk(). There may be
+	 * callers waiting to acquire it, but del_gendisk() will block until
+	 * they exit, which would deadlock.
+	 */
+	mutex_exit(&zv->zv_state_lock);
+
+	del_gendisk(zso->zvo_disk);
 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
 	(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
 #if defined(HAVE_BLK_CLEANUP_DISK)
-	blk_cleanup_disk(zv->zv_zso->zvo_disk);
+	blk_cleanup_disk(zso->zvo_disk);
 #else
-	put_disk(zv->zv_zso->zvo_disk);
+	put_disk(zso->zvo_disk);
 #endif
 #else
-	blk_cleanup_queue(zv->zv_zso->zvo_queue);
-	put_disk(zv->zv_zso->zvo_disk);
+	blk_cleanup_queue(zso->zvo_queue);
+	put_disk(zso->zvo_disk);
 #endif
 
-	if (zv->zv_zso->use_blk_mq)
-		blk_mq_free_tag_set(&zv->zv_zso->tag_set);
+	if (zso->use_blk_mq)
+		blk_mq_free_tag_set(&zso->tag_set);
 
-	ida_simple_remove(&zvol_ida,
-	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
+	ida_free(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);
+
+	kmem_free(zso, sizeof (struct zvol_state_os));
+
+	mutex_enter(&zv->zv_state_lock);
+}
+
+void
+zvol_os_free(zvol_state_t *zv)
+{
+
+	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+	ASSERT0(zv->zv_open_count);
+	ASSERT0P(zv->zv_zso);
+
+	ASSERT0P(zv->zv_objset);
+	ASSERT0P(zv->zv_zilog);
+	ASSERT0P(zv->zv_dn);
+
+	rw_destroy(&zv->zv_suspend_lock);
+	zfs_rangelock_fini(&zv->zv_rangelock);
 
 	cv_destroy(&zv->zv_removing_cv);
 	mutex_destroy(&zv->zv_state_lock);
 	dataset_kstats_destroy(&zv->zv_kstat);
 
-	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
 	kmem_free(zv, sizeof (zvol_state_t));
 }
 
@@ -1470,7 +1560,9 @@ __zvol_os_add_disk(struct gendisk *disk)
 {
 	int error = 0;
 #ifdef HAVE_ADD_DISK_RET
-	error = add_disk(disk);
+	error = -add_disk(disk);
+	if (error)
+		error = SET_ERROR(error);
 #else
 	add_disk(disk);
 #endif
@@ -1562,7 +1654,7 @@ zvol_os_add_disk(struct gendisk *disk)
 int
 zvol_os_create_minor(const char *name)
 {
-	zvol_state_t *zv;
+	zvol_state_t *zv = NULL;
 	objset_t *os;
 	dmu_object_info_t *doi;
 	uint64_t volsize;
@@ -1577,7 +1669,7 @@ zvol_os_create_minor(const char *name)
 	if (zvol_inhibit_dev)
 		return (0);
 
-	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
+	idx = ida_alloc(&zvol_ida, kmem_flags_convert(KM_SLEEP));
 	if (idx < 0)
 		return (SET_ERROR(-idx));
 	minor = idx << ZVOL_MINOR_BITS;
@@ -1585,7 +1677,7 @@ zvol_os_create_minor(const char *name)
 		/* too many partitions can cause an overflow */
 		zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
 		    name, minor, MINOR(minor));
-		ida_simple_remove(&zvol_ida, idx);
+		ida_free(&zvol_ida, idx);
 		return (SET_ERROR(EINVAL));
 	}
 
@@ -1593,7 +1685,7 @@ zvol_os_create_minor(const char *name)
 	if (zv) {
 		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 		mutex_exit(&zv->zv_state_lock);
-		ida_simple_remove(&zvol_ida, idx);
+		ida_free(&zvol_ida, idx);
 		return (SET_ERROR(EEXIST));
 	}
 
@@ -1611,18 +1703,16 @@ zvol_os_create_minor(const char *name)
 	if (error)
 		goto out_dmu_objset_disown;
 
-	zv = zvol_alloc(MKDEV(zvol_major, minor), name,
-	    doi->doi_data_block_size);
-	if (zv == NULL) {
-		error = SET_ERROR(EAGAIN);
+	error = zvol_alloc(MKDEV(zvol_major, minor), name,
+	    volsize, doi->doi_data_block_size, &zv);
+	if (error || zv == NULL)
 		goto out_dmu_objset_disown;
-	}
+
 	zv->zv_hash = hash;
 
 	if (dmu_objset_is_snapshot(os))
 		zv->zv_flags |= ZVOL_RDONLY;
 
-	zv->zv_volsize = volsize;
 	zv->zv_objset = os;
 
 	/* Default */
@@ -1647,11 +1737,11 @@ zvol_os_create_minor(const char *name)
 	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
 #endif
 
-	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+	ASSERT0P(zv->zv_kstat.dk_kstats);
 	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
 	if (error)
 		goto out_dmu_objset_disown;
-	ASSERT3P(zv->zv_zilog, ==, NULL);
+	ASSERT0P(zv->zv_zilog);
 	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
 	if (spa_writeable(dmu_objset_spa(os))) {
 		if (zil_replay_disable)
@@ -1689,19 +1779,19 @@ out_doi:
 	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
 	 * directly as well.
 	 */
-	if (error == 0) {
+	if (error == 0 && zv) {
 		rw_enter(&zvol_state_lock, RW_WRITER);
 		zvol_insert(zv);
 		rw_exit(&zvol_state_lock);
 		error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
 	} else {
-		ida_simple_remove(&zvol_ida, idx);
+		ida_free(&zvol_ida, idx);
 	}
 
 	return (error);
 }
 
-void
+int
 zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
 {
 	int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
@@ -1728,6 +1818,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
 	set_disk_ro(zv->zv_zso->zvo_disk, readonly);
 
 	dataset_kstats_rename(&zv->zv_kstat, newname);
+
+	return (0);
 }
 
 void
@@ -1755,10 +1847,10 @@ zvol_init(void)
 		return (error);
 	}
 
-	error = register_blkdev(zvol_major, ZVOL_DRIVER);
+	error = -register_blkdev(zvol_major, ZVOL_DRIVER);
 	if (error) {
 		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
-		return (error);
+		return (SET_ERROR(error));
 	}
 
 	if (zvol_blk_mq_queue_depth == 0) {