Diffstat (limited to 'sys/contrib/openzfs/module/os')
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c  2
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c  9
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c  7
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c  2
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c  2
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c  1
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c  4
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c  1
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c  23
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c  7
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c  41
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c  16
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c  38
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c  8
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c  6
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c  3
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zfs_racct.c  4
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c  165
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c  806
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c  77
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c  6
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c  584
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c  8
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-generic.c  25
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c  24
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c  22
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c  7
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-proc.c  10
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c  28
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-thread.c  14
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c  2
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/abd_os.c  12
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/policy.c  44
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c  321
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c  6
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c  10
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c  24
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c  26
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_racct.c  8
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c  2
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c  11
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c  258
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c  206
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c  84
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c  67
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c  139
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c  82
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c  23
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c  156
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c  4
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c  432
51 files changed, 2364 insertions, 1503 deletions
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c
index 6d198fad5203..ae6e36d988c2 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c
@@ -160,7 +160,7 @@ kmem_cache_create(const char *name, size_t bufsize, size_t align,
{
kmem_cache_t *cache;
- ASSERT3P(vmp, ==, NULL);
+ ASSERT0P(vmp);
cache = kmem_alloc(sizeof (*cache), KM_SLEEP);
strlcpy(cache->kc_name, name, sizeof (cache->kc_name));
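Throughout this diff, NULL-pointer assertions of the form ASSERT3P(x, ==, NULL) collapse into ASSERT0P(x), and VERIFY3P(x, ==, NULL) into VERIFY0P(x). A minimal sketch of the intended equivalence, assuming the real definitions in the SPL assert headers match this shape (they are not shown in this change):

#define	ASSERT0P(x)	ASSERT3P((x), ==, NULL)	/* assumed shape */
#define	VERIFY0P(x)	VERIFY3P((x), ==, NULL)	/* assumed shape */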
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
index f9125a067cd1..3f360d167b17 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
@@ -101,6 +101,15 @@ spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
va_end(ap);
}
+/*
+ * Check if the current thread is a memory reclaim thread.
+ * Returns true if curproc is pageproc (FreeBSD's page daemon).
+ */
+int
+current_is_reclaim_thread(void)
+{
+ return (curproc == pageproc);
+}
SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY,
opensolaris_utsname_init, NULL);
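The new helper gives OS-independent code a way to detect reclaim context. A minimal usage sketch, assuming a kmem_alloc-style allocator; alloc_maybe_nosleep is hypothetical and not part of this change:

static void *
alloc_maybe_nosleep(size_t len)
{
	/*
	 * The page daemon must not sleep waiting for memory that it is
	 * itself responsible for freeing, so fall back to a non-sleeping
	 * allocation when running in reclaim context.
	 */
	if (current_is_reclaim_thread())
		return (kmem_alloc(len, KM_NOSLEEP));
	return (kmem_alloc(len, KM_SLEEP));
}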
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c
index aad3ef2fad5d..7fc93648c71e 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c
@@ -53,13 +53,6 @@ secpolicy_zfs(cred_t *cr)
}
int
-secpolicy_zfs_proc(cred_t *cr, proc_t *proc)
-{
-
- return (priv_check_cred(cr, PRIV_VFS_MOUNT));
-}
-
-int
secpolicy_sys_config(cred_t *cr, int checkonly __unused)
{
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c
index aee506f28615..a92068aa1cdc 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c
@@ -30,7 +30,7 @@
#include <sys/param.h>
#include <sys/string.h>
#include <sys/kmem.h>
-#include <machine/stdarg.h>
+#include <sys/stdarg.h>
#define IS_DIGIT(c) ((c) >= '0' && (c) <= '9')
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c
index 9da633c2b1be..3c2d39b20c09 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c
@@ -256,7 +256,7 @@ sysevent_worker(void *arg __unused)
* free `ze`, so just inline the free() here -- events have already
* been drained.
*/
- VERIFY3P(ze->ze_zevent, ==, NULL);
+ VERIFY0P(ze->ze_zevent);
kmem_free(ze, sizeof (zfs_zevent_t));
kthread_exit();
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
index 733c2bd07ebb..9d5f025423a1 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
@@ -43,6 +43,7 @@
const int zfs_vm_pagerret_bad = VM_PAGER_BAD;
const int zfs_vm_pagerret_error = VM_PAGER_ERROR;
const int zfs_vm_pagerret_ok = VM_PAGER_OK;
+const int zfs_vm_pagerret_pend = VM_PAGER_PEND;
const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC;
const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL;
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
index fbf67f6a14a8..4bf487cdc469 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
@@ -507,7 +507,7 @@ abd_iter_at_end(struct abd_iter *aiter)
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
- ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0P(aiter->iter_mapaddr);
ASSERT0(aiter->iter_mapsize);
/* There's nothing left to advance to, so do nothing */
@@ -526,7 +526,7 @@ abd_iter_map(struct abd_iter *aiter)
{
void *paddr;
- ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0P(aiter->iter_mapaddr);
ASSERT0(aiter->iter_mapsize);
/* There's nothing left to iterate over, so do nothing */
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c
index a4bf3fb6490f..0fc2697717af 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c
@@ -196,7 +196,6 @@ zfs_crypto_dispatch(freebsd_crypt_session_t *session, struct cryptop *crp)
break;
}
crp->crp_etype = 0;
- crp->crp_flags &= ~CRYPTO_F_DONE;
session->fs_done = false;
}
return (error);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c
index d7c9be70ad4a..26cc7981bfcd 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c
@@ -41,7 +41,6 @@
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_prop.h>
-#include <sys/dmu_zfetch.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
@@ -71,6 +70,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
struct sf_buf *sf;
int numbufs, i;
int err;
+ dmu_flags_t flags = 0;
if (size == 0)
return (0);
@@ -94,10 +94,17 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
- if (tocpy == db->db_size)
+ if (tocpy == db->db_size) {
dmu_buf_will_fill(db, tx, B_FALSE);
- else
- dmu_buf_will_dirty(db, tx);
+ } else {
+ if (i == numbufs - 1 && bufoff + tocpy < db->db_size) {
+ if (bufoff == 0)
+ flags |= DMU_PARTIAL_FIRST;
+ else
+ flags |= DMU_PARTIAL_MORE;
+ }
+ dmu_buf_will_dirty_flags(db, tx, flags);
+ }
for (copied = 0; copied < tocpy; copied += PAGESIZE) {
ASSERT3U(ptoa((*ma)->pindex), ==,
@@ -149,7 +156,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
if (dbp[0]->db_offset != 0 || numbufs > 1) {
for (i = 0; i < numbufs; i++) {
ASSERT(ISP2(dbp[i]->db_size));
- ASSERT3U((dbp[i]->db_offset % dbp[i]->db_size), ==, 0);
+ ASSERT0((dbp[i]->db_offset % dbp[i]->db_size));
ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
}
}
@@ -168,7 +175,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
vm_page_sunbusy(m);
break;
}
- ASSERT3U(m->dirty, ==, 0);
+ ASSERT0(m->dirty);
ASSERT(!pmap_page_is_write_mapped(m));
ASSERT3U(db->db_size, >, PAGE_SIZE);
@@ -194,7 +201,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
if (m != bogus_page) {
vm_page_assert_xbusied(m);
ASSERT(vm_page_none_valid(m));
- ASSERT3U(m->dirty, ==, 0);
+ ASSERT0(m->dirty);
ASSERT(!pmap_page_is_write_mapped(m));
va = zfs_map_page(m, &sf);
}
@@ -288,7 +295,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
vm_page_sunbusy(m);
break;
}
- ASSERT3U(m->dirty, ==, 0);
+ ASSERT0(m->dirty);
ASSERT(!pmap_page_is_write_mapped(m));
ASSERT3U(db->db_size, >, PAGE_SIZE);
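In dmu_write_pages() above, the DMU_PARTIAL_* flags apply only to the last dbuf, and only when the copy stops short of the dbuf's end. Restated as a standalone helper to make the two conditions explicit (partial_flags is hypothetical, not the committed code):

static dmu_flags_t
partial_flags(int i, int numbufs, uint64_t bufoff, uint64_t tocpy,
    uint64_t db_size)
{
	dmu_flags_t flags = 0;

	/* Only the last dbuf can be left partially filled. */
	if (i == numbufs - 1 && bufoff + tocpy < db_size) {
		/* First write into this dbuf, or continuing a partial one? */
		flags |= (bufoff == 0) ? DMU_PARTIAL_FIRST : DMU_PARTIAL_MORE;
	}
	return (flags);
}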
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c
index c114db14a916..b218c0da8125 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c
@@ -112,7 +112,6 @@ static int zfs__fini(void);
static void zfs_shutdown(void *, int);
static eventhandler_tag zfs_shutdown_event_tag;
-static eventhandler_tag zfs_mountroot_event_tag;
#define ZFS_MIN_KSTACK_PAGES 4
@@ -311,9 +310,6 @@ zfs_modevent(module_t mod, int type, void *unused __unused)
zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
shutdown_post_sync, zfs_shutdown, NULL,
SHUTDOWN_PRI_FIRST);
- zfs_mountroot_event_tag = EVENTHANDLER_REGISTER(
- mountroot, spa_boot_init, NULL,
- SI_ORDER_ANY);
}
return (err);
case MOD_UNLOAD:
@@ -322,9 +318,6 @@ zfs_modevent(module_t mod, int type, void *unused __unused)
if (zfs_shutdown_event_tag != NULL)
EVENTHANDLER_DEREGISTER(shutdown_post_sync,
zfs_shutdown_event_tag);
- if (zfs_mountroot_event_tag != NULL)
- EVENTHANDLER_DEREGISTER(mountroot,
- zfs_mountroot_event_tag);
}
return (err);
case MOD_SHUTDOWN:
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
index ace2360c032d..ebc2c0eeb6d2 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
@@ -163,6 +163,13 @@ param_set_arc_int(SYSCTL_HANDLER_ARGS)
return (0);
}
+static void
+warn_deprecated_sysctl(const char *old, const char *new)
+{
+ printf("WARNING: sysctl vfs.zfs.%s is deprecated. Use vfs.zfs.%s instead.\n",
+ old, new);
+}
+
int
param_set_arc_max(SYSCTL_HANDLER_ARGS)
{
@@ -185,12 +192,15 @@ param_set_arc_max(SYSCTL_HANDLER_ARGS)
if (val != 0)
zfs_arc_max = arc_c_max;
+ if (arg2 != 0)
+ warn_deprecated_sysctl("arc_max", "arc.max");
+
return (0);
}
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max,
CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
- NULL, 0, param_set_arc_max, "LU",
+ NULL, 1, param_set_arc_max, "LU",
"Maximum ARC size in bytes (LEGACY)");
int
@@ -214,12 +224,15 @@ param_set_arc_min(SYSCTL_HANDLER_ARGS)
if (val != 0)
zfs_arc_min = arc_c_min;
+ if (arg2 != 0)
+ warn_deprecated_sysctl("arc_min", "arc.min");
+
return (0);
}
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min,
CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
- NULL, 0, param_set_arc_min, "LU",
+ NULL, 1, param_set_arc_min, "LU",
"Minimum ARC size in bytes (LEGACY)");
extern uint_t zfs_arc_free_target;
@@ -242,6 +255,9 @@ param_set_arc_free_target(SYSCTL_HANDLER_ARGS)
zfs_arc_free_target = val;
+ if (arg2 != 0)
+ warn_deprecated_sysctl("arc_free_target", "arc.free_target");
+
return (0);
}
@@ -251,7 +267,7 @@ param_set_arc_free_target(SYSCTL_HANDLER_ARGS)
*/
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
- NULL, 0, param_set_arc_free_target, "IU",
+ NULL, 1, param_set_arc_free_target, "IU",
"Desired number of free pages below which ARC triggers reclaim"
" (LEGACY)");
@@ -270,12 +286,15 @@ param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
arc_no_grow_shift = val;
+ if (arg2 != 0)
+ warn_deprecated_sysctl("arc_no_grow_shift", "arc.no_grow_shift");
+
return (0);
}
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift,
CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
- NULL, 0, param_set_arc_no_grow_shift, "I",
+ NULL, 1, param_set_arc_no_grow_shift, "I",
"log2(fraction of ARC which must be free to allow growing) (LEGACY)");
extern uint64_t l2arc_write_max;
@@ -746,12 +765,15 @@ param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS)
zfs_vdev_min_auto_ashift = val;
+ if (arg2 != 0)
+ warn_deprecated_sysctl("min_auto_ashift",
+ "vdev.min_auto_ashift");
+
return (0);
}
SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
- CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
- &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift),
+ CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1,
param_set_min_auto_ashift, "IU",
"Min ashift used when creating new top-level vdev. (LEGACY)");
@@ -771,12 +793,15 @@ param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS)
zfs_vdev_max_auto_ashift = val;
+ if (arg2 != 0)
+ warn_deprecated_sysctl("max_auto_ashift",
+ "vdev.max_auto_ashift");
+
return (0);
}
SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
- CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
- &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift),
+ CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1,
param_set_max_auto_ashift, "IU",
"Max ashift used when optimizing for logical -> physical sector size on"
" new top-level vdevs. (LEGACY)");
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
index 7acf37ba9cd7..bbd1dafc69be 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
@@ -1236,12 +1236,21 @@ vdev_geom_io_done(zio_t *zio)
struct bio *bp = zio->io_bio;
if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
- ASSERT3P(bp, ==, NULL);
+ ASSERT0P(bp);
return;
}
if (bp == NULL) {
- ASSERT3S(zio->io_error, ==, ENXIO);
+ if (zio_injection_enabled && zio->io_error == EIO)
+ /*
+ * Convert an injected EIO to ENXIO. This is needed to
+ * work around zio_handle_device_injection_impl() not
+ * currently being able to inject ENXIO directly, while
+ * the assertion below only allows ENXIO here.
+ */
+ zio->io_error = SET_ERROR(ENXIO);
+ else
+ ASSERT3S(zio->io_error, ==, ENXIO);
return;
}
@@ -1276,7 +1285,8 @@ vdev_ops_t vdev_disk_ops = {
.vdev_op_fini = NULL,
.vdev_op_open = vdev_geom_open,
.vdev_op_close = vdev_geom_close,
- .vdev_op_asize = vdev_default_asize,
+ .vdev_op_psize_to_asize = vdev_default_asize,
+ .vdev_op_asize_to_psize = vdev_default_psize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_geom_io_start,
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
index 334264f6da2f..cb5787269db2 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
@@ -1175,7 +1175,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
int count = 0;
zfs_acl_phys_t acl_phys;
- if (zp->z_zfsvfs->z_replay == B_FALSE) {
+ if (ZTOV(zp) != NULL && zp->z_zfsvfs->z_replay == B_FALSE) {
ASSERT_VOP_IN_SEQC(ZTOV(zp));
}
@@ -1632,7 +1632,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
if (zfsvfs->z_replay == B_FALSE)
ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
} else
- ASSERT3P(dzp->z_vnode, ==, NULL);
+ ASSERT0P(dzp->z_vnode);
memset(acl_ids, 0, sizeof (zfs_acl_ids_t));
acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
@@ -2014,7 +2014,7 @@ top:
error = zfs_aclset_common(zp, aclp, cr, tx);
ASSERT0(error);
- ASSERT3P(zp->z_acl_cached, ==, NULL);
+ ASSERT0P(zp->z_acl_cached);
zp->z_acl_cached = aclp;
if (fuid_dirtied)
@@ -2357,10 +2357,42 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr,
* In FreeBSD, we don't care about permissions of individual ADS.
* Note that not checking them is not just an optimization - without
* this shortcut, EA operations may bogusly fail with EACCES.
+ *
+ * If this is a named attribute lookup, do the checks.
*/
+#if __FreeBSD_version >= 1500040
+ if ((zp->z_pflags & ZFS_XATTR) && (flags & V_NAMEDATTR) == 0)
+#else
if (zp->z_pflags & ZFS_XATTR)
+#endif
return (0);
+ /*
+ * If this is a named attribute directory, validate against the base file.
+ */
+ if (is_attr) {
+ if ((error = zfs_zget(ZTOZSB(zp),
+ zp->z_xattr_parent, &xzp)) != 0) {
+ return (error);
+ }
+
+ check_zp = xzp;
+
+ /*
+ * fixup mode to map to xattr perms
+ */
+
+ if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+ mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ mode |= ACE_WRITE_NAMED_ATTRS;
+ }
+
+ if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+ mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+ mode |= ACE_READ_NAMED_ATTRS;
+ }
+ }
+
owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
/*
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
index 8d0ff9b25e30..4de48e013ec4 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
@@ -357,7 +357,7 @@ zfsctl_create(zfsvfs_t *zfsvfs)
vnode_t *rvp;
uint64_t crtime[2];
- ASSERT3P(zfsvfs->z_ctldir, ==, NULL);
+ ASSERT0P(zfsvfs->z_ctldir);
snapdir = sfs_alloc_node(sizeof (*snapdir), "snapshot", ZFSCTL_INO_ROOT,
ZFSCTL_INO_SNAPDIR);
@@ -494,7 +494,7 @@ zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
vap->va_uid = 0;
vap->va_gid = 0;
- vap->va_rdev = 0;
+ vap->va_rdev = NODEV;
/*
* We are a purely virtual object, so we have no
* blocksize or allocated blocks.
@@ -688,6 +688,8 @@ zfsctl_root_readdir(struct vop_readdir_args *ap)
* count to return is 0.
*/
if (zfs_uio_offset(&uio) == 3 * sizeof (entry)) {
+ if (eofp != NULL)
+ *eofp = 1;
return (0);
}
@@ -1367,7 +1369,7 @@ zfsctl_snapshot_unmount(const char *snapname, int flags __unused)
int err = getzfsvfs(snapname, &zfsvfs);
if (err != 0) {
- ASSERT3P(zfsvfs, ==, NULL);
+ ASSERT0P(zfsvfs);
return (0);
}
vfsp = zfsvfs->z_vfs;
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c
index a0c9dd178980..75ba2ea0cb9e 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c
@@ -273,7 +273,7 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
ASSERT(zp->z_unlinked);
- ASSERT3U(zp->z_links, ==, 0);
+ ASSERT0(zp->z_links);
VERIFY0(zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
@@ -437,7 +437,7 @@ zfs_rmnode(znode_t *zp)
uint64_t count;
int error;
- ASSERT3U(zp->z_links, ==, 0);
+ ASSERT0(zp->z_links);
if (zfsvfs->z_replay == B_FALSE)
ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
@@ -833,7 +833,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xvpp, cred_t *cr)
if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
&acl_ids, NULL)) != 0)
return (error);
- if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0)) {
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) {
zfs_acl_ids_free(&acl_ids);
return (SET_ERROR(EDQUOT));
}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
index 21e5f7938f9f..ca13569a1235 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
@@ -164,8 +164,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
int
zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
- ssize_t *resid)
+ uint8_t ashift, ssize_t *resid)
{
+ (void) ashift;
return (zfs_file_write_impl(fp, buf, count, &off, resid));
}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_racct.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_racct.c
index bdbbdacd984e..50d1cbf53afc 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_racct.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_racct.c
@@ -28,7 +28,7 @@
#include <sys/racct.h>
void
-zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
curthread->td_ru.ru_inblock += iops;
#ifdef RACCT
@@ -46,7 +46,7 @@ zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
}
void
-zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
curthread->td_ru.ru_oublock += iops;
#ifdef RACCT
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
index 547e109db404..79b784288911 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
@@ -241,35 +241,40 @@ zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
{
int error = 0;
char buf[32];
- uint64_t usedobj, quotaobj;
+ uint64_t usedobj, quotaobj, defaultquota;
uint64_t quota, used = 0;
timespec_t now;
usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
+ defaultquota = isgroup ? zfsvfs->z_defaultgroupquota :
+ zfsvfs->z_defaultuserquota;
+
+ if (zfsvfs->z_replay)
+ return (ENOENT);
- if (quotaobj == 0 || zfsvfs->z_replay) {
- error = ENOENT;
- goto done;
- }
(void) sprintf(buf, "%llx", (longlong_t)id);
- if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
- buf, sizeof (quota), 1, &quota)) != 0) {
- dprintf("%s(%d): quotaobj lookup failed\n",
- __FUNCTION__, __LINE__);
- goto done;
+ if (quotaobj == 0) {
+ if (defaultquota == 0)
+ return (ENOENT);
+ quota = defaultquota;
+ } else {
+ error = zap_lookup(zfsvfs->z_os, quotaobj, buf, sizeof (quota),
+ 1, &quota);
+ if (error && (quota = defaultquota) == 0)
+ return (error);
}
+
/*
* quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit".
* So we set them to be the same.
*/
dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
- if (error && error != ENOENT) {
- dprintf("%s(%d): usedobj failed; %d\n",
- __FUNCTION__, __LINE__, error);
- goto done;
- }
+ if (error == ENOENT)
+ error = 0;
+ if (error)
+ return (error);
dqp->dqb_curblocks = btodb(used);
dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
vfs_timestamp(&now);
@@ -279,7 +284,6 @@ zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
* particularly useful.
*/
dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
-done:
return (error);
}
@@ -451,8 +455,13 @@ zfs_sync(vfs_t *vfsp, int waitfor)
return (0);
}
- if (zfsvfs->z_log != NULL)
- zil_commit(zfsvfs->z_log, 0);
+ if (zfsvfs->z_log != NULL) {
+ error = zil_commit(zfsvfs->z_log, 0);
+ if (error != 0) {
+ zfs_exit(zfsvfs, FTAG);
+ return (error);
+ }
+ }
zfs_exit(zfsvfs, FTAG);
} else {
@@ -861,6 +870,36 @@ zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
zfsvfs->z_xattr_sa = B_TRUE;
}
+ error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTUSERQUOTA,
+ &zfsvfs->z_defaultuserquota);
+ if (error != 0)
+ return (error);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTGROUPQUOTA,
+ &zfsvfs->z_defaultgroupquota);
+ if (error != 0)
+ return (error);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTPROJECTQUOTA,
+ &zfsvfs->z_defaultprojectquota);
+ if (error != 0)
+ return (error);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTUSEROBJQUOTA,
+ &zfsvfs->z_defaultuserobjquota);
+ if (error != 0)
+ return (error);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTGROUPOBJQUOTA,
+ &zfsvfs->z_defaultgroupobjquota);
+ if (error != 0)
+ return (error);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTPROJECTOBJQUOTA,
+ &zfsvfs->z_defaultprojectobjquota);
+ if (error != 0)
+ return (error);
+
error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
&zfsvfs->z_attr_table);
if (error != 0)
@@ -1057,7 +1096,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
if (mounting) {
boolean_t readonly;
- ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
+ ASSERT0P(zfsvfs->z_kstat.dk_kstats);
error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
if (error)
return (error);
@@ -1175,6 +1214,8 @@ zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
}
+extern int zfs_xattr_compat;
+
static int
zfs_domount(vfs_t *vfsp, char *osname)
{
@@ -1255,6 +1296,16 @@ zfs_domount(vfs_t *vfsp, char *osname)
goto out;
}
+#if __FreeBSD_version >= 1500040
+ /*
+ * Named attributes can only work if the xattr property is set to
+ * on/dir and not sa. Also, zfs_xattr_compat must be set.
+ */
+ if ((zfsvfs->z_flags & ZSB_XATTR) != 0 && !zfsvfs->z_xattr_sa &&
+ zfs_xattr_compat)
+ vfsp->mnt_flag |= MNT_NAMEDATTR;
+#endif
+
vfs_mountedfrom(vfsp, osname);
if (!zfsvfs->z_issnap)
@@ -1778,6 +1829,14 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
err = vn_lock(*vpp, flags);
if (err != 0)
vrele(*vpp);
+#if __FreeBSD_version >= 1500040
+ else if ((zp->z_pflags & ZFS_XATTR) != 0) {
+ if ((*vpp)->v_type == VDIR)
+ vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
+ else
+ vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR);
+ }
+#endif
}
if (err != 0)
*vpp = NULL;
@@ -1930,9 +1989,17 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
*vpp = ZTOV(zp);
zfs_exit(zfsvfs, FTAG);
err = vn_lock(*vpp, flags);
- if (err == 0)
+ if (err == 0) {
vnode_create_vobject(*vpp, zp->z_size, curthread);
- else
+#if __FreeBSD_version >= 1500040
+ if ((zp->z_pflags & ZFS_XATTR) != 0) {
+ if ((*vpp)->v_type == VDIR)
+ vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
+ else
+ vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR);
+ }
+#endif
+ } else
*vpp = NULL;
return (err);
}
@@ -2241,6 +2308,62 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
return (0);
}
+int
+zfs_set_default_quota(zfsvfs_t *zfsvfs, zfs_prop_t prop, uint64_t quota)
+{
+ int error;
+ objset_t *os = zfsvfs->z_os;
+ const char *propstr = zfs_prop_to_name(prop);
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, propstr);
+ error = dmu_tx_assign(tx, DMU_TX_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+ if (quota == 0) {
+ error = zap_remove(os, MASTER_NODE_OBJ, propstr, tx);
+ if (error == ENOENT)
+ error = 0;
+ } else {
+ error = zap_update(os, MASTER_NODE_OBJ, propstr, 8, 1,
+ &quota, tx);
+ }
+
+ if (error)
+ goto out;
+
+ switch (prop) {
+ case ZFS_PROP_DEFAULTUSERQUOTA:
+ zfsvfs->z_defaultuserquota = quota;
+ break;
+ case ZFS_PROP_DEFAULTGROUPQUOTA:
+ zfsvfs->z_defaultgroupquota = quota;
+ break;
+ case ZFS_PROP_DEFAULTPROJECTQUOTA:
+ zfsvfs->z_defaultprojectquota = quota;
+ break;
+ case ZFS_PROP_DEFAULTUSEROBJQUOTA:
+ zfsvfs->z_defaultuserobjquota = quota;
+ break;
+ case ZFS_PROP_DEFAULTGROUPOBJQUOTA:
+ zfsvfs->z_defaultgroupobjquota = quota;
+ break;
+ case ZFS_PROP_DEFAULTPROJECTOBJQUOTA:
+ zfsvfs->z_defaultprojectobjquota = quota;
+ break;
+ default:
+ break;
+ }
+
+out:
+ dmu_tx_commit(tx);
+ return (error);
+}
+
/*
* Return true if the corresponding vfs's unmounted flag is set.
* Otherwise return false.
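With the default*quota properties loaded in zfsvfs_init(), zfs_getquota() resolves an id's limit in two steps: an explicit per-id entry in the quota ZAP wins, and the dataset-wide default is the fallback. The same rule as a standalone helper (resolve_quota is hypothetical; 0 means "no quota" in both positions):

static uint64_t
resolve_quota(uint64_t per_id_quota, uint64_t default_quota)
{
	/* An explicit per-user/group ZAP entry always takes precedence. */
	if (per_id_quota != 0)
		return (per_id_quota);
	/* Fall back to the dataset default, which may itself be 0 (none). */
	return (default_quota);
}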
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
index e468377eb44f..411225786089 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -25,6 +25,7 @@
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2025, Klara, Inc.
*/
/* Portions Copyright 2007 Jeremy Teo */
@@ -60,6 +61,7 @@
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
@@ -74,6 +76,7 @@
#include <sys/zfs_quota.h>
#include <sys/zfs_sa.h>
#include <sys/zfs_rlock.h>
+#include <sys/zfs_project.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/sched.h>
@@ -113,6 +116,8 @@ typedef uint64_t cookie_t;
typedef ulong_t cookie_t;
#endif
+static int zfs_check_attrname(const char *name);
+
/*
* Programming rules.
*
@@ -267,6 +272,71 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
}
static int
+zfs_ioctl_getxattr(vnode_t *vp, zfsxattr_t *fsx)
+{
+ znode_t *zp = VTOZ(vp);
+
+ memset(fsx, 0, sizeof (*fsx));
+ fsx->fsx_xflags = (zp->z_pflags & ZFS_PROJINHERIT) ?
+ ZFS_PROJINHERIT_FL : 0;
+ fsx->fsx_projid = zp->z_projid;
+
+ return (0);
+}
+
+static int
+zfs_ioctl_setflags(vnode_t *vp, uint32_t ioctl_flags, xvattr_t *xva)
+{
+ uint64_t zfs_flags = VTOZ(vp)->z_pflags;
+ xoptattr_t *xoap;
+
+ if (ioctl_flags & ~(ZFS_PROJINHERIT_FL))
+ return (SET_ERROR(EOPNOTSUPP));
+
+ xva_init(xva);
+ xoap = xva_getxoptattr(xva);
+
+#define FLAG_CHANGE(iflag, zflag, xflag, xfield) do { \
+ if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) || \
+ ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) { \
+ XVA_SET_REQ(xva, (xflag)); \
+ (xfield) = ((ioctl_flags & (iflag)) != 0); \
+ } \
+} while (0)
+
+ FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT,
+ xoap->xoa_projinherit);
+
+#undef FLAG_CHANGE
+
+ return (0);
+}
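The FLAG_CHANGE macro above is a disagreement test between the ioctl flag word and the znode's pflags: an attribute update is requested only when exactly one side has the bit set. Written out without the macro plumbing (an illustrative equivalent, not the committed code):

static bool
projinherit_differs(uint32_t ioctl_flags, uint64_t zfs_flags)
{
	/* True when exactly one of the two views has the bit set. */
	return (((ioctl_flags & ZFS_PROJINHERIT_FL) != 0) !=
	    ((zfs_flags & ZFS_PROJINHERIT) != 0));
}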
+
+static int
+zfs_ioctl_setxattr(vnode_t *vp, zfsxattr_t *fsx, cred_t *cr)
+{
+ znode_t *zp = VTOZ(vp);
+ xvattr_t xva;
+ xoptattr_t *xoap;
+ int err;
+
+ if (!zpl_is_valid_projid(fsx->fsx_projid))
+ return (SET_ERROR(EINVAL));
+
+ err = zfs_ioctl_setflags(vp, fsx->fsx_xflags, &xva);
+ if (err)
+ return (err);
+
+ xoap = xva_getxoptattr(&xva);
+ XVA_SET_REQ(&xva, XAT_PROJID);
+ xoap->xoa_projid = fsx->fsx_projid;
+
+ err = zfs_setattr(zp, (vattr_t *)&xva, 0, cr, NULL);
+
+ return (err);
+}
+
+static int
zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
int *rvalp)
{
@@ -305,6 +375,38 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
*(offset_t *)data = off;
return (0);
}
+ case ZFS_IOC_FSGETXATTR: {
+ zfsxattr_t *fsx = (zfsxattr_t *)data;
+ error = vn_lock(vp, LK_SHARED);
+ if (error)
+ return (error);
+ error = zfs_ioctl_getxattr(vp, fsx);
+ VOP_UNLOCK(vp);
+ return (error);
+ }
+ case ZFS_IOC_FSSETXATTR: {
+ zfsxattr_t *fsx = (zfsxattr_t *)data;
+ error = vn_lock(vp, LK_EXCLUSIVE);
+ if (error)
+ return (error);
+ vn_seqc_write_begin(vp);
+ error = zfs_ioctl_setxattr(vp, fsx, cred);
+ vn_seqc_write_end(vp);
+ VOP_UNLOCK(vp);
+ return (error);
+ }
+ case ZFS_IOC_REWRITE: {
+ zfs_rewrite_args_t *args = (zfs_rewrite_args_t *)data;
+ if ((flag & FWRITE) == 0)
+ return (SET_ERROR(EBADF));
+ error = vn_lock(vp, LK_SHARED);
+ if (error)
+ return (error);
+ error = zfs_rewrite(VTOZ(vp), args->off, args->len,
+ args->flags, args->arg);
+ VOP_UNLOCK(vp);
+ return (error);
+ }
}
return (SET_ERROR(ENOTTY));
}
@@ -518,7 +620,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
page_unhold(pp);
} else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
- uio, bytes);
+ uio, bytes, DMU_READ_PREFETCH);
}
len -= bytes;
off = 0;
@@ -717,7 +819,12 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp,
/*
* Do we have permission to get into attribute directory?
*/
- error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr, NULL);
+ if (flags & LOOKUP_NAMED_ATTR)
+ error = zfs_zaccess(zp, ACE_EXECUTE, V_NAMEDATTR,
+ B_FALSE, cr, NULL);
+ else
+ error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr,
+ NULL);
if (error) {
vrele(ZTOV(zp));
}
@@ -997,7 +1104,7 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode,
zfs_exit(zfsvfs, FTAG);
return (error);
}
- ASSERT3P(zp, ==, NULL);
+ ASSERT0P(zp);
/*
* Create a new file object and update the directory
@@ -1089,8 +1196,8 @@ out:
*zpp = zp;
}
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -1219,9 +1326,8 @@ out:
if (xzp)
vrele(ZTOV(xzp));
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -1378,7 +1484,7 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
zfs_exit(zfsvfs, FTAG);
return (error);
}
- ASSERT3P(zp, ==, NULL);
+ ASSERT0P(zp);
if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
mnt_ns))) {
@@ -1452,8 +1558,8 @@ out:
getnewvnode_drop_reserve();
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -1533,8 +1639,8 @@ zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
if (zfsvfs->z_use_namecache)
cache_vop_rmdir(dvp, vp);
out:
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -1632,7 +1738,7 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
/*
* Quit if directory has been removed (posix)
*/
- if ((*eofp = zp->z_unlinked) != 0) {
+ if ((*eofp = (zp->z_unlinked != 0)) != 0) {
zfs_exit(zfsvfs, FTAG);
return (0);
}
@@ -1910,7 +2016,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
if (vp->v_type == VBLK || vp->v_type == VCHR)
vap->va_rdev = zfs_cmpldev(rdev);
else
- vap->va_rdev = 0;
+ vap->va_rdev = NODEV;
vap->va_gen = zp->z_gen;
vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
vap->va_filerev = zp->z_seq;
@@ -2046,6 +2152,134 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
}
/*
+ * For the operation of changing a file's user/group/project, we need to
+ * handle not only the main object that is assigned to the file directly,
+ * but also the ones that are used by the file via the hidden xattr directory.
+ *
+ * Because the xattr directory may contain many EA entries, it may be
+ * impossible to change all of them within the single transaction that
+ * changes the main object's user/group/project attributes. We therefore
+ * change them via multiple independent transactions, one by one. This is
+ * not an ideal solution, but we have no better idea yet.
+ */
+static int
+zfs_setattr_dir(znode_t *dzp)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
+ zap_cursor_t zc;
+ zap_attribute_t *zap;
+ znode_t *zp = NULL;
+ dmu_tx_t *tx = NULL;
+ uint64_t uid, gid;
+ sa_bulk_attr_t bulk[4];
+ int count;
+ int err;
+
+ zap = zap_attribute_alloc();
+ zap_cursor_init(&zc, os, dzp->z_id);
+ while ((err = zap_cursor_retrieve(&zc, zap)) == 0) {
+ count = 0;
+ if (zap->za_integer_length != 8 || zap->za_num_integers != 1) {
+ err = ENXIO;
+ break;
+ }
+
+ err = zfs_dirent_lookup(dzp, zap->za_name, &zp, ZEXISTS);
+ if (err == ENOENT)
+ goto next;
+ if (err)
+ break;
+
+ if (zp->z_uid == dzp->z_uid &&
+ zp->z_gid == dzp->z_gid &&
+ zp->z_projid == dzp->z_projid)
+ goto next;
+
+ tx = dmu_tx_create(os);
+ if (!(zp->z_pflags & ZFS_PROJID))
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ else
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+
+ err = dmu_tx_assign(tx, DMU_TX_WAIT);
+ if (err)
+ break;
+
+ vn_seqc_write_begin(ZTOV(zp));
+ mutex_enter(&dzp->z_lock);
+
+ if (zp->z_uid != dzp->z_uid) {
+ uid = dzp->z_uid;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &uid, sizeof (uid));
+ zp->z_uid = uid;
+ }
+
+ if (zp->z_gid != dzp->z_gid) {
+ gid = dzp->z_gid;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &gid, sizeof (gid));
+ zp->z_gid = gid;
+ }
+
+ uint64_t projid = dzp->z_projid;
+ if (zp->z_projid != projid) {
+ if (!(zp->z_pflags & ZFS_PROJID)) {
+ err = sa_add_projid(zp->z_sa_hdl, tx, projid);
+ if (unlikely(err == EEXIST)) {
+ err = 0;
+ } else if (err != 0) {
+ goto sa_add_projid_err;
+ } else {
+ projid = ZFS_INVALID_PROJID;
+ }
+ }
+
+ if (projid != ZFS_INVALID_PROJID) {
+ zp->z_projid = projid;
+ SA_ADD_BULK_ATTR(bulk, count,
+ SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
+ sizeof (zp->z_projid));
+ }
+ }
+
+sa_add_projid_err:
+ mutex_exit(&dzp->z_lock);
+
+ if (likely(count > 0)) {
+ err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ dmu_tx_commit(tx);
+ } else if (projid == ZFS_INVALID_PROJID) {
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+ tx = NULL;
+ vn_seqc_write_end(ZTOV(zp));
+ if (err != 0 && err != ENOENT)
+ break;
+
+next:
+ if (zp) {
+ zrele(zp);
+ zp = NULL;
+ }
+ zap_cursor_advance(&zc);
+ }
+
+ if (tx)
+ dmu_tx_abort(tx);
+ if (zp) {
+ zrele(zp);
+ }
+ zap_cursor_fini(&zc);
+ zap_attribute_free(zap);
+
+ return (err == ENOENT ? 0 : err);
+}
+
+/*
* Set the file attributes to the values contained in the
* vattr structure.
*
@@ -2090,6 +2324,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
zfs_acl_t *aclp;
boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
boolean_t fuid_dirtied = B_FALSE;
+ boolean_t handle_eadir = B_FALSE;
sa_bulk_attr_t bulk[7], xattr_bulk[7];
int count = 0, xattr_count = 0;
@@ -2441,6 +2676,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
mask = vap->va_mask;
if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) {
+ handle_eadir = B_TRUE;
err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
&xattr_obj, sizeof (xattr_obj));
@@ -2770,11 +3006,15 @@ out:
} else {
err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
dmu_tx_commit(tx);
+ if (attrzp) {
+ if (err2 == 0 && handle_eadir)
+ err = zfs_setattr_dir(attrzp);
+ }
}
out2:
- if (os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (err == 0 && os->os_sync == ZFS_SYNC_ALWAYS)
+ err = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (err);
@@ -3303,7 +3543,7 @@ out_seq:
out:
if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ error = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -3426,8 +3666,7 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
return (error);
}
- if (zfs_acl_ids_overquota(zfsvfs, &acl_ids,
- 0 /* projid */)) {
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
zfs_acl_ids_free(&acl_ids);
zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EDQUOT));
@@ -3496,7 +3735,7 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
*zpp = zp;
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ error = zil_commit(zilog, 0);
}
zfs_exit(zfsvfs, FTAG);
@@ -3686,8 +3925,8 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr,
vnevent_link(ZTOV(szp), ct);
}
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -4072,6 +4311,43 @@ zfs_freebsd_getpages(struct vop_getpages_args *ap)
ap->a_rahead));
}
+typedef struct {
+ uint_t pca_npages;
+ vm_page_t pca_pages[];
+} putpage_commit_arg_t;
+
+static void
+zfs_putpage_commit_cb(void *arg, int err)
+{
+ putpage_commit_arg_t *pca = arg;
+ vm_object_t object = pca->pca_pages[0]->object;
+
+ zfs_vmobject_wlock(object);
+
+ for (uint_t i = 0; i < pca->pca_npages; i++) {
+ vm_page_t pp = pca->pca_pages[i];
+
+ if (err == 0) {
+ /*
+ * Writeback succeeded, so undirty the page. If it
+ * failed, we leave the page in the state it was in;
+ * that's most likely dirty, so it will get retried
+ * some other time.
+ */
+ vm_page_undirty(pp);
+ }
+
+ vm_page_sunbusy(pp);
+ }
+
+ vm_object_pip_wakeupn(object, pca->pca_npages);
+
+ zfs_vmobject_wunlock(object);
+
+ kmem_free(pca,
+ offsetof(putpage_commit_arg_t, pca_pages[pca->pca_npages]));
+}
+
static int
zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
int *rtvals)
@@ -4173,10 +4449,12 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
}
if (zp->z_blksz < PAGE_SIZE) {
- for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
- tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
+ vm_ooffset_t woff = off;
+ size_t wlen = len;
+ for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) {
+ tocopy = MIN(PAGE_SIZE, wlen);
va = zfs_map_page(ma[i], &sf);
- dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
+ dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va, tx);
zfs_unmap_page(sf);
}
} else {
@@ -4197,19 +4475,48 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
ASSERT0(err);
- /*
- * XXX we should be passing a callback to undirty
- * but that would make the locking messier
- */
- zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
- len, commit, B_FALSE, NULL, NULL);
- zfs_vmobject_wlock(object);
- for (i = 0; i < ncount; i++) {
- rtvals[i] = zfs_vm_pagerret_ok;
- vm_page_undirty(ma[i]);
+ if (commit) {
+ /*
+ * Caller requested that we commit immediately. We set
+ * a callback on the log entry, to be called once its
+ * on disk after the call to zil_commit() below. The
+ * pages will be undirtied and unbusied there.
+ */
+ putpage_commit_arg_t *pca = kmem_alloc(
+ offsetof(putpage_commit_arg_t, pca_pages[ncount]),
+ KM_SLEEP);
+ pca->pca_npages = ncount;
+ memcpy(pca->pca_pages, ma, sizeof (vm_page_t) * ncount);
+
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
+ B_TRUE, B_FALSE, zfs_putpage_commit_cb, pca);
+
+ for (i = 0; i < ncount; i++)
+ rtvals[i] = zfs_vm_pagerret_pend;
+ } else {
+ /*
+ * Caller just wants the page written back somewhere,
+ * but doesn't need it committed yet. We've already
+ * written it back to the DMU, so we just need to put
+ * it on the async log, then undirty the page and
+ * return.
+ *
+ * We cannot use a callback here, because it would keep
+ * the page busy (locked) until it is eventually
+ * written down at txg sync.
+ */
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
+ B_FALSE, B_FALSE, NULL, NULL);
+
+ zfs_vmobject_wlock(object);
+ for (i = 0; i < ncount; i++) {
+ rtvals[i] = zfs_vm_pagerret_ok;
+ vm_page_undirty(ma[i]);
+ }
+ zfs_vmobject_wunlock(object);
}
- zfs_vmobject_wunlock(object);
+
VM_CNT_INC(v_vnodeout);
VM_CNT_ADD(v_vnodepgsout, ncount);
}
@@ -4217,8 +4524,13 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
out:
zfs_rangelock_exit(lr);
- if (commit)
- zil_commit(zfsvfs->z_log, zp->z_id);
+ if (commit) {
+ err = zil_commit(zfsvfs->z_log, zp->z_id);
+ if (err != 0) {
+ zfs_exit(zfsvfs, FTAG);
+ return (err);
+ }
+ }
dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, len);
@@ -4480,8 +4792,16 @@ zfs_freebsd_access(struct vop_access_args *ap)
* ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
*/
accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
- if (accmode != 0)
- error = zfs_access(zp, accmode, 0, ap->a_cred);
+ if (accmode != 0) {
+#if __FreeBSD_version >= 1500040
+ /* For named attributes, do the checks. */
+ if ((vn_irflag_read(vp) & VIRF_NAMEDATTR) != 0)
+ error = zfs_access(zp, accmode, V_NAMEDATTR,
+ ap->a_cred);
+ else
+#endif
+ error = zfs_access(zp, accmode, 0, ap->a_cred);
+ }
/*
* VADMIN has to be handled by vaccess().
@@ -4514,6 +4834,190 @@ struct vop_lookup_args {
};
#endif
+#if __FreeBSD_version >= 1500040
+static int
+zfs_lookup_nameddir(struct vnode *dvp, struct componentname *cnp,
+ struct vnode **vpp)
+{
+ struct vnode *xvp;
+ int error, flags;
+
+ *vpp = NULL;
+ flags = LOOKUP_XATTR | LOOKUP_NAMED_ATTR;
+ if ((cnp->cn_flags & CREATENAMED) != 0)
+ flags |= CREATE_XATTR_DIR;
+ error = zfs_lookup(dvp, NULL, &xvp, NULL, 0, cnp->cn_cred, flags,
+ B_FALSE);
+ if (error == 0) {
+ if ((cnp->cn_flags & LOCKLEAF) != 0)
+ error = vn_lock(xvp, cnp->cn_lkflags);
+ if (error == 0) {
+ vn_irflag_set_cond(xvp, VIRF_NAMEDDIR);
+ *vpp = xvp;
+ } else {
+ vrele(xvp);
+ }
+ }
+ return (error);
+}
+
+static ssize_t
+zfs_readdir_named(struct vnode *vp, char *buf, ssize_t blen, off_t *offp,
+ int *eofflagp, struct ucred *cred, struct thread *td)
+{
+ struct uio io;
+ struct iovec iv;
+ zfs_uio_t uio;
+ int error;
+
+ io.uio_offset = *offp;
+ io.uio_segflg = UIO_SYSSPACE;
+ io.uio_rw = UIO_READ;
+ io.uio_td = td;
+ iv.iov_base = buf;
+ iv.iov_len = blen;
+ io.uio_iov = &iv;
+ io.uio_iovcnt = 1;
+ io.uio_resid = blen;
+ zfs_uio_init(&uio, &io);
+ error = zfs_readdir(vp, &uio, cred, eofflagp, NULL, NULL);
+ if (error != 0)
+ return (-1);
+ *offp = io.uio_offset;
+ return (blen - io.uio_resid);
+}
+
+static bool
+zfs_has_namedattr(struct vnode *vp, struct ucred *cred)
+{
+ struct componentname cn;
+ struct vnode *xvp;
+ struct dirent *dp;
+ off_t offs;
+ ssize_t rsize;
+ char *buf, *cp, *endcp;
+ int eofflag, error;
+ bool ret;
+
+ MNT_ILOCK(vp->v_mount);
+ if ((vp->v_mount->mnt_flag & MNT_NAMEDATTR) == 0) {
+ MNT_IUNLOCK(vp->v_mount);
+ return (false);
+ }
+ MNT_IUNLOCK(vp->v_mount);
+
+ /* Now see if a named attribute directory exists. */
+ cn.cn_flags = LOCKLEAF;
+ cn.cn_lkflags = LK_SHARED;
+ cn.cn_cred = cred;
+ error = zfs_lookup_nameddir(vp, &cn, &xvp);
+ if (error != 0)
+ return (false);
+
+ /* It exists, so see if there is any entry other than "." and "..". */
+ buf = malloc(DEV_BSIZE, M_TEMP, M_WAITOK);
+ ret = false;
+ offs = 0;
+ do {
+ rsize = zfs_readdir_named(xvp, buf, DEV_BSIZE, &offs, &eofflag,
+ cred, curthread);
+ if (rsize <= 0)
+ break;
+ cp = buf;
+ endcp = &buf[rsize];
+ while (cp < endcp) {
+ dp = (struct dirent *)cp;
+ if (dp->d_fileno != 0 && (dp->d_type == DT_REG ||
+ dp->d_type == DT_UNKNOWN) &&
+ !ZFS_XA_NS_PREFIX_FORBIDDEN(dp->d_name) &&
+ ((dp->d_namlen == 1 && dp->d_name[0] != '.') ||
+ (dp->d_namlen == 2 && (dp->d_name[0] != '.' ||
+ dp->d_name[1] != '.')) || dp->d_namlen > 2)) {
+ ret = true;
+ break;
+ }
+ cp += dp->d_reclen;
+ }
+ } while (!ret && rsize > 0 && eofflag == 0);
+ vput(xvp);
+ free(buf, M_TEMP);
+ return (ret);
+}
+
+static int
+zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
+{
+ struct componentname *cnp = ap->a_cnp;
+ char nm[NAME_MAX + 1];
+ int error;
+ struct vnode **vpp = ap->a_vpp, *dvp = ap->a_dvp, *xvp;
+ bool is_nameddir, needs_nameddir, opennamed = false;
+
+ /*
+ * These variables are used to handle the named attribute cases:
+ * opennamed - Is true when this is a call from open with O_NAMEDATTR
+ * specified and it is the last component.
+ * is_nameddir - Is true when the directory is a named attribute dir.
+ * needs_nameddir - Is set when the lookup needs to look for/create
+ * a named attribute directory. It is only set when is_nameddir is
+ * false and opennamed is true.
+ * xvp - Is the directory that the lookup needs to be done in.
+ * Usually dvp, unless needs_nameddir is true where it is the
+ * result of the first non-named directory lookup.
+ * Note that name caching must be disabled for named attribute
+ * handling.
+ */
+ needs_nameddir = false;
+ xvp = dvp;
+ opennamed = (cnp->cn_flags & (OPENNAMED | ISLASTCN)) ==
+ (OPENNAMED | ISLASTCN);
+ is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0;
+ if (is_nameddir && (cnp->cn_flags & ISLASTCN) == 0)
+ return (ENOATTR);
+ if (opennamed && !is_nameddir && (cnp->cn_flags & ISDOTDOT) != 0)
+ return (ENOATTR);
+ if (opennamed || is_nameddir)
+ cnp->cn_flags &= ~MAKEENTRY;
+ if (opennamed && !is_nameddir)
+ needs_nameddir = true;
+ ASSERT3U(cnp->cn_namelen, <, sizeof (nm));
+ error = 0;
+ *vpp = NULL;
+ if (needs_nameddir) {
+ if (VOP_ISLOCKED(dvp) != LK_EXCLUSIVE)
+ vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+ error = zfs_lookup_nameddir(dvp, cnp, &xvp);
+ if (error == 0)
+ is_nameddir = true;
+ }
+ if (error == 0) {
+ if (!needs_nameddir || cnp->cn_namelen != 1 ||
+ *cnp->cn_nameptr != '.') {
+ strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1,
+ sizeof (nm)));
+ error = zfs_lookup(xvp, nm, vpp, cnp, cnp->cn_nameiop,
+ cnp->cn_cred, 0, cached);
+ if (is_nameddir && error == 0 &&
+ (cnp->cn_namelen != 1 || *cnp->cn_nameptr != '.') &&
+ (cnp->cn_flags & ISDOTDOT) == 0) {
+ if ((*vpp)->v_type == VDIR)
+ vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
+ else
+ vn_irflag_set_cond(*vpp,
+ VIRF_NAMEDATTR);
+ }
+ if (needs_nameddir && xvp != *vpp)
+ vput(xvp);
+ } else {
+ /*
+ * Lookup of "." when a named attribute dir is needed.
+ */
+ *vpp = xvp;
+ }
+ }
+ return (error);
+}
+#else
static int
zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
{
@@ -4526,6 +5030,7 @@ zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
cnp->cn_cred, 0, cached));
}
+#endif
static int
zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
@@ -4548,7 +5053,11 @@ zfs_cache_lookup(struct vop_lookup_args *ap)
zfsvfs_t *zfsvfs;
zfsvfs = ap->a_dvp->v_mount->mnt_data;
+#if __FreeBSD_version >= 1500040
+ if (zfsvfs->z_use_namecache && (ap->a_cnp->cn_flags & OPENNAMED) == 0)
+#else
if (zfsvfs->z_use_namecache)
+#endif
return (vfs_cache_lookup(ap));
else
return (zfs_freebsd_lookup(ap, B_FALSE));
@@ -4571,6 +5080,11 @@ zfs_freebsd_create(struct vop_create_args *ap)
vattr_t *vap = ap->a_vap;
znode_t *zp = NULL;
int rc, mode;
+ struct vnode *dvp = ap->a_dvp;
+#if __FreeBSD_version >= 1500040
+ struct vnode *xvp;
+ bool is_nameddir;
+#endif
#if __FreeBSD_version < 1400068
ASSERT(cnp->cn_flags & SAVENAME);
@@ -4581,10 +5095,36 @@ zfs_freebsd_create(struct vop_create_args *ap)
zfsvfs = ap->a_dvp->v_mount->mnt_data;
*ap->a_vpp = NULL;
- rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, 0, mode,
- &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL);
+ rc = 0;
+#if __FreeBSD_version >= 1500040
+ xvp = NULL;
+ is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0;
+ if (!is_nameddir && (cnp->cn_flags & OPENNAMED) != 0) {
+ /* Needs a named attribute directory. */
+ rc = zfs_lookup_nameddir(dvp, cnp, &xvp);
+ if (rc == 0) {
+ dvp = xvp;
+ is_nameddir = true;
+ }
+ }
+ if (is_nameddir && rc == 0)
+ rc = zfs_check_attrname(cnp->cn_nameptr);
+#endif
+
if (rc == 0)
+ rc = zfs_create(VTOZ(dvp), cnp->cn_nameptr, vap, 0, mode,
+ &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL);
+#if __FreeBSD_version >= 1500040
+ if (xvp != NULL)
+ vput(xvp);
+#endif
+ if (rc == 0) {
*ap->a_vpp = ZTOV(zp);
+#if __FreeBSD_version >= 1500040
+ if (is_nameddir)
+ vn_irflag_set_cond(*ap->a_vpp, VIRF_NAMEDATTR);
+#endif
+ }
if (zfsvfs->z_use_namecache &&
rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
@@ -4603,13 +5143,21 @@ struct vop_remove_args {
static int
zfs_freebsd_remove(struct vop_remove_args *ap)
{
+ int error = 0;
#if __FreeBSD_version < 1400068
ASSERT(ap->a_cnp->cn_flags & SAVENAME);
#endif
- return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
- ap->a_cnp->cn_cred));
+#if __FreeBSD_version >= 1500040
+ if ((vn_irflag_read(ap->a_dvp) & VIRF_NAMEDDIR) != 0)
+ error = zfs_check_attrname(ap->a_cnp->cn_nameptr);
+#endif
+
+ if (error == 0)
+ error = zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
+ ap->a_cnp->cn_cred);
+ return (error);
}
#ifndef _SYS_SYSPROTO_H_
@@ -4694,8 +5242,32 @@ struct vop_fsync_args {
static int
zfs_freebsd_fsync(struct vop_fsync_args *ap)
{
+ vnode_t *vp = ap->a_vp;
+ int err = 0;
- return (zfs_fsync(VTOZ(ap->a_vp), 0, ap->a_td->td_ucred));
+ /*
+ * Push any dirty mmap()'d data out to the DMU and ZIL, ready for
+ * zil_commit() to be called in zfs_fsync().
+ */
+ if (vm_object_mightbedirty(vp->v_object)) {
+ zfs_vmobject_wlock(vp->v_object);
+ if (!vm_object_page_clean(vp->v_object, 0, 0, 0))
+ err = SET_ERROR(EIO);
+ zfs_vmobject_wunlock(vp->v_object);
+ if (err) {
+ /*
+ * Unclear what state things are in. zfs_putpages()
+ * will ensure the pages remain dirty if they haven't
+ * been written down to the DMU, but because there may
+ * be nothing logged, we can't assume that zfs_sync()
+ * -> zil_commit() will give us a useful error. It's
+ * safest if we just error out here.
+ */
+ return (err);
+ }
+ }
+
+ return (zfs_fsync(VTOZ(vp), 0, ap->a_td->td_ucred));
}
#ifndef _SYS_SYSPROTO_H_
@@ -4767,6 +5339,11 @@ zfs_freebsd_getattr(struct vop_getattr_args *ap)
#undef FLAG_CHECK
*vap = xvap.xva_vattr;
vap->va_flags = fflags;
+
+#if __FreeBSD_version >= 1500040
+ if ((vn_irflag_read(ap->a_vp) & (VIRF_NAMEDDIR | VIRF_NAMEDATTR)) != 0)
+ vap->va_bsdflags |= SFBSD_NAMEDATTR;
+#endif
return (0);
}
@@ -4909,15 +5486,24 @@ zfs_freebsd_rename(struct vop_rename_args *ap)
vnode_t *fvp = ap->a_fvp;
vnode_t *tdvp = ap->a_tdvp;
vnode_t *tvp = ap->a_tvp;
- int error;
+ int error = 0;
#if __FreeBSD_version < 1400068
ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
#endif
- error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
- ap->a_tcnp, ap->a_fcnp->cn_cred);
+#if __FreeBSD_version >= 1500040
+ if ((vn_irflag_read(fdvp) & VIRF_NAMEDDIR) != 0) {
+ error = zfs_check_attrname(ap->a_fcnp->cn_nameptr);
+ if (error == 0)
+ error = zfs_check_attrname(ap->a_tcnp->cn_nameptr);
+ }
+#endif
+
+ if (error == 0)
+ error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
+ ap->a_tcnp, ap->a_fcnp->cn_cred);
vrele(fdvp);
vrele(fvp);
@@ -5146,6 +5732,9 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
{
ulong_t val;
int error;
+#ifdef _PC_CLONE_BLKSIZE
+ zfsvfs_t *zfsvfs;
+#endif
error = zfs_pathconf(ap->a_vp, ap->a_name, &val,
curthread->td_ucred, NULL);
@@ -5171,12 +5760,48 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
return (0);
}
return (EINVAL);
+#if __FreeBSD_version >= 1500040
+ case _PC_NAMEDATTR_ENABLED:
+ MNT_ILOCK(ap->a_vp->v_mount);
+ if ((ap->a_vp->v_mount->mnt_flag & MNT_NAMEDATTR) != 0)
+ *ap->a_retval = 1;
+ else
+ *ap->a_retval = 0;
+ MNT_IUNLOCK(ap->a_vp->v_mount);
+ return (0);
+ case _PC_HAS_NAMEDATTR:
+ if (zfs_has_namedattr(ap->a_vp, curthread->td_ucred))
+ *ap->a_retval = 1;
+ else
+ *ap->a_retval = 0;
+ return (0);
+#endif
+#ifdef _PC_HAS_HIDDENSYSTEM
+ case _PC_HAS_HIDDENSYSTEM:
+ *ap->a_retval = 1;
+ return (0);
+#endif
+#ifdef _PC_CLONE_BLKSIZE
+ case _PC_CLONE_BLKSIZE:
+ zfsvfs = (zfsvfs_t *)ap->a_vp->v_mount->mnt_data;
+ if (zfs_bclone_enabled &&
+ spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os),
+ SPA_FEATURE_BLOCK_CLONING))
+ *ap->a_retval = dsl_dataset_feature_is_active(
+ zfsvfs->z_os->os_dsl_dataset,
+ SPA_FEATURE_LARGE_BLOCKS) ?
+ SPA_MAXBLOCKSIZE :
+ SPA_OLD_MAXBLOCKSIZE;
+ else
+ *ap->a_retval = 0;
+ return (0);
+#endif
default:
return (vop_stdpathconf(ap));
}
}
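From userland, the new pathconf names make named-attribute support discoverable before any O_NAMEDATTR open is attempted. A usage sketch (the path is made up):

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/* 1 if named attributes are enabled on this file's mount. */
	long enabled = pathconf("/tank/somefile", _PC_NAMEDATTR_ENABLED);
	/* 1 if this particular file already has named attributes. */
	long has = pathconf("/tank/somefile", _PC_HAS_NAMEDATTR);

	printf("enabled=%ld has=%ld\n", enabled, has);
	return (0);
}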
-static int zfs_xattr_compat = 1;
+int zfs_xattr_compat = 1;
static int
zfs_check_attrname(const char *name)
@@ -6043,6 +6668,78 @@ zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
return (EOPNOTSUPP);
}
+#ifndef _SYS_SYSPROTO_H_
+struct vop_advise_args {
+ struct vnode *a_vp;
+ off_t a_start;
+ off_t a_end;
+ int a_advice;
+};
+#endif
+
+static int
+zfs_freebsd_advise(struct vop_advise_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ off_t start = ap->a_start;
+ off_t end = ap->a_end;
+ int advice = ap->a_advice;
+ off_t len;
+ znode_t *zp;
+ zfsvfs_t *zfsvfs;
+ objset_t *os;
+ int error = 0;
+
+ if (end < start)
+ return (EINVAL);
+
+ error = vn_lock(vp, LK_SHARED);
+ if (error)
+ return (error);
+
+ zp = VTOZ(vp);
+ zfsvfs = zp->z_zfsvfs;
+ os = zp->z_zfsvfs->z_os;
+
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ goto out_unlock;
+
+	/* kern_posix_fadvise() points at the last byte; we want one past it. */
+ if (end != OFF_MAX)
+ end += 1;
+ len = end - start;
+
+ switch (advice) {
+ case POSIX_FADV_WILLNEED:
+ /*
+ * Pass on the caller's size directly, but note that
+ * dmu_prefetch_max will effectively cap it. If there really
+ * is a larger sequential access pattern, perhaps dmu_zfetch
+ * will detect it.
+ */
+ dmu_prefetch(os, zp->z_id, 0, start, len,
+ ZIO_PRIORITY_ASYNC_READ);
+ break;
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_DONTNEED:
+ case POSIX_FADV_NOREUSE:
+ /* ignored for now */
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ zfs_exit(zfsvfs, FTAG);
+
+out_unlock:
+ VOP_UNLOCK(vp);
+
+ return (error);
+}
+
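A hedged userspace sketch of exercising this path: posix_fadvise(2) with POSIX_FADV_WILLNEED now reaches dmu_prefetch() through zfs_freebsd_advise(). The file path below is illustrative.

/*
 * Ask the kernel to prefetch the first 16 MiB of a file on a ZFS
 * mount. posix_fadvise() returns an error number, not -1/errno.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	int fd, err;

	fd = open("/tank/file.bin", O_RDONLY);	/* illustrative path */
	if (fd < 0) {
		perror("open");
		return (1);
	}
	err = posix_fadvise(fd, 0, 16 << 20, POSIX_FADV_WILLNEED);
	if (err != 0)
		fprintf(stderr, "posix_fadvise: %s\n", strerror(err));
	/* Subsequent reads may find the data already prefetched. */
	return (0);
}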
static int
zfs_vptocnp(struct vop_vptocnp_args *ap)
{
@@ -6137,9 +6834,11 @@ zfs_deallocate(struct vop_deallocate_args *ap)
if (error == 0) {
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS ||
(ap->a_ioflag & IO_SYNC) != 0)
- zil_commit(zilog, zp->z_id);
- *ap->a_offset = off + len;
- *ap->a_len = 0;
+ error = zil_commit(zilog, zp->z_id);
+ if (error == 0) {
+ *ap->a_offset = off + len;
+ *ap->a_len = 0;
+ }
}
zfs_exit(zfsvfs, FTAG);
@@ -6279,6 +6978,7 @@ struct vop_vector zfs_vnodeops = {
.vop_link = zfs_freebsd_link,
.vop_symlink = zfs_freebsd_symlink,
.vop_readlink = zfs_freebsd_readlink,
+ .vop_advise = zfs_freebsd_advise,
.vop_read = zfs_freebsd_read,
.vop_write = zfs_freebsd_write,
.vop_remove = zfs_freebsd_remove,
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
index ce7b93d20a47..649022ab5bcb 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
@@ -67,8 +67,12 @@
#include "zfs_comutil.h"
/* Used by fstat(1). */
+#ifdef SYSCTL_SIZEOF
+SYSCTL_SIZEOF(znode, znode_t);
+#else
SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, sizeof (znode_t), "sizeof(znode_t)");
+#endif
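
The exported value can be read from userspace roughly as follows (a sketch: the fallback branch above exports an int, and this assumes the SYSCTL_SIZEOF variant is compatible with an int-sized read).

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int val = 0;
	size_t len = sizeof (val);

	if (sysctlbyname("debug.sizeof.znode", &val, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("sizeof(znode_t) = %d\n", val);
	return (0);
}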
/*
* Define ZNODE_STATS to turn on statistic gathering. By default, it is only
@@ -146,8 +150,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
zp->z_xattr_cached = NULL;
zp->z_xattr_parent = 0;
zp->z_vnode = NULL;
- zp->z_sync_writes_cnt = 0;
- zp->z_async_writes_cnt = 0;
return (0);
}
@@ -159,18 +161,15 @@ zfs_znode_cache_destructor(void *buf, void *arg)
znode_t *zp = buf;
ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
- ASSERT3P(zp->z_vnode, ==, NULL);
+ ASSERT0P(zp->z_vnode);
ASSERT(!list_link_active(&zp->z_link_node));
mutex_destroy(&zp->z_lock);
mutex_destroy(&zp->z_acl_lock);
rw_destroy(&zp->z_xattr_lock);
zfs_rangelock_fini(&zp->z_rangelock);
- ASSERT3P(zp->z_acl_cached, ==, NULL);
- ASSERT3P(zp->z_xattr_cached, ==, NULL);
-
- ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
- ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
+ ASSERT0P(zp->z_acl_cached);
+ ASSERT0P(zp->z_xattr_cached);
}
@@ -196,7 +195,7 @@ zfs_znode_init(void)
/*
* Initialize zcache
*/
- ASSERT3P(znode_uma_zone, ==, NULL);
+ ASSERT0P(znode_uma_zone);
znode_uma_zone = uma_zcreate("zfs_znode_cache",
sizeof (znode_t), zfs_znode_cache_constructor_smr,
zfs_znode_cache_destructor_smr, NULL, NULL, 0, 0);
@@ -225,7 +224,7 @@ zfs_znode_init(void)
/*
* Initialize zcache
*/
- ASSERT3P(znode_cache, ==, NULL);
+ ASSERT0P(znode_cache);
znode_cache = kmem_cache_create("zfs_znode_cache",
sizeof (znode_t), 0, zfs_znode_cache_constructor,
zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_RECLAIMABLE);
@@ -289,6 +288,7 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
sharezp->z_atime_dirty = 0;
sharezp->z_zfsvfs = zfsvfs;
sharezp->z_is_sa = zfsvfs->z_use_sa;
+ sharezp->z_pflags = 0;
VERIFY0(zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
kcred, NULL, &acl_ids, NULL));
@@ -353,8 +353,8 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
- ASSERT3P(zp->z_sa_hdl, ==, NULL);
- ASSERT3P(zp->z_acl_cached, ==, NULL);
+ ASSERT0P(zp->z_sa_hdl);
+ ASSERT0P(zp->z_acl_cached);
if (sa_hdl == NULL) {
VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, zp,
SA_HDL_SHARED, &zp->z_sa_hdl));
@@ -451,8 +451,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
zp->z_blksz = blksz;
zp->z_seq = 0x7A4653;
zp->z_sync_cnt = 0;
- zp->z_sync_writes_cnt = 0;
- zp->z_async_writes_cnt = 0;
atomic_store_ptr(&zp->z_cached_symlink, NULL);
zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
@@ -560,6 +558,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
uint64_t crtime[2], atime[2], mtime[2], ctime[2];
uint64_t mode, size, links, parent, pflags;
uint64_t dzp_pflags = 0;
+ uint64_t projid = ZFS_DEFAULT_PROJID;
uint64_t rdev = 0;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
dmu_buf_t *db;
@@ -667,6 +666,23 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
if (flag & IS_XATTR)
pflags |= ZFS_XATTR;
+ if (vap->va_type == VREG || vap->va_type == VDIR) {
+ /*
+		 * With the ZFS_PROJID flag we can easily tell whether a
+		 * project ID is stored on disk. See zpl_get_file_info().
+ */
+ if (obj_type != DMU_OT_ZNODE &&
+ dmu_objset_projectquota_enabled(zfsvfs->z_os))
+ pflags |= ZFS_PROJID;
+
+ /*
+ * Inherit project ID from parent if required.
+ */
+ projid = zfs_inherit_projid(dzp);
+ if (dzp_pflags & ZFS_PROJINHERIT)
+ pflags |= ZFS_PROJINHERIT;
+ }
+
/*
* No execs denied will be determined when zfs_mode_compute() is called.
*/
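For reference, zfs_inherit_projid() used above reduces to roughly the following rule (a sketch; the authoritative macro lives in zfs_znode.h).

/*
 * A child takes the parent's project ID only when the parent carries
 * ZFS_PROJINHERIT; otherwise it gets the default project ID.
 */
static uint64_t
inherit_projid_sketch(const znode_t *dzp)
{
	return ((dzp->z_pflags & ZFS_PROJINHERIT) ?
	    dzp->z_projid : ZFS_DEFAULT_PROJID);
}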
@@ -748,6 +764,10 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
if (obj_type == DMU_OT_ZNODE) {
SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
&empty_xattr, 8);
+ } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
+ pflags & ZFS_PROJID) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
+ NULL, &projid, 8);
}
if (obj_type == DMU_OT_ZNODE ||
(vap->va_type == VBLK || vap->va_type == VCHR)) {
@@ -795,6 +815,11 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
(*zpp)->z_pflags = pflags;
(*zpp)->z_mode = mode;
(*zpp)->z_dnodesize = dnodesize;
+ (*zpp)->z_projid = projid;
+
+ vnode_t *vp = ZTOV(*zpp);
+ if (!(flag & IS_ROOT_NODE))
+ vn_seqc_write_begin(vp);
if (vap->va_mask & AT_XVATTR)
zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
@@ -804,7 +829,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
}
if (!(flag & IS_ROOT_NODE)) {
- vnode_t *vp = ZTOV(*zpp);
+ vn_seqc_write_end(vp);
vp->v_vflag |= VV_FORCEINSMQ;
int err = insmntque(vp, zfsvfs->z_vfs);
vp->v_vflag &= ~VV_FORCEINSMQ;
@@ -912,6 +937,11 @@ zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_SPARSE);
}
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+ ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_PROJINHERIT);
+ }
}
int
@@ -1064,6 +1094,7 @@ zfs_rezget(znode_t *zp)
int err;
int count = 0;
uint64_t gen;
+ uint64_t projid = ZFS_DEFAULT_PROJID;
/*
* Remove cached pages before reloading the znode, so that they are not
@@ -1100,7 +1131,7 @@ zfs_rezget(znode_t *zp)
}
rw_exit(&zp->z_xattr_lock);
- ASSERT3P(zp->z_sa_hdl, ==, NULL);
+ ASSERT0P(zp->z_sa_hdl);
err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
if (err) {
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
@@ -1144,6 +1175,17 @@ zfs_rezget(znode_t *zp)
return (SET_ERROR(EIO));
}
+ if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
+ err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
+ &projid, 8);
+ if (err != 0 && err != ENOENT) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (err);
+ }
+ }
+
+ zp->z_projid = projid;
zp->z_mode = mode;
if (gen != zp->z_gen) {
@@ -1260,7 +1302,7 @@ zfs_znode_free(znode_t *zp)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
char *symlink;
- ASSERT3P(zp->z_sa_hdl, ==, NULL);
+ ASSERT0P(zp->z_sa_hdl);
zp->z_vnode = NULL;
mutex_enter(&zfsvfs->z_znodes_lock);
POINTER_INVALIDATE(&zp->z_zfsvfs);
@@ -1725,6 +1767,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
rootzp->z_unlinked = 0;
rootzp->z_atime_dirty = 0;
rootzp->z_is_sa = USE_SA(version, os);
+ rootzp->z_pflags = 0;
zfsvfs->z_os = os;
zfsvfs->z_parent = zfsvfs;
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c
index 5a2c4b8cbf22..91cf38016e00 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c
@@ -1822,9 +1822,3 @@ error:
return (SET_ERROR(ret));
}
-
-#if defined(_KERNEL) && defined(HAVE_SPL)
-module_param(zfs_key_max_salt_uses, ulong, 0644);
-MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value "
- "can be used for generating encryption keys before it is rotated");
-#endif
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
index d18ea9d59fa3..0dd2ecd7fd8d 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
@@ -31,7 +31,7 @@
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
- * Copyright (c) 2024, Klara, Inc.
+ * Copyright (c) 2024, 2025, Klara, Inc.
*/
/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
@@ -99,6 +99,7 @@
#include <geom/geom.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
+#include <cityhash.h>
#include "zfs_namecheck.h"
@@ -112,12 +113,6 @@
#define ZVOL_RW_READ_HELD RW_READ_HELD
#endif
-enum zvol_geom_state {
- ZVOL_GEOM_UNINIT,
- ZVOL_GEOM_STOPPED,
- ZVOL_GEOM_RUNNING,
-};
-
struct zvol_state_os {
#define zso_dev _zso_state._zso_dev
#define zso_geom _zso_state._zso_geom
@@ -131,9 +126,6 @@ struct zvol_state_os {
/* volmode=geom */
struct zvol_state_geom {
struct g_provider *zsg_provider;
- struct bio_queue_head zsg_queue;
- struct mtx zsg_queue_mtx;
- enum zvol_geom_state zsg_state;
} _zso_geom;
} _zso_state;
int zso_dying;
@@ -143,8 +135,7 @@ static uint32_t zvol_minors;
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
-SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
- "Expose as GEOM providers (1), device files (2) or neither");
+
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
"Allow zpools to use zvols as vdevs (DANGEROUS)");
@@ -169,7 +160,7 @@ static d_close_t zvol_cdev_close;
static d_ioctl_t zvol_cdev_ioctl;
static d_read_t zvol_cdev_read;
static d_write_t zvol_cdev_write;
-static d_strategy_t zvol_geom_bio_strategy;
+static d_strategy_t zvol_cdev_bio_strategy;
static d_kqfilter_t zvol_cdev_kqfilter;
static struct cdevsw zvol_cdevsw = {
@@ -181,7 +172,7 @@ static struct cdevsw zvol_cdevsw = {
.d_ioctl = zvol_cdev_ioctl,
.d_read = zvol_cdev_read,
.d_write = zvol_cdev_write,
- .d_strategy = zvol_geom_bio_strategy,
+ .d_strategy = zvol_cdev_bio_strategy,
.d_kqfilter = zvol_cdev_kqfilter,
};
@@ -205,13 +196,10 @@ DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
-static void zvol_geom_run(zvol_state_t *zv);
-static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
-static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
-/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */
+static void zvol_geom_bio_strategy(struct bio *bp, boolean_t sync);
/*
* GEOM mode implementation
@@ -237,25 +225,14 @@ zvol_geom_open(struct g_provider *pp, int flag, int count)
}
retry:
- rw_enter(&zvol_state_lock, ZVOL_RW_READER);
- /*
- * Obtain a copy of private under zvol_state_lock to make sure either
- * the result of zvol free code setting private to NULL is observed,
- * or the zv is protected from being freed because of the positive
- * zv_open_count.
- */
- zv = pp->private;
- if (zv == NULL) {
- rw_exit(&zvol_state_lock);
- err = SET_ERROR(ENXIO);
- goto out_locked;
- }
+ zv = atomic_load_ptr(&pp->private);
+ if (zv == NULL)
+ return (SET_ERROR(ENXIO));
mutex_enter(&zv->zv_state_lock);
if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
- rw_exit(&zvol_state_lock);
err = SET_ERROR(ENXIO);
- goto out_zv_locked;
+ goto out_locked;
}
ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
@@ -268,8 +245,24 @@ retry:
drop_suspend = B_TRUE;
if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
mutex_exit(&zv->zv_state_lock);
+
+ /*
+ * Removal may happen while the locks are down, so
+ * we can't trust zv any longer; we have to start over.
+ */
+ zv = atomic_load_ptr(&pp->private);
+ if (zv == NULL)
+ return (SET_ERROR(ENXIO));
+
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ if (zv->zv_zso->zso_dying ||
+ zv->zv_flags & ZVOL_REMOVING) {
+ err = SET_ERROR(ENXIO);
+ goto out_locked;
+ }
+
/* Check to see if zv_suspend_lock is needed. */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
@@ -277,7 +270,6 @@ retry:
}
}
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -305,7 +297,7 @@ retry:
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
if (err)
- goto out_zv_locked;
+ goto out_locked;
pp->mediasize = zv->zv_volsize;
pp->stripeoffset = 0;
pp->stripesize = zv->zv_volblocksize;
@@ -340,9 +332,8 @@ out_opened:
zvol_last_close(zv);
wakeup(zv);
}
-out_zv_locked:
- mutex_exit(&zv->zv_state_lock);
out_locked:
+ mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
return (err);
@@ -356,12 +347,9 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
boolean_t drop_suspend = B_TRUE;
int new_open_count;
- rw_enter(&zvol_state_lock, ZVOL_RW_READER);
- zv = pp->private;
- if (zv == NULL) {
- rw_exit(&zvol_state_lock);
+ zv = atomic_load_ptr(&pp->private);
+ if (zv == NULL)
return (SET_ERROR(ENXIO));
- }
mutex_enter(&zv->zv_state_lock);
if (zv->zv_flags & ZVOL_EXCL) {
@@ -388,6 +376,15 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ /*
+ * Unlike in zvol_geom_open(), we don't check if
+ * removal started here, because we might be one of the
+ * openers that needs to be thrown out! If we're the
+ * last, we need to call zvol_last_close() below to
+ * finish cleanup. So, no special treatment for us.
+ */
+
/* Check to see if zv_suspend_lock is needed. */
new_open_count = zv->zv_open_count - count;
if (new_open_count != 0) {
@@ -398,7 +395,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
} else {
drop_suspend = B_FALSE;
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -419,37 +415,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
return (0);
}
-static void
-zvol_geom_run(zvol_state_t *zv)
-{
- struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
- struct g_provider *pp = zsg->zsg_provider;
-
- ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
-
- g_error_provider(pp, 0);
-
- kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
- "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
-}
-
-static void
-zvol_geom_destroy(zvol_state_t *zv)
-{
- struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
- struct g_provider *pp = zsg->zsg_provider;
-
- ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
-
- g_topology_assert();
-
- mutex_enter(&zv->zv_state_lock);
- VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
- mutex_exit(&zv->zv_state_lock);
- zsg->zsg_provider = NULL;
- g_wither_geom(pp->geom, ENXIO);
-}
-
void
zvol_wait_close(zvol_state_t *zv)
{
@@ -482,7 +447,7 @@ zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
pp->name, acr, acw, ace));
- if (pp->private == NULL) {
+ if (atomic_load_ptr(&pp->private) == NULL) {
if (acr <= 0 && acw <= 0 && ace <= 0)
return (0);
return (pp->error);
@@ -517,43 +482,9 @@ zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
}
static void
-zvol_geom_worker(void *arg)
-{
- zvol_state_t *zv = arg;
- struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
- struct bio *bp;
-
- ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
-
- thread_lock(curthread);
- sched_prio(curthread, PRIBIO);
- thread_unlock(curthread);
-
- for (;;) {
- mtx_lock(&zsg->zsg_queue_mtx);
- bp = bioq_takefirst(&zsg->zsg_queue);
- if (bp == NULL) {
- if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
- zsg->zsg_state = ZVOL_GEOM_RUNNING;
- wakeup(&zsg->zsg_state);
- mtx_unlock(&zsg->zsg_queue_mtx);
- kthread_exit();
- }
- msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
- PRIBIO | PDROP, "zvol:io", 0);
- continue;
- }
- mtx_unlock(&zsg->zsg_queue_mtx);
- zvol_geom_bio_strategy(bp);
- }
-}
-
-static void
zvol_geom_bio_start(struct bio *bp)
{
zvol_state_t *zv = bp->bio_to->private;
- struct zvol_state_geom *zsg;
- boolean_t first;
if (zv == NULL) {
g_io_deliver(bp, ENXIO);
@@ -565,18 +496,8 @@ zvol_geom_bio_start(struct bio *bp)
return;
}
- if (!THREAD_CAN_SLEEP()) {
- zsg = &zv->zv_zso->zso_geom;
- mtx_lock(&zsg->zsg_queue_mtx);
- first = (bioq_first(&zsg->zsg_queue) == NULL);
- bioq_insert_tail(&zsg->zsg_queue, bp);
- mtx_unlock(&zsg->zsg_queue_mtx);
- if (first)
- wakeup_one(&zsg->zsg_queue);
- return;
- }
-
- zvol_geom_bio_strategy(bp);
+ zvol_geom_bio_strategy(bp, !g_is_geom_thread(curthread) &&
+ THREAD_CAN_SLEEP());
}
static int
@@ -660,9 +581,10 @@ zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
}
static void
-zvol_geom_bio_strategy(struct bio *bp)
+zvol_strategy_impl(zv_request_t *zvr)
{
zvol_state_t *zv;
+ struct bio *bp;
uint64_t off, volsize;
size_t resid;
char *addr;
@@ -673,11 +595,8 @@ zvol_geom_bio_strategy(struct bio *bp)
boolean_t is_dumpified;
boolean_t commit;
- if (bp->bio_to)
- zv = bp->bio_to->private;
- else
- zv = bp->bio_dev->si_drv2;
-
+ bp = zvr->bio;
+ zv = zvr->zv;
if (zv == NULL) {
error = SET_ERROR(ENXIO);
goto out;
@@ -752,7 +671,7 @@ zvol_geom_bio_strategy(struct bio *bp)
while (resid != 0 && off < volsize) {
size_t size = MIN(resid, zvol_maxphys);
if (doread) {
- error = dmu_read(os, ZVOL_OBJ, off, size, addr,
+ error = dmu_read_by_dnode(zv->zv_dn, off, size, addr,
DMU_READ_PREFETCH);
} else {
dmu_tx_t *tx = dmu_tx_create(os);
@@ -761,7 +680,8 @@ zvol_geom_bio_strategy(struct bio *bp)
if (error) {
dmu_tx_abort(tx);
} else {
- dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
+ dmu_write_by_dnode(zv->zv_dn, off, size, addr,
+ tx, DMU_READ_PREFETCH);
zvol_log_write(zv, tx, off, size, commit);
dmu_tx_commit(tx);
}
@@ -800,9 +720,9 @@ unlock:
break;
}
- if (commit) {
+ if (error == 0 && commit) {
commit:
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
}
resume:
rw_exit(&zv->zv_suspend_lock);
@@ -813,6 +733,63 @@ out:
biofinish(bp, NULL, error);
}
+static void
+zvol_strategy_task(void *arg)
+{
+ zv_request_task_t *task = arg;
+
+ zvol_strategy_impl(&task->zvr);
+ zv_request_task_free(task);
+}
+
+static void
+zvol_geom_bio_strategy(struct bio *bp, boolean_t sync)
+{
+ zv_taskq_t *ztqs = &zvol_taskqs;
+ zv_request_task_t *task;
+ zvol_state_t *zv;
+ uint_t tq_idx;
+ uint_t taskq_hash;
+ int error;
+
+ if (bp->bio_to)
+ zv = bp->bio_to->private;
+ else
+ zv = bp->bio_dev->si_drv2;
+
+ if (zv == NULL) {
+ error = SET_ERROR(ENXIO);
+ if (bp->bio_to)
+ g_io_deliver(bp, error);
+ else
+ biofinish(bp, NULL, error);
+ return;
+ }
+
+ zv_request_t zvr = {
+ .zv = zv,
+ .bio = bp,
+ };
+
+ if (sync || zvol_request_sync) {
+ zvol_strategy_impl(&zvr);
+ return;
+ }
+
+ taskq_hash = cityhash3((uintptr_t)zv, curcpu, bp->bio_offset >>
+ ZVOL_TASKQ_OFFSET_SHIFT);
+ tq_idx = taskq_hash % ztqs->tqs_cnt;
+ task = zv_request_task_create(zvr);
+ taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_strategy_task, task,
+ 0, &task->ent);
+}
+
+static void
+zvol_cdev_bio_strategy(struct bio *bp)
+{
+ zvol_geom_bio_strategy(bp, B_FALSE);
+}
+
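As a standalone model of the dispatch above (the helper name and parameters are made up for the sketch): mixing the zvol pointer, the current CPU, and the quantized bio offset spreads one volume's I/O across task queues while keeping nearby offsets on the same queue.

static uint_t
pick_taskq_sketch(const void *zv, uint_t cpu, uint64_t offset,
    uint_t offset_shift, uint_t tq_cnt)
{
	/* cityhash3() is the same mixer used by the Linux zvol path. */
	uint64_t h = cityhash3((uintptr_t)zv, cpu, offset >> offset_shift);

	return (h % tq_cnt);
}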
/*
* Character device mode implementation
*/
@@ -850,7 +827,8 @@ zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
if (bytes > volsize - zfs_uio_offset(&uio))
bytes = volsize - zfs_uio_offset(&uio);
- error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
+ error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
+ DMU_READ_PREFETCH);
if (error) {
/* Convert checksum errors into IO errors. */
if (error == ECKSUM)
@@ -909,7 +887,8 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
dmu_tx_abort(tx);
break;
}
- error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
+ error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
+ DMU_READ_PREFETCH);
if (error == 0)
zvol_log_write(zv, tx, off, bytes, commit);
dmu_tx_commit(tx);
@@ -920,8 +899,8 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
zfs_rangelock_exit(lr);
int64_t nwritten = start_resid - zfs_uio_resid(&uio);
dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
- if (commit)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (error == 0 && commit)
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
rw_exit(&zv->zv_suspend_lock);
return (error);
@@ -935,25 +914,14 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
boolean_t drop_suspend = B_FALSE;
retry:
- rw_enter(&zvol_state_lock, ZVOL_RW_READER);
- /*
- * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
- * the result of zvol free code setting si_drv2 to NULL is observed,
- * or the zv is protected from being freed because of the positive
- * zv_open_count.
- */
- zv = dev->si_drv2;
- if (zv == NULL) {
- rw_exit(&zvol_state_lock);
- err = SET_ERROR(ENXIO);
- goto out_locked;
- }
+ zv = atomic_load_ptr(&dev->si_drv2);
+ if (zv == NULL)
+ return (SET_ERROR(ENXIO));
mutex_enter(&zv->zv_state_lock);
- if (zv->zv_zso->zso_dying) {
- rw_exit(&zvol_state_lock);
+ if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
err = SET_ERROR(ENXIO);
- goto out_zv_locked;
+ goto out_locked;
}
ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
@@ -968,6 +936,13 @@ retry:
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
+ /* Removal started while locks were down. */
+ err = SET_ERROR(ENXIO);
+ goto out_locked;
+ }
+
/* Check to see if zv_suspend_lock is needed. */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
@@ -975,7 +950,6 @@ retry:
}
}
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -1003,7 +977,7 @@ retry:
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
if (err)
- goto out_zv_locked;
+ goto out_locked;
}
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -1030,9 +1004,8 @@ out_opened:
zvol_last_close(zv);
wakeup(zv);
}
-out_zv_locked:
- mutex_exit(&zv->zv_state_lock);
out_locked:
+ mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
return (err);
@@ -1044,12 +1017,9 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
zvol_state_t *zv;
boolean_t drop_suspend = B_TRUE;
- rw_enter(&zvol_state_lock, ZVOL_RW_READER);
- zv = dev->si_drv2;
- if (zv == NULL) {
- rw_exit(&zvol_state_lock);
+ zv = atomic_load_ptr(&dev->si_drv2);
+ if (zv == NULL)
return (SET_ERROR(ENXIO));
- }
mutex_enter(&zv->zv_state_lock);
if (zv->zv_flags & ZVOL_EXCL) {
@@ -1074,6 +1044,15 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ /*
+ * Unlike in zvol_cdev_open(), we don't check if
+ * removal started here, because we might be one of the
+ * openers that needs to be thrown out! If we're the
+ * last, we need to call zvol_last_close() below to
+ * finish cleanup. So, no special treatment for us.
+ */
+
/* Check to see if zv_suspend_lock is needed. */
if (zv->zv_open_count != 1) {
rw_exit(&zv->zv_suspend_lock);
@@ -1083,7 +1062,6 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
} else {
drop_suspend = B_FALSE;
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -1115,7 +1093,8 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
int error;
boolean_t sync;
- zv = dev->si_drv2;
+ zv = atomic_load_ptr(&dev->si_drv2);
+ ASSERT3P(zv, !=, NULL);
error = 0;
KASSERT(zv->zv_open_count > 0,
@@ -1131,7 +1110,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
case DIOCGFLUSH:
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
if (zv->zv_zilog != NULL)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
rw_exit(&zv->zv_suspend_lock);
break;
case DIOCGDELETE:
@@ -1166,7 +1145,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
}
zfs_rangelock_exit(lr);
if (sync)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
rw_exit(&zv->zv_suspend_lock);
break;
case DIOCGSTRIPESIZE:
@@ -1176,6 +1155,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
*(off_t *)data = 0;
break;
case DIOCGATTR: {
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
spa_t *spa = dmu_objset_spa(zv->zv_objset);
struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
uint64_t refd, avail, usedobjs, availobjs;
@@ -1200,6 +1180,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
arg->value.off = refd / DEV_BSIZE;
} else
error = SET_ERROR(ENOIOCTL);
+ rw_exit(&zv->zv_suspend_lock);
break;
}
case FIOSEEKHOLE:
@@ -1210,10 +1191,12 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
hole = (cmd == FIOSEEKHOLE);
noff = *off;
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
RL_READER);
error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
zfs_rangelock_exit(lr);
+ rw_exit(&zv->zv_suspend_lock);
*off = noff;
break;
}
@@ -1262,9 +1245,11 @@ zvol_os_is_zvol(const char *device)
return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}
-void
+int
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
+ int error = 0;
+
ASSERT(RW_LOCK_HELD(&zvol_state_lock));
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -1318,57 +1303,159 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
args.mda_gid = GID_OPERATOR;
args.mda_mode = 0640;
args.mda_si_drv2 = zv;
- if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
- == 0) {
+ error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname);
+ if (error == 0) {
dev->si_iosize_max = maxphys;
zsd->zsd_cdev = dev;
}
}
strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
dataset_kstats_rename(&zv->zv_kstat, newname);
+
+ return (error);
}
/*
- * Remove minor node for the specified volume.
+ * Allocate memory for a new zvol_state_t and set up the GEOM provider
+ * or cdev structures needed to expose the volume as a block device.
*/
-void
-zvol_os_free(zvol_state_t *zv)
+static int
+zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize,
+ zvol_state_t **zvp)
{
- ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
- ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
- ASSERT0(zv->zv_open_count);
+ zvol_state_t *zv;
+ uint64_t volmode;
+ int error;
- ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
+ error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE),
+ &volmode, NULL);
+ if (error)
+ return (error);
- rw_destroy(&zv->zv_suspend_lock);
- zfs_rangelock_fini(&zv->zv_rangelock);
+ if (volmode == ZFS_VOLMODE_DEFAULT)
+ volmode = zvol_volmode;
+
+ if (volmode == ZFS_VOLMODE_NONE)
+ return (0);
+ zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
+ mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
+ zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
+ zv->zv_volmode = volmode;
+ zv->zv_volsize = volsize;
+ zv->zv_volblocksize = volblocksize;
if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
- struct g_provider *pp __maybe_unused = zsg->zsg_provider;
+ struct g_provider *pp;
+ struct g_geom *gp;
+
+ g_topology_lock();
+ gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+ gp->start = zvol_geom_bio_start;
+ gp->access = zvol_geom_access;
+ pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
+ pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+ pp->sectorsize = DEV_BSIZE;
+ pp->mediasize = 0;
+ pp->private = zv;
+
+ zsg->zsg_provider = pp;
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+ struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+ struct cdev *dev;
+ struct make_dev_args args;
+
+ make_dev_args_init(&args);
+ args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+ args.mda_devsw = &zvol_cdevsw;
+ args.mda_cr = NULL;
+ args.mda_uid = UID_ROOT;
+ args.mda_gid = GID_OPERATOR;
+ args.mda_mode = 0640;
+ args.mda_si_drv2 = zv;
+ error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
+ if (error) {
+ kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+ kmem_free(zv, sizeof (zvol_state_t));
+ return (error);
+ }
+
+ dev->si_iosize_max = maxphys;
+ zsd->zsd_cdev = dev;
+ knlist_init_sx(&zsd->zsd_selinfo.si_note, &zv->zv_state_lock);
+ }
+ (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
+ rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
+ zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
- ASSERT3P(pp->private, ==, NULL);
+ *zvp = zv;
+ return (error);
+}
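A caller-contract sketch (compare zvol_os_create_minor() further down): with volmode=none, zvol_alloc() returns 0 but leaves *zvp NULL, so both the error and the pointer must be checked.

static int
create_minor_sketch(const char *name, uint64_t volsize, uint64_t blksz)
{
	zvol_state_t *zv = NULL;
	int error = zvol_alloc(name, volsize, blksz, &zv);

	/* error, or success with nothing to do (volmode=none) */
	if (error != 0 || zv == NULL)
		return (error);
	/* ... continue wiring up the minor device ... */
	return (0);
}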
+
+/*
+ * Remove minor node for the specified volume.
+ */
+void
+zvol_os_remove_minor(zvol_state_t *zv)
+{
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT0(zv->zv_open_count);
+ ASSERT0(atomic_read(&zv->zv_suspend_ref));
+ ASSERT(zv->zv_flags & ZVOL_REMOVING);
+
+ struct zvol_state_os *zso = zv->zv_zso;
+ zv->zv_zso = NULL;
+
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ struct zvol_state_geom *zsg = &zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+ atomic_store_ptr(&pp->private, NULL);
+ mutex_exit(&zv->zv_state_lock);
g_topology_lock();
- zvol_geom_destroy(zv);
+ g_wither_geom(pp->geom, ENXIO);
g_topology_unlock();
- mtx_destroy(&zsg->zsg_queue_mtx);
} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
- struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+ struct zvol_state_dev *zsd = &zso->zso_dev;
struct cdev *dev = zsd->zsd_cdev;
+ if (dev != NULL)
+ atomic_store_ptr(&dev->si_drv2, NULL);
+ mutex_exit(&zv->zv_state_lock);
+
if (dev != NULL) {
- ASSERT3P(dev->si_drv2, ==, NULL);
destroy_dev(dev);
knlist_clear(&zsd->zsd_selinfo.si_note, 0);
knlist_destroy(&zsd->zsd_selinfo.si_note);
}
}
+ kmem_free(zso, sizeof (struct zvol_state_os));
+
+ mutex_enter(&zv->zv_state_lock);
+}
+
+void
+zvol_os_free(zvol_state_t *zv)
+{
+ ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT0(zv->zv_open_count);
+ ASSERT0P(zv->zv_zso);
+
+ ASSERT0P(zv->zv_objset);
+ ASSERT0P(zv->zv_zilog);
+ ASSERT0P(zv->zv_dn);
+
+ ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
+
+ rw_destroy(&zv->zv_suspend_lock);
+ zfs_rangelock_fini(&zv->zv_rangelock);
+
mutex_destroy(&zv->zv_state_lock);
cv_destroy(&zv->zv_removing_cv);
dataset_kstats_destroy(&zv->zv_kstat);
- kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
kmem_free(zv, sizeof (zvol_state_t));
zvol_minors--;
}
@@ -1379,14 +1466,17 @@ zvol_os_free(zvol_state_t *zv)
int
zvol_os_create_minor(const char *name)
{
- zvol_state_t *zv;
+ zvol_state_t *zv = NULL;
objset_t *os;
dmu_object_info_t *doi;
uint64_t volsize;
- uint64_t volmode, hash;
+ uint64_t hash, len;
int error;
bool replayed_zil = B_FALSE;
+ if (zvol_inhibit_dev)
+ return (0);
+
ZFS_LOG(1, "Creating ZVOL %s...", name);
hash = zvol_name_hash(name);
if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
@@ -1412,78 +1502,22 @@ zvol_os_create_minor(const char *name)
if (error)
goto out_dmu_objset_disown;
- error = dsl_prop_get_integer(name,
- zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
- if (error || volmode == ZFS_VOLMODE_DEFAULT)
- volmode = zvol_volmode;
- error = 0;
+ error = zvol_alloc(name, volsize, doi->doi_data_block_size, &zv);
+ if (error || zv == NULL)
+ goto out_dmu_objset_disown;
- /*
- * zvol_alloc equivalent ...
- */
- zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
zv->zv_hash = hash;
- mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
- zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
- zv->zv_volmode = volmode;
- if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
- struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
- struct g_provider *pp;
- struct g_geom *gp;
-
- zsg->zsg_state = ZVOL_GEOM_UNINIT;
- mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);
-
- g_topology_lock();
- gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
- gp->start = zvol_geom_bio_start;
- gp->access = zvol_geom_access;
- pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
- pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
- pp->sectorsize = DEV_BSIZE;
- pp->mediasize = 0;
- pp->private = zv;
-
- zsg->zsg_provider = pp;
- bioq_init(&zsg->zsg_queue);
- } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
- struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
- struct cdev *dev;
- struct make_dev_args args;
-
- make_dev_args_init(&args);
- args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
- args.mda_devsw = &zvol_cdevsw;
- args.mda_cr = NULL;
- args.mda_uid = UID_ROOT;
- args.mda_gid = GID_OPERATOR;
- args.mda_mode = 0640;
- args.mda_si_drv2 = zv;
- if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
- == 0) {
- dev->si_iosize_max = maxphys;
- zsd->zsd_cdev = dev;
- knlist_init_sx(&zsd->zsd_selinfo.si_note,
- &zv->zv_state_lock);
- }
- }
- (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
- rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
- zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
zv->zv_flags |= ZVOL_RDONLY;
- zv->zv_volblocksize = doi->doi_data_block_size;
- zv->zv_volsize = volsize;
zv->zv_objset = os;
- ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+ ASSERT0P(zv->zv_kstat.dk_kstats);
error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
if (error)
goto out_dmu_objset_disown;
- ASSERT3P(zv->zv_zilog, ==, NULL);
+ ASSERT0P(zv->zv_zilog);
zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
if (spa_writeable(dmu_objset_spa(os))) {
if (zil_replay_disable)
@@ -1495,19 +1529,25 @@ zvol_os_create_minor(const char *name)
zil_close(zv->zv_zilog);
zv->zv_zilog = NULL;
- /* TODO: prefetch for geom tasting */
+ len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
+ if (len > 0) {
+ dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_ASYNC_READ);
+ dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
+ ZIO_PRIORITY_ASYNC_READ);
+ }
zv->zv_objset = NULL;
out_dmu_objset_disown:
dmu_objset_disown(os, B_TRUE, FTAG);
- if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
- zvol_geom_run(zv);
+ if (error == 0 && zv && zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0);
+		/* The GEOM topology was locked inside zvol_alloc(). */
g_topology_unlock();
}
out_doi:
kmem_free(doi, sizeof (dmu_object_info_t));
- if (error == 0) {
+ if (error == 0 && zv) {
rw_enter(&zvol_state_lock, RW_WRITER);
zvol_insert(zv);
zvol_minors++;
@@ -1518,35 +1558,6 @@ out_doi:
return (error);
}
-void
-zvol_os_clear_private(zvol_state_t *zv)
-{
- ASSERT(RW_LOCK_HELD(&zvol_state_lock));
- if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
- struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
- struct g_provider *pp = zsg->zsg_provider;
-
- if (pp->private == NULL) /* already cleared */
- return;
-
- mtx_lock(&zsg->zsg_queue_mtx);
- zsg->zsg_state = ZVOL_GEOM_STOPPED;
- pp->private = NULL;
- wakeup_one(&zsg->zsg_queue);
- while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
- msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
- 0, "zvol:w", 0);
- mtx_unlock(&zsg->zsg_queue_mtx);
- ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
- } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
- struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
- struct cdev *dev = zsd->zsd_cdev;
-
- if (dev != NULL)
- dev->si_drv2 = NULL;
- }
-}
-
int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
@@ -1584,13 +1595,21 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{
- // XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
+ /*
+	 * The ro/rw mode of a ZVOL is switched by zvol_set_ro(), which
+	 * toggles the ZVOL_RDONLY flag. No additional FreeBSD-specific
+	 * action is required when the readonly property changes.
+ */
}
void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{
- // XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
+ /*
+	 * The ZVOL size/capacity is changed by zvol_set_volsize(). Leave
+	 * this function empty, because all of the required work is done
+	 * by the platform-specific zvol_os_update_volsize().
+ */
}
/*
@@ -1606,8 +1625,7 @@ zvol_busy(void)
int
zvol_init(void)
{
- zvol_init_impl();
- return (0);
+ return (zvol_init_impl());
}
void
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c
index ce9c9e39e60c..aac5f2ebbfd2 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c
@@ -66,9 +66,9 @@ void
__cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg)
{
ASSERT(cvp);
- ASSERT(name == NULL);
+ ASSERT0P(name);
ASSERT(type == CV_DEFAULT);
- ASSERT(arg == NULL);
+ ASSERT0P(arg);
cvp->cv_magic = CV_MAGIC;
init_waitqueue_head(&cvp->cv_event);
@@ -83,7 +83,7 @@ static int
cv_destroy_wakeup(kcondvar_t *cvp)
{
if (!atomic_read(&cvp->cv_waiters) && !atomic_read(&cvp->cv_refs)) {
- ASSERT(cvp->cv_mutex == NULL);
+ ASSERT0P(cvp->cv_mutex);
ASSERT(!waitqueue_active(&cvp->cv_event));
return (1);
}
@@ -104,7 +104,7 @@ __cv_destroy(kcondvar_t *cvp)
while (cv_destroy_wakeup(cvp) == 0)
wait_event_timeout(cvp->cv_destroy, cv_destroy_wakeup(cvp), 1);
- ASSERT3P(cvp->cv_mutex, ==, NULL);
+ ASSERT0P(cvp->cv_mutex);
ASSERT3S(atomic_read(&cvp->cv_refs), ==, 0);
ASSERT3S(atomic_read(&cvp->cv_waiters), ==, 0);
ASSERT3S(waitqueue_active(&cvp->cv_event), ==, 0);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
index 115c9460f3e6..89ca4a648b2f 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
@@ -555,29 +555,6 @@ ddi_copyin(const void *from, void *to, size_t len, int flags)
}
EXPORT_SYMBOL(ddi_copyin);
-#define define_spl_param(type, fmt) \
-int \
-spl_param_get_##type(char *buf, zfs_kernel_param_t *kp) \
-{ \
- return (scnprintf(buf, PAGE_SIZE, fmt "\n", \
- *(type *)kp->arg)); \
-} \
-int \
-spl_param_set_##type(const char *buf, zfs_kernel_param_t *kp) \
-{ \
- return (kstrto##type(buf, 0, (type *)kp->arg)); \
-} \
-const struct kernel_param_ops spl_param_ops_##type = { \
- .set = spl_param_set_##type, \
- .get = spl_param_get_##type, \
-}; \
-EXPORT_SYMBOL(spl_param_get_##type); \
-EXPORT_SYMBOL(spl_param_set_##type); \
-EXPORT_SYMBOL(spl_param_ops_##type);
-
-define_spl_param(s64, "%lld")
-define_spl_param(u64, "%llu")
-
/*
* Post a uevent to userspace whenever a new vdev adds to the pool. It is
* necessary to sync blkid information with udev, which zed daemon uses
@@ -732,7 +709,7 @@ zone_get_hostid(void *zone)
{
uint32_t hostid;
- ASSERT3P(zone, ==, NULL);
+ ASSERT0P(zone);
if (spl_hostid != 0)
return ((uint32_t)(spl_hostid & HW_HOSTID_MASK));
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
index fab80289b278..22e4ed169d03 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
@@ -296,7 +296,7 @@ spl_slab_free(spl_kmem_slab_t *sks,
spl_kmem_cache_t *skc;
ASSERT(sks->sks_magic == SKS_MAGIC);
- ASSERT(sks->sks_ref == 0);
+ ASSERT0(sks->sks_ref);
skc = sks->sks_cache;
ASSERT(skc->skc_magic == SKC_MAGIC);
@@ -598,7 +598,7 @@ static void
spl_magazine_free(spl_kmem_magazine_t *skm)
{
ASSERT(skm->skm_magic == SKM_MAGIC);
- ASSERT(skm->skm_avail == 0);
+ ASSERT0(skm->skm_avail);
kfree(skm);
}
@@ -610,7 +610,7 @@ spl_magazine_create(spl_kmem_cache_t *skc)
{
int i = 0;
- ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+ ASSERT0((skc->skc_flags & KMC_SLAB));
skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
@@ -640,7 +640,7 @@ spl_magazine_destroy(spl_kmem_cache_t *skc)
spl_kmem_magazine_t *skm;
int i = 0;
- ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+ ASSERT0((skc->skc_flags & KMC_SLAB));
for_each_possible_cpu(i) {
skm = skc->skc_mag[i];
@@ -679,8 +679,8 @@ spl_kmem_cache_create(const char *name, size_t size, size_t align,
/*
* Unsupported flags
*/
- ASSERT(vmp == NULL);
- ASSERT(reclaim == NULL);
+ ASSERT0P(vmp);
+ ASSERT0P(reclaim);
might_sleep();
@@ -863,11 +863,11 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
* Validate there are no objects in use and free all the
* spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
*/
- ASSERT3U(skc->skc_slab_alloc, ==, 0);
- ASSERT3U(skc->skc_obj_alloc, ==, 0);
- ASSERT3U(skc->skc_slab_total, ==, 0);
- ASSERT3U(skc->skc_obj_total, ==, 0);
- ASSERT3U(skc->skc_obj_emergency, ==, 0);
+ ASSERT0(skc->skc_slab_alloc);
+ ASSERT0(skc->skc_obj_alloc);
+ ASSERT0(skc->skc_slab_total);
+ ASSERT0(skc->skc_obj_total);
+ ASSERT0(skc->skc_obj_emergency);
ASSERT(list_empty(&skc->skc_complete_list));
ASSERT3U(percpu_counter_sum(&skc->skc_linux_alloc), ==, 0);
@@ -986,7 +986,7 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
ASSERT0(flags & ~KM_PUBLIC_MASK);
ASSERT(skc->skc_magic == SKC_MAGIC);
- ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+ ASSERT0((skc->skc_flags & KMC_SLAB));
*obj = NULL;
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
index 337a4bcf76a0..9fe008cef868 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
@@ -302,13 +302,8 @@ spl_kmem_free_impl(const void *buf, size_t size)
#ifdef DEBUG_KMEM
/* Shim layer memory accounting */
-#ifdef HAVE_ATOMIC64_T
atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
-unsigned long long kmem_alloc_max = 0;
-#else /* HAVE_ATOMIC64_T */
-atomic_t kmem_alloc_used = ATOMIC_INIT(0);
-unsigned long long kmem_alloc_max = 0;
-#endif /* HAVE_ATOMIC64_T */
+uint64_t kmem_alloc_max = 0;
EXPORT_SYMBOL(kmem_alloc_used);
EXPORT_SYMBOL(kmem_alloc_max);
@@ -320,9 +315,9 @@ spl_kmem_alloc_debug(size_t size, int flags, int node)
ptr = spl_kmem_alloc_impl(size, flags, node);
if (ptr) {
- kmem_alloc_used_add(size);
- if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
- kmem_alloc_max = kmem_alloc_used_read();
+ atomic64_add(size, &kmem_alloc_used);
+ if (unlikely(atomic64_read(&kmem_alloc_used) > kmem_alloc_max))
+ kmem_alloc_max = atomic64_read(&kmem_alloc_used);
}
return (ptr);
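The accounting pattern above as a self-contained model: the high-water update is deliberately racy, matching the original, since a slightly stale maximum is acceptable for a debug-only statistic.

static atomic64_t used = ATOMIC64_INIT(0);
static uint64_t used_max;

static void
account_alloc(size_t size)
{
	atomic64_add(size, &used);
	/* Racy read-then-store; good enough for a debug counter. */
	if (unlikely(atomic64_read(&used) > used_max))
		used_max = atomic64_read(&used);
}

static void
account_free(size_t size)
{
	atomic64_sub(size, &used);
}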
@@ -331,7 +326,7 @@ spl_kmem_alloc_debug(size_t size, int flags, int node)
inline void
spl_kmem_free_debug(const void *ptr, size_t size)
{
- kmem_alloc_used_sub(size);
+ atomic64_sub(size, &kmem_alloc_used);
spl_kmem_free_impl(ptr, size);
}
@@ -595,7 +590,7 @@ spl_kmem_init(void)
{
#ifdef DEBUG_KMEM
- kmem_alloc_used_set(0);
+ atomic64_set(&kmem_alloc_used, 0);
@@ -617,9 +612,10 @@ spl_kmem_fini(void)
* at that address to aid in debugging. Performance is not
* a serious concern here since it is module unload time.
*/
- if (kmem_alloc_used_read() != 0)
+ if (atomic64_read(&kmem_alloc_used) != 0)
printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
- (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);
+ (unsigned long)atomic64_read(&kmem_alloc_used),
+ kmem_alloc_max);
#ifdef DEBUG_KMEM_TRACKING
spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c
index 0a6125755118..02c5b42bc4a0 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c
@@ -395,7 +395,7 @@ kstat_delete_module(kstat_module_t *module)
kstat_module_t *parent = module->ksm_parent;
- char *p = module->ksm_name, *frag;
+ char *p = module->ksm_name, *frag = NULL;
while (p != NULL && (frag = strsep(&p, "/"))) {}
remove_proc_entry(frag, parent ? parent->ksm_proc : proc_spl_kstat);
@@ -420,7 +420,7 @@ kstat_create_module(char *name)
(void) strlcpy(buf, name, KSTAT_STRLEN);
- parent = NULL;
+ module = parent = NULL;
char *p = buf, *frag;
while ((frag = strsep(&p, "/")) != NULL) {
module = kstat_find_module(buf);
@@ -454,7 +454,6 @@ kstat_create_module(char *name)
}
return (module);
-
}
static int
@@ -542,7 +541,7 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name,
kstat_t *ksp;
ASSERT(ks_module);
- ASSERT(ks_instance == 0);
+ ASSERT0(ks_instance);
ASSERT(ks_name);
if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO))
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
index 4ed0deedd5b9..8cdd5fc5cfe5 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
@@ -82,11 +82,7 @@ proc_domemused(CONST_CTL_TABLE *table, int write,
if (write) {
*ppos += *lenp;
} else {
-#ifdef HAVE_ATOMIC64_T
val = atomic64_read((atomic64_t *)table->data);
-#else
- val = atomic_read((atomic_t *)table->data);
-#endif /* HAVE_ATOMIC64_T */
rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
}
@@ -315,18 +311,14 @@ static struct ctl_table spl_kmem_table[] = {
{
.procname = "kmem_used",
.data = &kmem_alloc_used,
-#ifdef HAVE_ATOMIC64_T
.maxlen = sizeof (atomic64_t),
-#else
- .maxlen = sizeof (atomic_t),
-#endif /* HAVE_ATOMIC64_T */
.mode = 0444,
.proc_handler = &proc_domemused,
},
{
.procname = "kmem_max",
.data = &kmem_alloc_max,
- .maxlen = sizeof (unsigned long),
+ .maxlen = sizeof (uint64_t),
.extra1 = &table_min,
.extra2 = &table_max,
.mode = 0444,
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c
index d5b42fdfaf20..092f090d934b 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c
@@ -37,6 +37,12 @@
#include <sys/atomic.h>
#include <sys/kstat.h>
#include <linux/cpuhotplug.h>
+#include <linux/mod_compat.h>
+
+/*
+ * Linux 6.2 renamed del_timer_sync() to timer_delete_sync(); map the
+ * new name back to the old one on kernels that predate the rename.
+ */
+#ifndef HAVE_TIMER_DELETE_SYNC
+#define timer_delete_sync(t) del_timer_sync(t)
+#endif
typedef struct taskq_kstats {
/* static values, for completeness */
@@ -633,7 +639,7 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id)
*/
if (timer_pending(&t->tqent_timer)) {
spin_unlock_irqrestore(&tq->tq_lock, flags);
- del_timer_sync(&t->tqent_timer);
+ timer_delete_sync(&t->tqent_timer);
spin_lock_irqsave_nested(&tq->tq_lock, flags,
tq->tq_lock_class);
}
@@ -1646,18 +1652,8 @@ spl_taskq_kstat_fini(void)
static unsigned int spl_taskq_kick = 0;
-/*
- * 2.6.36 API Change
- * module_param_cb is introduced to take kernel_param_ops and
- * module_param_call is marked as obsolete. Also set and get operations
- * were changed to take a 'const struct kernel_param *'.
- */
static int
-#ifdef module_param_cb
-param_set_taskq_kick(const char *val, const struct kernel_param *kp)
-#else
-param_set_taskq_kick(const char *val, struct kernel_param *kp)
-#endif
+param_set_taskq_kick(const char *val, zfs_kernel_param_t *kp)
{
int ret;
taskq_t *tq = NULL;
@@ -1687,16 +1683,8 @@ param_set_taskq_kick(const char *val, struct kernel_param *kp)
return (ret);
}
-#ifdef module_param_cb
-static const struct kernel_param_ops param_ops_taskq_kick = {
- .set = param_set_taskq_kick,
- .get = param_get_uint,
-};
-module_param_cb(spl_taskq_kick, &param_ops_taskq_kick, &spl_taskq_kick, 0644);
-#else
module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint,
&spl_taskq_kick, 0644);
-#endif
MODULE_PARM_DESC(spl_taskq_kick,
"Write nonzero to kick stuck taskqs to spawn more threads");
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
index 1398483a3ac8..8f5c73b13df5 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
@@ -28,6 +28,7 @@
#include <sys/kmem.h>
#include <sys/tsd.h>
#include <sys/string.h>
+#include <sys/misc.h>
/*
* Thread interfaces
@@ -79,7 +80,7 @@ __thread_create(caddr_t stk, size_t stksize, thread_func_t func,
/* Option pp is simply ignored */
/* Variable stack size unsupported */
- ASSERT(stk == NULL);
+ ASSERT0P(stk);
tp = kmem_alloc(sizeof (thread_priv_t), KM_PUSHPAGE);
if (tp == NULL)
@@ -197,3 +198,14 @@ issig(void)
}
EXPORT_SYMBOL(issig);
+
+/*
+ * Check if the current thread is a memory reclaim thread.
+ * Returns true if current thread is kswapd.
+ */
+int
+current_is_reclaim_thread(void)
+{
+ return (current_is_kswapd());
+}
+EXPORT_SYMBOL(current_is_reclaim_thread);
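A hypothetical caller, showing the intended use: pick a non-sleeping allocation when already running in the reclaim (kswapd) context, so the allocation cannot recurse back into reclaim. kmem_alloc() and the KM_* flags are the existing SPL API.

static void *
alloc_for_context(size_t size)
{
	int flags = current_is_reclaim_thread() ? KM_NOSLEEP : KM_SLEEP;

	return (kmem_alloc(size, flags));
}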
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c b/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c
index 34a61bef7d4f..2e8cedf0dc87 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c
@@ -161,7 +161,7 @@ tsd_hash_add(tsd_hash_table_t *table, uint_t key, pid_t pid, void *value)
ulong_t hash;
int rc = 0;
- ASSERT3P(tsd_hash_search(table, key, pid), ==, NULL);
+ ASSERT0P(tsd_hash_search(table, key, pid));
/* New entry allocate structure, set value, and add to hash */
entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
index 8e4d271976ea..8a8316f63c48 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
@@ -256,10 +256,6 @@ abd_unmark_zfs_page(struct page *page)
#ifndef CONFIG_HIGHMEM
-#ifndef __GFP_RECLAIM
-#define __GFP_RECLAIM __GFP_WAIT
-#endif
-
/*
* The goal is to minimize fragmentation by preferentially populating ABDs
* with higher order compound pages from a single zone. Allocation size is
@@ -867,9 +863,9 @@ abd_iter_advance(struct abd_iter *aiter, size_t amount)
* Ensure that last chunk is not in use. abd_iterate_*() must clear
* this state (directly or abd_iter_unmap()) before advancing.
*/
- ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0P(aiter->iter_mapaddr);
ASSERT0(aiter->iter_mapsize);
- ASSERT3P(aiter->iter_page, ==, NULL);
+ ASSERT0P(aiter->iter_page);
ASSERT0(aiter->iter_page_doff);
ASSERT0(aiter->iter_page_dsize);
@@ -901,7 +897,7 @@ abd_iter_map(struct abd_iter *aiter)
void *paddr;
size_t offset = 0;
- ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0P(aiter->iter_mapaddr);
ASSERT0(aiter->iter_mapsize);
/* There's nothing left to iterate over, so do nothing */
@@ -1340,6 +1336,8 @@ abd_bio_map_off(struct bio *bio, abd_t *abd,
return (io_size);
}
+EXPORT_SYMBOL(abd_alloc_from_pages);
+
/* Tunable Parameters */
module_param(zfs_abd_scatter_enabled, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_enabled,
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/policy.c b/sys/contrib/openzfs/module/os/linux/zfs/policy.c
index c50ffcfe6992..4396a5d9e076 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/policy.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/policy.c
@@ -24,6 +24,7 @@
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2013, Joyent, Inc. All rights reserved.
* Copyright (C) 2016 Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
*
* For Linux the vast majority of this enforcement is already handled via
* the standard Linux VFS permission checks. However certain administrative
@@ -35,28 +36,32 @@
#include <linux/security.h>
#include <linux/vfs_compat.h>
-/*
- * The passed credentials cannot be directly verified because Linux only
- * provides and interface to check the *current* process credentials. In
- * order to handle this the capable() test is only run when the passed
- * credentials match the current process credentials or the kcred. In
- * all other cases this function must fail and return the passed err.
- */
static int
priv_policy_ns(const cred_t *cr, int capability, int err,
struct user_namespace *ns)
{
- if (cr != CRED() && (cr != kcred))
- return (err);
+ /*
+ * The passed credentials cannot be directly verified because Linux
+ * only provides an interface to check the *current* process
+ * credentials. In order to handle this we check if the passed in
+ * creds match the current process credentials or the kcred. If not,
+ * we swap the passed credentials into the current task, perform the
+ * check, and then revert it before returning.
+ */
+ const cred_t *old =
+ (cr != CRED() && cr != kcred) ? override_creds(cr) : NULL;
#if defined(CONFIG_USER_NS)
- if (!(ns ? ns_capable(ns, capability) : capable(capability)))
+ if (ns ? ns_capable(ns, capability) : capable(capability))
#else
- if (!capable(capability))
+ if (capable(capability))
#endif
- return (err);
+ err = 0;
- return (0);
+ if (old)
+ revert_creds(old);
+
+ return (err);
}
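A standalone sketch of the override_creds()/revert_creds() dance used above, assuming the long-standing Linux credential API: run a capability check as another credential, then restore the original.

static bool
capable_as(const struct cred *cr, int cap)
{
	/* Temporarily make cr the subjective credential of this task. */
	const struct cred *old = override_creds(cr);
	bool ok = capable(cap);

	revert_creds(old);
	return (ok);
}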
static int
@@ -249,19 +254,6 @@ secpolicy_zfs(const cred_t *cr)
return (priv_policy(cr, CAP_SYS_ADMIN, EACCES));
}
-/*
- * Equivalent to secpolicy_zfs(), but works even if the cred_t is not that of
- * the current process. Takes both cred_t and proc_t so that this can work
- * easily on all platforms.
- */
-int
-secpolicy_zfs_proc(const cred_t *cr, proc_t *proc)
-{
- if (!has_capability(proc, CAP_SYS_ADMIN))
- return (EACCES);
- return (0);
-}
-
void
secpolicy_setid_clear(vattr_t *vap, cred_t *cr)
{
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
index 29e54b39aa1a..1bd3500e9f66 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
@@ -25,7 +25,7 @@
* Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
* LLNL-CODE-403049.
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
- * Copyright (c) 2023, 2024, Klara Inc.
+ * Copyright (c) 2023, 2024, 2025, Klara, Inc.
*/
#include <sys/zfs_context.h>
@@ -471,13 +471,17 @@ vdev_disk_close(vdev_t *v)
if (v->vdev_reopening || vd == NULL)
return;
+ rw_enter(&vd->vd_lock, RW_WRITER);
+
if (vd->vd_bdh != NULL)
vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
zfs_vdev_holder);
+ v->vdev_tsd = NULL;
+
+ rw_exit(&vd->vd_lock);
rw_destroy(&vd->vd_lock);
kmem_free(vd, sizeof (vdev_disk_t));
- v->vdev_tsd = NULL;
}
/*
@@ -552,7 +556,7 @@ vdev_bio_associate_blkg(struct bio *bio)
#endif
ASSERT3P(q, !=, NULL);
- ASSERT3P(bio->bi_blkg, ==, NULL);
+ ASSERT0P(bio->bi_blkg);
if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
bio->bi_blkg = q->root_blkg;
@@ -574,7 +578,7 @@ vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
bio->bi_bdev = bdev;
ASSERT3P(q, !=, NULL);
- ASSERT3P(bio->bi_blkg, ==, NULL);
+ ASSERT0P(bio->bi_blkg);
if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
bio->bi_blkg = q->root_blkg;
@@ -614,7 +618,7 @@ static inline uint_t
vdev_bio_max_segs(struct block_device *bdev)
{
/*
- * Smallest of the device max segs and the tuneable max segs. Minimum
+ * Smallest of the device max segs and the tunable max segs. Minimum
* 4, so there's room to finish split pages if they come up.
*/
const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
@@ -806,7 +810,7 @@ vbio_completion(struct bio *bio)
* here; instead we stash vbio on the zio and take care of it in the
* done callback.
*/
- ASSERT3P(zio->io_bio, ==, NULL);
+ ASSERT0P(zio->io_bio);
zio->io_bio = vbio;
zio_delay_interrupt(zio);
@@ -966,234 +970,6 @@ vdev_disk_io_rw(zio_t *zio)
return (0);
}
-/* ========== */
-
-/*
- * This is the classic, battle-tested BIO submission code. Until we're totally
- * sure that the new code is safe and correct in all cases, this will remain
- * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
- * load time.
- *
- * These functions have been renamed to vdev_classic_* to make it clear what
- * they belong to, but their implementations are unchanged.
- */
-
-/*
- * Virtual device vector for disks.
- */
-typedef struct dio_request {
- zio_t *dr_zio; /* Parent ZIO */
- atomic_t dr_ref; /* References */
- int dr_error; /* Bio error */
- int dr_bio_count; /* Count of bio's */
- struct bio *dr_bio[]; /* Attached bio's */
-} dio_request_t;
-
-static dio_request_t *
-vdev_classic_dio_alloc(int bio_count)
-{
- dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
- sizeof (struct bio *) * bio_count, KM_SLEEP);
- atomic_set(&dr->dr_ref, 0);
- dr->dr_bio_count = bio_count;
- dr->dr_error = 0;
-
- for (int i = 0; i < dr->dr_bio_count; i++)
- dr->dr_bio[i] = NULL;
-
- return (dr);
-}
-
-static void
-vdev_classic_dio_free(dio_request_t *dr)
-{
- int i;
-
- for (i = 0; i < dr->dr_bio_count; i++)
- if (dr->dr_bio[i])
- bio_put(dr->dr_bio[i]);
-
- kmem_free(dr, sizeof (dio_request_t) +
- sizeof (struct bio *) * dr->dr_bio_count);
-}
-
-static void
-vdev_classic_dio_get(dio_request_t *dr)
-{
- atomic_inc(&dr->dr_ref);
-}
-
-static void
-vdev_classic_dio_put(dio_request_t *dr)
-{
- int rc = atomic_dec_return(&dr->dr_ref);
-
- /*
- * Free the dio_request when the last reference is dropped and
- * ensure zio_interpret is called only once with the correct zio
- */
- if (rc == 0) {
- zio_t *zio = dr->dr_zio;
- int error = dr->dr_error;
-
- vdev_classic_dio_free(dr);
-
- if (zio) {
- zio->io_error = error;
- ASSERT3S(zio->io_error, >=, 0);
- if (zio->io_error)
- vdev_disk_error(zio);
-
- zio_delay_interrupt(zio);
- }
- }
-}
-
-static void
-vdev_classic_physio_completion(struct bio *bio)
-{
- dio_request_t *dr = bio->bi_private;
-
- if (dr->dr_error == 0) {
- dr->dr_error = bi_status_to_errno(bio->bi_status);
- }
-
- /* Drop reference acquired by vdev_classic_physio */
- vdev_classic_dio_put(dr);
-}
-
-static inline unsigned int
-vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
-{
- unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
- bio_size, abd_offset);
-
-#ifdef HAVE_BIO_MAX_SEGS
- return (bio_max_segs(nr_segs));
-#else
- return (MIN(nr_segs, BIO_MAX_PAGES));
-#endif
-}
-
-static int
-vdev_classic_physio(zio_t *zio)
-{
- vdev_t *v = zio->io_vd;
- vdev_disk_t *vd = v->vdev_tsd;
- struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
- size_t io_size = zio->io_size;
- uint64_t io_offset = zio->io_offset;
- int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
- int flags = 0;
-
- dio_request_t *dr;
- uint64_t abd_offset;
- uint64_t bio_offset;
- int bio_size;
- int bio_count = 16;
- int error = 0;
- struct blk_plug plug;
- unsigned short nr_vecs;
-
- /*
- * Accessing outside the block device is never allowed.
- */
- if (io_offset + io_size > bdev_capacity(bdev)) {
- vdev_dbgmsg(zio->io_vd,
- "Illegal access %llu size %llu, device size %llu",
- (u_longlong_t)io_offset,
- (u_longlong_t)io_size,
- (u_longlong_t)bdev_capacity(bdev));
- return (SET_ERROR(EIO));
- }
-
-retry:
- dr = vdev_classic_dio_alloc(bio_count);
-
- if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
- zio->io_vd->vdev_failfast == B_TRUE) {
- bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
- zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
- }
-
- dr->dr_zio = zio;
-
- /*
- * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
- * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
- * can cover at least 128KB and at most 1MB. When the required number
- * of iovec's exceeds this, we are forced to break the IO in multiple
- * bio's and wait for them all to complete. This is likely if the
- * recordsize property is increased beyond 1MB. The default
- * bio_count=16 should typically accommodate the maximum-size zio of
- * 16MB.
- */
-
- abd_offset = 0;
- bio_offset = io_offset;
- bio_size = io_size;
- for (int i = 0; i <= dr->dr_bio_count; i++) {
-
- /* Finished constructing bio's for given buffer */
- if (bio_size <= 0)
- break;
-
- /*
- * If additional bio's are required, we have to retry, but
- * this should be rare - see the comment above.
- */
- if (dr->dr_bio_count == i) {
- vdev_classic_dio_free(dr);
- bio_count *= 2;
- goto retry;
- }
-
- nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
- dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
- if (unlikely(dr->dr_bio[i] == NULL)) {
- vdev_classic_dio_free(dr);
- return (SET_ERROR(ENOMEM));
- }
-
- /* Matching put called by vdev_classic_physio_completion */
- vdev_classic_dio_get(dr);
-
- BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
- dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
- dr->dr_bio[i]->bi_private = dr;
- bio_set_op_attrs(dr->dr_bio[i], rw, flags);
-
- /* Remaining size is returned to become the new size */
- bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
- bio_size, abd_offset);
-
- /* Advance in buffer and construct another bio if needed */
- abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
- bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
- }
-
- /* Extra reference to protect dio_request during vdev_submit_bio */
- vdev_classic_dio_get(dr);
-
- if (dr->dr_bio_count > 1)
- blk_start_plug(&plug);
-
- /* Submit all bio's associated with this dio */
- for (int i = 0; i < dr->dr_bio_count; i++) {
- if (dr->dr_bio[i])
- vdev_submit_bio(dr->dr_bio[i]);
- }
-
- if (dr->dr_bio_count > 1)
- blk_finish_plug(&plug);
-
- vdev_classic_dio_put(dr);
-
- return (error);
-}
-
-/* ========== */
-
static void
vdev_disk_io_flush_completion(struct bio *bio)
{
@@ -1339,8 +1115,6 @@ vdev_disk_io_trim(zio_t *zio)
return (0);
}
-int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;
-
static void
vdev_disk_io_start(zio_t *zio)
{
@@ -1413,7 +1187,7 @@ vdev_disk_io_start(zio_t *zio)
case ZIO_TYPE_READ:
case ZIO_TYPE_WRITE:
zio->io_target_timestamp = zio_handle_io_delay(zio);
- error = vdev_disk_io_rw_fn(zio);
+ error = vdev_disk_io_rw(zio);
rw_exit(&vd->vd_lock);
if (error) {
zio->io_error = error;
@@ -1508,53 +1282,13 @@ vdev_disk_rele(vdev_t *vd)
/* XXX: Implement me as a vnode rele for the device */
}
-/*
- * BIO submission method. See comment above about vdev_classic.
- * Set zfs_vdev_disk_classic=0 for new, =1 for classic
- */
-static uint_t zfs_vdev_disk_classic = 0; /* default new */
-
-/* Set submission function from module parameter */
-static int
-vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
-{
- int err = param_set_uint(buf, kp);
- if (err < 0)
- return (SET_ERROR(err));
-
- vdev_disk_io_rw_fn =
- zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;
-
- printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
- zfs_vdev_disk_classic ? "classic" : "new");
-
- return (0);
-}
-
-/*
- * At first use vdev use, set the submission function from the default value if
- * it hasn't been set already.
- */
-static int
-vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
-{
- (void) spa;
- (void) nv;
- (void) tsd;
-
- if (vdev_disk_io_rw_fn == NULL)
- vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
- vdev_classic_physio : vdev_disk_io_rw;
-
- return (0);
-}
-
vdev_ops_t vdev_disk_ops = {
- .vdev_op_init = vdev_disk_init,
+ .vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_disk_open,
.vdev_op_close = vdev_disk_close,
- .vdev_op_asize = vdev_default_asize,
+ .vdev_op_asize_to_psize = vdev_default_psize,
+ .vdev_op_psize_to_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_disk_io_start,
@@ -1575,29 +1309,6 @@ vdev_ops_t vdev_disk_ops = {
.vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
};
-/*
- * The zfs_vdev_scheduler module option has been deprecated. Setting this
- * value no longer has any effect. It has not yet been entirely removed
- * to allow the module to be loaded if this option is specified in the
- * /etc/modprobe.d/zfs.conf file. The following warning will be logged.
- */
-static int
-param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
-{
- int error = param_set_charp(val, kp);
- if (error == 0) {
- printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
- "is not supported.\n");
- }
-
- return (error);
-}
-
-static const char *zfs_vdev_scheduler = "unused";
-module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
- param_get_charp, &zfs_vdev_scheduler, 0644);
-MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
-
int
param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
@@ -1646,7 +1357,3 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
"Maximum number of data segments to add to an IO request (min 4)");
-
-ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
- vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
- "Use classic BIO submission method");
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
index 1b169122f25b..daa4b5776837 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
@@ -1900,7 +1900,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
if (!(flag & IS_ROOT_NODE) &&
(dzp->z_pflags & ZFS_INHERIT_ACE) &&
!(dzp->z_pflags & ZFS_XATTR)) {
- VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE,
+ VERIFY0(zfs_acl_node_read(dzp, B_TRUE,
&paclp, B_FALSE));
acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
vap->va_mode, paclp, acl_ids->z_mode, &need_chmod);
@@ -2204,8 +2204,8 @@ top:
}
error = zfs_aclset_common(zp, aclp, cr, tx);
- ASSERT(error == 0);
- ASSERT(zp->z_acl_cached == NULL);
+ ASSERT0(error);
+ ASSERT0P(zp->z_acl_cached);
zp->z_acl_cached = aclp;
if (fuid_dirtied)
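
The mechanical assertion conversions here and throughout the rest of this commit (ASSERT3P(x, ==, NULL) becoming ASSERT0P(x), ASSERT(x == 0) becoming ASSERT0(x), and so on) lean on the zero-checking macro family. In spirit, the macros amount to the following (a sketch; the real OpenZFS definitions also print the offending values on failure, and VERIFY* stays active in non-debug builds while ASSERT* compiles away):

    #define ASSERT0(x)      ASSERT3S((x), ==, 0)    /* integer is zero */
    #define ASSERT0P(x)     ASSERT3P((x), ==, NULL) /* pointer is NULL */
    #define VERIFY0(x)      VERIFY3S((x), ==, 0)    /* checked always  */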
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
index 84b25cb2c5ac..fb4de50480a3 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
@@ -494,9 +494,9 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
if (!creation)
now = current_time(ip);
zp = ITOZ(ip);
- ASSERT3P(zp->z_dirlocks, ==, NULL);
- ASSERT3P(zp->z_acl_cached, ==, NULL);
- ASSERT3P(zp->z_xattr_cached, ==, NULL);
+ ASSERT0P(zp->z_dirlocks);
+ ASSERT0P(zp->z_acl_cached);
+ ASSERT0P(zp->z_xattr_cached);
zp->z_id = id;
zp->z_unlinked = B_FALSE;
zp->z_atime_dirty = B_FALSE;
@@ -511,8 +511,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
zp->z_pflags = 0;
zp->z_mode = 0;
zp->z_sync_cnt = 0;
- zp->z_sync_writes_cnt = 0;
- zp->z_async_writes_cnt = 0;
ip->i_generation = 0;
ip->i_ino = id;
ip->i_mode = (S_IFDIR | S_IRWXUGO);
@@ -592,7 +590,7 @@ zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id,
int
zfsctl_create(zfsvfs_t *zfsvfs)
{
- ASSERT(zfsvfs->z_ctldir == NULL);
+ ASSERT0P(zfsvfs->z_ctldir);
zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT,
&zpl_fops_root, &zpl_ops_root, 0);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
index 2f935bb3fc8c..e8de536606e2 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
@@ -463,7 +463,7 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
zfsvfs_t *zfsvfs = ZTOZSB(zp);
ASSERT(zp->z_unlinked);
- ASSERT(ZTOI(zp)->i_nlink == 0);
+ ASSERT0(ZTOI(zp)->i_nlink);
VERIFY3U(0, ==,
zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
@@ -662,8 +662,8 @@ zfs_rmnode(znode_t *zp)
uint64_t links;
int error;
- ASSERT(ZTOI(zp)->i_nlink == 0);
- ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0);
+ ASSERT0(ZTOI(zp)->i_nlink);
+ ASSERT0(atomic_read(&ZTOI(zp)->i_count));
/*
* If this is an attribute directory, purge its contents.
@@ -710,7 +710,7 @@ zfs_rmnode(znode_t *zp)
&xattr_obj, sizeof (xattr_obj));
if (error == 0 && xattr_obj) {
error = zfs_zget(zfsvfs, xattr_obj, &xzp);
- ASSERT(error == 0);
+ ASSERT0(error);
}
acl_obj = zfs_external_acl(zp);
@@ -744,12 +744,12 @@ zfs_rmnode(znode_t *zp)
}
if (xzp) {
- ASSERT(error == 0);
+ ASSERT0(error);
mutex_enter(&xzp->z_lock);
xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
clear_nlink(ZTOI(xzp)); /* no more links to it */
links = 0;
- VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ VERIFY0(sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
&links, sizeof (links), tx));
mutex_exit(&xzp->z_lock);
zfs_unlinked_add(xzp, tx);
@@ -872,7 +872,7 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
ctime);
}
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
mutex_exit(&zp->z_lock);
@@ -894,7 +894,7 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
&dzp->z_pflags, sizeof (dzp->z_pflags));
zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
mutex_exit(&dzp->z_lock);
return (0);
@@ -986,7 +986,7 @@ zfs_drop_nlink_locked(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp)
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
NULL, &links, sizeof (links));
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- ASSERT3U(error, ==, 0);
+ ASSERT0(error);
if (unlinkedp != NULL)
*unlinkedp = unlinked;
@@ -1058,7 +1058,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
/* The only error is !zfs_dirempty() and we checked earlier. */
error = zfs_drop_nlink_locked(zp, tx, &unlinked);
- ASSERT3U(error, ==, 0);
+ ASSERT0(error);
mutex_exit(&zp->z_lock);
} else {
error = zfs_dropname(dl, zp, dzp, tx, flag);
@@ -1083,7 +1083,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
mutex_exit(&dzp->z_lock);
if (unlinkedp != NULL)
@@ -1167,7 +1167,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr)
ASSERT(error == 0 && parent == zp->z_id);
#endif
- VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+ VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
sizeof (xzp->z_id), tx));
if (!zp->z_unlinked)
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
index d193eb80dca2..3fdcdbac6f68 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
@@ -115,8 +115,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
*/
int
zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
- ssize_t *resid)
+ uint8_t ashift, ssize_t *resid)
{
+ (void) ashift;
ssize_t rc;
rc = kernel_write(fp, buf, count, &off);
@@ -260,24 +261,12 @@ zfs_file_fsync(zfs_file_t *filp, int flags)
{
int datasync = 0;
int error;
- int fstrans;
if (flags & O_DSYNC)
datasync = 1;
- /*
- * May enter XFS which generates a warning when PF_FSTRANS is set.
- * To avoid this the flag is cleared over vfs_sync() and then reset.
- */
- fstrans = __spl_pf_fstrans_check();
- if (fstrans)
- current->flags &= ~(__SPL_PF_FSTRANS);
-
error = -vfs_fsync(filp, datasync);
- if (fstrans)
- current->flags |= __SPL_PF_FSTRANS;
-
return (error);
}
@@ -292,14 +281,6 @@ int
zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len)
{
/*
- * May enter XFS which generates a warning when PF_FSTRANS is set.
- * To avoid this the flag is cleared over vfs_sync() and then reset.
- */
- int fstrans = __spl_pf_fstrans_check();
- if (fstrans)
- current->flags &= ~(__SPL_PF_FSTRANS);
-
- /*
* When supported by the underlying file system preferentially
* use the fallocate() callback to preallocate the space.
*/
@@ -308,9 +289,6 @@ zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len)
error = -fp->f_op->fallocate(fp,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len);
- if (fstrans)
- current->flags |= __SPL_PF_FSTRANS;
-
if (error)
return (SET_ERROR(error));
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_racct.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_racct.c
index 4dbd6a28b594..18c5d67f9e32 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_racct.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_racct.c
@@ -30,14 +30,14 @@
#include <linux/task_io_accounting_ops.h>
void
-zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
task_io_account_read(size);
spa_iostats_read_add(spa, size, iops, flags);
}
void
-zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
task_io_account_write(size);
spa_iostats_write_add(spa, size, iops, flags);
@@ -46,13 +46,13 @@ zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
#else
void
-zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
(void) spa, (void) size, (void) iops, (void) flags;
}
void
-zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
(void) spa, (void) size, (void) iops, (void) flags;
}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c
index 1c187d7b9cab..895d80b2d79e 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c
@@ -223,7 +223,7 @@ zfs_kobj_add(zfs_mod_kobj_t *zkobj, struct kobject *parent, const char *name)
{
/* zko_default_group.attrs must be NULL terminated */
ASSERT(zkobj->zko_default_group.attrs != NULL);
- ASSERT(zkobj->zko_default_group.attrs[zkobj->zko_attr_count] == NULL);
+ ASSERT0P(zkobj->zko_default_group.attrs[zkobj->zko_attr_count]);
kobject_init(&zkobj->zko_kobj, &zkobj->zko_kobj_type);
return (kobject_add(&zkobj->zko_kobj, parent, name));
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
index 901bd191f2df..d282f6d95ddf 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
@@ -233,9 +233,6 @@ zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
{
size_t cnt = MIN(n, uio->uio_resid);
- if (uio->uio_skip)
- iov_iter_advance(uio->uio_iter, uio->uio_skip);
-
if (rw == UIO_READ)
cnt = copy_to_iter(p, cnt, uio->uio_iter);
else
@@ -507,12 +504,14 @@ static int
zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
long res;
- size_t skip = uio->uio_skip;
+ size_t skip = uio->uio_iter->iov_offset;
size_t len = uio->uio_resid - skip;
unsigned int gup_flags = 0;
unsigned long addr;
unsigned long nr_pages;
+ ASSERT3U(uio->uio_segflg, ==, UIO_ITER);
+
/*
* Kernel 6.2 introduced the FOLL_PCI_P2PDMA flag. This flag could
* possibly be used here in the future to allow for P2P operations with
@@ -577,7 +576,7 @@ static int
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
size_t start;
- size_t wanted = uio->uio_resid - uio->uio_skip;
+ size_t wanted = uio->uio_resid;
ssize_t rollback = 0;
ssize_t cnt;
unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);
@@ -611,7 +610,7 @@ zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
#endif
}
- ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip);
+ ASSERT3U(rollback, ==, uio->uio_resid);
iov_iter_revert(uio->uio_iter, rollback);
return (0);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
index ca75080d5457..8a7d14ab6119 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
@@ -265,6 +265,7 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr)
{
(void) cr;
zfsvfs_t *zfsvfs = sb->s_fs_info;
+ ASSERT3P(zfsvfs, !=, NULL);
/*
* Semantically, the only requirement is that the sync be initiated.
@@ -273,40 +274,19 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr)
if (!wait)
return (0);
- if (zfsvfs != NULL) {
- /*
- * Sync a specific filesystem.
- */
- dsl_pool_t *dp;
- int error;
-
- if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
- return (error);
- dp = dmu_objset_pool(zfsvfs->z_os);
-
- /*
- * If the system is shutting down, then skip any
- * filesystems which may exist on a suspended pool.
- */
- if (spa_suspended(dp->dp_spa)) {
- zfs_exit(zfsvfs, FTAG);
- return (0);
- }
-
- if (zfsvfs->z_log != NULL)
- zil_commit(zfsvfs->z_log, 0);
+ int err = zfs_enter(zfsvfs, FTAG);
+ if (err != 0)
+ return (err);
- zfs_exit(zfsvfs, FTAG);
- } else {
- /*
- * Sync all ZFS filesystems. This is what happens when you
- * run sync(1). Unlike other filesystems, ZFS honors the
- * request by waiting for all pools to commit all dirty data.
- */
- spa_sync_allpools();
- }
+ /*
+ * Sync any pending writes, but do not block if the pool is suspended.
+	 * This helps shutdown make progress while pools are suspended, where
+	 * blocking would hang indefinitely.
+ */
+ err = zil_commit_flags(zfsvfs->z_log, 0, ZIL_COMMIT_NOW);
+ zfs_exit(zfsvfs, FTAG);
- return (0);
+ return (err);
}
static void
@@ -697,6 +677,36 @@ zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
zfsvfs->z_xattr_sa = B_TRUE;
}
+ error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTUSERQUOTA,
+ &zfsvfs->z_defaultuserquota);
+ if (error != 0)
+ return (error);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTGROUPQUOTA,
+ &zfsvfs->z_defaultgroupquota);
+ if (error != 0)
+ return (error);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTPROJECTQUOTA,
+ &zfsvfs->z_defaultprojectquota);
+ if (error != 0)
+ return (error);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTUSEROBJQUOTA,
+ &zfsvfs->z_defaultuserobjquota);
+ if (error != 0)
+ return (error);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTGROUPOBJQUOTA,
+ &zfsvfs->z_defaultgroupobjquota);
+ if (error != 0)
+ return (error);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTPROJECTOBJQUOTA,
+ &zfsvfs->z_defaultprojectobjquota);
+ if (error != 0)
+ return (error);
+
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
&zfsvfs->z_root);
if (error != 0)
@@ -868,7 +878,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
* operations out since we closed the ZIL.
*/
if (mounting) {
- ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
+ ASSERT0P(zfsvfs->z_kstat.dk_kstats);
error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
if (error)
return (error);
@@ -1038,15 +1048,19 @@ zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct kstatfs *statp,
if (err)
return (err);
- if (zfsvfs->z_projectquota_obj == 0)
- goto objs;
-
- err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj,
- buf + offset, 8, 1, &quota);
- if (err == ENOENT)
- goto objs;
- else if (err)
- return (err);
+ if (zfsvfs->z_projectquota_obj == 0) {
+ if (zfsvfs->z_defaultprojectquota == 0)
+ goto objs;
+ quota = zfsvfs->z_defaultprojectquota;
+ } else {
+ err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj,
+ buf + offset, 8, 1, &quota);
+ if (err && (quota = zfsvfs->z_defaultprojectquota) == 0) {
+ if (err == ENOENT)
+ goto objs;
+ return (err);
+ }
+ }
err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
buf + offset, 8, 1, &used);
@@ -1072,15 +1086,21 @@ zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct kstatfs *statp,
statp->f_bavail = statp->f_bfree;
objs:
- if (zfsvfs->z_projectobjquota_obj == 0)
- return (0);
- err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj,
- buf + offset, 8, 1, &quota);
- if (err == ENOENT)
- return (0);
- else if (err)
- return (err);
+ if (zfsvfs->z_projectobjquota_obj == 0) {
+ if (zfsvfs->z_defaultprojectobjquota == 0)
+ return (0);
+ quota = zfsvfs->z_defaultprojectobjquota;
+ } else {
+ err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj,
+ buf + offset, 8, 1, &quota);
+ if (err && (quota = zfsvfs->z_defaultprojectobjquota) == 0) {
+ if (err == ENOENT)
+ return (0);
+ return (err);
+ }
+ }
+
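
The assignment inside the if-condition above is doing double duty; unrolled, the block-quota branch is equivalent to this (a sketch for readability only):

    err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj,
        buf + offset, 8, 1, &quota);
    if (err != 0) {
            /* No per-project entry (or the lookup failed): try default. */
            quota = zfsvfs->z_defaultprojectquota;
            if (quota == 0) {
                    if (err == ENOENT)
                            goto objs;      /* no quota in effect at all */
                    return (err);           /* genuine lookup failure */
            }
            /* A default exists, so the lookup error is swallowed. */
    }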
err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
buf, 8, 1, &used);
@@ -1192,6 +1212,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
}
/*
+ * Dentry and inode caches referenced by a task in non-root memcg are
+ * not going to be scanned by the kernel-provided shrinker. So, if
+ * the kernel prunes nothing, fall back to this manual walk to free dnodes.
+ * To avoid scanning the same znodes multiple times they are always rotated
+ * to the end of the z_all_znodes list. New znodes are inserted at the
+ * end of the list so we're always scanning the oldest znodes first.
+ */
+static int
+zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
+{
+ znode_t **zp_array, *zp;
+ int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *));
+ int objects = 0;
+ int i = 0, j = 0;
+
+ zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
+
+ if ((i++ > nr_to_scan) || (j >= max_array))
+ break;
+
+ ASSERT(list_link_active(&zp->z_link_node));
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+
+ /* Skip active znodes and .zfs entries */
+ if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir)
+ continue;
+
+ if (igrab(ZTOI(zp)) == NULL)
+ continue;
+
+ zp_array[j] = zp;
+ j++;
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ for (i = 0; i < j; i++) {
+ zp = zp_array[i];
+
+ ASSERT3P(zp, !=, NULL);
+ d_prune_aliases(ZTOI(zp));
+
+ if (atomic_read(&ZTOI(zp)->i_count) == 1)
+ objects++;
+
+ zrele(zp);
+ }
+
+ vmem_free(zp_array, max_array * sizeof (znode_t *));
+
+ return (objects);
+}
+
+/*
* The ARC has requested that the filesystem drop entries from the dentry
* and inode caches. This can occur when the ARC needs to free meta data
* blocks but can't because they are all pinned by entries in these caches.
@@ -1242,6 +1319,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
*objects = (*shrinker->scan_objects)(shrinker, &sc);
#endif
+ /*
+ * Fall back to zfs_prune_aliases if kernel's shrinker did nothing
+ * due to dentry and inode caches being referenced by a task running
+ * in non-root memcg.
+ */
+ if (*objects == 0)
+ *objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
+
zfs_exit(zfsvfs, FTAG);
dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
@@ -1471,6 +1556,12 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
sb->s_xattr = zpl_xattr_handlers;
sb->s_export_op = &zpl_export_operations;
+#ifdef HAVE_SET_DEFAULT_D_OP
+ set_default_d_op(sb, &zpl_dentry_operations);
+#else
+ sb->s_d_op = &zpl_dentry_operations;
+#endif
+
/* Set features for file system. */
zfs_set_fuid_feature(zfsvfs);
@@ -1586,7 +1677,7 @@ zfs_umount(struct super_block *sb)
if (zfsvfs->z_arc_prune != NULL)
arc_remove_prune_callback(zfsvfs->z_arc_prune);
- VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
+ VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE));
os = zfsvfs->z_os;
/*
@@ -1712,8 +1803,8 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
ASSERT(*ipp != NULL);
if (object == ZFSCTL_INO_SNAPDIR) {
- VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp,
- 0, kcred, NULL, NULL) == 0);
+ VERIFY0(zfsctl_root_lookup(*ipp, "snapshot", ipp,
+ 0, kcred, NULL, NULL));
} else {
/*
* Must have an existing ref, so igrab()
@@ -1815,7 +1906,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
goto bail;
ds->ds_dir->dd_activity_cancelled = B_FALSE;
- VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+ VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE));
zfs_set_fuid_feature(zfsvfs);
zfsvfs->z_rollback_time = jiffies;
@@ -1988,7 +2079,7 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
ASSERT0(error);
- VERIFY(0 == sa_set_sa_object(os, sa_obj));
+ VERIFY0(sa_set_sa_object(os, sa_obj));
sa_register_update_callback(os, zfs_sa_upgrade);
}
@@ -2005,6 +2096,62 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
return (0);
}
+int
+zfs_set_default_quota(zfsvfs_t *zfsvfs, zfs_prop_t prop, uint64_t quota)
+{
+ int error;
+ objset_t *os = zfsvfs->z_os;
+ const char *propstr = zfs_prop_to_name(prop);
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, propstr);
+ error = dmu_tx_assign(tx, DMU_TX_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+ if (quota == 0) {
+ error = zap_remove(os, MASTER_NODE_OBJ, propstr, tx);
+ if (error == ENOENT)
+ error = 0;
+ } else {
+ error = zap_update(os, MASTER_NODE_OBJ, propstr, 8, 1,
+ &quota, tx);
+ }
+
+ if (error)
+ goto out;
+
+ switch (prop) {
+ case ZFS_PROP_DEFAULTUSERQUOTA:
+ zfsvfs->z_defaultuserquota = quota;
+ break;
+ case ZFS_PROP_DEFAULTGROUPQUOTA:
+ zfsvfs->z_defaultgroupquota = quota;
+ break;
+ case ZFS_PROP_DEFAULTPROJECTQUOTA:
+ zfsvfs->z_defaultprojectquota = quota;
+ break;
+ case ZFS_PROP_DEFAULTUSEROBJQUOTA:
+ zfsvfs->z_defaultuserobjquota = quota;
+ break;
+ case ZFS_PROP_DEFAULTGROUPOBJQUOTA:
+ zfsvfs->z_defaultgroupobjquota = quota;
+ break;
+ case ZFS_PROP_DEFAULTPROJECTOBJQUOTA:
+ zfsvfs->z_defaultprojectobjquota = quota;
+ break;
+ default:
+ break;
+ }
+
+out:
+ dmu_tx_commit(tx);
+ return (error);
+}
+
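
A hypothetical call site, to show the contract (the real consumers would be the property-set ioctl paths, which are outside this file); note that a quota of 0 removes the ZAP entry and clears the cached default rather than imposing a zero limit:

    /* Set a 10 GiB default user quota on this mounted filesystem... */
    error = zfs_set_default_quota(zfsvfs, ZFS_PROP_DEFAULTUSERQUOTA,
        10ULL << 30);

    /* ...and later clear it again. */
    if (error == 0)
            error = zfs_set_default_quota(zfsvfs,
                ZFS_PROP_DEFAULTUSERQUOTA, 0);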
/*
* Return true if the corresponding vfs's unmounted flag is set.
* Otherwise return false.
@@ -2073,4 +2220,5 @@ EXPORT_SYMBOL(zfs_remount);
EXPORT_SYMBOL(zfs_statvfs);
EXPORT_SYMBOL(zfs_vget);
EXPORT_SYMBOL(zfs_prune);
+EXPORT_SYMBOL(zfs_set_default_quota);
#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
index 9ceb6cb8dbdd..6106726651a3 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
@@ -25,6 +25,7 @@
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2015 by Chunwei Chen. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2025, Klara, Inc.
*/
/* Portions Copyright 2007 Jeremy Teo */
@@ -329,7 +330,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
put_page(pp);
} else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
- uio, bytes);
+ uio, bytes, DMU_READ_PREFETCH);
}
len -= bytes;
@@ -840,8 +841,8 @@ out:
*zpp = zp;
}
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -1202,8 +1203,8 @@ out:
zfs_zrele_async(xzp);
}
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -1391,14 +1392,15 @@ out:
zfs_dirent_unlock(dl);
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
if (error != 0) {
zrele(zp);
} else {
zfs_znode_update_vfs(dzp);
zfs_znode_update_vfs(zp);
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
+
}
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -1527,8 +1529,8 @@ out:
zfs_znode_update_vfs(zp);
zrele(zp);
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -2482,10 +2484,10 @@ top:
new_mode = zp->z_mode;
}
err = zfs_acl_chown_setattr(zp);
- ASSERT(err == 0);
+ ASSERT0(err);
if (attrzp) {
err = zfs_acl_chown_setattr(attrzp);
- ASSERT(err == 0);
+ ASSERT0(err);
}
}
@@ -2599,7 +2601,7 @@ out:
if (err == 0 && xattr_count > 0) {
err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
xattr_count, tx);
- ASSERT(err2 == 0);
+ ASSERT0(err2);
}
if (aclp)
@@ -2629,8 +2631,8 @@ out:
}
out2:
- if (os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (err == 0 && os->os_sync == ZFS_SYNC_ALWAYS)
+ err = zil_commit(zilog, 0);
out3:
kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
@@ -3156,7 +3158,7 @@ top:
* zfs_link_create() to add back the same entry, but with a new
* dnode (szp), should not fail.
*/
- ASSERT3P(tzp, ==, NULL);
+ ASSERT0P(tzp);
goto commit_link_tzp;
}
@@ -3234,8 +3236,8 @@ out:
zfs_dirent_unlock(sdl);
zfs_dirent_unlock(tdl);
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -3435,7 +3437,7 @@ top:
*zpp = zp;
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ error = zil_commit(zilog, 0);
} else {
zrele(zp);
}
@@ -3653,8 +3655,8 @@ top:
* operation are sync safe.
*/
if (is_tmpfile) {
- VERIFY(zap_remove_int(zfsvfs->z_os,
- zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
+ VERIFY0(zap_remove_int(zfsvfs->z_os,
+ zfsvfs->z_unlinkedobj, szp->z_id, tx));
} else {
if (flags & FIGNORECASE)
txtype |= TX_CI;
@@ -3669,11 +3671,22 @@ top:
zfs_dirent_unlock(dl);
- if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
- txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
+ if (error == 0) {
+ if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ error = zil_commit(zilog, 0);
+
+ if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
+ txg_wait_flag_t wait_flags =
+ spa_get_failmode(dmu_objset_spa(zfsvfs->z_os)) ==
+ ZIO_FAILURE_MODE_CONTINUE ? TXG_WAIT_SUSPEND : 0;
+ error = txg_wait_synced_flags(
+ dmu_objset_pool(zfsvfs->z_os), txg, wait_flags);
+ if (error != 0) {
+ ASSERT3U(error, ==, ESHUTDOWN);
+ error = SET_ERROR(EIO);
+ }
+ }
+ }
zfs_znode_update_vfs(tdzp);
zfs_znode_update_vfs(szp);
@@ -3681,24 +3694,39 @@ top:
return (error);
}
-static void
-zfs_putpage_sync_commit_cb(void *arg)
+/* Finish page writeback. */
+static inline void
+zfs_page_writeback_done(struct page *pp, int err)
{
- struct page *pp = arg;
+ if (err != 0) {
+ /*
+ * Writeback failed. Re-dirty the page. It was undirtied before
+ * the IO was issued (in zfs_putpage() or write_cache_pages()).
+ * The kernel only considers writeback for dirty pages; if we
+ * don't do this, it is eligible for eviction without being
+ * written out, which we definitely don't want.
+ */
+#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
+ filemap_dirty_folio(page_mapping(pp), page_folio(pp));
+#else
+ __set_page_dirty_nobuffers(pp);
+#endif
+ }
ClearPageError(pp);
end_page_writeback(pp);
}
+/*
+ * ZIL callback for page writeback. Passed to zfs_log_write() in zfs_putpage()
+ * for syncing writes. Called when the ZIL itx has been written to the log or
+ * the whole txg syncs, or if the ZIL crashes or the pool suspends. Any failure
+ * is passed as `err`.
+ */
static void
-zfs_putpage_async_commit_cb(void *arg)
+zfs_putpage_commit_cb(void *arg, int err)
{
- struct page *pp = arg;
- znode_t *zp = ITOZ(pp->mapping->host);
-
- ClearPageError(pp);
- end_page_writeback(pp);
- atomic_dec_32(&zp->z_async_writes_cnt);
+ zfs_page_writeback_done(arg, err);
}
/*
@@ -3818,15 +3846,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
zfs_rangelock_exit(lr);
if (wbc->sync_mode != WB_SYNC_NONE) {
- /*
- * Speed up any non-sync page writebacks since
- * they may take several seconds to complete.
- * Refer to the comment in zpl_fsync() for details.
- */
- if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
- zil_commit(zfsvfs->z_log, zp->z_id);
- }
-
if (PageWriteback(pp))
#ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
folio_wait_bit(page_folio(pp), PG_writeback);
@@ -3852,8 +3871,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
* was in fact not skipped and should not be counted as if it were.
*/
wbc->pages_skipped--;
- if (!for_sync)
- atomic_inc_32(&zp->z_async_writes_cnt);
set_page_writeback(pp);
unlock_page(pp);
@@ -3865,18 +3882,15 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
err = dmu_tx_assign(tx, DMU_TX_WAIT);
if (err != 0) {
dmu_tx_abort(tx);
-#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
- filemap_dirty_folio(page_mapping(pp), page_folio(pp));
-#else
- __set_page_dirty_nobuffers(pp);
-#endif
- ClearPageError(pp);
- end_page_writeback(pp);
- if (!for_sync)
- atomic_dec_32(&zp->z_async_writes_cnt);
+ zfs_page_writeback_done(pp, err);
zfs_rangelock_exit(lr);
zfs_exit(zfsvfs, FTAG);
- return (err);
+
+ /*
+ * Don't return error for an async writeback; we've re-dirtied
+ * the page so it will be tried again some other time.
+ */
+ return (for_sync ? err : 0);
}
va = kmap(pp);
@@ -3899,36 +3913,70 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
- boolean_t commit = B_FALSE;
- if (wbc->sync_mode != WB_SYNC_NONE) {
- /*
- * Note that this is rarely called under writepages(), because
- * writepages() normally handles the entire commit for
- * performance reasons.
- */
- commit = B_TRUE;
- } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
+ /*
+ * A note about for_sync vs wbc->sync_mode.
+ *
+	 * for_sync indicates that this is a syncing writeback, that is, the
+	 * kernel caller expects the data to be durably stored before being
+	 * notified.
+ * Often, but not always, the call was triggered by a userspace syncing
+ * op (eg fsync(), msync(MS_SYNC)). For our purposes, for_sync==TRUE
+	 * means that the page should remain "locked" (in the writeback state)
+ * until it is definitely on disk (ie zil_commit() or spa_sync()).
+ * Otherwise, we can unlock and return as soon as it is on the
+ * in-memory ZIL.
+ *
+ * wbc->sync_mode has similar meaning. wbc is passed from the kernel to
+ * zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE
+ * indicates this a regular async writeback (eg a cache eviction) and
+ * so does not need a durability guarantee, while WB_SYNC_ALL indicates
+ * a syncing op that must be waited on (by convention, we test for
+ * !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer durability over
+ * performance should there ever be a new mode that we have not yet
+ * added support for).
+ *
+ * So, why a separate for_sync field? This is because zpl_writepages()
+ * calls zfs_putpage() multiple times for a single "logical" operation.
+ * It wants all the individual pages to be for_sync==TRUE ie only
+ * unlocked once durably stored, but it only wants one call to
+ * zil_commit() at the very end, once all the pages are synced. So,
+	 * it repurposes sync_mode slightly to indicate who issues and waits for
+	 * the IO: for NONE, the caller of zfs_putpage() will do it, while for
+ * ALL, zfs_putpage should do it.
+ *
+ * Summary:
+ * for_sync: 0=unlock immediately; 1=unlock once on disk
+ * sync_mode: NONE=caller will commit; ALL=we will commit
+ */
+ boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE);
+
+ /*
+ * We use for_sync as the "commit" arg to zfs_log_write() (arg 7)
+ * because it is a policy flag that indicates "someone will call
+ * zil_commit() soon". for_sync=TRUE means exactly that; the only
+ * question is whether it will be us, or zpl_writepages().
+ */
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, for_sync,
+ B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp);
+
+ if (!for_sync) {
/*
- * If the caller does not intend to wait synchronously
- * for this page writeback to complete and there are active
- * synchronous calls on this file, do a commit so that
- * the latter don't accidentally end up waiting for
- * our writeback to complete. Refer to the comment in
- * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
+		 * Async writeback is logged and written to the DMU, so the page
+ * can now be unlocked.
*/
- commit = B_TRUE;
+ zfs_page_writeback_done(pp, 0);
}
- zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
- B_FALSE, for_sync ? zfs_putpage_sync_commit_cb :
- zfs_putpage_async_commit_cb, pp);
-
dmu_tx_commit(tx);
zfs_rangelock_exit(lr);
- if (commit)
- zil_commit(zfsvfs->z_log, zp->z_id);
+ if (need_commit) {
+ err = zil_commit_flags(zfsvfs->z_log, zp->z_id, ZIL_COMMIT_NOW);
+ if (err != 0) {
+ zfs_exit(zfsvfs, FTAG);
+ return (err);
+ }
+ }
dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
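
Taken together, the tail of zfs_putpage() now reads roughly as follows (condensed from the hunks above; the SA bulk update and kstat bookkeeping are elided):

    zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen,
        for_sync, B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp);

    if (!for_sync)
            zfs_page_writeback_done(pp, 0); /* unlock now; ZIL has it */

    dmu_tx_commit(tx);
    zfs_rangelock_exit(lr);

    if (need_commit) {
            /*
             * Syncing op and nobody else will commit: do it here,
             * returning rather than blocking if the pool suspends.
             */
            err = zil_commit_flags(zfsvfs->z_log, zp->z_id,
                ZIL_COMMIT_NOW);
    }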
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c
index 607b3995cb60..bcaabeb32b8a 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c
@@ -126,8 +126,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
zp->z_acl_cached = NULL;
zp->z_xattr_cached = NULL;
zp->z_xattr_parent = 0;
- zp->z_sync_writes_cnt = 0;
- zp->z_async_writes_cnt = 0;
return (0);
}
@@ -146,12 +144,9 @@ zfs_znode_cache_destructor(void *buf, void *arg)
rw_destroy(&zp->z_xattr_lock);
zfs_rangelock_fini(&zp->z_rangelock);
- ASSERT3P(zp->z_dirlocks, ==, NULL);
- ASSERT3P(zp->z_acl_cached, ==, NULL);
- ASSERT3P(zp->z_xattr_cached, ==, NULL);
-
- ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
- ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
+ ASSERT0P(zp->z_dirlocks);
+ ASSERT0P(zp->z_acl_cached);
+ ASSERT0P(zp->z_xattr_cached);
}
static int
@@ -183,13 +178,13 @@ zfs_znode_init(void)
* backed by kmalloc() when on the Linux slab in order that any
* wait_on_bit() operations on the related inode operate properly.
*/
- ASSERT(znode_cache == NULL);
+ ASSERT0P(znode_cache);
znode_cache = kmem_cache_create("zfs_znode_cache",
sizeof (znode_t), 0, zfs_znode_cache_constructor,
zfs_znode_cache_destructor, NULL, NULL, NULL,
KMC_SLAB | KMC_RECLAIMABLE);
- ASSERT(znode_hold_cache == NULL);
+ ASSERT0P(znode_hold_cache);
znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
@@ -332,10 +327,10 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
mutex_enter(&zp->z_lock);
- ASSERT(zp->z_sa_hdl == NULL);
- ASSERT(zp->z_acl_cached == NULL);
+ ASSERT0P(zp->z_sa_hdl);
+ ASSERT0P(zp->z_acl_cached);
if (sa_hdl == NULL) {
- VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
+ VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, zp,
SA_HDL_SHARED, &zp->z_sa_hdl));
} else {
zp->z_sa_hdl = sa_hdl;
@@ -371,6 +366,12 @@ zfs_inode_alloc(struct super_block *sb, struct inode **ip)
return (0);
}
+void
+zfs_inode_free(struct inode *ip)
+{
+ kmem_cache_free(znode_cache, ITOZ(ip));
+}
+
/*
* Called in multiple places when an inode should be destroyed.
*/
@@ -395,8 +396,15 @@ zfs_inode_destroy(struct inode *ip)
nvlist_free(zp->z_xattr_cached);
zp->z_xattr_cached = NULL;
}
-
- kmem_cache_free(znode_cache, zp);
+#ifndef HAVE_SOPS_FREE_INODE
+ /*
+ * inode needs to be freed in RCU callback. If we have
+ * super_operations->free_inode, Linux kernel will do call_rcu
+ * for us. But if we don't have it, since call_rcu is GPL-only
+ * symbol, we can only free synchronously and accept the risk.
+ */
+ zfs_inode_free(ip);
+#endif
}
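
When the kernel provides super_operations->free_inode (HAVE_SOPS_FREE_INODE), the VFS invokes that hook from an RCU callback, which is what makes the deferred free safe against lockless RCU-walk path lookups. The wiring lives in zpl_super.c (not shown in this section) and would look roughly like this sketch, where zpl_free_inode is a hypothetical thin wrapper:

    static void
    zpl_free_inode(struct inode *ip)
    {
            /*
             * Reached via call_rcu() inside the VFS, so the znode's
             * memory cannot be reused under a concurrent RCU lookup.
             */
            zfs_inode_free(ip);
    }

    /* In zpl_super_operations:
     *      .free_inode = zpl_free_inode,
     */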
static void
@@ -522,9 +530,9 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
return (NULL);
zp = ITOZ(ip);
- ASSERT(zp->z_dirlocks == NULL);
- ASSERT3P(zp->z_acl_cached, ==, NULL);
- ASSERT3P(zp->z_xattr_cached, ==, NULL);
+ ASSERT0P(zp->z_dirlocks);
+ ASSERT0P(zp->z_acl_cached);
+ ASSERT0P(zp->z_xattr_cached);
zp->z_unlinked = B_FALSE;
zp->z_atime_dirty = B_FALSE;
zp->z_is_ctldir = B_FALSE;
@@ -535,8 +543,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
zp->z_blksz = blksz;
zp->z_seq = 0x7A4653;
zp->z_sync_cnt = 0;
- zp->z_sync_writes_cnt = 0;
- zp->z_async_writes_cnt = 0;
zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
@@ -605,7 +611,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
* processing so do not hash unlinked znodes.
*/
if (links > 0)
- VERIFY3S(insert_inode_locked(ip), ==, 0);
+ VERIFY0(insert_inode_locked(ip));
mutex_enter(&zfsvfs->z_znodes_lock);
list_insert_tail(&zfsvfs->z_all_znodes, zp);
@@ -768,7 +774,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
/*
* With ZFS_PROJID flag, we can easily know whether there is
- * project ID stored on disk or not. See zfs_space_delta_cb().
+ * project ID stored on disk or not. See zpl_get_file_info().
*/
if (obj_type != DMU_OT_ZNODE &&
dmu_objset_projectquota_enabled(zfsvfs->z_os))
@@ -805,7 +811,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
}
/* Now add in all of the "SA" attributes */
- VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
+ VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
&sa_hdl));
/*
@@ -895,7 +901,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
acl_ids->z_fuid, acl_ids->z_fgid);
}
- VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
+ VERIFY0(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx));
if (!(flag & IS_ROOT_NODE)) {
/*
@@ -1194,7 +1200,7 @@ zfs_rezget(znode_t *zp)
}
rw_exit(&zp->z_xattr_lock);
- ASSERT(zp->z_sa_hdl == NULL);
+ ASSERT0P(zp->z_sa_hdl);
err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
if (err) {
zfs_znode_hold_exit(zfsvfs, zh);
@@ -1308,9 +1314,9 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
zh = zfs_znode_hold_enter(zfsvfs, obj);
if (acl_obj) {
VERIFY(!zp->z_is_sa);
- VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+ VERIFY0(dmu_object_free(os, acl_obj, tx));
}
- VERIFY(0 == dmu_object_free(os, obj, tx));
+ VERIFY0(dmu_object_free(os, obj, tx));
zfs_znode_dmu_fini(zp);
zfs_znode_hold_exit(zfsvfs, zh);
}
@@ -1530,7 +1536,7 @@ zfs_extend(znode_t *zp, uint64_t end)
zp->z_size = end;
- VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
+ VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
&zp->z_size, sizeof (zp->z_size), tx));
zfs_rangelock_exit(lr);
@@ -1720,7 +1726,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
NULL, &zp->z_pflags, 8);
}
- VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
+ VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
dmu_tx_commit(tx);
zfs_rangelock_exit(lr);
@@ -1787,7 +1793,7 @@ log:
NULL, &zp->z_pflags, 8);
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
@@ -1834,7 +1840,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
moid = MASTER_NODE_OBJ;
error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
DMU_OT_NONE, 0, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
/*
* Set starting attributes.
@@ -1847,7 +1853,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
const char *name;
ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
- VERIFY(nvpair_value_uint64(elem, &val) == 0);
+ VERIFY0(nvpair_value_uint64(elem, &val));
name = nvpair_name(elem);
if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
if (val < version)
@@ -1855,7 +1861,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
} else {
error = zap_update(os, moid, name, 8, 1, &val, tx);
}
- ASSERT(error == 0);
+ ASSERT0(error);
if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
norm = val;
else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
@@ -1863,7 +1869,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
}
ASSERT(version != 0);
error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
/*
* Create zap object used for SA attribute registration
@@ -1873,7 +1879,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
DMU_OT_NONE, 0, tx);
error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
} else {
sa_obj = 0;
}
@@ -1883,7 +1889,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
/*
* Create root znode. Create minimal znode/inode/zfsvfs/sb
@@ -1916,7 +1922,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
&zfsvfs->z_attr_table);
- ASSERT(error == 0);
+ ASSERT0(error);
/*
* Fold case on file systems that are always or sometimes case
@@ -1940,12 +1946,12 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
}
- VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
+ VERIFY0(zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
cr, NULL, &acl_ids, zfs_init_idmap));
zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
ASSERT3P(zp, ==, rootzp);
error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
zfs_acl_ids_free(&acl_ids);
atomic_set(&ZTOI(rootzp)->i_count, 0);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
index 0b04ec6866f4..81ac26cb0c93 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
@@ -202,7 +202,7 @@ zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags)
return (!!dentry->d_inode);
}
-static dentry_operations_t zpl_dops_snapdirs = {
+static const struct dentry_operations zpl_dops_snapdirs = {
/*
* Auto mounting of snapshots is only supported for 2.6.37 and
* newer kernels. Prior to this kernel the ops->follow_link()
@@ -215,6 +215,51 @@ static dentry_operations_t zpl_dops_snapdirs = {
.d_revalidate = zpl_snapdir_revalidate,
};
+/*
+ * For the .zfs control directory to work properly we must be able to override
+ * the default operations table and register custom .d_automount and
+ * .d_revalidate callbacks.
+ */
+static void
+set_snapdir_dentry_ops(struct dentry *dentry, unsigned int extraflags)
+{
+ static const unsigned int op_flags =
+ DCACHE_OP_HASH | DCACHE_OP_COMPARE |
+ DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE |
+ DCACHE_OP_PRUNE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_REAL;
+
+#ifdef HAVE_D_SET_D_OP
+ /*
+ * d_set_d_op() will set the DCACHE_OP_ flags according to what it
+ * finds in the passed dentry_operations, so we don't have to.
+ *
+ * We clear the flags and the old op table before calling d_set_d_op()
+ * because issues a warning when the dentry operations table is already
+ * set.
+ */
+ dentry->d_op = NULL;
+ dentry->d_flags &= ~op_flags;
+ d_set_d_op(dentry, &zpl_dops_snapdirs);
+ dentry->d_flags |= extraflags;
+#else
+ /*
+ * Since 6.17 there's no exported way to modify dentry ops, so we have
+ * to reach in and do it ourselves. This should be safe for our very
+ * narrow use case, which is to create or splice in an entry to give
+ * access to a snapshot.
+ *
+ * We need to set the op flags directly. We hardcode
+ * DCACHE_OP_REVALIDATE because that's the only operation we have; if
+ * we ever extend zpl_dops_snapdirs we will need to update the op flags
+ * to match.
+ */
+ spin_lock(&dentry->d_lock);
+ dentry->d_op = &zpl_dops_snapdirs;
+ dentry->d_flags &= ~op_flags;
+ dentry->d_flags |= DCACHE_OP_REVALIDATE | extraflags;
+ spin_unlock(&dentry->d_lock);
+#endif
+}
+
static struct dentry *
zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
unsigned int flags)
@@ -236,10 +281,7 @@ zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
return (ERR_PTR(error));
ASSERT(error == 0 || ip == NULL);
- d_clear_d_op(dentry);
- d_set_d_op(dentry, &zpl_dops_snapdirs);
- dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
-
+ set_snapdir_dentry_ops(dentry, DCACHE_NEED_AUTOMOUNT);
return (d_splice_alias(ip, dentry));
}
@@ -341,14 +383,20 @@ zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry)
return (error);
}
+#if defined(HAVE_IOPS_MKDIR_USERNS)
static int
-#ifdef HAVE_IOPS_MKDIR_USERNS
zpl_snapdir_mkdir(struct user_namespace *user_ns, struct inode *dip,
struct dentry *dentry, umode_t mode)
#elif defined(HAVE_IOPS_MKDIR_IDMAP)
+static int
+zpl_snapdir_mkdir(struct mnt_idmap *user_ns, struct inode *dip,
+ struct dentry *dentry, umode_t mode)
+#elif defined(HAVE_IOPS_MKDIR_DENTRY)
+static struct dentry *
zpl_snapdir_mkdir(struct mnt_idmap *user_ns, struct inode *dip,
struct dentry *dentry, umode_t mode)
#else
+static int
zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
#endif
{
@@ -367,8 +415,7 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0);
if (error == 0) {
- d_clear_d_op(dentry);
- d_set_d_op(dentry, &zpl_dops_snapdirs);
+ set_snapdir_dentry_ops(dentry, 0);
d_instantiate(dentry, ip);
}
@@ -376,7 +423,11 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
ASSERT3S(error, <=, 0);
crfree(cr);
+#if defined(HAVE_IOPS_MKDIR_DENTRY)
+ return (ERR_PTR(error));
+#else
return (error);
+#endif
}
/*
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
index 787d3cb31410..d07317b0d910 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2011, Lawrence Livermore National Security, LLC.
* Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2025, Klara, Inc.
*/
@@ -36,10 +37,7 @@
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_project.h>
-#if defined(HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS) || \
- defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO)
-#include <linux/pagemap.h>
-#endif
+#include <linux/pagemap_compat.h>
#include <linux/fadvise.h>
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
#include <linux/writeback.h>
@@ -109,60 +107,52 @@ zpl_iterate(struct file *filp, struct dir_context *ctx)
return (error);
}
+static inline int
+zpl_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc, void *data);
+
static int
zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
struct inode *inode = filp->f_mapping->host;
znode_t *zp = ITOZ(inode);
- zfsvfs_t *zfsvfs = ITOZSB(inode);
cred_t *cr = CRED();
int error;
fstrans_cookie_t cookie;
/*
- * The variables z_sync_writes_cnt and z_async_writes_cnt work in
- * tandem so that sync writes can detect if there are any non-sync
- * writes going on and vice-versa. The "vice-versa" part to this logic
- * is located in zfs_putpage() where non-sync writes check if there are
- * any ongoing sync writes. If any sync and non-sync writes overlap,
- * we do a commit to complete the non-sync writes since the latter can
- * potentially take several seconds to complete and thus block sync
- * writes in the upcoming call to filemap_write_and_wait_range().
- */
- atomic_inc_32(&zp->z_sync_writes_cnt);
- /*
- * If the following check does not detect an overlapping non-sync write
- * (say because it's just about to start), then it is guaranteed that
- * the non-sync write will detect this sync write. This is because we
- * always increment z_sync_writes_cnt / z_async_writes_cnt before doing
- * the check on z_async_writes_cnt / z_sync_writes_cnt here and in
- * zfs_putpage() respectively.
+ * Force dirty pages in the range out to the DMU and the log, ready
+ * for zil_commit() to write down.
+ *
+ * We call write_cache_pages() directly to ensure that zpl_putpage() is
+ * called with the flags we need. We need WB_SYNC_NONE to avoid a call
+ * to zil_commit() (since we're doing this as a kind of pre-sync); but
+ * we do need for_sync so that the pages remain in writeback until
+ * they're on disk, and so that we get an error if the DMU write fails.
*/
- if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
- if ((error = zpl_enter(zfsvfs, FTAG)) != 0) {
- atomic_dec_32(&zp->z_sync_writes_cnt);
+ if (filemap_range_has_page(inode->i_mapping, start, end)) {
+ int for_sync = 1;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = LONG_MAX,
+ .range_start = start,
+ .range_end = end,
+ };
+ error =
+ zpl_write_cache_pages(inode->i_mapping, &wbc, &for_sync);
+ if (error != 0) {
+ /*
+ * Unclear what state things are in. zfs_putpage() will
+ * ensure the pages remain dirty if they haven't been
+ * written down to the DMU, but because there may be
+ * nothing logged, we can't assume that zfs_sync() ->
+ * zil_commit() will give us a useful error. It's
+ * safest if we just error out here.
+ */
return (error);
}
- zil_commit(zfsvfs->z_log, zp->z_id);
- zpl_exit(zfsvfs, FTAG);
}
- error = filemap_write_and_wait_range(inode->i_mapping, start, end);
-
- /*
- * The sync write is not complete yet but we decrement
- * z_sync_writes_cnt since zfs_fsync() increments and decrements
- * it internally. If a non-sync write starts just after the decrement
- * operation but before we call zfs_fsync(), it may not detect this
- * overlapping sync write but it does not matter since we have already
- * gone past filemap_write_and_wait_range() and we won't block due to
- * the non-sync write.
- */
- atomic_dec_32(&zp->z_sync_writes_cnt);
-
- if (error)
- return (error);
-
crhold(cr);
cookie = spl_fstrans_mark();
error = -zfs_fsync(zp, datasync, cr);
@@ -226,7 +216,7 @@ zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
ssize_t count = iov_iter_count(to);
zfs_uio_t uio;
- zfs_uio_iov_iter_init(&uio, to, kiocb->ki_pos, count, 0);
+ zfs_uio_iov_iter_init(&uio, to, kiocb->ki_pos, count);
crhold(cr);
cookie = spl_fstrans_mark();
@@ -276,8 +266,7 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
if (ret)
return (ret);
- zfs_uio_iov_iter_init(&uio, from, kiocb->ki_pos, count,
- from->iov_offset);
+ zfs_uio_iov_iter_init(&uio, from, kiocb->ki_pos, count);
crhold(cr);
cookie = spl_fstrans_mark();
@@ -539,11 +528,30 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
if (sync_mode != wbc->sync_mode) {
if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
return (result);
- if (zfsvfs->z_log != NULL)
- zil_commit(zfsvfs->z_log, zp->z_id);
+
+ if (zfsvfs->z_log != NULL) {
+ /*
+ * We don't want to block here if the pool suspends,
+ * because this is not a syncing op by itself, but
+ * might be part of one that the caller will
+ * coordinate.
+ */
+ result = -zil_commit_flags(zfsvfs->z_log, zp->z_id,
+ ZIL_COMMIT_NOW);
+ }
+
zpl_exit(zfsvfs, FTAG);
/*
+ * If zil_commit_flags() failed, it's unclear what state things
+ * are currently in. putpage() has written back out what it can
+ * to the DMU, but it may not be on disk. We have little choice
+ * but to escape.
+ */
+ if (result != 0)
+ return (result);
+
+ /*
* We need to call write_cache_pages() again (we can't just
* return after the commit) because the previous call in
* non-SYNC mode does not guarantee that we got all the dirty
@@ -556,6 +564,7 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
return (result);
}
+#ifdef HAVE_VFS_WRITEPAGE
/*
* Write out dirty pages to the ARC, this function is only required to
* support mmap(2). Mapped pages may be dirtied by memory operations
@@ -572,6 +581,7 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc)
return (zpl_putpage(pp, wbc, &for_sync));
}
+#endif
/*
* The flag combination which matches the behavior of zfs_space() is
@@ -986,6 +996,27 @@ zpl_ioctl_setdosflags(struct file *filp, void __user *arg)
return (err);
}
+static int
+zpl_ioctl_rewrite(struct file *filp, void __user *arg)
+{
+ struct inode *ip = file_inode(filp);
+ zfs_rewrite_args_t args;
+ fstrans_cookie_t cookie;
+ int err;
+
+ if (copy_from_user(&args, arg, sizeof (args)))
+ return (-EFAULT);
+
+ if (unlikely(!(filp->f_mode & FMODE_WRITE)))
+ return (-EBADF);
+
+ cookie = spl_fstrans_mark();
+ err = -zfs_rewrite(ITOZ(ip), args.off, args.len, args.flags, args.arg);
+ spl_fstrans_unmark(cookie);
+
+ return (err);
+}
+
static long
zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
@@ -1004,12 +1035,8 @@ zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return (zpl_ioctl_getdosflags(filp, (void *)arg));
case ZFS_IOC_SETDOSFLAGS:
return (zpl_ioctl_setdosflags(filp, (void *)arg));
- case ZFS_IOC_COMPAT_FICLONE:
- return (zpl_ioctl_ficlone(filp, (void *)arg));
- case ZFS_IOC_COMPAT_FICLONERANGE:
- return (zpl_ioctl_ficlonerange(filp, (void *)arg));
- case ZFS_IOC_COMPAT_FIDEDUPERANGE:
- return (zpl_ioctl_fideduperange(filp, (void *)arg));
+ case ZFS_IOC_REWRITE:
+ return (zpl_ioctl_rewrite(filp, (void *)arg));
default:
return (-ENOTTY);
}
@@ -1047,7 +1074,9 @@ const struct address_space_operations zpl_address_space_operations = {
#else
.readpage = zpl_readpage,
#endif
+#ifdef HAVE_VFS_WRITEPAGE
.writepage = zpl_writepage,
+#endif
.writepages = zpl_writepages,
.direct_IO = zpl_direct_IO,
#ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS
@@ -1058,7 +1087,7 @@ const struct address_space_operations zpl_address_space_operations = {
#endif
#ifdef HAVE_VFS_MIGRATE_FOLIO
.migrate_folio = migrate_folio,
-#else
+#elif defined(HAVE_VFS_MIGRATEPAGE)
.migratepage = migrate_page,
#endif
};
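
For context, not part of the change: a minimal userspace sketch of driving the new rewrite ioctl added above. Only the field names (off, len, flags, arg) are taken from the handler; the header location for zfs_rewrite_args_t and ZFS_IOC_REWRITE, and the "len == 0 means through end of file" convention, are assumptions.

#include <sys/ioctl.h>
#include <sys/fs/zfs.h>		/* zfs_rewrite_args_t, ZFS_IOC_REWRITE (assumed location) */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>

int
rewrite_range(const char *path, unsigned long long off, unsigned long long len)
{
	/* The handler above requires the fd be opened for write (FMODE_WRITE). */
	int fd = open(path, O_WRONLY);
	if (fd < 0)
		return (-1);

	zfs_rewrite_args_t args;
	memset(&args, 0, sizeof (args));
	args.off = off;
	args.len = len;		/* assumed: 0 may mean "through end of file" */
	args.flags = 0;		/* no flags used in this sketch */
	args.arg = 0;

	int err = ioctl(fd, ZFS_IOC_REWRITE, &args);
	if (err < 0)
		fprintf(stderr, "ZFS_IOC_REWRITE: %s\n", strerror(errno));

	(void) close(fd);
	return (err);
}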
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c
index cb8562d21421..c40dde046142 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c
@@ -212,85 +212,3 @@ zpl_dedupe_file_range(struct file *src_file, loff_t src_off,
return (-EOPNOTSUPP);
}
#endif /* HAVE_VFS_DEDUPE_FILE_RANGE */
-
-/* Entry point for FICLONE, before Linux 4.5. */
-long
-zpl_ioctl_ficlone(struct file *dst_file, void *arg)
-{
- unsigned long sfd = (unsigned long)arg;
-
- struct file *src_file = fget(sfd);
- if (src_file == NULL)
- return (-EBADF);
-
- if (dst_file->f_op != src_file->f_op) {
- fput(src_file);
- return (-EXDEV);
- }
-
- size_t len = i_size_read(file_inode(src_file));
-
- ssize_t ret = zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len);
-
- fput(src_file);
-
- if (ret < 0) {
- if (ret == -EOPNOTSUPP)
- return (-ENOTTY);
- return (ret);
- }
-
- if (ret != len)
- return (-EINVAL);
-
- return (0);
-}
-
-/* Entry point for FICLONERANGE, before Linux 4.5. */
-long
-zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg)
-{
- zfs_ioc_compat_file_clone_range_t fcr;
-
- if (copy_from_user(&fcr, arg, sizeof (fcr)))
- return (-EFAULT);
-
- struct file *src_file = fget(fcr.fcr_src_fd);
- if (src_file == NULL)
- return (-EBADF);
-
- if (dst_file->f_op != src_file->f_op) {
- fput(src_file);
- return (-EXDEV);
- }
-
- size_t len = fcr.fcr_src_length;
- if (len == 0)
- len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset;
-
- ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset,
- dst_file, fcr.fcr_dest_offset, len);
-
- fput(src_file);
-
- if (ret < 0) {
- if (ret == -EOPNOTSUPP)
- return (-ENOTTY);
- return (ret);
- }
-
- if (ret != len)
- return (-EINVAL);
-
- return (0);
-}
-
-/* Entry point for FIDEDUPERANGE, before Linux 4.5. */
-long
-zpl_ioctl_fideduperange(struct file *filp, void *arg)
-{
- (void) arg;
-
- /* No support for dedup yet */
- return (-ENOTTY);
-}
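
Not part of the diff: with the pre-4.5 compat entry points removed, FICLONE and FICLONERANGE reach ZFS through the kernel's generic ioctl handling, which calls ->remap_file_range() itself, so userspace is unchanged. A minimal sketch of a clone via FICLONE; both files must be on the same OpenZFS filesystem or the kernel returns EXDEV as usual.

#include <sys/ioctl.h>
#include <linux/fs.h>	/* FICLONE */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>

int
clone_file(const char *src, const char *dst)
{
	int err = -1;
	int sfd = open(src, O_RDONLY);
	int dfd = open(dst, O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (sfd >= 0 && dfd >= 0) {
		/* On Linux >= 4.5 the VFS routes this to ->remap_file_range(). */
		err = ioctl(dfd, FICLONE, sfd);
		if (err < 0)
			fprintf(stderr, "FICLONE: %s\n", strerror(errno));
	}

	if (sfd >= 0)
		(void) close(sfd);
	if (dfd >= 0)
		(void) close(dfd);
	return (err);
}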
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
index 85df9b9acf28..f97662d052c7 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
@@ -247,7 +247,7 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
* and fifos, but we want to know if this behavior ever changes.
*/
if (S_ISSOCK(mode) || S_ISFIFO(mode))
- ASSERT(rdev == 0);
+ ASSERT0(rdev);
crhold(cr);
vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
@@ -374,14 +374,20 @@ zpl_unlink(struct inode *dir, struct dentry *dentry)
return (error);
}
+#if defined(HAVE_IOPS_MKDIR_USERNS)
static int
-#ifdef HAVE_IOPS_MKDIR_USERNS
zpl_mkdir(struct user_namespace *user_ns, struct inode *dir,
struct dentry *dentry, umode_t mode)
#elif defined(HAVE_IOPS_MKDIR_IDMAP)
+static int
+zpl_mkdir(struct mnt_idmap *user_ns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
+#elif defined(HAVE_IOPS_MKDIR_DENTRY)
+static struct dentry *
zpl_mkdir(struct mnt_idmap *user_ns, struct inode *dir,
struct dentry *dentry, umode_t mode)
#else
+static int
zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
#endif
{
@@ -390,12 +396,14 @@ zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
znode_t *zp;
int error;
fstrans_cookie_t cookie;
-#if !(defined(HAVE_IOPS_MKDIR_USERNS) || defined(HAVE_IOPS_MKDIR_IDMAP))
+#if !(defined(HAVE_IOPS_MKDIR_USERNS) || \
+ defined(HAVE_IOPS_MKDIR_IDMAP) || defined(HAVE_IOPS_MKDIR_DENTRY))
zidmap_t *user_ns = kcred->user_ns;
#endif
if (is_nametoolong(dentry)) {
- return (-ENAMETOOLONG);
+ error = -ENAMETOOLONG;
+ goto err;
}
crhold(cr);
@@ -422,9 +430,14 @@ zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
spl_fstrans_unmark(cookie);
kmem_free(vap, sizeof (vattr_t));
crfree(cr);
- ASSERT3S(error, <=, 0);
+err:
+ ASSERT3S(error, <=, 0);
+#if defined(HAVE_IOPS_MKDIR_DENTRY)
+ return (error != 0 ? ERR_PTR(error) : NULL);
+#else
return (error);
+#endif
}
static int
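
Aside, illustration only: a sketch of the return-value convention the HAVE_IOPS_MKDIR_DENTRY branch above encodes, and how a VFS-side caller would hypothetically decode it. Helper names here are invented, not part of the patch.

#include <linux/err.h>		/* ERR_PTR, IS_ERR, PTR_ERR */
#include <linux/dcache.h>	/* struct dentry */

/* Encode: error is 0 or a negative errno, per the ASSERT3S above. */
static struct dentry *
encode_mkdir_result(int error)
{
	/* NULL tells the VFS the dentry it passed in was used as-is. */
	return (error != 0 ? ERR_PTR(error) : NULL);
}

/* Decode: how a hypothetical caller would recover the errno. */
static int
decode_mkdir_result(struct dentry *d)
{
	if (IS_ERR(d))
		return (PTR_ERR(d));
	return (0);
}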
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
index 40c25e464c5d..444948d03cb3 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2011, Lawrence Livermore National Security, LLC.
* Copyright (c) 2023, Datto Inc. All rights reserved.
+ * Copyright (c) 2025, Klara, Inc.
*/
@@ -31,7 +32,22 @@
#include <sys/zfs_ctldir.h>
#include <sys/zpl.h>
#include <linux/iversion.h>
+#include <linux/version.h>
+/*
+ * What to do when the last reference to an inode is released. If 0, the kernel
+ * will cache it on the superblock. If 1, the inode will be freed immediately.
+ * See zpl_drop_inode().
+ */
+int zfs_delete_inode = 0;
+
+/*
+ * What to do when the last reference to a dentry is released. If 0, the kernel
+ * will cache it until the entry (file) is destroyed. If 1, the dentry will be
+ * marked for cleanup, at which time its inode reference will be released. See
+ * zpl_dentry_delete().
+ */
+int zfs_delete_dentry = 0;
static struct inode *
zpl_inode_alloc(struct super_block *sb)
@@ -44,10 +60,19 @@ zpl_inode_alloc(struct super_block *sb)
return (ip);
}
+#ifdef HAVE_SOPS_FREE_INODE
+static void
+zpl_inode_free(struct inode *ip)
+{
+ ASSERT0(atomic_read(&ip->i_count));
+ zfs_inode_free(ip);
+}
+#endif
+
static void
zpl_inode_destroy(struct inode *ip)
{
- ASSERT(atomic_read(&ip->i_count) == 0);
+ ASSERT0(atomic_read(&ip->i_count));
zfs_inode_destroy(ip);
}
@@ -67,11 +92,36 @@ zpl_dirty_inode(struct inode *ip, int flags)
}
/*
- * When ->drop_inode() is called its return value indicates if the
- * inode should be evicted from the inode cache. If the inode is
- * unhashed and has no links the default policy is to evict it
- * immediately.
+ * ->drop_inode() is called when the last reference to an inode is released.
+ * Its return value indicates if the inode should be destroyed immediately, or
+ * cached on the superblock structure.
+ *
+ * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns
+ * "destroy immediately" if the inode is unhashed and has no links (roughly: no
+ * longer exists on disk). On datasets with millions of rarely-accessed files,
+ * this can cause a large amount of memory to be "pinned" by cached inodes,
+ * which in turn pin their associated dnodes and dbufs, until the kernel starts
+ * reporting memory pressure and requests OpenZFS release some memory (see
+ * zfs_prune()).
+ *
+ * When set to 1, we call generic_delete_inode(), which always returns
+ * "destroy immediately", so inodes are destroyed as soon as their last
+ * reference is released, returning their associated dnodes and dbufs to
+ * the dbuf cache and the ARC to be evicted as normal.
*
+ * Note that the "last reference" doesn't always mean the last _userspace_
+ * reference; the dentry cache also holds a reference, so "busy" inodes will
+ * still be kept alive that way (subject to dcache tuning).
+ */
+static int
+zpl_drop_inode(struct inode *ip)
+{
+ if (zfs_delete_inode)
+ return (generic_delete_inode(ip));
+ return (generic_drop_inode(ip));
+}
+
+/*
* The ->evict_inode() callback must minimally truncate the inode pages,
* and call clear_inode(). For 2.6.35 and later kernels this will
* simply update the inode state, with the sync occurring before the
@@ -105,6 +155,42 @@ zpl_put_super(struct super_block *sb)
ASSERT3S(error, <=, 0);
}
+/*
+ * zfs_sync() is the underlying implementation for the sync(2) and syncfs(2)
+ * syscalls, via sb->s_op->sync_fs().
+ *
+ * Before kernel 5.17 (torvalds/linux@5679897eb104), syncfs() ->
+ * sync_filesystem() would ignore the return from sync_fs(), instead only
+ * considering the error from syncing the underlying block device (sb->s_dev).
+ * Since OpenZFS doesn't _have_ an underlying block device, there's no way for
+ * us to report a sync error directly.
+ *
+ * However, in 5.8 (torvalds/linux@735e4ae5ba28) the superblock gained an extra
+ * error store `s_wb_err`, to carry errors seen on page writeback since the
+ * last call to syncfs(). If sync_filesystem() does not return an error, any
+ * existing writeback error on the superblock will be used instead (and cleared
+ * either way). We don't otherwise use s_wb_err (page writeback is a different
+ * thing for us), so on 5.8 up to 5.17 we can store our error there to get
+ * syncfs() to return it.
+ *
+ * Before 5.8, we have no other good options - no matter what happens, the
+ * userspace program will be told the call has succeeded, and so we must make
+ * it so. Therefore, when we are asked to wait for sync to complete (wait ==
+ * 1), if zfs_sync() has returned an error we have no choice but to block,
+ * regardless of the reason.
+ *
+ * The 5.17 change was backported to the 5.10, 5.15 and 5.16 series, and likely
+ * to some vendor kernels. Meanwhile, s_wb_err is still in use in 6.15 (the
+ * mainline Linux series at time of writing), and has likely been backported to
+ * vendor kernels before 5.8. We don't really want to use a workaround when we
+ * don't have to, but we can't really detect whether or not sync_filesystem()
+ * will return our errors (without a difficult runtime test anyway). So, we use
+ * a static version check: any kernel reporting its version as 5.17+ will use a
+ * direct error return; otherwise, we'll use s_wb_err if it was detected at
+ * configure time (5.8-5.16 + vendor backports), and if it's unavailable, we
+ * will block to ensure the correct semantics.
+ *
+ * See https://github.com/openzfs/zfs/issues/17416 for further discussion.
+ */
static int
zpl_sync_fs(struct super_block *sb, int wait)
{
@@ -115,10 +201,28 @@ zpl_sync_fs(struct super_block *sb, int wait)
crhold(cr);
cookie = spl_fstrans_mark();
error = -zfs_sync(sb, wait, cr);
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)
+#ifdef HAVE_SUPER_BLOCK_S_WB_ERR
+ if (error && wait)
+ errseq_set(&sb->s_wb_err, error);
+#else
+ if (error && wait) {
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+ ASSERT3P(zfsvfs, !=, NULL);
+ if (zfs_enter(zfsvfs, FTAG) == 0) {
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+ zfs_exit(zfsvfs, FTAG);
+ error = 0;
+ }
+ }
+#endif
+#endif /* < 5.17.0 */
+
spl_fstrans_unmark(cookie);
crfree(cr);
- ASSERT3S(error, <=, 0);
+ ASSERT3S(error, <=, 0);
return (error);
}
@@ -400,9 +504,13 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg)
const struct super_operations zpl_super_operations = {
.alloc_inode = zpl_inode_alloc,
+#ifdef HAVE_SOPS_FREE_INODE
+ .free_inode = zpl_inode_free,
+#endif
.destroy_inode = zpl_inode_destroy,
.dirty_inode = zpl_dirty_inode,
.write_inode = NULL,
+ .drop_inode = zpl_drop_inode,
.evict_inode = zpl_evict_inode,
.put_super = zpl_put_super,
.sync_fs = zpl_sync_fs,
@@ -413,6 +521,35 @@ const struct super_operations zpl_super_operations = {
.show_stats = NULL,
};
+/*
+ * ->d_delete() is called when the last reference to a dentry is released. Its
+ * return value indicates if the dentry should be destroyed immediately, or
+ * retained in the dentry cache.
+ *
+ * By default (zfs_delete_dentry=0) the kernel will always cache unused
+ * entries. Each dentry holds an inode reference, so cached dentries can hold
+ * the final inode reference indefinitely, leading to the inode and its related
+ * data being pinned (see zpl_drop_inode()).
+ *
+ * When set to 1, we signal that the dentry should be destroyed immediately and
+ * never cached. This reduces memory usage, at the cost of higher overheads to
+ * lookup a file, as the inode and its underlying data (dnode/dbuf) need to be
+ * reloaded and reinflated.
+ *
+ * Note that userspace does not have direct control over dentry references and
+ * reclaim; rather, this is part of the kernel's caching and reclaim subsystems
+ * (e.g. vm.vfs_cache_pressure).
+ */
+static int
+zpl_dentry_delete(const struct dentry *dentry)
+{
+ return (zfs_delete_dentry ? 1 : 0);
+}
+
+const struct dentry_operations zpl_dentry_operations = {
+ .d_delete = zpl_dentry_delete,
+};
+
struct file_system_type zpl_fs_type = {
.owner = THIS_MODULE,
.name = ZFS_DRIVER,
@@ -424,3 +561,10 @@ struct file_system_type zpl_fs_type = {
.mount = zpl_mount,
.kill_sb = zpl_kill_sb,
};
+
+ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW,
+ "Delete inodes as soon as the last reference is released.");
+
+ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW,
+ "Delete dentries from dentry cache as soon as the last reference is "
+ "released.");
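
For illustration only, not from the patch: from userspace, the sync_fs error plumbing described in zpl_sync_fs() above means a pool-level sync error can surface from syncfs(2) on 5.17+ kernels, or via the superblock's s_wb_err store on 5.8-5.16; on older kernels the call blocks until the txg syncs instead. A minimal runnable check (syncfs() needs _GNU_SOURCE with glibc):

#define _GNU_SOURCE	/* syncfs() */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>

int
check_syncfs(const char *path_on_zfs)
{
	int fd = open(path_on_zfs, O_RDONLY);
	if (fd < 0)
		return (-1);

	/* With the changes above, a zfs_sync() failure is reported here. */
	int err = syncfs(fd);
	if (err < 0)
		fprintf(stderr, "syncfs: %s\n", strerror(errno));

	(void) close(fd);
	return (err);
}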
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
index a098197e7448..d93282db815a 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
@@ -1494,7 +1494,7 @@ zpl_posix_acl_free(void *arg)
acl_rel_head = NULL;
if (cmpxchg(&acl_rel_tail, &a->next,
&acl_rel_head) == &a->next) {
- ASSERT3P(a->next, ==, NULL);
+ ASSERT0P(a->next);
a->next = freelist;
freelist = a;
break;
@@ -1544,7 +1544,7 @@ zpl_posix_acl_release_impl(struct posix_acl *acl)
a->time = ddi_get_lbolt();
/* atomically points tail to us and get the previous tail */
prev = xchg(&acl_rel_tail, &a->next);
- ASSERT3P(*prev, ==, NULL);
+ ASSERT0P(*prev);
*prev = a;
/* if it was empty before, schedule the free task */
if (prev == &acl_rel_head)
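
Aside, not part of the change: the hunk above touches a lock-free multi-producer append (the kernel's xchg() on a tail pointer-to-pointer, then publishing through the previous tail). A stripped-down sketch of that pattern, with invented names:

#include <linux/atomic.h>	/* xchg() */

struct node {
	struct node *next;
};

static struct node *list_head;			/* consumed elsewhere */
static struct node **list_tail = &list_head;	/* producers race on this */

static void
lockfree_append(struct node *n)
{
	n->next = NULL;
	/* Atomically claim the tail slot; prev points at the old tail's next. */
	struct node **prev = xchg(&list_tail, &n->next);
	/* The list is momentarily disconnected here, exactly as above. */
	*prev = n;
}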
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index c8a04539258f..967a018640e1 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
- * Copyright (c) 2024, Klara, Inc.
+ * Copyright (c) 2024, 2025, Klara, Inc.
*/
#include <sys/dataset_kstats.h>
@@ -51,21 +51,12 @@ static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
struct request *rq, boolean_t force_sync);
static unsigned int zvol_major = ZVOL_MAJOR;
-static unsigned int zvol_request_sync = 0;
-static unsigned int zvol_prefetch_bytes = (128 * 1024);
static unsigned long zvol_max_discard_blocks = 16384;
-/*
- * Switch taskq at multiple of 512 MB offset. This can be set to a lower value
- * to utilize more threads for small files but may affect prefetch hits.
- */
-#define ZVOL_TASKQ_OFFSET_SHIFT 29
-
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static unsigned int zvol_open_timeout_ms = 1000;
#endif
-static unsigned int zvol_threads = 0;
static unsigned int zvol_blk_mq_threads = 0;
static unsigned int zvol_blk_mq_actual_threads;
static boolean_t zvol_use_blk_mq = B_FALSE;
@@ -82,8 +73,6 @@ static boolean_t zvol_use_blk_mq = B_FALSE;
*/
static unsigned int zvol_blk_mq_blocks_per_thread = 8;
-static unsigned int zvol_num_taskqs = 0;
-
#ifndef BLKDEV_DEFAULT_RQ
/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
#define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
@@ -95,8 +84,9 @@ static unsigned int zvol_num_taskqs = 0;
static inline void
zvol_end_io(struct bio *bio, struct request *rq, int error)
{
+ ASSERT3U(error, >=, 0);
if (bio) {
- bio->bi_status = errno_to_bi_status(-error);
+ bio->bi_status = errno_to_bi_status(error);
bio_endio(bio);
} else {
blk_mq_end_request(rq, errno_to_bi_status(error));
@@ -117,45 +107,8 @@ struct zvol_state_os {
boolean_t use_blk_mq;
};
-typedef struct zv_taskq {
- uint_t tqs_cnt;
- taskq_t **tqs_taskq;
-} zv_taskq_t;
-static zv_taskq_t zvol_taskqs;
static struct ida zvol_ida;
-typedef struct zv_request_stack {
- zvol_state_t *zv;
- struct bio *bio;
- struct request *rq;
-} zv_request_t;
-
-typedef struct zv_work {
- struct request *rq;
- struct work_struct work;
-} zv_work_t;
-
-typedef struct zv_request_task {
- zv_request_t zvr;
- taskq_ent_t ent;
-} zv_request_task_t;
-
-static zv_request_task_t *
-zv_request_task_create(zv_request_t zvr)
-{
- zv_request_task_t *task;
- task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
- taskq_init_ent(&task->ent);
- task->zvr = zvr;
- return (task);
-}
-
-static void
-zv_request_task_free(zv_request_task_t *task)
-{
- kmem_free(task, sizeof (*task));
-}
-
/*
* This is called when a new block multiqueue request comes in. A request
* contains one or more BIOs.
@@ -256,8 +209,14 @@ zvol_write(zv_request_t *zvr)
disk = zv->zv_zso->zvo_disk;
/* bio marked as FLUSH need to flush before write */
- if (io_is_flush(bio, rq))
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (io_is_flush(bio, rq)) {
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (error != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+			zvol_end_io(bio, rq, error);
+ return;
+ }
+ }
/* Some requests are just for flush and nothing else. */
if (io_size(bio, rq) == 0) {
@@ -305,7 +264,8 @@ zvol_write(zv_request_t *zvr)
dmu_tx_abort(tx);
break;
}
- error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
+ error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
+ DMU_READ_PREFETCH);
if (error == 0) {
zvol_log_write(zv, tx, off, bytes, sync);
}
@@ -320,8 +280,8 @@ zvol_write(zv_request_t *zvr)
dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
task_io_account_write(nwritten);
- if (sync)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (error == 0 && sync)
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
rw_exit(&zv->zv_suspend_lock);
@@ -329,7 +289,7 @@ zvol_write(zv_request_t *zvr)
blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
}
- zvol_end_io(bio, rq, -error);
+ zvol_end_io(bio, rq, error);
}
static void
@@ -408,7 +368,7 @@ zvol_discard(zv_request_t *zvr)
zfs_rangelock_exit(lr);
if (error == 0 && sync)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
unlock:
rw_exit(&zv->zv_suspend_lock);
@@ -418,7 +378,7 @@ unlock:
start_time);
}
- zvol_end_io(bio, rq, -error);
+ zvol_end_io(bio, rq, error);
}
static void
@@ -475,7 +435,8 @@ zvol_read(zv_request_t *zvr)
if (bytes > volsize - uio.uio_loffset)
bytes = volsize - uio.uio_loffset;
- error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
+ error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
+ DMU_READ_PREFETCH);
if (error) {
/* convert checksum errors into IO errors */
if (error == ECKSUM)
@@ -495,7 +456,7 @@ zvol_read(zv_request_t *zvr)
blk_generic_end_io_acct(q, disk, READ, bio, start_time);
}
- zvol_end_io(bio, rq, -error);
+ zvol_end_io(bio, rq, error);
}
static void
@@ -523,10 +484,31 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
fstrans_cookie_t cookie = spl_fstrans_mark();
uint64_t offset = io_offset(bio, rq);
uint64_t size = io_size(bio, rq);
- int rw = io_data_dir(bio, rq);
+ int rw;
+
+ if (rq != NULL) {
+ /*
+ * Flush & trim requests go down the zvol_write codepath. Or
+ * more specifically:
+ *
+		 * If the request is a write, or if it's op_is_sync() and not a
+		 * read, or if it's a flush or a discard, then send the request
+		 * down the write path.
+ */
+ if (op_is_write(rq->cmd_flags) ||
+ (op_is_sync(rq->cmd_flags) && req_op(rq) != REQ_OP_READ) ||
+ req_op(rq) == REQ_OP_FLUSH ||
+ op_is_discard(rq->cmd_flags)) {
+ rw = WRITE;
+ } else {
+ rw = READ;
+ }
+ } else {
+ rw = bio_data_dir(bio);
+ }
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
- zvol_end_io(bio, rq, -SET_ERROR(ENXIO));
+ zvol_end_io(bio, rq, SET_ERROR(ENXIO));
goto out;
}
@@ -545,7 +527,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
(long long unsigned)offset,
(long unsigned)size);
- zvol_end_io(bio, rq, -SET_ERROR(EIO));
+ zvol_end_io(bio, rq, SET_ERROR(EIO));
goto out;
}
@@ -558,8 +540,8 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
#ifdef HAVE_BLK_MQ_RQ_HCTX
blk_mq_hw_queue = rq->mq_hctx->queue_num;
#else
- blk_mq_hw_queue =
- rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
+ blk_mq_hw_queue = rq->q->queue_hw_ctx[
+ rq->q->mq_map[raw_smp_processor_id()]]->queue_num;
#endif
taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
blk_mq_hw_queue);
@@ -567,7 +549,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
if (rw == WRITE) {
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
- zvol_end_io(bio, rq, -SET_ERROR(EROFS));
+ zvol_end_io(bio, rq, SET_ERROR(EROFS));
goto out;
}
@@ -718,28 +700,19 @@ zvol_open(struct block_device *bdev, fmode_t flag)
retry:
#endif
- rw_enter(&zvol_state_lock, RW_READER);
- /*
- * Obtain a copy of private_data under the zvol_state_lock to make
- * sure that either the result of zvol free code path setting
- * disk->private_data to NULL is observed, or zvol_os_free()
- * is not called on this zv because of the positive zv_open_count.
- */
+
#ifdef HAVE_BLK_MODE_T
- zv = disk->private_data;
+ zv = atomic_load_ptr(&disk->private_data);
#else
- zv = bdev->bd_disk->private_data;
+ zv = atomic_load_ptr(&bdev->bd_disk->private_data);
#endif
if (zv == NULL) {
- rw_exit(&zvol_state_lock);
return (-SET_ERROR(ENXIO));
}
mutex_enter(&zv->zv_state_lock);
-
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
mutex_exit(&zv->zv_state_lock);
- rw_exit(&zvol_state_lock);
return (-SET_ERROR(ENXIO));
}
@@ -751,8 +724,28 @@ retry:
if (zv->zv_open_count == 0) {
if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
mutex_exit(&zv->zv_state_lock);
+
+ /*
+ * Removal may happen while the locks are down, so
+ * we can't trust zv any longer; we have to start over.
+ */
+#ifdef HAVE_BLK_MODE_T
+ zv = atomic_load_ptr(&disk->private_data);
+#else
+ zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+#endif
+ if (zv == NULL)
+ return (-SET_ERROR(ENXIO));
+
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);
+ return (-SET_ERROR(ENXIO));
+ }
+
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
@@ -763,7 +756,6 @@ retry:
drop_suspend = B_TRUE;
}
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -860,11 +852,11 @@ zvol_release(struct gendisk *disk, fmode_t unused)
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
(void) unused;
#endif
- zvol_state_t *zv;
boolean_t drop_suspend = B_TRUE;
- rw_enter(&zvol_state_lock, RW_READER);
- zv = disk->private_data;
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
+ if (zv == NULL)
+ return;
mutex_enter(&zv->zv_state_lock);
ASSERT3U(zv->zv_open_count, >, 0);
@@ -878,6 +870,15 @@ zvol_release(struct gendisk *disk, fmode_t unused)
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ /*
+ * Unlike in zvol_open(), we don't check if removal
+ * started here, because we might be one of the openers
+ * that needs to be thrown out! If we're the last, we
+ * need to call zvol_last_close() below to finish
+ * cleanup. So, no special treatment for us.
+ */
+
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 1) {
rw_exit(&zv->zv_suspend_lock);
@@ -887,7 +888,6 @@ zvol_release(struct gendisk *disk, fmode_t unused)
} else {
drop_suspend = B_FALSE;
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -907,9 +907,10 @@ static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
- zvol_state_t *zv = bdev->bd_disk->private_data;
int error = 0;
+ zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+ ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
switch (cmd) {
@@ -932,16 +933,18 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode,
case BLKZNAME:
mutex_enter(&zv->zv_state_lock);
- error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
+		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
		mutex_exit(&zv->zv_state_lock);
+		if (error)
+			error = SET_ERROR(EFAULT);
break;
default:
- error = -ENOTTY;
+ error = SET_ERROR(ENOTTY);
break;
}
- return (SET_ERROR(error));
+ return (-error);
}
#ifdef CONFIG_COMPAT
@@ -960,9 +963,8 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
unsigned int mask = 0;
- rw_enter(&zvol_state_lock, RW_READER);
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
- zvol_state_t *zv = disk->private_data;
if (zv != NULL) {
mutex_enter(&zv->zv_state_lock);
mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
@@ -970,17 +972,14 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing)
mutex_exit(&zv->zv_state_lock);
}
- rw_exit(&zvol_state_lock);
-
return (mask);
}
static int
zvol_revalidate_disk(struct gendisk *disk)
{
- rw_enter(&zvol_state_lock, RW_READER);
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
- zvol_state_t *zv = disk->private_data;
if (zv != NULL) {
mutex_enter(&zv->zv_state_lock);
set_capacity(zv->zv_zso->zvo_disk,
@@ -988,8 +987,6 @@ zvol_revalidate_disk(struct gendisk *disk)
mutex_exit(&zv->zv_state_lock);
}
- rw_exit(&zvol_state_lock);
-
return (0);
}
@@ -1008,16 +1005,6 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
return (0);
}
-void
-zvol_os_clear_private(zvol_state_t *zv)
-{
- /*
- * Cleared while holding zvol_state_lock as a writer
- * which will prevent zvol_open() from opening it.
- */
- zv->zv_zso->zvo_disk->private_data = NULL;
-}
-
/*
* Provide a simple virtual geometry for legacy compatibility. For devices
* smaller than 1 MiB a small head and sector count is used to allow very
@@ -1027,9 +1014,10 @@ zvol_os_clear_private(zvol_state_t *zv)
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
- zvol_state_t *zv = bdev->bd_disk->private_data;
sector_t sectors;
+ zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+ ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
sectors = get_capacity(zv->zv_zso->zvo_disk);
@@ -1348,27 +1336,30 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
* Allocate memory for a new zvol_state_t and setup the required
* request queue and generic disk structures for the block device.
*/
-static zvol_state_t *
-zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
+static int
+zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize,
+ zvol_state_t **zvp)
{
zvol_state_t *zv;
struct zvol_state_os *zso;
uint64_t volmode;
int ret;
- if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
- return (NULL);
+ ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL);
+ if (ret)
+ return (ret);
if (volmode == ZFS_VOLMODE_DEFAULT)
volmode = zvol_volmode;
if (volmode == ZFS_VOLMODE_NONE)
- return (NULL);
+ return (0);
zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
zv->zv_zso = zso;
zv->zv_volmode = volmode;
+ zv->zv_volsize = volsize;
zv->zv_volblocksize = volblocksize;
list_link_init(&zv->zv_next);
@@ -1397,13 +1388,15 @@ zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
*/
if (zv->zv_zso->use_blk_mq) {
ret = zvol_alloc_blk_mq(zv, &limits);
+ if (ret != 0)
+ goto out_kmem;
zso->zvo_disk->fops = &zvol_ops_blk_mq;
} else {
ret = zvol_alloc_non_blk_mq(zso, &limits);
+ if (ret != 0)
+ goto out_kmem;
zso->zvo_disk->fops = &zvol_ops;
}
- if (ret != 0)
- goto out_kmem;
/* Limit read-ahead to a single page to prevent over-prefetching. */
blk_queue_set_read_ahead(zso->zvo_queue, 1);
@@ -1440,61 +1433,79 @@ zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
ZVOL_DEV_NAME, (dev & MINORMASK));
- return (zv);
+ *zvp = zv;
+ return (ret);
out_kmem:
kmem_free(zso, sizeof (struct zvol_state_os));
kmem_free(zv, sizeof (zvol_state_t));
- return (NULL);
+ return (ret);
}
-/*
- * Cleanup then free a zvol_state_t which was created by zvol_alloc().
- * At this time, the structure is not opened by anyone, is taken off
- * the zvol_state_list, and has its private data set to NULL.
- * The zvol_state_lock is dropped.
- *
- * This function may take many milliseconds to complete (e.g. we've seen
- * it take over 256ms), due to the calls to "blk_cleanup_queue" and
- * "del_gendisk". Thus, consumers need to be careful to account for this
- * latency when calling this function.
- */
void
-zvol_os_free(zvol_state_t *zv)
+zvol_os_remove_minor(zvol_state_t *zv)
{
-
- ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
- ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
ASSERT0(zv->zv_open_count);
- ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);
+ ASSERT0(atomic_read(&zv->zv_suspend_ref));
+ ASSERT(zv->zv_flags & ZVOL_REMOVING);
- rw_destroy(&zv->zv_suspend_lock);
- zfs_rangelock_fini(&zv->zv_rangelock);
+ struct zvol_state_os *zso = zv->zv_zso;
+ zv->zv_zso = NULL;
+
+ /* Clearing private_data will make new callers return immediately. */
+ atomic_store_ptr(&zso->zvo_disk->private_data, NULL);
+
+ /*
+ * Drop the state lock before calling del_gendisk(). There may be
+ * callers waiting to acquire it, but del_gendisk() will block until
+ * they exit, which would deadlock.
+ */
+ mutex_exit(&zv->zv_state_lock);
- del_gendisk(zv->zv_zso->zvo_disk);
+ del_gendisk(zso->zvo_disk);
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
#if defined(HAVE_BLK_CLEANUP_DISK)
- blk_cleanup_disk(zv->zv_zso->zvo_disk);
+ blk_cleanup_disk(zso->zvo_disk);
#else
- put_disk(zv->zv_zso->zvo_disk);
+ put_disk(zso->zvo_disk);
#endif
#else
- blk_cleanup_queue(zv->zv_zso->zvo_queue);
- put_disk(zv->zv_zso->zvo_disk);
+ blk_cleanup_queue(zso->zvo_queue);
+ put_disk(zso->zvo_disk);
#endif
- if (zv->zv_zso->use_blk_mq)
- blk_mq_free_tag_set(&zv->zv_zso->tag_set);
+ if (zso->use_blk_mq)
+ blk_mq_free_tag_set(&zso->tag_set);
- ida_simple_remove(&zvol_ida,
- MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
+ ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);
+
+ kmem_free(zso, sizeof (struct zvol_state_os));
+
+ mutex_enter(&zv->zv_state_lock);
+}
+
+void
+zvol_os_free(zvol_state_t *zv)
+{
+
+ ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT0(zv->zv_open_count);
+ ASSERT0P(zv->zv_zso);
+
+ ASSERT0P(zv->zv_objset);
+ ASSERT0P(zv->zv_zilog);
+ ASSERT0P(zv->zv_dn);
+
+ rw_destroy(&zv->zv_suspend_lock);
+ zfs_rangelock_fini(&zv->zv_rangelock);
cv_destroy(&zv->zv_removing_cv);
mutex_destroy(&zv->zv_state_lock);
dataset_kstats_destroy(&zv->zv_kstat);
- kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
kmem_free(zv, sizeof (zvol_state_t));
}
@@ -1514,7 +1525,9 @@ __zvol_os_add_disk(struct gendisk *disk)
{
int error = 0;
#ifdef HAVE_ADD_DISK_RET
- error = add_disk(disk);
+ error = -add_disk(disk);
+ if (error)
+ error = SET_ERROR(error);
#else
add_disk(disk);
#endif
@@ -1606,7 +1619,7 @@ zvol_os_add_disk(struct gendisk *disk)
int
zvol_os_create_minor(const char *name)
{
- zvol_state_t *zv;
+ zvol_state_t *zv = NULL;
objset_t *os;
dmu_object_info_t *doi;
uint64_t volsize;
@@ -1655,18 +1668,16 @@ zvol_os_create_minor(const char *name)
if (error)
goto out_dmu_objset_disown;
- zv = zvol_alloc(MKDEV(zvol_major, minor), name,
- doi->doi_data_block_size);
- if (zv == NULL) {
- error = SET_ERROR(EAGAIN);
+ error = zvol_alloc(MKDEV(zvol_major, minor), name,
+ volsize, doi->doi_data_block_size, &zv);
+ if (error || zv == NULL)
goto out_dmu_objset_disown;
- }
+
zv->zv_hash = hash;
if (dmu_objset_is_snapshot(os))
zv->zv_flags |= ZVOL_RDONLY;
- zv->zv_volsize = volsize;
zv->zv_objset = os;
/* Default */
@@ -1691,11 +1702,11 @@ zvol_os_create_minor(const char *name)
blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif
- ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+ ASSERT0P(zv->zv_kstat.dk_kstats);
error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
if (error)
goto out_dmu_objset_disown;
- ASSERT3P(zv->zv_zilog, ==, NULL);
+ ASSERT0P(zv->zv_zilog);
zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
if (spa_writeable(dmu_objset_spa(os))) {
if (zil_replay_disable)
@@ -1733,7 +1744,7 @@ out_doi:
* zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
* directly as well.
*/
- if (error == 0) {
+ if (error == 0 && zv) {
rw_enter(&zvol_state_lock, RW_WRITER);
zvol_insert(zv);
rw_exit(&zvol_state_lock);
@@ -1745,7 +1756,7 @@ out_doi:
return (error);
}
-void
+int
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
@@ -1772,6 +1783,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
set_disk_ro(zv->zv_zso->zvo_disk, readonly);
dataset_kstats_rename(&zv->zv_kstat, newname);
+
+ return (0);
}
void
@@ -1793,61 +1806,16 @@ zvol_init(void)
{
int error;
- /*
- * zvol_threads is the module param the user passes in.
- *
- * zvol_actual_threads is what we use internally, since the user can
- * pass zvol_thread = 0 to mean "use all the CPUs" (the default).
- */
- static unsigned int zvol_actual_threads;
-
- if (zvol_threads == 0) {
- /*
- * See dde9380a1 for why 32 was chosen here. This should
- * probably be refined to be some multiple of the number
- * of CPUs.
- */
- zvol_actual_threads = MAX(num_online_cpus(), 32);
- } else {
- zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
+ error = zvol_init_impl();
+ if (error) {
+ printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error);
+ return (error);
}
- /*
- * Use atleast 32 zvol_threads but for many core system,
- * prefer 6 threads per taskq, but no more taskqs
- * than threads in them on large systems.
- *
- * taskq total
- * cpus taskqs threads threads
- * ------- ------- ------- -------
- * 1 1 32 32
- * 2 1 32 32
- * 4 1 32 32
- * 8 2 16 32
- * 16 3 11 33
- * 32 5 7 35
- * 64 8 8 64
- * 128 11 12 132
- * 256 16 16 256
- */
- zv_taskq_t *ztqs = &zvol_taskqs;
- uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
- if (num_tqs == 0) {
- num_tqs = 1 + num_online_cpus() / 6;
- while (num_tqs * num_tqs > zvol_actual_threads)
- num_tqs--;
- }
- uint_t per_tq_thread = zvol_actual_threads / num_tqs;
- if (per_tq_thread * num_tqs < zvol_actual_threads)
- per_tq_thread++;
- ztqs->tqs_cnt = num_tqs;
- ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
- error = register_blkdev(zvol_major, ZVOL_DRIVER);
+ error = -register_blkdev(zvol_major, ZVOL_DRIVER);
if (error) {
- kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
- ztqs->tqs_taskq = NULL;
printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
- return (error);
+ return (SET_ERROR(error));
}
if (zvol_blk_mq_queue_depth == 0) {
@@ -1864,25 +1832,6 @@ zvol_init(void)
1024);
}
- for (uint_t i = 0; i < num_tqs; i++) {
- char name[32];
- (void) snprintf(name, sizeof (name), "%s_tq-%u",
- ZVOL_DRIVER, i);
- ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
- maxclsyspri, per_tq_thread, INT_MAX,
- TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
- if (ztqs->tqs_taskq[i] == NULL) {
- for (int j = i - 1; j >= 0; j--)
- taskq_destroy(ztqs->tqs_taskq[j]);
- unregister_blkdev(zvol_major, ZVOL_DRIVER);
- kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
- sizeof (taskq_t *));
- ztqs->tqs_taskq = NULL;
- return (-ENOMEM);
- }
- }
-
- zvol_init_impl();
ida_init(&zvol_ida);
return (0);
}
@@ -1890,50 +1839,19 @@ zvol_init(void)
void
zvol_fini(void)
{
- zv_taskq_t *ztqs = &zvol_taskqs;
- zvol_fini_impl();
unregister_blkdev(zvol_major, ZVOL_DRIVER);
- if (ztqs->tqs_taskq == NULL) {
- ASSERT3U(ztqs->tqs_cnt, ==, 0);
- } else {
- for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
- ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
- taskq_destroy(ztqs->tqs_taskq[i]);
- }
- kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
- sizeof (taskq_t *));
- ztqs->tqs_taskq = NULL;
- }
+ zvol_fini_impl();
ida_destroy(&zvol_ida);
}
-module_param(zvol_inhibit_dev, uint, 0644);
-MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
-
module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
-module_param(zvol_threads, uint, 0444);
-MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set"
- "to 0 to use all active CPUs");
-
-module_param(zvol_request_sync, uint, 0644);
-MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
-
module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
-module_param(zvol_num_taskqs, uint, 0444);
-MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");
-
-module_param(zvol_prefetch_bytes, uint, 0644);
-MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
-
-module_param(zvol_volmode, uint, 0644);
-MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
-
module_param(zvol_blk_mq_queue_depth, uint, 0644);
MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");
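
Closing note, illustration only: the taskq machinery removed above now lives in common code (zvol_init_impl()), but the cityhash3() value still computed in zvol_request_impl(), from the zvol, the coarse offset (offset >> ZVOL_TASKQ_OFFSET_SHIFT), and the blk-mq hw queue, is presumably reduced modulo the taskq count to pick a queue. A hedged sketch, reusing the field names of the removed zv_taskq_t:

/* Sketch only: mirrors the zv_taskq_t removed from this file. */
typedef struct zv_taskq {
	uint_t tqs_cnt;
	taskq_t **tqs_taskq;
} zv_taskq_t;

static taskq_t *
zv_select_taskq(zv_taskq_t *ztqs, uint64_t taskq_hash)
{
	/* Same (zv, offset, hw queue) always maps to the same taskq. */
	return (ztqs->tqs_taskq[taskq_hash % ztqs->tqs_cnt]);
}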