Diffstat (limited to 'sys/contrib/openzfs/module/os')
51 files changed, 2364 insertions, 1503 deletions
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c
index 6d198fad5203..ae6e36d988c2 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c
@@ -160,7 +160,7 @@ kmem_cache_create(const char *name, size_t bufsize, size_t align,
 {
     kmem_cache_t *cache;

-    ASSERT3P(vmp, ==, NULL);
+    ASSERT0P(vmp);

     cache = kmem_alloc(sizeof (*cache), KM_SLEEP);
     strlcpy(cache->kc_name, name, sizeof (cache->kc_name));
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
index f9125a067cd1..3f360d167b17 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
@@ -101,6 +101,15 @@ spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
     va_end(ap);
 }

+/*
+ * Check if the current thread is a memory reclaim thread.
+ * Returns true if curproc is pageproc (FreeBSD's page daemon).
+ */
+int
+current_is_reclaim_thread(void)
+{
+    return (curproc == pageproc);
+}

 SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY,
     opensolaris_utsname_init, NULL);
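Sketch (not part of the change above): a hypothetical caller of the new current_is_reclaim_thread() helper. Allocation paths can use such a predicate to avoid sleeping for free pages while running as the page daemon itself, which could otherwise stall reclaim; alloc_maybe_nosleep() is an invented name, and kmem_alloc()/KM_* are the SPL interfaces already used in this tree.

static void *
alloc_maybe_nosleep(size_t len)
{
    /* Never let the page daemon block waiting on memory reclaim. */
    int km_flags = current_is_reclaim_thread() ? KM_NOSLEEP : KM_SLEEP;

    return (kmem_alloc(len, km_flags));
}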
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c
index aad3ef2fad5d..7fc93648c71e 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c
@@ -53,13 +53,6 @@ secpolicy_zfs(cred_t *cr)
 }

 int
-secpolicy_zfs_proc(cred_t *cr, proc_t *proc)
-{
-
-    return (priv_check_cred(cr, PRIV_VFS_MOUNT));
-}
-
-int
 secpolicy_sys_config(cred_t *cr, int checkonly __unused)
 {

diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c
index aee506f28615..a92068aa1cdc 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c
@@ -30,7 +30,7 @@
 #include <sys/param.h>
 #include <sys/string.h>
 #include <sys/kmem.h>
-#include <machine/stdarg.h>
+#include <sys/stdarg.h>

 #define	IS_DIGIT(c)	((c) >= '0' && (c) <= '9')
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c
index 9da633c2b1be..3c2d39b20c09 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c
@@ -256,7 +256,7 @@ sysevent_worker(void *arg __unused)
     * free `ze`, so just inline the free() here -- events have already
     * been drained.
     */
-    VERIFY3P(ze->ze_zevent, ==, NULL);
+    VERIFY0P(ze->ze_zevent);
     kmem_free(ze, sizeof (zfs_zevent_t));

     kthread_exit();
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
index 733c2bd07ebb..9d5f025423a1 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
@@ -43,6 +43,7 @@
 const int zfs_vm_pagerret_bad = VM_PAGER_BAD;
 const int zfs_vm_pagerret_error = VM_PAGER_ERROR;
 const int zfs_vm_pagerret_ok = VM_PAGER_OK;
+const int zfs_vm_pagerret_pend = VM_PAGER_PEND;
 const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC;
 const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL;
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
index fbf67f6a14a8..4bf487cdc469 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
@@ -507,7 +507,7 @@ abd_iter_at_end(struct abd_iter *aiter)
 void
 abd_iter_advance(struct abd_iter *aiter, size_t amount)
 {
-    ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+    ASSERT0P(aiter->iter_mapaddr);
     ASSERT0(aiter->iter_mapsize);

     /* There's nothing left to advance to, so do nothing */
@@ -526,7 +526,7 @@ abd_iter_map(struct abd_iter *aiter)
 {
     void *paddr;

-    ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+    ASSERT0P(aiter->iter_mapaddr);
     ASSERT0(aiter->iter_mapsize);

     /* There's nothing left to iterate over, so do nothing */
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c
index a4bf3fb6490f..0fc2697717af 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c
@@ -196,7 +196,6 @@ zfs_crypto_dispatch(freebsd_crypt_session_t *session, struct cryptop *crp)
             break;
         }
         crp->crp_etype = 0;
-        crp->crp_flags &= ~CRYPTO_F_DONE;
         session->fs_done = false;
     }
     return (error);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c
index d7c9be70ad4a..26cc7981bfcd 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c
@@ -41,7 +41,6 @@
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_prop.h>
-#include <sys/dmu_zfetch.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
@@ -71,6 +70,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     struct sf_buf *sf;
     int numbufs, i;
     int err;
+    dmu_flags_t flags = 0;

     if (size == 0)
         return (0);
@@ -94,10 +94,17 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,

         ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

-        if (tocpy == db->db_size)
+        if (tocpy == db->db_size) {
             dmu_buf_will_fill(db, tx, B_FALSE);
-        else
-            dmu_buf_will_dirty(db, tx);
+        } else {
+            if (i == numbufs - 1 && bufoff + tocpy < db->db_size) {
+                if (bufoff == 0)
+                    flags |= DMU_PARTIAL_FIRST;
+                else
+                    flags |= DMU_PARTIAL_MORE;
+            }
+            dmu_buf_will_dirty_flags(db, tx, flags);
+        }

         for (copied = 0; copied < tocpy; copied += PAGESIZE) {
             ASSERT3U(ptoa((*ma)->pindex), ==,
@@ -149,7 +156,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
     if (dbp[0]->db_offset != 0 || numbufs > 1) {
         for (i = 0; i < numbufs; i++) {
             ASSERT(ISP2(dbp[i]->db_size));
-            ASSERT3U((dbp[i]->db_offset % dbp[i]->db_size), ==, 0);
+            ASSERT0((dbp[i]->db_offset % dbp[i]->db_size));
             ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
         }
     }
@@ -168,7 +175,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
             vm_page_sunbusy(m);
             break;
         }
-        ASSERT3U(m->dirty, ==, 0);
+        ASSERT0(m->dirty);
         ASSERT(!pmap_page_is_write_mapped(m));

         ASSERT3U(db->db_size, >, PAGE_SIZE);
@@ -194,7 +201,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
         if (m != bogus_page) {
             vm_page_assert_xbusied(m);
             ASSERT(vm_page_none_valid(m));
-            ASSERT3U(m->dirty, ==, 0);
+            ASSERT0(m->dirty);
             ASSERT(!pmap_page_is_write_mapped(m));
             va = zfs_map_page(m, &sf);
         }
@@ -288,7 +295,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
             vm_page_sunbusy(m);
             break;
         }
-        ASSERT3U(m->dirty, ==, 0);
+        ASSERT0(m->dirty);
         ASSERT(!pmap_page_is_write_mapped(m));

         ASSERT3U(db->db_size, >, PAGE_SIZE);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c
index c114db14a916..b218c0da8125 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c
@@ -112,7 +112,6 @@ static int zfs__fini(void);
 static void zfs_shutdown(void *, int);

 static eventhandler_tag zfs_shutdown_event_tag;
-static eventhandler_tag zfs_mountroot_event_tag;

 #define	ZFS_MIN_KSTACK_PAGES 4

@@ -311,9 +310,6 @@ zfs_modevent(module_t mod, int type, void *unused __unused)
             zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
                 shutdown_post_sync, zfs_shutdown, NULL,
                 SHUTDOWN_PRI_FIRST);
-            zfs_mountroot_event_tag = EVENTHANDLER_REGISTER(
-                mountroot, spa_boot_init, NULL,
-                SI_ORDER_ANY);
         }
         return (err);
     case MOD_UNLOAD:
@@ -322,9 +318,6 @@ zfs_modevent(module_t mod, int type, void *unused __unused)
         if (zfs_shutdown_event_tag != NULL)
             EVENTHANDLER_DEREGISTER(shutdown_post_sync,
                 zfs_shutdown_event_tag);
-        if (zfs_mountroot_event_tag != NULL)
-            EVENTHANDLER_DEREGISTER(mountroot,
-                zfs_mountroot_event_tag);
         }
         return (err);
     case MOD_SHUTDOWN:
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
index ace2360c032d..ebc2c0eeb6d2 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
@@ -163,6 +163,13 @@ param_set_arc_int(SYSCTL_HANDLER_ARGS)
     return (0);
 }

+static void
+warn_deprecated_sysctl(const char *old, const char *new)
+{
+    printf("WARNING: sysctl vfs.zfs.%s is deprecated. Use vfs.zfs.%s instead.\n",
+        old, new);
+}
+
 int
 param_set_arc_max(SYSCTL_HANDLER_ARGS)
 {
@@ -185,12 +192,15 @@ param_set_arc_max(SYSCTL_HANDLER_ARGS)
     if (val != 0)
         zfs_arc_max = arc_c_max;

+    if (arg2 != 0)
+        warn_deprecated_sysctl("arc_max", "arc.max");
+
     return (0);
 }

 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max,
     CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
-    NULL, 0, param_set_arc_max, "LU",
+    NULL, 1, param_set_arc_max, "LU",
     "Maximum ARC size in bytes (LEGACY)");

 int
@@ -214,12 +224,15 @@ param_set_arc_min(SYSCTL_HANDLER_ARGS)
     if (val != 0)
         zfs_arc_min = arc_c_min;

+    if (arg2 != 0)
+        warn_deprecated_sysctl("arc_min", "arc.min");
+
     return (0);
 }

 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min,
     CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
-    NULL, 0, param_set_arc_min, "LU",
+    NULL, 1, param_set_arc_min, "LU",
     "Minimum ARC size in bytes (LEGACY)");

 extern uint_t zfs_arc_free_target;
@@ -242,6 +255,9 @@ param_set_arc_free_target(SYSCTL_HANDLER_ARGS)

     zfs_arc_free_target = val;

+    if (arg2 != 0)
+        warn_deprecated_sysctl("arc_free_target", "arc.free_target");
+
     return (0);
 }

@@ -251,7 +267,7 @@ param_set_arc_free_target(SYSCTL_HANDLER_ARGS)
  */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
     CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
-    NULL, 0, param_set_arc_free_target, "IU",
+    NULL, 1, param_set_arc_free_target, "IU",
     "Desired number of free pages below which ARC triggers reclaim"
     " (LEGACY)");

@@ -270,12 +286,15 @@ param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)

     arc_no_grow_shift = val;

+    if (arg2 != 0)
+        warn_deprecated_sysctl("arc_no_grow_shift", "arc.no_grow_shift");
+
     return (0);
 }

 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift,
     CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
-    NULL, 0, param_set_arc_no_grow_shift, "I",
+    NULL, 1, param_set_arc_no_grow_shift, "I",
     "log2(fraction of ARC which must be free to allow growing) (LEGACY)");

 extern uint64_t l2arc_write_max;
@@ -746,12 +765,15 @@ param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS)

     zfs_vdev_min_auto_ashift = val;

+    if (arg2 != 0)
+        warn_deprecated_sysctl("min_auto_ashift",
+            "vdev.min_auto_ashift");
+
     return (0);
 }

 SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
-    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
-    &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift),
+    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1,
     param_set_min_auto_ashift, "IU",
     "Min ashift used when creating new top-level vdev. (LEGACY)");

@@ -771,12 +793,15 @@ param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS)

     zfs_vdev_max_auto_ashift = val;

+    if (arg2 != 0)
+        warn_deprecated_sysctl("max_auto_ashift",
+            "vdev.max_auto_ashift");
+
     return (0);
 }

 SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
-    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
-    &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift),
+    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1,
     param_set_max_auto_ashift, "IU",
     "Max ashift used when optimizing for logical -> physical sector size on"
     " new top-level vdevs. (LEGACY)");
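The pattern above: one handler backs both the legacy vfs.zfs.* name and its arc.*/vdev.* replacement, with arg2 = 1 marking the legacy OID so only writes through the old name warn. A minimal sketch with a hypothetical tunable (example_tunable and param_set_example are invented names; the SYSCTL machinery is FreeBSD's real API):

static uint_t example_tunable;

static int
param_set_example(SYSCTL_HANDLER_ARGS)
{
    uint_t val = example_tunable;
    int err;

    err = sysctl_handle_int(oidp, &val, 0, req);
    if (err != 0 || req->newptr == NULL)
        return (err);

    example_tunable = val;
    if (arg2 != 0)	/* nonzero arg2 == legacy alias */
        warn_deprecated_sysctl("example", "subtree.example");
    return (0);
}

/* Legacy name: arg2 == 1, so writes print the deprecation warning. */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, example,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1,
    param_set_example, "IU", "Example tunable (LEGACY)");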
(LEGACY)"); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c index 7acf37ba9cd7..bbd1dafc69be 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c @@ -1236,12 +1236,21 @@ vdev_geom_io_done(zio_t *zio) struct bio *bp = zio->io_bio; if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { - ASSERT3P(bp, ==, NULL); + ASSERT0P(bp); return; } if (bp == NULL) { - ASSERT3S(zio->io_error, ==, ENXIO); + if (zio_injection_enabled && zio->io_error == EIO) + /* + * Convert an injected EIO to ENXIO. This is needed to + * work around zio_handle_device_injection_impl() not + * currently being able to inject ENXIO directly, while + * the assertion below only allows ENXIO here. + */ + zio->io_error = SET_ERROR(ENXIO); + else + ASSERT3S(zio->io_error, ==, ENXIO); return; } @@ -1276,7 +1285,8 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_fini = NULL, .vdev_op_open = vdev_geom_open, .vdev_op_close = vdev_geom_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_geom_io_start, diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c index 334264f6da2f..cb5787269db2 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c @@ -1175,7 +1175,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) int count = 0; zfs_acl_phys_t acl_phys; - if (zp->z_zfsvfs->z_replay == B_FALSE) { + if (ZTOV(zp) != NULL && zp->z_zfsvfs->z_replay == B_FALSE) { ASSERT_VOP_IN_SEQC(ZTOV(zp)); } @@ -1632,7 +1632,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, if (zfsvfs->z_replay == B_FALSE) ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); } else - ASSERT3P(dzp->z_vnode, ==, NULL); + ASSERT0P(dzp->z_vnode); memset(acl_ids, 0, sizeof (zfs_acl_ids_t)); acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); @@ -2014,7 +2014,7 @@ top: error = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(error); - ASSERT3P(zp->z_acl_cached, ==, NULL); + ASSERT0P(zp->z_acl_cached); zp->z_acl_cached = aclp; if (fuid_dirtied) @@ -2357,10 +2357,42 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr, * In FreeBSD, we don't care about permissions of individual ADS. * Note that not checking them is not just an optimization - without * this shortcut, EA operations may bogusly fail with EACCES. + * + * If this is a named attribute lookup, do the checks. 
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
index 8d0ff9b25e30..4de48e013ec4 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
@@ -357,7 +357,7 @@ zfsctl_create(zfsvfs_t *zfsvfs)
     vnode_t *rvp;
     uint64_t crtime[2];

-    ASSERT3P(zfsvfs->z_ctldir, ==, NULL);
+    ASSERT0P(zfsvfs->z_ctldir);

     snapdir = sfs_alloc_node(sizeof (*snapdir), "snapshot", ZFSCTL_INO_ROOT,
         ZFSCTL_INO_SNAPDIR);
@@ -494,7 +494,7 @@ zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
     vap->va_uid = 0;
     vap->va_gid = 0;
-    vap->va_rdev = 0;
+    vap->va_rdev = NODEV;
     /*
      * We are a purely virtual object, so we have no
      * blocksize or allocated blocks.
@@ -688,6 +688,8 @@ zfsctl_root_readdir(struct vop_readdir_args *ap)
     * count to return is 0.
     */
     if (zfs_uio_offset(&uio) == 3 * sizeof (entry)) {
+        if (eofp != NULL)
+            *eofp = 1;
         return (0);
     }

@@ -1367,7 +1369,7 @@ zfsctl_snapshot_unmount(const char *snapname, int flags __unused)

     int err = getzfsvfs(snapname, &zfsvfs);
     if (err != 0) {
-        ASSERT3P(zfsvfs, ==, NULL);
+        ASSERT0P(zfsvfs);
         return (0);
     }
     vfsp = zfsvfs->z_vfs;
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c
index a0c9dd178980..75ba2ea0cb9e 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c
@@ -273,7 +273,7 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
     zfsvfs_t *zfsvfs = zp->z_zfsvfs;

     ASSERT(zp->z_unlinked);
-    ASSERT3U(zp->z_links, ==, 0);
+    ASSERT0(zp->z_links);

     VERIFY0(zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));

@@ -437,7 +437,7 @@ zfs_rmnode(znode_t *zp)
     uint64_t count;
     int error;

-    ASSERT3U(zp->z_links, ==, 0);
+    ASSERT0(zp->z_links);
     if (zfsvfs->z_replay == B_FALSE)
         ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);

@@ -833,7 +833,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xvpp, cred_t *cr)
     if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
         &acl_ids, NULL)) != 0)
         return (error);
-    if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0)) {
+    if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) {
         zfs_acl_ids_free(&acl_ids);
         return (SET_ERROR(EDQUOT));
     }
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
index 21e5f7938f9f..ca13569a1235 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
@@ -164,8 +164,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)

 int
 zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
-    ssize_t *resid)
+    uint8_t ashift, ssize_t *resid)
 {
+    (void) ashift;
     return (zfs_file_write_impl(fp, buf, count, &off, resid));
 }
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_racct.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_racct.c
index bdbbdacd984e..50d1cbf53afc 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_racct.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_racct.c
@@ -28,7 +28,7 @@
 #include <sys/racct.h>

 void
-zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
 {
     curthread->td_ru.ru_inblock += iops;
 #ifdef RACCT
@@ -46,7 +46,7 @@ zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
 }

 void
-zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
 {
     curthread->td_ru.ru_oublock += iops;
 #ifdef RACCT
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
index 547e109db404..79b784288911 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
@@ -241,35 +241,40 @@ zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
 {
     int error = 0;
     char buf[32];
-    uint64_t usedobj, quotaobj;
+    uint64_t usedobj, quotaobj, defaultquota;
     uint64_t quota, used = 0;
     timespec_t now;

     usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
     quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
+    defaultquota = isgroup ? zfsvfs->z_defaultgroupquota :
+        zfsvfs->z_defaultuserquota;
+
+    if (zfsvfs->z_replay)
+        return (ENOENT);

-    if (quotaobj == 0 || zfsvfs->z_replay) {
-        error = ENOENT;
-        goto done;
-    }
     (void) sprintf(buf, "%llx", (longlong_t)id);
-    if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
-        buf, sizeof (quota), 1, &quota)) != 0) {
-        dprintf("%s(%d): quotaobj lookup failed\n",
-            __FUNCTION__, __LINE__);
-        goto done;
+    if (quotaobj == 0) {
+        if (defaultquota == 0)
+            return (ENOENT);
+        quota = defaultquota;
+    } else {
+        error = zap_lookup(zfsvfs->z_os, quotaobj, buf, sizeof (quota),
+            1, &quota);
+        if (error && (quota = defaultquota) == 0)
+            return (error);
     }
+
     /*
      * quota(8) uses bsoftlimit as "quota", and hardlimit as "limit".
      * So we set them to be the same.
      */
     dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
     error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
-    if (error && error != ENOENT) {
-        dprintf("%s(%d): usedobj failed; %d\n",
-            __FUNCTION__, __LINE__, error);
-        goto done;
-    }
+    if (error == ENOENT)
+        error = 0;
+    if (error)
+        return (error);
     dqp->dqb_curblocks = btodb(used);
     dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
     vfs_timestamp(&now);
@@ -279,7 +284,6 @@ zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
      * particularly useful.
      */
     dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
-done:
     return (error);
 }

@@ -451,8 +455,13 @@ zfs_sync(vfs_t *vfsp, int waitfor)
             return (0);
         }

-        if (zfsvfs->z_log != NULL)
-            zil_commit(zfsvfs->z_log, 0);
+        if (zfsvfs->z_log != NULL) {
+            error = zil_commit(zfsvfs->z_log, 0);
+            if (error != 0) {
+                zfs_exit(zfsvfs, FTAG);
+                return (error);
+            }
+        }

         zfs_exit(zfsvfs, FTAG);
     } else {
@@ -861,6 +870,36 @@ zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
         zfsvfs->z_xattr_sa = B_TRUE;
     }

+    error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTUSERQUOTA,
+        &zfsvfs->z_defaultuserquota);
+    if (error != 0)
+        return (error);
+
+    error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTGROUPQUOTA,
+        &zfsvfs->z_defaultgroupquota);
+    if (error != 0)
+        return (error);
+
+    error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTPROJECTQUOTA,
+        &zfsvfs->z_defaultprojectquota);
+    if (error != 0)
+        return (error);
+
+    error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTUSEROBJQUOTA,
+        &zfsvfs->z_defaultuserobjquota);
+    if (error != 0)
+        return (error);
+
+    error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTGROUPOBJQUOTA,
+        &zfsvfs->z_defaultgroupobjquota);
+    if (error != 0)
+        return (error);
+
+    error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTPROJECTOBJQUOTA,
+        &zfsvfs->z_defaultprojectobjquota);
+    if (error != 0)
+        return (error);
+
     error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
         &zfsvfs->z_attr_table);
     if (error != 0)
@@ -1057,7 +1096,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
     if (mounting) {
         boolean_t readonly;

-        ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
+        ASSERT0P(zfsvfs->z_kstat.dk_kstats);
         error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
         if (error)
             return (error);
@@ -1175,6 +1214,8 @@ zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
     zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 }

+extern int zfs_xattr_compat;
+
 static int
 zfs_domount(vfs_t *vfsp, char *osname)
 {
@@ -1255,6 +1296,16 @@ zfs_domount(vfs_t *vfsp, char *osname)
         goto out;
     }

+#if __FreeBSD_version >= 1500040
+    /*
+     * Named attributes can only work if the xattr property is set to
+     * on/dir and not sa. Also, zfs_xattr_compat must be set.
+     */
+    if ((zfsvfs->z_flags & ZSB_XATTR) != 0 && !zfsvfs->z_xattr_sa &&
+        zfs_xattr_compat)
+        vfsp->mnt_flag |= MNT_NAMEDATTR;
+#endif
+
     vfs_mountedfrom(vfsp, osname);

     if (!zfsvfs->z_issnap)
@@ -1778,6 +1829,14 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
         err = vn_lock(*vpp, flags);
         if (err != 0)
             vrele(*vpp);
+#if __FreeBSD_version >= 1500040
+        else if ((zp->z_pflags & ZFS_XATTR) != 0) {
+            if ((*vpp)->v_type == VDIR)
+                vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
+            else
+                vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR);
+        }
+#endif
     }
     if (err != 0)
         *vpp = NULL;
@@ -1930,9 +1989,17 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
     *vpp = ZTOV(zp);
     zfs_exit(zfsvfs, FTAG);
     err = vn_lock(*vpp, flags);
-    if (err == 0)
+    if (err == 0) {
         vnode_create_vobject(*vpp, zp->z_size, curthread);
-    else
+#if __FreeBSD_version >= 1500040
+        if ((zp->z_pflags & ZFS_XATTR) != 0) {
+            if ((*vpp)->v_type == VDIR)
+                vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
+            else
+                vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR);
+        }
+#endif
+    } else
         *vpp = NULL;
     return (err);
 }
@@ -2241,6 +2308,62 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
     return (0);
 }

+int
+zfs_set_default_quota(zfsvfs_t *zfsvfs, zfs_prop_t prop, uint64_t quota)
+{
+    int error;
+    objset_t *os = zfsvfs->z_os;
+    const char *propstr = zfs_prop_to_name(prop);
+    dmu_tx_t *tx;
+
+    tx = dmu_tx_create(os);
+    dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, propstr);
+    error = dmu_tx_assign(tx, DMU_TX_WAIT);
+    if (error) {
+        dmu_tx_abort(tx);
+        return (error);
+    }
+
+    if (quota == 0) {
+        error = zap_remove(os, MASTER_NODE_OBJ, propstr, tx);
+        if (error == ENOENT)
+            error = 0;
+    } else {
+        error = zap_update(os, MASTER_NODE_OBJ, propstr, 8, 1,
+            &quota, tx);
+    }
+
+    if (error)
+        goto out;
+
+    switch (prop) {
+    case ZFS_PROP_DEFAULTUSERQUOTA:
+        zfsvfs->z_defaultuserquota = quota;
+        break;
+    case ZFS_PROP_DEFAULTGROUPQUOTA:
+        zfsvfs->z_defaultgroupquota = quota;
+        break;
+    case ZFS_PROP_DEFAULTPROJECTQUOTA:
+        zfsvfs->z_defaultprojectquota = quota;
+        break;
+    case ZFS_PROP_DEFAULTUSEROBJQUOTA:
+        zfsvfs->z_defaultuserobjquota = quota;
+        break;
+    case ZFS_PROP_DEFAULTGROUPOBJQUOTA:
+        zfsvfs->z_defaultgroupobjquota = quota;
+        break;
+    case ZFS_PROP_DEFAULTPROJECTOBJQUOTA:
+        zfsvfs->z_defaultprojectobjquota = quota;
+        break;
+    default:
+        break;
+    }
+
+out:
+    dmu_tx_commit(tx);
+    return (error);
+}
+
 /*
  * Return true if the corresponding vfs's unmounted flag is set.
  * Otherwise return false.
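A recurring pattern in this commit: zil_commit() is now treated as fallible, and call sites such as zfs_sync() above propagate its result instead of ignoring it. A minimal sketch of the new call-site shape (illustrative; example_sync_op is a hypothetical wrapper, not code from this change):

static int
example_sync_op(zfsvfs_t *zfsvfs, zilog_t *zilog, int error)
{
    /* Only force the ZIL out if the operation itself succeeded. */
    if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
        error = zil_commit(zilog, 0);

    zfs_exit(zfsvfs, FTAG);
    return (error);
}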
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
index e468377eb44f..411225786089 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -25,6 +25,7 @@
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2025, Klara, Inc.
  */

 /* Portions Copyright 2007 Jeremy Teo */
@@ -60,6 +61,7 @@
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/dbuf.h>
@@ -74,6 +76,7 @@
 #include <sys/zfs_quota.h>
 #include <sys/zfs_sa.h>
 #include <sys/zfs_rlock.h>
+#include <sys/zfs_project.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/sched.h>
@@ -113,6 +116,8 @@ typedef uint64_t cookie_t;
 typedef ulong_t cookie_t;
 #endif

+static int zfs_check_attrname(const char *name);
+
 /*
  * Programming rules.
  *
@@ -267,6 +272,71 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
 }

 static int
+zfs_ioctl_getxattr(vnode_t *vp, zfsxattr_t *fsx)
+{
+    znode_t *zp = VTOZ(vp);
+
+    memset(fsx, 0, sizeof (*fsx));
+    fsx->fsx_xflags = (zp->z_pflags & ZFS_PROJINHERIT) ?
+        ZFS_PROJINHERIT_FL : 0;
+    fsx->fsx_projid = zp->z_projid;
+
+    return (0);
+}
+
+static int
+zfs_ioctl_setflags(vnode_t *vp, uint32_t ioctl_flags, xvattr_t *xva)
+{
+    uint64_t zfs_flags = VTOZ(vp)->z_pflags;
+    xoptattr_t *xoap;
+
+    if (ioctl_flags & ~(ZFS_PROJINHERIT_FL))
+        return (SET_ERROR(EOPNOTSUPP));
+
+    xva_init(xva);
+    xoap = xva_getxoptattr(xva);
+
+#define	FLAG_CHANGE(iflag, zflag, xflag, xfield)	do {	\
+	if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) ||	\
+	    ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) {	\
+		XVA_SET_REQ(xva, (xflag));	\
+		(xfield) = ((ioctl_flags & (iflag)) != 0);	\
+	}	\
+} while (0)
+
+    FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT,
+        xoap->xoa_projinherit);
+
+#undef	FLAG_CHANGE
+
+    return (0);
+}
+
+static int
+zfs_ioctl_setxattr(vnode_t *vp, zfsxattr_t *fsx, cred_t *cr)
+{
+    znode_t *zp = VTOZ(vp);
+    xvattr_t xva;
+    xoptattr_t *xoap;
+    int err;
+
+    if (!zpl_is_valid_projid(fsx->fsx_projid))
+        return (SET_ERROR(EINVAL));
+
+    err = zfs_ioctl_setflags(vp, fsx->fsx_xflags, &xva);
+    if (err)
+        return (err);
+
+    xoap = xva_getxoptattr(&xva);
+    XVA_SET_REQ(&xva, XAT_PROJID);
+    xoap->xoa_projid = fsx->fsx_projid;
+
+    err = zfs_setattr(zp, (vattr_t *)&xva, 0, cr, NULL);
+
+    return (err);
+}
+
+static int
 zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
     int *rvalp)
 {
@@ -305,6 +375,38 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
         *(offset_t *)data = off;
         return (0);
     }
+    case ZFS_IOC_FSGETXATTR: {
+        zfsxattr_t *fsx = (zfsxattr_t *)data;
+        error = vn_lock(vp, LK_SHARED);
+        if (error)
+            return (error);
+        error = zfs_ioctl_getxattr(vp, fsx);
+        VOP_UNLOCK(vp);
+        return (error);
+    }
+    case ZFS_IOC_FSSETXATTR: {
+        zfsxattr_t *fsx = (zfsxattr_t *)data;
+        error = vn_lock(vp, LK_EXCLUSIVE);
+        if (error)
+            return (error);
+        vn_seqc_write_begin(vp);
+        error = zfs_ioctl_setxattr(vp, fsx, cred);
+        vn_seqc_write_end(vp);
+        VOP_UNLOCK(vp);
+        return (error);
+    }
+    case ZFS_IOC_REWRITE: {
+        zfs_rewrite_args_t *args = (zfs_rewrite_args_t *)data;
+        if ((flag & FWRITE) == 0)
+            return (SET_ERROR(EBADF));
+        error = vn_lock(vp, LK_SHARED);
+        if (error)
+            return (error);
+        error = zfs_rewrite(VTOZ(vp), args->off, args->len,
+            args->flags, args->arg);
+        VOP_UNLOCK(vp);
+        return (error);
+    }
     }
     return (SET_ERROR(ENOTTY));
 }
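A userland sketch of driving the new ZFS_IOC_FSGETXATTR/ZFS_IOC_FSSETXATTR file ioctls (illustrative only; the header that exposes zfsxattr_t, ZFS_PROJINHERIT_FL, and the ioctl numbers to userland is an assumption, and error reporting is trimmed):

#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>

static int
set_project(const char *path, unsigned long long projid)
{
    zfsxattr_t fsx;
    int fd, rc;

    if ((fd = open(path, O_RDONLY)) < 0)
        return (-1);
    rc = ioctl(fd, ZFS_IOC_FSGETXATTR, &fsx);	/* read flags + projid */
    if (rc == 0) {
        fsx.fsx_projid = projid;
        fsx.fsx_xflags |= ZFS_PROJINHERIT_FL;	/* new children inherit */
        rc = ioctl(fd, ZFS_IOC_FSSETXATTR, &fsx);
    }
    close(fd);
    return (rc);
}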
@@ -518,7 +620,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
             page_unhold(pp);
         } else {
             error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
-                uio, bytes);
+                uio, bytes, DMU_READ_PREFETCH);
         }
         len -= bytes;
         off = 0;
@@ -717,7 +819,12 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp,
         /*
          * Do we have permission to get into the attribute directory?
          */
-        error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr, NULL);
+        if (flags & LOOKUP_NAMED_ATTR)
+            error = zfs_zaccess(zp, ACE_EXECUTE, V_NAMEDATTR,
+                B_FALSE, cr, NULL);
+        else
+            error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr,
+                NULL);
         if (error) {
             vrele(ZTOV(zp));
         }
@@ -997,7 +1104,7 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode,
         zfs_exit(zfsvfs, FTAG);
         return (error);
     }
-    ASSERT3P(zp, ==, NULL);
+    ASSERT0P(zp);

     /*
      * Create a new file object and update the directory
@@ -1089,8 +1196,8 @@ out:
         *zpp = zp;
     }

-    if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-        zil_commit(zilog, 0);
+    if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+        error = zil_commit(zilog, 0);

     zfs_exit(zfsvfs, FTAG);
     return (error);
@@ -1219,9 +1326,8 @@ out:
     if (xzp)
         vrele(ZTOV(xzp));

-    if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-        zil_commit(zilog, 0);
-
+    if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+        error = zil_commit(zilog, 0);
     zfs_exit(zfsvfs, FTAG);
     return (error);
@@ -1378,7 +1484,7 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
         zfs_exit(zfsvfs, FTAG);
         return (error);
     }
-    ASSERT3P(zp, ==, NULL);
+    ASSERT0P(zp);

     if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
         mnt_ns))) {
@@ -1452,8 +1558,8 @@ out:

     getnewvnode_drop_reserve();

-    if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-        zil_commit(zilog, 0);
+    if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+        error = zil_commit(zilog, 0);

     zfs_exit(zfsvfs, FTAG);
     return (error);
@@ -1533,8 +1639,8 @@ zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
     if (zfsvfs->z_use_namecache)
         cache_vop_rmdir(dvp, vp);
 out:
-    if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-        zil_commit(zilog, 0);
+    if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+        error = zil_commit(zilog, 0);

     zfs_exit(zfsvfs, FTAG);
     return (error);
@@ -1632,7 +1738,7 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
     /*
      * Quit if directory has been removed (posix)
      */
-    if ((*eofp = zp->z_unlinked) != 0) {
+    if ((*eofp = (zp->z_unlinked != 0)) != 0) {
         zfs_exit(zfsvfs, FTAG);
         return (0);
     }
@@ -1910,7 +2016,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
     if (vp->v_type == VBLK || vp->v_type == VCHR)
         vap->va_rdev = zfs_cmpldev(rdev);
     else
-        vap->va_rdev = 0;
+        vap->va_rdev = NODEV;
     vap->va_gen = zp->z_gen;
     vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
     vap->va_filerev = zp->z_seq;
@@ -2046,6 +2152,134 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
 }

 /*
+ * For the operation of changing a file's user/group/project, we need to
+ * handle not only the main object that is assigned to the file directly,
+ * but also the ones that are used by the file via the hidden xattr directory.
+ *
+ * Because the xattr directory may contain many EA entries, it may be
+ * impossible to change all of them in the same transaction as the change to
+ * the main object's user/group/project attributes. We therefore have to
+ * change them via multiple independent transactions, one by one. This is not
+ * an ideal solution, but we have no better idea yet.
+ */
+static int
+zfs_setattr_dir(znode_t *dzp)
+{
+    zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+    objset_t *os = zfsvfs->z_os;
+    zap_cursor_t zc;
+    zap_attribute_t *zap;
+    znode_t *zp = NULL;
+    dmu_tx_t *tx = NULL;
+    uint64_t uid, gid;
+    sa_bulk_attr_t bulk[4];
+    int count;
+    int err;
+
+    zap = zap_attribute_alloc();
+    zap_cursor_init(&zc, os, dzp->z_id);
+    while ((err = zap_cursor_retrieve(&zc, zap)) == 0) {
+        count = 0;
+        if (zap->za_integer_length != 8 || zap->za_num_integers != 1) {
+            err = ENXIO;
+            break;
+        }
+
+        err = zfs_dirent_lookup(dzp, zap->za_name, &zp, ZEXISTS);
+        if (err == ENOENT)
+            goto next;
+        if (err)
+            break;
+
+        if (zp->z_uid == dzp->z_uid &&
+            zp->z_gid == dzp->z_gid &&
+            zp->z_projid == dzp->z_projid)
+            goto next;
+
+        tx = dmu_tx_create(os);
+        if (!(zp->z_pflags & ZFS_PROJID))
+            dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+        else
+            dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+
+        err = dmu_tx_assign(tx, DMU_TX_WAIT);
+        if (err)
+            break;
+
+        vn_seqc_write_begin(ZTOV(zp));
+        mutex_enter(&dzp->z_lock);
+
+        if (zp->z_uid != dzp->z_uid) {
+            uid = dzp->z_uid;
+            SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+                &uid, sizeof (uid));
+            zp->z_uid = uid;
+        }
+
+        if (zp->z_gid != dzp->z_gid) {
+            gid = dzp->z_gid;
+            SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+                &gid, sizeof (gid));
+            zp->z_gid = gid;
+        }
+
+        uint64_t projid = dzp->z_projid;
+        if (zp->z_projid != projid) {
+            if (!(zp->z_pflags & ZFS_PROJID)) {
+                err = sa_add_projid(zp->z_sa_hdl, tx, projid);
+                if (unlikely(err == EEXIST)) {
+                    err = 0;
+                } else if (err != 0) {
+                    goto sa_add_projid_err;
+                } else {
+                    projid = ZFS_INVALID_PROJID;
+                }
+            }
+
+            if (projid != ZFS_INVALID_PROJID) {
+                zp->z_projid = projid;
+                SA_ADD_BULK_ATTR(bulk, count,
+                    SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
+                    sizeof (zp->z_projid));
+            }
+        }
+
+sa_add_projid_err:
+        mutex_exit(&dzp->z_lock);
+
+        if (likely(count > 0)) {
+            err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+            dmu_tx_commit(tx);
+        } else if (projid == ZFS_INVALID_PROJID) {
+            dmu_tx_commit(tx);
+        } else {
+            dmu_tx_abort(tx);
+        }
+        tx = NULL;
+        vn_seqc_write_end(ZTOV(zp));
+        if (err != 0 && err != ENOENT)
+            break;
+
+next:
+        if (zp) {
+            zrele(zp);
+            zp = NULL;
+        }
+        zap_cursor_advance(&zc);
+    }
+
+    if (tx)
+        dmu_tx_abort(tx);
+    if (zp) {
+        zrele(zp);
+    }
+    zap_cursor_fini(&zc);
+    zap_attribute_free(zap);
+
+    return (err == ENOENT ? 0 : err);
+}
+
+/*
  * Set the file attributes to the values contained in the
  * vattr structure.
  *
@@ -2090,6 +2324,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
     zfs_acl_t	*aclp;
     boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
     boolean_t	fuid_dirtied = B_FALSE;
+    boolean_t	handle_eadir = B_FALSE;
     sa_bulk_attr_t	bulk[7], xattr_bulk[7];
     int		count = 0, xattr_count = 0;
@@ -2441,6 +2676,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
     mask = vap->va_mask;

     if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) {
+        handle_eadir = B_TRUE;
         err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
             &xattr_obj, sizeof (xattr_obj));
@@ -2770,11 +3006,15 @@ out:
     } else {
         err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
         dmu_tx_commit(tx);
+        if (attrzp) {
+            if (err2 == 0 && handle_eadir)
+                err = zfs_setattr_dir(attrzp);
+        }
     }

 out2:
-    if (os->os_sync == ZFS_SYNC_ALWAYS)
-        zil_commit(zilog, 0);
+    if (err == 0 && os->os_sync == ZFS_SYNC_ALWAYS)
+        err = zil_commit(zilog, 0);

     zfs_exit(zfsvfs, FTAG);
     return (err);
@@ -3303,7 +3543,7 @@ out_seq:

 out:
     if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-        zil_commit(zilog, 0);
+        error = zil_commit(zilog, 0);

     zfs_exit(zfsvfs, FTAG);
     return (error);
@@ -3426,8 +3666,7 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
         return (error);
     }

-    if (zfs_acl_ids_overquota(zfsvfs, &acl_ids,
-        0 /* projid */)) {
+    if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
         zfs_acl_ids_free(&acl_ids);
         zfs_exit(zfsvfs, FTAG);
         return (SET_ERROR(EDQUOT));
@@ -3496,7 +3735,7 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
         *zpp = zp;

         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-            zil_commit(zilog, 0);
+            error = zil_commit(zilog, 0);
     }

     zfs_exit(zfsvfs, FTAG);
@@ -3686,8 +3925,8 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr,
         vnevent_link(ZTOV(szp), ct);
     }

-    if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-        zil_commit(zilog, 0);
+    if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+        error = zil_commit(zilog, 0);

     zfs_exit(zfsvfs, FTAG);
     return (error);
@@ -4072,6 +4311,43 @@ zfs_freebsd_getpages(struct vop_getpages_args *ap)
         ap->a_rahead));
 }

+typedef struct {
+    uint_t	pca_npages;
+    vm_page_t	pca_pages[];
+} putpage_commit_arg_t;
+
+static void
+zfs_putpage_commit_cb(void *arg, int err)
+{
+    putpage_commit_arg_t *pca = arg;
+    vm_object_t object = pca->pca_pages[0]->object;
+
+    zfs_vmobject_wlock(object);
+
+    for (uint_t i = 0; i < pca->pca_npages; i++) {
+        vm_page_t pp = pca->pca_pages[i];
+
+        if (err == 0) {
+            /*
+             * Writeback succeeded, so undirty the page. If it
+             * fails, we leave it in the same state it was. That's
+             * most likely dirty, so it will get tried again some
+             * other time.
+             */
+            vm_page_undirty(pp);
+        }
+
+        vm_page_sunbusy(pp);
+    }
+
+    vm_object_pip_wakeupn(object, pca->pca_npages);
+
+    zfs_vmobject_wunlock(object);
+
+    kmem_free(pca,
+        offsetof(putpage_commit_arg_t, pca_pages[pca->pca_npages]));
+}
+
 static int
 zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
     int *rtvals)
@@ -4173,10 +4449,12 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
     }

     if (zp->z_blksz < PAGE_SIZE) {
-        for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
-            tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
+        vm_ooffset_t woff = off;
+        size_t wlen = len;
+        for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) {
+            tocopy = MIN(PAGE_SIZE, wlen);
             va = zfs_map_page(ma[i], &sf);
-            dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
+            dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va, tx);
             zfs_unmap_page(sf);
         }
     } else {
@@ -4197,19 +4475,48 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
         zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
         err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
         ASSERT0(err);
-        /*
-         * XXX we should be passing a callback to undirty
-         * but that would make the locking messier
-         */
-        zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
-            len, commit, B_FALSE, NULL, NULL);

-        zfs_vmobject_wlock(object);
-        for (i = 0; i < ncount; i++) {
-            rtvals[i] = zfs_vm_pagerret_ok;
-            vm_page_undirty(ma[i]);
+        if (commit) {
+            /*
+             * Caller requested that we commit immediately. We set
+             * a callback on the log entry, to be called once its
+             * on disk after the call to zil_commit() below. The
+             * pages will be undirtied and unbusied there.
+             */
+            putpage_commit_arg_t *pca = kmem_alloc(
+                offsetof(putpage_commit_arg_t, pca_pages[ncount]),
+                KM_SLEEP);
+            pca->pca_npages = ncount;
+            memcpy(pca->pca_pages, ma, sizeof (vm_page_t) * ncount);
+
+            zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
+                B_TRUE, B_FALSE, zfs_putpage_commit_cb, pca);
+
+            for (i = 0; i < ncount; i++)
+                rtvals[i] = zfs_vm_pagerret_pend;
+        } else {
+            /*
+             * Caller just wants the page written back somewhere,
+             * but doesn't need it committed yet. We've already
+             * written it back to the DMU, so we just need to put
+             * it on the async log, then undirty the page and
+             * return.
+             *
+             * We cannot use a callback here, because it would keep
+             * the page busy (locked) until it is eventually
+             * written down at txg sync.
+             */
+            zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
+                B_FALSE, B_FALSE, NULL, NULL);
+
+            zfs_vmobject_wlock(object);
+            for (i = 0; i < ncount; i++) {
+                rtvals[i] = zfs_vm_pagerret_ok;
+                vm_page_undirty(ma[i]);
+            }
+            zfs_vmobject_wunlock(object);
         }
-        zfs_vmobject_wunlock(object);
+
         VM_CNT_INC(v_vnodeout);
         VM_CNT_ADD(v_vnodepgsout, ncount);
     }
@@ -4217,8 +4524,13 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
 out:
     zfs_rangelock_exit(lr);

-    if (commit)
-        zil_commit(zfsvfs->z_log, zp->z_id);
+    if (commit) {
+        err = zil_commit(zfsvfs->z_log, zp->z_id);
+        if (err != 0) {
+            zfs_exit(zfsvfs, FTAG);
+            return (err);
+        }
+    }

     dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, len);

@@ -4480,8 +4792,16 @@ zfs_freebsd_access(struct vop_access_args *ap)
     * ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND,
     */
     accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
-    if (accmode != 0)
-        error = zfs_access(zp, accmode, 0, ap->a_cred);
+    if (accmode != 0) {
+#if __FreeBSD_version >= 1500040
+        /* For named attributes, do the checks. */
+        if ((vn_irflag_read(vp) & VIRF_NAMEDATTR) != 0)
+            error = zfs_access(zp, accmode, V_NAMEDATTR,
+                ap->a_cred);
+        else
+#endif
+            error = zfs_access(zp, accmode, 0, ap->a_cred);
+    }

     /*
      * VADMIN has to be handled by vaccess().
@@ -4514,6 +4834,190 @@ struct vop_lookup_args {
 };
 #endif

+#if __FreeBSD_version >= 1500040
+static int
+zfs_lookup_nameddir(struct vnode *dvp, struct componentname *cnp,
+    struct vnode **vpp)
+{
+    struct vnode *xvp;
+    int error, flags;
+
+    *vpp = NULL;
+    flags = LOOKUP_XATTR | LOOKUP_NAMED_ATTR;
+    if ((cnp->cn_flags & CREATENAMED) != 0)
+        flags |= CREATE_XATTR_DIR;
+    error = zfs_lookup(dvp, NULL, &xvp, NULL, 0, cnp->cn_cred, flags,
+        B_FALSE);
+    if (error == 0) {
+        if ((cnp->cn_flags & LOCKLEAF) != 0)
+            error = vn_lock(xvp, cnp->cn_lkflags);
+        if (error == 0) {
+            vn_irflag_set_cond(xvp, VIRF_NAMEDDIR);
+            *vpp = xvp;
+        } else {
+            vrele(xvp);
+        }
+    }
+    return (error);
+}
+
+static ssize_t
+zfs_readdir_named(struct vnode *vp, char *buf, ssize_t blen, off_t *offp,
+    int *eofflagp, struct ucred *cred, struct thread *td)
+{
+    struct uio io;
+    struct iovec iv;
+    zfs_uio_t uio;
+    int error;
+
+    io.uio_offset = *offp;
+    io.uio_segflg = UIO_SYSSPACE;
+    io.uio_rw = UIO_READ;
+    io.uio_td = td;
+    iv.iov_base = buf;
+    iv.iov_len = blen;
+    io.uio_iov = &iv;
+    io.uio_iovcnt = 1;
+    io.uio_resid = blen;
+    zfs_uio_init(&uio, &io);
+    error = zfs_readdir(vp, &uio, cred, eofflagp, NULL, NULL);
+    if (error != 0)
+        return (-1);
+    *offp = io.uio_offset;
+    return (blen - io.uio_resid);
+}
+
+static bool
+zfs_has_namedattr(struct vnode *vp, struct ucred *cred)
+{
+    struct componentname cn;
+    struct vnode *xvp;
+    struct dirent *dp;
+    off_t offs;
+    ssize_t rsize;
+    char *buf, *cp, *endcp;
+    int eofflag, error;
+    bool ret;
+
+    MNT_ILOCK(vp->v_mount);
+    if ((vp->v_mount->mnt_flag & MNT_NAMEDATTR) == 0) {
+        MNT_IUNLOCK(vp->v_mount);
+        return (false);
+    }
+    MNT_IUNLOCK(vp->v_mount);
+
+    /* Now see if a named attribute directory exists. */
+    cn.cn_flags = LOCKLEAF;
+    cn.cn_lkflags = LK_SHARED;
+    cn.cn_cred = cred;
+    error = zfs_lookup_nameddir(vp, &cn, &xvp);
+    if (error != 0)
+        return (false);
+
+    /* It exists, so see if there is any entry other than "." and "..". */
+    buf = malloc(DEV_BSIZE, M_TEMP, M_WAITOK);
+    ret = false;
+    offs = 0;
+    do {
+        rsize = zfs_readdir_named(xvp, buf, DEV_BSIZE, &offs, &eofflag,
+            cred, curthread);
+        if (rsize <= 0)
+            break;
+        cp = buf;
+        endcp = &buf[rsize];
+        while (cp < endcp) {
+            dp = (struct dirent *)cp;
+            if (dp->d_fileno != 0 && (dp->d_type == DT_REG ||
+                dp->d_type == DT_UNKNOWN) &&
+                !ZFS_XA_NS_PREFIX_FORBIDDEN(dp->d_name) &&
+                ((dp->d_namlen == 1 && dp->d_name[0] != '.') ||
+                (dp->d_namlen == 2 && (dp->d_name[0] != '.' ||
+                dp->d_name[1] != '.')) || dp->d_namlen > 2)) {
+                ret = true;
+                break;
+            }
+            cp += dp->d_reclen;
+        }
+    } while (!ret && rsize > 0 && eofflag == 0);
+    vput(xvp);
+    free(buf, M_TEMP);
+    return (ret);
+}
+
+static int
+zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
+{
+    struct componentname *cnp = ap->a_cnp;
+    char nm[NAME_MAX + 1];
+    int error;
+    struct vnode **vpp = ap->a_vpp, *dvp = ap->a_dvp, *xvp;
+    bool is_nameddir, needs_nameddir, opennamed = false;
+
+    /*
+     * These variables are used to handle the named attribute cases:
+     * opennamed - Is true when this is a call from open with O_NAMEDATTR
+     *     specified and it is the last component.
+     * is_nameddir - Is true when the directory is a named attribute dir.
+     * needs_nameddir - Is set when the lookup needs to look for/create
+     *     a named attribute directory. It is only set when is_nameddir
+     *     is false and opennamed is true.
+     * xvp - Is the directory that the lookup needs to be done in.
+     *     Usually dvp, unless needs_nameddir is true where it is the
+     *     result of the first non-named directory lookup.
+     * Note that name caching must be disabled for named attribute
+     * handling.
+     */
+    needs_nameddir = false;
+    xvp = dvp;
+    opennamed = (cnp->cn_flags & (OPENNAMED | ISLASTCN)) ==
+        (OPENNAMED | ISLASTCN);
+    is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0;
+    if (is_nameddir && (cnp->cn_flags & ISLASTCN) == 0)
+        return (ENOATTR);
+    if (opennamed && !is_nameddir && (cnp->cn_flags & ISDOTDOT) != 0)
+        return (ENOATTR);
+    if (opennamed || is_nameddir)
+        cnp->cn_flags &= ~MAKEENTRY;
+    if (opennamed && !is_nameddir)
+        needs_nameddir = true;
+    ASSERT3U(cnp->cn_namelen, <, sizeof (nm));
+    error = 0;
+    *vpp = NULL;
+    if (needs_nameddir) {
+        if (VOP_ISLOCKED(dvp) != LK_EXCLUSIVE)
+            vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+        error = zfs_lookup_nameddir(dvp, cnp, &xvp);
+        if (error == 0)
+            is_nameddir = true;
+    }
+    if (error == 0) {
+        if (!needs_nameddir || cnp->cn_namelen != 1 ||
+            *cnp->cn_nameptr != '.') {
+            strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1,
+                sizeof (nm)));
+            error = zfs_lookup(xvp, nm, vpp, cnp, cnp->cn_nameiop,
+                cnp->cn_cred, 0, cached);
+            if (is_nameddir && error == 0 &&
+                (cnp->cn_namelen != 1 || *cnp->cn_nameptr != '.') &&
+                (cnp->cn_flags & ISDOTDOT) == 0) {
+                if ((*vpp)->v_type == VDIR)
+                    vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
+                else
+                    vn_irflag_set_cond(*vpp,
+                        VIRF_NAMEDATTR);
+            }
+            if (needs_nameddir && xvp != *vpp)
+                vput(xvp);
+        } else {
+            /*
+             * Lookup of "." when a named attribute dir is needed.
+             */
+            *vpp = xvp;
+        }
+    }
+    return (error);
+}
+#else
 static int
 zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
 {
@@ -4526,6 +5030,7 @@ zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
     return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
         cnp->cn_cred, 0, cached));
 }
+#endif

 static int
 zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
@@ -4548,7 +5053,11 @@ zfs_cache_lookup(struct vop_lookup_args *ap)
     zfsvfs_t *zfsvfs;

     zfsvfs = ap->a_dvp->v_mount->mnt_data;
+#if __FreeBSD_version >= 1500040
+    if (zfsvfs->z_use_namecache && (ap->a_cnp->cn_flags & OPENNAMED) == 0)
+#else
     if (zfsvfs->z_use_namecache)
+#endif
         return (vfs_cache_lookup(ap));
     else
         return (zfs_freebsd_lookup(ap, B_FALSE));
@@ -4571,6 +5080,11 @@ zfs_freebsd_create(struct vop_create_args *ap)
     vattr_t *vap = ap->a_vap;
     znode_t *zp = NULL;
     int rc, mode;
+    struct vnode *dvp = ap->a_dvp;
+#if __FreeBSD_version >= 1500040
+    struct vnode *xvp;
+    bool is_nameddir;
+#endif

 #if __FreeBSD_version < 1400068
     ASSERT(cnp->cn_flags & SAVENAME);
@@ -4581,10 +5095,36 @@ zfs_freebsd_create(struct vop_create_args *ap)
     zfsvfs = ap->a_dvp->v_mount->mnt_data;
     *ap->a_vpp = NULL;

-    rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, 0, mode,
-        &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL);
+    rc = 0;
+#if __FreeBSD_version >= 1500040
+    xvp = NULL;
+    is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0;
+    if (!is_nameddir && (cnp->cn_flags & OPENNAMED) != 0) {
+        /* Needs a named attribute directory. */
+        rc = zfs_lookup_nameddir(dvp, cnp, &xvp);
+        if (rc == 0) {
+            dvp = xvp;
+            is_nameddir = true;
+        }
+    }
+    if (is_nameddir && rc == 0)
+        rc = zfs_check_attrname(cnp->cn_nameptr);
+#endif
+    if (rc == 0)
+        rc = zfs_create(VTOZ(dvp), cnp->cn_nameptr, vap, 0, mode,
+            &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL);
+#if __FreeBSD_version >= 1500040
+    if (xvp != NULL)
+        vput(xvp);
+#endif
+
+    if (rc == 0) {
         *ap->a_vpp = ZTOV(zp);
+#if __FreeBSD_version >= 1500040
+        if (is_nameddir)
+            vn_irflag_set_cond(*ap->a_vpp, VIRF_NAMEDATTR);
+#endif
+    }
     if (zfsvfs->z_use_namecache && rc == 0 &&
         (cnp->cn_flags & MAKEENTRY) != 0)
         cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
@@ -4603,13 +5143,21 @@ struct vop_remove_args {
 static int
 zfs_freebsd_remove(struct vop_remove_args *ap)
 {
+    int error = 0;

 #if __FreeBSD_version < 1400068
     ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 #endif

-    return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
-        ap->a_cnp->cn_cred));
+#if __FreeBSD_version >= 1500040
+    if ((vn_irflag_read(ap->a_dvp) & VIRF_NAMEDDIR) != 0)
+        error = zfs_check_attrname(ap->a_cnp->cn_nameptr);
+#endif
+
+    if (error == 0)
+        error = zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
+            ap->a_cnp->cn_cred);
+    return (error);
 }

 #ifndef _SYS_SYSPROTO_H_
@@ -4694,8 +5242,32 @@ struct vop_fsync_args {
 static int
 zfs_freebsd_fsync(struct vop_fsync_args *ap)
 {
+    vnode_t *vp = ap->a_vp;
+    int err = 0;

-    return (zfs_fsync(VTOZ(ap->a_vp), 0, ap->a_td->td_ucred));
+    /*
+     * Push any dirty mmap()'d data out to the DMU and ZIL, ready for
+     * zil_commit() to be called in zfs_fsync().
+     */
+    if (vm_object_mightbedirty(vp->v_object)) {
+        zfs_vmobject_wlock(vp->v_object);
+        if (!vm_object_page_clean(vp->v_object, 0, 0, 0))
+            err = SET_ERROR(EIO);
+        zfs_vmobject_wunlock(vp->v_object);
+        if (err) {
+            /*
+             * Unclear what state things are in. zfs_putpages()
+             * will ensure the pages remain dirty if they haven't
+             * been written down to the DMU, but because there may
+             * be nothing logged, we can't assume that zfs_sync()
+             * -> zil_commit() will give us a useful error. It's
+             * safest if we just error out here.
+             */
+            return (err);
+        }
+    }
+
+    return (zfs_fsync(VTOZ(vp), 0, ap->a_td->td_ucred));
 }

 #ifndef _SYS_SYSPROTO_H_
@@ -4767,6 +5339,11 @@ zfs_freebsd_getattr(struct vop_getattr_args *ap)
 #undef	FLAG_CHECK
     *vap = xvap.xva_vattr;
     vap->va_flags = fflags;
+
+#if __FreeBSD_version >= 1500040
+    if ((vn_irflag_read(ap->a_vp) & (VIRF_NAMEDDIR | VIRF_NAMEDATTR)) != 0)
+        vap->va_bsdflags |= SFBSD_NAMEDATTR;
+#endif
     return (0);
 }

@@ -4909,15 +5486,24 @@ zfs_freebsd_rename(struct vop_rename_args *ap)
     vnode_t *fvp = ap->a_fvp;
     vnode_t *tdvp = ap->a_tdvp;
     vnode_t *tvp = ap->a_tvp;
-    int error;
+    int error = 0;

 #if __FreeBSD_version < 1400068
     ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
     ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
 #endif

-    error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
-        ap->a_tcnp, ap->a_fcnp->cn_cred);
+#if __FreeBSD_version >= 1500040
+    if ((vn_irflag_read(fdvp) & VIRF_NAMEDDIR) != 0) {
+        error = zfs_check_attrname(ap->a_fcnp->cn_nameptr);
+        if (error == 0)
+            error = zfs_check_attrname(ap->a_tcnp->cn_nameptr);
+    }
+#endif
+
+    if (error == 0)
+        error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
+            ap->a_tcnp, ap->a_fcnp->cn_cred);

     vrele(fdvp);
     vrele(fvp);
@@ -5146,6 +5732,9 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
 {
     ulong_t val;
     int error;
+#ifdef _PC_CLONE_BLKSIZE
    zfsvfs_t *zfsvfs;
+#endif

     error = zfs_pathconf(ap->a_vp, ap->a_name, &val,
         curthread->td_ucred, NULL);
@@ -5171,12 +5760,48 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
             return (0);
         }
         return (EINVAL);
+#if __FreeBSD_version >= 1500040
+    case _PC_NAMEDATTR_ENABLED:
+        MNT_ILOCK(ap->a_vp->v_mount);
+        if ((ap->a_vp->v_mount->mnt_flag & MNT_NAMEDATTR) != 0)
+            *ap->a_retval = 1;
+        else
+            *ap->a_retval = 0;
+        MNT_IUNLOCK(ap->a_vp->v_mount);
+        return (0);
+    case _PC_HAS_NAMEDATTR:
+        if (zfs_has_namedattr(ap->a_vp, curthread->td_ucred))
+            *ap->a_retval = 1;
+        else
+            *ap->a_retval = 0;
+        return (0);
+#endif
+#ifdef _PC_HAS_HIDDENSYSTEM
+    case _PC_HAS_HIDDENSYSTEM:
+        *ap->a_retval = 1;
+        return (0);
+#endif
+#ifdef _PC_CLONE_BLKSIZE
+    case _PC_CLONE_BLKSIZE:
+        zfsvfs = (zfsvfs_t *)ap->a_vp->v_mount->mnt_data;
+        if (zfs_bclone_enabled &&
+            spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os),
+            SPA_FEATURE_BLOCK_CLONING))
+            *ap->a_retval = dsl_dataset_feature_is_active(
+                zfsvfs->z_os->os_dsl_dataset,
+                SPA_FEATURE_LARGE_BLOCKS) ?
+                SPA_MAXBLOCKSIZE :
+                SPA_OLD_MAXBLOCKSIZE;
+        else
+            *ap->a_retval = 0;
+        return (0);
+#endif
     default:
         return (vop_stdpathconf(ap));
     }
 }

-static int zfs_xattr_compat = 1;
+int zfs_xattr_compat = 1;

 static int
 zfs_check_attrname(const char *name)
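Userland view of the new pathconf(2) names (illustrative; /tank/file is a hypothetical path, and the _PC_* constants require a FreeBSD 15 userland matching the 1500040 checks above):

#include <unistd.h>
#include <stdio.h>

int
main(void)
{
    /* Is named-attribute support enabled on the mount? */
    long enabled = pathconf("/tank/file", _PC_NAMEDATTR_ENABLED);
    /* Does this particular file already carry named attributes? */
    long present = pathconf("/tank/file", _PC_HAS_NAMEDATTR);

    printf("enabled=%ld present=%ld\n", enabled, present);
    return (0);
}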
@@ -6043,6 +6668,78 @@ zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
     return (EOPNOTSUPP);
 }

+#ifndef _SYS_SYSPROTO_H_
+struct vop_advise_args {
+    struct vnode *a_vp;
+    off_t a_start;
+    off_t a_end;
+    int a_advice;
+};
+#endif
+
+static int
+zfs_freebsd_advise(struct vop_advise_args *ap)
+{
+    vnode_t *vp = ap->a_vp;
+    off_t start = ap->a_start;
+    off_t end = ap->a_end;
+    int advice = ap->a_advice;
+    off_t len;
+    znode_t *zp;
+    zfsvfs_t *zfsvfs;
+    objset_t *os;
+    int error = 0;
+
+    if (end < start)
+        return (EINVAL);
+
+    error = vn_lock(vp, LK_SHARED);
+    if (error)
+        return (error);
+
+    zp = VTOZ(vp);
+    zfsvfs = zp->z_zfsvfs;
+    os = zp->z_zfsvfs->z_os;
+
+    if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+        goto out_unlock;
+
+    /* kern_posix_fadvise points to the last byte, we want one past */
+    if (end != OFF_MAX)
+        end += 1;
+    len = end - start;
+
+    switch (advice) {
+    case POSIX_FADV_WILLNEED:
+        /*
+         * Pass on the caller's size directly, but note that
+         * dmu_prefetch_max will effectively cap it. If there really
+         * is a larger sequential access pattern, perhaps dmu_zfetch
+         * will detect it.
+         */
+        dmu_prefetch(os, zp->z_id, 0, start, len,
+            ZIO_PRIORITY_ASYNC_READ);
+        break;
+    case POSIX_FADV_NORMAL:
+    case POSIX_FADV_RANDOM:
+    case POSIX_FADV_SEQUENTIAL:
+    case POSIX_FADV_DONTNEED:
+    case POSIX_FADV_NOREUSE:
+        /* ignored for now */
+        break;
+    default:
+        error = EINVAL;
+        break;
+    }
+
+    zfs_exit(zfsvfs, FTAG);
+
+out_unlock:
+    VOP_UNLOCK(vp);
+
+    return (error);
+}
+
 static int
 zfs_vptocnp(struct vop_vptocnp_args *ap)
 {
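The new vop_advise handler makes posix_fadvise(2) useful on ZFS: POSIX_FADV_WILLNEED turns into a dmu_prefetch() of the requested range, while the other advice values are accepted and ignored. Illustrative userland use (prefetch_head and the 16 MiB length are arbitrary):

#include <fcntl.h>

static int
prefetch_head(int fd)
{
    /* VOP_ADVISE -> zfs_freebsd_advise() -> dmu_prefetch() */
    return (posix_fadvise(fd, 0, 16 << 20, POSIX_FADV_WILLNEED));
}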
@@ -6137,9 +6834,11 @@ zfs_deallocate(struct vop_deallocate_args *ap)
     if (error == 0) {
         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS ||
             (ap->a_ioflag & IO_SYNC) != 0)
-            zil_commit(zilog, zp->z_id);
-        *ap->a_offset = off + len;
-        *ap->a_len = 0;
+            error = zil_commit(zilog, zp->z_id);
+        if (error == 0) {
+            *ap->a_offset = off + len;
+            *ap->a_len = 0;
+        }
     }

     zfs_exit(zfsvfs, FTAG);
@@ -6279,6 +6978,7 @@ struct vop_vector zfs_vnodeops = {
     .vop_link =		zfs_freebsd_link,
     .vop_symlink =	zfs_freebsd_symlink,
     .vop_readlink =	zfs_freebsd_readlink,
+    .vop_advise =	zfs_freebsd_advise,
     .vop_read =		zfs_freebsd_read,
     .vop_write =	zfs_freebsd_write,
     .vop_remove =	zfs_freebsd_remove,
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
index ce7b93d20a47..649022ab5bcb 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
@@ -67,8 +67,12 @@
 #include "zfs_comutil.h"

 /* Used by fstat(1). */
+#ifdef SYSCTL_SIZEOF
+SYSCTL_SIZEOF(znode, znode_t);
+#else
 SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, sizeof (znode_t), "sizeof(znode_t)");
+#endif

 /*
  * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
@@ -146,8 +150,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
     zp->z_xattr_cached = NULL;
     zp->z_xattr_parent = 0;
     zp->z_vnode = NULL;
-    zp->z_sync_writes_cnt = 0;
-    zp->z_async_writes_cnt = 0;

     return (0);
 }
@@ -159,18 +161,15 @@ zfs_znode_cache_destructor(void *buf, void *arg)
     znode_t *zp = buf;

     ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
-    ASSERT3P(zp->z_vnode, ==, NULL);
+    ASSERT0P(zp->z_vnode);
     ASSERT(!list_link_active(&zp->z_link_node));
     mutex_destroy(&zp->z_lock);
     mutex_destroy(&zp->z_acl_lock);
     rw_destroy(&zp->z_xattr_lock);
     zfs_rangelock_fini(&zp->z_rangelock);

-    ASSERT3P(zp->z_acl_cached, ==, NULL);
-    ASSERT3P(zp->z_xattr_cached, ==, NULL);
-
-    ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
-    ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
+    ASSERT0P(zp->z_acl_cached);
+    ASSERT0P(zp->z_xattr_cached);
 }


@@ -196,7 +195,7 @@ zfs_znode_init(void)
     /*
      * Initialize zcache
      */
-    ASSERT3P(znode_uma_zone, ==, NULL);
+    ASSERT0P(znode_uma_zone);
     znode_uma_zone = uma_zcreate("zfs_znode_cache",
         sizeof (znode_t), zfs_znode_cache_constructor_smr,
         zfs_znode_cache_destructor_smr, NULL, NULL, 0, 0);
@@ -225,7 +224,7 @@ zfs_znode_init(void)
     /*
      * Initialize zcache
      */
-    ASSERT3P(znode_cache, ==, NULL);
+    ASSERT0P(znode_cache);
     znode_cache = kmem_cache_create("zfs_znode_cache",
         sizeof (znode_t), 0, zfs_znode_cache_constructor,
         zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_RECLAIMABLE);
@@ -289,6 +288,7 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
     sharezp->z_atime_dirty = 0;
     sharezp->z_zfsvfs = zfsvfs;
     sharezp->z_is_sa = zfsvfs->z_use_sa;
+    sharezp->z_pflags = 0;

     VERIFY0(zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
         kcred, NULL, &acl_ids, NULL));
@@ -353,8 +353,8 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
     ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
     ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));

-    ASSERT3P(zp->z_sa_hdl, ==, NULL);
-    ASSERT3P(zp->z_acl_cached, ==, NULL);
+    ASSERT0P(zp->z_sa_hdl);
+    ASSERT0P(zp->z_acl_cached);
     if (sa_hdl == NULL) {
         VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, zp,
             SA_HDL_SHARED, &zp->z_sa_hdl));
@@ -451,8 +451,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
     zp->z_blksz = blksz;
     zp->z_seq = 0x7A4653;
     zp->z_sync_cnt = 0;
-    zp->z_sync_writes_cnt = 0;
-    zp->z_async_writes_cnt = 0;
     atomic_store_ptr(&zp->z_cached_symlink, NULL);

     zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
@@ -560,6 +558,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
     uint64_t crtime[2], atime[2], mtime[2], ctime[2];
     uint64_t mode, size, links, parent, pflags;
     uint64_t dzp_pflags = 0;
+    uint64_t projid = ZFS_DEFAULT_PROJID;
     uint64_t rdev = 0;
     zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
     dmu_buf_t *db;
@@ -667,6 +666,23 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
     if (flag & IS_XATTR)
         pflags |= ZFS_XATTR;

+    if (vap->va_type == VREG || vap->va_type == VDIR) {
+        /*
+         * With ZFS_PROJID flag, we can easily know whether there is
+         * project ID stored on disk or not. See zpl_get_file_info().
+         */
+        if (obj_type != DMU_OT_ZNODE &&
+            dmu_objset_projectquota_enabled(zfsvfs->z_os))
+            pflags |= ZFS_PROJID;
+
+        /*
+         * Inherit project ID from parent if required.
+         */
+        projid = zfs_inherit_projid(dzp);
+        if (dzp_pflags & ZFS_PROJINHERIT)
+            pflags |= ZFS_PROJINHERIT;
+    }
+
     /*
      * No execs denied will be determined when zfs_mode_compute() is
      * called.
     */
*/ @@ -748,6 +764,10 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, &empty_xattr, 8); + } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && + pflags & ZFS_PROJID) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs), + NULL, &projid, 8); } if (obj_type == DMU_OT_ZNODE || (vap->va_type == VBLK || vap->va_type == VCHR)) { @@ -795,6 +815,11 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, (*zpp)->z_pflags = pflags; (*zpp)->z_mode = mode; (*zpp)->z_dnodesize = dnodesize; + (*zpp)->z_projid = projid; + + vnode_t *vp = ZTOV(*zpp); + if (!(flag & IS_ROOT_NODE)) + vn_seqc_write_begin(vp); if (vap->va_mask & AT_XVATTR) zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); @@ -804,7 +829,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } if (!(flag & IS_ROOT_NODE)) { - vnode_t *vp = ZTOV(*zpp); + vn_seqc_write_end(vp); vp->v_vflag |= VV_FORCEINSMQ; int err = insmntque(vp, zfsvfs->z_vfs); vp->v_vflag &= ~VV_FORCEINSMQ; @@ -912,6 +937,11 @@ zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SPARSE); } + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { + ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_PROJINHERIT); + } } int @@ -1064,6 +1094,7 @@ zfs_rezget(znode_t *zp) int err; int count = 0; uint64_t gen; + uint64_t projid = ZFS_DEFAULT_PROJID; /* * Remove cached pages before reloading the znode, so that they are not @@ -1100,7 +1131,7 @@ zfs_rezget(znode_t *zp) } rw_exit(&zp->z_xattr_lock); - ASSERT3P(zp->z_sa_hdl, ==, NULL); + ASSERT0P(zp->z_sa_hdl); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); @@ -1144,6 +1175,17 @@ zfs_rezget(znode_t *zp) return (SET_ERROR(EIO)); } + if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) { + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), + &projid, 8); + if (err != 0 && err != ENOENT) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (err); + } + } + + zp->z_projid = projid; zp->z_mode = mode; if (gen != zp->z_gen) { @@ -1260,7 +1302,7 @@ zfs_znode_free(znode_t *zp) zfsvfs_t *zfsvfs = zp->z_zfsvfs; char *symlink; - ASSERT3P(zp->z_sa_hdl, ==, NULL); + ASSERT0P(zp->z_sa_hdl); zp->z_vnode = NULL; mutex_enter(&zfsvfs->z_znodes_lock); POINTER_INVALIDATE(&zp->z_zfsvfs); @@ -1725,6 +1767,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) rootzp->z_unlinked = 0; rootzp->z_atime_dirty = 0; rootzp->z_is_sa = USE_SA(version, os); + rootzp->z_pflags = 0; zfsvfs->z_os = os; zfsvfs->z_parent = zfsvfs; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c index 5a2c4b8cbf22..91cf38016e00 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c @@ -1822,9 +1822,3 @@ error: return (SET_ERROR(ret)); } - -#if defined(_KERNEL) && defined(HAVE_SPL) -module_param(zfs_key_max_salt_uses, ulong, 0644); -MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " - "can be used for generating encryption keys before it is rotated"); -#endif diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c index d18ea9d59fa3..0dd2ecd7fd8d 100644 --- 
a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c @@ -31,7 +31,7 @@ * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, 2025, Klara, Inc. */ /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */ @@ -99,6 +99,7 @@ #include <geom/geom.h> #include <sys/zvol.h> #include <sys/zvol_impl.h> +#include <cityhash.h> #include "zfs_namecheck.h" @@ -112,12 +113,6 @@ #define ZVOL_RW_READ_HELD RW_READ_HELD #endif -enum zvol_geom_state { - ZVOL_GEOM_UNINIT, - ZVOL_GEOM_STOPPED, - ZVOL_GEOM_RUNNING, -}; - struct zvol_state_os { #define zso_dev _zso_state._zso_dev #define zso_geom _zso_state._zso_geom @@ -131,9 +126,6 @@ struct zvol_state_os { /* volmode=geom */ struct zvol_state_geom { struct g_provider *zsg_provider; - struct bio_queue_head zsg_queue; - struct mtx zsg_queue_mtx; - enum zvol_geom_state zsg_state; } _zso_geom; } _zso_state; int zso_dying; @@ -143,8 +135,7 @@ static uint32_t zvol_minors; SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME"); -SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0, - "Expose as GEOM providers (1), device files (2) or neither"); + static boolean_t zpool_on_zvol = B_FALSE; SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0, "Allow zpools to use zvols as vdevs (DANGEROUS)"); @@ -169,7 +160,7 @@ static d_close_t zvol_cdev_close; static d_ioctl_t zvol_cdev_ioctl; static d_read_t zvol_cdev_read; static d_write_t zvol_cdev_write; -static d_strategy_t zvol_geom_bio_strategy; +static d_strategy_t zvol_cdev_bio_strategy; static d_kqfilter_t zvol_cdev_kqfilter; static struct cdevsw zvol_cdevsw = { @@ -181,7 +172,7 @@ static struct cdevsw zvol_cdevsw = { .d_ioctl = zvol_cdev_ioctl, .d_read = zvol_cdev_read, .d_write = zvol_cdev_write, - .d_strategy = zvol_geom_bio_strategy, + .d_strategy = zvol_cdev_bio_strategy, .d_kqfilter = zvol_cdev_kqfilter, }; @@ -205,13 +196,10 @@ DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); static int zvol_geom_open(struct g_provider *pp, int flag, int count); static int zvol_geom_close(struct g_provider *pp, int flag, int count); -static void zvol_geom_run(zvol_state_t *zv); -static void zvol_geom_destroy(zvol_state_t *zv); static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); -static void zvol_geom_worker(void *arg); static void zvol_geom_bio_start(struct bio *bp); static int zvol_geom_bio_getattr(struct bio *bp); -/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */ +static void zvol_geom_bio_strategy(struct bio *bp, boolean_t sync); /* * GEOM mode implementation @@ -237,25 +225,14 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) } retry: - rw_enter(&zvol_state_lock, ZVOL_RW_READER); - /* - * Obtain a copy of private under zvol_state_lock to make sure either - * the result of zvol free code setting private to NULL is observed, - * or the zv is protected from being freed because of the positive - * zv_open_count. 
- */ - zv = pp->private; - if (zv == NULL) { - rw_exit(&zvol_state_lock); - err = SET_ERROR(ENXIO); - goto out_locked; - } + zv = atomic_load_ptr(&pp->private); + if (zv == NULL) + return (SET_ERROR(ENXIO)); mutex_enter(&zv->zv_state_lock); if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) { - rw_exit(&zvol_state_lock); err = SET_ERROR(ENXIO); - goto out_zv_locked; + goto out_locked; } ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); @@ -268,8 +245,24 @@ retry: drop_suspend = B_TRUE; if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); + + /* + * Removal may happen while the locks are down, so + * we can't trust zv any longer; we have to start over. + */ + zv = atomic_load_ptr(&pp->private); + if (zv == NULL) + return (SET_ERROR(ENXIO)); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); + + if (zv->zv_zso->zso_dying || + zv->zv_flags & ZVOL_REMOVING) { + err = SET_ERROR(ENXIO); + goto out_locked; + } + /* Check to see if zv_suspend_lock is needed. */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); @@ -277,7 +270,6 @@ retry: } } } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -305,7 +297,7 @@ retry: if (drop_namespace) mutex_exit(&spa_namespace_lock); if (err) - goto out_zv_locked; + goto out_locked; pp->mediasize = zv->zv_volsize; pp->stripeoffset = 0; pp->stripesize = zv->zv_volblocksize; @@ -340,9 +332,8 @@ out_opened: zvol_last_close(zv); wakeup(zv); } -out_zv_locked: - mutex_exit(&zv->zv_state_lock); out_locked: + mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (err); @@ -356,12 +347,9 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) boolean_t drop_suspend = B_TRUE; int new_open_count; - rw_enter(&zvol_state_lock, ZVOL_RW_READER); - zv = pp->private; - if (zv == NULL) { - rw_exit(&zvol_state_lock); + zv = atomic_load_ptr(&pp->private); + if (zv == NULL) return (SET_ERROR(ENXIO)); - } mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_EXCL) { @@ -388,6 +376,15 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); + + /* + * Unlike in zvol_geom_open(), we don't check if + * removal started here, because we might be one of the + * openers that needs to be thrown out! If we're the + * last, we need to call zvol_last_close() below to + * finish cleanup. So, no special treatment for us. + */ + /* Check to see if zv_suspend_lock is needed. 
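The open-path rework above drops the global zvol_state_lock: the provider's private pointer is read with atomic_load_ptr(), and because the volume can be torn down while no lock is held, the state must be re-validated every time a lock is (re)acquired. A compressed userland model of that shape, using C11 atomics and pthreads in place of the kernel primitives (the real code additionally orders zv_suspend_lock against zv_state_lock, which is omitted here):

#include <stdatomic.h>
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <errno.h>

struct vol {
    pthread_mutex_t lock;
    bool dying;
};

/* Device private slot; the removal path stores NULL here first. */
static _Atomic(struct vol *) private_slot;

static int
open_vol(void)
{
    /* Lock-free snapshot: removal may already have cleared the slot. */
    struct vol *v = atomic_load(&private_slot);
    if (v == NULL)
        return (ENXIO);

    pthread_mutex_lock(&v->lock);
    /* Re-check under the lock; the snapshot may have gone stale. */
    if (v->dying) {
        pthread_mutex_unlock(&v->lock);
        return (ENXIO);
    }
    /* ... first-open setup would happen here ... */
    pthread_mutex_unlock(&v->lock);
    return (0);
}

int
main(void)
{
    struct vol v = { .lock = PTHREAD_MUTEX_INITIALIZER, .dying = false };
    atomic_store(&private_slot, &v);
    return (open_vol()); /* 0 here; ENXIO once removal clears the slot */
}

Note that the close path deliberately skips the same re-check: as the comment just above explains, a closing thread may be one of the openers that removal is waiting to throw out.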
*/ new_open_count = zv->zv_open_count - count; if (new_open_count != 0) { @@ -398,7 +395,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) } else { drop_suspend = B_FALSE; } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -419,37 +415,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) return (0); } -static void -zvol_geom_run(zvol_state_t *zv) -{ - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp = zsg->zsg_provider; - - ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); - - g_error_provider(pp, 0); - - kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0, - "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER)); -} - -static void -zvol_geom_destroy(zvol_state_t *zv) -{ - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp = zsg->zsg_provider; - - ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); - - g_topology_assert(); - - mutex_enter(&zv->zv_state_lock); - VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING); - mutex_exit(&zv->zv_state_lock); - zsg->zsg_provider = NULL; - g_wither_geom(pp->geom, ENXIO); -} - void zvol_wait_close(zvol_state_t *zv) { @@ -482,7 +447,7 @@ zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).", pp->name, acr, acw, ace)); - if (pp->private == NULL) { + if (atomic_load_ptr(&pp->private) == NULL) { if (acr <= 0 && acw <= 0 && ace <= 0) return (0); return (pp->error); @@ -517,43 +482,9 @@ zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) } static void -zvol_geom_worker(void *arg) -{ - zvol_state_t *zv = arg; - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct bio *bp; - - ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); - - thread_lock(curthread); - sched_prio(curthread, PRIBIO); - thread_unlock(curthread); - - for (;;) { - mtx_lock(&zsg->zsg_queue_mtx); - bp = bioq_takefirst(&zsg->zsg_queue); - if (bp == NULL) { - if (zsg->zsg_state == ZVOL_GEOM_STOPPED) { - zsg->zsg_state = ZVOL_GEOM_RUNNING; - wakeup(&zsg->zsg_state); - mtx_unlock(&zsg->zsg_queue_mtx); - kthread_exit(); - } - msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx, - PRIBIO | PDROP, "zvol:io", 0); - continue; - } - mtx_unlock(&zsg->zsg_queue_mtx); - zvol_geom_bio_strategy(bp); - } -} - -static void zvol_geom_bio_start(struct bio *bp) { zvol_state_t *zv = bp->bio_to->private; - struct zvol_state_geom *zsg; - boolean_t first; if (zv == NULL) { g_io_deliver(bp, ENXIO); @@ -565,18 +496,8 @@ zvol_geom_bio_start(struct bio *bp) return; } - if (!THREAD_CAN_SLEEP()) { - zsg = &zv->zv_zso->zso_geom; - mtx_lock(&zsg->zsg_queue_mtx); - first = (bioq_first(&zsg->zsg_queue) == NULL); - bioq_insert_tail(&zsg->zsg_queue, bp); - mtx_unlock(&zsg->zsg_queue_mtx); - if (first) - wakeup_one(&zsg->zsg_queue); - return; - } - - zvol_geom_bio_strategy(bp); + zvol_geom_bio_strategy(bp, !g_is_geom_thread(curthread) && + THREAD_CAN_SLEEP()); } static int @@ -660,9 +581,10 @@ zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn) } static void -zvol_geom_bio_strategy(struct bio *bp) +zvol_strategy_impl(zv_request_t *zvr) { zvol_state_t *zv; + struct bio *bp; uint64_t off, volsize; size_t resid; char *addr; @@ -673,11 +595,8 @@ zvol_geom_bio_strategy(struct bio *bp) boolean_t is_dumpified; boolean_t commit; - if (bp->bio_to) - zv = bp->bio_to->private; - else - zv = bp->bio_dev->si_drv2; - + bp = zvr->bio; + zv = zvr->zv; if (zv == NULL) { error = SET_ERROR(ENXIO); goto out; @@ -752,7 +671,7 @@ zvol_geom_bio_strategy(struct 
bio *bp) while (resid != 0 && off < volsize) { size_t size = MIN(resid, zvol_maxphys); if (doread) { - error = dmu_read(os, ZVOL_OBJ, off, size, addr, + error = dmu_read_by_dnode(zv->zv_dn, off, size, addr, DMU_READ_PREFETCH); } else { dmu_tx_t *tx = dmu_tx_create(os); @@ -761,7 +680,8 @@ zvol_geom_bio_strategy(struct bio *bp) if (error) { dmu_tx_abort(tx); } else { - dmu_write(os, ZVOL_OBJ, off, size, addr, tx); + dmu_write_by_dnode(zv->zv_dn, off, size, addr, + tx, DMU_READ_PREFETCH); zvol_log_write(zv, tx, off, size, commit); dmu_tx_commit(tx); } @@ -800,9 +720,9 @@ unlock: break; } - if (commit) { + if (error == 0 && commit) { commit: - zil_commit(zv->zv_zilog, ZVOL_OBJ); + error = zil_commit(zv->zv_zilog, ZVOL_OBJ); } resume: rw_exit(&zv->zv_suspend_lock); @@ -813,6 +733,63 @@ out: biofinish(bp, NULL, error); } +static void +zvol_strategy_task(void *arg) +{ + zv_request_task_t *task = arg; + + zvol_strategy_impl(&task->zvr); + zv_request_task_free(task); +} + +static void +zvol_geom_bio_strategy(struct bio *bp, boolean_t sync) +{ + zv_taskq_t *ztqs = &zvol_taskqs; + zv_request_task_t *task; + zvol_state_t *zv; + uint_t tq_idx; + uint_t taskq_hash; + int error; + + if (bp->bio_to) + zv = bp->bio_to->private; + else + zv = bp->bio_dev->si_drv2; + + if (zv == NULL) { + error = SET_ERROR(ENXIO); + if (bp->bio_to) + g_io_deliver(bp, error); + else + biofinish(bp, NULL, error); + return; + } + + zv_request_t zvr = { + .zv = zv, + .bio = bp, + }; + + if (sync || zvol_request_sync) { + zvol_strategy_impl(&zvr); + return; + } + + taskq_hash = cityhash3((uintptr_t)zv, curcpu, bp->bio_offset >> + ZVOL_TASKQ_OFFSET_SHIFT); + tq_idx = taskq_hash % ztqs->tqs_cnt; + task = zv_request_task_create(zvr); + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_strategy_task, task, + 0, &task->ent); +} + +static void +zvol_cdev_bio_strategy(struct bio *bp) +{ + zvol_geom_bio_strategy(bp, B_FALSE); +} + /* * Character device mode implementation */ @@ -850,7 +827,8 @@ zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag) if (bytes > volsize - zfs_uio_offset(&uio)) bytes = volsize - zfs_uio_offset(&uio); - error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); + error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes, + DMU_READ_PREFETCH); if (error) { /* Convert checksum errors into IO errors. */ if (error == ECKSUM) @@ -909,7 +887,8 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag) dmu_tx_abort(tx); break; } - error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); + error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx, + DMU_READ_PREFETCH); if (error == 0) zvol_log_write(zv, tx, off, bytes, commit); dmu_tx_commit(tx); @@ -920,8 +899,8 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag) zfs_rangelock_exit(lr); int64_t nwritten = start_resid - zfs_uio_resid(&uio); dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); - if (commit) - zil_commit(zv->zv_zilog, ZVOL_OBJ); + if (error == 0 && commit) + error = zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); return (error); @@ -935,25 +914,14 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) boolean_t drop_suspend = B_FALSE; retry: - rw_enter(&zvol_state_lock, ZVOL_RW_READER); - /* - * Obtain a copy of si_drv2 under zvol_state_lock to make sure either - * the result of zvol free code setting si_drv2 to NULL is observed, - * or the zv is protected from being freed because of the positive - * zv_open_count. 
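The dispatch logic in zvol_geom_bio_strategy() above fans asynchronous bios out across the zvol taskqs by hashing the volume pointer, the current CPU, and the bio offset shifted by ZVOL_TASKQ_OFFSET_SHIFT, then reducing the hash modulo the taskq count. A toy version of that selection; the mixing function is a stand-in for cityhash3(), and the shift and count values are placeholders:

#include <stdint.h>
#include <stdio.h>

#define TASKQ_COUNT 4 /* placeholder for ztqs->tqs_cnt */
#define OFFSET_SHIFT 20 /* placeholder for ZVOL_TASKQ_OFFSET_SHIFT */

/* Stand-in mixer; the kernel code uses cityhash3(). */
static uint64_t
mix3(uint64_t a, uint64_t b, uint64_t c)
{
    uint64_t h = a * 0x9e3779b97f4a7c15ULL;
    h ^= b + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
    h ^= c + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
    return (h);
}

static unsigned
pick_taskq(uintptr_t zv, unsigned cpu, uint64_t bio_offset)
{
    return ((unsigned)(mix3((uint64_t)zv, cpu, bio_offset >> OFFSET_SHIFT) %
        TASKQ_COUNT));
}

int
main(void)
{
    /* Nearby offsets from one CPU map to the same queue. */
    printf("%u %u\n", pick_taskq(0x1000, 0, 0x100000),
        pick_taskq(0x1000, 0, 0x100400));
    return (0);
}

Keying the hash on an offset region keeps sequential I/O to one region ordered on a single queue, which helps aggregation, while different volumes and CPUs still fan out across the available queues.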
- */ - zv = dev->si_drv2; - if (zv == NULL) { - rw_exit(&zvol_state_lock); - err = SET_ERROR(ENXIO); - goto out_locked; - } + zv = atomic_load_ptr(&dev->si_drv2); + if (zv == NULL) + return (SET_ERROR(ENXIO)); mutex_enter(&zv->zv_state_lock); - if (zv->zv_zso->zso_dying) { - rw_exit(&zvol_state_lock); + if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) { err = SET_ERROR(ENXIO); - goto out_zv_locked; + goto out_locked; } ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); @@ -968,6 +936,13 @@ retry: mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); + + if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { + /* Removal started while locks were down. */ + err = SET_ERROR(ENXIO); + goto out_locked; + } + /* Check to see if zv_suspend_lock is needed. */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); @@ -975,7 +950,6 @@ retry: } } } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -1003,7 +977,7 @@ retry: if (drop_namespace) mutex_exit(&spa_namespace_lock); if (err) - goto out_zv_locked; + goto out_locked; } ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -1030,9 +1004,8 @@ out_opened: zvol_last_close(zv); wakeup(zv); } -out_zv_locked: - mutex_exit(&zv->zv_state_lock); out_locked: + mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (err); @@ -1044,12 +1017,9 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; - rw_enter(&zvol_state_lock, ZVOL_RW_READER); - zv = dev->si_drv2; - if (zv == NULL) { - rw_exit(&zvol_state_lock); + zv = atomic_load_ptr(&dev->si_drv2); + if (zv == NULL) return (SET_ERROR(ENXIO)); - } mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_EXCL) { @@ -1074,6 +1044,15 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); + + /* + * Unlike in zvol_cdev_open(), we don't check if + * removal started here, because we might be one of the + * openers that needs to be thrown out! If we're the + * last, we need to call zvol_last_close() below to + * finish cleanup. So, no special treatment for us. + */ + /* Check to see if zv_suspend_lock is needed. 
*/ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); @@ -1083,7 +1062,6 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) } else { drop_suspend = B_FALSE; } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -1115,7 +1093,8 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, int error; boolean_t sync; - zv = dev->si_drv2; + zv = atomic_load_ptr(&dev->si_drv2); + ASSERT3P(zv, !=, NULL); error = 0; KASSERT(zv->zv_open_count > 0, @@ -1131,7 +1110,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, case DIOCGFLUSH: rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); if (zv->zv_zilog != NULL) - zil_commit(zv->zv_zilog, ZVOL_OBJ); + error = zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); break; case DIOCGDELETE: @@ -1166,7 +1145,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, } zfs_rangelock_exit(lr); if (sync) - zil_commit(zv->zv_zilog, ZVOL_OBJ); + error = zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); break; case DIOCGSTRIPESIZE: @@ -1176,6 +1155,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, *(off_t *)data = 0; break; case DIOCGATTR: { + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); spa_t *spa = dmu_objset_spa(zv->zv_objset); struct diocgattr_arg *arg = (struct diocgattr_arg *)data; uint64_t refd, avail, usedobjs, availobjs; @@ -1200,6 +1180,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, arg->value.off = refd / DEV_BSIZE; } else error = SET_ERROR(ENOIOCTL); + rw_exit(&zv->zv_suspend_lock); break; } case FIOSEEKHOLE: @@ -1210,10 +1191,12 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, hole = (cmd == FIOSEEKHOLE); noff = *off; + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX, RL_READER); error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); zfs_rangelock_exit(lr); + rw_exit(&zv->zv_suspend_lock); *off = noff; break; } @@ -1262,9 +1245,11 @@ zvol_os_is_zvol(const char *device) return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0); } -void +int zvol_os_rename_minor(zvol_state_t *zv, const char *newname) { + int error = 0; + ASSERT(RW_LOCK_HELD(&zvol_state_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -1318,57 +1303,159 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) args.mda_gid = GID_OPERATOR; args.mda_mode = 0640; args.mda_si_drv2 = zv; - if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname) - == 0) { + error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname); + if (error == 0) { dev->si_iosize_max = maxphys; zsd->zsd_cdev = dev; } } strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); dataset_kstats_rename(&zv->zv_kstat, newname); + + return (error); } /* - * Remove minor node for the specified volume. + * Allocate memory for a new zvol_state_t and setup the required + * request queue and generic disk structures for the block device. 
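A pattern repeats through these hunks: zil_commit() now returns an errno rather than void, and each caller (zvol_strategy_impl(), zvol_cdev_write(), the DIOCGFLUSH and DIOCGDELETE ioctls) commits only after a successful write and folds a commit failure into its own return value. The caller-side contract, reduced to a runnable sketch with a stub in place of zil_commit():

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Stub: a real log commit can now report EIO instead of returning void. */
static int
log_commit_stub(void)
{
    return (0);
}

static int
write_then_commit(bool commit)
{
    int error = 0;

    /* ... perform the write, setting error on failure ... */

    /* Commit only on success; a failed commit fails the whole request. */
    if (error == 0 && commit)
        error = log_commit_stub();

    return (error);
}

int
main(void)
{
    printf("%d\n", write_then_commit(true));
    return (0);
}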
*/ -void -zvol_os_free(zvol_state_t *zv) +static int +zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize, + zvol_state_t **zvp) { - ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); - ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); - ASSERT0(zv->zv_open_count); + zvol_state_t *zv; + uint64_t volmode; + int error; - ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); + error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE), + &volmode, NULL); + if (error) + return (error); - rw_destroy(&zv->zv_suspend_lock); - zfs_rangelock_fini(&zv->zv_rangelock); + if (volmode == ZFS_VOLMODE_DEFAULT) + volmode = zvol_volmode; + + if (volmode == ZFS_VOLMODE_NONE) + return (0); + zv = kmem_zalloc(sizeof (*zv), KM_SLEEP); + mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); + zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); + zv->zv_volmode = volmode; + zv->zv_volsize = volsize; + zv->zv_volblocksize = volblocksize; if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp __maybe_unused = zsg->zsg_provider; + struct g_provider *pp; + struct g_geom *gp; + + g_topology_lock(); + gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); + gp->start = zvol_geom_bio_start; + gp->access = zvol_geom_access; + pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); + pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; + pp->sectorsize = DEV_BSIZE; + pp->mediasize = 0; + pp->private = zv; + + zsg->zsg_provider = pp; + } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { + struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct cdev *dev; + struct make_dev_args args; + + make_dev_args_init(&args); + args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; + args.mda_devsw = &zvol_cdevsw; + args.mda_cr = NULL; + args.mda_uid = UID_ROOT; + args.mda_gid = GID_OPERATOR; + args.mda_mode = 0640; + args.mda_si_drv2 = zv; + error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name); + if (error) { + kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); + kmem_free(zv, sizeof (zvol_state_t)); + return (error); + } + + dev->si_iosize_max = maxphys; + zsd->zsd_cdev = dev; + knlist_init_sx(&zsd->zsd_selinfo.si_note, &zv->zv_state_lock); + } + (void) strlcpy(zv->zv_name, name, MAXPATHLEN); + rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); + zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); - ASSERT3P(pp->private, ==, NULL); + *zvp = zv; + return (error); +} + +/* + * Remove minor node for the specified volume. 
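One subtlety in the new zvol_alloc() above: for volmode=none it returns 0 without ever storing to *zvp, so success alone does not mean a device was created. Callers must start from a NULL pointer and test both results, which is exactly what zvol_os_create_minor() does below with its "if (error || zv == NULL)" check. The contract in miniature, with all names illustrative:

#include <errno.h>
#include <stdlib.h>
#include <stdio.h>

struct vol { int unused; };

/* 0 with *vp untouched means "nothing to expose"; caller pre-NULLs vp. */
static int
vol_alloc_sketch(int volmode, struct vol **vp)
{
    if (volmode == 0) /* "none": report success, expose nothing */
        return (0);
    struct vol *v = malloc(sizeof (*v));
    if (v == NULL)
        return (ENOMEM);
    *vp = v;
    return (0);
}

int
main(void)
{
    struct vol *v = NULL; /* must start NULL for the contract to work */
    int error = vol_alloc_sketch(0, &v);
    printf("%d %s\n", error, v == NULL ? "no device" : "device");
    return (0);
}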
+ */ +void +zvol_os_remove_minor(zvol_state_t *zv) +{ + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + ASSERT0(zv->zv_open_count); + ASSERT0(atomic_read(&zv->zv_suspend_ref)); + ASSERT(zv->zv_flags & ZVOL_REMOVING); + + struct zvol_state_os *zso = zv->zv_zso; + zv->zv_zso = NULL; + + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + atomic_store_ptr(&pp->private, NULL); + mutex_exit(&zv->zv_state_lock); g_topology_lock(); - zvol_geom_destroy(zv); + g_wither_geom(pp->geom, ENXIO); g_topology_unlock(); - mtx_destroy(&zsg->zsg_queue_mtx); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct zvol_state_dev *zsd = &zso->zso_dev; struct cdev *dev = zsd->zsd_cdev; + if (dev != NULL) + atomic_store_ptr(&dev->si_drv2, NULL); + mutex_exit(&zv->zv_state_lock); + if (dev != NULL) { - ASSERT3P(dev->si_drv2, ==, NULL); destroy_dev(dev); knlist_clear(&zsd->zsd_selinfo.si_note, 0); knlist_destroy(&zsd->zsd_selinfo.si_note); } } + kmem_free(zso, sizeof (struct zvol_state_os)); + + mutex_enter(&zv->zv_state_lock); +} + +void +zvol_os_free(zvol_state_t *zv) +{ + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT0(zv->zv_open_count); + ASSERT0P(zv->zv_zso); + + ASSERT0P(zv->zv_objset); + ASSERT0P(zv->zv_zilog); + ASSERT0P(zv->zv_dn); + + ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); + + rw_destroy(&zv->zv_suspend_lock); + zfs_rangelock_fini(&zv->zv_rangelock); + mutex_destroy(&zv->zv_state_lock); cv_destroy(&zv->zv_removing_cv); dataset_kstats_destroy(&zv->zv_kstat); - kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); zvol_minors--; } @@ -1379,14 +1466,17 @@ zvol_os_free(zvol_state_t *zv) int zvol_os_create_minor(const char *name) { - zvol_state_t *zv; + zvol_state_t *zv = NULL; objset_t *os; dmu_object_info_t *doi; uint64_t volsize; - uint64_t volmode, hash; + uint64_t hash, len; int error; bool replayed_zil = B_FALSE; + if (zvol_inhibit_dev) + return (0); + ZFS_LOG(1, "Creating ZVOL %s...", name); hash = zvol_name_hash(name); if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) { @@ -1412,78 +1502,22 @@ zvol_os_create_minor(const char *name) if (error) goto out_dmu_objset_disown; - error = dsl_prop_get_integer(name, - zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL); - if (error || volmode == ZFS_VOLMODE_DEFAULT) - volmode = zvol_volmode; - error = 0; + error = zvol_alloc(name, volsize, doi->doi_data_block_size, &zv); + if (error || zv == NULL) + goto out_dmu_objset_disown; - /* - * zvol_alloc equivalent ... 
- */ - zv = kmem_zalloc(sizeof (*zv), KM_SLEEP); zv->zv_hash = hash; - mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); - zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); - zv->zv_volmode = volmode; - if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp; - struct g_geom *gp; - - zsg->zsg_state = ZVOL_GEOM_UNINIT; - mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF); - - g_topology_lock(); - gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); - gp->start = zvol_geom_bio_start; - gp->access = zvol_geom_access; - pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); - pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; - pp->sectorsize = DEV_BSIZE; - pp->mediasize = 0; - pp->private = zv; - - zsg->zsg_provider = pp; - bioq_init(&zsg->zsg_queue); - } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; - struct cdev *dev; - struct make_dev_args args; - - make_dev_args_init(&args); - args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; - args.mda_devsw = &zvol_cdevsw; - args.mda_cr = NULL; - args.mda_uid = UID_ROOT; - args.mda_gid = GID_OPERATOR; - args.mda_mode = 0640; - args.mda_si_drv2 = zv; - if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name) - == 0) { - dev->si_iosize_max = maxphys; - zsd->zsd_cdev = dev; - knlist_init_sx(&zsd->zsd_selinfo.si_note, - &zv->zv_state_lock); - } - } - (void) strlcpy(zv->zv_name, name, MAXPATHLEN); - rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); - zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) zv->zv_flags |= ZVOL_RDONLY; - zv->zv_volblocksize = doi->doi_data_block_size; - zv->zv_volsize = volsize; zv->zv_objset = os; - ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); + ASSERT0P(zv->zv_kstat.dk_kstats); error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); if (error) goto out_dmu_objset_disown; - ASSERT3P(zv->zv_zilog, ==, NULL); + ASSERT0P(zv->zv_zilog); zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) @@ -1495,19 +1529,25 @@ zvol_os_create_minor(const char *name) zil_close(zv->zv_zilog); zv->zv_zilog = NULL; - /* TODO: prefetch for geom tasting */ + len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); + if (len > 0) { + dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_ASYNC_READ); + dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, + ZIO_PRIORITY_ASYNC_READ); + } zv->zv_objset = NULL; out_dmu_objset_disown: dmu_objset_disown(os, B_TRUE, FTAG); - if (error == 0 && volmode == ZFS_VOLMODE_GEOM) { - zvol_geom_run(zv); + if (error == 0 && zv && zv->zv_volmode == ZFS_VOLMODE_GEOM) { + g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0); + /* geom was locked inside zvol_alloc() function */ g_topology_unlock(); } out_doi: kmem_free(doi, sizeof (dmu_object_info_t)); - if (error == 0) { + if (error == 0 && zv) { rw_enter(&zvol_state_lock, RW_WRITER); zvol_insert(zv); zvol_minors++; @@ -1518,35 +1558,6 @@ out_doi: return (error); } -void -zvol_os_clear_private(zvol_state_t *zv) -{ - ASSERT(RW_LOCK_HELD(&zvol_state_lock)); - if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp = zsg->zsg_provider; - - if (pp->private == NULL) /* already cleared */ - return; - - mtx_lock(&zsg->zsg_queue_mtx); - zsg->zsg_state = 
ZVOL_GEOM_STOPPED; - pp->private = NULL; - wakeup_one(&zsg->zsg_queue); - while (zsg->zsg_state != ZVOL_GEOM_RUNNING) - msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx, - 0, "zvol:w", 0); - mtx_unlock(&zsg->zsg_queue_mtx); - ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); - } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; - struct cdev *dev = zsd->zsd_cdev; - - if (dev != NULL) - dev->si_drv2 = NULL; - } -} - int zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) { @@ -1584,13 +1595,21 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) void zvol_os_set_disk_ro(zvol_state_t *zv, int flags) { - // XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags); + /* + * The ro/rw ZVOL mode is switched using the zvol_set_ro() function, + * which enables/disables the ZVOL_RDONLY flag. No additional + * FreeBSD-specific actions are required when the readonly zfs + * property is switched. + */ } void zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) { - // XXX? set_capacity(zv->zv_zso->zvo_disk, capacity); + /* + * The ZVOL size/capacity is changed by the zvol_set_volsize() function. + * Leave this method empty, because all of the required work is done + * by the platform-specific zvol_os_update_volsize() function. + */ } /* @@ -1606,8 +1625,7 @@ zvol_busy(void) int zvol_init(void) { - zvol_init_impl(); - return (0); + return (zvol_init_impl()); } void diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c index ce9c9e39e60c..aac5f2ebbfd2 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c @@ -66,9 +66,9 @@ void __cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) { ASSERT(cvp); - ASSERT(name == NULL); + ASSERT0P(name); ASSERT(type == CV_DEFAULT); - ASSERT(arg == NULL); + ASSERT0P(arg); cvp->cv_magic = CV_MAGIC; init_waitqueue_head(&cvp->cv_event); @@ -83,7 +83,7 @@ static int cv_destroy_wakeup(kcondvar_t *cvp) { if (!atomic_read(&cvp->cv_waiters) && !atomic_read(&cvp->cv_refs)) { - ASSERT(cvp->cv_mutex == NULL); + ASSERT0P(cvp->cv_mutex); ASSERT(!waitqueue_active(&cvp->cv_event)); return (1); } @@ -104,7 +104,7 @@ __cv_destroy(kcondvar_t *cvp) while (cv_destroy_wakeup(cvp) == 0) wait_event_timeout(cvp->cv_destroy, cv_destroy_wakeup(cvp), 1); - ASSERT3P(cvp->cv_mutex, ==, NULL); + ASSERT0P(cvp->cv_mutex); ASSERT3S(atomic_read(&cvp->cv_refs), ==, 0); ASSERT3S(atomic_read(&cvp->cv_waiters), ==, 0); ASSERT3S(waitqueue_active(&cvp->cv_event), ==, 0); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c index 115c9460f3e6..89ca4a648b2f 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c @@ -555,29 +555,6 @@ ddi_copyin(const void *from, void *to, size_t len, int flags) } EXPORT_SYMBOL(ddi_copyin); -#define define_spl_param(type, fmt) \ -int \ -spl_param_get_##type(char *buf, zfs_kernel_param_t *kp) \ -{ \ - return (scnprintf(buf, PAGE_SIZE, fmt "\n", \ - *(type *)kp->arg)); \ -} \ -int \ -spl_param_set_##type(const char *buf, zfs_kernel_param_t *kp) \ -{ \ - return (kstrto##type(buf, 0, (type *)kp->arg)); \ -} \ -const struct kernel_param_ops spl_param_ops_##type = { \ - .set = spl_param_set_##type, \ - .get = spl_param_get_##type, \ -}; \ -EXPORT_SYMBOL(spl_param_get_##type); \ -EXPORT_SYMBOL(spl_param_set_##type); \ -EXPORT_SYMBOL(spl_param_ops_##type); - -define_spl_param(s64, "%lld")
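Stepping back, most of the mechanical churn in this commit is the conversion visible in the condvar hunks above: ASSERT3P(x, ==, NULL) becomes ASSERT0P(x), and ASSERT(x == 0) or ASSERT3U(x, ==, 0) becomes ASSERT0(x). A plausible userland reduction of the two macros; the in-tree versions additionally report file, line, and the offending value:

#include <assert.h>
#include <stddef.h>

/* Simplified stand-ins for the real SPL macros. */
#define ASSERT0(x) assert((x) == 0)
#define ASSERT0P(x) assert((x) == NULL)

int
main(void)
{
    void *handle = NULL;
    int refs = 0;

    ASSERT0P(handle); /* "this pointer must be NULL" */
    ASSERT0(refs);    /* "this count must be zero" */
    return (0);
}

The dedicated forms read more directly and let the macro report the failing expression without the call site spelling out the comparison.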
-define_spl_param(u64, "%llu") - /* * Post a uevent to userspace whenever a new vdev adds to the pool. It is * necessary to sync blkid information with udev, which zed daemon uses @@ -732,7 +709,7 @@ zone_get_hostid(void *zone) { uint32_t hostid; - ASSERT3P(zone, ==, NULL); + ASSERT0P(zone); if (spl_hostid != 0) return ((uint32_t)(spl_hostid & HW_HOSTID_MASK)); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c index fab80289b278..22e4ed169d03 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c @@ -296,7 +296,7 @@ spl_slab_free(spl_kmem_slab_t *sks, spl_kmem_cache_t *skc; ASSERT(sks->sks_magic == SKS_MAGIC); - ASSERT(sks->sks_ref == 0); + ASSERT0(sks->sks_ref); skc = sks->sks_cache; ASSERT(skc->skc_magic == SKC_MAGIC); @@ -598,7 +598,7 @@ static void spl_magazine_free(spl_kmem_magazine_t *skm) { ASSERT(skm->skm_magic == SKM_MAGIC); - ASSERT(skm->skm_avail == 0); + ASSERT0(skm->skm_avail); kfree(skm); } @@ -610,7 +610,7 @@ spl_magazine_create(spl_kmem_cache_t *skc) { int i = 0; - ASSERT((skc->skc_flags & KMC_SLAB) == 0); + ASSERT0((skc->skc_flags & KMC_SLAB)); skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) * num_possible_cpus(), kmem_flags_convert(KM_SLEEP)); @@ -640,7 +640,7 @@ spl_magazine_destroy(spl_kmem_cache_t *skc) spl_kmem_magazine_t *skm; int i = 0; - ASSERT((skc->skc_flags & KMC_SLAB) == 0); + ASSERT0((skc->skc_flags & KMC_SLAB)); for_each_possible_cpu(i) { skm = skc->skc_mag[i]; @@ -679,8 +679,8 @@ spl_kmem_cache_create(const char *name, size_t size, size_t align, /* * Unsupported flags */ - ASSERT(vmp == NULL); - ASSERT(reclaim == NULL); + ASSERT0P(vmp); + ASSERT0P(reclaim); might_sleep(); @@ -863,11 +863,11 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) * Validate there are no objects in use and free all the * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. 
*/ - ASSERT3U(skc->skc_slab_alloc, ==, 0); - ASSERT3U(skc->skc_obj_alloc, ==, 0); - ASSERT3U(skc->skc_slab_total, ==, 0); - ASSERT3U(skc->skc_obj_total, ==, 0); - ASSERT3U(skc->skc_obj_emergency, ==, 0); + ASSERT0(skc->skc_slab_alloc); + ASSERT0(skc->skc_obj_alloc); + ASSERT0(skc->skc_slab_total); + ASSERT0(skc->skc_obj_total); + ASSERT0(skc->skc_obj_emergency); ASSERT(list_empty(&skc->skc_complete_list)); ASSERT3U(percpu_counter_sum(&skc->skc_linux_alloc), ==, 0); @@ -986,7 +986,7 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT((skc->skc_flags & KMC_SLAB) == 0); + ASSERT0((skc->skc_flags & KMC_SLAB)); *obj = NULL; diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c index 337a4bcf76a0..9fe008cef868 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c @@ -302,13 +302,8 @@ spl_kmem_free_impl(const void *buf, size_t size) #ifdef DEBUG_KMEM /* Shim layer memory accounting */ -#ifdef HAVE_ATOMIC64_T atomic64_t kmem_alloc_used = ATOMIC64_INIT(0); -unsigned long long kmem_alloc_max = 0; -#else /* HAVE_ATOMIC64_T */ -atomic_t kmem_alloc_used = ATOMIC_INIT(0); -unsigned long long kmem_alloc_max = 0; -#endif /* HAVE_ATOMIC64_T */ +uint64_t kmem_alloc_max = 0; EXPORT_SYMBOL(kmem_alloc_used); EXPORT_SYMBOL(kmem_alloc_max); @@ -320,9 +315,9 @@ spl_kmem_alloc_debug(size_t size, int flags, int node) ptr = spl_kmem_alloc_impl(size, flags, node); if (ptr) { - kmem_alloc_used_add(size); - if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) - kmem_alloc_max = kmem_alloc_used_read(); + atomic64_add(size, &kmem_alloc_used); + if (unlikely(atomic64_read(&kmem_alloc_used) > kmem_alloc_max)) + kmem_alloc_max = atomic64_read(&kmem_alloc_used); } return (ptr); @@ -331,7 +326,7 @@ spl_kmem_alloc_debug(size_t size, int flags, int node) inline void spl_kmem_free_debug(const void *ptr, size_t size) { - kmem_alloc_used_sub(size); + atomic64_sub(size, &kmem_alloc_used); spl_kmem_free_impl(ptr, size); } @@ -595,7 +590,7 @@ spl_kmem_init(void) { #ifdef DEBUG_KMEM - kmem_alloc_used_set(0); + atomic64_set(&kmem_alloc_used, 0); @@ -617,9 +612,10 @@ spl_kmem_fini(void) * at that address to aid in debugging. Performance is not * a serious concern here since it is module unload time. */ - if (kmem_alloc_used_read() != 0) + if (atomic64_read(&kmem_alloc_used) != 0) printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n", - (unsigned long)kmem_alloc_used_read(), kmem_alloc_max); + (unsigned long)atomic64_read(&kmem_alloc_used), + kmem_alloc_max); #ifdef DEBUG_KMEM_TRACKING spl_kmem_fini_tracking(&kmem_list, &kmem_lock); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c index 0a6125755118..02c5b42bc4a0 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c @@ -395,7 +395,7 @@ kstat_delete_module(kstat_module_t *module) kstat_module_t *parent = module->ksm_parent; - char *p = module->ksm_name, *frag; + char *p = module->ksm_name, *frag = NULL; while (p != NULL && (frag = strsep(&p, "/"))) {} remove_proc_entry(frag, parent ? 
parent->ksm_proc : proc_spl_kstat); @@ -420,7 +420,7 @@ kstat_create_module(char *name) (void) strlcpy(buf, name, KSTAT_STRLEN); - parent = NULL; + module = parent = NULL; char *p = buf, *frag; while ((frag = strsep(&p, "/")) != NULL) { module = kstat_find_module(buf); @@ -454,7 +454,6 @@ kstat_create_module(char *name) } return (module); - } static int @@ -542,7 +541,7 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, kstat_t *ksp; ASSERT(ks_module); - ASSERT(ks_instance == 0); + ASSERT0(ks_instance); ASSERT(ks_name); if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO)) diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c index 4ed0deedd5b9..8cdd5fc5cfe5 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c @@ -82,11 +82,7 @@ proc_domemused(CONST_CTL_TABLE *table, int write, if (write) { *ppos += *lenp; } else { -#ifdef HAVE_ATOMIC64_T val = atomic64_read((atomic64_t *)table->data); -#else - val = atomic_read((atomic_t *)table->data); -#endif /* HAVE_ATOMIC64_T */ rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos); } @@ -315,18 +311,14 @@ static struct ctl_table spl_kmem_table[] = { { .procname = "kmem_used", .data = &kmem_alloc_used, -#ifdef HAVE_ATOMIC64_T .maxlen = sizeof (atomic64_t), -#else - .maxlen = sizeof (atomic_t), -#endif /* HAVE_ATOMIC64_T */ .mode = 0444, .proc_handler = &proc_domemused, }, { .procname = "kmem_max", .data = &kmem_alloc_max, - .maxlen = sizeof (unsigned long), + .maxlen = sizeof (uint64_t), .extra1 = &table_min, .extra2 = &table_max, .mode = 0444, diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c index d5b42fdfaf20..092f090d934b 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c @@ -37,6 +37,12 @@ #include <sys/atomic.h> #include <sys/kstat.h> #include <linux/cpuhotplug.h> +#include <linux/mod_compat.h> + +/* Linux 6.2 renamed timer_delete_sync(); point it at its old name for those. */ +#ifndef HAVE_TIMER_DELETE_SYNC +#define timer_delete_sync(t) del_timer_sync(t) +#endif typedef struct taskq_kstats { /* static values, for completeness */ @@ -633,7 +639,7 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id) */ if (timer_pending(&t->tqent_timer)) { spin_unlock_irqrestore(&tq->tq_lock, flags); - del_timer_sync(&t->tqent_timer); + timer_delete_sync(&t->tqent_timer); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); } @@ -1646,18 +1652,8 @@ spl_taskq_kstat_fini(void) static unsigned int spl_taskq_kick = 0; -/* - * 2.6.36 API Change - * module_param_cb is introduced to take kernel_param_ops and - * module_param_call is marked as obsolete. Also set and get operations - * were changed to take a 'const struct kernel_param *'. 
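The kstat_delete_module() hunk above initializes frag to NULL for a reason: the strsep() loop has an empty body and exists only to leave frag pointing at the last '/'-separated component of the module name, so without the initialization a never-entered loop would hand an uninitialized pointer to remove_proc_entry(). The same idiom in isolation:

#define _DEFAULT_SOURCE /* strsep(3) is a BSD/glibc extension */
#include <stdio.h>
#include <string.h>

int
main(void)
{
    char name[] = "zfs/mypool/objset-0x36";
    char *p = name, *frag = NULL;

    /* Empty body: after the loop, frag points at the last component. */
    while (p != NULL && (frag = strsep(&p, "/")) != NULL)
        ;

    printf("%s\n", frag ? frag : "(empty)"); /* objset-0x36 */
    return (0);
}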
- */ static int -#ifdef module_param_cb -param_set_taskq_kick(const char *val, const struct kernel_param *kp) -#else -param_set_taskq_kick(const char *val, struct kernel_param *kp) -#endif +param_set_taskq_kick(const char *val, zfs_kernel_param_t *kp) { int ret; taskq_t *tq = NULL; @@ -1687,16 +1683,8 @@ param_set_taskq_kick(const char *val, struct kernel_param *kp) return (ret); } -#ifdef module_param_cb -static const struct kernel_param_ops param_ops_taskq_kick = { - .set = param_set_taskq_kick, - .get = param_get_uint, -}; -module_param_cb(spl_taskq_kick, ¶m_ops_taskq_kick, &spl_taskq_kick, 0644); -#else module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint, &spl_taskq_kick, 0644); -#endif MODULE_PARM_DESC(spl_taskq_kick, "Write nonzero to kick stuck taskqs to spawn more threads"); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c index 1398483a3ac8..8f5c73b13df5 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c @@ -28,6 +28,7 @@ #include <sys/kmem.h> #include <sys/tsd.h> #include <sys/string.h> +#include <sys/misc.h> /* * Thread interfaces @@ -79,7 +80,7 @@ __thread_create(caddr_t stk, size_t stksize, thread_func_t func, /* Option pp is simply ignored */ /* Variable stack size unsupported */ - ASSERT(stk == NULL); + ASSERT0P(stk); tp = kmem_alloc(sizeof (thread_priv_t), KM_PUSHPAGE); if (tp == NULL) @@ -197,3 +198,14 @@ issig(void) } EXPORT_SYMBOL(issig); + +/* + * Check if the current thread is a memory reclaim thread. + * Returns true if current thread is kswapd. + */ +int +current_is_reclaim_thread(void) +{ + return (current_is_kswapd()); +} +EXPORT_SYMBOL(current_is_reclaim_thread); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c b/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c index 34a61bef7d4f..2e8cedf0dc87 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c @@ -161,7 +161,7 @@ tsd_hash_add(tsd_hash_table_t *table, uint_t key, pid_t pid, void *value) ulong_t hash; int rc = 0; - ASSERT3P(tsd_hash_search(table, key, pid), ==, NULL); + ASSERT0P(tsd_hash_search(table, key, pid)); /* New entry allocate structure, set value, and add to hash */ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c index 8e4d271976ea..8a8316f63c48 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c @@ -256,10 +256,6 @@ abd_unmark_zfs_page(struct page *page) #ifndef CONFIG_HIGHMEM -#ifndef __GFP_RECLAIM -#define __GFP_RECLAIM __GFP_WAIT -#endif - /* * The goal is to minimize fragmentation by preferentially populating ABDs * with higher order compound pages from a single zone. Allocation size is @@ -867,9 +863,9 @@ abd_iter_advance(struct abd_iter *aiter, size_t amount) * Ensure that last chunk is not in use. abd_iterate_*() must clear * this state (directly or abd_iter_unmap()) before advancing. 
*/ - ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0P(aiter->iter_mapaddr); ASSERT0(aiter->iter_mapsize); - ASSERT3P(aiter->iter_page, ==, NULL); + ASSERT0P(aiter->iter_page); ASSERT0(aiter->iter_page_doff); ASSERT0(aiter->iter_page_dsize); @@ -901,7 +897,7 @@ abd_iter_map(struct abd_iter *aiter) void *paddr; size_t offset = 0; - ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0P(aiter->iter_mapaddr); ASSERT0(aiter->iter_mapsize); /* There's nothing left to iterate over, so do nothing */ @@ -1340,6 +1336,8 @@ abd_bio_map_off(struct bio *bio, abd_t *abd, return (io_size); } +EXPORT_SYMBOL(abd_alloc_from_pages); + /* Tunable Parameters */ module_param(zfs_abd_scatter_enabled, int, 0644); MODULE_PARM_DESC(zfs_abd_scatter_enabled, diff --git a/sys/contrib/openzfs/module/os/linux/zfs/policy.c b/sys/contrib/openzfs/module/os/linux/zfs/policy.c index c50ffcfe6992..4396a5d9e076 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/policy.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/policy.c @@ -24,6 +24,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2013, Joyent, Inc. All rights reserved. * Copyright (C) 2016 Lawrence Livermore National Security, LLC. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> * * For Linux the vast majority of this enforcement is already handled via * the standard Linux VFS permission checks. However certain administrative @@ -35,28 +36,32 @@ #include <linux/security.h> #include <linux/vfs_compat.h> -/* - * The passed credentials cannot be directly verified because Linux only - * provides and interface to check the *current* process credentials. In - * order to handle this the capable() test is only run when the passed - * credentials match the current process credentials or the kcred. In - * all other cases this function must fail and return the passed err. - */ static int priv_policy_ns(const cred_t *cr, int capability, int err, struct user_namespace *ns) { - if (cr != CRED() && (cr != kcred)) - return (err); + /* + * The passed credentials cannot be directly verified because Linux + * only provides an interface to check the *current* process + * credentials. In order to handle this we check if the passed in + * creds match the current process credentials or the kcred. If not, + * we swap the passed credentials into the current task, perform the + * check, and then revert it before returning. + */ + const cred_t *old = + (cr != CRED() && cr != kcred) ? override_creds(cr) : NULL; #if defined(CONFIG_USER_NS) - if (!(ns ? ns_capable(ns, capability) : capable(capability))) + if (ns ? ns_capable(ns, capability) : capable(capability)) #else - if (!capable(capability)) + if (capable(capability)) #endif - return (err); + err = 0; - return (0); + if (old) + revert_creds(old); + + return (err); } static int @@ -249,19 +254,6 @@ secpolicy_zfs(const cred_t *cr) return (priv_policy(cr, CAP_SYS_ADMIN, EACCES)); } -/* - * Equivalent to secpolicy_zfs(), but works even if the cred_t is not that of - * the current process. Takes both cred_t and proc_t so that this can work - * easily on all platforms. 
- */ -int -secpolicy_zfs_proc(const cred_t *cr, proc_t *proc) -{ - if (!has_capability(proc, CAP_SYS_ADMIN)) - return (EACCES); - return (0); -} - void secpolicy_setid_clear(vattr_t *vap, cred_t *cr) { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c index 29e54b39aa1a..1bd3500e9f66 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c @@ -25,7 +25,7 @@ * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>. * LLNL-CODE-403049. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. - * Copyright (c) 2023, 2024, Klara Inc. + * Copyright (c) 2023, 2024, 2025, Klara, Inc. */ #include <sys/zfs_context.h> @@ -471,13 +471,17 @@ vdev_disk_close(vdev_t *v) if (v->vdev_reopening || vd == NULL) return; + rw_enter(&vd->vd_lock, RW_WRITER); + if (vd->vd_bdh != NULL) vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), zfs_vdev_holder); + v->vdev_tsd = NULL; + + rw_exit(&vd->vd_lock); rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); - v->vdev_tsd = NULL; } /* @@ -552,7 +556,7 @@ vdev_bio_associate_blkg(struct bio *bio) #endif ASSERT3P(q, !=, NULL); - ASSERT3P(bio->bi_blkg, ==, NULL); + ASSERT0P(bio->bi_blkg); if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) bio->bi_blkg = q->root_blkg; @@ -574,7 +578,7 @@ vdev_bio_set_dev(struct bio *bio, struct block_device *bdev) bio->bi_bdev = bdev; ASSERT3P(q, !=, NULL); - ASSERT3P(bio->bi_blkg, ==, NULL); + ASSERT0P(bio->bi_blkg); if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) bio->bi_blkg = q->root_blkg; @@ -614,7 +618,7 @@ static inline uint_t vdev_bio_max_segs(struct block_device *bdev) { /* - * Smallest of the device max segs and the tuneable max segs. Minimum + * Smallest of the device max segs and the tunable max segs. Minimum * 4, so there's room to finish split pages if they come up. */ const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev)); @@ -806,7 +810,7 @@ vbio_completion(struct bio *bio) * here; instead we stash vbio on the zio and take care of it in the * done callback. */ - ASSERT3P(zio->io_bio, ==, NULL); + ASSERT0P(zio->io_bio); zio->io_bio = vbio; zio_delay_interrupt(zio); @@ -966,234 +970,6 @@ vdev_disk_io_rw(zio_t *zio) return (0); } -/* ========== */ - -/* - * This is the classic, battle-tested BIO submission code. Until we're totally - * sure that the new code is safe and correct in all cases, this will remain - * available and can be enabled by setting zfs_vdev_disk_classic=1 at module - * load time. - * - * These functions have been renamed to vdev_classic_* to make it clear what - * they belong to, but their implementations are unchanged. - */ - -/* - * Virtual device vector for disks. 
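The new priv_policy_ns() logic at the top of the policy.c hunk above is scattered across +/- lines; consolidated, it reads as below. This is a kernel-style sketch, not standalone code, with the CONFIG_USER_NS branch collapsed into plain capable():

/* Consolidated view of the reworked check; sketch only. */
static int
priv_policy_sketch(const cred_t *cr, int capability, int err)
{
    /*
     * capable() inspects only the current task, so foreign credentials
     * are temporarily installed with override_creds() and restored
     * with revert_creds() once the check is done.
     */
    const cred_t *old =
        (cr != CRED() && cr != kcred) ? override_creds(cr) : NULL;

    if (capable(capability))
        err = 0;

    if (old)
        revert_creds(old);

    return (err);
}

Compared with the old behavior of failing outright whenever cr was neither CRED() nor kcred, this makes checks against a caller-supplied credential meaningful, which is why the Linux secpolicy_zfs_proc() shown above could be dropped.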
- */ -typedef struct dio_request { - zio_t *dr_zio; /* Parent ZIO */ - atomic_t dr_ref; /* References */ - int dr_error; /* Bio error */ - int dr_bio_count; /* Count of bio's */ - struct bio *dr_bio[]; /* Attached bio's */ -} dio_request_t; - -static dio_request_t * -vdev_classic_dio_alloc(int bio_count) -{ - dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + - sizeof (struct bio *) * bio_count, KM_SLEEP); - atomic_set(&dr->dr_ref, 0); - dr->dr_bio_count = bio_count; - dr->dr_error = 0; - - for (int i = 0; i < dr->dr_bio_count; i++) - dr->dr_bio[i] = NULL; - - return (dr); -} - -static void -vdev_classic_dio_free(dio_request_t *dr) -{ - int i; - - for (i = 0; i < dr->dr_bio_count; i++) - if (dr->dr_bio[i]) - bio_put(dr->dr_bio[i]); - - kmem_free(dr, sizeof (dio_request_t) + - sizeof (struct bio *) * dr->dr_bio_count); -} - -static void -vdev_classic_dio_get(dio_request_t *dr) -{ - atomic_inc(&dr->dr_ref); -} - -static void -vdev_classic_dio_put(dio_request_t *dr) -{ - int rc = atomic_dec_return(&dr->dr_ref); - - /* - * Free the dio_request when the last reference is dropped and - * ensure zio_interpret is called only once with the correct zio - */ - if (rc == 0) { - zio_t *zio = dr->dr_zio; - int error = dr->dr_error; - - vdev_classic_dio_free(dr); - - if (zio) { - zio->io_error = error; - ASSERT3S(zio->io_error, >=, 0); - if (zio->io_error) - vdev_disk_error(zio); - - zio_delay_interrupt(zio); - } - } -} - -static void -vdev_classic_physio_completion(struct bio *bio) -{ - dio_request_t *dr = bio->bi_private; - - if (dr->dr_error == 0) { - dr->dr_error = bi_status_to_errno(bio->bi_status); - } - - /* Drop reference acquired by vdev_classic_physio */ - vdev_classic_dio_put(dr); -} - -static inline unsigned int -vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) -{ - unsigned long nr_segs = abd_nr_pages_off(zio->io_abd, - bio_size, abd_offset); - -#ifdef HAVE_BIO_MAX_SEGS - return (bio_max_segs(nr_segs)); -#else - return (MIN(nr_segs, BIO_MAX_PAGES)); -#endif -} - -static int -vdev_classic_physio(zio_t *zio) -{ - vdev_t *v = zio->io_vd; - vdev_disk_t *vd = v->vdev_tsd; - struct block_device *bdev = BDH_BDEV(vd->vd_bdh); - size_t io_size = zio->io_size; - uint64_t io_offset = zio->io_offset; - int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE; - int flags = 0; - - dio_request_t *dr; - uint64_t abd_offset; - uint64_t bio_offset; - int bio_size; - int bio_count = 16; - int error = 0; - struct blk_plug plug; - unsigned short nr_vecs; - - /* - * Accessing outside the block device is never allowed. - */ - if (io_offset + io_size > bdev_capacity(bdev)) { - vdev_dbgmsg(zio->io_vd, - "Illegal access %llu size %llu, device size %llu", - (u_longlong_t)io_offset, - (u_longlong_t)io_size, - (u_longlong_t)bdev_capacity(bdev)); - return (SET_ERROR(EIO)); - } - -retry: - dr = vdev_classic_dio_alloc(bio_count); - - if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && - zio->io_vd->vdev_failfast == B_TRUE) { - bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, - zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); - } - - dr->dr_zio = zio; - - /* - * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which - * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio - * can cover at least 128KB and at most 1MB. When the required number - * of iovec's exceeds this, we are forced to break the IO in multiple - * bio's and wait for them all to complete. This is likely if the - * recordsize property is increased beyond 1MB. 
The default - * bio_count=16 should typically accommodate the maximum-size zio of - * 16MB. - */ - - abd_offset = 0; - bio_offset = io_offset; - bio_size = io_size; - for (int i = 0; i <= dr->dr_bio_count; i++) { - - /* Finished constructing bio's for given buffer */ - if (bio_size <= 0) - break; - - /* - * If additional bio's are required, we have to retry, but - * this should be rare - see the comment above. - */ - if (dr->dr_bio_count == i) { - vdev_classic_dio_free(dr); - bio_count *= 2; - goto retry; - } - - nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset); - dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); - if (unlikely(dr->dr_bio[i] == NULL)) { - vdev_classic_dio_free(dr); - return (SET_ERROR(ENOMEM)); - } - - /* Matching put called by vdev_classic_physio_completion */ - vdev_classic_dio_get(dr); - - BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; - dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion; - dr->dr_bio[i]->bi_private = dr; - bio_set_op_attrs(dr->dr_bio[i], rw, flags); - - /* Remaining size is returned to become the new size */ - bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd, - bio_size, abd_offset); - - /* Advance in buffer and construct another bio if needed */ - abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); - bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); - } - - /* Extra reference to protect dio_request during vdev_submit_bio */ - vdev_classic_dio_get(dr); - - if (dr->dr_bio_count > 1) - blk_start_plug(&plug); - - /* Submit all bio's associated with this dio */ - for (int i = 0; i < dr->dr_bio_count; i++) { - if (dr->dr_bio[i]) - vdev_submit_bio(dr->dr_bio[i]); - } - - if (dr->dr_bio_count > 1) - blk_finish_plug(&plug); - - vdev_classic_dio_put(dr); - - return (error); -} - -/* ========== */ - static void vdev_disk_io_flush_completion(struct bio *bio) { @@ -1339,8 +1115,6 @@ vdev_disk_io_trim(zio_t *zio) return (0); } -int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL; - static void vdev_disk_io_start(zio_t *zio) { @@ -1413,7 +1187,7 @@ vdev_disk_io_start(zio_t *zio) case ZIO_TYPE_READ: case ZIO_TYPE_WRITE: zio->io_target_timestamp = zio_handle_io_delay(zio); - error = vdev_disk_io_rw_fn(zio); + error = vdev_disk_io_rw(zio); rw_exit(&vd->vd_lock); if (error) { zio->io_error = error; @@ -1508,53 +1282,13 @@ vdev_disk_rele(vdev_t *vd) /* XXX: Implement me as a vnode rele for the device */ } -/* - * BIO submission method. See comment above about vdev_classic. - * Set zfs_vdev_disk_classic=0 for new, =1 for classic - */ -static uint_t zfs_vdev_disk_classic = 0; /* default new */ - -/* Set submission function from module parameter */ -static int -vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp) -{ - int err = param_set_uint(buf, kp); - if (err < 0) - return (SET_ERROR(err)); - - vdev_disk_io_rw_fn = - zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw; - - printk(KERN_INFO "ZFS: forcing %s BIO submission\n", - zfs_vdev_disk_classic ? "classic" : "new"); - - return (0); -} - -/* - * At first use vdev use, set the submission function from the default value if - * it hasn't been set already. - */ -static int -vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) -{ - (void) spa; - (void) nv; - (void) tsd; - - if (vdev_disk_io_rw_fn == NULL) - vdev_disk_io_rw_fn = zfs_vdev_disk_classic ? 
- vdev_classic_physio : vdev_disk_io_rw; - - return (0); -} - vdev_ops_t vdev_disk_ops = { - .vdev_op_init = vdev_disk_init, + .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_disk_open, .vdev_op_close = vdev_disk_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, + .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_disk_io_start, @@ -1575,29 +1309,6 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post }; -/* - * The zfs_vdev_scheduler module option has been deprecated. Setting this - * value no longer has any effect. It has not yet been entirely removed - * to allow the module to be loaded if this option is specified in the - * /etc/modprobe.d/zfs.conf file. The following warning will be logged. - */ -static int -param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp) -{ - int error = param_set_charp(val, kp); - if (error == 0) { - printk(KERN_INFO "The 'zfs_vdev_scheduler' module option " - "is not supported.\n"); - } - - return (error); -} - -static const char *zfs_vdev_scheduler = "unused"; -module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, - param_get_charp, &zfs_vdev_scheduler, 0644); -MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); - int param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) { @@ -1646,7 +1357,3 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW, "Maximum number of data segments to add to an IO request (min 4)"); - -ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic, - vdev_disk_param_set_classic, param_get_uint, ZMOD_RD, - "Use classic BIO submission method"); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c index 1b169122f25b..daa4b5776837 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c @@ -1900,7 +1900,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, if (!(flag & IS_ROOT_NODE) && (dzp->z_pflags & ZFS_INHERIT_ACE) && !(dzp->z_pflags & ZFS_XATTR)) { - VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE, + VERIFY0(zfs_acl_node_read(dzp, B_TRUE, &paclp, B_FALSE)); acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, vap->va_mode, paclp, acl_ids->z_mode, &need_chmod); @@ -2204,8 +2204,8 @@ top: } error = zfs_aclset_common(zp, aclp, cr, tx); - ASSERT(error == 0); - ASSERT(zp->z_acl_cached == NULL); + ASSERT0(error); + ASSERT0P(zp->z_acl_cached); zp->z_acl_cached = aclp; if (fuid_dirtied) diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c index 84b25cb2c5ac..fb4de50480a3 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c @@ -494,9 +494,9 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, if (!creation) now = current_time(ip); zp = ITOZ(ip); - ASSERT3P(zp->z_dirlocks, ==, NULL); - ASSERT3P(zp->z_acl_cached, ==, NULL); - ASSERT3P(zp->z_xattr_cached, ==, NULL); + ASSERT0P(zp->z_dirlocks); + ASSERT0P(zp->z_acl_cached); + ASSERT0P(zp->z_xattr_cached); zp->z_id = id; zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; @@ -511,8 +511,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, zp->z_pflags = 0; zp->z_mode = 0; zp->z_sync_cnt = 0; - zp->z_sync_writes_cnt = 0; - 
zp->z_async_writes_cnt = 0; ip->i_generation = 0; ip->i_ino = id; ip->i_mode = (S_IFDIR | S_IRWXUGO); @@ -592,7 +590,7 @@ zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id, int zfsctl_create(zfsvfs_t *zfsvfs) { - ASSERT(zfsvfs->z_ctldir == NULL); + ASSERT0P(zfsvfs->z_ctldir); zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT, &zpl_fops_root, &zpl_ops_root, 0); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c index 2f935bb3fc8c..e8de536606e2 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c @@ -463,7 +463,7 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) zfsvfs_t *zfsvfs = ZTOZSB(zp); ASSERT(zp->z_unlinked); - ASSERT(ZTOI(zp)->i_nlink == 0); + ASSERT0(ZTOI(zp)->i_nlink); VERIFY3U(0, ==, zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); @@ -662,8 +662,8 @@ zfs_rmnode(znode_t *zp) uint64_t links; int error; - ASSERT(ZTOI(zp)->i_nlink == 0); - ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0); + ASSERT0(ZTOI(zp)->i_nlink); + ASSERT0(atomic_read(&ZTOI(zp)->i_count)); /* * If this is an attribute directory, purge its contents. @@ -710,7 +710,7 @@ zfs_rmnode(znode_t *zp) &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); - ASSERT(error == 0); + ASSERT0(error); } acl_obj = zfs_external_acl(zp); @@ -744,12 +744,12 @@ zfs_rmnode(znode_t *zp) } if (xzp) { - ASSERT(error == 0); + ASSERT0(error); mutex_enter(&xzp->z_lock); xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ clear_nlink(ZTOI(xzp)); /* no more links to it */ links = 0; - VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + VERIFY0(sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &links, sizeof (links), tx)); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); @@ -872,7 +872,7 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) ctime); } error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); + ASSERT0(error); mutex_exit(&zp->z_lock); @@ -894,7 +894,7 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); + ASSERT0(error); mutex_exit(&dzp->z_lock); return (0); @@ -986,7 +986,7 @@ zfs_drop_nlink_locked(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, sizeof (links)); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - ASSERT3U(error, ==, 0); + ASSERT0(error); if (unlinkedp != NULL) *unlinkedp = unlinked; @@ -1058,7 +1058,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, /* The only error is !zfs_dirempty() and we checked earlier. 
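A large fraction of this commit is this mechanical conversion: open-coded comparisons such as ASSERT(error == 0) and ASSERT3P(p, ==, NULL) become ASSERT0(), ASSERT0P(), VERIFY0() and VERIFY0P(). The wrappers have roughly this shape (illustrative only; the authoritative definitions live in the ZFS debug headers):

/* Illustrative sketch; see the ZFS debug headers for the real macros. */
#define	ASSERT0(x)	ASSERT3S((x), ==, 0)	/* scalar must be zero */
#define	ASSERT0P(p)	ASSERT3P((p), ==, NULL)	/* pointer must be NULL */
#define	VERIFY0(x)	VERIFY3S((x), ==, 0)	/* enforced in all builds */
#define	VERIFY0P(p)	VERIFY3P((p), ==, NULL)

The distinction matters at call sites with side effects: ASSERT-family checks compile out of production builds, so expressions that must always run, such as the sa_update() and zap_remove_int() calls in these hunks, are wrapped in VERIFY0(), never ASSERT0().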
*/ error = zfs_drop_nlink_locked(zp, tx, &unlinked); - ASSERT3U(error, ==, 0); + ASSERT0(error); mutex_exit(&zp->z_lock); } else { error = zfs_dropname(dl, zp, dzp, tx, flag); @@ -1083,7 +1083,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); + ASSERT0(error); mutex_exit(&dzp->z_lock); if (unlinkedp != NULL) @@ -1167,7 +1167,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr) ASSERT(error == 0 && parent == zp->z_id); #endif - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, + VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, sizeof (xzp->z_id), tx)); if (!zp->z_unlinked) diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c index d193eb80dca2..3fdcdbac6f68 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c @@ -115,8 +115,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) */ int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off, - ssize_t *resid) + uint8_t ashift, ssize_t *resid) { + (void) ashift; ssize_t rc; rc = kernel_write(fp, buf, count, &off); @@ -260,24 +261,12 @@ zfs_file_fsync(zfs_file_t *filp, int flags) { int datasync = 0; int error; - int fstrans; if (flags & O_DSYNC) datasync = 1; - /* - * May enter XFS which generates a warning when PF_FSTRANS is set. - * To avoid this the flag is cleared over vfs_sync() and then reset. - */ - fstrans = __spl_pf_fstrans_check(); - if (fstrans) - current->flags &= ~(__SPL_PF_FSTRANS); - error = -vfs_fsync(filp, datasync); - if (fstrans) - current->flags |= __SPL_PF_FSTRANS; - return (error); } @@ -292,14 +281,6 @@ int zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len) { /* - * May enter XFS which generates a warning when PF_FSTRANS is set. - * To avoid this the flag is cleared over vfs_sync() and then reset. - */ - int fstrans = __spl_pf_fstrans_check(); - if (fstrans) - current->flags &= ~(__SPL_PF_FSTRANS); - - /* * When supported by the underlying file system preferentially * use the fallocate() callback to preallocate the space. 
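zfs_file_deallocate() forwards hole-punch requests to the file's fallocate handler with FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE. The same operation from userspace, handy for checking the expected semantics (runnable sketch; the path is hypothetical):

#define	_GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/* Hypothetical scratch file. */
	int fd = open("/tmp/punch-demo", O_RDWR | O_CREAT, 0600);

	if (fd < 0 || ftruncate(fd, 2 << 20) != 0)
		return (1);
	/* Punch out the first 1 MiB; KEEP_SIZE leaves i_size untouched. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
	    0, 1 << 20) != 0)
		perror("fallocate");
	close(fd);
	return (0);
}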
*/ @@ -308,9 +289,6 @@ zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len) error = -fp->f_op->fallocate(fp, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len); - if (fstrans) - current->flags |= __SPL_PF_FSTRANS; - if (error) return (SET_ERROR(error)); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_racct.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_racct.c index 4dbd6a28b594..18c5d67f9e32 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_racct.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_racct.c @@ -30,14 +30,14 @@ #include <linux/task_io_accounting_ops.h> void -zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) +zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags) { task_io_account_read(size); spa_iostats_read_add(spa, size, iops, flags); } void -zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) +zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags) { task_io_account_write(size); spa_iostats_write_add(spa, size, iops, flags); @@ -46,13 +46,13 @@ zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) #else void -zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) +zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags) { (void) spa, (void) size, (void) iops, (void) flags; } void -zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) +zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags) { (void) spa, (void) size, (void) iops, (void) flags; } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c index 1c187d7b9cab..895d80b2d79e 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c @@ -223,7 +223,7 @@ zfs_kobj_add(zfs_mod_kobj_t *zkobj, struct kobject *parent, const char *name) { /* zko_default_group.attrs must be NULL terminated */ ASSERT(zkobj->zko_default_group.attrs != NULL); - ASSERT(zkobj->zko_default_group.attrs[zkobj->zko_attr_count] == NULL); + ASSERT0P(zkobj->zko_default_group.attrs[zkobj->zko_attr_count]); kobject_init(&zkobj->zko_kobj, &zkobj->zko_kobj_type); return (kobject_add(&zkobj->zko_kobj, parent, name)); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c index 901bd191f2df..d282f6d95ddf 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c @@ -233,9 +233,6 @@ zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, { size_t cnt = MIN(n, uio->uio_resid); - if (uio->uio_skip) - iov_iter_advance(uio->uio_iter, uio->uio_skip); - if (rw == UIO_READ) cnt = copy_to_iter(p, cnt, uio->uio_iter); else @@ -507,12 +504,14 @@ static int zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) { long res; - size_t skip = uio->uio_skip; + size_t skip = uio->uio_iter->iov_offset; size_t len = uio->uio_resid - skip; unsigned int gup_flags = 0; unsigned long addr; unsigned long nr_pages; + ASSERT3U(uio->uio_segflg, ==, UIO_ITER); + /* * Kernel 6.2 introduced the FOLL_PCI_P2PDMA flag. 
This flag could * possibly be used here in the future to allow for P2P operations with @@ -577,7 +576,7 @@ static int zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) { size_t start; - size_t wanted = uio->uio_resid - uio->uio_skip; + size_t wanted = uio->uio_resid; ssize_t rollback = 0; ssize_t cnt; unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE); @@ -611,7 +610,7 @@ zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) #endif } - ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip); + ASSERT3U(rollback, ==, uio->uio_resid); iov_iter_revert(uio->uio_iter, rollback); return (0); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c index ca75080d5457..8a7d14ab6119 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -265,6 +265,7 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr) { (void) cr; zfsvfs_t *zfsvfs = sb->s_fs_info; + ASSERT3P(zfsvfs, !=, NULL); /* * Semantically, the only requirement is that the sync be initiated. @@ -273,40 +274,19 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr) if (!wait) return (0); - if (zfsvfs != NULL) { - /* - * Sync a specific filesystem. - */ - dsl_pool_t *dp; - int error; - - if ((error = zfs_enter(zfsvfs, FTAG)) != 0) - return (error); - dp = dmu_objset_pool(zfsvfs->z_os); - - /* - * If the system is shutting down, then skip any - * filesystems which may exist on a suspended pool. - */ - if (spa_suspended(dp->dp_spa)) { - zfs_exit(zfsvfs, FTAG); - return (0); - } - - if (zfsvfs->z_log != NULL) - zil_commit(zfsvfs->z_log, 0); + int err = zfs_enter(zfsvfs, FTAG); + if (err != 0) + return (err); - zfs_exit(zfsvfs, FTAG); - } else { - /* - * Sync all ZFS filesystems. This is what happens when you - * run sync(1). Unlike other filesystems, ZFS honors the - * request by waiting for all pools to commit all dirty data. - */ - spa_sync_allpools(); - } + /* + * Sync any pending writes, but do not block if the pool is suspended. + * This is to help with shutting down with pools suspended, as we don't + * want to block in that case. + */ + err = zil_commit_flags(zfsvfs->z_log, 0, ZIL_COMMIT_NOW); + zfs_exit(zfsvfs, FTAG); - return (0); + return (err); } static void @@ -697,6 +677,36 @@ zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) zfsvfs->z_xattr_sa = B_TRUE; } + error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTUSERQUOTA, + &zfsvfs->z_defaultuserquota); + if (error != 0) + return (error); + + error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTGROUPQUOTA, + &zfsvfs->z_defaultgroupquota); + if (error != 0) + return (error); + + error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTPROJECTQUOTA, + &zfsvfs->z_defaultprojectquota); + if (error != 0) + return (error); + + error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTUSEROBJQUOTA, + &zfsvfs->z_defaultuserobjquota); + if (error != 0) + return (error); + + error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTGROUPOBJQUOTA, + &zfsvfs->z_defaultgroupobjquota); + if (error != 0) + return (error); + + error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTPROJECTOBJQUOTA, + &zfsvfs->z_defaultprojectobjquota); + if (error != 0) + return (error); + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); if (error != 0) @@ -868,7 +878,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) * operations out since we closed the ZIL. 
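The six zfs_get_zplprop() lookups added to zfsvfs_init() are written out longhand. If the default-quota list ever grows, the same initialization could be table-driven; a hypothetical condensation (not the committed code; it reuses the function's os, zfsvfs and error variables):

	static const zfs_prop_t defq_props[] = {
		ZFS_PROP_DEFAULTUSERQUOTA,
		ZFS_PROP_DEFAULTGROUPQUOTA,
		ZFS_PROP_DEFAULTPROJECTQUOTA,
		ZFS_PROP_DEFAULTUSEROBJQUOTA,
		ZFS_PROP_DEFAULTGROUPOBJQUOTA,
		ZFS_PROP_DEFAULTPROJECTOBJQUOTA,
	};
	uint64_t *const defq_slots[] = {
		&zfsvfs->z_defaultuserquota,
		&zfsvfs->z_defaultgroupquota,
		&zfsvfs->z_defaultprojectquota,
		&zfsvfs->z_defaultuserobjquota,
		&zfsvfs->z_defaultgroupobjquota,
		&zfsvfs->z_defaultprojectobjquota,
	};

	for (size_t i = 0; i < sizeof (defq_props) / sizeof (defq_props[0]);
	    i++) {
		error = zfs_get_zplprop(os, defq_props[i], defq_slots[i]);
		if (error != 0)
			return (error);
	}

The unrolled form the commit actually uses has its own advantage: each property name greps cleanly and matches the surrounding style.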
*/ if (mounting) { - ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); + ASSERT0P(zfsvfs->z_kstat.dk_kstats); error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); if (error) return (error); @@ -1038,15 +1048,19 @@ zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct kstatfs *statp, if (err) return (err); - if (zfsvfs->z_projectquota_obj == 0) - goto objs; - - err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj, - buf + offset, 8, 1, "a); - if (err == ENOENT) - goto objs; - else if (err) - return (err); + if (zfsvfs->z_projectquota_obj == 0) { + if (zfsvfs->z_defaultprojectquota == 0) + goto objs; + quota = zfsvfs->z_defaultprojectquota; + } else { + err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj, + buf + offset, 8, 1, "a); + if (err && (quota = zfsvfs->z_defaultprojectquota) == 0) { + if (err == ENOENT) + goto objs; + return (err); + } + } err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT, buf + offset, 8, 1, &used); @@ -1072,15 +1086,21 @@ zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct kstatfs *statp, statp->f_bavail = statp->f_bfree; objs: - if (zfsvfs->z_projectobjquota_obj == 0) - return (0); - err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj, - buf + offset, 8, 1, "a); - if (err == ENOENT) - return (0); - else if (err) - return (err); + if (zfsvfs->z_projectobjquota_obj == 0) { + if (zfsvfs->z_defaultprojectobjquota == 0) + return (0); + quota = zfsvfs->z_defaultprojectobjquota; + } else { + err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj, + buf + offset, 8, 1, "a); + if (err && (quota = zfsvfs->z_defaultprojectobjquota) == 0) { + if (err == ENOENT) + return (0); + return (err); + } + } + err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT, buf, 8, 1, &used); @@ -1192,6 +1212,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) } /* + * Dentry and inode caches referenced by a task in non-root memcg are + * not going to be scanned by the kernel-provided shrinker. So, if + * kernel prunes nothing, fall back to this manual walk to free dnodes. + * To avoid scanning the same znodes multiple times they are always rotated + * to the end of the z_all_znodes list. New znodes are inserted at the + * end of the list so we're always scanning the oldest znodes first. + */ +static int +zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) +{ + znode_t **zp_array, *zp; + int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *)); + int objects = 0; + int i = 0, j = 0; + + zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); + + mutex_enter(&zfsvfs->z_znodes_lock); + while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) { + + if ((i++ > nr_to_scan) || (j >= max_array)) + break; + + ASSERT(list_link_active(&zp->z_link_node)); + list_remove(&zfsvfs->z_all_znodes, zp); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + + /* Skip active znodes and .zfs entries */ + if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir) + continue; + + if (igrab(ZTOI(zp)) == NULL) + continue; + + zp_array[j] = zp; + j++; + } + mutex_exit(&zfsvfs->z_znodes_lock); + + for (i = 0; i < j; i++) { + zp = zp_array[i]; + + ASSERT3P(zp, !=, NULL); + d_prune_aliases(ZTOI(zp)); + + if (atomic_read(&ZTOI(zp)->i_count) == 1) + objects++; + + zrele(zp); + } + + vmem_free(zp_array, max_array * sizeof (znode_t *)); + + return (objects); +} + +/* * The ARC has requested that the filesystem drop entries from the dentry * and inode caches. 
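zfs_prune_aliases() bounds its walk and rotates every visited znode to the tail of z_all_znodes, so successive calls work through the oldest entries rather than re-scanning the same list head. The rotation trick in isolation (self-contained userspace sketch; all names hypothetical):

#include <stdio.h>

struct node {
	struct node *prev, *next;
	int id;
	int busy;	/* stand-in for "znode is active, skip it" */
};

/* Unlink n and re-insert it just before the sentinel, i.e. at the tail. */
static void
move_to_tail(struct node *head, struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

/*
 * Visit at most `budget` entries, oldest first. Because every visited
 * node is rotated to the tail, the next call resumes where this one stopped.
 */
static int
prune_some(struct node *head, int budget)
{
	int pruned = 0;

	while (budget-- > 0 && head->next != head) {
		struct node *n = head->next;	/* oldest entry */

		move_to_tail(head, n);
		if (!n->busy) {
			printf("pruning node %d\n", n->id);
			pruned++;
		}
	}
	return (pruned);
}

int
main(void)
{
	struct node head = { &head, &head, 0, 0 };
	struct node n[4];

	for (int i = 0; i < 4; i++) {
		n[i] = (struct node){ head.prev, &head, i + 1, i % 2 };
		head.prev->next = &n[i];
		head.prev = &n[i];
	}

	/* Two bounded passes cover the whole list without re-scanning. */
	printf("pass 1 pruned %d\n", prune_some(&head, 2));
	printf("pass 2 pruned %d\n", prune_some(&head, 2));
	return (0);
}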
This can occur when the ARC needs to free meta data * blocks but can't because they are all pinned by entries in these caches. @@ -1242,6 +1319,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) *objects = (*shrinker->scan_objects)(shrinker, &sc); #endif + /* + * Fall back to zfs_prune_aliases if kernel's shrinker did nothing + * due to dentry and inode caches being referenced by a task running + * in non-root memcg. + */ + if (*objects == 0) + *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); + zfs_exit(zfsvfs, FTAG); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, @@ -1471,6 +1556,12 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) sb->s_xattr = zpl_xattr_handlers; sb->s_export_op = &zpl_export_operations; +#ifdef HAVE_SET_DEFAULT_D_OP + set_default_d_op(sb, &zpl_dentry_operations); +#else + sb->s_d_op = &zpl_dentry_operations; +#endif + /* Set features for file system. */ zfs_set_fuid_feature(zfsvfs); @@ -1586,7 +1677,7 @@ zfs_umount(struct super_block *sb) if (zfsvfs->z_arc_prune != NULL) arc_remove_prune_callback(zfsvfs->z_arc_prune); - VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); + VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE)); os = zfsvfs->z_os; /* @@ -1712,8 +1803,8 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) ASSERT(*ipp != NULL); if (object == ZFSCTL_INO_SNAPDIR) { - VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp, - 0, kcred, NULL, NULL) == 0); + VERIFY0(zfsctl_root_lookup(*ipp, "snapshot", ipp, + 0, kcred, NULL, NULL)); } else { /* * Must have an existing ref, so igrab() @@ -1815,7 +1906,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) goto bail; ds->ds_dir->dd_activity_cancelled = B_FALSE; - VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); + VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE)); zfs_set_fuid_feature(zfsvfs); zfsvfs->z_rollback_time = jiffies; @@ -1988,7 +2079,7 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); - VERIFY(0 == sa_set_sa_object(os, sa_obj)); + VERIFY0(sa_set_sa_object(os, sa_obj)); sa_register_update_callback(os, zfs_sa_upgrade); } @@ -2005,6 +2096,62 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) return (0); } +int +zfs_set_default_quota(zfsvfs_t *zfsvfs, zfs_prop_t prop, uint64_t quota) +{ + int error; + objset_t *os = zfsvfs->z_os; + const char *propstr = zfs_prop_to_name(prop); + dmu_tx_t *tx; + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, propstr); + error = dmu_tx_assign(tx, DMU_TX_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + if (quota == 0) { + error = zap_remove(os, MASTER_NODE_OBJ, propstr, tx); + if (error == ENOENT) + error = 0; + } else { + error = zap_update(os, MASTER_NODE_OBJ, propstr, 8, 1, + "a, tx); + } + + if (error) + goto out; + + switch (prop) { + case ZFS_PROP_DEFAULTUSERQUOTA: + zfsvfs->z_defaultuserquota = quota; + break; + case ZFS_PROP_DEFAULTGROUPQUOTA: + zfsvfs->z_defaultgroupquota = quota; + break; + case ZFS_PROP_DEFAULTPROJECTQUOTA: + zfsvfs->z_defaultprojectquota = quota; + break; + case ZFS_PROP_DEFAULTUSEROBJQUOTA: + zfsvfs->z_defaultuserobjquota = quota; + break; + case ZFS_PROP_DEFAULTGROUPOBJQUOTA: + zfsvfs->z_defaultgroupobjquota = quota; + break; + case ZFS_PROP_DEFAULTPROJECTOBJQUOTA: + zfsvfs->z_defaultprojectobjquota = quota; + break; + default: + break; + } + +out: + dmu_tx_commit(tx); + return (error); +} + /* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. 
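zfs_set_default_quota() follows the standard DMU transaction shape, and leans on an asymmetry worth spelling out: dmu_tx_abort() is only legal when assignment failed, while a successfully assigned transaction must be committed even if the work inside it errored, which is why the function's error path jumps to dmu_tx_commit() rather than aborting. The contract, reduced from the function above (names from the hunk):

	dmu_tx_t *tx = dmu_tx_create(os);

	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, propstr);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);	/* legal: the tx was never assigned */
		return (error);
	}

	error = zap_update(os, MASTER_NODE_OBJ, propstr, 8, 1, &quota, tx);

	dmu_tx_commit(tx);	/* required even if zap_update() failed */
	return (error);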
@@ -2073,4 +2220,5 @@ EXPORT_SYMBOL(zfs_remount); EXPORT_SYMBOL(zfs_statvfs); EXPORT_SYMBOL(zfs_vget); EXPORT_SYMBOL(zfs_prune); +EXPORT_SYMBOL(zfs_set_default_quota); #endif diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index 9ceb6cb8dbdd..6106726651a3 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -25,6 +25,7 @@ * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2025, Klara, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ @@ -329,7 +330,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) put_page(pp); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, bytes); + uio, bytes, DMU_READ_PREFETCH); } len -= bytes; @@ -840,8 +841,8 @@ out: *zpp = zp; } - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); @@ -1202,8 +1203,8 @@ out: zfs_zrele_async(xzp); } - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); @@ -1391,14 +1392,15 @@ out: zfs_dirent_unlock(dl); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - if (error != 0) { zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + error = zil_commit(zilog, 0); + } zfs_exit(zfsvfs, FTAG); return (error); @@ -1527,8 +1529,8 @@ out: zfs_znode_update_vfs(zp); zrele(zp); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); @@ -2482,10 +2484,10 @@ top: new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); - ASSERT(err == 0); + ASSERT0(err); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); - ASSERT(err == 0); + ASSERT0(err); } } @@ -2599,7 +2601,7 @@ out: if (err == 0 && xattr_count > 0) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); - ASSERT(err2 == 0); + ASSERT0(err2); } if (aclp) @@ -2629,8 +2631,8 @@ out: } out2: - if (os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (err == 0 && os->os_sync == ZFS_SYNC_ALWAYS) + err = zil_commit(zilog, 0); out3: kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); @@ -3156,7 +3158,7 @@ top: * zfs_link_create() to add back the same entry, but with a new * dnode (szp), should not fail. */ - ASSERT3P(tzp, ==, NULL); + ASSERT0P(tzp); goto commit_link_tzp; } @@ -3234,8 +3236,8 @@ out: zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); @@ -3435,7 +3437,7 @@ top: *zpp = zp; if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + error = zil_commit(zilog, 0); } else { zrele(zp); } @@ -3653,8 +3655,8 @@ top: * operation are sync safe. 
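Because zil_commit() can now fail (for example when the pool suspends), each ZFS_SYNC_ALWAYS call site in this file is rewritten to the same guarded shape. The error == 0 test is the point: a commit result must never overwrite an earlier failure.

	/* Recurring pattern in the hunks above. */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		error = zil_commit(zilog, 0);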
*/ if (is_tmpfile) { - VERIFY(zap_remove_int(zfsvfs->z_os, - zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); + VERIFY0(zap_remove_int(zfsvfs->z_os, + zfsvfs->z_unlinkedobj, szp->z_id, tx)); } else { if (flags & FIGNORECASE) txtype |= TX_CI; @@ -3669,11 +3671,22 @@ top: zfs_dirent_unlock(dl); - if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); + if (error == 0) { + if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + error = zil_commit(zilog, 0); + + if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { + txg_wait_flag_t wait_flags = + spa_get_failmode(dmu_objset_spa(zfsvfs->z_os)) == + ZIO_FAILURE_MODE_CONTINUE ? TXG_WAIT_SUSPEND : 0; + error = txg_wait_synced_flags( + dmu_objset_pool(zfsvfs->z_os), txg, wait_flags); + if (error != 0) { + ASSERT3U(error, ==, ESHUTDOWN); + error = SET_ERROR(EIO); + } + } + } zfs_znode_update_vfs(tdzp); zfs_znode_update_vfs(szp); @@ -3681,24 +3694,39 @@ top: return (error); } -static void -zfs_putpage_sync_commit_cb(void *arg) +/* Finish page writeback. */ +static inline void +zfs_page_writeback_done(struct page *pp, int err) { - struct page *pp = arg; + if (err != 0) { + /* + * Writeback failed. Re-dirty the page. It was undirtied before + * the IO was issued (in zfs_putpage() or write_cache_pages()). + * The kernel only considers writeback for dirty pages; if we + * don't do this, it is eligible for eviction without being + * written out, which we definitely don't want. + */ +#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO + filemap_dirty_folio(page_mapping(pp), page_folio(pp)); +#else + __set_page_dirty_nobuffers(pp); +#endif + } ClearPageError(pp); end_page_writeback(pp); } +/* + * ZIL callback for page writeback. Passes to zfs_log_write() in zfs_putpage() + * for syncing writes. Called when the ZIL itx has been written to the log or + * the whole txg syncs, or if the ZIL crashes or the pool suspends. Any failure + * is passed as `err`. + */ static void -zfs_putpage_async_commit_cb(void *arg) +zfs_putpage_commit_cb(void *arg, int err) { - struct page *pp = arg; - znode_t *zp = ITOZ(pp->mapping->host); - - ClearPageError(pp); - end_page_writeback(pp); - atomic_dec_32(&zp->z_async_writes_cnt); + zfs_page_writeback_done(arg, err); } /* @@ -3818,15 +3846,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { - /* - * Speed up any non-sync page writebacks since - * they may take several seconds to complete. - * Refer to the comment in zpl_fsync() for details. - */ - if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { - zil_commit(zfsvfs->z_log, zp->z_id); - } - if (PageWriteback(pp)) #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT folio_wait_bit(page_folio(pp), PG_writeback); @@ -3852,8 +3871,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, * was in fact not skipped and should not be counted as if it were. 
*/ wbc->pages_skipped--; - if (!for_sync) - atomic_inc_32(&zp->z_async_writes_cnt); set_page_writeback(pp); unlock_page(pp); @@ -3865,18 +3882,15 @@ err = dmu_tx_assign(tx, DMU_TX_WAIT); if (err != 0) { dmu_tx_abort(tx); -#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO - filemap_dirty_folio(page_mapping(pp), page_folio(pp)); -#else - __set_page_dirty_nobuffers(pp); -#endif - ClearPageError(pp); - end_page_writeback(pp); - if (!for_sync) - atomic_dec_32(&zp->z_async_writes_cnt); + zfs_page_writeback_done(pp, err); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); - return (err); + + /* + * Don't return error for an async writeback; we've re-dirtied + * the page so it will be tried again some other time. + */ + return (for_sync ? err : 0); } va = kmap(pp); @@ -3899,36 +3913,70 @@ err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); - boolean_t commit = B_FALSE; - if (wbc->sync_mode != WB_SYNC_NONE) { - /* - * Note that this is rarely called under writepages(), because - * writepages() normally handles the entire commit for - * performance reasons. - */ - commit = B_TRUE; - } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) { + /* + * A note about for_sync vs wbc->sync_mode. + * + * for_sync indicates that this is a syncing writeback, that is, the + * kernel caller expects the data to be durably stored before being + * notified. Often, but not always, the call was triggered by a + * userspace syncing op (eg fsync(), msync(MS_SYNC)). For our purposes, + * for_sync==TRUE means that the page should remain "locked" (in the + * writeback state) until it is definitely on disk (ie zil_commit() or + * spa_sync()). Otherwise, we can unlock and return as soon as it is on + * the in-memory ZIL. + * + * wbc->sync_mode has similar meaning. wbc is passed from the kernel to + * zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE + * indicates this is a regular async writeback (eg a cache eviction) + * and so does not need a durability guarantee, while WB_SYNC_ALL + * indicates a syncing op that must be waited on (by convention, we + * test for !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer durability + * over performance should there ever be a new mode that we have not + * yet added support for). + * + * So, why a separate for_sync field? This is because zpl_writepages() + * calls zfs_putpage() multiple times for a single "logical" operation. + * It wants all the individual pages to be for_sync==TRUE ie only + * unlocked once durably stored, but it only wants one call to + * zil_commit() at the very end, once all the pages are synced. So, + * it repurposes sync_mode slightly to indicate who issues and waits + * for the IO: for NONE, the caller to zfs_putpage() will do it, while + * for ALL, zfs_putpage should do it. + * + * Summary: + * for_sync: 0=unlock immediately; 1=unlock once on disk + * sync_mode: NONE=caller will commit; ALL=we will commit + */ + boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE); + + /* + * We use for_sync as the "commit" arg to zfs_log_write() (arg 7) + * because it is a policy flag that indicates "someone will call + * zil_commit() soon". for_sync=TRUE means exactly that; the only + * question is whether it will be us, or zpl_writepages(). + */ + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, for_sync, + B_FALSE, for_sync ?
zfs_putpage_commit_cb : NULL, pp); + + if (!for_sync) { /* - * If the caller does not intend to wait synchronously - * for this page writeback to complete and there are active - * synchronous calls on this file, do a commit so that - * the latter don't accidentally end up waiting for - * our writeback to complete. Refer to the comment in - * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details. + * Async writeback is logged and written to the DMU, so page + * can now be unlocked. */ - commit = B_TRUE; + zfs_page_writeback_done(pp, 0); } - zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit, - B_FALSE, for_sync ? zfs_putpage_sync_commit_cb : - zfs_putpage_async_commit_cb, pp); - dmu_tx_commit(tx); zfs_rangelock_exit(lr); - if (commit) - zil_commit(zfsvfs->z_log, zp->z_id); + if (need_commit) { + err = zil_commit_flags(zfsvfs->z_log, zp->z_id, ZIL_COMMIT_NOW); + if (err != 0) { + zfs_exit(zfsvfs, FTAG); + return (err); + } + } dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c index 607b3995cb60..bcaabeb32b8a 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c @@ -126,8 +126,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; - zp->z_sync_writes_cnt = 0; - zp->z_async_writes_cnt = 0; return (0); } @@ -146,12 +144,9 @@ zfs_znode_cache_destructor(void *buf, void *arg) rw_destroy(&zp->z_xattr_lock); zfs_rangelock_fini(&zp->z_rangelock); - ASSERT3P(zp->z_dirlocks, ==, NULL); - ASSERT3P(zp->z_acl_cached, ==, NULL); - ASSERT3P(zp->z_xattr_cached, ==, NULL); - - ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); - ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); + ASSERT0P(zp->z_dirlocks); + ASSERT0P(zp->z_acl_cached); + ASSERT0P(zp->z_xattr_cached); } static int @@ -183,13 +178,13 @@ zfs_znode_init(void) * backed by kmalloc() when on the Linux slab in order that any * wait_on_bit() operations on the related inode operate properly. */ - ASSERT(znode_cache == NULL); + ASSERT0P(znode_cache); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB | KMC_RECLAIMABLE); - ASSERT(znode_hold_cache == NULL); + ASSERT0P(znode_hold_cache); znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache", sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor, zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0); @@ -332,10 +327,10 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, mutex_enter(&zp->z_lock); - ASSERT(zp->z_sa_hdl == NULL); - ASSERT(zp->z_acl_cached == NULL); + ASSERT0P(zp->z_sa_hdl); + ASSERT0P(zp->z_acl_cached); if (sa_hdl == NULL) { - VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, + VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, zp, SA_HDL_SHARED, &zp->z_sa_hdl)); } else { zp->z_sa_hdl = sa_hdl; @@ -371,6 +366,12 @@ zfs_inode_alloc(struct super_block *sb, struct inode **ip) return (0); } +void +zfs_inode_free(struct inode *ip) +{ + kmem_cache_free(znode_cache, ITOZ(ip)); +} + /* * Called in multiple places when an inode should be destroyed. 
*/ @@ -395,8 +396,15 @@ zfs_inode_destroy(struct inode *ip) nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } - - kmem_cache_free(znode_cache, zp); +#ifndef HAVE_SOPS_FREE_INODE + /* + * inode needs to be freed in RCU callback. If we have + * super_operations->free_inode, Linux kernel will do call_rcu + * for us. But if we don't have it, since call_rcu is GPL-only + * symbol, we can only free synchronously and accept the risk. + */ + zfs_inode_free(ip); +#endif } static void @@ -522,9 +530,9 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, return (NULL); zp = ITOZ(ip); - ASSERT(zp->z_dirlocks == NULL); - ASSERT3P(zp->z_acl_cached, ==, NULL); - ASSERT3P(zp->z_xattr_cached, ==, NULL); + ASSERT0P(zp->z_dirlocks); + ASSERT0P(zp->z_acl_cached); + ASSERT0P(zp->z_xattr_cached); zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; zp->z_is_ctldir = B_FALSE; @@ -535,8 +543,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; - zp->z_sync_writes_cnt = 0; - zp->z_async_writes_cnt = 0; zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); @@ -605,7 +611,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, * processing so do not hash unlinked znodes. */ if (links > 0) - VERIFY3S(insert_inode_locked(ip), ==, 0); + VERIFY0(insert_inode_locked(ip)); mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); @@ -768,7 +774,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) { /* * With ZFS_PROJID flag, we can easily know whether there is - * project ID stored on disk or not. See zfs_space_delta_cb(). + * project ID stored on disk or not. See zpl_get_file_info(). */ if (obj_type != DMU_OT_ZNODE && dmu_objset_projectquota_enabled(zfsvfs->z_os)) @@ -805,7 +811,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, } /* Now add in all of the "SA" attributes */ - VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, + VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, &sa_hdl)); /* @@ -895,7 +901,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, acl_ids->z_fuid, acl_ids->z_fgid); } - VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); + VERIFY0(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx)); if (!(flag & IS_ROOT_NODE)) { /* @@ -1194,7 +1200,7 @@ zfs_rezget(znode_t *zp) } rw_exit(&zp->z_xattr_lock); - ASSERT(zp->z_sa_hdl == NULL); + ASSERT0P(zp->z_sa_hdl); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { zfs_znode_hold_exit(zfsvfs, zh); @@ -1308,9 +1314,9 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) zh = zfs_znode_hold_enter(zfsvfs, obj); if (acl_obj) { VERIFY(!zp->z_is_sa); - VERIFY(0 == dmu_object_free(os, acl_obj, tx)); + VERIFY0(dmu_object_free(os, acl_obj, tx)); } - VERIFY(0 == dmu_object_free(os, obj, tx)); + VERIFY0(dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); } @@ -1530,7 +1536,7 @@ zfs_extend(znode_t *zp, uint64_t end) zp->z_size = end; - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), + VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), &zp->z_size, sizeof (zp->z_size), tx)); zfs_rangelock_exit(lr); @@ -1720,7 +1726,7 @@ zfs_trunc(znode_t *zp, uint64_t end) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); } - VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); + 
VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); dmu_tx_commit(tx); zfs_rangelock_exit(lr); @@ -1787,7 +1793,7 @@ log: NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); + ASSERT0(error); zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); @@ -1834,7 +1840,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) moid = MASTER_NODE_OBJ; error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, DMU_OT_NONE, 0, tx); - ASSERT(error == 0); + ASSERT0(error); /* * Set starting attributes. @@ -1847,7 +1853,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) const char *name; ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); - VERIFY(nvpair_value_uint64(elem, &val) == 0); + VERIFY0(nvpair_value_uint64(elem, &val)); name = nvpair_name(elem); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { if (val < version) @@ -1855,7 +1861,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) } else { error = zap_update(os, moid, name, 8, 1, &val, tx); } - ASSERT(error == 0); + ASSERT0(error); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) norm = val; else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) @@ -1863,7 +1869,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) } ASSERT(version != 0); error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); - ASSERT(error == 0); + ASSERT0(error); /* * Create zap object used for SA attribute registration @@ -1873,7 +1879,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); - ASSERT(error == 0); + ASSERT0(error); } else { sa_obj = 0; } @@ -1883,7 +1889,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); - ASSERT(error == 0); + ASSERT0(error); /* * Create root znode. 
Create minimal znode/inode/zfsvfs/sb @@ -1916,7 +1922,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); - ASSERT(error == 0); + ASSERT0(error); /* * Fold case on file systems that are always or sometimes case @@ -1940,12 +1946,12 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); } - VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, + VERIFY0(zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, cr, NULL, &acl_ids, zfs_init_idmap)); zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); - ASSERT(error == 0); + ASSERT0(error); zfs_acl_ids_free(&acl_ids); atomic_set(&ZTOI(rootzp)->i_count, 0); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c index 0b04ec6866f4..81ac26cb0c93 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c @@ -202,7 +202,7 @@ zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags) return (!!dentry->d_inode); } -static dentry_operations_t zpl_dops_snapdirs = { +static const struct dentry_operations zpl_dops_snapdirs = { /* * Auto mounting of snapshots is only supported for 2.6.37 and * newer kernels. Prior to this kernel the ops->follow_link() @@ -215,6 +215,51 @@ static dentry_operations_t zpl_dops_snapdirs = { .d_revalidate = zpl_snapdir_revalidate, }; +/* + * For the .zfs control directory to work properly we must be able to override + * the default operations table and register custom .d_automount and + * .d_revalidate callbacks. + */ +static void +set_snapdir_dentry_ops(struct dentry *dentry, unsigned int extraflags) { + static const unsigned int op_flags = + DCACHE_OP_HASH | DCACHE_OP_COMPARE | + DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE | + DCACHE_OP_PRUNE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_REAL; + +#ifdef HAVE_D_SET_D_OP + /* + * d_set_d_op() will set the DCACHE_OP_ flags according to what it + * finds in the passed dentry_operations, so we don't have to. + * + * We clear the flags and the old op table before calling d_set_d_op() + * because issues a warning when the dentry operations table is already + * set. + */ + dentry->d_op = NULL; + dentry->d_flags &= ~op_flags; + d_set_d_op(dentry, &zpl_dops_snapdirs); + dentry->d_flags |= extraflags; +#else + /* + * Since 6.17 there's no exported way to modify dentry ops, so we have + * to reach in and do it ourselves. This should be safe for our very + * narrow use case, which is to create or splice in an entry to give + * access to a snapshot. + * + * We need to set the op flags directly. We hardcode + * DCACHE_OP_REVALIDATE because that's the only operation we have; if + * we ever extend zpl_dops_snapdirs we will need to update the op flags + * to match. 
+ */ + spin_lock(&dentry->d_lock); + dentry->d_op = &zpl_dops_snapdirs; + dentry->d_flags &= ~op_flags; + dentry->d_flags |= DCACHE_OP_REVALIDATE | extraflags; + spin_unlock(&dentry->d_lock); +#endif +} + static struct dentry * zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) @@ -236,10 +281,7 @@ zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, return (ERR_PTR(error)); ASSERT(error == 0 || ip == NULL); - d_clear_d_op(dentry); - d_set_d_op(dentry, &zpl_dops_snapdirs); - dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; - + set_snapdir_dentry_ops(dentry, DCACHE_NEED_AUTOMOUNT); return (d_splice_alias(ip, dentry)); } @@ -341,14 +383,20 @@ zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry) return (error); } +#if defined(HAVE_IOPS_MKDIR_USERNS) static int -#ifdef HAVE_IOPS_MKDIR_USERNS zpl_snapdir_mkdir(struct user_namespace *user_ns, struct inode *dip, struct dentry *dentry, umode_t mode) #elif defined(HAVE_IOPS_MKDIR_IDMAP) +static int +zpl_snapdir_mkdir(struct mnt_idmap *user_ns, struct inode *dip, + struct dentry *dentry, umode_t mode) +#elif defined(HAVE_IOPS_MKDIR_DENTRY) +static struct dentry * zpl_snapdir_mkdir(struct mnt_idmap *user_ns, struct inode *dip, struct dentry *dentry, umode_t mode) #else +static int zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) #endif { @@ -367,8 +415,7 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0); if (error == 0) { - d_clear_d_op(dentry); - d_set_d_op(dentry, &zpl_dops_snapdirs); + set_snapdir_dentry_ops(dentry, 0); d_instantiate(dentry, ip); } @@ -376,7 +423,11 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) ASSERT3S(error, <=, 0); crfree(cr); +#if defined(HAVE_IOPS_MKDIR_DENTRY) + return (ERR_PTR(error)); +#else return (error); +#endif } /* diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index 787d3cb31410..d07317b0d910 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + * Copyright (c) 2025, Klara, Inc. */ @@ -36,10 +37,7 @@ #include <sys/zfs_vfsops.h> #include <sys/zfs_vnops.h> #include <sys/zfs_project.h> -#if defined(HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS) || \ - defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO) -#include <linux/pagemap.h> -#endif +#include <linux/pagemap_compat.h> #include <linux/fadvise.h> #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO #include <linux/writeback.h> @@ -109,60 +107,52 @@ zpl_iterate(struct file *filp, struct dir_context *ctx) return (error); } +static inline int +zpl_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, void *data); + static int zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; znode_t *zp = ITOZ(inode); - zfsvfs_t *zfsvfs = ITOZSB(inode); cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; /* - * The variables z_sync_writes_cnt and z_async_writes_cnt work in - * tandem so that sync writes can detect if there are any non-sync - * writes going on and vice-versa. The "vice-versa" part to this logic - * is located in zfs_putpage() where non-sync writes check if there are - * any ongoing sync writes. 
If any sync and non-sync writes overlap, - * we do a commit to complete the non-sync writes since the latter can - * potentially take several seconds to complete and thus block sync - * writes in the upcoming call to filemap_write_and_wait_range(). - */ - atomic_inc_32(&zp->z_sync_writes_cnt); - /* - * If the following check does not detect an overlapping non-sync write - * (say because it's just about to start), then it is guaranteed that - * the non-sync write will detect this sync write. This is because we - * always increment z_sync_writes_cnt / z_async_writes_cnt before doing - * the check on z_async_writes_cnt / z_sync_writes_cnt here and in - * zfs_putpage() respectively. + * Force dirty pages in the range out to the DMU and the log, ready + * for zil_commit() to write down. + * + * We call write_cache_pages() directly to ensure that zpl_putpage() is + * called with the flags we need. We need WB_SYNC_NONE to avoid a call + * to zil_commit() (since we're doing this as a kind of pre-sync); but + * we do need for_sync so that the pages remain in writeback until + * they're on disk, and so that we get an error if the DMU write fails. */ - if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { - if ((error = zpl_enter(zfsvfs, FTAG)) != 0) { - atomic_dec_32(&zp->z_sync_writes_cnt); + if (filemap_range_has_page(inode->i_mapping, start, end)) { + int for_sync = 1; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = LONG_MAX, + .range_start = start, + .range_end = end, + }; + error = + zpl_write_cache_pages(inode->i_mapping, &wbc, &for_sync); + if (error != 0) { + /* + * Unclear what state things are in. zfs_putpage() will + * ensure the pages remain dirty if they haven't been + * written down to the DMU, but because there may be + * nothing logged, we can't assume that zfs_sync() -> + * zil_commit() will give us a useful error. It's + * safest if we just error out here. + */ return (error); } - zil_commit(zfsvfs->z_log, zp->z_id); - zpl_exit(zfsvfs, FTAG); } - error = filemap_write_and_wait_range(inode->i_mapping, start, end); - - /* - * The sync write is not complete yet but we decrement - * z_sync_writes_cnt since zfs_fsync() increments and decrements - * it internally. If a non-sync write starts just after the decrement - * operation but before we call zfs_fsync(), it may not detect this - * overlapping sync write but it does not matter since we have already - * gone past filemap_write_and_wait_range() and we won't block due to - * the non-sync write. 
- */ - atomic_dec_32(&zp->z_sync_writes_cnt); - - if (error) - return (error); - crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_fsync(zp, datasync, cr); @@ -226,7 +216,7 @@ zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) ssize_t count = iov_iter_count(to); zfs_uio_t uio; - zfs_uio_iov_iter_init(&uio, to, kiocb->ki_pos, count, 0); + zfs_uio_iov_iter_init(&uio, to, kiocb->ki_pos, count); crhold(cr); cookie = spl_fstrans_mark(); @@ -276,8 +266,7 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) if (ret) return (ret); - zfs_uio_iov_iter_init(&uio, from, kiocb->ki_pos, count, - from->iov_offset); + zfs_uio_iov_iter_init(&uio, from, kiocb->ki_pos, count); crhold(cr); cookie = spl_fstrans_mark(); @@ -539,11 +528,30 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) if (sync_mode != wbc->sync_mode) { if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (result); - if (zfsvfs->z_log != NULL) - zil_commit(zfsvfs->z_log, zp->z_id); + + if (zfsvfs->z_log != NULL) { + /* + * We don't want to block here if the pool suspends, + * because this is not a syncing op by itself, but + * might be part of one that the caller will + * coordinate. + */ + result = -zil_commit_flags(zfsvfs->z_log, zp->z_id, + ZIL_COMMIT_NOW); + } + zpl_exit(zfsvfs, FTAG); /* + * If zil_commit_flags() failed, it's unclear what state things + * are currently in. putpage() has written back out what it can + * to the DMU, but it may not be on disk. We have little choice + * but to escape. + */ + if (result != 0) + return (result); + + /* * We need to call write_cache_pages() again (we can't just * return after the commit) because the previous call in * non-SYNC mode does not guarantee that we got all the dirty @@ -556,6 +564,7 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) return (result); } +#ifdef HAVE_VFS_WRITEPAGE /* * Write out dirty pages to the ARC, this function is only required to * support mmap(2). 
Mapped pages may be dirtied by memory operations @@ -572,6 +581,7 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc) return (zpl_putpage(pp, wbc, &for_sync)); } +#endif /* * The flag combination which matches the behavior of zfs_space() is @@ -986,6 +996,27 @@ zpl_ioctl_setdosflags(struct file *filp, void __user *arg) return (err); } +static int +zpl_ioctl_rewrite(struct file *filp, void __user *arg) +{ + struct inode *ip = file_inode(filp); + zfs_rewrite_args_t args; + fstrans_cookie_t cookie; + int err; + + if (copy_from_user(&args, arg, sizeof (args))) + return (-EFAULT); + + if (unlikely(!(filp->f_mode & FMODE_WRITE))) + return (-EBADF); + + cookie = spl_fstrans_mark(); + err = -zfs_rewrite(ITOZ(ip), args.off, args.len, args.flags, args.arg); + spl_fstrans_unmark(cookie); + + return (err); +} + static long zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -1004,12 +1035,8 @@ zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return (zpl_ioctl_getdosflags(filp, (void *)arg)); case ZFS_IOC_SETDOSFLAGS: return (zpl_ioctl_setdosflags(filp, (void *)arg)); - case ZFS_IOC_COMPAT_FICLONE: - return (zpl_ioctl_ficlone(filp, (void *)arg)); - case ZFS_IOC_COMPAT_FICLONERANGE: - return (zpl_ioctl_ficlonerange(filp, (void *)arg)); - case ZFS_IOC_COMPAT_FIDEDUPERANGE: - return (zpl_ioctl_fideduperange(filp, (void *)arg)); + case ZFS_IOC_REWRITE: + return (zpl_ioctl_rewrite(filp, (void *)arg)); default: return (-ENOTTY); } @@ -1047,7 +1074,9 @@ const struct address_space_operations zpl_address_space_operations = { #else .readpage = zpl_readpage, #endif +#ifdef HAVE_VFS_WRITEPAGE .writepage = zpl_writepage, +#endif .writepages = zpl_writepages, .direct_IO = zpl_direct_IO, #ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS @@ -1058,7 +1087,7 @@ const struct address_space_operations zpl_address_space_operations = { #endif #ifdef HAVE_VFS_MIGRATE_FOLIO .migrate_folio = migrate_folio, -#else +#elif defined(HAVE_VFS_MIGRATEPAGE) .migratepage = migrate_page, #endif }; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c index cb8562d21421..c40dde046142 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c @@ -212,85 +212,3 @@ zpl_dedupe_file_range(struct file *src_file, loff_t src_off, return (-EOPNOTSUPP); } #endif /* HAVE_VFS_DEDUPE_FILE_RANGE */ - -/* Entry point for FICLONE, before Linux 4.5. */ -long -zpl_ioctl_ficlone(struct file *dst_file, void *arg) -{ - unsigned long sfd = (unsigned long)arg; - - struct file *src_file = fget(sfd); - if (src_file == NULL) - return (-EBADF); - - if (dst_file->f_op != src_file->f_op) { - fput(src_file); - return (-EXDEV); - } - - size_t len = i_size_read(file_inode(src_file)); - - ssize_t ret = zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len); - - fput(src_file); - - if (ret < 0) { - if (ret == -EOPNOTSUPP) - return (-ENOTTY); - return (ret); - } - - if (ret != len) - return (-EINVAL); - - return (0); -} - -/* Entry point for FICLONERANGE, before Linux 4.5. 
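The new ZFS_IOC_REWRITE entry point copies a zfs_rewrite_args_t from userspace and insists the fd is open for writing (EBADF otherwise). A hypothetical invocation; the field names come from the handler above, while the header location is an assumption:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/fs/zfs.h>	/* assumed home of ZFS_IOC_REWRITE and its args */

int
main(void)
{
	zfs_rewrite_args_t args = {
		.off = 0,
		.len = 1 << 20,	/* rewrite the first 1 MiB in place */
		.flags = 0,
		.arg = 0,
	};
	/* Hypothetical file on a ZFS dataset. */
	int fd = open("/tank/fs/somefile", O_WRONLY);

	if (fd < 0)
		return (1);
	if (ioctl(fd, ZFS_IOC_REWRITE, &args) != 0)
		perror("ZFS_IOC_REWRITE");
	close(fd);
	return (0);
}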
*/ -long -zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg) -{ - zfs_ioc_compat_file_clone_range_t fcr; - - if (copy_from_user(&fcr, arg, sizeof (fcr))) - return (-EFAULT); - - struct file *src_file = fget(fcr.fcr_src_fd); - if (src_file == NULL) - return (-EBADF); - - if (dst_file->f_op != src_file->f_op) { - fput(src_file); - return (-EXDEV); - } - - size_t len = fcr.fcr_src_length; - if (len == 0) - len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset; - - ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset, - dst_file, fcr.fcr_dest_offset, len); - - fput(src_file); - - if (ret < 0) { - if (ret == -EOPNOTSUPP) - return (-ENOTTY); - return (ret); - } - - if (ret != len) - return (-EINVAL); - - return (0); -} - -/* Entry point for FIDEDUPERANGE, before Linux 4.5. */ -long -zpl_ioctl_fideduperange(struct file *filp, void *arg) -{ - (void) arg; - - /* No support for dedup yet */ - return (-ENOTTY); -} diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c index 85df9b9acf28..f97662d052c7 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c @@ -247,7 +247,7 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, * and fifos, but we want to know if this behavior ever changes. */ if (S_ISSOCK(mode) || S_ISFIFO(mode)) - ASSERT(rdev == 0); + ASSERT0(rdev); crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); @@ -374,14 +374,20 @@ zpl_unlink(struct inode *dir, struct dentry *dentry) return (error); } +#if defined(HAVE_IOPS_MKDIR_USERNS) static int -#ifdef HAVE_IOPS_MKDIR_USERNS zpl_mkdir(struct user_namespace *user_ns, struct inode *dir, struct dentry *dentry, umode_t mode) #elif defined(HAVE_IOPS_MKDIR_IDMAP) +static int +zpl_mkdir(struct mnt_idmap *user_ns, struct inode *dir, + struct dentry *dentry, umode_t mode) +#elif defined(HAVE_IOPS_MKDIR_DENTRY) +static struct dentry * zpl_mkdir(struct mnt_idmap *user_ns, struct inode *dir, struct dentry *dentry, umode_t mode) #else +static int zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) #endif { @@ -390,12 +396,14 @@ zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) znode_t *zp; int error; fstrans_cookie_t cookie; -#if !(defined(HAVE_IOPS_MKDIR_USERNS) || defined(HAVE_IOPS_MKDIR_IDMAP)) +#if !(defined(HAVE_IOPS_MKDIR_USERNS) || \ + defined(HAVE_IOPS_MKDIR_IDMAP) || defined(HAVE_IOPS_MKDIR_DENTRY)) zidmap_t *user_ns = kcred->user_ns; #endif if (is_nametoolong(dentry)) { - return (-ENAMETOOLONG); + error = -ENAMETOOLONG; + goto err; } crhold(cr); @@ -422,9 +430,14 @@ zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); - ASSERT3S(error, <=, 0); +err: + ASSERT3S(error, <=, 0); +#if defined(HAVE_IOPS_MKDIR_DENTRY) + return (error != 0 ? ERR_PTR(error) : NULL); +#else return (error); +#endif } static int diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index 40c25e464c5d..444948d03cb3 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2023, Datto Inc. All rights reserved. + * Copyright (c) 2025, Klara, Inc. 
*/
@@ -31,7 +32,22 @@
#include <sys/zfs_ctldir.h>
#include <sys/zpl.h>
#include <linux/iversion.h>
+#include <linux/version.h>
+/*
+ * What to do when the last reference to an inode is released. If 0, the kernel
+ * will cache it on the superblock. If 1, the inode will be freed immediately.
+ * See zpl_drop_inode().
+ */
+int zfs_delete_inode = 0;
+
+/*
+ * What to do when the last reference to a dentry is released. If 0, the kernel
+ * will cache it until the entry (file) is destroyed. If 1, the dentry will be
+ * marked for cleanup, at which time its inode reference will be released. See
+ * zpl_dentry_delete().
+ */
+int zfs_delete_dentry = 0;
static struct inode *
zpl_inode_alloc(struct super_block *sb)
@@ -44,10 +60,19 @@
return (ip);
}
+#ifdef HAVE_SOPS_FREE_INODE
+static void
+zpl_inode_free(struct inode *ip)
+{
+ ASSERT0(atomic_read(&ip->i_count));
+ zfs_inode_free(ip);
+}
+#endif
+
static void
zpl_inode_destroy(struct inode *ip)
{
- ASSERT(atomic_read(&ip->i_count) == 0);
+ ASSERT0(atomic_read(&ip->i_count));
zfs_inode_destroy(ip);
}
@@ -67,11 +92,36 @@
}
/*
- * When ->drop_inode() is called its return value indicates if the
- * inode should be evicted from the inode cache. If the inode is
- * unhashed and has no links the default policy is to evict it
- * immediately.
+ * ->drop_inode() is called when the last reference to an inode is released.
+ * Its return value indicates if the inode should be destroyed immediately, or
+ * cached on the superblock structure.
+ *
+ * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns
+ * "destroy immediately" if the inode is unhashed and has no links (roughly: no
+ * longer exists on disk). On datasets with millions of rarely-accessed files,
+ * this can cause a large amount of memory to be "pinned" by cached inodes,
+ * which in turn pin their associated dnodes and dbufs, until the kernel starts
+ * reporting memory pressure and requests that OpenZFS release some memory (see
+ * zfs_prune()).
+ *
+ * When set to 1, we call generic_delete_inode(), which always returns "destroy
+ * immediately", so inodes are destroyed as soon as the last reference is
+ * released, and their associated dnodes and dbufs fall back to the dbuf cache
+ * and the ARC to be evicted as normal.
 *
+ * Note that the "last reference" doesn't always mean the last _userspace_
+ * reference; the dentry cache also holds a reference, so "busy" inodes will
+ * still be kept alive that way (subject to dcache tuning).
+ */
+static int
+zpl_drop_inode(struct inode *ip)
+{
+ if (zfs_delete_inode)
+ return (generic_delete_inode(ip));
+ return (generic_drop_inode(ip));
+}
+
+/*
 * The ->evict_inode() callback must minimally truncate the inode pages,
 * and call clear_inode(). For 2.6.35 and later kernels this will
 * simply update the inode state, with the sync occurring before the
@@ -105,6 +155,42 @@
ASSERT3S(error, <=, 0);
}
+/*
+ * zfs_sync() is the underlying implementation for the sync(2) and syncfs(2)
+ * syscalls, via sb->s_op->sync_fs().
+ *
+ * Before kernel 5.17 (torvalds/linux@5679897eb104), syncfs() ->
+ * sync_filesystem() would ignore the return from sync_fs(), instead only
+ * considering the error from syncing the underlying block device (sb->s_dev).
+ * Since OpenZFS doesn't _have_ an underlying block device, there's no way for
+ * us to report a sync failure directly.
+ *
+ * However, in 5.8 (torvalds/linux@735e4ae5ba28) the superblock gained an extra
+ * error store `s_wb_err`, to carry errors seen on page writeback since the
+ * last call to syncfs(). If sync_filesystem() does not return an error, any
+ * existing writeback error on the superblock will be used instead (and cleared
+ * either way). We don't use this (page writeback is a different thing for us),
+ * so for 5.8-5.16 we can use that instead to get syncfs() to return the error.
+ *
+ * Before 5.8, we have no other good options - no matter what happens, the
+ * userspace program will be told the call has succeeded, and so we must make
+ * it so. Therefore, when we are asked to wait for sync to complete (wait ==
+ * 1), if zfs_sync() has returned an error we have no choice but to block,
+ * regardless of the reason.
+ *
+ * The 5.17 change was backported to the 5.10, 5.15 and 5.16 series, and likely
+ * to some vendor kernels. Meanwhile, s_wb_err is still in use in 6.15 (the
+ * mainline Linux series at time of writing), and has likely been backported to
+ * vendor kernels before 5.8. We don't really want to use a workaround when we
+ * don't have to, but we can't really detect whether or not sync_filesystem()
+ * will return our errors (without a difficult runtime test anyway). So, we use
+ * a static version check: any kernel reporting its version as 5.17+ will use a
+ * direct error return; otherwise, we'll use s_wb_err if it was detected at
+ * configure time (5.8-5.16 + vendor backports), and if it's unavailable, we
+ * will block to ensure the correct semantics.
+ *
+ * See https://github.com/openzfs/zfs/issues/17416 for further discussion.
+ */
static int
zpl_sync_fs(struct super_block *sb, int wait)
{
@@ -115,10 +201,28 @@
crhold(cr);
cookie = spl_fstrans_mark();
error = -zfs_sync(sb, wait, cr);
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)
+#ifdef HAVE_SUPER_BLOCK_S_WB_ERR
+ if (error && wait)
+ errseq_set(&sb->s_wb_err, error);
+#else
+ if (error && wait) {
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+ ASSERT3P(zfsvfs, !=, NULL);
+ if (zfs_enter(zfsvfs, FTAG) == 0) {
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+ zfs_exit(zfsvfs, FTAG);
+ error = 0;
+ }
+ }
+#endif
+#endif /* < 5.17.0 */
+
spl_fstrans_unmark(cookie);
crfree(cr);
- ASSERT3S(error, <=, 0);
+ ASSERT3S(error, <=, 0);
return (error);
}
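The upshot for applications: on 5.17+ kernels (or 5.8-5.16 with the s_wb_err path) a failed pool sync finally becomes visible as a syncfs(2) error, and on anything older zpl_sync_fs() blocks in txg_wait_synced() rather than falsely reporting success. A minimal userspace probe, as a sketch only - the path /tank/f and the lack of real error handling are assumptions, not part of the change:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		/* Any file on the ZFS filesystem under test will do. */
		int fd = open("/tank/f", O_WRONLY | O_CREAT, 0644);
		if (fd == -1) {
			perror("open");
			return (1);
		}
		(void) write(fd, "x", 1);
		/* With either error path above, a zfs_sync() failure lands here. */
		if (syncfs(fd) == -1)
			perror("syncfs");
		(void) close(fd);
		return (0);
	}
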
@@ -400,9 +504,13 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg)
const struct super_operations zpl_super_operations = {
.alloc_inode = zpl_inode_alloc,
+#ifdef HAVE_SOPS_FREE_INODE
+ .free_inode = zpl_inode_free,
+#endif
.destroy_inode = zpl_inode_destroy,
.dirty_inode = zpl_dirty_inode,
.write_inode = NULL,
+ .drop_inode = zpl_drop_inode,
.evict_inode = zpl_evict_inode,
.put_super = zpl_put_super,
.sync_fs = zpl_sync_fs,
@@ -413,6 +521,35 @@
.show_stats = NULL,
};
+/*
+ * ->d_delete() is called when the last reference to a dentry is released. Its
+ * return value indicates if the dentry should be destroyed immediately, or
+ * retained in the dentry cache.
+ *
+ * By default (zfs_delete_dentry=0) the kernel will always cache unused
+ * entries. Each dentry holds an inode reference, so cached dentries can hold
+ * the final inode reference indefinitely, leading to the inode and its related
+ * data being pinned (see zpl_drop_inode()).
+ *
+ * When set to 1, we signal that the dentry should be destroyed immediately and
+ * never cached. This reduces memory usage, at the cost of higher overhead to
+ * look up a file, as the inode and its underlying data (dnode/dbuf) need to be
+ * reloaded and reinflated.
+ *
+ * Note that userspace does not have direct control over dentry references and
+ * reclaim; rather, this is part of the kernel's caching and reclaim subsystems
+ * (e.g. vm.vfs_cache_pressure).
+ */
+static int
+zpl_dentry_delete(const struct dentry *dentry)
+{
+ return (zfs_delete_dentry ? 1 : 0);
+}
+
+const struct dentry_operations zpl_dentry_operations = {
+ .d_delete = zpl_dentry_delete,
+};
+
struct file_system_type zpl_fs_type = {
.owner = THIS_MODULE,
.name = ZFS_DRIVER,
@@ -424,3 +561,10 @@
.mount = zpl_mount,
.kill_sb = zpl_kill_sb,
};
+
+ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW,
+ "Delete inodes as soon as the last reference is released.");
+
+ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW,
+ "Delete dentries from dentry cache as soon as the last reference is "
+ "released.");
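Both knobs are exposed as ordinary module parameters, so the cache-vs-memory trade-off described above can be flipped at runtime. A sketch, assuming only the standard sysfs location for zfs.ko parameters (the paths themselves are not part of this diff):

	#include <stdio.h>

	/* Write a value into a module parameter file; returns 0 on success. */
	static int
	set_param(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");
		if (f == NULL) {
			perror(path);
			return (-1);
		}
		int rc = (fputs(val, f) == EOF) ? -1 : 0;
		(void) fclose(f);
		return (rc);
	}

	int
	main(void)
	{
		/* Free inodes and dentries as soon as their last reference drops. */
		(void) set_param("/sys/module/zfs/parameters/zfs_delete_inode", "1");
		(void) set_param("/sys/module/zfs/parameters/zfs_delete_dentry", "1");
		return (0);
	}

Leaving both at their default of 0 keeps the existing behavior: entries stay cached until memory pressure drives zfs_prune().
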
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
index a098197e7448..d93282db815a 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
@@ -1494,7 +1494,7 @@ zpl_posix_acl_free(void *arg)
acl_rel_head = NULL;
if (cmpxchg(&acl_rel_tail, &a->next, &acl_rel_head) == &a->next) {
- ASSERT3P(a->next, ==, NULL);
+ ASSERT0P(a->next);
a->next = freelist;
freelist = a;
break;
@@ -1544,7 +1544,7 @@ zpl_posix_acl_release_impl(struct posix_acl *acl)
a->time = ddi_get_lbolt();
/* atomically points tail to us and get the previous tail */
prev = xchg(&acl_rel_tail, &a->next);
- ASSERT3P(*prev, ==, NULL);
+ ASSERT0P(*prev);
*prev = a;
/* if it was empty before, schedule the free task */
if (prev == &acl_rel_head)
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index c8a04539258f..967a018640e1 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
- * Copyright (c) 2024, Klara, Inc.
+ * Copyright (c) 2024, 2025, Klara, Inc.
*/
#include <sys/dataset_kstats.h>
@@ -51,21 +51,12 @@
static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, boolean_t force_sync);
static unsigned int zvol_major = ZVOL_MAJOR;
-static unsigned int zvol_request_sync = 0;
-static unsigned int zvol_prefetch_bytes = (128 * 1024);
static unsigned long zvol_max_discard_blocks = 16384;
-/*
- * Switch taskq at multiple of 512 MB offset. This can be set to a lower value
- * to utilize more threads for small files but may affect prefetch hits.
- */
-#define ZVOL_TASKQ_OFFSET_SHIFT 29
-
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static unsigned int zvol_open_timeout_ms = 1000;
#endif
-static unsigned int zvol_threads = 0;
static unsigned int zvol_blk_mq_threads = 0;
static unsigned int zvol_blk_mq_actual_threads;
static boolean_t zvol_use_blk_mq = B_FALSE;
@@ -82,8 +73,6 @@
*/
static unsigned int zvol_blk_mq_blocks_per_thread = 8;
-static unsigned int zvol_num_taskqs = 0;
-
#ifndef BLKDEV_DEFAULT_RQ
/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
#define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
@@ -95,8 +84,9 @@
static inline void
zvol_end_io(struct bio *bio, struct request *rq, int error)
{
+ ASSERT3U(error, >=, 0);
if (bio) {
- bio->bi_status = errno_to_bi_status(-error);
+ bio->bi_status = errno_to_bi_status(error);
bio_endio(bio);
} else {
blk_mq_end_request(rq, errno_to_bi_status(error));
@@ -117,45 +107,8 @@
struct zvol_state_os {
boolean_t use_blk_mq;
};
-typedef struct zv_taskq {
- uint_t tqs_cnt;
- taskq_t **tqs_taskq;
-} zv_taskq_t;
-static zv_taskq_t zvol_taskqs;
static struct ida zvol_ida;
-typedef struct zv_request_stack {
- zvol_state_t *zv;
- struct bio *bio;
- struct request *rq;
-} zv_request_t;
-
-typedef struct zv_work {
- struct request *rq;
- struct work_struct work;
-} zv_work_t;
-
-typedef struct zv_request_task {
- zv_request_t zvr;
- taskq_ent_t ent;
-} zv_request_task_t;
-
-static zv_request_task_t *
-zv_request_task_create(zv_request_t zvr)
-{
- zv_request_task_t *task;
- task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
- taskq_init_ent(&task->ent);
- task->zvr = zvr;
- return (task);
-}
-
-static void
-zv_request_task_free(zv_request_task_t *task)
-{
- kmem_free(task, sizeof (*task));
-}
-
/*
* This is called when a new block multiqueue request comes in. A request
* contains one or more BIOs.
@@ -256,8 +209,14 @@
disk = zv->zv_zso->zvo_disk;
/* bio marked as FLUSH need to flush before write */
- if (io_is_flush(bio, rq))
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (io_is_flush(bio, rq)) {
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (error != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ zvol_end_io(bio, rq, error);
+ return;
+ }
+ }
/* Some requests are just for flush and nothing else.
*/
if (io_size(bio, rq) == 0) {
@@ -305,7 +264,8 @@
dmu_tx_abort(tx);
break;
}
- error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
+ error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
+ DMU_READ_PREFETCH);
if (error == 0) {
zvol_log_write(zv, tx, off, bytes, sync);
}
@@ -320,8 +280,8 @@
dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
task_io_account_write(nwritten);
- if (sync)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if (error == 0 && sync)
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
rw_exit(&zv->zv_suspend_lock);
@@ -329,7 +289,7 @@
blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
}
- zvol_end_io(bio, rq, -error);
+ zvol_end_io(bio, rq, error);
}
static void
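The zil_commit() calls here follow a new pattern worth noting: zil_commit() can now fail, and its result must reach the bio or request instead of being dropped on the floor. A condensed sketch of the shape - simplified from the hunks above, not the patch itself; `error` holds a positive errno or 0 throughout, matching the new ASSERT3U in zvol_end_io():

	/* Flush-before-write: bail out early if the log cannot be committed. */
	if (io_is_flush(bio, rq)) {
		error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
		if (error != 0) {
			rw_exit(&zv->zv_suspend_lock);
			zvol_end_io(bio, rq, error);	/* positive errno */
			return;
		}
	}

	/* ... issue the write itself, setting error on failure ... */

	/* Sync write: a commit failure must not be masked by a clean write. */
	if (error == 0 && sync)
		error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
	zvol_end_io(bio, rq, error);
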
@@ -408,7 +368,7 @@ zvol_discard(zv_request_t *zvr)
zfs_rangelock_exit(lr);
if (error == 0 && sync)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
unlock:
rw_exit(&zv->zv_suspend_lock);
@@ -418,7 +378,7 @@ unlock:
start_time);
}
- zvol_end_io(bio, rq, -error);
+ zvol_end_io(bio, rq, error);
}
static void
@@ -475,7 +435,8 @@ zvol_read(zv_request_t *zvr)
if (bytes > volsize - uio.uio_loffset)
bytes = volsize - uio.uio_loffset;
- error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
+ error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
+ DMU_READ_PREFETCH);
if (error) {
/* convert checksum errors into IO errors */
if (error == ECKSUM)
@@ -495,7 +456,7 @@
blk_generic_end_io_acct(q, disk, READ, bio, start_time);
}
- zvol_end_io(bio, rq, -error);
+ zvol_end_io(bio, rq, error);
}
static void
@@ -523,10 +484,31 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
fstrans_cookie_t cookie = spl_fstrans_mark();
uint64_t offset = io_offset(bio, rq);
uint64_t size = io_size(bio, rq);
- int rw = io_data_dir(bio, rq);
+ int rw;
+
+ if (rq != NULL) {
+ /*
+ * Flush & trim requests go down the zvol_write codepath. Or
+ * more specifically:
+ *
+ * If the request is a write, or if it's op_is_sync() and not a
+ * read, or if it's a flush, or if it's a discard, then send the
+ * request down the write path.
+ */ + if (op_is_write(rq->cmd_flags) || + (op_is_sync(rq->cmd_flags) && req_op(rq) != REQ_OP_READ) || + req_op(rq) == REQ_OP_FLUSH || + op_is_discard(rq->cmd_flags)) { + rw = WRITE; + } else { + rw = READ; + } + } else { + rw = bio_data_dir(bio); + } if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { - zvol_end_io(bio, rq, -SET_ERROR(ENXIO)); + zvol_end_io(bio, rq, SET_ERROR(ENXIO)); goto out; } @@ -545,7 +527,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, (long long unsigned)offset, (long unsigned)size); - zvol_end_io(bio, rq, -SET_ERROR(EIO)); + zvol_end_io(bio, rq, SET_ERROR(EIO)); goto out; } @@ -558,8 +540,8 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, #ifdef HAVE_BLK_MQ_RQ_HCTX blk_mq_hw_queue = rq->mq_hctx->queue_num; #else - blk_mq_hw_queue = - rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num; + blk_mq_hw_queue = rq->q->queue_hw_ctx[ + rq->q->mq_map[raw_smp_processor_id()]]->queue_num; #endif taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, blk_mq_hw_queue); @@ -567,7 +549,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { - zvol_end_io(bio, rq, -SET_ERROR(EROFS)); + zvol_end_io(bio, rq, SET_ERROR(EROFS)); goto out; } @@ -718,28 +700,19 @@ zvol_open(struct block_device *bdev, fmode_t flag) retry: #endif - rw_enter(&zvol_state_lock, RW_READER); - /* - * Obtain a copy of private_data under the zvol_state_lock to make - * sure that either the result of zvol free code path setting - * disk->private_data to NULL is observed, or zvol_os_free() - * is not called on this zv because of the positive zv_open_count. - */ + #ifdef HAVE_BLK_MODE_T - zv = disk->private_data; + zv = atomic_load_ptr(&disk->private_data); #else - zv = bdev->bd_disk->private_data; + zv = atomic_load_ptr(&bdev->bd_disk->private_data); #endif if (zv == NULL) { - rw_exit(&zvol_state_lock); return (-SET_ERROR(ENXIO)); } mutex_enter(&zv->zv_state_lock); - if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { mutex_exit(&zv->zv_state_lock); - rw_exit(&zvol_state_lock); return (-SET_ERROR(ENXIO)); } @@ -751,8 +724,28 @@ retry: if (zv->zv_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); + + /* + * Removal may happen while the locks are down, so + * we can't trust zv any longer; we have to start over. 
+ */
+#ifdef HAVE_BLK_MODE_T
+ zv = atomic_load_ptr(&disk->private_data);
+#else
+ zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+#endif
+ if (zv == NULL)
+ return (-SET_ERROR(ENXIO));
+
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);
+ return (-SET_ERROR(ENXIO));
+ }
+
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
@@ -763,7 +756,6 @@ retry:
drop_suspend = B_TRUE;
}
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -860,11 +852,11 @@ zvol_release(struct gendisk *disk, fmode_t unused)
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
(void) unused;
#endif
- zvol_state_t *zv;
boolean_t drop_suspend = B_TRUE;
- rw_enter(&zvol_state_lock, RW_READER);
- zv = disk->private_data;
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
+ if (zv == NULL)
+ return;
mutex_enter(&zv->zv_state_lock);
ASSERT3U(zv->zv_open_count, >, 0);
@@ -878,6 +870,15 @@
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
+
+ /*
+ * Unlike in zvol_open(), we don't check if removal
+ * started here, because we might be one of the openers
+ * that needs to be thrown out! If we're the last, we
+ * need to call zvol_last_close() below to finish
+ * cleanup. So, no special treatment for us.
+ */
+
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 1) {
rw_exit(&zv->zv_suspend_lock);
@@ -887,7 +888,6 @@
} else {
drop_suspend = B_FALSE;
}
- rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -907,9 +907,10 @@
static int
zvol_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
{
- zvol_state_t *zv = bdev->bd_disk->private_data;
int error = 0;
+ zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
+ ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
switch (cmd) {
@@ -932,16 +933,18 @@
case BLKZNAME:
mutex_enter(&zv->zv_state_lock);
error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
mutex_exit(&zv->zv_state_lock);
+ if (error)
+ error = SET_ERROR(EFAULT);
break;
default:
- error = -ENOTTY;
+ error = SET_ERROR(ENOTTY);
break;
}
- return (SET_ERROR(error));
+ return (-error);
}
#ifdef CONFIG_COMPAT
@@ -960,9 +963,8 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
unsigned int mask = 0;
- rw_enter(&zvol_state_lock, RW_READER);
+ zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
- zvol_state_t *zv = disk->private_data;
if (zv != NULL) {
mutex_enter(&zv->zv_state_lock);
mask = zv->zv_changed ?
DISK_EVENT_MEDIA_CHANGE : 0; @@ -970,17 +972,14 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing) mutex_exit(&zv->zv_state_lock); } - rw_exit(&zvol_state_lock); - return (mask); } static int zvol_revalidate_disk(struct gendisk *disk) { - rw_enter(&zvol_state_lock, RW_READER); + zvol_state_t *zv = atomic_load_ptr(&disk->private_data); - zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); set_capacity(zv->zv_zso->zvo_disk, @@ -988,8 +987,6 @@ zvol_revalidate_disk(struct gendisk *disk) mutex_exit(&zv->zv_state_lock); } - rw_exit(&zvol_state_lock); - return (0); } @@ -1008,16 +1005,6 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) return (0); } -void -zvol_os_clear_private(zvol_state_t *zv) -{ - /* - * Cleared while holding zvol_state_lock as a writer - * which will prevent zvol_open() from opening it. - */ - zv->zv_zso->zvo_disk->private_data = NULL; -} - /* * Provide a simple virtual geometry for legacy compatibility. For devices * smaller than 1 MiB a small head and sector count is used to allow very @@ -1027,9 +1014,10 @@ zvol_os_clear_private(zvol_state_t *zv) static int zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) { - zvol_state_t *zv = bdev->bd_disk->private_data; sector_t sectors; + zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data); + ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); sectors = get_capacity(zv->zv_zso->zvo_disk); @@ -1348,27 +1336,30 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) * Allocate memory for a new zvol_state_t and setup the required * request queue and generic disk structures for the block device. */ -static zvol_state_t * -zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize) +static int +zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize, + zvol_state_t **zvp) { zvol_state_t *zv; struct zvol_state_os *zso; uint64_t volmode; int ret; - if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) - return (NULL); + ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL); + if (ret) + return (ret); if (volmode == ZFS_VOLMODE_DEFAULT) volmode = zvol_volmode; if (volmode == ZFS_VOLMODE_NONE) - return (NULL); + return (0); zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zv->zv_zso = zso; zv->zv_volmode = volmode; + zv->zv_volsize = volsize; zv->zv_volblocksize = volblocksize; list_link_init(&zv->zv_next); @@ -1397,13 +1388,15 @@ zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize) */ if (zv->zv_zso->use_blk_mq) { ret = zvol_alloc_blk_mq(zv, &limits); + if (ret != 0) + goto out_kmem; zso->zvo_disk->fops = &zvol_ops_blk_mq; } else { ret = zvol_alloc_non_blk_mq(zso, &limits); + if (ret != 0) + goto out_kmem; zso->zvo_disk->fops = &zvol_ops; } - if (ret != 0) - goto out_kmem; /* Limit read-ahead to a single page to prevent over-prefetching. */ blk_queue_set_read_ahead(zso->zvo_queue, 1); @@ -1440,61 +1433,79 @@ zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize) snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", ZVOL_DEV_NAME, (dev & MINORMASK)); - return (zv); + *zvp = zv; + return (ret); out_kmem: kmem_free(zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); - return (NULL); + return (ret); } -/* - * Cleanup then free a zvol_state_t which was created by zvol_alloc(). 
- * At this time, the structure is not opened by anyone, is taken off - * the zvol_state_list, and has its private data set to NULL. - * The zvol_state_lock is dropped. - * - * This function may take many milliseconds to complete (e.g. we've seen - * it take over 256ms), due to the calls to "blk_cleanup_queue" and - * "del_gendisk". Thus, consumers need to be careful to account for this - * latency when calling this function. - */ void -zvol_os_free(zvol_state_t *zv) +zvol_os_remove_minor(zvol_state_t *zv) { - - ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); - ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT0(zv->zv_open_count); - ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); + ASSERT0(atomic_read(&zv->zv_suspend_ref)); + ASSERT(zv->zv_flags & ZVOL_REMOVING); - rw_destroy(&zv->zv_suspend_lock); - zfs_rangelock_fini(&zv->zv_rangelock); + struct zvol_state_os *zso = zv->zv_zso; + zv->zv_zso = NULL; + + /* Clearing private_data will make new callers return immediately. */ + atomic_store_ptr(&zso->zvo_disk->private_data, NULL); + + /* + * Drop the state lock before calling del_gendisk(). There may be + * callers waiting to acquire it, but del_gendisk() will block until + * they exit, which would deadlock. + */ + mutex_exit(&zv->zv_state_lock); - del_gendisk(zv->zv_zso->zvo_disk); + del_gendisk(zso->zvo_disk); #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) #if defined(HAVE_BLK_CLEANUP_DISK) - blk_cleanup_disk(zv->zv_zso->zvo_disk); + blk_cleanup_disk(zso->zvo_disk); #else - put_disk(zv->zv_zso->zvo_disk); + put_disk(zso->zvo_disk); #endif #else - blk_cleanup_queue(zv->zv_zso->zvo_queue); - put_disk(zv->zv_zso->zvo_disk); + blk_cleanup_queue(zso->zvo_queue); + put_disk(zso->zvo_disk); #endif - if (zv->zv_zso->use_blk_mq) - blk_mq_free_tag_set(&zv->zv_zso->tag_set); + if (zso->use_blk_mq) + blk_mq_free_tag_set(&zso->tag_set); - ida_simple_remove(&zvol_ida, - MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); + ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS); + + kmem_free(zso, sizeof (struct zvol_state_os)); + + mutex_enter(&zv->zv_state_lock); +} + +void +zvol_os_free(zvol_state_t *zv) +{ + + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT0(zv->zv_open_count); + ASSERT0P(zv->zv_zso); + + ASSERT0P(zv->zv_objset); + ASSERT0P(zv->zv_zilog); + ASSERT0P(zv->zv_dn); + + rw_destroy(&zv->zv_suspend_lock); + zfs_rangelock_fini(&zv->zv_rangelock); cv_destroy(&zv->zv_removing_cv); mutex_destroy(&zv->zv_state_lock); dataset_kstats_destroy(&zv->zv_kstat); - kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); } @@ -1514,7 +1525,9 @@ __zvol_os_add_disk(struct gendisk *disk) { int error = 0; #ifdef HAVE_ADD_DISK_RET - error = add_disk(disk); + error = -add_disk(disk); + if (error) + error = SET_ERROR(error); #else add_disk(disk); #endif @@ -1606,7 +1619,7 @@ zvol_os_add_disk(struct gendisk *disk) int zvol_os_create_minor(const char *name) { - zvol_state_t *zv; + zvol_state_t *zv = NULL; objset_t *os; dmu_object_info_t *doi; uint64_t volsize; @@ -1655,18 +1668,16 @@ zvol_os_create_minor(const char *name) if (error) goto out_dmu_objset_disown; - zv = zvol_alloc(MKDEV(zvol_major, minor), name, - doi->doi_data_block_size); - if (zv == NULL) { - error = SET_ERROR(EAGAIN); + error = zvol_alloc(MKDEV(zvol_major, minor), name, + volsize, doi->doi_data_block_size, &zv); + if (error || 
zv == NULL) goto out_dmu_objset_disown; - } + zv->zv_hash = hash; if (dmu_objset_is_snapshot(os)) zv->zv_flags |= ZVOL_RDONLY; - zv->zv_volsize = volsize; zv->zv_objset = os; /* Default */ @@ -1691,11 +1702,11 @@ zvol_os_create_minor(const char *name) blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); #endif - ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); + ASSERT0P(zv->zv_kstat.dk_kstats); error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); if (error) goto out_dmu_objset_disown; - ASSERT3P(zv->zv_zilog, ==, NULL); + ASSERT0P(zv->zv_zilog); zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) @@ -1733,7 +1744,7 @@ out_doi: * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() * directly as well. */ - if (error == 0) { + if (error == 0 && zv) { rw_enter(&zvol_state_lock, RW_WRITER); zvol_insert(zv); rw_exit(&zvol_state_lock); @@ -1745,7 +1756,7 @@ out_doi: return (error); } -void +int zvol_os_rename_minor(zvol_state_t *zv, const char *newname) { int readonly = get_disk_ro(zv->zv_zso->zvo_disk); @@ -1772,6 +1783,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) set_disk_ro(zv->zv_zso->zvo_disk, readonly); dataset_kstats_rename(&zv->zv_kstat, newname); + + return (0); } void @@ -1793,61 +1806,16 @@ zvol_init(void) { int error; - /* - * zvol_threads is the module param the user passes in. - * - * zvol_actual_threads is what we use internally, since the user can - * pass zvol_thread = 0 to mean "use all the CPUs" (the default). - */ - static unsigned int zvol_actual_threads; - - if (zvol_threads == 0) { - /* - * See dde9380a1 for why 32 was chosen here. This should - * probably be refined to be some multiple of the number - * of CPUs. - */ - zvol_actual_threads = MAX(num_online_cpus(), 32); - } else { - zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); + error = zvol_init_impl(); + if (error) { + printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error); + return (error); } - /* - * Use atleast 32 zvol_threads but for many core system, - * prefer 6 threads per taskq, but no more taskqs - * than threads in them on large systems. 
- * - * taskq total - * cpus taskqs threads threads - * ------- ------- ------- ------- - * 1 1 32 32 - * 2 1 32 32 - * 4 1 32 32 - * 8 2 16 32 - * 16 3 11 33 - * 32 5 7 35 - * 64 8 8 64 - * 128 11 12 132 - * 256 16 16 256 - */ - zv_taskq_t *ztqs = &zvol_taskqs; - uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs); - if (num_tqs == 0) { - num_tqs = 1 + num_online_cpus() / 6; - while (num_tqs * num_tqs > zvol_actual_threads) - num_tqs--; - } - uint_t per_tq_thread = zvol_actual_threads / num_tqs; - if (per_tq_thread * num_tqs < zvol_actual_threads) - per_tq_thread++; - ztqs->tqs_cnt = num_tqs; - ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP); - error = register_blkdev(zvol_major, ZVOL_DRIVER); + error = -register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { - kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); - ztqs->tqs_taskq = NULL; printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); - return (error); + return (SET_ERROR(error)); } if (zvol_blk_mq_queue_depth == 0) { @@ -1864,25 +1832,6 @@ zvol_init(void) 1024); } - for (uint_t i = 0; i < num_tqs; i++) { - char name[32]; - (void) snprintf(name, sizeof (name), "%s_tq-%u", - ZVOL_DRIVER, i); - ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread, - maxclsyspri, per_tq_thread, INT_MAX, - TASKQ_PREPOPULATE | TASKQ_DYNAMIC); - if (ztqs->tqs_taskq[i] == NULL) { - for (int j = i - 1; j >= 0; j--) - taskq_destroy(ztqs->tqs_taskq[j]); - unregister_blkdev(zvol_major, ZVOL_DRIVER); - kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * - sizeof (taskq_t *)); - ztqs->tqs_taskq = NULL; - return (-ENOMEM); - } - } - - zvol_init_impl(); ida_init(&zvol_ida); return (0); } @@ -1890,50 +1839,19 @@ zvol_init(void) void zvol_fini(void) { - zv_taskq_t *ztqs = &zvol_taskqs; - zvol_fini_impl(); unregister_blkdev(zvol_major, ZVOL_DRIVER); - if (ztqs->tqs_taskq == NULL) { - ASSERT3U(ztqs->tqs_cnt, ==, 0); - } else { - for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { - ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); - taskq_destroy(ztqs->tqs_taskq[i]); - } - kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * - sizeof (taskq_t *)); - ztqs->tqs_taskq = NULL; - } + zvol_fini_impl(); ida_destroy(&zvol_ida); } -module_param(zvol_inhibit_dev, uint, 0644); -MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); - module_param(zvol_major, uint, 0444); MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); -module_param(zvol_threads, uint, 0444); -MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set" - "to 0 to use all active CPUs"); - -module_param(zvol_request_sync, uint, 0644); -MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); - module_param(zvol_max_discard_blocks, ulong, 0444); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); -module_param(zvol_num_taskqs, uint, 0444); -MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs"); - -module_param(zvol_prefetch_bytes, uint, 0644); -MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); - -module_param(zvol_volmode, uint, 0644); -MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); - module_param(zvol_blk_mq_queue_depth, uint, 0644); MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); |
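
One addition above deserves a usage note: the new ZFS_IOC_REWRITE entry point in zpl_file.c takes a zfs_rewrite_args_t (off, len, flags, arg) and requires the file be open for writing. A hypothetical caller, as a sketch only - the header include path and the choice of offsets are assumptions, while the type, field, and ioctl names come from the diff itself:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <sys/fs/zfs.h>	/* zfs_rewrite_args_t, ZFS_IOC_REWRITE (assumed path) */

	int
	main(int argc, char **argv)
	{
		if (argc != 2) {
			fprintf(stderr, "usage: %s <file-on-zfs>\n", argv[0]);
			return (1);
		}
		/* zpl_ioctl_rewrite() rejects descriptors without FMODE_WRITE. */
		int fd = open(argv[1], O_WRONLY);
		if (fd == -1) {
			perror("open");
			return (1);
		}
		zfs_rewrite_args_t args;
		memset(&args, 0, sizeof (args));
		args.off = 0;
		args.len = 1024 * 1024;	/* rewrite the first 1 MiB in place */
		args.flags = 0;
		args.arg = 0;
		if (ioctl(fd, ZFS_IOC_REWRITE, &args) == -1)
			perror("ioctl(ZFS_IOC_REWRITE)");
		(void) close(fd);
		return (0);
	}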