Diffstat (limited to 'sys/contrib/openzfs/module')
50 files changed, 1304 insertions, 344 deletions
diff --git a/sys/contrib/openzfs/module/Kbuild.in b/sys/contrib/openzfs/module/Kbuild.in index 362d2295e091..58a80dc4402c 100644 --- a/sys/contrib/openzfs/module/Kbuild.in +++ b/sys/contrib/openzfs/module/Kbuild.in @@ -4,7 +4,7 @@ ZFS_MODULE_CFLAGS += -std=gnu99 -Wno-declaration-after-statement ZFS_MODULE_CFLAGS += -Wmissing-prototypes -ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @NO_FORMAT_ZERO_LENGTH@ +ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @KERNEL_NO_FORMAT_ZERO_LENGTH@ ifneq ($(KBUILD_EXTMOD),) zfs_include = @abs_top_srcdir@/include diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c index 6d3bcca9f995..dcb0a391dda4 100644 --- a/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c +++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c @@ -38,11 +38,14 @@ kfpu_begin(); E(s, d, b); kfpu_end(); \ } +#if defined(__x86_64) || defined(__aarch64__) || defined(__arm__) || \ + defined(__PPC64__) /* some implementation is always okay */ static inline boolean_t sha2_is_supported(void) { return (B_TRUE); } +#endif #if defined(__x86_64) diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c index d0fcca798fa9..ad707341eec7 100644 --- a/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c +++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c @@ -77,7 +77,8 @@ static const uint32_t SHA256_K[64] = { h = g, g = f, f = e, e = d + T1; \ d = c, c = b, b = a, a = T1 + T2; -static void sha256_generic(uint32_t state[8], const void *data, size_t num_blks) +static void +icp_sha256_generic(uint32_t state[8], const void *data, size_t num_blks) { uint64_t blk; @@ -173,7 +174,8 @@ static const uint64_t SHA512_K[80] = { 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 }; -static void sha512_generic(uint64_t state[8], const void *data, size_t num_blks) +static void +icp_sha512_generic(uint64_t state[8], const void *data, size_t num_blks) { uint64_t blk; @@ -226,7 +228,8 @@ static void sha512_generic(uint64_t state[8], const void *data, size_t num_blks) } } -static void sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len) +static void +icp_sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len) { uint64_t pos = ctx->count[0]; uint64_t total = ctx->count[1]; @@ -258,7 +261,8 @@ static void sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len) ctx->count[1] = total; } -static void sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len) +static void +icp_sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len) { uint64_t pos = ctx->count[0]; uint64_t total = ctx->count[1]; @@ -290,7 +294,8 @@ static void sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len) ctx->count[1] = total; } -static void sha256_final(sha256_ctx *ctx, uint8_t *result, int bits) +static void +icp_sha256_final(sha256_ctx *ctx, uint8_t *result, int bits) { uint64_t mlen, pos = ctx->count[0]; uint8_t *m = ctx->wbuf; @@ -334,7 +339,8 @@ static void sha256_final(sha256_ctx *ctx, uint8_t *result, int bits) memset(ctx, 0, sizeof (*ctx)); } -static void sha512_final(sha512_ctx *ctx, uint8_t *result, int bits) +static void +icp_sha512_final(sha512_ctx *ctx, uint8_t *result, int bits) { uint64_t mlen, pos = ctx->count[0]; uint8_t *m = ctx->wbuf, *r; @@ -461,14 +467,14 @@ SHA2Update(SHA2_CTX *ctx, const void *data, size_t len) switch (ctx->algotype) { case SHA256: - sha256_update(&ctx->sha256, data, len); + icp_sha256_update(&ctx->sha256, data, len); 
break; case SHA512: case SHA512_HMAC_MECH_INFO_TYPE: - sha512_update(&ctx->sha512, data, len); + icp_sha512_update(&ctx->sha512, data, len); break; case SHA512_256: - sha512_update(&ctx->sha512, data, len); + icp_sha512_update(&ctx->sha512, data, len); break; } } @@ -479,32 +485,33 @@ SHA2Final(void *digest, SHA2_CTX *ctx) { switch (ctx->algotype) { case SHA256: - sha256_final(&ctx->sha256, digest, 256); + icp_sha256_final(&ctx->sha256, digest, 256); break; case SHA512: case SHA512_HMAC_MECH_INFO_TYPE: - sha512_final(&ctx->sha512, digest, 512); + icp_sha512_final(&ctx->sha512, digest, 512); break; case SHA512_256: - sha512_final(&ctx->sha512, digest, 256); + icp_sha512_final(&ctx->sha512, digest, 256); break; } } /* the generic implementation is always okay */ -static boolean_t sha2_is_supported(void) +static boolean_t +icp_sha2_is_supported(void) { return (B_TRUE); } const sha256_ops_t sha256_generic_impl = { .name = "generic", - .transform = sha256_generic, - .is_supported = sha2_is_supported + .transform = icp_sha256_generic, + .is_supported = icp_sha2_is_supported }; const sha512_ops_t sha512_generic_impl = { .name = "generic", - .transform = sha512_generic, - .is_supported = sha2_is_supported + .transform = icp_sha512_generic, + .is_supported = icp_sha2_is_supported }; diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c index 2efd9fcf4c99..a85a71a83df4 100644 --- a/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c +++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c @@ -38,11 +38,14 @@ kfpu_begin(); E(s, d, b); kfpu_end(); \ } +#if defined(__x86_64) || defined(__aarch64__) || defined(__arm__) || \ + defined(__PPC64__) /* some implementation is always okay */ static inline boolean_t sha2_is_supported(void) { return (B_TRUE); } +#endif #if defined(__x86_64) diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c index ace2360c032d..ebc2c0eeb6d2 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c @@ -163,6 +163,13 @@ param_set_arc_int(SYSCTL_HANDLER_ARGS) return (0); } +static void +warn_deprecated_sysctl(const char *old, const char *new) +{ + printf("WARNING: sysctl vfs.zfs.%s is deprecated. 
Use vfs.zfs.%s instead.\n", + old, new); +} + int param_set_arc_max(SYSCTL_HANDLER_ARGS) { @@ -185,12 +192,15 @@ param_set_arc_max(SYSCTL_HANDLER_ARGS) if (val != 0) zfs_arc_max = arc_c_max; + if (arg2 != 0) + warn_deprecated_sysctl("arc_max", "arc.max"); + return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - NULL, 0, param_set_arc_max, "LU", + NULL, 1, param_set_arc_max, "LU", "Maximum ARC size in bytes (LEGACY)"); int @@ -214,12 +224,15 @@ param_set_arc_min(SYSCTL_HANDLER_ARGS) if (val != 0) zfs_arc_min = arc_c_min; + if (arg2 != 0) + warn_deprecated_sysctl("arc_min", "arc.min"); + return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - NULL, 0, param_set_arc_min, "LU", + NULL, 1, param_set_arc_min, "LU", "Minimum ARC size in bytes (LEGACY)"); extern uint_t zfs_arc_free_target; @@ -242,6 +255,9 @@ param_set_arc_free_target(SYSCTL_HANDLER_ARGS) zfs_arc_free_target = val; + if (arg2 != 0) + warn_deprecated_sysctl("arc_free_target", "arc.free_target"); + return (0); } @@ -251,7 +267,7 @@ param_set_arc_free_target(SYSCTL_HANDLER_ARGS) */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, - NULL, 0, param_set_arc_free_target, "IU", + NULL, 1, param_set_arc_free_target, "IU", "Desired number of free pages below which ARC triggers reclaim" " (LEGACY)"); @@ -270,12 +286,15 @@ param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) arc_no_grow_shift = val; + if (arg2 != 0) + warn_deprecated_sysctl("arc_no_grow_shift", "arc.no_grow_shift"); + return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - NULL, 0, param_set_arc_no_grow_shift, "I", + NULL, 1, param_set_arc_no_grow_shift, "I", "log2(fraction of ARC which must be free to allow growing) (LEGACY)"); extern uint64_t l2arc_write_max; @@ -746,12 +765,15 @@ param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS) zfs_vdev_min_auto_ashift = val; + if (arg2 != 0) + warn_deprecated_sysctl("min_auto_ashift", + "vdev.min_auto_ashift"); + return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, - CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift), + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1, param_set_min_auto_ashift, "IU", "Min ashift used when creating new top-level vdev. (LEGACY)"); @@ -771,12 +793,15 @@ param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) zfs_vdev_max_auto_ashift = val; + if (arg2 != 0) + warn_deprecated_sysctl("max_auto_ashift", + "vdev.max_auto_ashift"); + return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, - CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift), + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1, param_set_max_auto_ashift, "IU", "Max ashift used when optimizing for logical -> physical sector size on" " new top-level vdevs. 
(LEGACY)"); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c index b15a3e6e38c0..cb5787269db2 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c @@ -1175,7 +1175,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) int count = 0; zfs_acl_phys_t acl_phys; - if (zp->z_zfsvfs->z_replay == B_FALSE) { + if (ZTOV(zp) != NULL && zp->z_zfsvfs->z_replay == B_FALSE) { ASSERT_VOP_IN_SEQC(ZTOV(zp)); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c index a222c5de4a2a..d0a9c662e6f0 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c @@ -674,7 +674,6 @@ zfsctl_root_readdir(struct vop_readdir_args *ap) zfs_uio_t uio; int *eofp = ap->a_eofflag; off_t dots_offset; - ssize_t orig_resid; int error; zfs_uio_init(&uio, ap->a_uio); @@ -694,11 +693,13 @@ zfsctl_root_readdir(struct vop_readdir_args *ap) return (0); } - orig_resid = zfs_uio_resid(&uio); error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, &uio, &dots_offset); - if (error != 0) - goto err; + if (error != 0) { + if (error == ENAMETOOLONG) /* ran out of destination space */ + error = 0; + return (error); + } if (zfs_uio_offset(&uio) != dots_offset) return (SET_ERROR(EINVAL)); @@ -711,11 +712,8 @@ zfsctl_root_readdir(struct vop_readdir_args *ap) entry.d_reclen = sizeof (entry); error = vfs_read_dirent(ap, &entry, zfs_uio_offset(&uio)); if (error != 0) { -err: - if (error == ENAMETOOLONG) { - error = orig_resid == zfs_uio_resid(&uio) ? - EINVAL : 0; - } + if (error == ENAMETOOLONG) + error = 0; return (SET_ERROR(error)); } if (eofp != NULL) @@ -764,8 +762,7 @@ zfsctl_common_pathconf(struct vop_pathconf_args *ap) return (0); case _PC_MIN_HOLE_SIZE: - *ap->a_retval = (int)SPA_MINBLOCKSIZE; - return (0); + return (EINVAL); case _PC_ACL_EXTENDED: *ap->a_retval = 0; @@ -1060,21 +1057,17 @@ zfsctl_snapdir_readdir(struct vop_readdir_args *ap) zfs_uio_t uio; int *eofp = ap->a_eofflag; off_t dots_offset; - ssize_t orig_resid; int error; zfs_uio_init(&uio, ap->a_uio); - orig_resid = zfs_uio_resid(&uio); ASSERT3S(vp->v_type, ==, VDIR); error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, &uio, &dots_offset); if (error != 0) { - if (error == ENAMETOOLONG) { /* ran out of destination space */ - error = orig_resid == zfs_uio_resid(&uio) ? - EINVAL : 0; - } + if (error == ENAMETOOLONG) /* ran out of destination space */ + error = 0; return (error); } @@ -1092,13 +1085,9 @@ zfsctl_snapdir_readdir(struct vop_readdir_args *ap) dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); if (error != 0) { if (error == ENOENT) { - if (orig_resid == zfs_uio_resid(&uio)) { - error = EINVAL; - } else { - error = 0; - if (eofp != NULL) - *eofp = 1; - } + if (eofp != NULL) + *eofp = 1; + error = 0; } zfs_exit(zfsvfs, FTAG); return (error); @@ -1111,10 +1100,8 @@ zfsctl_snapdir_readdir(struct vop_readdir_args *ap) entry.d_reclen = sizeof (entry); error = vfs_read_dirent(ap, &entry, zfs_uio_offset(&uio)); if (error != 0) { - if (error == ENAMETOOLONG) { - error = orig_resid == zfs_uio_resid(&uio) ? 
- EINVAL : 0; - } + if (error == ENAMETOOLONG) + error = 0; zfs_exit(zfsvfs, FTAG); return (SET_ERROR(error)); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c index 21e5f7938f9f..ca13569a1235 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c @@ -164,8 +164,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off, - ssize_t *resid) + uint8_t ashift, ssize_t *resid) { + (void) ashift; return (zfs_file_write_impl(fp, buf, count, &off, resid)); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c index 174141a5deab..f34a2fd37a77 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c @@ -61,6 +61,7 @@ #include <sys/fs/zfs.h> #include <sys/dmu.h> #include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> #include <sys/spa.h> #include <sys/txg.h> #include <sys/dbuf.h> @@ -388,7 +389,9 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred, error = vn_lock(vp, LK_EXCLUSIVE); if (error) return (error); + vn_seqc_write_begin(vp); error = zfs_ioctl_setxattr(vp, fsx, cred); + vn_seqc_write_end(vp); VOP_UNLOCK(vp); return (error); } @@ -1695,7 +1698,6 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, objset_t *os; caddr_t outbuf; size_t bufsize; - ssize_t orig_resid; zap_cursor_t zc; zap_attribute_t *zap; uint_t bytes_wanted; @@ -1744,7 +1746,6 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, error = 0; os = zfsvfs->z_os; offset = zfs_uio_offset(uio); - orig_resid = zfs_uio_resid(uio); prefetch = zp->z_zn_prefetch; zap = zap_attribute_long_alloc(); @@ -1924,7 +1925,7 @@ update: kmem_free(outbuf, bufsize); if (error == ENOENT) - error = orig_resid == zfs_uio_resid(uio) ? EINVAL : 0; + error = 0; ZFS_ACCESSTIME_STAMP(zfsvfs, zp); @@ -2205,6 +2206,7 @@ zfs_setattr_dir(znode_t *dzp) if (err) break; + vn_seqc_write_begin(ZTOV(zp)); mutex_enter(&dzp->z_lock); if (zp->z_uid != dzp->z_uid) { @@ -2254,6 +2256,7 @@ sa_add_projid_err: dmu_tx_abort(tx); } tx = NULL; + vn_seqc_write_end(ZTOV(zp)); if (err != 0 && err != ENOENT) break; @@ -4113,6 +4116,7 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, { znode_t *zp; zfsvfs_t *zfsvfs; + uint_t blksize, iosize; int error; switch (cmd) { @@ -4124,8 +4128,20 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, *valp = 64; return (0); case _PC_MIN_HOLE_SIZE: - *valp = (int)SPA_MINBLOCKSIZE; - return (0); + iosize = vp->v_mount->mnt_stat.f_iosize; + if (vp->v_type == VREG) { + zp = VTOZ(vp); + blksize = zp->z_blksz; + if (zp->z_size <= blksize) + blksize = MAX(blksize, iosize); + *valp = (int)blksize; + return (0); + } + if (vp->v_type == VDIR) { + *valp = (int)iosize; + return (0); + } + return (EINVAL); case _PC_ACL_EXTENDED: #if 0 /* POSIX ACLs are not implemented for ZFS on FreeBSD yet. 
*/ zp = VTOZ(vp); @@ -4207,8 +4223,20 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, zfs_vmobject_wlock(object); (void) vm_page_grab_pages(object, OFF_TO_IDX(start), - VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_ZERO, + VM_ALLOC_NORMAL | VM_ALLOC_WAITOK, ma, count); + if (!vm_page_all_valid(ma[count - 1])) { + /* + * Later in this function, we copy DMU data to + * invalid pages only. The last page may not be + * entirely filled though, if the file does not + * end on a page boundary. Therefore, we zero + * that last page here to make sure it does not + * contain garbage after the end of file. + */ + ASSERT(vm_page_none_valid(ma[count - 1])); + vm_page_zero_invalid(ma[count - 1], FALSE); + } zfs_vmobject_wunlock(object); } if (blksz == zp->z_blksz) @@ -5729,6 +5757,9 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap) { ulong_t val; int error; +#ifdef _PC_CLONE_BLKSIZE + zfsvfs_t *zfsvfs; +#endif error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); @@ -5775,6 +5806,21 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap) *ap->a_retval = 1; return (0); #endif +#ifdef _PC_CLONE_BLKSIZE + case _PC_CLONE_BLKSIZE: + zfsvfs = (zfsvfs_t *)ap->a_vp->v_mount->mnt_data; + if (zfs_bclone_enabled && + spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os), + SPA_FEATURE_BLOCK_CLONING)) + *ap->a_retval = dsl_dataset_feature_is_active( + zfsvfs->z_os->os_dsl_dataset, + SPA_FEATURE_LARGE_BLOCKS) ? + SPA_MAXBLOCKSIZE : + SPA_OLD_MAXBLOCKSIZE; + else + *ap->a_retval = 0; + return (0); +#endif default: return (vop_stdpathconf(ap)); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c index 7cd0a153577c..649022ab5bcb 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c @@ -817,6 +817,10 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, (*zpp)->z_dnodesize = dnodesize; (*zpp)->z_projid = projid; + vnode_t *vp = ZTOV(*zpp); + if (!(flag & IS_ROOT_NODE)) + vn_seqc_write_begin(vp); + if (vap->va_mask & AT_XVATTR) zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); @@ -825,7 +829,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } if (!(flag & IS_ROOT_NODE)) { - vnode_t *vp = ZTOV(*zpp); + vn_seqc_write_end(vp); vp->v_vflag |= VV_FORCEINSMQ; int err = insmntque(vp, zfsvfs->z_vfs); vp->v_vflag &= ~VV_FORCEINSMQ; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c index 91cf38016e00..8562c42b3220 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c @@ -437,6 +437,7 @@ zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + memset(&cuio_s, 0, sizeof (cuio_s)); zfs_uio_init(&cuio, &cuio_s); keydata_len = zio_crypt_table[crypt].ci_keylen; @@ -519,6 +520,7 @@ zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, keydata_len = zio_crypt_table[crypt].ci_keylen; rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + memset(&cuio_s, 0, sizeof (cuio_s)); zfs_uio_init(&cuio, &cuio_s); /* diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c index 0dd2ecd7fd8d..3ddbfcb97184 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c +++ 
b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c @@ -183,6 +183,7 @@ static struct filterops zvol_filterops_vnode = { .f_isfd = 1, .f_detach = zvol_filter_detach, .f_event = zvol_filter_vnode, + .f_copy = knote_triv_copy, }; extern uint_t zfs_geom_probe_vdev_key; diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c index 45c2999a4bb1..b2eae5d00b10 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c @@ -25,6 +25,10 @@ * SUCH DAMAGE. */ +/* + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> + */ + #include <sys/types.h> #include <sys/sysmacros.h> #include <sys/kmem.h> @@ -56,6 +60,19 @@ typedef struct zone_dataset { } zone_dataset_t; #ifdef CONFIG_USER_NS + +/* + * Linux 6.18 moved the generic namespace type away from ns->ops->type onto + * ns_common itself. + */ +#ifdef HAVE_NS_COMMON_TYPE +#define ns_is_newuser(ns) \ + ((ns)->ns_type == CLONE_NEWUSER) +#else +#define ns_is_newuser(ns) \ + ((ns)->ops != NULL && (ns)->ops->type == CLONE_NEWUSER) +#endif + /* * Returns: * - 0 on success @@ -84,7 +101,7 @@ user_ns_get(int fd, struct user_namespace **userns) goto done; } ns = get_proc_ns(file_inode(nsfile)); - if (ns->ops->type != CLONE_NEWUSER) { + if (!ns_is_newuser(ns)) { error = ENOTTY; goto done; } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c index 8a8316f63c48..18f2426fbbfc 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c @@ -23,6 +23,7 @@ * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved. * Copyright (c) 2023, 2024, Klara Inc. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> */ /* @@ -1109,6 +1110,14 @@ abd_return_buf_copy(abd_t *abd, void *buf, size_t n) #define ABD_ITER_PAGE_SIZE(page) (PAGESIZE) #endif +#ifndef nth_page +/* + * Since 6.18 nth_page() no longer exists, and is no longer required to iterate + * within a single SG entry, so we replace it with a simple addition. + */ +#define nth_page(p, n) ((p)+(n)) +#endif + void abd_iter_page(struct abd_iter *aiter) { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c index 830fad7fe793..1bd3500e9f66 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c @@ -471,13 +471,17 @@ vdev_disk_close(vdev_t *v) if (v->vdev_reopening || vd == NULL) return; + rw_enter(&vd->vd_lock, RW_WRITER); + if (vd->vd_bdh != NULL) vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), zfs_vdev_holder); + v->vdev_tsd = NULL; + + rw_exit(&vd->vd_lock); rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); - v->vdev_tsd = NULL; } /* diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c index daa4b5776837..934d74a112fd 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c @@ -2524,7 +2524,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, * Also note: DOS R/O is ignored for directories. 
*/ if ((v4_mode & WRITE_MASK_DATA) && - S_ISDIR(ZTOI(zp)->i_mode) && + !S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_READONLY)) { return (SET_ERROR(EPERM)); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c index c729947369c2..3fdcdbac6f68 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c @@ -115,8 +115,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) */ int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off, - ssize_t *resid) + uint8_t ashift, ssize_t *resid) { + (void) ashift; ssize_t rc; rc = kernel_write(fp, buf, count, &off); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c index cd606e667bff..8a7d14ab6119 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -1556,6 +1556,12 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) sb->s_xattr = zpl_xattr_handlers; sb->s_export_op = &zpl_export_operations; +#ifdef HAVE_SET_DEFAULT_D_OP + set_default_d_op(sb, &zpl_dentry_operations); +#else + sb->s_d_op = &zpl_dentry_operations; +#endif + /* Set features for file system. */ zfs_set_fuid_feature(zfsvfs); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index 6106726651a3..e845ad69ad78 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -2033,10 +2033,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) goto out3; } - if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { - err = SET_ERROR(EPERM); - goto out3; - } + /* ZFS_READONLY will be handled in zfs_zaccess() */ /* * Verify timestamps doesn't overflow 32 bits. diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c index 48dae79a2373..81ac26cb0c93 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c @@ -202,7 +202,7 @@ zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags) return (!!dentry->d_inode); } -static dentry_operations_t zpl_dops_snapdirs = { +static const struct dentry_operations zpl_dops_snapdirs = { /* * Auto mounting of snapshots is only supported for 2.6.37 and * newer kernels. Prior to this kernel the ops->follow_link() @@ -215,6 +215,51 @@ static dentry_operations_t zpl_dops_snapdirs = { .d_revalidate = zpl_snapdir_revalidate, }; +/* + * For the .zfs control directory to work properly we must be able to override + * the default operations table and register custom .d_automount and + * .d_revalidate callbacks. + */ +static void +set_snapdir_dentry_ops(struct dentry *dentry, unsigned int extraflags) { + static const unsigned int op_flags = + DCACHE_OP_HASH | DCACHE_OP_COMPARE | + DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE | + DCACHE_OP_PRUNE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_REAL; + +#ifdef HAVE_D_SET_D_OP + /* + * d_set_d_op() will set the DCACHE_OP_ flags according to what it + * finds in the passed dentry_operations, so we don't have to. + * + * We clear the flags and the old op table before calling d_set_d_op() + * because it issues a warning when the dentry operations table is already + * set. 
+ */ + dentry->d_op = NULL; + dentry->d_flags &= ~op_flags; + d_set_d_op(dentry, &zpl_dops_snapdirs); + dentry->d_flags |= extraflags; +#else + /* + * Since 6.17 there's no exported way to modify dentry ops, so we have + * to reach in and do it ourselves. This should be safe for our very + * narrow use case, which is to create or splice in an entry to give + * access to a snapshot. + * + * We need to set the op flags directly. We hardcode + * DCACHE_OP_REVALIDATE because that's the only operation we have; if + * we ever extend zpl_dops_snapdirs we will need to update the op flags + * to match. + */ + spin_lock(&dentry->d_lock); + dentry->d_op = &zpl_dops_snapdirs; + dentry->d_flags &= ~op_flags; + dentry->d_flags |= DCACHE_OP_REVALIDATE | extraflags; + spin_unlock(&dentry->d_lock); +#endif +} + static struct dentry * zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) @@ -236,10 +281,7 @@ zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, return (ERR_PTR(error)); ASSERT(error == 0 || ip == NULL); - d_clear_d_op(dentry); - d_set_d_op(dentry, &zpl_dops_snapdirs); - dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; - + set_snapdir_dentry_ops(dentry, DCACHE_NEED_AUTOMOUNT); return (d_splice_alias(ip, dentry)); } @@ -373,8 +415,7 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0); if (error == 0) { - d_clear_d_op(dentry); - d_set_d_op(dentry, &zpl_dops_snapdirs); + set_snapdir_dentry_ops(dentry, 0); d_instantiate(dentry, ip); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index d07317b0d910..02965ac8cbee 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -23,6 +23,7 @@ * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright (c) 2025, Klara, Inc. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> */ @@ -478,6 +479,7 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) return (ret); } +#ifdef HAVE_WRITE_CACHE_PAGES #ifdef HAVE_WRITEPAGE_T_FOLIO static int zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data) @@ -499,6 +501,78 @@ zpl_write_cache_pages(struct address_space *mapping, #endif return (result); } +#else +static inline int +zpl_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, void *data) +{ + pgoff_t start = wbc->range_start >> PAGE_SHIFT; + pgoff_t end = wbc->range_end >> PAGE_SHIFT; + + struct folio_batch fbatch; + folio_batch_init(&fbatch); + + /* + * This atomically (-ish) tags all DIRTY pages in the range with + * TOWRITE, allowing users to continue dirtying or undirtying pages + * while we get on with writeback, without us treading on each other. + */ + tag_pages_for_writeback(mapping, start, end); + + int err = 0; + unsigned int npages; + + /* + * Grab references to the TOWRITE pages just flagged. This may not get + * all of them, so we do it in a loop until there are none left. + */ + while ((npages = filemap_get_folios_tag(mapping, &start, end, + PAGECACHE_TAG_TOWRITE, &fbatch)) != 0) { + + /* Loop over each page and write it out. */ + struct folio *folio; + while ((folio = folio_batch_next(&fbatch)) != NULL) { + folio_lock(folio); + + /* + * If the folio has been remapped, or is no longer + * dirty, then there's nothing to do. 
*/ + if (folio->mapping != mapping || + !folio_test_dirty(folio)) { + folio_unlock(folio); + continue; + } + + /* + * If writeback is already in progress, wait for it to + * finish. We continue after this even if the page + * ends up clean; zfs_putpage() will skip it if no + * further work is required. + */ + while (folio_test_writeback(folio)) + folio_wait_bit(folio, PG_writeback); + + /* + * Write it out and collect any error. zfs_putpage() + * will clear the TOWRITE and DIRTY flags, and return + * with the page unlocked. + */ + int ferr = zpl_putpage(&folio->page, wbc, data); + if (err == 0 && ferr != 0) + err = ferr; + + /* Housekeeping for the caller. */ + wbc->nr_to_write -= folio_nr_pages(folio); + } + + /* Release any remaining references on the batch. */ + folio_batch_release(&fbatch); + } + + return (err); +} +#endif static int zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index 53819628627d..347b352506e5 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c @@ -22,6 +22,8 @@ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2023, Datto Inc. All rights reserved. + * Copyright (c) 2025, Klara, Inc. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> */ @@ -32,7 +34,22 @@ #include <sys/zpl.h> #include <linux/iversion.h> #include <linux/version.h> +#include <linux/vfs_compat.h> +/* + * What to do when the last reference to an inode is released. If 0, the kernel + * will cache it on the superblock. If 1, the inode will be freed immediately. + * See zpl_drop_inode(). + */ +int zfs_delete_inode = 0; + +/* + * What to do when the last reference to a dentry is released. If 0, the kernel + * will cache it until the entry (file) is destroyed. If 1, the dentry will be + * marked for cleanup, at which time its inode reference will be released. See + * zpl_dentry_delete(). + */ +int zfs_delete_dentry = 0; static struct inode * zpl_inode_alloc(struct super_block *sb) @@ -77,11 +94,36 @@ zpl_dirty_inode(struct inode *ip, int flags) } /* - * When ->drop_inode() is called its return value indicates if the - * inode should be evicted from the inode cache. If the inode is - * unhashed and has no links the default policy is to evict it - * immediately. + * ->drop_inode() is called when the last reference to an inode is released. + * Its return value indicates if the inode should be destroyed immediately, or + * cached on the superblock structure. + * + * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns + * "destroy immediately" if the inode is unhashed and has no links (roughly: no + * longer exists on disk). On datasets with millions of rarely-accessed files, + * this can cause a large amount of memory to be "pinned" by cached inodes, + * which in turn pin their associated dnodes and dbufs, until the kernel starts + * reporting memory pressure and requests OpenZFS release some memory (see + * zfs_prune()). + * + * When set to 1, we call generic_delete_inode(), which always returns "destroy + * immediately", resulting in inodes being destroyed immediately, releasing + * their associated dnodes and dbufs to the dbuf cache and the ARC to be + * evicted as normal. 
* + * Note that the "last reference" doesn't always mean the last _userspace_ + * reference; the dentry cache also holds a reference, so "busy" inodes will + * still be kept alive that way (subject to dcache tuning). + */ +static int +zpl_drop_inode(struct inode *ip) +{ + if (zfs_delete_inode) + return (generic_delete_inode(ip)); + return (generic_drop_inode(ip)); +} + +/* * The ->evict_inode() callback must minimally truncate the inode pages, * and call clear_inode(). For 2.6.35 and later kernels this will * simply update the inode state, with the sync occurring before the @@ -470,6 +512,7 @@ const struct super_operations zpl_super_operations = { .destroy_inode = zpl_inode_destroy, .dirty_inode = zpl_dirty_inode, .write_inode = NULL, + .drop_inode = zpl_drop_inode, .evict_inode = zpl_evict_inode, .put_super = zpl_put_super, .sync_fs = zpl_sync_fs, @@ -480,6 +523,35 @@ const struct super_operations zpl_super_operations = { .show_stats = NULL, }; +/* + * ->d_delete() is called when the last reference to a dentry is released. Its + * return value indicates if the dentry should be destroyed immediately, or + * retained in the dentry cache. + * + * By default (zfs_delete_dentry=0) the kernel will always cache unused + * entries. Each dentry holds an inode reference, so cached dentries can hold + * the final inode reference indefinitely, leading to the inode and its related + * data being pinned (see zpl_drop_inode()). + * + * When set to 1, we signal that the dentry should be destroyed immediately and + * never cached. This reduces memory usage, at the cost of higher overheads to + * lookup a file, as the inode and its underlying data (dnode/dbuf) need to be + * reloaded and reinflated. + * + * Note that userspace does not have direct control over dentry references and + * reclaim; rather, this is part of the kernel's caching and reclaim subsystems + * (eg vm.vfs_cache_pressure). + */ +static int +zpl_dentry_delete(const struct dentry *dentry) +{ + return (zfs_delete_dentry ? 1 : 0); +} + +const struct dentry_operations zpl_dentry_operations = { + .d_delete = zpl_dentry_delete, +}; + struct file_system_type zpl_fs_type = { .owner = THIS_MODULE, .name = ZFS_DRIVER, @@ -491,3 +563,10 @@ struct file_system_type zpl_fs_type = { .mount = zpl_mount, .kill_sb = zpl_kill_sb, }; + +ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW, + "Delete inodes as soon as the last reference is released."); + +ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW, + "Delete dentries from dentry cache as soon as the last reference is " + "released."); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index bac166fcd89e..fe939150b641 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -21,7 +21,7 @@ */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. - * Copyright (c) 2024, Rob Norris <robn@despairlabs.com> + * Copyright (c) 2024, 2025, Rob Norris <robn@despairlabs.com> * Copyright (c) 2024, 2025, Klara, Inc. */ @@ -337,16 +337,14 @@ zvol_discard(zv_request_t *zvr) } /* - * Align the request to volume block boundaries when a secure erase is - * not required. This will prevent dnode_free_range() from zeroing out - * the unaligned parts which is slow (read-modify-write) and useless - * since we are not freeing any space by doing so. + * Align the request to volume block boundaries. 
This will prevent + * dnode_free_range() from zeroing out the unaligned parts which is + * slow (read-modify-write) and useless since we are not freeing any + * space by doing so. */ - if (!io_is_secure_erase(bio, rq)) { - start = P2ROUNDUP(start, zv->zv_volblocksize); - end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); - size = end - start; - } + start = P2ROUNDUP(start, zv->zv_volblocksize); + end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); + size = end - start; if (start >= end) goto unlock; @@ -467,6 +465,24 @@ zvol_read_task(void *arg) zv_request_task_free(task); } +/* + * Note: + * + * The kernel uses different enum names for the IO opcode, depending on the + * kernel version ('req_opf', 'req_op'). To sidestep this, use macros rather + * than inline functions for these checks. + */ +/* Should this IO go down the zvol write path? */ +#define ZVOL_OP_IS_WRITE(op) \ + (op == REQ_OP_WRITE || \ + op == REQ_OP_FLUSH || \ + op == REQ_OP_DISCARD) + +/* Is this IO type supported by zvols? */ +#define ZVOL_OP_IS_SUPPORTED(op) (op == REQ_OP_READ || ZVOL_OP_IS_WRITE(op)) + +/* Get the IO opcode */ +#define ZVOL_OP(bio, rq) (bio != NULL ? bio_op(bio) : req_op(rq)) /* * Process a BIO or request @@ -484,7 +500,33 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, fstrans_cookie_t cookie = spl_fstrans_mark(); uint64_t offset = io_offset(bio, rq); uint64_t size = io_size(bio, rq); - int rw = io_data_dir(bio, rq); + int rw; + + if (unlikely(!ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)))) { + zfs_dbgmsg("Unsupported zvol %s, op=%d, flags=0x%x", + rq != NULL ? "request" : "BIO", + ZVOL_OP(bio, rq), + rq != NULL ? rq->cmd_flags : bio->bi_opf); + ASSERT(ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq))); + zvol_end_io(bio, rq, SET_ERROR(ENOTSUPP)); + goto out; + } + + if (ZVOL_OP_IS_WRITE(ZVOL_OP(bio, rq))) { + rw = WRITE; + } else { + rw = READ; + } + + /* + * Sanity check + * + * If we're a BIO, check our rw matches the kernel's + * bio_data_dir(bio) rw. We need to check because we support fewer + * IO operations, and want to verify that what we think are reads and + * writes from those operations match what the kernel thinks. + */ + ASSERT(rq != NULL || rw == bio_data_dir(bio)); if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { zvol_end_io(bio, rq, SET_ERROR(ENXIO)); @@ -589,7 +631,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, * interfaces lack this functionality (they block waiting for * the i/o to complete). */ - if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { + if (io_is_discard(bio, rq)) { if (force_sync) { zvol_discard(&zvr); } else { @@ -990,12 +1032,12 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) * tiny devices. For devices over 1 Mib a standard head and sector count * is used to keep the cylinders count reasonable. 
*/ -static int -zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static inline int +zvol_getgeo_impl(struct gendisk *disk, struct hd_geometry *geo) { + zvol_state_t *zv = atomic_load_ptr(&disk->private_data); sector_t sectors; - zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data); ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); @@ -1015,6 +1057,20 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) return (0); } +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK +static int +zvol_getgeo(struct gendisk *disk, struct hd_geometry *geo) +{ + return (zvol_getgeo_impl(disk, geo)); +} +#else +static int +zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + return (zvol_getgeo_impl(bdev->bd_disk, geo)); +} +#endif + /* * Why have two separate block_device_operations structs? * @@ -1458,7 +1514,7 @@ zvol_os_remove_minor(zvol_state_t *zv) if (zso->use_blk_mq) blk_mq_free_tag_set(&zso->tag_set); - ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS); + ida_free(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS); kmem_free(zso, sizeof (struct zvol_state_os)); @@ -1613,7 +1669,7 @@ zvol_os_create_minor(const char *name) if (zvol_inhibit_dev) return (0); - idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); + idx = ida_alloc(&zvol_ida, kmem_flags_convert(KM_SLEEP)); if (idx < 0) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; @@ -1621,7 +1677,7 @@ zvol_os_create_minor(const char *name) /* too many partitions can cause an overflow */ zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", name, minor, MINOR(minor)); - ida_simple_remove(&zvol_ida, idx); + ida_free(&zvol_ida, idx); return (SET_ERROR(EINVAL)); } @@ -1629,7 +1685,7 @@ zvol_os_create_minor(const char *name) if (zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock)); mutex_exit(&zv->zv_state_lock); - ida_simple_remove(&zvol_ida, idx); + ida_free(&zvol_ida, idx); return (SET_ERROR(EEXIST)); } @@ -1729,7 +1785,7 @@ out_doi: rw_exit(&zvol_state_lock); error = zvol_os_add_disk(zv->zv_zso->zvo_disk); } else { - ida_simple_remove(&zvol_ida, idx); + ida_free(&zvol_ida, idx); } return (error); diff --git a/sys/contrib/openzfs/module/zcommon/zfs_deleg.c b/sys/contrib/openzfs/module/zcommon/zfs_deleg.c index 49bb534ca26c..87596558c9a1 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_deleg.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_deleg.c @@ -59,6 +59,7 @@ const zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = { {ZFS_DELEG_PERM_SNAPSHOT}, {ZFS_DELEG_PERM_SHARE}, {ZFS_DELEG_PERM_SEND}, + {ZFS_DELEG_PERM_SEND_RAW}, {ZFS_DELEG_PERM_USERPROP}, {ZFS_DELEG_PERM_USERQUOTA}, {ZFS_DELEG_PERM_GROUPQUOTA}, diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c index 864e3898b365..9190ae0362ea 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c @@ -364,8 +364,8 @@ zfs_prop_init(void) static const zprop_index_t xattr_table[] = { { "off", ZFS_XATTR_OFF }, - { "on", ZFS_XATTR_SA }, { "sa", ZFS_XATTR_SA }, + { "on", ZFS_XATTR_SA }, { "dir", ZFS_XATTR_DIR }, { NULL } }; diff --git a/sys/contrib/openzfs/module/zcommon/zpool_prop.c b/sys/contrib/openzfs/module/zcommon/zpool_prop.c index 04ae9f986d8f..07819ba2be8b 100644 --- a/sys/contrib/openzfs/module/zcommon/zpool_prop.c +++ b/sys/contrib/openzfs/module/zcommon/zpool_prop.c @@ -467,9 +467,15 @@ vdev_prop_init(void) zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0, PROP_READONLY, 
ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING", boolean_table, sfeatures); + zprop_register_index(VDEV_PROP_SIT_OUT, "sit_out", 0, + PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "SIT_OUT", boolean_table, + sfeatures); zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0, PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP", boolean_table, sfeatures); + zprop_register_index(VDEV_PROP_AUTOSIT, "autosit", 0, + PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "AUTOSIT", boolean_table, + sfeatures); /* default index properties */ zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE, diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index df41e3b49204..dbb5e942e2e6 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -1157,7 +1157,7 @@ buf_fini(void) #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages - * should be using vmem_free() in the linux kernel\ + * should be using vmem_free() in the linux kernel. */ vmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); @@ -1392,6 +1392,7 @@ arc_get_complevel(arc_buf_t *buf) return (buf->b_hdr->b_complevel); } +__maybe_unused static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { @@ -4650,10 +4651,10 @@ arc_flush_task(void *arg) arc_flush_impl(spa_guid, B_FALSE); arc_async_flush_remove(spa_guid, af->af_cache_level); - uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time); - if (elaspsed > 0) { + uint64_t elapsed = NSEC2MSEC(gethrtime() - start_time); + if (elapsed > 0) { zfs_dbgmsg("spa %llu arc flushed in %llu ms", - (u_longlong_t)spa_guid, (u_longlong_t)elaspsed); + (u_longlong_t)spa_guid, (u_longlong_t)elapsed); } } @@ -9151,7 +9152,7 @@ top: if (dev->l2ad_first) { /* * This is the first sweep through the device. There is - * nothing to evict. We have already trimmmed the + * nothing to evict. We have already trimmed the * whole device. */ goto out; @@ -10085,12 +10086,12 @@ l2arc_device_teardown(void *arg) kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); vmem_free(remdev, sizeof (l2arc_dev_t)); - uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time); - if (elaspsed > 0) { + uint64_t elapsed = NSEC2MSEC(gethrtime() - start_time); + if (elapsed > 0) { zfs_dbgmsg("spa %llu, vdev %llu removed in %llu ms", (u_longlong_t)rva->rva_spa_gid, (u_longlong_t)rva->rva_vdev_gid, - (u_longlong_t)elaspsed); + (u_longlong_t)elapsed); } if (rva->rva_async) diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index 7403f10d91b7..fccc4c5b5b94 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -2270,14 +2270,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (dn->dn_objset->os_dsl_dataset != NULL) rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); #endif - /* - * We make this assert for private objects as well, but after we - * check if we're already dirty. They are allowed to re-dirty - * in syncing context. - */ - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? 
DN_DIRTY_SYNC : DN_DIRTY_OPEN)); mutex_enter(&db->db_mtx); /* @@ -2289,12 +2281,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_state == DB_CACHED || db->db_state == DB_FILL || db->db_state == DB_NOFILL); - mutex_enter(&dn->dn_mtx); - dnode_set_dirtyctx(dn, tx, db); - if (tx->tx_txg > dn->dn_dirty_txg) - dn->dn_dirty_txg = tx->tx_txg; - mutex_exit(&dn->dn_mtx); - if (db->db_blkid == DMU_SPILL_BLKID) dn->dn_have_spill = B_TRUE; @@ -2313,13 +2299,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (dr_next); } - /* - * Only valid if not already dirty. - */ - ASSERT(dn->dn_object == 0 || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); - ASSERT3U(dn->dn_nlevels, >, db->db_level); /* diff --git a/sys/contrib/openzfs/module/zfs/ddt.c b/sys/contrib/openzfs/module/zfs/ddt.c index d6658375f810..0dc9adc7fd4f 100644 --- a/sys/contrib/openzfs/module/zfs/ddt.c +++ b/sys/contrib/openzfs/module/zfs/ddt.c @@ -1701,9 +1701,11 @@ ddt_load(spa_t *spa) } } - error = ddt_log_load(ddt); - if (error != 0 && error != ENOENT) - return (error); + if (ddt->ddt_flags & DDT_FLAG_LOG) { + error = ddt_log_load(ddt); + if (error != 0 && error != ENOENT) + return (error); + } DDT_KSTAT_SET(ddt, dds_log_active_entries, avl_numnodes(&ddt->ddt_log_active->ddl_tree)); diff --git a/sys/contrib/openzfs/module/zfs/ddt_log.c b/sys/contrib/openzfs/module/zfs/ddt_log.c index 3d30e244c1f7..c7a2426f3a77 100644 --- a/sys/contrib/openzfs/module/zfs/ddt_log.c +++ b/sys/contrib/openzfs/module/zfs/ddt_log.c @@ -176,11 +176,13 @@ ddt_log_update_stats(ddt_t *ddt) * that's reasonable to expect anyway. */ dmu_object_info_t doi; - uint64_t nblocks; - dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi); - nblocks = doi.doi_physical_blocks_512; - dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi); - nblocks += doi.doi_physical_blocks_512; + uint64_t nblocks = 0; + if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, + &doi) == 0) + nblocks += doi.doi_physical_blocks_512; + if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, + &doi) == 0) + nblocks += doi.doi_physical_blocks_512; ddt_object_t *ddo = &ddt->ddt_log_stats; ddo->ddo_count = @@ -243,6 +245,13 @@ ddt_log_alloc_entry(ddt_t *ddt) } static void +ddt_log_free_entry(ddt_t *ddt, ddt_log_entry_t *ddle) +{ + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); +} + +static void ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) { /* Create the log tree entry from a live or stored entry */ @@ -347,8 +356,7 @@ ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); avl_remove(&ddl->ddl_tree, ddle); - kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? - ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + ddt_log_free_entry(ddt, ddle); return (B_TRUE); } @@ -365,8 +373,7 @@ ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk) ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); avl_remove(&ddl->ddl_tree, ddle); - kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? 
- ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + ddt_log_free_entry(ddt, ddle); return (B_TRUE); } @@ -527,8 +534,7 @@ ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl) IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree)); while ((ddle = avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) { - kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? - ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + ddt_log_free_entry(ddt, ddle); } ASSERT(avl_is_empty(&ddl->ddl_tree)); } @@ -727,7 +733,7 @@ ddt_log_load(ddt_t *ddt) ddle = fe; fe = AVL_NEXT(fl, fe); avl_remove(fl, ddle); - + ddt_log_free_entry(ddt, ddle); ddle = ae; ae = AVL_NEXT(al, ae); } diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index f7f808d5b8f7..a7a5c89bdafb 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -759,6 +759,8 @@ dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, */ uint8_t ibps = ibs - SPA_BLKPTRSHIFT; limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs; + if (limit == 0) + end2 = start2; do { level2++; start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps; @@ -1689,8 +1691,8 @@ dmu_object_cached_size(objset_t *os, uint64_t object, dmu_object_info_from_dnode(dn, &doi); - for (uint64_t off = 0; off < doi.doi_max_offset; - off += dmu_prefetch_max) { + for (uint64_t off = 0; off < doi.doi_max_offset && + dmu_prefetch_max > 0; off += dmu_prefetch_max) { /* dbuf_read doesn't prefetch L1 blocks. */ dmu_prefetch_by_dnode(dn, 1, off, dmu_prefetch_max, ZIO_PRIORITY_SYNC_READ); diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c index a77f338bdfd3..8e6b569c2100 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_objset.c +++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c @@ -2037,6 +2037,8 @@ userquota_updates_task(void *arg) dn->dn_id_flags |= DN_ID_CHKED_BONUS; } dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); + ASSERT3U(dn->dn_dirtycnt, >, 0); + dn->dn_dirtycnt--; mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); @@ -2070,6 +2072,10 @@ dnode_rele_task(void *arg) dnode_t *dn; while ((dn = multilist_sublist_head(list)) != NULL) { + mutex_enter(&dn->dn_mtx); + ASSERT3U(dn->dn_dirtycnt, >, 0); + dn->dn_dirtycnt--; + mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); dnode_rele(dn, &os->os_synced_dnodes); } diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c index 963ff41232a3..e88d394b5229 100644 --- a/sys/contrib/openzfs/module/zfs/dnode.c +++ b/sys/contrib/openzfs/module/zfs/dnode.c @@ -173,9 +173,7 @@ dnode_cons(void *arg, void *unused, int kmflag) dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; - dn->dn_dirty_txg = 0; - dn->dn_dirtyctx = 0; - dn->dn_dirtyctx_firstset = NULL; + dn->dn_dirtycnt = 0; dn->dn_bonus = NULL; dn->dn_have_spill = B_FALSE; dn->dn_zio = NULL; @@ -229,9 +227,7 @@ dnode_dest(void *arg, void *unused) ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_free_txg); ASSERT0(dn->dn_assigned_txg); - ASSERT0(dn->dn_dirty_txg); - ASSERT0(dn->dn_dirtyctx); - ASSERT0P(dn->dn_dirtyctx_firstset); + ASSERT0(dn->dn_dirtycnt); ASSERT0P(dn->dn_bonus); ASSERT(!dn->dn_have_spill); ASSERT0P(dn->dn_zio); @@ -692,10 +688,8 @@ dnode_destroy(dnode_t *dn) dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; - dn->dn_dirty_txg = 0; + dn->dn_dirtycnt = 0; - dn->dn_dirtyctx = 0; - dn->dn_dirtyctx_firstset = NULL; if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); 
dbuf_destroy(dn->dn_bonus); @@ -800,11 +794,9 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dn->dn_bonuslen = bonuslen; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; - dn->dn_dirtyctx = 0; dn->dn_free_txg = 0; - dn->dn_dirtyctx_firstset = NULL; - dn->dn_dirty_txg = 0; + dn->dn_dirtycnt = 0; dn->dn_allocated_txg = tx->tx_txg; dn->dn_id_flags = 0; @@ -955,9 +947,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ndn->dn_allocated_txg = odn->dn_allocated_txg; ndn->dn_free_txg = odn->dn_free_txg; ndn->dn_assigned_txg = odn->dn_assigned_txg; - ndn->dn_dirty_txg = odn->dn_dirty_txg; - ndn->dn_dirtyctx = odn->dn_dirtyctx; - ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; + ndn->dn_dirtycnt = odn->dn_dirtycnt; ASSERT0(zfs_refcount_count(&odn->dn_tx_holds)); zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds); ASSERT(avl_is_empty(&ndn->dn_dbufs)); @@ -1020,9 +1010,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) odn->dn_allocated_txg = 0; odn->dn_free_txg = 0; odn->dn_assigned_txg = 0; - odn->dn_dirty_txg = 0; - odn->dn_dirtyctx = 0; - odn->dn_dirtyctx_firstset = NULL; + odn->dn_dirtycnt = 0; odn->dn_have_spill = B_FALSE; odn->dn_zio = NULL; odn->dn_oldused = 0; @@ -1273,8 +1261,8 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots) } else if (DN_SLOT_IS_PTR(dn)) { mutex_enter(&dn->dn_mtx); boolean_t can_free = (dn->dn_type == DMU_OT_NONE && - zfs_refcount_is_zero(&dn->dn_holds) && - !DNODE_IS_DIRTY(dn)); + dn->dn_dirtycnt == 0 && + zfs_refcount_is_zero(&dn->dn_holds)); mutex_exit(&dn->dn_mtx); if (!can_free) @@ -1757,17 +1745,23 @@ dnode_hold(objset_t *os, uint64_t object, const void *tag, dnode_t **dnp) * reference on the dnode. Returns FALSE if unable to add a * new reference. */ +static boolean_t +dnode_add_ref_locked(dnode_t *dn, const void *tag) +{ + ASSERT(MUTEX_HELD(&dn->dn_mtx)); + if (zfs_refcount_is_zero(&dn->dn_holds)) + return (FALSE); + VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag)); + return (TRUE); +} + boolean_t dnode_add_ref(dnode_t *dn, const void *tag) { mutex_enter(&dn->dn_mtx); - if (zfs_refcount_is_zero(&dn->dn_holds)) { - mutex_exit(&dn->dn_mtx); - return (FALSE); - } - VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag)); + boolean_t r = dnode_add_ref_locked(dn, tag); mutex_exit(&dn->dn_mtx); - return (TRUE); + return (r); } void @@ -1830,31 +1824,20 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots) } /* - * Checks if the dnode itself is dirty, or is carrying any uncommitted records. - * It is important to check both conditions, as some operations (eg appending - * to a file) can dirty both as a single logical unit, but they are not synced - * out atomically, so checking one and not the other can result in an object - * appearing to be clean mid-way through a commit. + * Test if the dnode is dirty, or carrying uncommitted records. * - * Do not change this lightly! If you get it wrong, dmu_offset_next() can - * detect a hole where there is really data, leading to silent corruption. + * dn_dirtycnt is the number of txgs this dnode is dirty on. It's incremented + * in dnode_setdirty() the first time the dnode is dirtied on a txg, and + * decremented in either dnode_rele_task() or userquota_updates_task() when the + * txg is synced out. 
*/ boolean_t dnode_is_dirty(dnode_t *dn) { mutex_enter(&dn->dn_mtx); - - for (int i = 0; i < TXG_SIZE; i++) { - if (multilist_link_active(&dn->dn_dirty_link[i]) || - !list_is_empty(&dn->dn_dirty_records[i])) { - mutex_exit(&dn->dn_mtx); - return (B_TRUE); - } - } - + boolean_t dirty = (dn->dn_dirtycnt != 0); mutex_exit(&dn->dn_mtx); - - return (B_FALSE); + return (dirty); } void @@ -1916,7 +1899,11 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) * dnode will hang around after we finish processing its * children. */ - VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); + mutex_enter(&dn->dn_mtx); + VERIFY(dnode_add_ref_locked(dn, (void *)(uintptr_t)tx->tx_txg)); + dn->dn_dirtycnt++; + ASSERT3U(dn->dn_dirtycnt, <=, 3); + mutex_exit(&dn->dn_mtx); (void) dbuf_dirty(dn->dn_dbuf, tx); @@ -2221,32 +2208,6 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, mutex_exit(&dn->dn_dbufs_mtx); } -void -dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, const void *tag) -{ - /* - * Don't set dirtyctx to SYNC if we're just modifying this as we - * initialize the objset. - */ - if (dn->dn_dirtyctx == DN_UNDIRTIED) { - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - - if (ds != NULL) { - rrw_enter(&ds->ds_bp_rwlock, RW_READER, tag); - } - if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { - if (dmu_tx_is_syncing(tx)) - dn->dn_dirtyctx = DN_DIRTY_SYNC; - else - dn->dn_dirtyctx = DN_DIRTY_OPEN; - dn->dn_dirtyctx_firstset = tag; - } - if (ds != NULL) { - rrw_exit(&ds->ds_bp_rwlock, tag); - } - } -} - static void dnode_partial_zero(dnode_t *dn, uint64_t off, uint64_t blkoff, uint64_t len, dmu_tx_t *tx) @@ -2695,6 +2656,32 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, } /* + * Adjust *offset to the next (or previous) block byte offset at lvl. + * Returns FALSE if *offset would overflow or underflow. + */ +static boolean_t +dnode_next_block(dnode_t *dn, int flags, uint64_t *offset, int lvl) +{ + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + int span = lvl * epbs + dn->dn_datablkshift; + uint64_t blkid, maxblkid; + + if (span >= 8 * sizeof (uint64_t)) + return (B_FALSE); + + blkid = *offset >> span; + maxblkid = 1ULL << (8 * sizeof (*offset) - span); + if (!(flags & DNODE_FIND_BACKWARDS) && blkid + 1 < maxblkid) + *offset = (blkid + 1) << span; + else if ((flags & DNODE_FIND_BACKWARDS) && blkid > 0) + *offset = (blkid << span) - 1; + else + return (B_FALSE); + + return (B_TRUE); +} + +/* * Find the next hole, data, or sparse region at or after *offset. * The value 'blkfill' tells us how many items we expect to find * in an L0 data block; this value is 1 for normal objects, @@ -2721,7 +2708,7 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, int minlvl, uint64_t blkfill, uint64_t txg) { - uint64_t initial_offset = *offset; + uint64_t matched = *offset; int lvl, maxlvl; int error = 0; @@ -2745,16 +2732,36 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, maxlvl = dn->dn_phys->dn_nlevels; - for (lvl = minlvl; lvl <= maxlvl; lvl++) { + for (lvl = minlvl; lvl <= maxlvl; ) { error = dnode_next_offset_level(dn, flags, offset, lvl, blkfill, txg); - if (error != ESRCH) + if (error == 0 && lvl > minlvl) { + --lvl; + matched = *offset; + } else if (error == ESRCH && lvl < maxlvl && + dnode_next_block(dn, flags, &matched, lvl)) { + /* + * Continue search at next/prev offset in lvl+1 block. 
+ * + * Usually we only search upwards at the start of the + * search as higher level blocks point at a matching + * minlvl block in most cases, but we backtrack if not. + * + * This can happen for txg > 0 searches if the block + * contains only BPs/dnodes freed at that txg. It also + * happens if we are still syncing out the tree, and + * some BPs at higher levels are not updated yet. + * + * We must adjust offset to avoid coming back to the + * same offset and getting stuck looping forever. This + * also deals with the case where offset is already at + * the beginning or end of the object. + */ + ++lvl; + *offset = matched; + } else { break; - } - - while (error == 0 && --lvl >= minlvl) { - error = dnode_next_offset_level(dn, - flags, offset, lvl, blkfill, txg); + } } /* @@ -2766,9 +2773,6 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, error = 0; } - if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? - initial_offset < *offset : initial_offset > *offset)) - error = SET_ERROR(ESRCH); out: if (!(flags & DNODE_FIND_HAVELOCK)) rw_exit(&dn->dn_struct_rwlock); diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c index 7db72b9b04b0..fd46127b6068 100644 --- a/sys/contrib/openzfs/module/zfs/mmp.c +++ b/sys/contrib/openzfs/module/zfs/mmp.c @@ -446,7 +446,7 @@ mmp_write_uberblock(spa_t *spa) uint64_t offset; hrtime_t lock_acquire_time = gethrtime(); - spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER); + spa_config_enter_priority(spa, SCL_STATE, mmp_tag, RW_READER); lock_acquire_time = gethrtime() - lock_acquire_time; if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10)) zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns " diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c index ea2d2c7227c8..d73195f1a21f 100644 --- a/sys/contrib/openzfs/module/zfs/range_tree.c +++ b/sys/contrib/openzfs/module/zfs/range_tree.c @@ -585,7 +585,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size, * the size, since we do not support removing partial segments * of range trees with gaps. */ - zfs_zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) - + zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) - zfs_rs_get_start_raw(rs, rt)); zfs_range_tree_stat_incr(rt, &rs_tmp); diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c index cf28955b0c50..f615591e826b 100644 --- a/sys/contrib/openzfs/module/zfs/spa_config.c +++ b/sys/contrib/openzfs/module/zfs/spa_config.c @@ -372,6 +372,8 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata); + fnvlist_add_uint64(config, ZPOOL_CONFIG_MIN_ALLOC, spa->spa_min_alloc); + fnvlist_add_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, spa->spa_max_alloc); if (spa->spa_comment != NULL) fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, spa->spa_comment); diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index dceafbc27556..0bead6d49666 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -251,11 +251,11 @@ spa_mode_t spa_mode_global = SPA_MODE_UNINIT; #ifdef ZFS_DEBUG /* - * Everything except dprintf, set_error, spa, and indirect_remap is on - * by default in debug builds.
+ * Everything except dprintf, set_error, indirect_remap, and raidz_reconstruct + * is on by default in debug builds. */ int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR | - ZFS_DEBUG_INDIRECT_REMAP); + ZFS_DEBUG_INDIRECT_REMAP | ZFS_DEBUG_RAIDZ_RECONSTRUCT); #else int zfs_flags = 0; #endif @@ -510,7 +510,7 @@ spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw) static void spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, - int mmp_flag) + int priority_flag) { (void) tag; int wlocks_held = 0; @@ -526,7 +526,7 @@ spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, mutex_enter(&scl->scl_lock); if (rw == RW_READER) { while (scl->scl_writer || - (!mmp_flag && scl->scl_write_wanted)) { + (!priority_flag && scl->scl_write_wanted)) { cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { @@ -551,7 +551,7 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) } /* - * The spa_config_enter_mmp() allows the mmp thread to cut in front of + * The spa_config_enter_priority() allows the mmp thread to cut in front of * outstanding write lock requests. This is needed since the mmp updates are * time sensitive and failure to service them promptly will result in a * suspended pool. This pool suspension has been seen in practice when there is @@ -560,7 +560,7 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) */ void -spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw) +spa_config_enter_priority(spa_t *spa, int locks, const void *tag, krw_t rw) { spa_config_enter_impl(spa, locks, tag, rw, 1); } @@ -806,6 +806,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; spa->spa_min_alloc = INT_MAX; + spa->spa_max_alloc = 0; spa->spa_gcd_alloc = INT_MAX; /* Reset cached value */ @@ -1865,6 +1866,19 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) } /* + * Return the range of minimum allocation sizes for the normal allocation + * class. This can be used by external consumers of the DMU to estimate + * potential wasted capacity when setting the recordsize for an object. + * This is mainly for dRAID pools which always pad to a full stripe width. + */ +void +spa_get_min_alloc_range(spa_t *spa, uint64_t *min_alloc, uint64_t *max_alloc) +{ + *min_alloc = spa->spa_min_alloc; + *max_alloc = spa->spa_max_alloc; +} + +/* * Return the amount of slop space in bytes. It is typically 1/32 of the pool * (3.2%), minus the embedded log space. On very small pools, it may be * slightly larger than this. On very large pools, it will be capped to @@ -3085,6 +3099,7 @@ EXPORT_SYMBOL(spa_version); EXPORT_SYMBOL(spa_state); EXPORT_SYMBOL(spa_load_state); EXPORT_SYMBOL(spa_freeze_txg); +EXPORT_SYMBOL(spa_get_min_alloc_range); /* for Lustre */ EXPORT_SYMBOL(spa_get_dspace); EXPORT_SYMBOL(spa_update_dspace); EXPORT_SYMBOL(spa_deflate); diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index 9cf35e379000..c8d7280387a2 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -29,7 +29,7 @@ * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. - * Copyright (c) 2021, Klara Inc. + * Copyright (c) 2021, 2025, Klara, Inc. * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. 
*/ @@ -1086,6 +1086,10 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } } + if (top_level && (ops == &vdev_raidz_ops || ops == &vdev_draid_ops)) + vd->vdev_autosit = + vdev_prop_default_numeric(VDEV_PROP_AUTOSIT); + /* * Add ourselves to the parent's list of children. */ @@ -1187,6 +1191,9 @@ vdev_free(vdev_t *vd) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); + if (vd->vdev_prev_histo) + kmem_free(vd->vdev_prev_histo, + sizeof (uint64_t) * VDEV_L_HISTO_BUCKETS); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); @@ -1490,12 +1497,14 @@ vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) { if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; - if (spa->spa_gcd_alloc == INT_MAX) { + + if (min_alloc > spa->spa_max_alloc) + spa->spa_max_alloc = min_alloc; + + if (spa->spa_gcd_alloc == INT_MAX) spa->spa_gcd_alloc = min_alloc; - } else { - spa->spa_gcd_alloc = vdev_gcd(min_alloc, - spa->spa_gcd_alloc); - } + else + spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc); } void @@ -1553,8 +1562,7 @@ vdev_metaslab_group_create(vdev_t *vd) if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; - uint64_t min_alloc = vdev_get_min_alloc(vd); - vdev_spa_set_alloc(spa, min_alloc); + vdev_spa_set_alloc(spa, vdev_get_min_alloc(vd)); } } } @@ -3857,6 +3865,26 @@ vdev_load(vdev_t *vd) } } + if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { + spa_t *spa = vd->vdev_spa; + uint64_t autosit; + + error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + vdev_prop_to_name(VDEV_PROP_AUTOSIT), sizeof (autosit), + 1, &autosit); + if (error == 0) { + vd->vdev_autosit = autosit == 1; + } else if (error == ENOENT) { + vd->vdev_autosit = vdev_prop_default_numeric( + VDEV_PROP_AUTOSIT); + } else { + vdev_dbgmsg(vd, + "vdev_load: zap_lookup(top_zap=%llu) " + "failed [error=%d]", + (u_longlong_t)vd->vdev_top_zap, error); + } + } + /* * Load any rebuild state from the top-level vdev zap. 
*/ @@ -4616,6 +4644,8 @@ vdev_clear(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_dio_verify_errors = 0; vd->vdev_stat.vs_slow_ios = 0; + atomic_store_64(&vd->vdev_outlier_count, 0); + vd->vdev_read_sit_out_expire = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); @@ -6107,6 +6137,56 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) } vd->vdev_failfast = intval & 1; break; + case VDEV_PROP_SIT_OUT: + /* Only expose this for a draid or raidz leaf */ + if (!vd->vdev_ops->vdev_op_leaf || + vd->vdev_top == NULL || + (vd->vdev_top->vdev_ops != &vdev_raidz_ops && + vd->vdev_top->vdev_ops != &vdev_draid_ops)) { + error = ENOTSUP; + break; + } + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + if (intval == 1) { + vdev_t *ancestor = vd; + while (ancestor->vdev_parent != vd->vdev_top) + ancestor = ancestor->vdev_parent; + vdev_t *pvd = vd->vdev_top; + uint_t sitouts = 0; + for (int i = 0; i < pvd->vdev_children; i++) { + if (pvd->vdev_child[i] == ancestor) + continue; + if (vdev_sit_out_reads( + pvd->vdev_child[i], 0)) { + sitouts++; + } + } + if (sitouts >= vdev_get_nparity(pvd)) { + error = ZFS_ERR_TOO_MANY_SITOUTS; + break; + } + if (error == 0) + vdev_raidz_sit_child(vd, + INT64_MAX - gethrestime_sec()); + } else { + vdev_raidz_unsit_child(vd); + } + break; + case VDEV_PROP_AUTOSIT: + if (vd->vdev_ops != &vdev_raidz_ops && + vd->vdev_ops != &vdev_draid_ops) { + error = ENOTSUP; + break; + } + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_autosit = intval == 1; + break; case VDEV_PROP_CHECKSUM_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; @@ -6456,6 +6536,19 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) ZPROP_SRC_NONE); } continue; + case VDEV_PROP_SIT_OUT: + /* Only expose this for a draid or raidz leaf */ + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_top != NULL && + (vd->vdev_top->vdev_ops == + &vdev_raidz_ops || + vd->vdev_top->vdev_ops == + &vdev_draid_ops)) { + vdev_prop_add_list(outnvl, propname, + NULL, vdev_sit_out_reads(vd, 0), + ZPROP_SRC_NONE); + } + continue; case VDEV_PROP_TRIM_SUPPORT: /* only valid for leaf vdevs */ if (vd->vdev_ops->vdev_op_leaf) { @@ -6506,6 +6599,29 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) vdev_prop_add_list(outnvl, propname, strval, intval, src); break; + case VDEV_PROP_AUTOSIT: + /* Only raidz and draid vdevs can have this property */ + if (vd->vdev_ops != &vdev_raidz_ops && + vd->vdev_ops != &vdev_draid_ops) { + src = ZPROP_SRC_NONE; + intval = ZPROP_BOOLEAN_NA; + } else { + err = vdev_prop_get_int(vd, prop, + &intval); + if (err && err != ENOENT) + break; + + if (intval == + vdev_prop_default_numeric(prop)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + } + + vdev_prop_add_list(outnvl, propname, NULL, + intval, src); + break; + case VDEV_PROP_CHECKSUM_N: case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c index a05289102af2..8588cfee3f7d 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_draid.c +++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2018 Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + * Copyright (c) 2025, Klara, Inc.
*/ #include <sys/zfs_context.h> @@ -1996,6 +1997,33 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) rc->rc_allow_repair = 1; } } + + if (vdev_sit_out_reads(cvd, zio->io_flags)) { + rr->rr_outlier_cnt++; + ASSERT0(rc->rc_latency_outlier); + rc->rc_latency_outlier = 1; + } + } + + /* + * When the row contains a latency outlier and sufficient parity + * exists to reconstruct the column data, then skip reading the + * known slow child vdev as a performance optimization. + */ + if (rr->rr_outlier_cnt > 0 && + (rr->rr_firstdatacol - rr->rr_missingparity) >= + (rr->rr_missingdata + 1)) { + + for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error == 0 && rc->rc_latency_outlier) { + rr->rr_missingdata++; + rc->rc_error = SET_ERROR(EAGAIN); + rc->rc_skipped = 1; + break; + } + } } /* diff --git a/sys/contrib/openzfs/module/zfs/vdev_file.c b/sys/contrib/openzfs/module/zfs/vdev_file.c index f457669bc809..20b4db65ec06 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_file.c +++ b/sys/contrib/openzfs/module/zfs/vdev_file.c @@ -228,7 +228,8 @@ vdev_file_io_strategy(void *arg) abd_return_buf_copy(zio->io_abd, buf, size); } else { buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); - err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); + err = zfs_file_pwrite(vf->vf_file, buf, size, off, + vd->vdev_ashift, &resid); abd_return_buf(zio->io_abd, buf, size); } zio->io_error = err; diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c index c44f654b0261..0d4fdaa77ba0 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_label.c +++ b/sys/contrib/openzfs/module/zfs/vdev_label.c @@ -511,6 +511,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_MIN_ALLOC, + vdev_get_min_alloc(vd)); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); if (vd->vdev_noalloc) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c index b597d6daefde..56b8e3b60b22 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c @@ -24,6 +24,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved. + * Copyright (c) 2025, Klara, Inc. */ #include <sys/zfs_context.h> @@ -356,6 +357,32 @@ unsigned long raidz_expand_max_reflow_bytes = 0; uint_t raidz_expand_pause_point = 0; /* + * The duration, in seconds, that a slow drive will sit out reads. + */ +static unsigned long vdev_read_sit_out_secs = 600; + +/* + * How often each RAID-Z and dRAID vdev will check for slow disk outliers. + * Increasing this interval will reduce the sensitivity of detection (since all + * I/Os since the last check are included in the statistics), but will slow the + * response to a disk developing a problem. + * + * Defaults to once per second; setting extremely small values may cause + * negative performance effects. + */ +static hrtime_t vdev_raidz_outlier_check_interval_ms = 1000; + +/* + * When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is + * used to determine how far out an outlier must be before it counts as an event + * worth considering.
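+ * + * For example, with the default insensitivity of 50, a child's mean read + * latency over the last check interval must exceed Q3 + 50 * (Q3 - Q1), + * computed across its peers, before it counts as an outlier event.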
+ * + * Smaller values will result in more aggressive sitting out of disks that may + * have problems, but may significantly increase the rate of spurious sit-outs. + */ +static uint32_t vdev_raidz_outlier_insensitivity = 50; + +/* * Maximum amount of copy io's outstanding at once. */ #ifdef _ILP32 @@ -2311,6 +2338,41 @@ vdev_raidz_min_asize(vdev_t *vd) vd->vdev_children); } +/* + * Return B_TRUE if a read should be skipped due to being too slow. + * + * vdev_child_slow_outlier() looks for outliers based on disk + * latency from the most recent child reads. Here we're checking if, + * over time, a disk has been an outlier too many times and is + * now in a sit out period. + */ +boolean_t +vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags) +{ + if (vdev_read_sit_out_secs == 0) + return (B_FALSE); + + /* Avoid skipping a data column read when scrubbing */ + if (io_flags & ZIO_FLAG_SCRUB) + return (B_FALSE); + + if (!vd->vdev_ops->vdev_op_leaf) { + boolean_t sitting = B_FALSE; + for (int c = 0; c < vd->vdev_children; c++) { + sitting |= vdev_sit_out_reads(vd->vdev_child[c], + io_flags); + } + return (sitting); + } + + if (vd->vdev_read_sit_out_expire >= gethrestime_sec()) + return (B_TRUE); + + vd->vdev_read_sit_out_expire = 0; + + return (B_FALSE); +} + void vdev_raidz_child_done(zio_t *zio) { @@ -2475,6 +2537,45 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) rc->rc_skipped = 1; continue; } + + if (vdev_sit_out_reads(cvd, zio->io_flags)) { + rr->rr_outlier_cnt++; + ASSERT0(rc->rc_latency_outlier); + rc->rc_latency_outlier = 1; + } + } + + /* + * When the row contains a latency outlier and sufficient parity + * exists to reconstruct the column data, then skip reading the + * known slow child vdev as a performance optimization.
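+ * + * Note that (rr_firstdatacol - rr_missingparity) is the parity still + * available, so a column may only be sat out while that count still + * exceeds rr_missingdata.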
+ */ + if (rr->rr_outlier_cnt > 0 && + (rr->rr_firstdatacol - rr->rr_missingparity) >= + (rr->rr_missingdata + 1)) { + + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error == 0 && rc->rc_latency_outlier) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(EAGAIN); + rc->rc_skipped = 1; + break; + } + } + } + + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error || rc->rc_size == 0) + continue; + if (forceparity || c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { @@ -2498,6 +2599,7 @@ vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) ASSERT3U(prc->rc_devidx, ==, i); vdev_t *cvd = vd->vdev_child[i]; + if (!vdev_readable(cvd)) { prc->rc_error = SET_ERROR(ENXIO); prc->rc_tried = 1; /* don't even try */ @@ -2774,6 +2876,239 @@ vdev_raidz_worst_error(raidz_row_t *rr) return (error); } +/* + * Find the median value from a set of n values + */ +static uint64_t +latency_median_value(const uint64_t *data, size_t n) +{ + uint64_t m; + + if (n % 2 == 0) + m = (data[(n >> 1) - 1] + data[n >> 1]) >> 1; + else + m = data[((n + 1) >> 1) - 1]; + + return (m); +} + +/* + * Calculate the outlier fence from a set of n latency values + * + * fence = Q3 + vdev_raidz_outlier_insensitivity x (Q3 - Q1) + */ +static uint64_t +latency_quartiles_fence(const uint64_t *data, size_t n, uint64_t *iqr) +{ + uint64_t q1 = latency_median_value(&data[0], n >> 1); + uint64_t q3 = latency_median_value(&data[(n + 1) >> 1], n >> 1); + + /* + * To avoid detecting false positive outliers when N is small + * and the latency values are very close, make sure the IQR + * is at least 25% of Q1. + */ + *iqr = MAX(q3 - q1, q1 / 4); + + return (q3 + (*iqr * vdev_raidz_outlier_insensitivity)); +} +#define LAT_CHILDREN_MIN 5 +#define LAT_OUTLIER_LIMIT 20 + +static int +latency_compare(const void *arg1, const void *arg2) +{ + const uint64_t *l1 = (uint64_t *)arg1; + const uint64_t *l2 = (uint64_t *)arg2; + + return (TREE_CMP(*l1, *l2)); } + +void +vdev_raidz_sit_child(vdev_t *svd, uint64_t secs) +{ + for (int c = 0; c < svd->vdev_children; c++) + vdev_raidz_sit_child(svd->vdev_child[c], secs); + + if (!svd->vdev_ops->vdev_op_leaf) + return; + + /* Begin a sit out period for this slow drive */ + svd->vdev_read_sit_out_expire = gethrestime_sec() + + secs; + + /* Count each slow io period */ + mutex_enter(&svd->vdev_stat_lock); + svd->vdev_stat.vs_slow_ios++; + mutex_exit(&svd->vdev_stat_lock); +} + +void +vdev_raidz_unsit_child(vdev_t *vd) +{ + for (int c = 0; c < vd->vdev_children; c++) + vdev_raidz_unsit_child(vd->vdev_child[c]); + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + vd->vdev_read_sit_out_expire = 0; +} + +/* + * Check for any latency outlier from the latest set of child reads. + * + * Uses Tukey's fence, with K = 50, for detecting extreme outliers. This + * rule defines extreme outliers as data points outside the fence of the + * third quartile plus fifty times the Interquartile Range (IQR). This range + * is the distance between the first and third quartile. + * + * Fifty is an extremely large value for Tukey's fence, but the outliers we're + * attempting to detect here are orders of magnitude larger than the + * median. This large value should capture any truly faulty disk quickly, + * without causing spurious sit-outs.
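+ * + * For example, with Q1 = 2ms and Q3 = 4ms the IQR is 2ms, placing the + * fence at 4ms + 50 * 2ms = 104ms; only a mean child read latency above + * that is counted as an extreme outlier.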
+ * + * To further avoid spurious sit-outs, vdevs must be detected multiple times + * as an outlier before they are sat, and outlier counts will gradually decay. + * Every nchildren times we have detected an outlier, we subtract 2 from the + * outlier count of all children. If detected outliers are close to uniformly + * distributed, this will result in the outlier count remaining close to 0 + * (in expectation; over long enough time-scales, spurious sit-outs are still + * possible). + */ +static void +vdev_child_slow_outlier(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + if (!vd->vdev_autosit || vdev_read_sit_out_secs == 0 || + vd->vdev_children < LAT_CHILDREN_MIN) + return; + + hrtime_t now = getlrtime(); + uint64_t last = atomic_load_64(&vd->vdev_last_latency_check); + + if ((now - last) < MSEC2NSEC(vdev_raidz_outlier_check_interval_ms)) + return; + + /* Allow a single winner when there are racing callers. */ + if (atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last) + return; + + int children = vd->vdev_children; + uint64_t *lat_data = kmem_alloc(sizeof (uint64_t) * children, KM_SLEEP); + + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + if (cvd->vdev_prev_histo == NULL) { + mutex_enter(&cvd->vdev_stat_lock); + size_t size = + sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); + cvd->vdev_prev_histo = kmem_zalloc(size, KM_SLEEP); + memcpy(cvd->vdev_prev_histo, + cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ], + size); + mutex_exit(&cvd->vdev_stat_lock); + } + } + uint64_t max = 0; + vdev_t *svd = NULL; + uint_t sitouts = 0; + boolean_t skip = B_FALSE, svd_sitting = B_FALSE; + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + boolean_t sitting = vdev_sit_out_reads(cvd, 0) || + cvd->vdev_state != VDEV_STATE_HEALTHY; + + /* We can't sit out more disks than we have parity */ + if (sitting && ++sitouts >= vdev_get_nparity(vd)) + skip = B_TRUE; + + mutex_enter(&cvd->vdev_stat_lock); + + uint64_t *prev_histo = cvd->vdev_prev_histo; + uint64_t *histo = + cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ]; + if (skip) { + size_t size = + sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); + memcpy(prev_histo, histo, size); + mutex_exit(&cvd->vdev_stat_lock); + continue; + } + uint64_t count = 0; + lat_data[c] = 0; + for (int i = 0; i < VDEV_L_HISTO_BUCKETS; i++) { + uint64_t this_count = histo[i] - prev_histo[i]; + lat_data[c] += (1ULL << i) * this_count; + count += this_count; + } + size_t size = sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); + memcpy(prev_histo, histo, size); + mutex_exit(&cvd->vdev_stat_lock); + lat_data[c] /= MAX(1, count); + + /* Wait until all disks have been read from */ + if (lat_data[c] == 0 && !sitting) { + skip = B_TRUE; + continue; + } + + /* Keep track of the vdev with largest value */ + if (lat_data[c] > max) { + max = lat_data[c]; + svd = cvd; + svd_sitting = sitting; + } + } + + if (skip) { + kmem_free(lat_data, sizeof (uint64_t) * children); + return; + } + + qsort((void *)lat_data, children, sizeof (uint64_t), latency_compare); + + uint64_t iqr; + uint64_t fence = latency_quartiles_fence(lat_data, children, &iqr); + + ASSERT3U(lat_data[children - 1], ==, max); + if (max > fence && !svd_sitting) { + ASSERT3U(iqr, >, 0); + uint64_t incr = MAX(1, MIN((max - fence) / iqr, + LAT_OUTLIER_LIMIT / 4)); + vd->vdev_outlier_count += incr; + if (vd->vdev_outlier_count >= children) { + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + cvd->vdev_outlier_count -= 2; + cvd->vdev_outlier_count = MAX(0, + 
cvd->vdev_outlier_count); + } + vd->vdev_outlier_count = 0; + } + /* + * Keep track of how many times this child has had + * an outlier read. A disk that persistently has a + * higher outlier count than its peers will be + * considered a slow disk. + */ + svd->vdev_outlier_count += incr; + if (svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) { + ASSERT0(svd->vdev_read_sit_out_expire); + vdev_raidz_sit_child(svd, vdev_read_sit_out_secs); + (void) zfs_ereport_post(FM_EREPORT_ZFS_SITOUT, + zio->io_spa, svd, NULL, NULL, 0); + vdev_dbgmsg(svd, "begin read sit out for %d secs", + (int)vdev_read_sit_out_secs); + + for (int c = 0; c < vd->vdev_children; c++) + vd->vdev_child[c]->vdev_outlier_count = 0; + } + } + + kmem_free(lat_data, sizeof (uint64_t) * children); +} + static void vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) { @@ -3515,6 +3850,9 @@ vdev_raidz_io_done(zio_t *zio) raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_verified(zio, rr); } + /* Periodically check for a read outlier */ + if (zio->io_type == ZIO_TYPE_READ) + vdev_child_slow_outlier(zio); zio_checksum_verified(zio); } else { /* @@ -5155,3 +5493,10 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, "For expanded RAIDZ, automatically start a pool scrub when expansion " "completes"); +ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW, + "Raidz/draid slow disk sit out time period in seconds"); +ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_check_interval_ms, U64, + ZMOD_RW, "Interval to check for slow raidz/draid children"); +ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_insensitivity, UINT, + ZMOD_RW, "How insensitive the slow raidz/draid child check should be"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c index 2f7a739da241..abb71543e3ab 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_removal.c +++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c @@ -51,34 +51,70 @@ #include <sys/trace_zfs.h> /* - * This file contains the necessary logic to remove vdevs from a - * storage pool. Currently, the only devices that can be removed - * are log, cache, and spare devices; and top level vdevs from a pool - * w/o raidz or mirrors. (Note that members of a mirror can be removed - * by the detach operation.) + * This file contains the necessary logic to remove vdevs from a storage + * pool. Note that members of a mirror can be removed by the detach + * operation. Currently, the only devices that can be removed are: * - * Log vdevs are removed by evacuating them and then turning the vdev - * into a hole vdev while holding spa config locks. + * 1) Traditional hot spare and cache vdevs. Note that draid distributed + * spares are fixed at creation time and cannot be removed. * - * Top level vdevs are removed and converted into an indirect vdev via - * a multi-step process: + * 2) Log vdevs are removed by evacuating them and then turning the vdev + * into a hole vdev while holding spa config locks. * - * - Disable allocations from this device (spa_vdev_remove_top). + * 3) Top-level singleton and mirror vdevs, including dedup and special + * vdevs, are removed and converted into an indirect vdev via a + * multi-step process: * - * - From a new thread (spa_vdev_remove_thread), copy data from - * the removing vdev to a different vdev.
The copy happens in open - context (spa_vdev_copy_impl) and issues a sync task - (vdev_mapping_sync) so the sync thread can update the partial - indirect mappings in core and on disk. + * - Disable allocations from this device (spa_vdev_remove_top). * - * - If a free happens during a removal, it is freed from the - * removing vdev, and if it has already been copied, from the new - * location as well (free_from_removing_vdev). + * - From a new thread (spa_vdev_remove_thread), copy data from the + * removing vdev to a different vdev. The copy happens in open context + * (spa_vdev_copy_impl) and issues a sync task (vdev_mapping_sync) so + * the sync thread can update the partial indirect mappings in core + * and on disk. * - * - After the removal is completed, the copy thread converts the vdev - * into an indirect vdev (vdev_remove_complete) before instructing - * the sync thread to destroy the space maps and finish the removal - * (spa_finish_removal). + * - If a free happens during a removal, it is freed from the removing + * vdev, and if it has already been copied, from the new location as + * well (free_from_removing_vdev). + * + * - After the removal is completed, the copy thread converts the vdev + * into an indirect vdev (vdev_remove_complete) before instructing + * the sync thread to destroy the space maps and finish the removal + * (spa_finish_removal). + * + * The following constraints currently apply to primary device removal: + * + * - All vdevs must be online, healthy, and not be missing any data + * according to the DTLs. + * + * - When removing a singleton or mirror vdev, regardless of whether it is + * a special, dedup, or primary device, it must have the same ashift + * as the devices in the normal allocation class. Furthermore, all + * vdevs in the normal allocation class must have the same ashift to + * ensure the new allocations never include additional padding. + * + * - The normal allocation class cannot contain any raidz or draid + * top-level vdevs since segments are copied without regard for block + * boundaries. This makes it impossible to calculate the required + * parity columns when using these vdev types as the destination. + * + * - The encryption keys must be loaded so the ZIL logs can be reset + * in order to prevent writing to the device being removed. + * + * N.B. ashift and raidz/draid constraints for primary top-level device + * removal could be slightly relaxed if it were possible to request that + * DVAs from a mirror or singleton in the specified allocation class be + * used (metaslab_alloc_dva). + * + * This flexibility would be particularly useful for raidz/draid pools which + * often include a mirrored special device. If a top-level singleton were + * mistakenly added, it could then still be removed at the cost of some + * special device capacity. This may be a worthwhile tradeoff depending on + * the pool capacity and expense (cost, complexity, time) of creating a new + * pool and copying all of the data to correct the configuration. + * + * Furthermore, while not currently supported, it should be possible to allow + * vdevs of any type to be removed as long as they've never been written to.
*/ typedef struct vdev_copy_arg { diff --git a/sys/contrib/openzfs/module/zfs/zfeature.c b/sys/contrib/openzfs/module/zfs/zfeature.c index 0816ea134bf3..4cf9e0dbb405 100644 --- a/sys/contrib/openzfs/module/zfs/zfeature.c +++ b/sys/contrib/openzfs/module/zfs/zfeature.c @@ -308,6 +308,7 @@ feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount, ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature)); uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + ASSERT(MUTEX_HELD(&spa->spa_feat_stats_lock)); VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid, sizeof (uint64_t), 1, &refcount, tx)); @@ -360,7 +361,9 @@ feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) feature->fi_guid, 1, strlen(feature->fi_desc) + 1, feature->fi_desc, tx)); + mutex_enter(&spa->spa_feat_stats_lock); feature_sync(spa, feature, initial_refcount, tx); + mutex_exit(&spa->spa_feat_stats_lock); if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) { uint64_t enabling_txg = dmu_tx_get_txg(tx); @@ -416,6 +419,7 @@ feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, ASSERT(dmu_tx_is_syncing(tx)); ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + mutex_enter(&spa->spa_feat_stats_lock); VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP); switch (action) { @@ -433,6 +437,7 @@ feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, } feature_sync(spa, feature, refcount, tx); + mutex_exit(&spa->spa_feat_stats_lock); } void diff --git a/sys/contrib/openzfs/module/zfs/zfs_crrd.c b/sys/contrib/openzfs/module/zfs/zfs_crrd.c index f9267ed41d71..30d4c7c36897 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_crrd.c +++ b/sys/contrib/openzfs/module/zfs/zfs_crrd.c @@ -162,9 +162,9 @@ dbrrd_add(dbrrd_t *db, hrtime_t time, uint64_t txg) daydiff = time - rrd_tail(&db->dbr_days); monthdiff = time - rrd_tail(&db->dbr_months); - if (monthdiff >= 0 && monthdiff >= SEC2NSEC(30 * 24 * 60 * 60)) + if (monthdiff >= 0 && monthdiff >= 30 * 24 * 60 * 60) rrd_add(&db->dbr_months, time, txg); - else if (daydiff >= 0 && daydiff >= SEC2NSEC(24 * 60 * 60)) + else if (daydiff >= 0 && daydiff >= 24 * 60 * 60) rrd_add(&db->dbr_days, time, txg); else if (minutedif >= 0) rrd_add(&db->dbr_minutes, time, txg); @@ -208,7 +208,8 @@ dbrrd_closest(hrtime_t tv, const rrd_data_t *r1, const rrd_data_t *r2) if (r2 == NULL) return (r1); - return (ABS(tv - r1->rrdd_time) < ABS(tv - r2->rrdd_time) ? r1 : r2); + return (ABS(tv - (hrtime_t)r1->rrdd_time) < + ABS(tv - (hrtime_t)r2->rrdd_time) ? 
r1 : r2); } uint64_t diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index 121b966b9864..5ca7c2320c4e 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -683,6 +683,7 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) dsl_dataset_t *ds; const char *cp; int error; + boolean_t rawok = (zc->zc_flags & 0x8); /* * Generate the current snapshot name from the given objsetid, then @@ -705,6 +706,10 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); + if (error != 0 && rawok == B_TRUE) { + error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, + ZFS_DELEG_PERM_SEND_RAW, cr); + } dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); @@ -714,9 +719,17 @@ static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + boolean_t rawok = nvlist_exists(innvl, "rawok"); + int error; + (void) innvl; - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_SEND, cr)); + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_SEND, cr); + if (error != 0 && rawok == B_TRUE) { + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_SEND_RAW, cr); + } + return (error); } static int @@ -4726,7 +4739,7 @@ zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) error = error ? error : resume_err; } zfs_vfs_rele(zfsvfs); - } else if ((zv = zvol_suspend(fsname)) != NULL) { + } else if (zvol_suspend(fsname, &zv) == 0) { error = dsl_dataset_rollback(fsname, target, zvol_tag(zv), outnvl); zvol_resume(zv); @@ -5448,7 +5461,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, const char *origin, } error = error ? error : end_err; zfs_vfs_rele(zfsvfs); - } else if ((zv = zvol_suspend(tofs)) != NULL) { + } else if (zvol_suspend(tofs, &zv) == 0) { error = dmu_recv_end(&drc, zvol_tag(zv)); zvol_resume(zv); } else { @@ -7619,7 +7632,7 @@ zfs_ioctl_init(void) zfs_ioctl_register("scrub", ZFS_IOC_POOL_SCRUB, zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, - POOL_CHECK_NONE, B_TRUE, B_TRUE, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub)); zfs_ioctl_register("get_props", ZFS_IOC_POOL_GET_PROPS, diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index 4cf8912d4269..aeea58bedfe4 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -4574,8 +4574,29 @@ zio_vdev_io_start(zio_t *zio) ASSERT0(zio->io_child_error[ZIO_CHILD_VDEV]); if (vd == NULL) { - if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) - spa_config_enter(spa, SCL_ZIO, zio, RW_READER); + if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) { + /* + * A deadlock workaround. The ddt_prune_unique_entries() + * -> prune_candidates_sync() code path takes the + * SCL_ZIO reader lock and may request it again here. + * If there is another thread that wants the SCL_ZIO + * writer lock, then scl_write_wanted will be set. + * Thus, the spa_config_enter_priority() is used to + * ignore pending writer requests. + * + * The locking should be revised to remove the need + * for this workaround. If that's not workable then + * it should only be applied to the zios involved in + * the pruning process. This impacts the read/write + * I/O balance while pruning.
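+ * + * In short, without the priority enter, a pruning thread + * already holding the SCL_ZIO reader lock would block here + * behind a queued writer, while that writer in turn waits + * for the reader to drop the lock.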
+ */ + if (spa->spa_active_ddt_prune) + spa_config_enter_priority(spa, SCL_ZIO, zio, + RW_READER); + else + spa_config_enter(spa, SCL_ZIO, zio, + RW_READER); + } /* * The mirror_ops handle multiple DVAs in a single BP. @@ -5305,6 +5326,16 @@ zio_ready(zio_t *zio) return (NULL); } + if (zio_injection_enabled) { + hrtime_t target = zio_handle_ready_delay(zio); + if (target != 0 && zio->io_target_timestamp == 0) { + zio->io_stage >>= 1; + zio->io_target_timestamp = target; + zio_delay_interrupt(zio); + return (NULL); + } + } + if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(BP_GET_BIRTH(bp) == zio->io_txg || diff --git a/sys/contrib/openzfs/module/zfs/zio_inject.c b/sys/contrib/openzfs/module/zfs/zio_inject.c index 981a1be4847c..287577018ed1 100644 --- a/sys/contrib/openzfs/module/zfs/zio_inject.c +++ b/sys/contrib/openzfs/module/zfs/zio_inject.c @@ -827,6 +827,44 @@ zio_handle_export_delay(spa_t *spa, hrtime_t elapsed) zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT); } +/* + * For testing, inject a delay before ready state. + */ +hrtime_t +zio_handle_ready_delay(zio_t *zio) +{ + inject_handler_t *handler; + hrtime_t now = gethrtime(); + hrtime_t target = 0; + + /* + * Ignore I/O not associated with any logical data. + */ + if (zio->io_logical == NULL) + return (0); + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + if (zio->io_spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_DELAY_READY) + continue; + + /* If this handler matches, inject the delay */ + if (zio_match_iotype(zio, handler->zi_record.zi_iotype) && + zio_match_handler(&zio->io_logical->io_bookmark, + zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, + zio_match_dva(zio), &handler->zi_record, zio->io_error)) { + target = now + (hrtime_t)handler->zi_record.zi_timer; + break; + } + } + + rw_exit(&inject_lock); + return (target); +} + static int zio_calculate_range(const char *pool, zinject_record_t *record) { diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index 2fd3e1c37045..00f98168d3d8 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -410,7 +410,7 @@ zvol_set_volthreading(const char *name, boolean_t value) { zvol_state_t *zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL) - return (SET_ERROR(ENOENT)); + return (-1); zv->zv_threading = value; mutex_exit(&zv->zv_state_lock); return (0); @@ -1145,20 +1145,34 @@ zvol_tag(zvol_state_t *zv) /* * Suspend the zvol for recv and rollback. */ -zvol_state_t * -zvol_suspend(const char *name) +int +zvol_suspend(const char *name, zvol_state_t **zvp) { zvol_state_t *zv; zv = zvol_find_by_name(name, RW_WRITER); if (zv == NULL) - return (NULL); + return (SET_ERROR(ENOENT)); /* block all I/O, release in zvol_resume. */ ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); + /* + * If it's being removed, unlock and return error. It doesn't make any + * sense to try to suspend a zvol being removed, but being here also + * means that zvol_remove_minors_impl() is about to call zvol_remove() + * and then destroy the zvol_state_t, so returning a pointer to it for + * the caller to mess with would be a disaster anyway. 
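+ * + * Callers depend on this contract: only a 0 return means the zvol is + * suspended, *zvp is valid, and zv_suspend_lock is held until the + * matching zvol_resume() call.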
+ */ + if (zv->zv_flags & ZVOL_REMOVING) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + /* NB: Returning EIO here to match zfsvfs_teardown() */ + return (SET_ERROR(EIO)); + } + atomic_inc(&zv->zv_suspend_ref); if (zv->zv_open_count > 0) @@ -1171,7 +1185,8 @@ zvol_suspend(const char *name) mutex_exit(&zv->zv_state_lock); /* zv_suspend_lock is released in zvol_resume() */ - return (zv); + *zvp = zv; + return (0); } int diff --git a/sys/contrib/openzfs/module/zstd/zfs_zstd.c b/sys/contrib/openzfs/module/zstd/zfs_zstd.c index 3db196953f74..c403c001086a 100644 --- a/sys/contrib/openzfs/module/zstd/zfs_zstd.c +++ b/sys/contrib/openzfs/module/zstd/zfs_zstd.c @@ -441,64 +441,6 @@ zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level) } #ifndef IN_LIBSA -static size_t -zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len, - int level) -{ - int16_t zstd_level; - if (zstd_enum_to_level(level, &zstd_level)) { - ZSTDSTAT_BUMP(zstd_stat_com_inval); - return (s_len); - } - /* - * A zstd early abort heuristic. - * - * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently - * 128k), don't try any of this, just go. - * (because experimentally that was a reasonable cutoff for a perf win - * with tiny ratio change) - * - First, we try LZ4 compression, and if it doesn't early abort, we - * jump directly to whatever compression level we intended to try. - * - Second, we try zstd-1 - if that errors out (usually, but not - * exclusively, if it would overflow), we give up early. - * - * If it works, instead we go on and compress anyway. - * - * Why two passes? LZ4 alone gets you a lot of the way, but on highly - * compressible data, it was losing up to 8.5% of the compressed - * savings versus no early abort, and all the zstd-fast levels are - * worse indications on their own than LZ4, and don't improve the LZ4 - * pass noticably if stacked like this. - */ - size_t actual_abort_size = zstd_abort_size; - if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level && - s_len >= actual_abort_size) { - int pass_len = 1; - pass_len = zfs_lz4_compress(s_start, d_start, s_len, d_len, 0); - if (pass_len < d_len) { - ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed); - goto keep_trying; - } - ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected); - - pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len, - ZIO_ZSTD_LEVEL_1); - if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) { - ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected); - return (s_len); - } - ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed); - } else { - ZSTDSTAT_BUMP(zstd_stat_passignored); - if (s_len < actual_abort_size) { - ZSTDSTAT_BUMP(zstd_stat_passignored_size); - } - } -keep_trying: - return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level)); - -} - /* Compress block using zstd */ static size_t zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len, |
