diff options
Diffstat (limited to 'sys/contrib/openzfs/module')
29 files changed, 759 insertions, 190 deletions
diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c index d0fcca798fa9..ad707341eec7 100644 --- a/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c +++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c @@ -77,7 +77,8 @@ static const uint32_t SHA256_K[64] = { h = g, g = f, f = e, e = d + T1; \ d = c, c = b, b = a, a = T1 + T2; -static void sha256_generic(uint32_t state[8], const void *data, size_t num_blks) +static void +icp_sha256_generic(uint32_t state[8], const void *data, size_t num_blks) { uint64_t blk; @@ -173,7 +174,8 @@ static const uint64_t SHA512_K[80] = { 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 }; -static void sha512_generic(uint64_t state[8], const void *data, size_t num_blks) +static void +icp_sha512_generic(uint64_t state[8], const void *data, size_t num_blks) { uint64_t blk; @@ -226,7 +228,8 @@ static void sha512_generic(uint64_t state[8], const void *data, size_t num_blks) } } -static void sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len) +static void +icp_sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len) { uint64_t pos = ctx->count[0]; uint64_t total = ctx->count[1]; @@ -258,7 +261,8 @@ static void sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len) ctx->count[1] = total; } -static void sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len) +static void +icp_sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len) { uint64_t pos = ctx->count[0]; uint64_t total = ctx->count[1]; @@ -290,7 +294,8 @@ static void sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len) ctx->count[1] = total; } -static void sha256_final(sha256_ctx *ctx, uint8_t *result, int bits) +static void +icp_sha256_final(sha256_ctx *ctx, uint8_t *result, int bits) { uint64_t mlen, pos = ctx->count[0]; uint8_t *m = ctx->wbuf; @@ -334,7 +339,8 @@ static void sha256_final(sha256_ctx *ctx, uint8_t *result, int bits) memset(ctx, 0, sizeof (*ctx)); } -static void sha512_final(sha512_ctx *ctx, uint8_t *result, int bits) +static void +icp_sha512_final(sha512_ctx *ctx, uint8_t *result, int bits) { uint64_t mlen, pos = ctx->count[0]; uint8_t *m = ctx->wbuf, *r; @@ -461,14 +467,14 @@ SHA2Update(SHA2_CTX *ctx, const void *data, size_t len) switch (ctx->algotype) { case SHA256: - sha256_update(&ctx->sha256, data, len); + icp_sha256_update(&ctx->sha256, data, len); break; case SHA512: case SHA512_HMAC_MECH_INFO_TYPE: - sha512_update(&ctx->sha512, data, len); + icp_sha512_update(&ctx->sha512, data, len); break; case SHA512_256: - sha512_update(&ctx->sha512, data, len); + icp_sha512_update(&ctx->sha512, data, len); break; } } @@ -479,32 +485,33 @@ SHA2Final(void *digest, SHA2_CTX *ctx) { switch (ctx->algotype) { case SHA256: - sha256_final(&ctx->sha256, digest, 256); + icp_sha256_final(&ctx->sha256, digest, 256); break; case SHA512: case SHA512_HMAC_MECH_INFO_TYPE: - sha512_final(&ctx->sha512, digest, 512); + icp_sha512_final(&ctx->sha512, digest, 512); break; case SHA512_256: - sha512_final(&ctx->sha512, digest, 256); + icp_sha512_final(&ctx->sha512, digest, 256); break; } } /* the generic implementation is always okay */ -static boolean_t sha2_is_supported(void) +static boolean_t +icp_sha2_is_supported(void) { return (B_TRUE); } const sha256_ops_t sha256_generic_impl = { .name = "generic", - .transform = sha256_generic, - .is_supported = sha2_is_supported + .transform = icp_sha256_generic, + .is_supported = icp_sha2_is_supported }; const sha512_ops_t sha512_generic_impl = { .name = "generic", - .transform = sha512_generic, - .is_supported = sha2_is_supported + .transform = icp_sha512_generic, + .is_supported = icp_sha2_is_supported }; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c index 393bfaa65ff5..ebc2c0eeb6d2 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c @@ -163,6 +163,13 @@ param_set_arc_int(SYSCTL_HANDLER_ARGS) return (0); } +static void +warn_deprecated_sysctl(const char *old, const char *new) +{ + printf("WARNING: sysctl vfs.zfs.%s is deprecated. Use vfs.zfs.%s instead.\n", + old, new); +} + int param_set_arc_max(SYSCTL_HANDLER_ARGS) { @@ -185,9 +192,17 @@ param_set_arc_max(SYSCTL_HANDLER_ARGS) if (val != 0) zfs_arc_max = arc_c_max; + if (arg2 != 0) + warn_deprecated_sysctl("arc_max", "arc.max"); + return (0); } +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, + CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + NULL, 1, param_set_arc_max, "LU", + "Maximum ARC size in bytes (LEGACY)"); + int param_set_arc_min(SYSCTL_HANDLER_ARGS) { @@ -209,9 +224,17 @@ param_set_arc_min(SYSCTL_HANDLER_ARGS) if (val != 0) zfs_arc_min = arc_c_min; + if (arg2 != 0) + warn_deprecated_sysctl("arc_min", "arc.min"); + return (0); } +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, + CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + NULL, 1, param_set_arc_min, "LU", + "Minimum ARC size in bytes (LEGACY)"); + extern uint_t zfs_arc_free_target; int @@ -232,9 +255,22 @@ param_set_arc_free_target(SYSCTL_HANDLER_ARGS) zfs_arc_free_target = val; + if (arg2 != 0) + warn_deprecated_sysctl("arc_free_target", "arc.free_target"); + return (0); } +/* + * NOTE: This sysctl is CTLFLAG_RW not CTLFLAG_RWTUN due to its dependency on + * pagedaemon initialization. + */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, + NULL, 1, param_set_arc_free_target, "IU", + "Desired number of free pages below which ARC triggers reclaim" + " (LEGACY)"); + int param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) { @@ -250,9 +286,193 @@ param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) arc_no_grow_shift = val; + if (arg2 != 0) + warn_deprecated_sysctl("arc_no_grow_shift", "arc.no_grow_shift"); + return (0); } +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, + CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + NULL, 1, param_set_arc_no_grow_shift, "I", + "log2(fraction of ARC which must be free to allow growing) (LEGACY)"); + +extern uint64_t l2arc_write_max; + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, + CTLFLAG_RWTUN, &l2arc_write_max, 0, + "Max write bytes per interval (LEGACY)"); + +extern uint64_t l2arc_write_boost; + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, + CTLFLAG_RWTUN, &l2arc_write_boost, 0, + "Extra write bytes during device warmup (LEGACY)"); + +extern uint64_t l2arc_headroom; + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, + CTLFLAG_RWTUN, &l2arc_headroom, 0, + "Number of max device writes to precache (LEGACY)"); + +extern uint64_t l2arc_headroom_boost; + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom_boost, + CTLFLAG_RWTUN, &l2arc_headroom_boost, 0, + "Compressed l2arc_headroom multiplier (LEGACY)"); + +extern uint64_t l2arc_feed_secs; + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, + CTLFLAG_RWTUN, &l2arc_feed_secs, 0, + "Seconds between L2ARC writing (LEGACY)"); + +extern uint64_t l2arc_feed_min_ms; + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, + CTLFLAG_RWTUN, &l2arc_feed_min_ms, 0, + "Min feed interval in milliseconds (LEGACY)"); + +extern int l2arc_noprefetch; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, + CTLFLAG_RWTUN, &l2arc_noprefetch, 0, + "Skip caching prefetched buffers (LEGACY)"); + +extern int l2arc_feed_again; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, + CTLFLAG_RWTUN, &l2arc_feed_again, 0, + "Turbo L2ARC warmup (LEGACY)"); + +extern int l2arc_norw; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, + CTLFLAG_RWTUN, &l2arc_norw, 0, + "No reads during writes (LEGACY)"); + +static int +param_get_arc_state_size(SYSCTL_HANDLER_ARGS) +{ + arc_state_t *state = (arc_state_t *)arg1; + int64_t val; + + val = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]) + + zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); + return (sysctl_handle_64(oidp, &val, 0, req)); +} + +extern arc_state_t ARC_anon; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, anon_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_anon, 0, param_get_arc_state_size, "Q", + "size of anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of evictable metadata in anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of evictable data in anonymous state"); + +extern arc_state_t ARC_mru; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mru, 0, param_get_arc_state_size, "Q", + "size of mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of evictable metadata in mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of evictable data in mru state"); + +extern arc_state_t ARC_mru_ghost; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_ghost_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mru_ghost, 0, param_get_arc_state_size, "Q", + "size of mru ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of evictable metadata in mru ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of evictable data in mru ghost state"); + +extern arc_state_t ARC_mfu; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mfu, 0, param_get_arc_state_size, "Q", + "size of mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of evictable metadata in mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of evictable data in mfu state"); + +extern arc_state_t ARC_mfu_ghost; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_ghost_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mfu_ghost, 0, param_get_arc_state_size, "Q", + "size of mfu ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of evictable metadata in mfu ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of evictable data in mfu ghost state"); + +extern arc_state_t ARC_uncached; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, uncached_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_uncached, 0, param_get_arc_state_size, "Q", + "size of uncached state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD, + &ARC_uncached.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of evictable metadata in uncached state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD, + &ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of evictable data in uncached state"); + +extern arc_state_t ARC_l2c_only; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, l2c_only_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_l2c_only, 0, param_get_arc_state_size, "Q", + "size of l2c_only state"); + +/* dbuf.c */ + +/* dmu.c */ + +/* dmu_zfetch.c */ + +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)"); + +extern uint32_t zfetch_max_distance; + +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, + CTLFLAG_RWTUN, &zfetch_max_distance, 0, + "Max bytes to prefetch per stream (LEGACY)"); + +extern uint32_t zfetch_max_idistance; + +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, + CTLFLAG_RWTUN, &zfetch_max_idistance, 0, + "Max bytes to prefetch indirects for per stream (LEGACY)"); + +/* dsl_pool.c */ + +/* dnode.c */ + +/* dsl_scan.c */ + /* metaslab.c */ int @@ -313,6 +533,19 @@ SYSCTL_UINT(_vfs_zfs, OID_AUTO, condense_pct, "Condense on-disk spacemap when it is more than this many percents" " of in-memory counterpart"); +extern uint_t zfs_remove_max_segment; + +SYSCTL_UINT(_vfs_zfs, OID_AUTO, remove_max_segment, + CTLFLAG_RWTUN, &zfs_remove_max_segment, 0, + "Largest contiguous segment ZFS will attempt to allocate when removing" + " a device"); + +extern int zfs_removal_suspend_progress; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, + CTLFLAG_RWTUN, &zfs_removal_suspend_progress, 0, + "Ensures certain actions can happen while in the middle of a removal"); + /* * Minimum size which forces the dynamic allocator to change * it's allocation strategy. Once the space map cannot satisfy @@ -532,9 +765,18 @@ param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS) zfs_vdev_min_auto_ashift = val; + if (arg2 != 0) + warn_deprecated_sysctl("min_auto_ashift", + "vdev.min_auto_ashift"); + return (0); } +SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1, + param_set_min_auto_ashift, "IU", + "Min ashift used when creating new top-level vdev. (LEGACY)"); + int param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) { @@ -551,9 +793,19 @@ param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) zfs_vdev_max_auto_ashift = val; + if (arg2 != 0) + warn_deprecated_sysctl("max_auto_ashift", + "vdev.max_auto_ashift"); + return (0); } +SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1, + param_set_max_auto_ashift, "IU", + "Max ashift used when optimizing for logical -> physical sector size on" + " new top-level vdevs. (LEGACY)"); + /* * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. @@ -575,6 +827,23 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, &zfs_vdev_standard_sm_blksz, 0, "Block size for standard space map. Power of 2 greater than 4096."); +extern int vdev_validate_skip; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, + CTLFLAG_RDTUN, &vdev_validate_skip, 0, + "Enable to bypass vdev_validate()."); + +/* vdev_mirror.c */ + +/* vdev_queue.c */ + +extern uint_t zfs_vdev_max_active; + +SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, + CTLFLAG_RWTUN, &zfs_vdev_max_active, 0, + "The maximum number of I/Os of all types active for each device." + " (LEGACY)"); + /* zio.c */ SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c index 4de48e013ec4..d0a9c662e6f0 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c @@ -762,8 +762,7 @@ zfsctl_common_pathconf(struct vop_pathconf_args *ap) return (0); case _PC_MIN_HOLE_SIZE: - *ap->a_retval = (int)SPA_MINBLOCKSIZE; - return (0); + return (EINVAL); case _PC_ACL_EXTENDED: *ap->a_retval = 0; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c index 411225786089..f34a2fd37a77 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c @@ -4116,6 +4116,7 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, { znode_t *zp; zfsvfs_t *zfsvfs; + uint_t blksize, iosize; int error; switch (cmd) { @@ -4127,8 +4128,20 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, *valp = 64; return (0); case _PC_MIN_HOLE_SIZE: - *valp = (int)SPA_MINBLOCKSIZE; - return (0); + iosize = vp->v_mount->mnt_stat.f_iosize; + if (vp->v_type == VREG) { + zp = VTOZ(vp); + blksize = zp->z_blksz; + if (zp->z_size <= blksize) + blksize = MAX(blksize, iosize); + *valp = (int)blksize; + return (0); + } + if (vp->v_type == VDIR) { + *valp = (int)iosize; + return (0); + } + return (EINVAL); case _PC_ACL_EXTENDED: #if 0 /* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */ zp = VTOZ(vp); @@ -4210,8 +4223,20 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, zfs_vmobject_wlock(object); (void) vm_page_grab_pages(object, OFF_TO_IDX(start), - VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_ZERO, + VM_ALLOC_NORMAL | VM_ALLOC_WAITOK, ma, count); + if (!vm_page_all_valid(ma[count - 1])) { + /* + * Later in this function, we copy DMU data to + * invalid pages only. The last page may not be + * entirely filled though, if the file does not + * end on a page boundary. Therefore, we zero + * that last page here to make sure it does not + * contain garbage after the end of file. + */ + ASSERT(vm_page_none_valid(ma[count - 1])); + vm_page_zero_invalid(ma[count - 1], FALSE); + } zfs_vmobject_wunlock(object); } if (blksz == zp->z_blksz) diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c index 91cf38016e00..8562c42b3220 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c @@ -437,6 +437,7 @@ zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + memset(&cuio_s, 0, sizeof (cuio_s)); zfs_uio_init(&cuio, &cuio_s); keydata_len = zio_crypt_table[crypt].ci_keylen; @@ -519,6 +520,7 @@ zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, keydata_len = zio_crypt_table[crypt].ci_keylen; rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + memset(&cuio_s, 0, sizeof (cuio_s)); zfs_uio_init(&cuio, &cuio_s); /* diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c index 0dd2ecd7fd8d..3ddbfcb97184 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c @@ -183,6 +183,7 @@ static struct filterops zvol_filterops_vnode = { .f_isfd = 1, .f_detach = zvol_filter_detach, .f_event = zvol_filter_vnode, + .f_copy = knote_triv_copy, }; extern uint_t zfs_geom_probe_vdev_key; diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c index 45c2999a4bb1..b2eae5d00b10 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c @@ -25,6 +25,10 @@ * SUCH DAMAGE. */ +/* + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> + */ + #include <sys/types.h> #include <sys/sysmacros.h> #include <sys/kmem.h> @@ -56,6 +60,19 @@ typedef struct zone_dataset { } zone_dataset_t; #ifdef CONFIG_USER_NS + +/* + * Linux 6.18 moved the generic namespace type away from ns->ops->type onto + * ns_common itself. + */ +#ifdef HAVE_NS_COMMON_TYPE +#define ns_is_newuser(ns) \ + ((ns)->ns_type == CLONE_NEWUSER) +#else +#define ns_is_newuser(ns) \ + ((ns)->ops != NULL && (ns)->ops->type == CLONE_NEWUSER) +#endif + /* * Returns: * - 0 on success @@ -84,7 +101,7 @@ user_ns_get(int fd, struct user_namespace **userns) goto done; } ns = get_proc_ns(file_inode(nsfile)); - if (ns->ops->type != CLONE_NEWUSER) { + if (!ns_is_newuser(ns)) { error = ENOTTY; goto done; } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c index 8a8316f63c48..18f2426fbbfc 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c @@ -23,6 +23,7 @@ * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved. * Copyright (c) 2023, 2024, Klara Inc. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> */ /* @@ -1109,6 +1110,14 @@ abd_return_buf_copy(abd_t *abd, void *buf, size_t n) #define ABD_ITER_PAGE_SIZE(page) (PAGESIZE) #endif +#ifndef nth_page +/* + * Since 6.18 nth_page() no longer exists, and is no longer required to iterate + * within a single SG entry, so we replace it with a simple addition. + */ +#define nth_page(p, n) ((p)+(n)) +#endif + void abd_iter_page(struct abd_iter *aiter) { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c index daa4b5776837..934d74a112fd 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c @@ -2524,7 +2524,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, * Also note: DOS R/O is ignored for directories. */ if ((v4_mode & WRITE_MASK_DATA) && - S_ISDIR(ZTOI(zp)->i_mode) && + !S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_READONLY)) { return (SET_ERROR(EPERM)); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index 6106726651a3..e845ad69ad78 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -2033,10 +2033,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) goto out3; } - if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { - err = SET_ERROR(EPERM); - goto out3; - } + /* ZFS_READONLY will be handled in zfs_zaccess() */ /* * Verify timestamps doesn't overflow 32 bits. diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index d07317b0d910..02965ac8cbee 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -23,6 +23,7 @@ * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright (c) 2025, Klara, Inc. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> */ @@ -478,6 +479,7 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) return (ret); } +#ifdef HAVE_WRITE_CACHE_PAGES #ifdef HAVE_WRITEPAGE_T_FOLIO static int zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data) @@ -499,6 +501,78 @@ zpl_write_cache_pages(struct address_space *mapping, #endif return (result); } +#else +static inline int +zpl_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, void *data) +{ + pgoff_t start = wbc->range_start >> PAGE_SHIFT; + pgoff_t end = wbc->range_end >> PAGE_SHIFT; + + struct folio_batch fbatch; + folio_batch_init(&fbatch); + + /* + * This atomically (-ish) tags all DIRTY pages in the range with + * TOWRITE, allowing users to continue dirtying or undirtying pages + * while we get on with writeback, without us treading on each other. + */ + tag_pages_for_writeback(mapping, start, end); + + int err = 0; + unsigned int npages; + + /* + * Grab references to the TOWRITE pages just flagged. This may not get + * all of them, so we do it in a loop until there are none left. + */ + while ((npages = filemap_get_folios_tag(mapping, &start, end, + PAGECACHE_TAG_TOWRITE, &fbatch)) != 0) { + + /* Loop over each page and write it out. */ + struct folio *folio; + while ((folio = folio_batch_next(&fbatch)) != NULL) { + folio_lock(folio); + + /* + * If the folio has been remapped, or is no longer + * dirty, then there's nothing to do. + */ + if (folio->mapping != mapping || + !folio_test_dirty(folio)) { + folio_unlock(folio); + continue; + } + + /* + * If writeback is already in progress, wait for it to + * finish. We continue after this even if the page + * ends up clean; zfs_putpage() will skip it if no + * further work is required. + */ + while (folio_test_writeback(folio)) + folio_wait_bit(folio, PG_writeback); + + /* + * Write it out and collect any error. zfs_putpage() + * will clear the TOWRITE and DIRTY flags, and return + * with the page unlocked. + */ + int ferr = zpl_putpage(&folio->page, wbc, data); + if (err == 0 && ferr != 0) + err = ferr; + + /* Housekeeping for the caller. */ + wbc->nr_to_write -= folio_nr_pages(folio); + } + + /* Release any remaining references on the batch. */ + folio_batch_release(&fbatch); + } + + return (err); +} +#endif static int zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index 444948d03cb3..347b352506e5 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c @@ -23,6 +23,7 @@ * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2023, Datto Inc. All rights reserved. * Copyright (c) 2025, Klara, Inc. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> */ @@ -33,6 +34,7 @@ #include <sys/zpl.h> #include <linux/iversion.h> #include <linux/version.h> +#include <linux/vfs_compat.h> /* * What to do when the last reference to an inode is released. If 0, the kernel @@ -104,7 +106,7 @@ zpl_dirty_inode(struct inode *ip, int flags) * reporting memory pressure and requests OpenZFS release some memory (see * zfs_prune()). * - * When set to 1, we call generic_delete_node(), which always returns "destroy + * When set to 1, we call generic_delete_inode(), which always returns "destroy * immediately", resulting in inodes being destroyed immediately, releasing * their associated dnodes and dbufs to the dbuf cached and the ARC to be * evicted as normal. diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index bac166fcd89e..fe939150b641 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -21,7 +21,7 @@ */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. - * Copyright (c) 2024, Rob Norris <robn@despairlabs.com> + * Copyright (c) 2024, 2025, Rob Norris <robn@despairlabs.com> * Copyright (c) 2024, 2025, Klara, Inc. */ @@ -337,16 +337,14 @@ zvol_discard(zv_request_t *zvr) } /* - * Align the request to volume block boundaries when a secure erase is - * not required. This will prevent dnode_free_range() from zeroing out - * the unaligned parts which is slow (read-modify-write) and useless - * since we are not freeing any space by doing so. + * Align the request to volume block boundaries. This will prevent + * dnode_free_range() from zeroing out the unaligned parts which is + * slow (read-modify-write) and useless since we are not freeing any + * space by doing so. */ - if (!io_is_secure_erase(bio, rq)) { - start = P2ROUNDUP(start, zv->zv_volblocksize); - end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); - size = end - start; - } + start = P2ROUNDUP(start, zv->zv_volblocksize); + end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); + size = end - start; if (start >= end) goto unlock; @@ -467,6 +465,24 @@ zvol_read_task(void *arg) zv_request_task_free(task); } +/* + * Note: + * + * The kernel uses different enum names for the IO opcode, depending on the + * kernel version ('req_opf', 'req_op'). To sidestep this, use macros rather + * than inline functions for these checks. + */ +/* Should this IO go down the zvol write path? */ +#define ZVOL_OP_IS_WRITE(op) \ + (op == REQ_OP_WRITE || \ + op == REQ_OP_FLUSH || \ + op == REQ_OP_DISCARD) + +/* Is this IO type supported by zvols? */ +#define ZVOL_OP_IS_SUPPORTED(op) (op == REQ_OP_READ || ZVOL_OP_IS_WRITE(op)) + +/* Get the IO opcode */ +#define ZVOL_OP(bio, rq) (bio != NULL ? bio_op(bio) : req_op(rq)) /* * Process a BIO or request @@ -484,7 +500,33 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, fstrans_cookie_t cookie = spl_fstrans_mark(); uint64_t offset = io_offset(bio, rq); uint64_t size = io_size(bio, rq); - int rw = io_data_dir(bio, rq); + int rw; + + if (unlikely(!ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)))) { + zfs_dbgmsg("Unsupported zvol %s, op=%d, flags=0x%x", + rq != NULL ? "request" : "BIO", + ZVOL_OP(bio, rq), + rq != NULL ? rq->cmd_flags : bio->bi_opf); + ASSERT(ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq))); + zvol_end_io(bio, rq, SET_ERROR(ENOTSUPP)); + goto out; + } + + if (ZVOL_OP_IS_WRITE(ZVOL_OP(bio, rq))) { + rw = WRITE; + } else { + rw = READ; + } + + /* + * Sanity check + * + * If we're a BIO, check our rw matches the kernel's + * bio_data_dir(bio) rw. We need to check because we support fewer + * IO operations, and want to verify that what we think are reads and + * writes from those operations match what the kernel thinks. + */ + ASSERT(rq != NULL || rw == bio_data_dir(bio)); if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { zvol_end_io(bio, rq, SET_ERROR(ENXIO)); @@ -589,7 +631,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, * interfaces lack this functionality (they block waiting for * the i/o to complete). */ - if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { + if (io_is_discard(bio, rq)) { if (force_sync) { zvol_discard(&zvr); } else { @@ -990,12 +1032,12 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) * tiny devices. For devices over 1 Mib a standard head and sector count * is used to keep the cylinders count reasonable. */ -static int -zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static inline int +zvol_getgeo_impl(struct gendisk *disk, struct hd_geometry *geo) { + zvol_state_t *zv = atomic_load_ptr(&disk->private_data); sector_t sectors; - zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data); ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); @@ -1015,6 +1057,20 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) return (0); } +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK +static int +zvol_getgeo(struct gendisk *disk, struct hd_geometry *geo) +{ + return (zvol_getgeo_impl(disk, geo)); +} +#else +static int +zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + return (zvol_getgeo_impl(bdev->bd_disk, geo)); +} +#endif + /* * Why have two separate block_device_operations structs? * @@ -1458,7 +1514,7 @@ zvol_os_remove_minor(zvol_state_t *zv) if (zso->use_blk_mq) blk_mq_free_tag_set(&zso->tag_set); - ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS); + ida_free(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS); kmem_free(zso, sizeof (struct zvol_state_os)); @@ -1613,7 +1669,7 @@ zvol_os_create_minor(const char *name) if (zvol_inhibit_dev) return (0); - idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); + idx = ida_alloc(&zvol_ida, kmem_flags_convert(KM_SLEEP)); if (idx < 0) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; @@ -1621,7 +1677,7 @@ zvol_os_create_minor(const char *name) /* too many partitions can cause an overflow */ zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", name, minor, MINOR(minor)); - ida_simple_remove(&zvol_ida, idx); + ida_free(&zvol_ida, idx); return (SET_ERROR(EINVAL)); } @@ -1629,7 +1685,7 @@ zvol_os_create_minor(const char *name) if (zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock)); mutex_exit(&zv->zv_state_lock); - ida_simple_remove(&zvol_ida, idx); + ida_free(&zvol_ida, idx); return (SET_ERROR(EEXIST)); } @@ -1729,7 +1785,7 @@ out_doi: rw_exit(&zvol_state_lock); error = zvol_os_add_disk(zv->zv_zso->zvo_disk); } else { - ida_simple_remove(&zvol_ida, idx); + ida_free(&zvol_ida, idx); } return (error); diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c index 864e3898b365..9190ae0362ea 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c @@ -364,8 +364,8 @@ zfs_prop_init(void) static const zprop_index_t xattr_table[] = { { "off", ZFS_XATTR_OFF }, - { "on", ZFS_XATTR_SA }, { "sa", ZFS_XATTR_SA }, + { "on", ZFS_XATTR_SA }, { "dir", ZFS_XATTR_DIR }, { NULL } }; diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index bd6dc8edd8ca..dbb5e942e2e6 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -486,13 +486,13 @@ static taskq_t *arc_flush_taskq; static uint_t zfs_arc_evict_threads = 0; /* The 7 states: */ -static arc_state_t ARC_anon; -/* */ arc_state_t ARC_mru; -static arc_state_t ARC_mru_ghost; -/* */ arc_state_t ARC_mfu; -static arc_state_t ARC_mfu_ghost; -static arc_state_t ARC_l2c_only; -static arc_state_t ARC_uncached; +arc_state_t ARC_anon; +arc_state_t ARC_mru; +arc_state_t ARC_mru_ghost; +arc_state_t ARC_mfu; +arc_state_t ARC_mfu_ghost; +arc_state_t ARC_l2c_only; +arc_state_t ARC_uncached; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, @@ -832,15 +832,15 @@ typedef struct arc_async_flush { #define L2ARC_FEED_TYPES 4 /* L2ARC Performance Tunables */ -static uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ -static uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ -static uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ -static uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; -static uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ -static uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ -static int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ -static int l2arc_feed_again = B_TRUE; /* turbo warmup */ -static int l2arc_norw = B_FALSE; /* no reads during writes */ +uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ +uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ +uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ +uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; +uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ +int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +int l2arc_feed_again = B_TRUE; /* turbo warmup */ +int l2arc_norw = B_FALSE; /* no reads during writes */ static uint_t l2arc_meta_percent = 33; /* limit on headers size */ /* @@ -1157,7 +1157,7 @@ buf_fini(void) #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages - * should be using vmem_free() in the linux kernel\ + * should be using vmem_free() in the linux kernel. */ vmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); @@ -1392,6 +1392,7 @@ arc_get_complevel(arc_buf_t *buf) return (buf->b_hdr->b_complevel); } +__maybe_unused static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { @@ -4650,10 +4651,10 @@ arc_flush_task(void *arg) arc_flush_impl(spa_guid, B_FALSE); arc_async_flush_remove(spa_guid, af->af_cache_level); - uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time); - if (elaspsed > 0) { + uint64_t elapsed = NSEC2MSEC(gethrtime() - start_time); + if (elapsed > 0) { zfs_dbgmsg("spa %llu arc flushed in %llu ms", - (u_longlong_t)spa_guid, (u_longlong_t)elaspsed); + (u_longlong_t)spa_guid, (u_longlong_t)elapsed); } } @@ -9151,7 +9152,7 @@ top: if (dev->l2ad_first) { /* * This is the first sweep through the device. There is - * nothing to evict. We have already trimmmed the + * nothing to evict. We have already trimmed the * whole device. */ goto out; @@ -10085,12 +10086,12 @@ l2arc_device_teardown(void *arg) kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); vmem_free(remdev, sizeof (l2arc_dev_t)); - uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time); - if (elaspsed > 0) { + uint64_t elapsed = NSEC2MSEC(gethrtime() - start_time); + if (elapsed > 0) { zfs_dbgmsg("spa %llu, vdev %llu removed in %llu ms", (u_longlong_t)rva->rva_spa_gid, (u_longlong_t)rva->rva_vdev_gid, - (u_longlong_t)elaspsed); + (u_longlong_t)elapsed); } if (rva->rva_async) diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c index 3d3a9c713568..51165d0bf723 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c +++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c @@ -57,19 +57,19 @@ static unsigned int zfetch_max_sec_reap = 2; /* min bytes to prefetch per stream (default 2MB) */ static unsigned int zfetch_min_distance = 2 * 1024 * 1024; /* max bytes to prefetch per stream (default 8MB) */ -static unsigned int zfetch_max_distance = 8 * 1024 * 1024; +unsigned int zfetch_max_distance = 8 * 1024 * 1024; #else /* min bytes to prefetch per stream (default 4MB) */ static unsigned int zfetch_min_distance = 4 * 1024 * 1024; /* max bytes to prefetch per stream (default 64MB) */ -static unsigned int zfetch_max_distance = 64 * 1024 * 1024; +unsigned int zfetch_max_distance = 64 * 1024 * 1024; #endif /* max bytes to prefetch indirects for per stream (default 128MB) */ -static unsigned int zfetch_max_idistance = 128 * 1024 * 1024; +unsigned int zfetch_max_idistance = 128 * 1024 * 1024; /* max request reorder distance within a stream (default 16MB) */ -static unsigned int zfetch_max_reorder = 16 * 1024 * 1024; +unsigned int zfetch_max_reorder = 16 * 1024 * 1024; /* Max log2 fraction of holes in a stream */ -static unsigned int zfetch_hole_shift = 2; +unsigned int zfetch_hole_shift = 2; typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c index 6c150d31c669..e88d394b5229 100644 --- a/sys/contrib/openzfs/module/zfs/dnode.c +++ b/sys/contrib/openzfs/module/zfs/dnode.c @@ -2656,6 +2656,32 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, } /* + * Adjust *offset to the next (or previous) block byte offset at lvl. + * Returns FALSE if *offset would overflow or underflow. + */ +static boolean_t +dnode_next_block(dnode_t *dn, int flags, uint64_t *offset, int lvl) +{ + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + int span = lvl * epbs + dn->dn_datablkshift; + uint64_t blkid, maxblkid; + + if (span >= 8 * sizeof (uint64_t)) + return (B_FALSE); + + blkid = *offset >> span; + maxblkid = 1ULL << (8 * sizeof (*offset) - span); + if (!(flags & DNODE_FIND_BACKWARDS) && blkid + 1 < maxblkid) + *offset = (blkid + 1) << span; + else if ((flags & DNODE_FIND_BACKWARDS) && blkid > 0) + *offset = (blkid << span) - 1; + else + return (B_FALSE); + + return (B_TRUE); +} + +/* * Find the next hole, data, or sparse region at or after *offset. * The value 'blkfill' tells us how many items we expect to find * in an L0 data block; this value is 1 for normal objects, @@ -2682,7 +2708,7 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, int minlvl, uint64_t blkfill, uint64_t txg) { - uint64_t initial_offset = *offset; + uint64_t matched = *offset; int lvl, maxlvl; int error = 0; @@ -2706,16 +2732,36 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, maxlvl = dn->dn_phys->dn_nlevels; - for (lvl = minlvl; lvl <= maxlvl; lvl++) { + for (lvl = minlvl; lvl <= maxlvl; ) { error = dnode_next_offset_level(dn, flags, offset, lvl, blkfill, txg); - if (error != ESRCH) + if (error == 0 && lvl > minlvl) { + --lvl; + matched = *offset; + } else if (error == ESRCH && lvl < maxlvl && + dnode_next_block(dn, flags, &matched, lvl)) { + /* + * Continue search at next/prev offset in lvl+1 block. + * + * Usually we only search upwards at the start of the + * search as higher level blocks point at a matching + * minlvl block in most cases, but we backtrack if not. + * + * This can happen for txg > 0 searches if the block + * contains only BPs/dnodes freed at that txg. It also + * happens if we are still syncing out the tree, and + * some BP's at higher levels are not updated yet. + * + * We must adjust offset to avoid coming back to the + * same offset and getting stuck looping forever. This + * also deals with the case where offset is already at + * the beginning or end of the object. + */ + ++lvl; + *offset = matched; + } else { break; - } - - while (error == 0 && --lvl >= minlvl) { - error = dnode_next_offset_level(dn, - flags, offset, lvl, blkfill, txg); + } } /* @@ -2727,9 +2773,6 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, error = 0; } - if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? - initial_offset < *offset : initial_offset > *offset)) - error = SET_ERROR(ESRCH); out: if (!(flags & DNODE_FIND_HAVELOCK)) rw_exit(&dn->dn_struct_rwlock); diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c index 7db72b9b04b0..fd46127b6068 100644 --- a/sys/contrib/openzfs/module/zfs/mmp.c +++ b/sys/contrib/openzfs/module/zfs/mmp.c @@ -446,7 +446,7 @@ mmp_write_uberblock(spa_t *spa) uint64_t offset; hrtime_t lock_acquire_time = gethrtime(); - spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER); + spa_config_enter_priority(spa, SCL_STATE, mmp_tag, RW_READER); lock_acquire_time = gethrtime() - lock_acquire_time; if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10)) zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns " diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c index ea2d2c7227c8..d73195f1a21f 100644 --- a/sys/contrib/openzfs/module/zfs/range_tree.c +++ b/sys/contrib/openzfs/module/zfs/range_tree.c @@ -585,7 +585,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size, * the size, since we do not support removing partial segments * of range trees with gaps. */ - zfs_zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) - + zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) - zfs_rs_get_start_raw(rs, rt)); zfs_range_tree_stat_incr(rt, &rs_tmp); diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c index cf28955b0c50..f615591e826b 100644 --- a/sys/contrib/openzfs/module/zfs/spa_config.c +++ b/sys/contrib/openzfs/module/zfs/spa_config.c @@ -372,6 +372,8 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata); + fnvlist_add_uint64(config, ZPOOL_CONFIG_MIN_ALLOC, spa->spa_min_alloc); + fnvlist_add_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, spa->spa_max_alloc); if (spa->spa_comment != NULL) fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, spa->spa_comment); diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index 6f7c060f97f8..0bead6d49666 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -510,7 +510,7 @@ spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw) static void spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, - int mmp_flag) + int priority_flag) { (void) tag; int wlocks_held = 0; @@ -526,7 +526,7 @@ spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, mutex_enter(&scl->scl_lock); if (rw == RW_READER) { while (scl->scl_writer || - (!mmp_flag && scl->scl_write_wanted)) { + (!priority_flag && scl->scl_write_wanted)) { cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { @@ -551,7 +551,7 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) } /* - * The spa_config_enter_mmp() allows the mmp thread to cut in front of + * The spa_config_enter_priority() allows the mmp thread to cut in front of * outstanding write lock requests. This is needed since the mmp updates are * time sensitive and failure to service them promptly will result in a * suspended pool. This pool suspension has been seen in practice when there is @@ -560,7 +560,7 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) */ void -spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw) +spa_config_enter_priority(spa_t *spa, int locks, const void *tag, krw_t rw) { spa_config_enter_impl(spa, locks, tag, rw, 1); } @@ -806,6 +806,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; spa->spa_min_alloc = INT_MAX; + spa->spa_max_alloc = 0; spa->spa_gcd_alloc = INT_MAX; /* Reset cached value */ @@ -1865,6 +1866,19 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) } /* + * Return the range of minimum allocation sizes for the normal allocation + * class. This can be used by external consumers of the DMU to estimate + * potential wasted capacity when setting the recordsize for an object. + * This is mainly for dRAID pools which always pad to a full stripe width. + */ +void +spa_get_min_alloc_range(spa_t *spa, uint64_t *min_alloc, uint64_t *max_alloc) +{ + *min_alloc = spa->spa_min_alloc; + *max_alloc = spa->spa_max_alloc; +} + +/* * Return the amount of slop space in bytes. It is typically 1/32 of the pool * (3.2%), minus the embedded log space. On very small pools, it may be * slightly larger than this. On very large pools, it will be capped to @@ -3085,6 +3099,7 @@ EXPORT_SYMBOL(spa_version); EXPORT_SYMBOL(spa_state); EXPORT_SYMBOL(spa_load_state); EXPORT_SYMBOL(spa_freeze_txg); +EXPORT_SYMBOL(spa_get_min_alloc_range); /* for Lustre */ EXPORT_SYMBOL(spa_get_dspace); EXPORT_SYMBOL(spa_update_dspace); EXPORT_SYMBOL(spa_deflate); diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index fc6d445f9785..c8d7280387a2 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -100,7 +100,7 @@ static uint_t zfs_vdev_default_ms_shift = 29; /* upper limit for metaslab size (16G) */ static uint_t zfs_vdev_max_ms_shift = 34; -static int vdev_validate_skip = B_FALSE; +int vdev_validate_skip = B_FALSE; /* * Since the DTL space map of a vdev is not expected to have a lot of @@ -1497,12 +1497,14 @@ vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) { if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; - if (spa->spa_gcd_alloc == INT_MAX) { + + if (min_alloc > spa->spa_max_alloc) + spa->spa_max_alloc = min_alloc; + + if (spa->spa_gcd_alloc == INT_MAX) spa->spa_gcd_alloc = min_alloc; - } else { - spa->spa_gcd_alloc = vdev_gcd(min_alloc, - spa->spa_gcd_alloc); - } + else + spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc); } void @@ -1560,8 +1562,7 @@ vdev_metaslab_group_create(vdev_t *vd) if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; - uint64_t min_alloc = vdev_get_min_alloc(vd); - vdev_spa_set_alloc(spa, min_alloc); + vdev_spa_set_alloc(spa, vdev_get_min_alloc(vd)); } } } diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c index c44f654b0261..0d4fdaa77ba0 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_label.c +++ b/sys/contrib/openzfs/module/zfs/vdev_label.c @@ -511,6 +511,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_MIN_ALLOC, + vdev_get_min_alloc(vd)); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); if (vd->vdev_noalloc) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c index e69e5598939e..c12713b107bf 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_queue.c +++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c @@ -122,7 +122,7 @@ * The maximum number of i/os active to each device. Ideally, this will be >= * the sum of each queue's max_active. */ -static uint_t zfs_vdev_max_active = 1000; +uint_t zfs_vdev_max_active = 1000; /* * Per-queue limits on the number of i/os active to each device. If the diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c index 2ce0121324ad..abb71543e3ab 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_removal.c +++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c @@ -51,34 +51,70 @@ #include <sys/trace_zfs.h> /* - * This file contains the necessary logic to remove vdevs from a - * storage pool. Currently, the only devices that can be removed - * are log, cache, and spare devices; and top level vdevs from a pool - * w/o raidz or mirrors. (Note that members of a mirror can be removed - * by the detach operation.) + * This file contains the necessary logic to remove vdevs from a storage + * pool. Note that members of a mirror can be removed by the detach + * operation. Currently, the only devices that can be removed are: * - * Log vdevs are removed by evacuating them and then turning the vdev - * into a hole vdev while holding spa config locks. + * 1) Traditional hot spare and cache vdevs. Note that draid distributed + * spares are fixed at creation time and cannot be removed. * - * Top level vdevs are removed and converted into an indirect vdev via - * a multi-step process: + * 2) Log vdevs are removed by evacuating them and then turning the vdev + * into a hole vdev while holding spa config locks. * - * - Disable allocations from this device (spa_vdev_remove_top). + * 3) Top-level singleton and mirror vdevs, including dedup and special + * vdevs, are removed and converted into an indirect vdev via a + * multi-step process: * - * - From a new thread (spa_vdev_remove_thread), copy data from - * the removing vdev to a different vdev. The copy happens in open - * context (spa_vdev_copy_impl) and issues a sync task - * (vdev_mapping_sync) so the sync thread can update the partial - * indirect mappings in core and on disk. + * - Disable allocations from this device (spa_vdev_remove_top). * - * - If a free happens during a removal, it is freed from the - * removing vdev, and if it has already been copied, from the new - * location as well (free_from_removing_vdev). + * - From a new thread (spa_vdev_remove_thread), copy data from the + * removing vdev to a different vdev. The copy happens in open context + * (spa_vdev_copy_impl) and issues a sync task (vdev_mapping_sync) so + * the sync thread can update the partial indirect mappings in core + * and on disk. * - * - After the removal is completed, the copy thread converts the vdev - * into an indirect vdev (vdev_remove_complete) before instructing - * the sync thread to destroy the space maps and finish the removal - * (spa_finish_removal). + * - If a free happens during a removal, it is freed from the removing + * vdev, and if it has already been copied, from the new location as + * well (free_from_removing_vdev). + * + * - After the removal is completed, the copy thread converts the vdev + * into an indirect vdev (vdev_remove_complete) before instructing + * the sync thread to destroy the space maps and finish the removal + * (spa_finish_removal). + * + * The following constraints currently apply primary device removal: + * + * - All vdevs must be online, healthy, and not be missing any data + * according to the DTLs. + * + * - When removing a singleton or mirror vdev, regardless of it's a + * special, dedup, or primary device, it must have the same ashift + * as the devices in the normal allocation class. Furthermore, all + * vdevs in the normal allocation class must have the same ashift to + * ensure the new allocations never includes additional padding. + * + * - The normal allocation class cannot contain any raidz or draid + * top-level vdevs since segments are copied without regard for block + * boundaries. This makes it impossible to calculate the required + * parity columns when using these vdev types as the destination. + * + * - The encryption keys must be loaded so the ZIL logs can be reset + * in order to prevent writing to the device being removed. + * + * N.B. ashift and raidz/draid constraints for primary top-level device + * removal could be slightly relaxed if it were possible to request that + * DVAs from a mirror or singleton in the specified allocation class be + * used (metaslab_alloc_dva). + * + * This flexibility would be particularly useful for raidz/draid pools which + * often include a mirrored special device. If a mistakenly added top-level + * singleton were added it could then still be removed at the cost of some + * special device capacity. This may be a worthwhile tradeoff depending on + * the pool capacity and expense (cost, complexity, time) of creating a new + * pool and copying all of the data to correct the configuration. + * + * Furthermore, while not currently supported it should be possible to allow + * vdevs of any type to be removed as long as they've never been written to. */ typedef struct vdev_copy_arg { @@ -105,7 +141,7 @@ static const uint_t zfs_remove_max_copy_bytes = 64 * 1024 * 1024; * * See also the accessor function spa_remove_max_segment(). */ -static uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE; +uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE; /* * Ignore hard IO errors during device removal. When set if a device @@ -137,7 +173,7 @@ uint_t vdev_removal_max_span = 32 * 1024; * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a removal. */ -static int zfs_removal_suspend_progress = 0; +int zfs_removal_suspend_progress = 0; #define VDEV_REMOVAL_ZAP_OBJS "lzap" diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index 4cf8912d4269..aeea58bedfe4 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -4574,8 +4574,29 @@ zio_vdev_io_start(zio_t *zio) ASSERT0(zio->io_child_error[ZIO_CHILD_VDEV]); if (vd == NULL) { - if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) - spa_config_enter(spa, SCL_ZIO, zio, RW_READER); + if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) { + /* + * A deadlock workaround. The ddt_prune_unique_entries() + * -> prune_candidates_sync() code path takes the + * SCL_ZIO reader lock and may request it again here. + * If there is another thread who wants the SCL_ZIO + * writer lock, then scl_write_wanted will be set. + * Thus, the spa_config_enter_priority() is used to + * ignore pending writer requests. + * + * The locking should be revised to remove the need + * for this workaround. If that's not workable then + * it should only be applied to the zios involved in + * the pruning process. This impacts the read/write + * I/O balance while pruning. + */ + if (spa->spa_active_ddt_prune) + spa_config_enter_priority(spa, SCL_ZIO, zio, + RW_READER); + else + spa_config_enter(spa, SCL_ZIO, zio, + RW_READER); + } /* * The mirror_ops handle multiple DVAs in a single BP. @@ -5305,6 +5326,16 @@ zio_ready(zio_t *zio) return (NULL); } + if (zio_injection_enabled) { + hrtime_t target = zio_handle_ready_delay(zio); + if (target != 0 && zio->io_target_timestamp == 0) { + zio->io_stage >>= 1; + zio->io_target_timestamp = target; + zio_delay_interrupt(zio); + return (NULL); + } + } + if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(BP_GET_BIRTH(bp) == zio->io_txg || diff --git a/sys/contrib/openzfs/module/zfs/zio_inject.c b/sys/contrib/openzfs/module/zfs/zio_inject.c index 981a1be4847c..287577018ed1 100644 --- a/sys/contrib/openzfs/module/zfs/zio_inject.c +++ b/sys/contrib/openzfs/module/zfs/zio_inject.c @@ -827,6 +827,44 @@ zio_handle_export_delay(spa_t *spa, hrtime_t elapsed) zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT); } +/* + * For testing, inject a delay before ready state. + */ +hrtime_t +zio_handle_ready_delay(zio_t *zio) +{ + inject_handler_t *handler; + hrtime_t now = gethrtime(); + hrtime_t target = 0; + + /* + * Ignore I/O not associated with any logical data. + */ + if (zio->io_logical == NULL) + return (0); + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + if (zio->io_spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_DELAY_READY) + continue; + + /* If this handler matches, inject the delay */ + if (zio_match_iotype(zio, handler->zi_record.zi_iotype) && + zio_match_handler(&zio->io_logical->io_bookmark, + zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, + zio_match_dva(zio), &handler->zi_record, zio->io_error)) { + target = now + (hrtime_t)handler->zi_record.zi_timer; + break; + } + } + + rw_exit(&inject_lock); + return (target); +} + static int zio_calculate_range(const char *pool, zinject_record_t *record) { diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index faced0db7e9e..00f98168d3d8 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -410,7 +410,7 @@ zvol_set_volthreading(const char *name, boolean_t value) { zvol_state_t *zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL) - return (SET_ERROR(ENOENT)); + return (-1); zv->zv_threading = value; mutex_exit(&zv->zv_state_lock); return (0); diff --git a/sys/contrib/openzfs/module/zstd/zfs_zstd.c b/sys/contrib/openzfs/module/zstd/zfs_zstd.c index 3db196953f74..c403c001086a 100644 --- a/sys/contrib/openzfs/module/zstd/zfs_zstd.c +++ b/sys/contrib/openzfs/module/zstd/zfs_zstd.c @@ -441,64 +441,6 @@ zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level) } #ifndef IN_LIBSA -static size_t -zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len, - int level) -{ - int16_t zstd_level; - if (zstd_enum_to_level(level, &zstd_level)) { - ZSTDSTAT_BUMP(zstd_stat_com_inval); - return (s_len); - } - /* - * A zstd early abort heuristic. - * - * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently - * 128k), don't try any of this, just go. - * (because experimentally that was a reasonable cutoff for a perf win - * with tiny ratio change) - * - First, we try LZ4 compression, and if it doesn't early abort, we - * jump directly to whatever compression level we intended to try. - * - Second, we try zstd-1 - if that errors out (usually, but not - * exclusively, if it would overflow), we give up early. - * - * If it works, instead we go on and compress anyway. - * - * Why two passes? LZ4 alone gets you a lot of the way, but on highly - * compressible data, it was losing up to 8.5% of the compressed - * savings versus no early abort, and all the zstd-fast levels are - * worse indications on their own than LZ4, and don't improve the LZ4 - * pass noticably if stacked like this. - */ - size_t actual_abort_size = zstd_abort_size; - if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level && - s_len >= actual_abort_size) { - int pass_len = 1; - pass_len = zfs_lz4_compress(s_start, d_start, s_len, d_len, 0); - if (pass_len < d_len) { - ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed); - goto keep_trying; - } - ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected); - - pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len, - ZIO_ZSTD_LEVEL_1); - if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) { - ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected); - return (s_len); - } - ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed); - } else { - ZSTDSTAT_BUMP(zstd_stat_passignored); - if (s_len < actual_abort_size) { - ZSTDSTAT_BUMP(zstd_stat_passignored_size); - } - } -keep_trying: - return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level)); - -} - /* Compress block using zstd */ static size_t zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len, |
